proiel 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +99 -0
- data/bin/console +6 -0
- data/bin/setup +5 -0
- data/lib/proiel/annotation_schema.rb +127 -0
- data/lib/proiel/citations.rb +84 -0
- data/lib/proiel/div.rb +133 -0
- data/lib/proiel/positional_tag.rb +127 -0
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd +172 -0
- data/lib/proiel/proiel_xml/proiel-1.0/teilite.xsd +7387 -0
- data/lib/proiel/proiel_xml/proiel-1.0/xml.xsd +287 -0
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd +185 -0
- data/lib/proiel/proiel_xml/reader.rb +237 -0
- data/lib/proiel/proiel_xml/schema.rb +81 -0
- data/lib/proiel/proiel_xml/validator.rb +177 -0
- data/lib/proiel/sentence.rb +191 -0
- data/lib/proiel/source.rb +114 -0
- data/lib/proiel/statistics.rb +41 -0
- data/lib/proiel/token.rb +407 -0
- data/lib/proiel/tokenization.rb +90 -0
- data/lib/proiel/treebank.rb +214 -0
- data/lib/proiel/treebank_object.rb +21 -0
- data/lib/proiel/version.rb +9 -0
- data/lib/proiel.rb +28 -0
- metadata +210 -0
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A source object in a treebank.
  class Source < TreebankObject
    # @return [String] ID of the source
    attr_reader :id

    # @return [Treebank] treebank that the source belongs to
    attr_reader :treebank

    # @return [String] language of the source as an ISO 639-3 language tag
    attr_reader :language

    # @return [DateTime] export time for the source
    attr_reader :export_time

    # @return [Hash{Symbol, String}] metadata fields for the source
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    attr_reader :metadata

    # Creates a new source object.
    #
    # @param parent [Treebank] treebank that the source belongs to
    # @param id [String] ID of the source
    # @param export_time [String] export time, in a format `DateTime.parse` accepts
    # @param language [String] ISO 639-3 language tag
    # @param metadata [Hash{Symbol, String}] metadata fields for the source
    # @yieldparam source [Source] the new source; the block should return the
    #   source's children (div objects)
    def initialize(parent, id, export_time, language, metadata, &block)
      @treebank = parent
      @id = id.freeze
      @export_time = DateTime.parse(export_time).freeze
      @language = language.freeze
      @metadata = metadata.freeze
      # Default to an empty child list so that divs/sentences/tokens work
      # even when no block is given.
      @children = block_given? ? block.call(self) : []
    end

    # @return [String] a complete citation for the source
    def citation
      citation_part
    end

    # Returns the printable form of the source with all token forms and any
    # presentation data.
    #
    # @return [String] the printable form of the source
    def printable_form(options = {})
      @children.map { |d| d.printable_form(options) }.compact.join
    end

    # Accesses metadata fields.
    #
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    def method_missing(method_name, *args, &block)
      if @metadata.key?(method_name) && args.empty?
        @metadata[method_name]
      else
        super
      end
    end

    # Makes `respond_to?` agree with the metadata fields that
    # {#method_missing} exposes as reader methods.
    #
    # @see #method_missing
    def respond_to_missing?(method_name, include_private = false)
      @metadata.key?(method_name) || super
    end

    # Finds all divs in the source.
    #
    # @return [Enumerator] divs in the source
    def divs
      @children.to_enum
    end

    # Finds all sentences in the source.
    #
    # @return [Enumerator] sentences in the source
    #
    # @example Iterating sentences
    #   sentences.each { |s| puts s.id }
    #
    # @example Create an array with only reviewed sentences
    #   sentences.select(&:reviewed?)
    #
    # @example Counting sentences
    #   sentences.count #=> 200
    #
    def sentences
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            y << sentence
          end
        end
      end
    end

    # Finds all tokens in the source.
    #
    # @return [Enumerator] tokens in the source
    #
    # @example Iterating tokens
    #   tokens.each { |t| puts t.id }
    #
    # @example Create an array with only empty tokens
    #   tokens.select(&:is_empty?)
    #
    # @example Counting tokens
    #   puts tokens.count #=> 200
    #
    def tokens
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              y << token
            end
          end
        end
      end
    end
  end
end
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  module Statistics
    # Computes the line of best fit using the least-squares method.
    #
    # @param x [Array<Number>] x-values
    # @param y [Array<Number>] y-values
    #
    # @return [Array(Float, Float)] y-intercept and slope
    #
    # @raise [ArgumentError] if either argument is not an array, if the
    #   arrays differ in length, if they are empty, or if all x-values are
    #   identical (the slope of the fitted line would be undefined)
    #
    # @example
    #   x = [8, 2, 11, 6, 5, 4, 12, 9, 6, 1]
    #   y = [3, 10, 3, 6, 8, 12, 1, 4, 9, 14]
    #   a, b = PROIEL::Statistics.least_squares(x, y)
    #   a # => 14.081081081081088
    #   b # => -1.1064189189189197
    #
    def self.least_squares(x, y)
      raise ArgumentError, 'array expected' unless x.is_a?(Array)
      raise ArgumentError, 'array expected' unless y.is_a?(Array)
      raise ArgumentError, 'array lengths differ' unless x.size == y.size
      raise ArgumentError, 'arrays are empty' if x.empty?

      x_mean = x.reduce(&:+).to_f / x.size
      y_mean = y.reduce(&:+).to_f / y.size
      x_sqsum = x.reduce(0.0) { |sum, n| sum + n ** 2 }
      xy_sum = x.zip(y).reduce(0.0) { |sum, (m, n)| sum + m * n }

      # Corrected sum of cross products and corrected sum of squares.
      sxy = xy_sum - x.length * x_mean * y_mean
      sx2 = x_sqsum - x.length * (x_mean ** 2)

      # All x-values identical: a vertical line cannot be expressed as
      # y = a + bx, so fail loudly instead of returning NaN/Infinity.
      raise ArgumentError, 'x-values are all identical' if sx2.zero?

      beta = sxy / sx2               # slope
      alfa = y_mean - beta * x_mean  # y-intercept

      [alfa, beta]
    end
  end
end
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A token object in a treebank.
  class Token < TreebankObject
    extend Memoist

    # @return [Integer] ID of the token
    attr_reader :id

    # @return [Sentence] parent sentence object
    attr_accessor :sentence

    # @return [nil, Integer] ID of head token
    attr_reader :head_id

    # @return [nil, String] token form
    attr_reader :form

    # @return [nil, String] token lemma
    attr_reader :lemma

    # @return [nil, String] token part of speech tag
    attr_reader :part_of_speech

    # @return [nil, String] token part of speech tag
    alias :pos :part_of_speech

    # @return [nil, String] token morphological tag
    attr_reader :morphology

    # @return [nil, String] token relation tag
    attr_reader :relation

    # @return [nil, String] empty token sort tag
    attr_reader :empty_token_sort

    # @return [nil, String] citation part
    attr_reader :citation_part

    # @return [nil, String] presentation material before form
    attr_reader :presentation_before

    # @return [nil, String] presentation material after form
    attr_reader :presentation_after

    # @return [nil, Integer] ID of antecedent token
    attr_reader :antecedent_id

    # @return [nil, String] information status tag
    attr_reader :information_status

    # @return [nil, String] contrast group tag
    attr_reader :contrast_group

    # @return [nil, String] free-form foreign IDs
    attr_reader :foreign_ids

    # @return [Array<Array<String,Integer>>] secondary edges as an array of pairs of relation tag and target token ID
    attr_reader :slashes

    # Creates a new token object.
    #
    # Each attribute is validated (`ArgumentError` is raised on type
    # mismatch) and frozen where applicable.
    def initialize(parent, id, head_id, form, lemma, part_of_speech,
                   morphology, relation, empty_token_sort, citation_part,
                   presentation_before, presentation_after, antecedent_id,
                   information_status, contrast_group, foreign_ids, slashes)
      @sentence = parent

      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
      @id = id

      raise ArgumentError, 'integer or nil expected' unless head_id.nil? || head_id.is_a?(Integer)
      @head_id = head_id

      raise ArgumentError, 'string or nil expected' unless form.nil? || form.is_a?(String)
      @form = form.freeze

      raise ArgumentError, 'string or nil expected' unless lemma.nil? || lemma.is_a?(String)
      @lemma = lemma.freeze

      raise ArgumentError, 'string or nil expected' unless part_of_speech.nil? || part_of_speech.is_a?(String)
      @part_of_speech = part_of_speech.freeze

      raise ArgumentError, 'string or nil expected' unless morphology.nil? || morphology.is_a?(String)
      @morphology = morphology.freeze

      raise ArgumentError, 'string or nil expected' unless relation.nil? || relation.is_a?(String)
      @relation = relation.freeze

      raise ArgumentError, 'string or nil expected' unless empty_token_sort.nil? || empty_token_sort.is_a?(String)
      @empty_token_sort = empty_token_sort.freeze

      raise ArgumentError, 'string or nil expected' unless citation_part.nil? || citation_part.is_a?(String)
      @citation_part = citation_part.freeze

      raise ArgumentError, 'string or nil expected' unless presentation_before.nil? || presentation_before.is_a?(String)
      @presentation_before = presentation_before.freeze

      raise ArgumentError, 'string or nil expected' unless presentation_after.nil? || presentation_after.is_a?(String)
      @presentation_after = presentation_after.freeze

      raise ArgumentError, 'integer or nil expected' unless antecedent_id.nil? || antecedent_id.is_a?(Integer)
      @antecedent_id = antecedent_id

      raise ArgumentError, 'string or nil expected' unless information_status.nil? || information_status.is_a?(String)
      @information_status = information_status.freeze

      raise ArgumentError, 'string or nil expected' unless contrast_group.nil? || contrast_group.is_a?(String)
      @contrast_group = contrast_group.freeze

      raise ArgumentError, 'string or nil expected' unless foreign_ids.nil? || foreign_ids.is_a?(String)
      @foreign_ids = foreign_ids.freeze

      # Each slash is expected to respond to +relation+ and +target_id+;
      # it is normalised to a plain [relation, target_id] pair.
      raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
      @slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
    end

    # @return [Div] parent div object
    def div
      @sentence.div
    end

    # @return [Source] parent source object
    def source
      @sentence.div.source
    end

    # @return [Treebank] parent treebank object
    def treebank
      @sentence.div.source.treebank
    end

    # @return [String] language of the token as an ISO 639-3 language tag
    def language
      source.language
    end

    memoize :language

    # @return [nil, String] a complete citation for the token
    def citation
      if citation_part
        [source.citation_part, citation_part].compact.join(' ')
      else
        nil
      end
    end

    # Returns the printable form of the token with any presentation data.
    #
    # @param custom_token_formatter [Lambda] formatting function for tokens
    #
    # @return [String] the printable form of the token
    def printable_form(custom_token_formatter: nil)
      printable_form =
        if custom_token_formatter
          custom_token_formatter.call(id, form)
        else
          form
        end

      [presentation_before, printable_form, presentation_after].compact.join
    end

    # @return [Hash<Symbol,String>] token part of speech tag as a hash
    def part_of_speech_hash
      if part_of_speech
        # '-' marks an unset field in a positional tag and is dropped.
        POS_POSITIONAL_TAG_SEQUENCE.zip(part_of_speech.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :part_of_speech_hash

    alias :pos_hash :part_of_speech_hash

    # Returns the part of speech tag if set, but also provides a suitable
    # part of speech tag for empty elements.
    #
    # @return [String] part of speech tag
    def part_of_speech_with_nulls
      part_of_speech || NULL_PARTS_OF_SPEECH[empty_token_sort]
    end

    alias :pos_with_nulls :part_of_speech_with_nulls

    # @return [Hash<Symbol,String>] token morphology tag as a hash
    def morphology_hash
      if morphology
        # '-' marks an unset field in a positional tag and is dropped.
        MORPHOLOGY_POSITIONAL_TAG_SEQUENCE.zip(morphology.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :morphology_hash

    # Checks if the token is the root of its dependency graph.
    #
    # If the token belongs to a sentence that lacks dependency annotation,
    # all tokens are treated as roots. If a sentence has partial or complete
    # dependency annotation there may still be multiple root tokens.
    #
    # @return [true, false]
    def is_root?
      head_id.nil?
    end

    # Finds the head of this token.
    #
    # The head is the parent of this token in the tree that has tokens as
    # nodes and primary relations as edges.
    #
    # @return [nil, Token] head, or `nil` if the token is a root
    def head
      if is_root?
        nil
      else
        treebank.find_token(head_id)
      end
    end

    memoize :head

    alias :parent :head

    # Finds dependents of this token in the dependency graph.
    #
    # The dependents are the children of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned dependents is indeterminate.
    #
    # @return [Array<Token>] dependents
    def dependents
      @sentence.tokens.select { |t| t.head_id == @id }
    end

    memoize :dependents

    alias :children :dependents

    # Finds ancestors of this token in the dependency graph.
    #
    # The ancestors are the ancestors of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned ancestors is as follows: The first
    # ancestor is the head of this token, the next ancestor is
    # the head of the previous token, and so on.
    #
    # @return [Array<Token>] ancestors
    def ancestors
      if is_root?
        []
      else
        [head] + head.ancestors
      end
    end

    memoize :ancestors

    # Finds descendants of this token in the dependency graph.
    #
    # The descendants are the descendants of this token in the tree that
    # has tokens as nodes and primary relations as edges.
    #
    # The order of the returned descendants is indeterminate.
    #
    # @return [Array<Token>] descendants
    def descendents
      dependents.flat_map { |dependent| [dependent] + dependent.descendents }
    end

    memoize :descendents

    alias :descendants :descendents

    # Tests if the token is empty.
    #
    # A token is empty if it does not have a form. If the token is empty,
    # {Token#empty_token_sort} explains its function.
    #
    # @see Token#has_content?
    #
    # @return [true, false]
    def is_empty?
      !empty_token_sort.nil?
    end

    # Tests if the token has content.
    #
    # A token has content if it has a form.
    #
    # @see Token#is_empty?
    #
    # @return [true, false]
    def has_content?
      empty_token_sort.nil?
    end

    # Tests if the token has a citation.
    #
    # A token has a citation if `citation_part` is not `nil`.
    #
    # @return [true, false]
    def has_citation?
      !citation_part.nil?
    end

    # Checks if the token is a PRO token.
    #
    # @return [true, false]
    def pro?
      empty_token_sort == 'P'
    end

    # Finds the common ancestors that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # Ancestors are returned in the same order as {Token#ancestors}.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.common_ancestors(y, inclusive: false) # => [z, u]
    #   x.common_ancestors(w, inclusive: false) # => [z, u]
    #   x.common_ancestors(x, inclusive: false) # => [w, z, u]
    #
    #   x.common_ancestors(y, inclusive: true) # => [z, u]
    #   x.common_ancestors(w, inclusive: true) # => [w, z, u]
    #   x.common_ancestors(x, inclusive: true) # => [x, w, z, u]
    #
    # @see Token#first_common_ancestor
    # @see Token#first_common_ancestor_path
    #
    # @return [Array<Token>] common ancestors
    def common_ancestors(other_token, inclusive: false)
      if inclusive
        x, y = [self] + ancestors, [other_token] + other_token.ancestors
      else
        x, y = ancestors, other_token.ancestors
      end

      x & y
    end

    # Finds the first common ancestor that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.first_common_ancestor(y, inclusive: false) # => z
    #   x.first_common_ancestor(w, inclusive: false) # => z
    #   x.first_common_ancestor(x, inclusive: false) # => w
    #
    #   x.first_common_ancestor(y, inclusive: true) # => z
    #   x.first_common_ancestor(w, inclusive: true) # => w
    #   x.first_common_ancestor(x, inclusive: true) # => x
    #
    # @see Token#common_ancestors
    # @see Token#first_common_ancestor_path
    #
    # @return [nil, Token] first common ancestor
    def first_common_ancestor(other_token, inclusive: false)
      common_ancestors(other_token, inclusive: inclusive).first
    end

    private

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    POS_POSITIONAL_TAG_SEQUENCE = %i(major minor).freeze

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
      person number tense mood voice gender case degree strength inflection
    ).freeze

    # Substitute part of speech tags for empty token sorts.
    NULL_PARTS_OF_SPEECH = {
      'V' => 'V-',
      'C' => 'C-',
      'P' => 'Pp',
    }.freeze
  end
end
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  module Tokenization
    # Language-specific tokenization regexes keyed by language tag.
    # Initialised eagerly so that +split_form+ does not raise NameError
    # when it is called before any patterns have been loaded.
    @@regexes = {}

    # Loads tokenization patterns from a configuration file.
    #
    # The configuration file should be a JSON file. The keys should
    # be language tags and the values tokenization patterns.
    #
    # The method can be called multiple times. On the first invocation
    # patterns will be loaded, on subsequent invocations patterns will
    # be updated. Only patterns for languages that are defined in the
    # configuration file will be updated, other patterns will remain unchanged.
    #
    # @param filename [String] name of tokenization pattern file
    #
    # @return [Hash] loaded patterns
    #
    def self.load_patterns(filename)
      raise ArgumentError, 'invalid filename' unless filename.is_a?(String)

      patterns = JSON.parse(File.read(filename))

      regexes = patterns.map { |l, p| [l, self.make_regex(p)] }.to_h

      @@regexes.merge!(regexes)
    end

    # Makes a regular expression from a pattern given in the configuration file.
    #
    # The regular expression is anchored to avoid partial matches. Multi-line
    # matching is allowed in case characters that are interpreted as line
    # separators occur in the data.
    #
    # @param pattern [String] tokenization pattern
    #
    # @return [Regexp]
    #
    def self.make_regex(pattern)
      raise ArgumentError, 'invalid pattern' unless pattern.is_a?(String)

      Regexp.new("^#{pattern}$", Regexp::MULTILINE)
    end

    # Tests if a token form is splitable. Any form with more than one character
    # is splitable.
    #
    # @param form [String, nil] token form to test
    #
    # @return [true, false]
    #
    def self.is_splitable?(form)
      raise ArgumentError, 'invalid form' unless form.is_a?(String) or form.nil?

      # Coerce to a strict boolean so callers get false (not nil) for a
      # nil form.
      !form.nil? && form.length > 1
    end

    # Splits a token form using the tokenization patterns that apply for
    # the specified language. Tokenization patterns must already have been
    # loaded.
    #
    # @param language_tag [String] ISO 639-3 tag for the language whose patterns
    #   should be used to split the token form
    # @param form [String] token form to split
    #
    # @return [Array<String>]
    #
    def self.split_form(language_tag, form)
      raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
      raise ArgumentError, 'invalid form' unless form.is_a?(String)

      if form[/\W+/]
        # Split on any non-word character like a space or punctuation
        form.split(/(\W+)/)
      elsif @@regexes.key?(language_tag) and form[@@regexes[language_tag]]
        # Apply language-specific pattern
        form.match(@@regexes[language_tag]).captures
      elsif form == ''
        ['']
      else
        # Give up and split by character
        form.split(/()/)
      end
    end
  end
end