proiel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +99 -0
- data/bin/console +6 -0
- data/bin/setup +5 -0
- data/lib/proiel/annotation_schema.rb +127 -0
- data/lib/proiel/citations.rb +84 -0
- data/lib/proiel/div.rb +133 -0
- data/lib/proiel/positional_tag.rb +127 -0
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd +172 -0
- data/lib/proiel/proiel_xml/proiel-1.0/teilite.xsd +7387 -0
- data/lib/proiel/proiel_xml/proiel-1.0/xml.xsd +287 -0
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd +185 -0
- data/lib/proiel/proiel_xml/reader.rb +237 -0
- data/lib/proiel/proiel_xml/schema.rb +81 -0
- data/lib/proiel/proiel_xml/validator.rb +177 -0
- data/lib/proiel/sentence.rb +191 -0
- data/lib/proiel/source.rb +114 -0
- data/lib/proiel/statistics.rb +41 -0
- data/lib/proiel/token.rb +407 -0
- data/lib/proiel/tokenization.rb +90 -0
- data/lib/proiel/treebank.rb +214 -0
- data/lib/proiel/treebank_object.rb +21 -0
- data/lib/proiel/version.rb +9 -0
- data/lib/proiel.rb +28 -0
- metadata +210 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A source object in a treebank.
  class Source < TreebankObject
    # @return [String] ID of the source
    attr_reader :id

    # @return [Treebank] treebank that the source belongs to
    attr_reader :treebank

    # @return [String] language of the source as an ISO 639-3 language tag
    attr_reader :language

    # @return [DateTime] export time for the source
    attr_reader :export_time

    # @return [Hash{Symbol, String}] metadata fields for the source
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    attr_reader :metadata

    # Creates a new source object.
    #
    # The optional block receives the new source and must return its child
    # divs; this allows children to link back to their parent on creation.
    def initialize(parent, id, export_time, language, metadata, &block)
      @treebank = parent
      @id = id.freeze
      @export_time = DateTime.parse(export_time).freeze
      @language = language.freeze
      @metadata = metadata.freeze
      @children = block.call(self) if block_given?
    end

    # @return [String] a complete citation for the source
    def citation
      # citation_part is a metadata field, resolved via method_missing.
      citation_part
    end

    # Returns the printable form of the source with all token forms and any
    # presentation data.
    #
    # @return [String] the printable form of the source
    def printable_form(options = {})
      @children.map { |d| d.printable_form(options) }.compact.join
    end

    # Accesses metadata fields.
    #
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    def method_missing(method_name, *args, &block)
      if @metadata.key?(method_name) && args.empty?
        @metadata[method_name]
      else
        super
      end
    end

    # Advertises the dynamic metadata accessors provided by #method_missing
    # so that #respond_to? and #method work correctly for them.
    def respond_to_missing?(method_name, include_private = false)
      @metadata.key?(method_name) || super
    end

    # Finds all divs in the source.
    #
    # @return [Enumerator] divs in the source
    def divs
      @children.to_enum
    end

    # Finds all sentences in the source.
    #
    # @return [Enumerator] sentences in the source
    #
    # @example Iterating sentences
    #   sentences.each { |s| puts s.id }
    #
    # @example Create an array with only reviewed sentences
    #   sentences.select(&:reviewed?)
    #
    # @example Counting sentences
    #   sentences.count #=> 200
    #
    def sentences
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            y << sentence
          end
        end
      end
    end

    # Finds all tokens in the source.
    #
    # @return [Enumerator] tokens in the source
    #
    # @example Iterating tokens
    #   tokens.each { |t| puts t.id }
    #
    # @example Create an array with only empty tokens
    #   tokens.select(&:is_empty?)
    #
    # @example Counting tokens
    #   puts tokens.count #=> 200
    #
    def tokens
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              y << token
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  module Statistics
    # Computes the line of best fit using the least-squares method.
    #
    # @param x [Array<Number>] x-values
    # @param y [Array<Number>] y-values
    #
    # @return [Array(Float, Float)] y-intercept and slope
    #
    # @raise [ArgumentError] if the arguments are not arrays, differ in
    #   length or are empty
    #
    # @example
    #   x = [8, 2, 11, 6, 5, 4, 12, 9, 6, 1]
    #   y = [3, 10, 3, 6, 8, 12, 1, 4, 9, 14]
    #   a, b = PROIEL::Statistics.least_squares(x, y)
    #   a # => 14.081081081081088
    #   b # => -1.1064189189189197
    #
    def self.least_squares(x, y)
      raise ArgumentError unless x.is_a?(Array)
      raise ArgumentError unless y.is_a?(Array)
      raise ArgumentError, 'array lengths differ' unless x.size == y.size
      # Guard against division by zero below; without this the method would
      # silently return [NaN, NaN] for empty input.
      raise ArgumentError, 'arrays are empty' if x.empty?

      x_mean = x.sum.to_f / x.size
      y_mean = y.sum.to_f / y.size
      x_sqsum = x.sum(0.0) { |n| n ** 2 }
      xy_sum = x.zip(y).sum(0.0) { |m, n| m * n }

      # S_xy and S_xx in the standard least-squares formulation.
      sxy = xy_sum - x.length * x_mean * y_mean
      sx2 = x_sqsum - x.length * (x_mean ** 2)

      beta = sxy / sx2
      alfa = y_mean - beta * x_mean

      [alfa, beta]
    end
  end
end
|
data/lib/proiel/token.rb
ADDED
@@ -0,0 +1,407 @@
|
|
1
|
+
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A token object in a treebank.
  class Token < TreebankObject
    extend Memoist

    # @return [Fixnum] ID of the token
    attr_reader :id

    # @return [Sentence] parent sentence object
    attr_accessor :sentence

    # @return [nil, Fixnum] ID of head token
    attr_reader :head_id

    # @return [nil, String] token form
    attr_reader :form

    # @return [nil, String] token lemma
    attr_reader :lemma

    # @return [nil, String] token part of speech tag
    attr_reader :part_of_speech

    # @return [nil, String] token part of speech tag
    alias :pos :part_of_speech

    # @return [nil, String] token morphological tag
    attr_reader :morphology

    # @return [nil, String] token relation tag
    attr_reader :relation

    # @return [nil, String] token empty token sort tag
    attr_reader :empty_token_sort

    # @return [nil, String] citation part
    attr_reader :citation_part

    # @return [nil, String] presentation material before form
    attr_reader :presentation_before

    # @return [nil, String] presentation material after form
    attr_reader :presentation_after

    # @return [nil, Fixnum] ID of antecedent token
    attr_reader :antecedent_id

    # @return [nil, String] information status tag
    attr_reader :information_status

    # @return [nil, String] contrast group tag
    attr_reader :contrast_group

    # @return [nil, String] free-form foreign IDs
    attr_reader :foreign_ids

    # @return [Array<Array<String,Fixnum>>] secondary edges as an array of pairs of relation tag and target token ID
    attr_reader :slashes

    # Creates a new token object.
    def initialize(parent, id, head_id, form, lemma, part_of_speech,
                   morphology, relation, empty_token_sort, citation_part,
                   presentation_before, presentation_after, antecedent_id,
                   information_status, contrast_group, foreign_ids, slashes)
      @sentence = parent

      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
      @id = id

      raise ArgumentError, 'integer or nil expected' unless head_id.nil? || head_id.is_a?(Integer)
      @head_id = head_id

      raise ArgumentError, 'string or nil expected' unless form.nil? || form.is_a?(String)
      @form = form.freeze

      raise ArgumentError, 'string or nil expected' unless lemma.nil? || lemma.is_a?(String)
      @lemma = lemma.freeze

      raise ArgumentError, 'string or nil expected' unless part_of_speech.nil? || part_of_speech.is_a?(String)
      @part_of_speech = part_of_speech.freeze

      raise ArgumentError, 'string or nil expected' unless morphology.nil? || morphology.is_a?(String)
      @morphology = morphology.freeze

      raise ArgumentError, 'string or nil expected' unless relation.nil? || relation.is_a?(String)
      @relation = relation.freeze

      raise ArgumentError, 'string or nil expected' unless empty_token_sort.nil? || empty_token_sort.is_a?(String)
      @empty_token_sort = empty_token_sort.freeze

      raise ArgumentError, 'string or nil expected' unless citation_part.nil? || citation_part.is_a?(String)
      @citation_part = citation_part.freeze

      raise ArgumentError, 'string or nil expected' unless presentation_before.nil? || presentation_before.is_a?(String)
      @presentation_before = presentation_before.freeze

      raise ArgumentError, 'string or nil expected' unless presentation_after.nil? || presentation_after.is_a?(String)
      @presentation_after = presentation_after.freeze

      raise ArgumentError, 'integer or nil expected' unless antecedent_id.nil? || antecedent_id.is_a?(Integer)
      @antecedent_id = antecedent_id

      raise ArgumentError, 'string or nil expected' unless information_status.nil? || information_status.is_a?(String)
      @information_status = information_status.freeze

      raise ArgumentError, 'string or nil expected' unless contrast_group.nil? || contrast_group.is_a?(String)
      @contrast_group = contrast_group.freeze

      raise ArgumentError, 'string or nil expected' unless foreign_ids.nil? || foreign_ids.is_a?(String)
      @foreign_ids = foreign_ids.freeze

      raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
      # Store slashes as plain [relation, target_id] pairs rather than the
      # original slash objects.
      @slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
    end

    # @return [Div] parent div object
    def div
      @sentence.div
    end

    # @return [Source] parent source object
    def source
      @sentence.div.source
    end

    # @return [Treebank] parent treebank object
    def treebank
      @sentence.div.source.treebank
    end

    # @return [String] language of the token as an ISO 639-3 language tag
    def language
      source.language
    end

    memoize :language

    # @return [nil, String] a complete citation for the token
    def citation
      if citation_part
        [source.citation_part, citation_part].compact.join(' ')
      else
        nil
      end
    end

    # Returns the printable form of the token with any presentation data.
    #
    # @param custom_token_formatter [Lambda] formatting function for tokens
    #
    # @return [String] the printable form of the token
    def printable_form(custom_token_formatter: nil)
      printable_form =
        if custom_token_formatter
          custom_token_formatter.call(id, form)
        else
          form
        end

      [presentation_before, printable_form, presentation_after].compact.join
    end

    # @return [Hash<Symbol,String>] token part of speech tag as a hash
    def part_of_speech_hash
      if part_of_speech
        # '-' marks an unset field in the positional tag; drop those.
        POS_POSITIONAL_TAG_SEQUENCE.zip(part_of_speech.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :part_of_speech_hash

    alias :pos_hash :part_of_speech_hash

    # Returns the part of speech tag if set, but also provides a suitable
    # part of speech tag for empty elements.
    #
    # @return [String] part of speech tag
    def part_of_speech_with_nulls
      part_of_speech || NULL_PARTS_OF_SPEECH[empty_token_sort]
    end

    alias :pos_with_nulls :part_of_speech_with_nulls

    # @return [Hash<Symbol,String>] token morphology tag as a hash
    def morphology_hash
      if morphology
        # '-' marks an unset field in the positional tag; drop those.
        MORPHOLOGY_POSITIONAL_TAG_SEQUENCE.zip(morphology.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :morphology_hash

    # Checks if the token is the root of its dependency graph.
    #
    # If the token belongs to a sentence that lacks dependency annotation,
    # all tokens are treated as roots. If a sentence has partial or complete
    # dependency annotation there may still be multiple root tokens.
    #
    # @return [true, false]
    def is_root?
      head_id.nil?
    end

    # Finds the head of this token.
    #
    # The head is the parent of this token in the tree that has tokens as
    # nodes and primary relations as edges.
    #
    # @return [Token] head
    def head
      if is_root?
        nil
      else
        treebank.find_token(head_id)
      end
    end

    memoize :head

    alias :parent :head

    # Finds dependents of this token in the dependency graph.
    #
    # The dependents are the children of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned dependents is indeterminate.
    #
    # @return [Array<Token>] dependents
    def dependents
      @sentence.tokens.select { |t| t.head_id == @id }
    end

    memoize :dependents

    alias :children :dependents

    # Finds ancestors of this token in the dependency graph.
    #
    # The ancestors are the ancestors of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned ancestors is as follows: The first
    # ancestor is the head of this token, the next ancestor is
    # the head of the previous token, and so on.
    #
    # @return [Array<Token>] ancestors
    def ancestors
      if is_root?
        []
      else
        [head] + head.ancestors
      end
    end

    memoize :ancestors

    # Finds descendents of this token in the dependency graph.
    #
    # The descendents are the descendants of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned descendents is indeterminate.
    #
    # @return [Array<Token>] descendents
    def descendents
      dependents.map { |dependent| [dependent] + dependent.descendents }.flatten
    end

    memoize :descendents

    alias :descendants :descendents

    # Tests if the token is empty.
    #
    # A token is empty if it does not have a form. If the token is empty,
    # {Token#empty_token_sort} explains its function.
    #
    # @see Token#has_content?
    #
    # @return [true, false]
    def is_empty?
      !empty_token_sort.nil?
    end

    # Tests if the token has content.
    #
    # A token has content if it has a form.
    #
    # @see Token#is_empty?
    #
    # @return [true, false]
    def has_content?
      empty_token_sort.nil?
    end

    # Tests if the token has a citation.
    #
    # A token has a citation if `citation_part` is not `nil`.
    #
    # @return [true, false]
    def has_citation?
      !citation_part.nil?
    end

    # Checks if the token is a PRO token.
    #
    # @return [true, false]
    def pro?
      empty_token_sort == 'P'
    end

    # Finds the common ancestors that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # Ancestors are returned in the same order as {Token#ancestors}.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.common_ancestors(y, inclusive: false) # => [z, u]
    #   x.common_ancestors(w, inclusive: false) # => [z, u]
    #   x.common_ancestors(x, inclusive: false) # => [w, z, u]
    #
    #   x.common_ancestors(y, inclusive: true) # => [z, u]
    #   x.common_ancestors(w, inclusive: true) # => [w, z, u]
    #   x.common_ancestors(x, inclusive: true) # => [x, w, z, u]
    #
    # @see Token#first_common_ancestor
    # @see Token#first_common_ancestor_path
    #
    # @return [Array<Token>] common ancestors
    def common_ancestors(other_token, inclusive: false)
      if inclusive
        x, y = [self] + ancestors, [other_token] + other_token.ancestors
      else
        x, y = ancestors, other_token.ancestors
      end

      x & y
    end

    # Finds the first common ancestor that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.first_common_ancestor(y, inclusive: false) # => z
    #   x.first_common_ancestor(w, inclusive: false) # => z
    #   x.first_common_ancestor(x, inclusive: false) # => w
    #
    #   x.first_common_ancestor(y, inclusive: true) # => z
    #   x.first_common_ancestor(w, inclusive: true) # => w
    #   x.first_common_ancestor(x, inclusive: true) # => x
    #
    # @see Token#common_ancestors
    # @see Token#first_common_ancestor_path
    #
    # @return [nil, Token] first common ancestor
    def first_common_ancestor(other_token, inclusive: false)
      common_ancestors(other_token, inclusive: inclusive).first
    end

    private

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    POS_POSITIONAL_TAG_SEQUENCE = %i(major minor).freeze

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
      person number tense mood voice gender case degree strength inflection
    ).freeze

    NULL_PARTS_OF_SPEECH = {
      'V' => 'V-',
      'C' => 'C-',
      'P' => 'Pp',
    }.freeze
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  module Tokenization
    # Compiled tokenization regexes keyed by language tag. Initialised at
    # load time so that +split_form+ can fall back to its default splitting
    # strategies even before +load_patterns+ has ever been called.
    @@regexes = {}

    # Loads tokenization patterns from a configuration file.
    #
    # The configuration file should be a JSON file. The keys should
    # be language tags and the values tokenization patterns.
    #
    # The method can be called multiple times. On the first invocation
    # patterns will be loaded, on subsequent invocations patterns will
    # be updated. Only patterns for languages that are defined in the
    # configuration file will be updated, other patterns will remain unchanged.
    #
    # @param filename [String] name of tokenization pattern file
    #
    # @return [Hash] loaded patterns
    #
    def self.load_patterns(filename)
      raise ArgumentError, 'invalid filename' unless filename.is_a?(String)

      patterns = JSON.parse(File.read(filename))

      regexes = patterns.map { |l, p| [l, self.make_regex(p)] }.to_h

      @@regexes.merge!(regexes)
    end

    # Makes a regular expression from a pattern given in the configuration file.
    #
    # The regular expression is anchored to avoid partial matches. Multi-line
    # matches are allowed in case characters that are interpreted as line
    # separators occur in the data.
    #
    # @param pattern [String] tokenization pattern
    #
    # @return [Regexp]
    #
    def self.make_regex(pattern)
      raise ArgumentError, 'invalid pattern' unless pattern.is_a?(String)

      Regexp.new("^#{pattern}$", Regexp::MULTILINE)
    end

    # Tests if a token form is splitable. Any form with more than one character
    # is splitable.
    #
    # @param form [String, nil] token form to test
    #
    # @return [true, false]
    #
    def self.is_splitable?(form)
      raise ArgumentError, 'invalid form' unless form.is_a?(String) || form.nil?

      # nil forms are not splitable; the nil/false distinction is irrelevant
      # to callers since both are falsy.
      form && form.length > 1
    end

    # Splits a token form using the tokenization patterns that apply for a
    # the specified language. Tokenization patterns must already have been
    # loaded for language-specific splitting to apply; otherwise one of the
    # generic fallback strategies is used.
    #
    # @param language_tag [String] ISO 639-3 tag for the language whose patterns
    #   should be used to split the token form
    # @param form [String] token form to split
    #
    # @return [Array<String>]
    #
    def self.split_form(language_tag, form)
      raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
      raise ArgumentError, 'invalid form' unless form.is_a?(String)

      if form[/\W+/]
        # Split on any non-word character like a space or punctuation
        form.split(/(\W+)/)
      elsif @@regexes.key?(language_tag) && form[@@regexes[language_tag]]
        # Apply language-specific pattern
        form.match(@@regexes[language_tag]).captures
      elsif form == ''
        ['']
      else
        # Give up and split by character
        form.split(/()/)
      end
    end
  end
end
|