proiel 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # A source object in a treebank.
  class Source < TreebankObject
    # @return [String] ID of the source
    attr_reader :id

    # @return [Treebank] treebank that the source belongs to
    attr_reader :treebank

    # @return [String] language of the source as an ISO 639-3 language tag
    attr_reader :language

    # @return [DateTime] export time for the source
    attr_reader :export_time

    # @return [Hash{Symbol, String}] metadata fields for the source
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    attr_reader :metadata

    # Creates a new source object.
    #
    # @param parent [Treebank] treebank that the source belongs to
    # @param id [String] ID of the source
    # @param export_time [String] export time in a format parsable by DateTime
    # @param language [String] language of the source as an ISO 639-3 tag
    # @param metadata [Hash{Symbol, String}] metadata fields for the source
    #
    # @yieldparam source [Source] the source object under construction
    # @yieldreturn [Array<Div>] children of the source
    def initialize(parent, id, export_time, language, metadata, &block)
      @treebank = parent
      @id = id.freeze
      @export_time = DateTime.parse(export_time).freeze
      @language = language.freeze
      @metadata = metadata.freeze

      # Default to an empty list of children when no block is given so that
      # divs, sentences, tokens and printable_form remain safe to call.
      @children = block_given? ? block.call(self) : []
    end

    # @return [String] a complete citation for the source
    def citation
      citation_part
    end

    # Returns the printable form of the source with all token forms and any
    # presentation data.
    #
    # @return [String] the printable form of the source
    def printable_form(options = {})
      @children.map { |d| d.printable_form(options) }.compact.join
    end

    # Accesses metadata fields.
    #
    # Metadata fields are exposed as reader methods named after the
    # metadata keys.
    #
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    def method_missing(method_name, *args, &block)
      if @metadata.key?(method_name) && args.empty?
        @metadata[method_name]
      else
        super
      end
    end

    # Advertises the dynamically handled metadata fields so that
    # +respond_to?+ agrees with +method_missing+.
    def respond_to_missing?(method_name, include_private = false)
      @metadata.key?(method_name) || super
    end

    # Finds all divs in the source.
    #
    # @return [Enumerator] divs in the source
    def divs
      @children.to_enum
    end

    # Finds all sentences in the source.
    #
    # @return [Enumerator] sentences in the source
    #
    # @example Iterating sentences
    #   sentences.each { |s| puts s.id }
    #
    # @example Create an array with only reviewed sentences
    #   sentences.select(&:reviewed?)
    #
    # @example Counting sentences
    #   sentences.count #=> 200
    #
    def sentences
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            y << sentence
          end
        end
      end
    end

    # Finds all tokens in the source.
    #
    # @return [Enumerator] tokens in the source
    #
    # @example Iterating tokens
    #   tokens.each { |t| puts t.id }
    #
    # @example Create an array with only empty tokens
    #   tokens.select(&:is_empty?)
    #
    # @example Counting tokens
    #   puts tokens.count #=> 200
    #
    def tokens
      Enumerator.new do |y|
        # Reuse the sentence traversal so the two enumerators cannot drift
        # out of sync.
        sentences.each do |sentence|
          sentence.tokens.each do |token|
            y << token
          end
        end
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Statistical utility functions.
  module Statistics
    # Computes the line of best fit using the least-squares method.
    #
    # @param x [Array<Number>] x-values
    # @param y [Array<Number>] y-values
    #
    # @return [Array(Float, Float)] y-intercept and slope
    #
    # @raise [ArgumentError] if +x+ and +y+ are not arrays of equal,
    #   non-zero length
    #
    # @example
    #   x = [8, 2, 11, 6, 5, 4, 12, 9, 6, 1]
    #   y = [3, 10, 3, 6, 8, 12, 1, 4, 9, 14]
    #   a, b = PROIEL::Statistics.least_squares(x, y)
    #   a # => 14.081081081081088
    #   b # => -1.1064189189189197
    #
    def self.least_squares(x, y)
      raise ArgumentError, 'array expected' unless x.is_a?(Array)
      raise ArgumentError, 'array expected' unless y.is_a?(Array)
      raise ArgumentError, 'array lengths differ' unless x.size == y.size
      # Guard against empty input, which would otherwise yield NaN (0.0/0).
      raise ArgumentError, 'arrays are empty' if x.empty?

      n = x.size
      x_mean = x.sum.to_f / n
      y_mean = y.sum.to_f / n

      # Corrected sum of products and corrected sum of squares.
      sxy = x.zip(y).sum { |xi, yi| xi * yi } - n * x_mean * y_mean
      sx2 = x.sum { |xi| xi**2 } - n * x_mean**2

      beta = sxy / sx2
      alpha = y_mean - beta * x_mean

      [alpha, beta]
    end
  end
end
@@ -0,0 +1,407 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # A token object in a treebank.
  class Token < TreebankObject
    extend Memoist

    # @return [Fixnum] ID of the token
    attr_reader :id

    # @return [Sentence] parent sentence object
    attr_accessor :sentence

    # @return [nil, Fixnum] ID of head token
    attr_reader :head_id

    # @return [nil, String] token form
    attr_reader :form

    # @return [nil, String] token lemma
    attr_reader :lemma

    # @return [nil, String] token part of speech tag
    attr_reader :part_of_speech

    # @return [nil, String] token part of speech tag
    alias :pos :part_of_speech

    # @return [nil, String] token morphological tag
    attr_reader :morphology

    # @return [nil, String] token relation tag
    attr_reader :relation

    # @return [nil, String] token empty token sort tag
    attr_reader :empty_token_sort

    # @return [nil, String] citation part
    attr_reader :citation_part

    # @return [nil, String] presentation material before form
    attr_reader :presentation_before

    # @return [nil, String] presentation material after form
    attr_reader :presentation_after

    # @return [nil, Fixnum] ID of antecedent token
    attr_reader :antecedent_id

    # @return [nil, String] information status tag
    attr_reader :information_status

    # @return [nil, String] contrast group tag
    attr_reader :contrast_group

    # @return [nil, String] free-form foreign IDs
    attr_reader :foreign_ids

    # @return [Array<Array<String,Fixnum>>] secondary edges as an array of
    #   pairs of relation tag and target token ID
    attr_reader :slashes

    # Creates a new token object.
    #
    # @param parent [Sentence] sentence that the token belongs to
    # @param slashes [Array] secondary edges as objects responding to
    #   +relation+ and +target_id+
    #
    # @raise [ArgumentError] if an attribute has an unexpected type
    def initialize(parent, id, head_id, form, lemma, part_of_speech,
                   morphology, relation, empty_token_sort, citation_part,
                   presentation_before, presentation_after, antecedent_id,
                   information_status, contrast_group, foreign_ids, slashes)
      @sentence = parent

      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
      @id = id

      @head_id = nil_or_integer!(head_id)
      @form = nil_or_string!(form)
      @lemma = nil_or_string!(lemma)
      @part_of_speech = nil_or_string!(part_of_speech)
      @morphology = nil_or_string!(morphology)
      @relation = nil_or_string!(relation)
      @empty_token_sort = nil_or_string!(empty_token_sort)
      @citation_part = nil_or_string!(citation_part)
      @presentation_before = nil_or_string!(presentation_before)
      @presentation_after = nil_or_string!(presentation_after)
      @antecedent_id = nil_or_integer!(antecedent_id)
      @information_status = nil_or_string!(information_status)
      @contrast_group = nil_or_string!(contrast_group)
      @foreign_ids = nil_or_string!(foreign_ids)

      raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
      @slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
    end

    # @return [Div] parent div object
    def div
      @sentence.div
    end

    # @return [Source] parent source object
    def source
      @sentence.div.source
    end

    # @return [Treebank] parent treebank object
    def treebank
      @sentence.div.source.treebank
    end

    # @return [String] language of the token as an ISO 639-3 language tag
    def language
      source.language
    end

    memoize :language

    # @return [nil, String] a complete citation for the token, or +nil+ if
    #   the token has no citation part
    def citation
      return nil unless citation_part

      [source.citation_part, citation_part].compact.join(' ')
    end

    # Returns the printable form of the token with any presentation data.
    #
    # @param custom_token_formatter [Lambda] formatting function for tokens,
    #   called with the token ID and form
    #
    # @return [String] the printable form of the token
    def printable_form(custom_token_formatter: nil)
      formatted_form =
        if custom_token_formatter
          custom_token_formatter.call(id, form)
        else
          form
        end

      [presentation_before, formatted_form, presentation_after].compact.join
    end

    # @return [Hash<Symbol,String>] token part of speech tag as a hash
    def part_of_speech_hash
      if part_of_speech
        POS_POSITIONAL_TAG_SEQUENCE.zip(part_of_speech.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :part_of_speech_hash

    alias :pos_hash :part_of_speech_hash

    # Returns the part of speech tag if set, but also provides a suitable
    # part of speech tag for empty elements.
    #
    # @return [String] part of speech tag
    def part_of_speech_with_nulls
      part_of_speech || NULL_PARTS_OF_SPEECH[empty_token_sort]
    end

    alias :pos_with_nulls :part_of_speech_with_nulls

    # @return [Hash<Symbol,String>] token morphology tag as a hash
    def morphology_hash
      if morphology
        MORPHOLOGY_POSITIONAL_TAG_SEQUENCE.zip(morphology.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :morphology_hash

    # Checks if the token is the root of its dependency graph.
    #
    # If the token belongs to a sentence that lacks dependency annotation,
    # all tokens are treated as roots. If a sentence has partial or complete
    # dependency annotation there may still be multiple root tokens.
    #
    # @return [true, false]
    def is_root?
      head_id.nil?
    end

    # Finds the head of this token.
    #
    # The head is the parent of this token in the tree that has tokens as
    # nodes and primary relations as edges.
    #
    # @return [nil, Token] head, or +nil+ if the token is a root
    def head
      treebank.find_token(head_id) unless is_root?
    end

    memoize :head

    alias :parent :head

    # Finds dependents of this token in the dependency graph.
    #
    # The dependents are the children of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned dependents is indeterminate.
    #
    # @return [Array<Token>] dependents
    def dependents
      @sentence.tokens.select { |t| t.head_id == @id }
    end

    memoize :dependents

    alias :children :dependents

    # Finds ancestors of this token in the dependency graph.
    #
    # The ancestors are the ancestors of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned ancestors is as follows: The first
    # ancestor is the head of this token, the next ancestor is
    # the head of the previous token, and so on.
    #
    # @return [Array<Token>] ancestors
    def ancestors
      is_root? ? [] : [head] + head.ancestors
    end

    memoize :ancestors

    # Finds descendents of this token in the dependency graph.
    #
    # The descendents are the descendents of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned descendents is indeterminate.
    #
    # @return [Array<Token>] descendents
    def descendents
      dependents.flat_map { |dependent| [dependent] + dependent.descendents }
    end

    memoize :descendents

    alias :descendants :descendents

    # Tests if the token is empty.
    #
    # A token is empty if it does not have a form. If the token is empty,
    # {Token#empty_token_sort} explains its function.
    #
    # @see Token#has_content?
    #
    # @return [true, false]
    def is_empty?
      !empty_token_sort.nil?
    end

    # Tests if the token has content.
    #
    # A token has content if it has a form.
    #
    # @see Token#is_empty?
    #
    # @return [true, false]
    def has_content?
      empty_token_sort.nil?
    end

    # Tests if the token has a citation.
    #
    # A token has a citation if `citation_part` is not `nil`.
    #
    # @return [true, false]
    def has_citation?
      !citation_part.nil?
    end

    # Checks if the token is a PRO token.
    #
    # @return [true, false]
    def pro?
      empty_token_sort == 'P'
    end

    # Finds the common ancestors that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # Ancestors are returned in the same order as {Token#ancestors}.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.common_ancestors(y, inclusive: false) # => [z, u]
    #   x.common_ancestors(w, inclusive: false) # => [z, u]
    #   x.common_ancestors(x, inclusive: false) # => [w, z, u]
    #
    #   x.common_ancestors(y, inclusive: true) # => [z, u]
    #   x.common_ancestors(w, inclusive: true) # => [w, z, u]
    #   x.common_ancestors(x, inclusive: true) # => [x, w, z, u]
    #
    # @see Token#first_common_ancestor
    # @see Token#first_common_ancestor_path
    #
    # @return [Array<Token>] common ancestors
    def common_ancestors(other_token, inclusive: false)
      this_chain = inclusive ? [self] + ancestors : ancestors
      other_chain = inclusive ? [other_token] + other_token.ancestors : other_token.ancestors

      this_chain & other_chain
    end

    # Finds the first common ancestor that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.first_common_ancestor(y, inclusive: false) # => z
    #   x.first_common_ancestor(w, inclusive: false) # => z
    #   x.first_common_ancestor(x, inclusive: false) # => w
    #
    #   x.first_common_ancestor(y, inclusive: true) # => z
    #   x.first_common_ancestor(w, inclusive: true) # => w
    #   x.first_common_ancestor(x, inclusive: true) # => x
    #
    # @see Token#common_ancestors
    # @see Token#first_common_ancestor_path
    #
    # @return [nil, Token] first common ancestor
    def first_common_ancestor(other_token, inclusive: false)
      common_ancestors(other_token, inclusive: inclusive).first
    end

    private

    # Validates that +value+ is +nil+ or an Integer and returns it.
    def nil_or_integer!(value)
      raise ArgumentError, 'integer or nil expected' unless value.nil? || value.is_a?(Integer)
      value
    end

    # Validates that +value+ is +nil+ or a String and returns it frozen.
    def nil_or_string!(value)
      raise ArgumentError, 'string or nil expected' unless value.nil? || value.is_a?(String)
      value.freeze
    end

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    POS_POSITIONAL_TAG_SEQUENCE = %i(major minor).freeze

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
      person number tense mood voice gender case degree strength inflection
    ).freeze

    # Substitute part of speech tags for empty tokens, keyed by empty token
    # sort.
    NULL_PARTS_OF_SPEECH = {
      'V' => 'V-',
      'C' => 'C-',
      'P' => 'Pp',
    }.freeze
  end
end
@@ -0,0 +1,90 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Language-specific tokenization support.
  module Tokenization
    # Compiled tokenization regexes keyed by language tag. A class instance
    # variable (not a class variable) initialised eagerly so that
    # +split_form+ is safe to call before any patterns have been loaded.
    @regexes = {}

    # Loads tokenization patterns from a configuration file.
    #
    # The configuration file should be a JSON file. The keys should
    # be language tags and the values tokenization patterns.
    #
    # The method can be called multiple times. On the first invocation
    # patterns will be loaded, on subsequent invocations patterns will
    # be updated. Only patterns for languages that are defined in the
    # configuration file will be updated, other patterns will remain unchanged.
    #
    # @param filename [String] name of tokenization pattern file
    #
    # @return [Hash] loaded patterns
    #
    def self.load_patterns(filename)
      raise ArgumentError, 'invalid filename' unless filename.is_a?(String)

      patterns = JSON.parse(File.read(filename))
      compiled = patterns.map { |language, pattern| [language, make_regex(pattern)] }.to_h

      @regexes.merge!(compiled)
    end

    # Makes a regular expression from a pattern given in the configuration file.
    #
    # The regular expression is anchored to avoid partial matches.
    # Multi-line matches are allowed in case characters that are interpreted
    # as line separators occur in the data.
    #
    # @param pattern [String] tokenization pattern
    #
    # @return [Regexp]
    #
    def self.make_regex(pattern)
      raise ArgumentError, 'invalid pattern' unless pattern.is_a?(String)

      Regexp.new("^#{pattern}$", Regexp::MULTILINE)
    end

    # Tests if a token form is splitable. Any form with more than one
    # character is splitable.
    #
    # @param form [String, nil] token form to test
    #
    # @return [true, false]
    #
    def self.is_splitable?(form)
      raise ArgumentError, 'invalid form' unless form.is_a?(String) || form.nil?

      !form.nil? && form.length > 1
    end

    # Splits a token form using the tokenization patterns that apply for
    # the specified language. Tokenization patterns must already have been
    # loaded for the language-specific patterns to take effect.
    #
    # @param language_tag [String] ISO 639-3 tag for the language whose patterns
    #   should be used to split the token form
    # @param form [String] token form to split
    #
    # @return [Array<String>]
    #
    def self.split_form(language_tag, form)
      raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
      raise ArgumentError, 'invalid form' unless form.is_a?(String)

      pattern = @regexes[language_tag]

      if form[/\W+/]
        # Split on any non-word character like a space or punctuation,
        # keeping the separators.
        form.split(/(\W+)/)
      elsif pattern && form[pattern]
        # Apply the language-specific pattern.
        form.match(pattern).captures
      elsif form == ''
        ['']
      else
        # Give up and split by character. The empty capture group
        # deliberately interleaves empty strings between the characters.
        form.split(/()/)
      end
    end
  end
end