proiel 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,114 @@
1
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A source object in a treebank.
  class Source < TreebankObject
    # @return [String] ID of the source
    attr_reader :id

    # @return [Treebank] treebank that the source belongs to
    attr_reader :treebank

    # @return [String] language of the source as an ISO 639-3 language tag
    attr_reader :language

    # @return [DateTime] export time for the source
    attr_reader :export_time

    # @return [Hash{Symbol, String}] metadata fields for the source
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    attr_reader :metadata

    # Creates a new source object.
    #
    # @param parent [Treebank] treebank that the source belongs to
    # @param id [String] ID of the source
    # @param export_time [String] export time in a format `DateTime.parse` accepts
    # @param language [String] ISO 639-3 language tag
    # @param metadata [Hash{Symbol, String}] metadata fields for the source
    # @yieldparam source [Source] the new source, allowing the caller to
    #   build the source's divs
    def initialize(parent, id, export_time, language, metadata, &block)
      @treebank = parent
      @id = id.freeze
      @export_time = DateTime.parse(export_time).freeze
      @language = language.freeze
      @metadata = metadata.freeze
      # NOTE: @children remains nil when no block is given; the traversal
      # methods below assume a block was provided at construction time.
      @children = block.call(self) if block_given?
    end

    # @return [String] a complete citation for the source
    def citation
      # citation_part is a metadata field resolved via method_missing below.
      citation_part
    end

    # Returns the printable form of the source with all token forms and any
    # presentation data.
    #
    # @return [String] the printable form of the source
    def printable_form(options = {})
      @children.map { |d| d.printable_form(options) }.compact.join
    end

    # Accesses metadata fields.
    #
    # Any metadata key can be read as if it were an ordinary accessor,
    # e.g. `source.title` when `metadata` contains a `:title` key.
    #
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    def method_missing(method_name, *args, &block)
      if @metadata.key?(method_name) && args.empty?
        @metadata[method_name]
      else
        super
      end
    end

    # Makes the dynamic metadata accessors visible to `respond_to?`, keeping
    # reflection consistent with `method_missing` above.
    def respond_to_missing?(method_name, include_private = false)
      @metadata.key?(method_name) || super
    end

    # Finds all divs in the source.
    #
    # @return [Enumerator] divs in the source
    def divs
      @children.to_enum
    end

    # Finds all sentences in the source.
    #
    # @return [Enumerator] sentences in the source
    #
    # @example Iterating sentences
    #   sentences.each { |s| puts s.id }
    #
    # @example Create an array with only reviewed sentences
    #   sentences.select(&:reviewed?)
    #
    # @example Counting sentences
    #   sentences.count #=> 200
    #
    def sentences
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            y << sentence
          end
        end
      end
    end

    # Finds all tokens in the source.
    #
    # @return [Enumerator] tokens in the source
    #
    # @example Iterating tokens
    #   tokens.each { |t| puts t.id }
    #
    # @example Create an array with only empty tokens
    #   tokens.select(&:is_empty?)
    #
    # @example Counting tokens
    #   puts tokens.count #=> 200
    #
    def tokens
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              y << token
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  module Statistics
    # Computes the line of best fit using the least-squares method.
    #
    # @param x [Array<Number>] x-values
    # @param y [Array<Number>] y-values
    #
    # @return [Array(Float, Float)] y-intercept and slope
    #
    # @raise [ArgumentError] if `x` and `y` are not arrays of equal,
    #   non-zero length
    #
    # @example
    #   x = [8, 2, 11, 6, 5, 4, 12, 9, 6, 1]
    #   y = [3, 10, 3, 6, 8, 12, 1, 4, 9, 14]
    #   a, b = PROIEL::Statistics.least_squares(x, y)
    #   a # => 14.081081081081088
    #   b # => -1.1064189189189197
    #
    def self.least_squares(x, y)
      raise ArgumentError unless x.is_a?(Array)
      raise ArgumentError unless y.is_a?(Array)
      raise ArgumentError, 'array lengths differ' unless x.size == y.size
      # Empty input would previously produce [NaN, NaN] silently; fail loudly.
      raise ArgumentError, 'arrays are empty' if x.empty?

      # Seeding reduce with 0.0 keeps the arithmetic in floating point
      # throughout, so no explicit to_f conversions are needed.
      x_mean = x.reduce(0.0, :+) / x.size
      y_mean = y.reduce(0.0, :+) / y.size
      x_sqsum = x.reduce(0.0) { |sum, n| sum + n ** 2 }
      xy_sum = x.zip(y).reduce(0.0) { |sum, (m, n)| sum + m * n }

      # Sxy and Sx² of the standard least-squares formulation.
      sxy = xy_sum - x.length * x_mean * y_mean
      sx2 = x_sqsum - x.length * (x_mean ** 2)

      # NOTE: if all x-values are identical, sx2 is zero and the slope is
      # mathematically undefined; the float division below then yields
      # NaN (or Infinity), which is left to the caller to detect.
      beta = sxy / sx2
      alfa = y_mean - beta * x_mean

      [alfa, beta]
    end
  end
end
@@ -0,0 +1,407 @@
1
#--
# Copyright (c) 2015 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A token object in a treebank.
  class Token < TreebankObject
    # Memoist supplies the +memoize+ class method used below to cache the
    # results of derived, effectively immutable lookups.
    extend Memoist

    # @return [Fixnum] ID of the token
    attr_reader :id

    # @return [Sentence] parent sentence object
    attr_accessor :sentence

    # @return [nil, Fixnum] ID of head token
    attr_reader :head_id

    # @return [nil, String] token form
    attr_reader :form

    # @return [nil, String] token lemma
    attr_reader :lemma

    # @return [nil, String] token part of speech tag
    attr_reader :part_of_speech

    # @return [nil, String] token part of speech tag
    alias :pos :part_of_speech

    # @return [nil, String] token morphological tag
    attr_reader :morphology

    # @return [nil, String] token relation tag
    attr_reader :relation

    # @return [nil, String] token empty token sort tag
    attr_reader :empty_token_sort

    # @return [nil, String] citation part
    attr_reader :citation_part

    # @return [nil, String] presentation material before form
    attr_reader :presentation_before

    # @return [nil, String] presentation material after form
    attr_reader :presentation_after

    # @return [nil, Fixnum] ID of antecedent token
    attr_reader :antecedent_id

    # @return [nil, String] information status tag
    attr_reader :information_status

    # @return [nil, String] contrast group tag
    attr_reader :contrast_group

    # @return [nil, String] free-form foreign IDs
    attr_reader :foreign_ids

    # @return [Array<Array<String,Fixnum>>] secondary edges as an array of pairs of relation tag and target token ID
    attr_reader :slashes

    # Creates a new token object.
    #
    # Each attribute is type-checked on construction and frozen where
    # applicable; an ArgumentError is raised on the first invalid value.
    #
    # NOTE(review): elements of +slashes+ are assumed to respond to
    # +relation+ and +target_id+ (they are flattened to pairs below) —
    # confirm against the XML reader that constructs tokens.
    def initialize(parent, id, head_id, form, lemma, part_of_speech,
                   morphology, relation, empty_token_sort, citation_part,
                   presentation_before, presentation_after, antecedent_id,
                   information_status, contrast_group, foreign_ids, slashes)
      @sentence = parent

      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
      @id = id

      raise ArgumentError, 'integer or nil expected' unless head_id.nil? or head_id.is_a?(Integer)
      @head_id = head_id

      raise ArgumentError, 'string or nil expected' unless form.nil? or form.is_a?(String)
      @form = form.freeze

      raise ArgumentError, 'string or nil expected' unless lemma.nil? or lemma.is_a?(String)
      @lemma = lemma.freeze

      raise ArgumentError, 'string or nil expected' unless part_of_speech.nil? or part_of_speech.is_a?(String)
      @part_of_speech = part_of_speech.freeze

      raise ArgumentError, 'string or nil expected' unless morphology.nil? or morphology.is_a?(String)
      @morphology = morphology.freeze

      raise ArgumentError, 'string or nil expected' unless relation.nil? or relation.is_a?(String)
      @relation = relation.freeze

      raise ArgumentError, 'string or nil expected' unless empty_token_sort.nil? or empty_token_sort.is_a?(String)
      @empty_token_sort = empty_token_sort.freeze

      raise ArgumentError, 'string or nil expected' unless citation_part.nil? or citation_part.is_a?(String)
      @citation_part = citation_part.freeze

      raise ArgumentError, 'string or nil expected' unless presentation_before.nil? or presentation_before.is_a?(String)
      @presentation_before = presentation_before.freeze

      raise ArgumentError, 'string or nil expected' unless presentation_after.nil? or presentation_after.is_a?(String)
      @presentation_after = presentation_after.freeze

      raise ArgumentError, 'integer or nil expected' unless antecedent_id.nil? or antecedent_id.is_a?(Integer)
      @antecedent_id = antecedent_id

      raise ArgumentError, 'string or nil expected' unless information_status.nil? or information_status.is_a?(String)
      @information_status = information_status.freeze

      raise ArgumentError, 'string or nil expected' unless contrast_group.nil? or contrast_group.is_a?(String)
      @contrast_group = contrast_group.freeze

      raise ArgumentError, 'string or nil expected' unless foreign_ids.nil? or foreign_ids.is_a?(String)
      @foreign_ids = foreign_ids.freeze

      raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
      # Flatten slash objects to [relation, target_id] pairs.
      @slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
    end

    # @return [Div] parent div object
    def div
      @sentence.div
    end

    # @return [Source] parent source object
    def source
      @sentence.div.source
    end

    # @return [Treebank] parent treebank object
    def treebank
      @sentence.div.source.treebank
    end

    # @return [String] language of the token as an ISO 639-3 language tag
    def language
      source.language
    end

    memoize :language

    # @return [nil, String] a complete citation for the token, or nil if the
    #   token has no citation part
    def citation
      if citation_part
        [source.citation_part, citation_part].compact.join(' ')
      else
        nil
      end
    end

    # Returns the printable form of the token with any presentation data.
    #
    # @param custom_token_formatter [Lambda] formatting function for tokens;
    #   called with the token's ID and form when given
    #
    # @return [String] the printable form of the token
    def printable_form(custom_token_formatter: nil)
      printable_form =
        if custom_token_formatter
          custom_token_formatter.call(id, form)
        else
          form
        end

      [presentation_before, printable_form, presentation_after].compact.join
    end

    # @return [Hash<Symbol,String>] token part of speech tag as a hash;
    #   positions tagged '-' (unset) are omitted
    def part_of_speech_hash
      if part_of_speech
        POS_POSITIONAL_TAG_SEQUENCE.zip(part_of_speech.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :part_of_speech_hash

    alias :pos_hash :part_of_speech_hash

    # Returns the part of speech tag if set, but also provides a suitable
    # part of speech tag for empty elements.
    #
    # @return [String] part of speech tag
    def part_of_speech_with_nulls
      part_of_speech || NULL_PARTS_OF_SPEECH[empty_token_sort]
    end

    alias :pos_with_nulls :part_of_speech_with_nulls

    # @return [Hash<Symbol,String>] token morphology tag as a hash;
    #   positions tagged '-' (unset) are omitted
    def morphology_hash
      if morphology
        MORPHOLOGY_POSITIONAL_TAG_SEQUENCE.zip(morphology.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :morphology_hash

    # Checks if the token is the root of its dependency graph.
    #
    # If the token belongs to a sentence that lacks dependency annotation,
    # all tokens are treated as roots. If a sentence has partial or complete
    # dependency annotation there may still be multiple root tokens.
    #
    # @return [true, false]
    def is_root?
      head_id.nil?
    end

    # Finds the head of this token.
    #
    # The head is the parent of this token in the tree that has tokens as
    # nodes and primary relations as edges.
    #
    # @return [nil, Token] head, or nil if this token is a root
    def head
      if is_root?
        nil
      else
        treebank.find_token(head_id)
      end
    end

    memoize :head

    alias :parent :head

    # Finds dependents of this token in the dependency graph.
    #
    # The dependents are the children of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned dependents is indeterminate.
    #
    # @return [Array<Token>] dependents
    def dependents
      @sentence.tokens.select { |t| t.head_id == @id }
    end

    memoize :dependents

    alias :children :dependents

    # Finds ancestors of this token in the dependency graph.
    #
    # The ancestors are the ancestors of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned ancestors is as follows: The first
    # ancestor is the head of this token, the next ancestor is
    # the head of the previous token, and so on.
    #
    # @return [Array<Token>] ancestors
    def ancestors
      if is_root?
        []
      else
        [head] + head.ancestors
      end
    end

    memoize :ancestors

    # Finds descendants of this token in the dependency graph.
    #
    # The descendants are the descendants of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned descendants is indeterminate.
    #
    # @return [Array<Token>] descendants
    def descendents
      dependents.map { |dependent| [dependent] + dependent.descendents }.flatten
    end

    memoize :descendents

    alias :descendants :descendents

    # Tests if the token is empty.
    #
    # A token is empty if it does not have a form. If the token is empty,
    # {Token#empty_token_sort} explains its function.
    #
    # @see Token#has_content?
    #
    # @return [true, false]
    def is_empty?
      !empty_token_sort.nil?
    end

    # Tests if the token has content.
    #
    # A token has content if it has a form.
    #
    # @see Token#is_empty?
    #
    # @return [true, false]
    def has_content?
      empty_token_sort.nil?
    end

    # Tests if the token has a citation.
    #
    # A token has a citation if `citation_part` is not `nil`.
    #
    # @return [true, false]
    def has_citation?
      !citation_part.nil?
    end

    # Checks if the token is a PRO token, i.e. an empty token whose sort
    # tag is 'P'.
    #
    # @return [true, false]
    def pro?
      empty_token_sort == 'P'
    end

    # Finds the common ancestors that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # Ancestors are returned in the same order as {Token#ancestors}.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.common_ancestors(y, inclusive: false) # => [z, u]
    #   x.common_ancestors(w, inclusive: false) # => [z, u]
    #   x.common_ancestors(x, inclusive: false) # => [w, z, u]
    #
    #   x.common_ancestors(y, inclusive: true) # => [z, u]
    #   x.common_ancestors(w, inclusive: true) # => [w, z, u]
    #   x.common_ancestors(x, inclusive: true) # => [x, w, z, u]
    #
    # @see Token#first_common_ancestor
    # @see Token#first_common_ancestor_path
    #
    # @return [Array<Token>] common ancestors
    def common_ancestors(other_token, inclusive: false)
      if inclusive
        x, y = [self] + ancestors, [other_token] + other_token.ancestors
      else
        x, y = ancestors, other_token.ancestors
      end

      # Array intersection preserves the order of x, i.e. nearest-first.
      x & y
    end

    # Finds the first common ancestor that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.first_common_ancestor(y, inclusive: false) # => z
    #   x.first_common_ancestor(w, inclusive: false) # => z
    #   x.first_common_ancestor(x, inclusive: false) # => w
    #
    #   x.first_common_ancestor(y, inclusive: true) # => z
    #   x.first_common_ancestor(w, inclusive: true) # => w
    #   x.first_common_ancestor(x, inclusive: true) # => x
    #
    # @see Token#common_ancestors
    # @see Token#first_common_ancestor_path
    #
    # @return [nil, Token] first common ancestor
    def first_common_ancestor(other_token, inclusive: false)
      common_ancestors(other_token, inclusive: inclusive).first
    end

    private

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    POS_POSITIONAL_TAG_SEQUENCE = %i(major minor)

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
      person number tense mood voice gender case degree strength inflection
    )

    # Surrogate part of speech tags for empty tokens, keyed by empty token
    # sort ('V' = verbal, 'C' = conjunction, 'P' = PRO).
    NULL_PARTS_OF_SPEECH = {
      'V' => 'V-',
      'C' => 'C-',
      'P' => 'Pp',
    }
  end
end
@@ -0,0 +1,90 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module Tokenization
8
+ # Loads tokenization patterns from a configuration file.
9
+ #
10
+ # The configuration file should be a JSON file. The keys should
11
+ # be language tags and the values tokenization patterns.
12
+ #
13
+ # The method can be called multiple times. On the first invocation
14
+ # patterns will be loaded, on subsequent invocations patterns will
15
+ # be updated. Only patterns for languages that are defined in the
16
+ # configuration file will be updated, other patterns will remain unchanged.
17
+ #
18
+ # @param filename [String] name of tokenization pattern file
19
+ #
20
+ # @return [Hash] loaded patterns
21
+ #
22
+ def self.load_patterns(filename)
23
+ raise ArgumentError, 'invalid filename' unless filename.is_a?(String)
24
+
25
+ patterns = JSON.parse(File.read(filename))
26
+
27
+ regexes = patterns.map { |l, p| [l, self.make_regex(p)] }.to_h
28
+
29
+ @@regexes ||= {}
30
+ @@regexes.merge!(regexes)
31
+ end
32
+
33
+ # Makes a regular expression from a pattern given in the configuration file.
34
+ #
35
+ # The regular expression is to avoid partial matches. Multi-line matches
36
+ # are allowed in case characters that are interpreted as line separators
37
+ # occur in the data.
38
+ #
39
+ # @param pattern [String] tokenization pattern
40
+ #
41
+ # @return [Regexp]
42
+ #
43
+ def self.make_regex(pattern)
44
+ raise ArgumentError, 'invalid pattern' unless pattern.is_a?(String)
45
+
46
+ Regexp.new("^#{pattern}$", Regexp::MULTILINE)
47
+ end
48
+
49
+ # Tests if a token form is splitable. Any form with more than one character
50
+ # is splitable.
51
+ #
52
+ # @param form [String, nil] token form to Tests
53
+ #
54
+ # @return [true, false]
55
+ #
56
+ def self.is_splitable?(form)
57
+ raise ArgumentError, 'invalid form' unless form.is_a?(String) or form.nil?
58
+
59
+ form and form.length > 1
60
+ end
61
+
62
+ # Splits a token form using the tokenization patterns that apply for a
63
+ # the specified language. Tokenization patterns must already have been
64
+ # loaded.
65
+ #
66
+ # @param language_tag [String] ISO 639-3 tag for the language whose patterns
67
+ # should be used to split the token form
68
+ # @param form [String] token form to split
69
+ #
70
+ # @return [Array<String>]
71
+ #
72
+ def self.split_form(language_tag, form)
73
+ raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
74
+ raise ArgumentError, 'invalid form' unless form.is_a?(String)
75
+
76
+ if form[/\W+/]
77
+ # Split on any non-word character like a space or punctuation
78
+ form.split(/(\W+)/)
79
+ elsif @@regexes.key?(language_tag) and form[@@regexes[language_tag]]
80
+ # Apply language-specific pattern
81
+ form.match(@@regexes[language_tag]).captures
82
+ elsif form == ''
83
+ ['']
84
+ else
85
+ # Give up and split by character
86
+ form.split(/()/)
87
+ end
88
+ end
89
+ end
90
+ end