proiel 1.2.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5ead6b41029599129af6a717b0398980876a139c
4
- data.tar.gz: 11a555a98a41029dfb721e9613bbb5be23847cfd
2
+ SHA256:
3
+ metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
4
+ data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
5
5
  SHA512:
6
- metadata.gz: 475971ca8443be39f3ef2a634ff8afd50931b248130e439991d473d06fcf0b7695d38d0629856654738ecaa853af1d63ba8675849eef52055b431252d07e205e
7
- data.tar.gz: c33088acdb1e3fb130386204b02e3f54ca6566a05310c3397e8471c88360a742aafb8e5edc515e0d185280625994c606fed987f8cb7518a23c34a5e13686474b
6
+ metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
7
+ data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309
data/README.md CHANGED
@@ -12,7 +12,7 @@ PROIEL annotation scheme and the PROIEL XML-based interchange format.
12
12
 
13
13
  ## Installation
14
14
 
15
- To install this library you need Ruby 2.1 or newer.
15
+ This library requires Ruby >= 2.2. Install as
16
16
 
17
17
  ```shell
18
18
  gem install proiel
@@ -35,7 +35,7 @@ bundle
35
35
  ```
36
36
 
37
37
  To download a sample treebank, initialize a new git repository and add the
38
- [PROIEL treebank](http://proiel.github.io) as a submodule:
38
+ [PROIEL treebank](https://proiel.github.io) as a submodule:
39
39
 
40
40
  ```shell
41
41
  git init
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -13,6 +13,8 @@ require 'nokogiri'
13
13
  require 'singleton'
14
14
  require 'erb'
15
15
  require 'open3'
16
+ require 'set'
17
+ require 'builder'
16
18
 
17
19
  require 'proiel/version'
18
20
  require 'proiel/utils'
@@ -31,3 +33,6 @@ require 'proiel/div'
31
33
  require 'proiel/sentence'
32
34
  require 'proiel/token'
33
35
  require 'proiel/visualization'
36
+ require 'proiel/chronology'
37
+ require 'proiel/valency'
38
+ require 'proiel/dictionary'
@@ -0,0 +1,80 @@
1
+ #--
2
+ # Copyright (c) 2016-2017 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+
7
# Methods for parsing chronological descriptions. Extra care is taken to get
# the interpretation of centuries and ranges involving the transition between 1
# BC and AD 1 correct.
module PROIEL
  module Chronology
    # Computes the chronological midpoint of a chronological description.
    #
    # @param s [String] chronological description
    #
    # @return [Integer] the year itself for a single year, otherwise the year
    #   that contains the midpoint of the range
    #
    # @raise [ArgumentError] if the description cannot be parsed
    #
    # @example
    #   midpoint('1000') # => 1000
    #   midpoint('1000 BC') # => -1000
    #   midpoint('1000-1020') # => 1010
    def self.midpoint(s)
      parsed = parse(s)

      case parsed
      when Array
        from, to = parsed

        if from < 0 && to > 0
          # Handle the missing year 0 by shifting years after 1 BC down by 1
          # and then shifting the midpoint back up again unless negative.
          y = (from + to - 1) / 2.0
          y < 0 ? y.floor : (y + 1).floor
        else
          # A non-integer midpoint is within the year of the integer part.
          ((from + to) / 2.0).floor
        end
      when Integer
        parsed
      else
        raise ArgumentError, 'integer or array expected'
      end
    end

    # Parses a chronological description. The syntax of chronological
    # descriptions is explained in the [PROIEL XML
    # documentation](http://proiel.github.io/handbook/developer/proielxml.html#chronological-data).
    #
    # The whole string must match one of the supported formats; the patterns
    # are anchored with \A and \z so that a valid description on one line of a
    # multi-line string is not accepted by accident.
    #
    # @param s [String] chronological description
    #
    # @return [Integer, Array<Integer,Integer>] a year, or a two-element array
    #   with the first and last year of a range. BC years are negative.
    #
    # @raise [ArgumentError] if the format is not recognised, if a year or
    #   century is zero, or if a range does not run from an earlier to a later
    #   year
    #
    # @example
    #   parse('1000') # => 1000
    #   parse('1000 BC') # => -1000
    #   parse('1000-1020') # => [1000,1020]
    #   parse('1000 BC-1020') # => [-1000,1020]
    def self.parse(s)
      case s
      when /\A\s*(?:c\.\s+)?(\d+)(\s+BC)?\s*\z/
        match = Regexp.last_match
        year = match[1].to_i
        year = -year if match[2]

        # There is no year zero in the Julian calendar
        raise ArgumentError, 'invalid year' if year.zero?

        year
      when /\A\s*(1st|2nd|3rd|\d+th)\s+c\.\s*\z/
        century = Regexp.last_match(1).to_i

        # A zeroth century would produce a range that includes year 0
        raise ArgumentError, 'invalid century' if century.zero?

        last_year = century * 100
        [last_year - 99, last_year]
      when /\A\s*(1st|2nd|3rd|\d+th)\s+c\.\s+BC\s*\z/
        century = Regexp.last_match(1).to_i

        # A zeroth century would produce a range that includes year 0
        raise ArgumentError, 'invalid century' if century.zero?

        first_year = -century * 100
        [first_year, first_year + 99]
      when /\A\s*(?:c\.\s+)?\d+(\s+BC)?\s*-\s*(c\.\s+)?\d+(\s+BC)?\s*\z/
        # Each side of the hyphen is a single-year description in its own right
        from, to = s.split('-').map { |part| parse(part) }

        raise ArgumentError, 'invalid range' unless from < to

        [from, to]
      else
        raise ArgumentError, 'unexpected format'
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module PROIEL::Dictionary; end
2
+
3
+ require 'proiel/dictionary/builder'
@@ -0,0 +1,201 @@
1
+ #--
2
+ # Copyright (c) 2016-2017 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+
7
# Methods for synthesising and manipulating dictionaries from treebank data.
module PROIEL::Dictionary
  # Collects per-lemma statistics from the tokens of one or more sources and
  # serialises the result as XML.
  class Builder
    # Value written to the `schema-version` attribute of the exported XML.
    CURRENT_SCHEMA_VERSION = '3.0'

    attr_reader :license, :language, :sources, :lemmata

    def initialize
      @language = nil
      @license = nil
      @sources = []
      @lemmata = {}
      @valency = PROIEL::Valency::Lexicon.new
    end

    # Indexes all tokens of a source. The first source added fixes the
    # language and license of the dictionary; later sources must match both.
    #
    # @param source [PROIEL::Source] source to add
    #
    # @raise [ArgumentError] if `source` is not a source or if its language or
    #   license differs from that of previously added sources
    def add_source!(source)
      raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
      raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
      raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

      @language ||= source.language
      @license ||= source.license
      @sources << source

      source.tokens.each do |token|
        index_token!(token)
      end

      # Homographs depend on the complete set of lemmata, so recompute them
      # after every source.
      index_homographs!
    end

    # Writes the dictionary as XML.
    #
    # @param io [IO] target that the XML markup is written to
    def to_xml(io)
      xml = ::Builder::XmlMarkup.new(target: io, indent: 2)
      xml.instruct! :xml, version: '1.0', encoding: 'UTF-8'
      xml.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
        xml.dictionary(language: @language) do
          xml.sources do
            @sources.each { |source| xml.source(id: source.id, license: source.license) }
          end

          xml.lemmata(n: @lemmata.count) do
            # Case-insensitive ordering by the encoded lemma key
            ordered = @lemmata.sort_by { |key, _| key.downcase }
            ordered.each { |key, entry| lemma_to_xml(xml, key, entry) }
          end
        end
      end
    end

    private

    # Emits one lemma element with all its sections.
    def lemma_to_xml(xml, key, entry)
      xml.lemma(form: key, part_of_speech: entry[:part_of_speech], n: entry[:n]) do
        distribution_to_xml(xml, entry)
        glosses_to_xml(xml, entry)
        homographs_to_xml(xml, entry)
        paradigm_to_xml(xml, entry)
        valency_to_xml(xml, entry)
      end
    end

    # Emits the number of occurrences per source, ordered by source ID.
    def distribution_to_xml(xml, entry)
      xml.distribution do
        entry[:distribution].sort_by(&:first).each do |source_id, n|
          xml.source(id: source_id, n: n)
        end
      end
    end

    # Emits an (as yet empty) glosses element when glosses are present.
    def glosses_to_xml(xml, entry)
      return if entry[:glosses].empty?

      xml.glosses do
        # TODO
      end
    end

    # Emits references to the other lemma keys sharing this lemma's base form.
    def homographs_to_xml(xml, entry)
      return if entry[:homographs].empty?

      xml.homographs do
        entry[:homographs].each { |homograph| xml.lemma form: homograph }
      end
    end

    # Emits attested forms grouped by morphology, with occurrence counts.
    def paradigm_to_xml(xml, entry)
      return if entry[:paradigm].empty?

      xml.paradigm do
        entry[:paradigm].sort_by(&:first).each do |morphology, forms|
          xml.slot1 morphology: morphology do
            forms.sort_by(&:first).each do |form, n|
              xml.slot2 form: form, n: n
            end
          end
        end
      end
    end

    # Emits valency frames sorted by obliqueness; for each frame the token IDs
    # are listed per partition (`a` first, then `r`), skipping empty ones.
    def valency_to_xml(xml, entry)
      return if entry[:valency].empty?

      xml.valency do
        frames = entry[:valency].map { |arguments, token_ids| { arguments: arguments, tokens: token_ids } }

        PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
          xml.frame do
            xml.arguments do
              frame[:arguments].each { |argument| xml.argument argument }
            end

            %w(a r).each do |flag|
              token_ids = frame[:tokens][flag.to_sym]
              next if token_ids.empty?

              xml.tokens flags: flag, n: token_ids.count do
                token_ids.each { |token_id| xml.token id: token_id }
              end
            end
          end
        end
      end
    end

    # Records, for every lemma key, the other keys that share the part before
    # the first comma (the lemma without its part of speech).
    def index_homographs!
      grouped = @lemmata.keys.group_by { |key| key.split(',').first }

      grouped.each do |_, keys|
        next unless keys.count > 1

        keys.each do |key|
          @lemmata[key][:homographs] = keys.reject { |other| other == key }
        end
      end
    end

    # Updates the counters for the lemma of a token. Tokens lacking a lemma or
    # a part of speech are ignored.
    def index_token!(token)
      return unless token.lemma && token.part_of_speech

      key = [token.lemma, token.part_of_speech].join(',')

      entry = (@lemmata[key] ||= {
        lemma: token.lemma,
        part_of_speech: token.part_of_speech,
        distribution: {},
        glosses: {},
        homographs: [],
        paradigm: {},
        n: 0,
        valency: {},
      })

      distribution = entry[:distribution]
      distribution[token.source.id] ||= 0
      distribution[token.source.id] += 1

      forms = (entry[:paradigm][token.morphology] ||= {})
      forms[token.form] ||= 0
      forms[token.form] += 1

      entry[:n] += 1

      # Only verbal nodes get valency frames
      return unless token.part_of_speech[/^V/]

      frame = PROIEL::Valency::Arguments.get_argument_frame(token)
      partitions = (entry[:valency][frame] ||= { a: [], r: [] })

      # Tokens with an auxiliary dependent tagged 'Pk' go into the :r
      # partition, all others into :a.
      has_pk_aux = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
      partitions[has_pk_aux ? :r : :a] << token.id
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -135,5 +135,21 @@ module PROIEL
135
135
  end
136
136
  end
137
137
  end
138
+
139
# Looks up the div that this div is aligned to.
#
# @param aligned_source [Source] source whose treebank is searched for the
#   aligned div
#
# @return [Div, NilClass] aligned div, or nil if this div has no alignment ID
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_div(alignment_id)
end
145
+
146
# Infers aligned divs from the inferred alignments of the sentences in this
# div.
#
# @param aligned_source [Source] source that the sentences are aligned to
#
# @return [Array<Div>] inferred aligned divs without duplicates
def inferred_alignment(aligned_source)
  aligned_sentences = sentences.map { |sentence| sentence.inferred_alignment(aligned_source) }

  aligned_sentences.flatten.compact.map(&:div).uniq
end
138
154
  end
139
155
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -16,9 +16,11 @@ module PROIEL
16
16
  # Creates a new validator for a PROIEL XML file.
17
17
  #
18
18
  # @param filename [String] name of PROIEL XML file to validate
19
+ # @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
19
20
  #
20
- def initialize(filename)
21
+ def initialize(filename, aligned_filename = nil)
21
22
  @filename = filename
23
+ @aligned_filename = aligned_filename
22
24
  @errors = []
23
25
  end
24
26
 
@@ -154,6 +156,27 @@ module PROIEL
154
156
  end
155
157
  end
156
158
 
159
+ # Pass 5: if div is aligned, sentences and tokens within should belong
160
+ # to aligned div(s); if sentence aligned, tokens within should belong
161
+ # to aligned sentence(s). Skip if no alignment_id on source (see pass
162
+ # 4) or if aligned source not available.
163
+ if @aligned_filename
164
+ aligned_tb = PROIEL::Treebank.new
165
+ aligned_tb.load_from_xml(@aligned_filename)
166
+
167
+ tb.sources.each do |source|
168
+ if source.alignment_id
169
+ aligned_source = aligned_tb.find_source(source.alignment_id)
170
+
171
+ if aligned_source
172
+ check_alignment_integrity(errors, source, aligned_source)
173
+ else
174
+ errors << "Aligned source not available in treebank"
175
+ end
176
+ end
177
+ end
178
+ end
179
+
157
180
  # Decide if there were any errors
158
181
  if errors.empty?
159
182
  true
@@ -182,6 +205,52 @@ module PROIEL
182
205
  errors << "Token #{token.id}: #{attribute_name} is null"
183
206
  end
184
207
  end
208
+
209
# Checks that alignments declared on tokens, sentences and divs of a source
# are consistent with each other. Token alignments are resolved in the aligned
# source; the sentences and divs of the resolved tokens are the "inferred"
# alignments, which must agree with any alignment declared on the sentence or
# div itself.
#
# @param errors [Array<String>] list that error messages are appended to
# @param source [Source] source whose alignments are checked
# @param aligned_source [Source] source that `source` is aligned to
def check_alignment_integrity(errors, source, aligned_source)
  # Declared IDs are parsed from a comma-separated string and are therefore
  # sorted as strings. Inferred IDs come from objects and may be integers, so
  # they are converted to strings before sorting; otherwise lists such as
  # 9,10 would be ordered differently on the two sides and reported as a
  # mismatch.
  comparable = ->(ids) { ids.map(&:to_s).sort }

  source.divs.each do |div|
    target_sentences =
      div.sentences.map do |sentence|
        target_tokens =
          sentence.tokens.select(&:alignment_id).map do |token|
            # Check that target token exists in aligned source
            aligned_token = aligned_source.treebank.find_token(token.alignment_id)

            if aligned_token
              aligned_token
            else
              errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
              nil
            end
          end

        inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq

        if sentence.alignment_id
          declared_ids = sentence.alignment_id.to_s.split(',').sort
          inferred_ids = inferred_target_sentences.map(&:id).sort

          # FIXME: handle the case of no inferred IDs, in which we have to use
          # the declared IDs and check that the objects exist
          if declared_ids != comparable.call(inferred_ids)
            errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{declared_ids.join(',')} but inferred alignment is #{aligned_source.id}:#{inferred_ids.join(',')}"
          end
        end

        inferred_target_sentences
      end

    inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq

    if div.alignment_id
      declared_ids = div.alignment_id.to_s.split(',').sort
      inferred_ids = inferred_target_divs.map(&:id).sort

      # FIXME: handle the case of no inferred IDs, in which we have to use
      # the declared IDs and check that the objects exist
      if declared_ids != comparable.call(inferred_ids)
        errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{declared_ids.join(',')} but inferred alignment is #{aligned_source.id}:#{inferred_ids.join(',')}"
      end
    end
  end
end
185
254
  end
186
255
  end
187
256
  end
@@ -116,7 +116,7 @@ module PROIEL
116
116
  # @return [String] the printable form of the sentence
117
117
  def printable_form(options = {})
118
118
  [presentation_before,
119
- @children.map { |t| t.printable_form(options) },
119
+ @children.reject(&:is_empty?).map { |t| t.printable_form(options) },
120
120
  presentation_after].compact.join
121
121
  end
122
122
 
@@ -217,5 +217,21 @@ module PROIEL
217
217
  def tokens
218
218
  @children.to_enum
219
219
  end
220
+
221
# Looks up the sentence that this sentence is aligned to.
#
# @param aligned_source [Source] source whose treebank is searched for the
#   aligned sentence
#
# @return [Sentence, NilClass] aligned sentence, or nil if this sentence has
#   no alignment ID
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_sentence(alignment_id)
end
227
+
228
# Infers aligned sentences from the alignments of the tokens in this
# sentence. Tokens without an alignment ID are ignored.
#
# @param aligned_source [Source] source that the tokens are aligned to
#
# @return [Array<Sentence>] inferred aligned sentences without duplicates
def inferred_alignment(aligned_source)
  aligned_tokens = tokens.select(&:alignment_id).map { |token| token.alignment(aligned_source) }

  aligned_tokens.flatten.compact.map(&:sentence).uniq
end
220
236
  end
221
237
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -160,12 +160,13 @@ module PROIEL
160
160
  # Returns the printable form of the token with any presentation data.
161
161
  #
162
162
  # @param custom_token_formatter [Lambda] formatting function for tokens
163
+ # which is passed the token as its sole argument
163
164
  #
164
165
  # @return [String] the printable form of the token
165
166
  def printable_form(custom_token_formatter: nil)
166
167
  printable_form =
167
168
  if custom_token_formatter
168
- custom_token_formatter.call(id, form)
169
+ custom_token_formatter.call(self)
169
170
  else
170
171
  form
171
172
  end
@@ -393,6 +394,13 @@ module PROIEL
393
394
  common_ancestors(other_token, inclusive: inclusive).first
394
395
  end
395
396
 
397
# Looks up the token that this token is aligned to.
#
# @param aligned_source [Source] source whose treebank is searched for the
#   aligned token
#
# @return [Token, NilClass] aligned token, or nil if this token has no
#   alignment ID
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_token(alignment_id)
end
403
+
396
404
  private
397
405
 
398
406
  # FIXME: extract this from the header of the PROIEL XML file instead and
@@ -0,0 +1,5 @@
1
+ module PROIEL::Valency; end
2
+
3
+ require 'proiel/valency/obliqueness'
4
+ require 'proiel/valency/arguments'
5
+ require 'proiel/valency/lexicon'
@@ -0,0 +1,147 @@
1
# Extraction of argument frames from the dependents of a token.
module PROIEL::Valency::Arguments
  # Builds the argument frame of a token: one hash per argument containing its
  # relation and the morphosyntactic features selected by extract_features,
  # sorted by obliqueness.
  #
  # @param token [Token] token to build the frame for
  #
  # @return [Array<Hash>] sorted argument descriptions
  def self.get_argument_frame(token)
    hoisted = collect_arguments(token).map { |argument| hoist_dependents(argument) }

    frame = hoisted.map do |argument|
      { relation: argument.relation }.merge(extract_features(argument))
    end

    PROIEL::Valency::Obliqueness.sort_arguments(frame)
  end

  # NOTE: a bare `private` does not affect methods defined with `def self.`,
  # so the methods below remain callable from outside the module.
  private

  # Coarse classes of major parts of speech used when comparing conjuncts
  POS_CLASSIFICATION = {
    'R' => :functor,
    'G' => :functor,
    'N' => :nominal,
    'P' => :nominal,
    'A' => :nominal,
    'M' => :nominal,
    'V' => :verbal,
  }

  # Collapses dependents based on features.
  #
  # All dependents must fall in the same class of POS_CLASSIFICATION and agree
  # on the distinguishing feature of that class: lemma for functors, case for
  # nominals, mood for verbals. Typical examples are coordinated, identical
  # prepositions or coordinated nouns in the same case.
  #
  # @return [Token, NilClass] the first (hoisted) dependent if the dependents
  #   are equivalent, otherwise nil so that the caller keeps the coordinator
  def self.collapse_dependents(dependents)
    # Hoist dependents if any of the dependents is a coordinator
    hoisted = dependents.map { |d| hoist_dependents(d) }

    classes = hoisted.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
    return nil unless classes.length == 1

    distinguishing =
      case classes.first
      when :functor
        hoisted.map(&:lemma)
      when :nominal
        hoisted.map { |d| d.morphology_hash[:case] }
      when :verbal
        hoisted.map { |d| d.morphology_hash[:mood] }
      else
        # Unclassified part of speech
        return nil
      end

    distinguishing.uniq.length == 1 ? hoisted.first : nil
  end

  # Hoists the real argument dependents from conjoined arguments.
  #
  # @return [Token] the collapsed conjunct if `argument` is a coordinator
  #   whose conjuncts are equivalent, otherwise `argument` itself
  def self.hoist_dependents(argument)
    coordinator = argument.part_of_speech == 'C-' || argument.empty_token_sort == 'C'
    return argument unless coordinator

    # Pick dependents that have the same relation as the coordinator. This
    # eliminates auxiliary elements like particles and repeated conjunctions
    # as well as attributes that scope over all conjuncts.
    conjuncts = argument.dependents.select { |d| d.relation == argument.relation }

    collapse_dependents(conjuncts) || argument
  end

  # Extracts morphosyntactic features that are relevant to the argument frame.
  #
  # @return [Hash] features; keys are added only for values that are present
  def self.extract_features(argument)
    features = {}
    major = argument.part_of_speech_hash[:major]

    case major
    when 'G', 'R'
      features[:lemma] = argument.lemma
      features[:part_of_speech] = argument.part_of_speech

      # There may be multiple dependents and dependents may be headed by
      # coordinators. For 'G' the relevant dependents have the relation PRED
      # and contribute their mood; for 'R' they have the relation OBL and
      # contribute their case.
      relation, feature = major == 'G' ? ['pred', :mood] : ['obl', :case]

      dependents = argument.dependents.select { |d| d.relation == relation }.map { |d| hoist_dependents(d) }
      governed = collapse_dependents(dependents)
      value = governed && governed.morphology_hash[feature]
      features[feature] = value if value
    when 'V'
      mood = argument.morphology_hash[:mood]
      features[:mood] = mood if mood
    when 'D'
      features[:lemma] = argument.lemma
      features[:part_of_speech] = argument.part_of_speech
    else
      grammatical_case = argument.morphology_hash[:case]
      features[:case] = grammatical_case if grammatical_case

      # Reflexive personal pronouns additionally keep lemma and part of speech
      if major == 'P' && argument.part_of_speech == 'Pk'
        features[:lemma] = argument.lemma
        features[:part_of_speech] = argument.part_of_speech
      end
    end

    features
  end

  # Determines the arguments of a predicate.
  #
  # @return [Array<Token>] dependents whose relation counts as an argument
  #
  # @raise [RuntimeError] if a dependent has an unknown relation
  def self.collect_arguments(token)
    token.dependents.select do |dependent|
      case dependent.relation
      when 'obj', 'obl', 'xobj', 'comp', 'narg', # arguments
           'arg'                                 # unspecific but always an argument
        true
      when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl', # non-arguments
           'adnom', 'nonsub', 'per',  # unspecific and undetermined with respect to argumenthood
           'rel',                     # unspecific but never an argument
           'pred', 'parpred', 'voc',  # shouldn't happen
           'pid', 'xsub'              # really shouldn't happen
        false
      else
        raise "unknown relation #{dependent.relation.inspect}"
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module PROIEL
  module Valency
    # A valency lexicon: argument frames of verbal tokens indexed by lemma and
    # part of speech.
    class Lexicon
      # @return [Hash] frames indexed by lemma, then part of speech, then
      #   argument frame; each leaf is a hash with token IDs under :a and :r
      attr_reader :frames

      def initialize
        @source_ids = Set.new
        @source_languages = Set.new
        @frames = {}
      end

      # Generates a valency lexicon from the provided sources. In practice the
      # sources should be in the same language but this is not enforced. This
      # makes it possible to generate a lexicon from sources in closely related
      # languages or dialects.
      #
      # @param source [Source] source whose sentences are indexed
      def add_source!(source)
        @source_ids << source.id
        @source_languages << source.language

        source.sentences.each do |sentence|
          find_verbal_nodes(sentence).each do |token|
            frame = PROIEL::Valency::Arguments.get_argument_frame(token)

            # Tokens with an auxiliary dependent tagged 'Pk' are recorded
            # under :r, all others under :a.
            has_pk_aux = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
            partition = has_pk_aux ? :r : :a

            by_part_of_speech = (@frames[token.lemma] ||= {})
            by_frame = (by_part_of_speech[token.part_of_speech] ||= {})
            token_ids = (by_frame[frame] ||= { a: [], r: [] })
            token_ids[partition] << token.id
          end
        end
      end

      # Looks up the frames recorded for a lemma and part of speech.
      #
      # @param lemma [String] lemma to look up
      # @param part_of_speech [String] part of speech to look up
      #
      # @return [Array<Hash>] frames sorted by obliqueness, each with the keys
      #   :arguments and :tokens; empty if nothing has been recorded for the
      #   lemma and part of speech
      def lookup(lemma, part_of_speech)
        by_frame = (@frames[lemma] || {})[part_of_speech]

        # Unknown lemmata or parts of speech yield no frames instead of
        # failing on nil.
        return [] if by_frame.nil? || by_frame.empty?

        frames =
          by_frame.map do |arguments, token_ids|
            { arguments: arguments, tokens: token_ids }
          end

        PROIEL::Valency::Obliqueness.sort_frames(frames)
      end

      private

      # Find verbal nodes in a sentence: tokens whose part of speech starts
      # with V and empty tokens of sort V.
      def find_verbal_nodes(sentence)
        sentence.tokens.select do |token|
          # FIXME: is this test in the proiel library already?
          (token.part_of_speech && token.part_of_speech[/^V/]) || token.empty_token_sort == 'V'
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module PROIEL
  module Valency
    # Ordering of arguments and argument frames by obliqueness.
    module Obliqueness
      # The position of a relation in this list is its rank. Relations that
      # are not listed rank after all listed relations.
      OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze

      # Sorts frames by obliqueness
      #
      # @param frames [Array<Hash>] frames; each must have an :arguments key
      #   holding an array of argument hashes
      #
      # @return [Array<Hash>] sorted frames
      def self.sort_frames(frames)
        # Sort frames by obliqueness, then by inspecting them so that we get
        # a stable, reproducible order.
        frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
      end

      # Sorts arguments by obliqueness
      #
      # @param arguments [Array<Hash>] argument hashes with a :relation key and
      #   optionally a :lemma key
      #
      # @return [Array<Hash>] sorted arguments
      def self.sort_arguments(arguments)
        arguments.sort_by { |argument| obliqueness_of_argument(argument) }
      end

      # The helpers below are module-level methods. A bare `private` would not
      # apply to them, so they are callable from outside the module as before.

      # Obliqueness scores of a list of arguments, in the order given.
      def self.obliqueness_of_arguments(arguments)
        arguments.map { |argument| obliqueness_of_argument(argument) }
      end

      # Obliqueness score of one argument: twice the rank of its relation,
      # plus one if the argument has a lemma, so that among arguments with the
      # same relation those without a lemma sort first.
      def self.obliqueness_of_argument(argument)
        obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
      end

      # Rank of a relation in OBLIQUENESS_HIERARCHY.
      def self.obliqueness_of_relation(relation)
        OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
      end
    end
  end
end
@@ -1,9 +1,9 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
6
6
  module PROIEL
7
7
  # Gem version
8
- VERSION = '1.2.0'
8
+ VERSION = '1.2.1'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proiel
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-31 00:00:00.000000000 Z
11
+ date: 2018-01-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -30,28 +30,28 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.6.6
33
+ version: '1.8'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.6.6
40
+ version: '1.8'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: sax-machine
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 1.3.2
47
+ version: '1.3'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 1.3.2
54
+ version: '1.3'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: memoist
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -66,48 +66,62 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.12'
69
+ - !ruby/object:Gem::Dependency
70
+ name: builder
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.2'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: '1.12'
89
+ version: '1.15'
76
90
  type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
94
  - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: '1.12'
96
+ version: '1.15'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: rake
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
101
  - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '11.2'
103
+ version: '12.0'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
108
  - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '11.2'
110
+ version: '12.0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: rspec
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '3.2'
117
+ version: '3.6'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '3.2'
124
+ version: '3.6'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: pry
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -128,14 +142,14 @@ dependencies:
128
142
  requirements:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
- version: '0.12'
145
+ version: '0.14'
132
146
  type: :development
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
150
  - - "~>"
137
151
  - !ruby/object:Gem::Version
138
- version: '0.12'
152
+ version: '0.14'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: yard
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -164,7 +178,10 @@ files:
164
178
  - bin/setup
165
179
  - lib/proiel.rb
166
180
  - lib/proiel/annotation_schema.rb
181
+ - lib/proiel/chronology.rb
167
182
  - lib/proiel/citations.rb
183
+ - lib/proiel/dictionary.rb
184
+ - lib/proiel/dictionary/builder.rb
168
185
  - lib/proiel/div.rb
169
186
  - lib/proiel/positional_tag.rb
170
187
  - lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
@@ -183,6 +200,10 @@ files:
183
200
  - lib/proiel/treebank.rb
184
201
  - lib/proiel/treebank_object.rb
185
202
  - lib/proiel/utils.rb
203
+ - lib/proiel/valency.rb
204
+ - lib/proiel/valency/arguments.rb
205
+ - lib/proiel/valency/lexicon.rb
206
+ - lib/proiel/valency/obliqueness.rb
186
207
  - lib/proiel/version.rb
187
208
  - lib/proiel/visualization.rb
188
209
  - lib/proiel/visualization/graphviz.rb
@@ -201,7 +222,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
201
222
  requirements:
202
223
  - - ">="
203
224
  - !ruby/object:Gem::Version
204
- version: '2.1'
225
+ version: '2.2'
205
226
  required_rubygems_version: !ruby/object:Gem::Requirement
206
227
  requirements:
207
228
  - - ">="
@@ -209,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
209
230
  version: '0'
210
231
  requirements: []
211
232
  rubyforge_project:
212
- rubygems_version: 2.5.1
233
+ rubygems_version: 2.7.4
213
234
  signing_key:
214
235
  specification_version: 4
215
236
  summary: A library for working with treebanks using the PROIEL dependency format