proiel 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5ead6b41029599129af6a717b0398980876a139c
4
- data.tar.gz: 11a555a98a41029dfb721e9613bbb5be23847cfd
2
+ SHA256:
3
+ metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
4
+ data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
5
5
  SHA512:
6
- metadata.gz: 475971ca8443be39f3ef2a634ff8afd50931b248130e439991d473d06fcf0b7695d38d0629856654738ecaa853af1d63ba8675849eef52055b431252d07e205e
7
- data.tar.gz: c33088acdb1e3fb130386204b02e3f54ca6566a05310c3397e8471c88360a742aafb8e5edc515e0d185280625994c606fed987f8cb7518a23c34a5e13686474b
6
+ metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
7
+ data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309
data/README.md CHANGED
@@ -12,7 +12,7 @@ PROIEL annotation scheme and the PROIEL XML-based interchange format.
12
12
 
13
13
  ## Installation
14
14
 
15
- To install this library you need Ruby 2.1 or newer.
15
+ This library requires Ruby >= 2.2. Install as
16
16
 
17
17
  ```shell
18
18
  gem install proiel
@@ -35,7 +35,7 @@ bundle
35
35
  ```
36
36
 
37
37
  To download a sample treebank, initialize a new git repository and add the
38
- [PROIEL treebank](http://proiel.github.io) as a submodule:
38
+ [PROIEL treebank](https://proiel.github.io) as a submodule:
39
39
 
40
40
  ```shell
41
41
  git init
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -13,6 +13,8 @@ require 'nokogiri'
13
13
  require 'singleton'
14
14
  require 'erb'
15
15
  require 'open3'
16
+ require 'set'
17
+ require 'builder'
16
18
 
17
19
  require 'proiel/version'
18
20
  require 'proiel/utils'
@@ -31,3 +33,6 @@ require 'proiel/div'
31
33
  require 'proiel/sentence'
32
34
  require 'proiel/token'
33
35
  require 'proiel/visualization'
36
+ require 'proiel/chronology'
37
+ require 'proiel/valency'
38
+ require 'proiel/dictionary'
@@ -0,0 +1,80 @@
1
+ #--
2
+ # Copyright (c) 2016-2017 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+
7
module PROIEL
  # Methods for parsing chronological descriptions. Extra care is taken to get
  # the interpretation of centuries and ranges involving the transition between 1
  # BC and AD 1 correct.
  module Chronology
    # Computes the chronological midpoint of a chronological description.
    #
    # @param s [String] chronological description
    #
    # @return [Integer]
    #
    # @raise [ArgumentError] if the description cannot be parsed
    #
    # @example
    #   midpoint('1000')      # => 1000
    #   midpoint('1000 BC')   # => -1000
    #   midpoint('1000-1020') # => 1010
    #   midpoint('1 BC-1')    # => -1
    def self.midpoint(s)
      interval = parse(s)

      case interval
      when Array
        from, to = interval

        if from < 0 && to > 0
          # Handle the missing year 0 by shifting years after 1 BC down by 1
          # and then shifting the midpoint back up again unless negative.
          y = (from + to - 1) / 2.0
          y < 0 ? y.floor : (y + 1).floor
        else
          # A non-integer midpoint is within the year of the integer part.
          ((from + to) / 2.0).floor
        end
      when Integer
        interval
      else
        raise ArgumentError, 'integer or array expected'
      end
    end

    # Parses a chronological description. The syntax of chronological
    # descriptions is explained in the [PROIEL XML
    # documentation](https://proiel.github.io/handbook/developer/proielxml.html#chronological-data).
    #
    # The whole string has to match one of the supported formats; trailing or
    # leading lines of unrelated text are rejected.
    #
    # @param s [String] chronological description
    #
    # @return [Integer, Array<Integer,Integer>] a single year, or the first
    #   and last year of a range; BC years are negative
    #
    # @raise [ArgumentError] if the format is not recognised, if a year or
    #   century is zero, or if a range does not run forwards
    #
    # @example
    #   parse('1000')         # => 1000
    #   parse('1000 BC')      # => -1000
    #   parse('1000-1020')    # => [1000,1020]
    #   parse('1000 BC-1020') # => [-1000,1020]
    #   parse('5th c.')       # => [401,500]
    #   parse('1st c. BC')    # => [-100,-1]
    def self.parse(s)
      case s
      when /\A\s*(?:c\.\s+)?(\d+)(\s+BC)?\s*\z/
        year = Regexp.last_match(1).to_i
        year = -year if Regexp.last_match(2)

        # There is no year zero in the Julian calendar
        raise ArgumentError, 'invalid year' if year.zero?

        year
      when /\A\s*(1st|2nd|3rd|\d+th)\s+c\.\s*\z/
        last_year = Regexp.last_match(1).to_i * 100

        # A "0th century" would produce a range that includes year zero
        raise ArgumentError, 'invalid century' if last_year.zero?

        [last_year - 99, last_year]
      when /\A\s*(1st|2nd|3rd|\d+th)\s+c\.\s+BC\s*\z/
        first_year = -(Regexp.last_match(1).to_i * 100)

        raise ArgumentError, 'invalid century' if first_year.zero?

        [first_year, first_year + 99]
      when /\A\s*(?:c\.\s+)?\d+(\s+BC)?\s*-\s*(c\.\s+)?\d+(\s+BC)?\s*\z/
        # The pattern guarantees exactly one hyphen, so there are two parts,
        # each of which is a single-year description.
        from, to = s.split('-').map { |part| parse(part) }

        raise ArgumentError, 'invalid range' unless from < to

        [from, to]
      else
        raise ArgumentError, 'unexpected format'
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module PROIEL::Dictionary; end
2
+
3
+ require 'proiel/dictionary/builder'
@@ -0,0 +1,201 @@
1
+ #--
2
+ # Copyright (c) 2016-2017 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+
7
module PROIEL
  # Methods for synthesising and manipulating dictionaries from treebank data.
  module Dictionary
    # Accumulates per-lemma statistics (distribution over sources, paradigm,
    # homographs and valency frames) from one or more sources and serialises
    # them as XML.
    class Builder
      CURRENT_SCHEMA_VERSION = '3.0'.freeze

      # @return [String, NilClass] license shared by all added sources
      attr_reader :license

      # @return [String, NilClass] language shared by all added sources
      attr_reader :language

      # @return [Array] sources added so far
      attr_reader :sources

      # @return [Hash] lemma data keyed by the string `"lemma,part_of_speech"`
      attr_reader :lemmata

      def initialize
        @language = nil
        @license = nil
        @sources = []
        @lemmata = {}
        @valency = PROIEL::Valency::Lexicon.new
      end

      # Indexes all tokens of a source. All sources added to one builder must
      # share language and license; the first source added determines both.
      #
      # @param source [PROIEL::Source] source to add
      #
      # @raise [ArgumentError] if `source` is not a source or if its language
      #   or license differs from that of previously added sources
      def add_source!(source)
        raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
        raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
        raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

        @language ||= source.language
        @license ||= source.license
        @sources << source

        source.tokens.each { |token| index_token!(token) }

        index_homographs!
      end

      # Writes the dictionary as XML.
      #
      # @param io [IO] target that the XML is written to
      def to_xml(io)
        builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
        builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
        builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
          builder.dictionary(language: @language) do
            builder.sources do
              @sources.each do |source|
                builder.source(id: source.id, license: source.license)
              end
            end

            builder.lemmata(n: @lemmata.count) do
              # Sorting is by the full key so that homographs get a stable
              # order determined by their part of speech.
              @lemmata.sort_by { |key, _| key.downcase }.each do |_key, data|
                lemma_to_xml(builder, data)
              end
            end
          end
        end
      end

      private

      # Emits one lemma element. The `form` attribute is the lemma itself,
      # not the internal "lemma,part_of_speech" key used to index @lemmata.
      def lemma_to_xml(builder, data)
        builder.lemma(form: data[:lemma], part_of_speech: data[:part_of_speech], n: data[:n]) do
          distribution_to_xml(builder, data)
          glosses_to_xml(builder, data)
          homographs_to_xml(builder, data)
          paradigm_to_xml(builder, data)
          valency_to_xml(builder, data)
        end
      end

      # Emits the number of occurrences of the lemma in each source.
      def distribution_to_xml(builder, data)
        builder.distribution do
          data[:distribution].sort_by(&:first).each do |source_id, n|
            builder.source(id: source_id, n: n)
          end
        end
      end

      # Emits glosses. Nothing in this class populates glosses yet, so the
      # element stays empty if it is ever emitted.
      def glosses_to_xml(builder, data)
        return if data[:glosses].empty?

        builder.glosses do
          # TODO
        end
      end

      # Emits references to lemmata with the same form but a different part of
      # speech. The stored homographs are @lemmata keys, so lemma and part of
      # speech are looked up rather than parsed out of the key.
      def homographs_to_xml(builder, data)
        return if data[:homographs].empty?

        builder.homographs do
          data[:homographs].each do |key|
            homograph = @lemmata.fetch(key)
            builder.lemma form: homograph[:lemma], part_of_speech: homograph[:part_of_speech]
          end
        end
      end

      # Emits attested forms grouped by morphology, with frequencies.
      def paradigm_to_xml(builder, data)
        return if data[:paradigm].empty?

        builder.paradigm do
          data[:paradigm].sort_by(&:first).each do |morphology, forms|
            builder.slot1 morphology: morphology do
              forms.sort_by(&:first).each do |form, n|
                builder.slot2 form: form, n: n
              end
            end
          end
        end
      end

      # Emits valency frames sorted by obliqueness. Token ids are listed
      # separately for the :a and :r partitions filled in by index_token!.
      def valency_to_xml(builder, data)
        return if data[:valency].empty?

        builder.valency do
          frames =
            data[:valency].map do |arguments, token_ids|
              { arguments: arguments, tokens: token_ids }
            end

          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
            builder.frame do
              builder.arguments do
                frame[:arguments].each do |argument|
                  builder.argument argument
                end
              end

              [:a, :r].each do |flag|
                token_ids = frame[:tokens][flag]
                next if token_ids.empty?

                builder.tokens flags: flag.to_s, n: token_ids.count do
                  token_ids.each do |token_id|
                    builder.token id: token_id
                  end
                end
              end
            end
          end
        end
      end

      # Records, for each lemma, the keys of all other entries that share its
      # lemma form. Grouping uses the stored lemma so that lemmata containing
      # a comma are handled correctly.
      def index_homographs!
        @lemmata.keys.group_by { |key| @lemmata[key][:lemma] }.each_value do |keys|
          next unless keys.count > 1

          keys.each do |key|
            @lemmata[key][:homographs] = keys.reject { |other| other == key }
          end
        end
      end

      # Adds a token to the statistics of its lemma. Tokens without lemma or
      # part of speech are ignored.
      def index_token!(token)
        return unless token.lemma && token.part_of_speech

        encoded_lemma = [token.lemma, token.part_of_speech].join(',')

        lemma = (@lemmata[encoded_lemma] ||= {
          lemma: token.lemma,
          part_of_speech: token.part_of_speech,
          distribution: {},
          glosses: {},
          homographs: [],
          paradigm: {},
          n: 0,
          valency: {},
        })

        lemma[:distribution][token.source.id] ||= 0
        lemma[:distribution][token.source.id] += 1

        forms = (lemma[:paradigm][token.morphology] ||= {})
        forms[token.form] ||= 0
        forms[token.form] += 1

        lemma[:n] += 1

        # Only verbal nodes get valency frames
        return unless token.part_of_speech.start_with?('V')

        frame = PROIEL::Valency::Arguments.get_argument_frame(token)
        entry = (lemma[:valency][frame] ||= { a: [], r: [] })

        # Occurrences with an auxiliary 'Pk' dependent are kept apart (:r)
        # from all other occurrences (:a).
        if token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
          entry[:r] << token.id
        else
          entry[:a] << token.id
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -135,5 +135,21 @@ module PROIEL
135
135
  end
136
136
  end
137
137
  end
138
+
139
# Looks up the div that this div is explicitly aligned to.
#
# @param aligned_source [Object] object whose `treebank` is searched for the
#   aligned div (presumably the aligned source)
#
# @return [Div, NilClass] aligned div, or nil if there is no alignment ID
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_div(alignment_id)
end
145
+
146
# Collects the divs that this div is aligned to according to the inferred
# alignments of its sentences.
#
# @param aligned_source [Object] passed on to each sentence's
#   `inferred_alignment`
#
# @return [Array<Div>] inferred aligned divs without duplicates
def inferred_alignment(aligned_source)
  aligned_sentences = sentences.map { |sentence| sentence.inferred_alignment(aligned_source) }

  aligned_sentences.flatten.compact.map(&:div).uniq
end
138
154
  end
139
155
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -16,9 +16,11 @@ module PROIEL
16
16
  # Creates a new validator for a PROIEL XML file.
17
17
  #
18
18
  # @param filename [String] name of PROIEL XML file to validate
19
+ # @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
19
20
  #
20
- def initialize(filename)
21
+ def initialize(filename, aligned_filename = nil)
21
22
  @filename = filename
23
+ @aligned_filename = aligned_filename
22
24
  @errors = []
23
25
  end
24
26
 
@@ -154,6 +156,27 @@ module PROIEL
154
156
  end
155
157
  end
156
158
 
159
+ # Pass 5: if div is aligned, sentences and tokens within should belong
160
+ # to aligned div(s); if sentence aligned, tokens within should belong
161
+ # to aligned sentence(s). Skip if no alignment_id on source (see pass
162
+ # 4) or if aligned source not available.
163
+ if @aligned_filename
164
+ aligned_tb = PROIEL::Treebank.new
165
+ aligned_tb.load_from_xml(@aligned_filename)
166
+
167
+ tb.sources.each do |source|
168
+ if source.alignment_id
169
+ aligned_source = aligned_tb.find_source(source.alignment_id)
170
+
171
+ if aligned_source
172
+ check_alignment_integrity(errors, source, aligned_source)
173
+ else
174
+ errors << "Aligned source not available in treebank"
175
+ end
176
+ end
177
+ end
178
+ end
179
+
157
180
  # Decide if there were any errors
158
181
  if errors.empty?
159
182
  true
@@ -182,6 +205,52 @@ module PROIEL
182
205
  errors << "Token #{token.id}: #{attribute_name} is null"
183
206
  end
184
207
  end
208
+
209
# Checks that explicit sentence and div alignments in `source` agree with the
# alignments that can be inferred from token alignments, and that every
# aligned token exists in the aligned source. Problems are appended to
# `errors`.
#
# @param errors [Array<String>] list that error messages are appended to
# @param source [Object] source whose divs, sentences and tokens are checked
# @param aligned_source [Object] source that the alignment IDs refer to; its
#   `treebank` is used to look up tokens
def check_alignment_integrity(errors, source, aligned_source)
  # Declared IDs come from a comma-separated string while inferred IDs come
  # from objects. Both are converted to strings before sorting so that the two
  # lists are ordered the same way and can be compared.
  normalise = ->(ids) { ids.map(&:to_s).sort.join(',') }

  source.divs.each do |div|
    target_sentences =
      div.sentences.map do |sentence|
        target_tokens =
          sentence.tokens.select(&:alignment_id).map do |token|
            # Check that target token exists in aligned source
            aligned_token = aligned_source.treebank.find_token(token.alignment_id)

            unless aligned_token
              errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
            end

            aligned_token
          end

        inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq

        if sentence.alignment_id
          a = normalise.call(sentence.alignment_id.to_s.split(','))
          i = normalise.call(inferred_target_sentences.map(&:id))

          # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
          if a != i
            errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
          end
        end

        inferred_target_sentences
      end

    inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq

    if div.alignment_id
      a = normalise.call(div.alignment_id.to_s.split(','))
      i = normalise.call(inferred_target_divs.map(&:id))

      # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
      if a != i
        errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
      end
    end
  end
end
185
254
  end
186
255
  end
187
256
  end
@@ -116,7 +116,7 @@ module PROIEL
116
116
  # @return [String] the printable form of the sentence
117
117
  def printable_form(options = {})
118
118
  [presentation_before,
119
- @children.map { |t| t.printable_form(options) },
119
+ @children.reject(&:is_empty?).map { |t| t.printable_form(options) },
120
120
  presentation_after].compact.join
121
121
  end
122
122
 
@@ -217,5 +217,21 @@ module PROIEL
217
217
  def tokens
218
218
  @children.to_enum
219
219
  end
220
+
221
# Looks up the sentence that this sentence is explicitly aligned to.
#
# @param aligned_source [Object] object whose `treebank` is searched for the
#   aligned sentence (presumably the aligned source)
#
# @return [Sentence, NilClass] aligned sentence, or nil if there is no
#   alignment ID
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_sentence(alignment_id)
end
227
+
228
# Collects the sentences that this sentence is aligned to according to the
# alignments of its tokens. Tokens without an alignment ID are ignored.
#
# @param aligned_source [Object] passed on to each token's `alignment`
#
# @return [Array<Sentence>] inferred aligned sentences without duplicates
def inferred_alignment(aligned_source)
  aligned_tokens = tokens.select(&:alignment_id).map { |token| token.alignment(aligned_source) }

  aligned_tokens.flatten.compact.map(&:sentence).uniq
end
220
236
  end
221
237
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -160,12 +160,13 @@ module PROIEL
160
160
  # Returns the printable form of the token with any presentation data.
161
161
  #
162
162
  # @param custom_token_formatter [Lambda] formatting function for tokens
163
+ # which is passed the token as its sole argument
163
164
  #
164
165
  # @return [String] the printable form of the token
165
166
  def printable_form(custom_token_formatter: nil)
166
167
  printable_form =
167
168
  if custom_token_formatter
168
- custom_token_formatter.call(id, form)
169
+ custom_token_formatter.call(self)
169
170
  else
170
171
  form
171
172
  end
@@ -393,6 +394,13 @@ module PROIEL
393
394
  common_ancestors(other_token, inclusive: inclusive).first
394
395
  end
395
396
 
397
# Looks up the token that this token is explicitly aligned to.
#
# @param aligned_source [Object] object whose `treebank` is searched for the
#   aligned token (presumably the aligned source)
#
# @return [Token, NilClass] aligned token, or nil if there is no alignment ID
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_token(alignment_id)
end
403
+
396
404
  private
397
405
 
398
406
  # FIXME: extract this from the header of the PROIEL XML file instead and
@@ -0,0 +1,5 @@
1
+ module PROIEL::Valency; end
2
+
3
+ require 'proiel/valency/obliqueness'
4
+ require 'proiel/valency/arguments'
5
+ require 'proiel/valency/lexicon'
@@ -0,0 +1,147 @@
1
module PROIEL
  module Valency
    # Extraction of argument frames for predicates.
    #
    # Only {get_argument_frame} is intended for use by callers. The remaining
    # module methods are helpers; they stay callable from outside because a
    # bare `private` does not apply to methods defined with `def self.`.
    module Arguments
      # Coarse classes of major parts of speech that are used to decide if
      # coordinated dependents are equivalent.
      POS_CLASSIFICATION = {
        'R' => :functor,
        'G' => :functor,
        'N' => :nominal,
        'P' => :nominal,
        'A' => :nominal,
        'M' => :nominal,
        'V' => :verbal,
      }.freeze

      # Builds the argument frame of a token.
      #
      # @param token [Object] predicate whose dependents are inspected
      #
      # @return [Array<Hash>] one hash per argument with the key `:relation`
      #   and any features from {extract_features}, sorted by obliqueness
      #
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.get_argument_frame(token)
        hoisted_arguments = collect_arguments(token).map { |argument| hoist_dependents(argument) }

        frame =
          hoisted_arguments.map do |argument|
            { relation: argument.relation }.merge(extract_features(argument))
          end

        PROIEL::Valency::Obliqueness.sort_arguments(frame)
      end

      # Collapses dependents based on features.
      #
      # Figures out if all dependents are equivalent for the purposes of
      # argument frames. Typical examples would be coordinated, identical
      # prepositions (which is operationalised as same lemma, same POS class)
      # or coordinated nouns in the same case (which is operationalised as
      # same POS class, same case).
      #
      # @api private
      #
      # @return [Object, NilClass] the first dependent if all dependents are
      #   equivalent, otherwise nil
      def self.collapse_dependents(dependents)
        # Hoist dependents if any of the dependents is a coordinator
        dependents = dependents.map { |d| hoist_dependents(d) }

        classes = dependents.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
        return nil unless classes.length == 1

        distinguishing_values =
          case classes.first
          when :functor
            dependents.map(&:lemma)
          when :nominal
            dependents.map { |d| d.morphology_hash[:case] }
          when :verbal
            dependents.map { |d| d.morphology_hash[:mood] }
          else
            # All dependents are of a part of speech that is not classified
            return nil
          end

        distinguishing_values.uniq.length == 1 ? dependents.first : nil
      end

      # Hoists the real argument dependents from conjoined arguments. If the
      # conjuncts cannot be reduced to one representative, the coordinator
      # itself is kept.
      #
      # @api private
      def self.hoist_dependents(argument)
        return argument unless argument.part_of_speech == 'C-' || argument.empty_token_sort == 'C'

        # Pick dependents that have the same relation as the coordinator. This
        # eliminates auxiliary elements like particles and repeated
        # conjunctions as well as attributes that scope over all conjuncts.
        conjuncts = argument.dependents.select { |d| d.relation == argument.relation }

        collapse_dependents(conjuncts) || argument
      end

      # Extracts morphosyntactic features that are relevant to the argument
      # frame.
      #
      # @api private
      #
      # @return [Hash] subset of the keys `:lemma`, `:part_of_speech`, `:case`
      #   and `:mood`
      def self.extract_features(argument)
        {}.tap do |features|
          case argument.part_of_speech_hash[:major]
          when 'G'
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech

            # All relevant dependents have the relation PRED
            mood = governed_feature(argument, 'pred', :mood)
            features[:mood] = mood if mood
          when 'R'
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech

            # All relevant dependents have the relation OBL
            grammatical_case = governed_feature(argument, 'obl', :case)
            features[:case] = grammatical_case if grammatical_case
          when 'V'
            features[:mood] = argument.morphology_hash[:mood] if argument.morphology_hash[:mood]
          when 'D'
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech
          when 'P'
            features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
            if argument.part_of_speech == 'Pk' # reflexive personal pronoun
              features[:lemma] = argument.lemma
              features[:part_of_speech] = argument.part_of_speech
            end
          else
            features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
          end
        end
      end

      # Reads a morphological feature from the dependents of a functor that
      # have the given relation. There may be multiple dependents and
      # dependents may be headed by coordinators, so they are hoisted and
      # collapsed first.
      #
      # @api private
      #
      # @return [Object, NilClass] the feature value, or nil if the dependents
      #   cannot be collapsed or lack the feature
      def self.governed_feature(argument, relation, feature)
        dependents = argument.dependents.select { |d| d.relation == relation }.map { |d| hoist_dependents(d) }
        governed = collapse_dependents(dependents)

        governed && governed.morphology_hash[feature]
      end

      # Determines the arguments of a predicate.
      #
      # @api private
      #
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.collect_arguments(token)
        token.dependents.select do |dependent|
          case dependent.relation
          when 'obj', 'obl', 'xobj', 'comp', 'narg' # arguments
            true
          when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl' # non-arguments
            false
          when 'arg' # unspecific but always an argument
            true
          when 'adnom', 'nonsub', 'per' # unspecific and undetermined with respect to argumenthood
            false
          when 'rel' # unspecific but never an argument
            false
          when 'pred', 'parpred', 'voc' # shouldn't happen
            false
          when 'pid', 'xsub' # really shouldn't happen
            false
          else
            raise "unknown relation #{dependent.relation.inspect}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module PROIEL
  module Valency
    # A valency lexicon: argument frames of verbal tokens, indexed by lemma
    # and part of speech.
    class Lexicon
      # @return [Hash] nested hash of the shape
      #   `lemma => part_of_speech => frame => { a: [token_id, ...], r: [token_id, ...] }`
      attr_reader :frames

      def initialize
        @source_ids = Set.new
        @source_languages = Set.new
        @frames = {}
      end

      # Generates a valency lexicon from the provided sources. In practice the
      # sources should be in the same language but this is not enforced. This
      # makes it possible to generate a lexicon from sources in closely related
      # languages or dialects.
      #
      # @param source [Object] source whose sentences are scanned for verbal
      #   tokens
      def add_source!(source)
        @source_ids << source.id
        @source_languages << source.language

        source.sentences.each do |sentence|
          find_verbal_nodes(sentence).each do |token|
            frame = PROIEL::Valency::Arguments.get_argument_frame(token)
            partition = partition_of(token)

            by_part_of_speech = (@frames[token.lemma] ||= {})
            by_frame = (by_part_of_speech[token.part_of_speech] ||= {})
            entry = (by_frame[frame] ||= { a: [], r: [] })
            entry[partition] << token.id
          end
        end
      end

      # Looks up the frames recorded for a lemma and part of speech.
      #
      # @param lemma [String] lemma
      # @param part_of_speech [String] part of speech tag
      #
      # @return [Array<Hash>] frames as hashes with the keys `:arguments` and
      #   `:tokens`, sorted by obliqueness; empty if nothing has been recorded
      #   for the lemma and part of speech
      def lookup(lemma, part_of_speech)
        entries = (@frames[lemma] || {})[part_of_speech]

        # An unknown lemma or part of speech simply has no frames
        return [] if entries.nil?

        frames =
          entries.map do |arguments, token_ids|
            { arguments: arguments, tokens: token_ids }
          end

        PROIEL::Valency::Obliqueness.sort_frames(frames)
      end

      private

      # Occurrences that have an auxiliary dependent tagged 'Pk' are recorded
      # under :r, all other occurrences under :a.
      def partition_of(token)
        if token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
          :r
        else
          :a
        end
      end

      # Find verbal nodes in a sentence
      def find_verbal_nodes(sentence)
        sentence.tokens.select do |token|
          # FIXME: is this test in the proiel library already?
          (token.part_of_speech && token.part_of_speech[/^V/]) || token.empty_token_sort == 'V'
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module PROIEL
  module Valency
    # Ordering of arguments and argument frames by obliqueness.
    #
    # {sort_frames} and {sort_arguments} are the entry points. The
    # `obliqueness_of_*` methods are helpers; they stay callable from outside
    # because a bare `private` does not apply to methods defined with
    # `def self.`.
    module Obliqueness
      # Relations from least to most oblique. Relations that are not listed
      # rank after all listed ones.
      OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze

      # Sorts frames by obliqueness.
      #
      # @param frames [Array<Hash>] frames with their arguments under the key
      #   `:arguments`
      #
      # @return [Array<Hash>] sorted frames
      def self.sort_frames(frames)
        # Sort frames by obliqueness, then by inspecting them so that we get
        # a stable, reproducible order.
        frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
      end

      # Sorts arguments by obliqueness.
      #
      # @param arguments [Array<Hash>] arguments with the key `:relation` and
      #   optionally `:lemma`
      #
      # @return [Array<Hash>] sorted arguments
      def self.sort_arguments(arguments)
        arguments.sort_by { |argument| obliqueness_of_argument(argument) }
      end

      # @api private
      # @return [Array<Integer>] obliqueness of each argument, in input order
      def self.obliqueness_of_arguments(arguments)
        arguments.map { |argument| obliqueness_of_argument(argument) }
      end

      # An argument with a lemma ranks immediately after an argument with the
      # same relation but without a lemma.
      #
      # @api private
      # @return [Integer]
      def self.obliqueness_of_argument(argument)
        lemma_rank = argument[:lemma].nil? ? 0 : 1

        obliqueness_of_relation(argument[:relation]) * 2 + lemma_rank
      end

      # @api private
      # @return [Integer] position of the relation in the hierarchy, or the
      #   length of the hierarchy for relations that are not listed
      def self.obliqueness_of_relation(relation)
        OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
      end
    end
  end
end
@@ -1,9 +1,9 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
6
6
  module PROIEL
7
7
  # Gem version
8
- VERSION = '1.2.0'
8
+ VERSION = '1.2.1'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proiel
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-08-31 00:00:00.000000000 Z
11
+ date: 2018-01-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -30,28 +30,28 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.6.6
33
+ version: '1.8'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.6.6
40
+ version: '1.8'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: sax-machine
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 1.3.2
47
+ version: '1.3'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 1.3.2
54
+ version: '1.3'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: memoist
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -66,48 +66,62 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.12'
69
+ - !ruby/object:Gem::Dependency
70
+ name: builder
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.2'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: '1.12'
89
+ version: '1.15'
76
90
  type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
94
  - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: '1.12'
96
+ version: '1.15'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: rake
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
101
  - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '11.2'
103
+ version: '12.0'
90
104
  type: :development
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
108
  - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '11.2'
110
+ version: '12.0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: rspec
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - "~>"
102
116
  - !ruby/object:Gem::Version
103
- version: '3.2'
117
+ version: '3.6'
104
118
  type: :development
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
107
121
  requirements:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
- version: '3.2'
124
+ version: '3.6'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: pry
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -128,14 +142,14 @@ dependencies:
128
142
  requirements:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
- version: '0.12'
145
+ version: '0.14'
132
146
  type: :development
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
150
  - - "~>"
137
151
  - !ruby/object:Gem::Version
138
- version: '0.12'
152
+ version: '0.14'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: yard
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -164,7 +178,10 @@ files:
164
178
  - bin/setup
165
179
  - lib/proiel.rb
166
180
  - lib/proiel/annotation_schema.rb
181
+ - lib/proiel/chronology.rb
167
182
  - lib/proiel/citations.rb
183
+ - lib/proiel/dictionary.rb
184
+ - lib/proiel/dictionary/builder.rb
168
185
  - lib/proiel/div.rb
169
186
  - lib/proiel/positional_tag.rb
170
187
  - lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
@@ -183,6 +200,10 @@ files:
183
200
  - lib/proiel/treebank.rb
184
201
  - lib/proiel/treebank_object.rb
185
202
  - lib/proiel/utils.rb
203
+ - lib/proiel/valency.rb
204
+ - lib/proiel/valency/arguments.rb
205
+ - lib/proiel/valency/lexicon.rb
206
+ - lib/proiel/valency/obliqueness.rb
186
207
  - lib/proiel/version.rb
187
208
  - lib/proiel/visualization.rb
188
209
  - lib/proiel/visualization/graphviz.rb
@@ -201,7 +222,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
201
222
  requirements:
202
223
  - - ">="
203
224
  - !ruby/object:Gem::Version
204
- version: '2.1'
225
+ version: '2.2'
205
226
  required_rubygems_version: !ruby/object:Gem::Requirement
206
227
  requirements:
207
228
  - - ">="
@@ -209,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
209
230
  version: '0'
210
231
  requirements: []
211
232
  rubyforge_project:
212
- rubygems_version: 2.5.1
233
+ rubygems_version: 2.7.4
213
234
  signing_key:
214
235
  specification_version: 4
215
236
  summary: A library for working with treebanks using the PROIEL dependency format