proiel 1.2.0 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +2 -2
- data/lib/proiel.rb +6 -1
- data/lib/proiel/chronology.rb +80 -0
- data/lib/proiel/dictionary.rb +3 -0
- data/lib/proiel/dictionary/builder.rb +201 -0
- data/lib/proiel/div.rb +17 -1
- data/lib/proiel/proiel_xml/validator.rb +71 -2
- data/lib/proiel/sentence.rb +17 -1
- data/lib/proiel/token.rb +10 -2
- data/lib/proiel/valency.rb +5 -0
- data/lib/proiel/valency/arguments.rb +147 -0
- data/lib/proiel/valency/lexicon.rb +59 -0
- data/lib/proiel/valency/obliqueness.rb +31 -0
- data/lib/proiel/version.rb +2 -2
- metadata +37 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
|
4
|
+
data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
|
7
|
+
data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ PROIEL annotation scheme and the PROIEL XML-based interchange format.
|
|
12
12
|
|
13
13
|
## Installation
|
14
14
|
|
15
|
-
|
15
|
+
This library requires Ruby >= 2.2. Install as
|
16
16
|
|
17
17
|
```shell
|
18
18
|
gem install proiel
|
@@ -35,7 +35,7 @@ bundle
|
|
35
35
|
```
|
36
36
|
|
37
37
|
To download a sample treebank, initialize a new git repository and add the
|
38
|
-
[PROIEL treebank](
|
38
|
+
[PROIEL treebank](https://proiel.github.io) as a submodule:
|
39
39
|
|
40
40
|
```shell
|
41
41
|
git init
|
data/lib/proiel.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -13,6 +13,8 @@ require 'nokogiri'
|
|
13
13
|
require 'singleton'
|
14
14
|
require 'erb'
|
15
15
|
require 'open3'
|
16
|
+
require 'set'
|
17
|
+
require 'builder'
|
16
18
|
|
17
19
|
require 'proiel/version'
|
18
20
|
require 'proiel/utils'
|
@@ -31,3 +33,6 @@ require 'proiel/div'
|
|
31
33
|
require 'proiel/sentence'
|
32
34
|
require 'proiel/token'
|
33
35
|
require 'proiel/visualization'
|
36
|
+
require 'proiel/chronology'
|
37
|
+
require 'proiel/valency'
|
38
|
+
require 'proiel/dictionary'
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016-2017 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
|
7
|
+
# Methods for parsing chronological descriptions. Extra care is taken to get
# the interpretation of centuries and ranges involving the transition between 1
# BC and AD 1 correct.
module PROIEL
  module Chronology
    # Computes the chronological midpoint of a chronological description.
    #
    # @param s [String] chronological description
    #
    # @return [Integer]
    #
    # @raise [ArgumentError] if the description cannot be parsed
    #
    # @example
    #   midpoint('1000')       # => 1000
    #   midpoint('1000 BC')    # => -1000
    #   midpoint('1000-1020')  # => 1010
    def self.midpoint(s)
      parsed = parse(s)

      case parsed
      when Array
        first, last = parsed

        if first < 0 && last > 0
          # Handle missing Julian year 0 by shifting years after 1 BC down by
          # 1 and then shifting the midpoint back up again unless negative.
          y = (first + last - 1) / 2.0
          y < 0 ? y.floor : (y + 1).floor
        else
          # A non-integer midpoint is within the year of the integer part.
          ((first + last) / 2.0).floor
        end
      when Integer
        parsed
      else
        raise ArgumentError, 'integer or array expected'
      end
    end

    # Parses a chronological description. The syntax of chronological
    # descriptions is explained in the [PROIEL XML
    # documentation](http://proiel.github.io/handbook/developer/proielxml.html#chronological-data).
    #
    # The whole string has to match one of the supported formats; patterns are
    # anchored with \A and \z so that a valid description on one line of a
    # multi-line string is not accepted.
    #
    # @param s [String] chronological description
    #
    # @return [Integer, Array<Integer,Integer>]
    #
    # @raise [ArgumentError] if the format is not recognised, a year or
    #   century is zero, or a range does not run from earlier to later
    #
    # @example
    #   parse('1000')          # => 1000
    #   parse('1000 BC')       # => -1000
    #   parse('1000-1020')     # => [1000,1020]
    #   parse('1000 BC-1020')  # => [-1000,1020]
    def self.parse(s)
      case s
      when /\A\s*(?:c\.\s+)?(\d+)(\s+BC)?\s*\z/
        year = Regexp.last_match(1).to_i
        year = -year if Regexp.last_match(2)

        # There is no year zero in the Julian calendar
        raise ArgumentError, 'invalid year' if year.zero?

        year
      when /\A\s*(1st|2nd|3rd|\d+th)\s+c\.\s*\z/
        a = Regexp.last_match(1).to_i * 100

        # A "0th" century would span the non-existent year zero
        raise ArgumentError, 'invalid century' if a.zero?

        [a - 99, a]
      when /\A\s*(1st|2nd|3rd|\d+th)\s+c\.\s+BC\s*\z/
        a = -Regexp.last_match(1).to_i * 100

        raise ArgumentError, 'invalid century' if a.zero?

        [a, a + 99]
      when /\A\s*(?:c\.\s+)?\d+(\s+BC)?\s*-\s*(c\.\s+)?\d+(\s+BC)?\s*\z/
        # Each endpoint is itself a single-year description
        s.split('-').map { |part| parse(part) }.tap do |from, to|
          raise ArgumentError, 'invalid range' unless from < to
        end
      else
        raise ArgumentError, 'unexpected format'
      end
    end
  end
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016-2017 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
|
7
|
+
# Methods for synthesising and manipulating dictionaries from treebank data.
module PROIEL
  module Dictionary
    # Collects per-lemma statistics (frequency, distribution across sources,
    # paradigm, homographs and valency frames) from the tokens of one or more
    # sources and serialises the result as XML.
    class Builder
      # Value written to the +schema-version+ attribute of the root element.
      CURRENT_SCHEMA_VERSION = '3.0'

      attr_reader :license
      attr_reader :language
      attr_reader :sources
      attr_reader :lemmata

      def initialize
        @language = nil
        @license = nil
        @sources = []
        @lemmata = {}
      end

      # Indexes all tokens of a source. The first source added fixes the
      # language and license of the dictionary; later sources must agree.
      #
      # @param source [PROIEL::Source] source to index
      #
      # @raise [ArgumentError] if +source+ is not a source or its language or
      #   license differs from that of previously added sources
      def add_source!(source)
        raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
        raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
        raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

        @language ||= source.language
        @license ||= source.license
        @sources << source

        source.tokens.each { |token| index_token!(token) }

        index_homographs!
      end

      # Writes the dictionary as XML.
      #
      # @param io [#<<] target handed to +Builder::XmlMarkup+
      def to_xml(io)
        builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
        builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
        builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
          builder.dictionary(language: @language) do
            builder.sources do
              @sources.each do |source|
                builder.source(id: source.id, license: source.license)
              end
            end

            builder.lemmata(n: @lemmata.count) do
              # Entries are ordered by their internal "lemma,part-of-speech"
              # key, but only the bare lemma is emitted as the form; the part
              # of speech has its own attribute.
              @lemmata.sort_by { |key, _| key.downcase }.each do |_key, data|
                lemma_to_xml(builder, data[:lemma], data)
              end
            end
          end
        end
      end

      private

      def lemma_to_xml(builder, form, data)
        builder.lemma(form: form, part_of_speech: data[:part_of_speech], n: data[:n]) do
          distribution_to_xml(builder, data)
          glosses_to_xml(builder, data)
          homographs_to_xml(builder, data)
          paradigm_to_xml(builder, data)
          valency_to_xml(builder, data)
        end
      end

      # Emits the number of occurrences per source, ordered by source ID.
      def distribution_to_xml(builder, data)
        builder.distribution do
          data[:distribution].sort_by(&:first).each do |source_id, n|
            builder.source(id: source_id, n: n)
          end
        end
      end

      def glosses_to_xml(builder, data)
        return if data[:glosses].empty?

        builder.glosses do
          # TODO
        end
      end

      # Emits one element per homograph. Homographs are stored as internal
      # "lemma,part-of-speech" keys, so each is resolved through the index and
      # written with separate form and part-of-speech attributes.
      def homographs_to_xml(builder, data)
        return if data[:homographs].empty?

        builder.homographs do
          data[:homographs].each do |key|
            entry = @lemmata[key]
            builder.lemma form: entry[:lemma], part_of_speech: entry[:part_of_speech]
          end
        end
      end

      # Emits attested forms grouped by morphology, both levels sorted.
      def paradigm_to_xml(builder, data)
        return if data[:paradigm].empty?

        builder.paradigm do
          data[:paradigm].sort_by(&:first).each do |morphology, forms|
            builder.slot1 morphology: morphology do
              forms.sort_by(&:first).each do |form, n|
                builder.slot2 form: form, n: n
              end
            end
          end
        end
      end

      # Emits valency frames in the order given by
      # PROIEL::Valency::Obliqueness.sort_frames, each with its arguments and
      # the IDs of the tokens attesting it, split by partition (:a, then :r).
      def valency_to_xml(builder, data)
        return if data[:valency].empty?

        builder.valency do
          frames =
            data[:valency].map do |arguments, token_ids|
              { arguments: arguments, tokens: token_ids }
            end

          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
            builder.frame do
              builder.arguments do
                frame[:arguments].each { |argument| builder.argument argument }
              end

              %i[a r].each do |flag|
                token_ids = frame[:tokens][flag]
                next if token_ids.empty?

                builder.tokens flags: flag.to_s, n: token_ids.count do
                  token_ids.each { |token_id| builder.token id: token_id }
                end
              end
            end
          end
        end
      end

      # Records, for every lemma that occurs with more than one part of
      # speech, the keys of the other entries sharing that lemma.
      def index_homographs!
        @lemmata.keys.group_by { |key| @lemmata[key][:lemma] }.each do |_lemma, keys|
          next unless keys.count > 1

          keys.each do |key|
            @lemmata[key][:homographs] = keys.reject { |other| other == key }
          end
        end
      end

      # Updates the entry for the token's lemma and part of speech. Tokens
      # lacking either are ignored.
      def index_token!(token)
        return unless token.lemma && token.part_of_speech

        encoded_lemma = [token.lemma, token.part_of_speech].join(',')

        @lemmata[encoded_lemma] ||= {
          lemma: token.lemma,
          part_of_speech: token.part_of_speech,
          distribution: {},
          glosses: {},
          homographs: [],
          paradigm: {},
          n: 0,
          valency: {},
        }

        lemma = @lemmata[encoded_lemma]

        lemma[:distribution][token.source.id] ||= 0
        lemma[:distribution][token.source.id] += 1

        forms = (lemma[:paradigm][token.morphology] ||= {})
        forms[token.form] ||= 0
        forms[token.form] += 1

        lemma[:n] += 1

        # Only verbal nodes contribute valency frames
        return unless token.part_of_speech.start_with?('V')

        frame = PROIEL::Valency::Arguments.get_argument_frame(token)
        entry = (lemma[:valency][frame] ||= { a: [], r: [] })

        # Tokens with an 'aux' dependent tagged 'Pk' are filed under :r, all
        # others under :a.
        partition =
          if token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
            :r
          else
            :a
          end

        entry[partition] << token.id
      end
    end
  end
end
|
data/lib/proiel/div.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -135,5 +135,21 @@ module PROIEL
|
|
135
135
|
end
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
# Looks up the div this div is aligned to, if it has an alignment ID.
#
# @param aligned_source [#treebank] source whose treebank is searched with
#   +find_div+
#
# @return [Div, NilClass] aligned div
def alignment(aligned_source)
  aligned_source.treebank.find_div(alignment_id) if alignment_id
end
|
145
|
+
|
146
|
+
# Infers aligned divs from the inferred alignments of the sentences in this
# div: every aligned sentence contributes its div, duplicates removed.
#
# @param aligned_source [Object] passed through to
#   +Sentence#inferred_alignment+
#
# @return [Array<Div>] inferred aligned divs
def inferred_alignment(aligned_source)
  aligned_sentences = sentences.map { |sentence| sentence.inferred_alignment(aligned_source) }
  aligned_sentences.flatten.compact.map(&:div).uniq
end
|
138
154
|
end
|
139
155
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -16,9 +16,11 @@ module PROIEL
|
|
16
16
|
# Creates a new validator for a PROIEL XML file.
|
17
17
|
#
|
18
18
|
# @param filename [String] name of PROIEL XML file to validate
|
19
|
+
# @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
|
19
20
|
#
|
20
|
-
def initialize(filename)
|
21
|
+
def initialize(filename, aligned_filename = nil)
|
21
22
|
@filename = filename
|
23
|
+
@aligned_filename = aligned_filename
|
22
24
|
@errors = []
|
23
25
|
end
|
24
26
|
|
@@ -154,6 +156,27 @@ module PROIEL
|
|
154
156
|
end
|
155
157
|
end
|
156
158
|
|
159
|
+
# Pass 5: if div is aligned, sentences and tokens within should belong
|
160
|
+
# to aligned div(s); if sentence aligned, tokens within should belong
|
161
|
+
# to aligned sentence(s). Skip if no alignment_id on source (see pass
|
162
|
+
# 4) or if aligned source not available.
|
163
|
+
if @aligned_filename
|
164
|
+
aligned_tb = PROIEL::Treebank.new
|
165
|
+
aligned_tb.load_from_xml(@aligned_filename)
|
166
|
+
|
167
|
+
tb.sources.each do |source|
|
168
|
+
if source.alignment_id
|
169
|
+
aligned_source = aligned_tb.find_source(source.alignment_id)
|
170
|
+
|
171
|
+
if aligned_source
|
172
|
+
check_alignment_integrity(errors, source, aligned_source)
|
173
|
+
else
|
174
|
+
errors << "Aligned source not available in treebank"
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
157
180
|
# Decide if there were any errors
|
158
181
|
if errors.empty?
|
159
182
|
true
|
@@ -182,6 +205,52 @@ module PROIEL
|
|
182
205
|
errors << "Token #{token.id}: #{attribute_name} is null"
|
183
206
|
end
|
184
207
|
end
|
208
|
+
|
209
|
+
# Checks that token, sentence and div alignments in +source+ are mutually
# consistent with respect to +aligned_source+.
#
# For each sentence the aligned tokens are resolved in the aligned source's
# treebank; a token aligned to a non-existent token is reported. The
# sentences (and, one level up, the divs) that these target tokens belong to
# form the inferred alignment, which must equal the declared +alignment_id+
# of the sentence (or div) whenever one is declared.
#
# Declared and inferred IDs are both converted to strings and sorted in the
# same way before comparison, so that a declared list such as "9,10" matches
# inferred IDs 9 and 10 regardless of whether IDs are integers or strings.
#
# @param errors [Array<String>] collector that error messages are appended to
# @param source [#divs] source whose alignments are checked
# @param aligned_source [#id, #treebank] source that alignments point into
def check_alignment_integrity(errors, source, aligned_source)
  # Order-independent textual form of a list of IDs
  canonical = ->(ids) { ids.map(&:to_s).sort.join(',') }

  source.divs.each do |div|
    target_sentences =
      div.sentences.map do |sentence|
        target_tokens =
          sentence.tokens.select(&:alignment_id).map do |token|
            # Check that target token exists in aligned source
            aligned_token = aligned_source.treebank.find_token(token.alignment_id)

            unless aligned_token
              errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
            end

            aligned_token
          end

        inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq

        if sentence.alignment_id
          a = canonical.call(sentence.alignment_id.to_s.split(','))
          i = canonical.call(inferred_target_sentences.map(&:id))

          # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
          if a != i
            errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
          end
        end

        inferred_target_sentences
      end

    inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq

    if div.alignment_id
      a = canonical.call(div.alignment_id.to_s.split(','))
      i = canonical.call(inferred_target_divs.map(&:id))

      # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
      if a != i
        errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
      end
    end
  end
end
|
185
254
|
end
|
186
255
|
end
|
187
256
|
end
|
data/lib/proiel/sentence.rb
CHANGED
@@ -116,7 +116,7 @@ module PROIEL
|
|
116
116
|
# @return [String] the printable form of the sentence
|
117
117
|
def printable_form(options = {})
|
118
118
|
[presentation_before,
|
119
|
-
@children.map { |t| t.printable_form(options) },
|
119
|
+
@children.reject(&:is_empty?).map { |t| t.printable_form(options) },
|
120
120
|
presentation_after].compact.join
|
121
121
|
end
|
122
122
|
|
@@ -217,5 +217,21 @@ module PROIEL
|
|
217
217
|
def tokens
  # Hand out an enumerator over @children rather than the collection itself
  @children.enum_for(:each)
end
|
220
|
+
|
221
|
+
# Looks up the sentence this sentence is aligned to, if it has an alignment
# ID.
#
# @param aligned_source [#treebank] source whose treebank is searched with
#   +find_sentence+
#
# @return [Sentence, NilClass] aligned sentence
def alignment(aligned_source)
  if alignment_id
    aligned_source.treebank.find_sentence(alignment_id)
  else
    nil
  end
end
|
227
|
+
|
228
|
+
# Infers aligned sentences from token alignments: every token that has an
# alignment ID is resolved via +Token#alignment+ and contributes the sentence
# of its aligned token, duplicates removed.
#
# @param aligned_source [Object] passed through to +Token#alignment+
#
# @return [Array<Sentence>] inferred aligned sentences
def inferred_alignment(aligned_source)
  aligned_tokens = tokens.select(&:alignment_id).map { |token| token.alignment(aligned_source) }
  aligned_tokens.flatten.compact.map(&:sentence).uniq
end
|
220
236
|
end
|
221
237
|
end
|
data/lib/proiel/token.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -160,12 +160,13 @@ module PROIEL
|
|
160
160
|
# Returns the printable form of the token with any presentation data.
|
161
161
|
#
|
162
162
|
# @param custom_token_formatter [Lambda] formatting function for tokens
|
163
|
+
# which is passed the token as its sole argument
|
163
164
|
#
|
164
165
|
# @return [String] the printable form of the token
|
165
166
|
def printable_form(custom_token_formatter: nil)
|
166
167
|
printable_form =
|
167
168
|
if custom_token_formatter
|
168
|
-
custom_token_formatter.call(
|
169
|
+
custom_token_formatter.call(self)
|
169
170
|
else
|
170
171
|
form
|
171
172
|
end
|
@@ -393,6 +394,13 @@ module PROIEL
|
|
393
394
|
common_ancestors(other_token, inclusive: inclusive).first
|
394
395
|
end
|
395
396
|
|
397
|
+
# Looks up the token this token is aligned to, if it has an alignment ID.
#
# @param aligned_source [#treebank] source whose treebank is searched with
#   +find_token+
#
# @return [Token, NilClass] aligned token
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_token(alignment_id)
end
|
403
|
+
|
396
404
|
private
|
397
405
|
|
398
406
|
# FIXME: extract this from the header of the PROIEL XML file instead and
|
@@ -0,0 +1,147 @@
|
|
1
|
+
module PROIEL
  module Valency
    # Derivation of argument frames for predicates.
    #
    # All methods are singleton methods. Apart from +get_argument_frame+ they
    # are internal helpers; they are nevertheless publicly callable, since a
    # bare +private+ does not apply to methods defined with +def self.+.
    module Arguments
      # Maps the major part of speech (or the empty token sort) of a token to
      # the class that decides how coordinated dependents are compared.
      POS_CLASSIFICATION = {
        'R' => :functor,
        'G' => :functor,
        'N' => :nominal,
        'P' => :nominal,
        'A' => :nominal,
        'M' => :nominal,
        'V' => :verbal,
      }.freeze

      # Computes the argument frame of a token: one hash per argument with its
      # relation and the features relevant to it, ordered by
      # PROIEL::Valency::Obliqueness.sort_arguments.
      #
      # @param token [Token] predicate
      #
      # @return [Array<Hash>] argument frame
      #
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.get_argument_frame(token)
        frame =
          collect_arguments(token).map do |argument|
            hoisted = hoist_dependents(argument)
            { relation: hoisted.relation }.merge(extract_features(hoisted))
          end

        PROIEL::Valency::Obliqueness.sort_arguments(frame)
      end

      # Collapses dependents based on features
      #
      # @return [Token, NilClass] the first dependent if all dependents are
      #   equivalent, otherwise +nil+
      def self.collapse_dependents(dependents)
        # Hoist dependents if any of the dependents is a coordinator
        dependents = dependents.map { |d| hoist_dependents(d) }

        # Figure out if all dependents are equivalent for the purposes of
        # argument frames. Typical examples would be coordinated, identical
        # prepositions (which is operationalised as same lemma, same POS, no
        # case) or coordinated nouns in the same case (which is operationalised
        # as same major POS, same case). If we fail to figure out a way to
        # hoist and reduce arguments, we keep the coordinator.
        majors = dependents.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
        return nil unless majors.length == 1

        distinguishing =
          case majors.first
          when :functor
            dependents.map(&:lemma)
          when :nominal
            dependents.map { |d| d.morphology_hash[:case] }
          when :verbal
            dependents.map { |d| d.morphology_hash[:mood] }
          end

        # +distinguishing+ is nil when the shared class is unknown
        if distinguishing && distinguishing.uniq.length == 1
          dependents.first
        else
          nil
        end
      end

      # Hoists the real argument dependents from conjoined arguments
      #
      # @return [Token] the collapsed conjunct, or +argument+ itself if it is
      #   not a coordinator or its conjuncts cannot be collapsed
      def self.hoist_dependents(argument)
        return argument unless argument.part_of_speech == 'C-' || argument.empty_token_sort == 'C'

        # Pick dependents that have the same relation as the coordinator. This
        # eliminates auxiliary elements like particles and repeated
        # conjunctions as well as attributes that scope over all conjuncts.
        dependents = argument.dependents.select { |d| d.relation == argument.relation }

        collapse_dependents(dependents) || argument
      end

      # Extracts morphosyntactic features that are relevant to the argument frame
      #
      # @return [Hash] subset of +:lemma+, +:part_of_speech+, +:case+, +:mood+
      def self.extract_features(argument)
        {}.tap do |features|
          case argument.part_of_speech_hash[:major]
          when 'G'
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech

            # All relevant dependents have the relation PRED.
            mood = governed_feature(argument, 'pred', :mood)
            features[:mood] = mood if mood
          when 'R'
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech

            # All relevant dependents have the relation OBL.
            kase = governed_feature(argument, 'obl', :case)
            features[:case] = kase if kase
          when 'V'
            features[:mood] = argument.morphology_hash[:mood] if argument.morphology_hash[:mood]
          when 'D'
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech
          when 'P'
            features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
            if argument.part_of_speech == 'Pk' # reflexive personal pronoun
              features[:lemma] = argument.lemma
              features[:part_of_speech] = argument.part_of_speech
            end
          else
            features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
          end
        end
      end

      # Determines the arguments of a predicate
      #
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.collect_arguments(token)
        token.dependents.select do |dependent|
          case dependent.relation
          when 'obj', 'obl', 'xobj', 'comp', 'narg' # arguments
            true
          when 'arg' # unspecific but always an argument
            true
          when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl' # non-arguments
            false
          when 'adnom', 'nonsub', 'per' # unspecific and undetermined with respect to argumenthood
            false
          when 'rel' # unspecific but never an argument
            false
          when 'pred', 'parpred', 'voc' # shouldn't happen
            false
          when 'pid', 'xsub' # really shouldn't happen
            false
          else
            raise "unknown relation #{dependent.relation.inspect}"
          end
        end
      end

      # Returns the value of a morphological feature of the element governed
      # by a functor. There may be multiple dependents and dependents may be
      # headed by coordinators, so the dependents with the given relation are
      # hoisted and collapsed first.
      def self.governed_feature(argument, relation, feature)
        dependents = argument.dependents.select { |d| d.relation == relation }.map { |d| hoist_dependents(d) }
        local_argument = collapse_dependents(dependents)
        local_argument && local_argument.morphology_hash[feature]
      end
      private_class_method :governed_feature
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module PROIEL
  module Valency
    # A valency lexicon: for each lemma and part of speech, the argument
    # frames attested for it and the IDs of the tokens attesting each frame.
    class Lexicon
      # Nested hash: lemma => part of speech => argument frame =>
      # +{ a: [token IDs], r: [token IDs] }+.
      attr_reader :frames

      def initialize
        @source_ids = Set.new
        @source_languages = Set.new
        @frames = {}
      end

      # Generates a valency lexicon from the provided sources. In practice the
      # sources should be in the same language but this is not enforced. This
      # makes it possible to generate a lexicon from sources in closely related
      # languages or dialects.
      #
      # @param source [Source] source whose verbal tokens are indexed
      def add_source!(source)
        @source_ids << source.id
        @source_languages << source.language

        source.sentences.each do |sentence|
          find_verbal_nodes(sentence).each do |token|
            frame = PROIEL::Valency::Arguments.get_argument_frame(token)

            by_part_of_speech = (@frames[token.lemma] ||= {})
            by_frame = (by_part_of_speech[token.part_of_speech] ||= {})
            entry = (by_frame[frame] ||= { a: [], r: [] })

            entry[partition_of(token)] << token.id
          end
        end
      end

      # Returns the frames recorded for a lemma and part of speech, ordered by
      # PROIEL::Valency::Obliqueness.sort_frames. A lemma or part of speech
      # that has not been recorded yields the sorted empty list rather than
      # an error.
      #
      # @param lemma [String]
      # @param part_of_speech [String]
      #
      # @return [Array<Hash>] hashes with keys +:arguments+ and +:tokens+
      def lookup(lemma, part_of_speech)
        recorded = @frames.fetch(lemma, {}).fetch(part_of_speech, {})

        frames =
          recorded.map do |arguments, token_ids|
            { arguments: arguments, tokens: token_ids }
          end

        PROIEL::Valency::Obliqueness.sort_frames(frames)
      end

      private

      # Tokens with an 'aux' dependent tagged 'Pk' are filed under :r, all
      # others under :a.
      def partition_of(token)
        if token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
          :r
        else
          :a
        end
      end

      # Find verbal nodes in a sentence
      def find_verbal_nodes(sentence)
        sentence.tokens.select do |token|
          # FIXME: is this test in the proiel library already?
          (token.part_of_speech && token.part_of_speech[/^V/]) || token.empty_token_sort == 'V'
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module PROIEL
  module Valency
    # Ordering of arguments and argument frames by obliqueness.
    #
    # All methods are singleton methods. The +obliqueness_of_*+ methods are
    # internal helpers; they are nevertheless publicly callable, since a bare
    # +private+ does not apply to methods defined with +def self.+.
    module Obliqueness
      # Relations from least to most oblique. Relations not listed rank after
      # all listed ones.
      OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze

      # Sorts frames by obliqueness
      #
      # @param frames [Array<Hash>] hashes with an +:arguments+ key
      #
      # @return [Array<Hash>] sorted copy
      def self.sort_frames(frames)
        # Sort frames by obliqueness, then by inspecting them so that we get
        # a stable, reproducible order.
        frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
      end

      # Sorts arguments by obliqueness
      #
      # @param arguments [Array<Hash>] hashes with a +:relation+ key and
      #   optionally a +:lemma+ key
      #
      # @return [Array<Hash>] sorted copy
      def self.sort_arguments(arguments)
        # Equally oblique arguments are ordered by inspecting them so that the
        # result does not depend on the order of the input.
        arguments.sort_by { |argument| [obliqueness_of_argument(argument), argument.inspect] }
      end

      def self.obliqueness_of_arguments(arguments)
        arguments.map { |argument| obliqueness_of_argument(argument) }
      end

      # An argument with a lemma ranks directly after one with the same
      # relation but no lemma.
      def self.obliqueness_of_argument(argument)
        obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
      end

      def self.obliqueness_of_relation(relation)
        OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
      end
    end
  end
end
|
data/lib/proiel/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -30,28 +30,28 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: '1.8'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: '1.8'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: sax-machine
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.3
|
47
|
+
version: '1.3'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.3
|
54
|
+
version: '1.3'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: memoist
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,48 +66,62 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.12'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: builder
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.2'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.2'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - "~>"
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: '1.
|
89
|
+
version: '1.15'
|
76
90
|
type: :development
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
94
|
- - "~>"
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: '1.
|
96
|
+
version: '1.15'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rake
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
103
|
+
version: '12.0'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
110
|
+
version: '12.0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: rspec
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '3.
|
117
|
+
version: '3.6'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '3.
|
124
|
+
version: '3.6'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: pry
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,14 +142,14 @@ dependencies:
|
|
128
142
|
requirements:
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0.
|
145
|
+
version: '0.14'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0.
|
152
|
+
version: '0.14'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
154
|
name: yard
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,7 +178,10 @@ files:
|
|
164
178
|
- bin/setup
|
165
179
|
- lib/proiel.rb
|
166
180
|
- lib/proiel/annotation_schema.rb
|
181
|
+
- lib/proiel/chronology.rb
|
167
182
|
- lib/proiel/citations.rb
|
183
|
+
- lib/proiel/dictionary.rb
|
184
|
+
- lib/proiel/dictionary/builder.rb
|
168
185
|
- lib/proiel/div.rb
|
169
186
|
- lib/proiel/positional_tag.rb
|
170
187
|
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
|
@@ -183,6 +200,10 @@ files:
|
|
183
200
|
- lib/proiel/treebank.rb
|
184
201
|
- lib/proiel/treebank_object.rb
|
185
202
|
- lib/proiel/utils.rb
|
203
|
+
- lib/proiel/valency.rb
|
204
|
+
- lib/proiel/valency/arguments.rb
|
205
|
+
- lib/proiel/valency/lexicon.rb
|
206
|
+
- lib/proiel/valency/obliqueness.rb
|
186
207
|
- lib/proiel/version.rb
|
187
208
|
- lib/proiel/visualization.rb
|
188
209
|
- lib/proiel/visualization/graphviz.rb
|
@@ -201,7 +222,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
201
222
|
requirements:
|
202
223
|
- - ">="
|
203
224
|
- !ruby/object:Gem::Version
|
204
|
-
version: '2.
|
225
|
+
version: '2.2'
|
205
226
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
206
227
|
requirements:
|
207
228
|
- - ">="
|
@@ -209,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
209
230
|
version: '0'
|
210
231
|
requirements: []
|
211
232
|
rubyforge_project:
|
212
|
-
rubygems_version: 2.
|
233
|
+
rubygems_version: 2.7.4
|
213
234
|
signing_key:
|
214
235
|
specification_version: 4
|
215
236
|
summary: A library for working with treebanks using the PROIEL dependency format
|