proiel 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +2 -2
- data/lib/proiel.rb +6 -1
- data/lib/proiel/chronology.rb +80 -0
- data/lib/proiel/dictionary.rb +3 -0
- data/lib/proiel/dictionary/builder.rb +201 -0
- data/lib/proiel/div.rb +17 -1
- data/lib/proiel/proiel_xml/validator.rb +71 -2
- data/lib/proiel/sentence.rb +17 -1
- data/lib/proiel/token.rb +10 -2
- data/lib/proiel/valency.rb +5 -0
- data/lib/proiel/valency/arguments.rb +147 -0
- data/lib/proiel/valency/lexicon.rb +59 -0
- data/lib/proiel/valency/obliqueness.rb +31 -0
- data/lib/proiel/version.rb +2 -2
- metadata +37 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
|
4
|
+
data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
|
7
|
+
data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ PROIEL annotation scheme and the PROIEL XML-based interchange format.
|
|
12
12
|
|
13
13
|
## Installation
|
14
14
|
|
15
|
-
|
15
|
+
This library requires Ruby >= 2.2. Install as
|
16
16
|
|
17
17
|
```shell
|
18
18
|
gem install proiel
|
@@ -35,7 +35,7 @@ bundle
|
|
35
35
|
```
|
36
36
|
|
37
37
|
To download a sample treebank, initialize a new git repository and add the
|
38
|
-
[PROIEL treebank](
|
38
|
+
[PROIEL treebank](https://proiel.github.io) as a submodule:
|
39
39
|
|
40
40
|
```shell
|
41
41
|
git init
|
data/lib/proiel.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -13,6 +13,8 @@ require 'nokogiri'
|
|
13
13
|
require 'singleton'
|
14
14
|
require 'erb'
|
15
15
|
require 'open3'
|
16
|
+
require 'set'
|
17
|
+
require 'builder'
|
16
18
|
|
17
19
|
require 'proiel/version'
|
18
20
|
require 'proiel/utils'
|
@@ -31,3 +33,6 @@ require 'proiel/div'
|
|
31
33
|
require 'proiel/sentence'
|
32
34
|
require 'proiel/token'
|
33
35
|
require 'proiel/visualization'
|
36
|
+
require 'proiel/chronology'
|
37
|
+
require 'proiel/valency'
|
38
|
+
require 'proiel/dictionary'
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016-2017 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
|
7
|
+
# Methods for parsing chronological descriptions. Extra care is taken to get
|
8
|
+
# the interpretation of centuries and ranges involving the transition between 1
|
9
|
+
# BC and AD 1 correct.
|
10
|
+
module PROIEL
  # Methods for parsing chronological descriptions. Extra care is taken to get
  # the interpretation of centuries and ranges involving the transition between
  # 1 BC and AD 1 correct.
  module Chronology
    # A single year, optionally prefixed by "c." and optionally suffixed by
    # "BC". Anchored with \A/\z so that only the whole string can match.
    YEAR_PATTERN = /\A\s*(?:c\.\s+)?(\d+)(\s+BC)?\s*\z/

    # A century such as "1st c.", "21st c." or "3rd c. BC".
    CENTURY_PATTERN = /\A\s*(\d+)(?:st|nd|rd|th)\s+c\.(\s+BC)?\s*\z/

    # Two years separated by a single hyphen.
    RANGE_PATTERN = /\A\s*(?:c\.\s+)?\d+(?:\s+BC)?\s*-\s*(?:c\.\s+)?\d+(?:\s+BC)?\s*\z/

    # Computes the chronological midpoint of a chronological description.
    #
    # @param s [String] chronological description
    #
    # @return [Integer]
    #
    # @raise [ArgumentError] if the description cannot be parsed
    #
    # @example
    #   midpoint('1000')      # => 1000
    #   midpoint('1000 BC')   # => -1000
    #   midpoint('1000-1020') # => 1010
    def self.midpoint(s)
      value = parse(s)

      case value
      when Integer
        value
      when Array
        from, to = value.first, value.last

        if from < 0 && to > 0
          # There is no year 0: shift the AD years down by one so that the
          # scale is continuous, take the midpoint, and shift it back up again
          # unless it fell on the BC side. Integer division floors, so a
          # non-integer midpoint lands in the year of its integer part.
          shifted = from + to - 1
          shifted < 0 ? shifted / 2 : shifted / 2 + 1
        else
          (from + to) / 2
        end
      else
        raise ArgumentError, 'integer or array expected'
      end
    end

    # Parses a chronological description. The syntax of chronological
    # descriptions is explained in the [PROIEL XML
    # documentation](http://proiel.github.io/handbook/developer/proielxml.html#chronological-data).
    #
    # @param s [String] chronological description
    #
    # @return [Integer, Array<Integer,Integer>] a year, or a pair of first and
    #   last year; BC years are negative
    #
    # @raise [ArgumentError] if the format is not recognised, a year or
    #   century is zero, or a range does not run forwards
    #
    # @example
    #   parse('1000')         # => 1000
    #   parse('1000 BC')      # => -1000
    #   parse('1000-1020')    # => [1000,1020]
    #   parse('1000 BC-1020') # => [-1000,1020]
    #   parse('21st c.')      # => [2001,2100]
    def self.parse(s)
      case s
      when YEAR_PATTERN
        year = Regexp.last_match(1).to_i
        year = -year if Regexp.last_match(2)

        # There is no year zero in the Julian calendar
        raise ArgumentError, 'invalid year' if year.zero?

        year
      when CENTURY_PATTERN
        century = Regexp.last_match(1).to_i

        # A "0th" century would produce a range that includes year zero
        raise ArgumentError, 'invalid century' if century.zero?

        last = century * 100
        Regexp.last_match(2) ? [-last, -last + 99] : [last - 99, last]
      when RANGE_PATTERN
        from, to = s.split('-').map { |part| parse(part) }

        raise ArgumentError, 'invalid range' unless from < to

        [from, to]
      else
        raise ArgumentError, 'unexpected format'
      end
    end
  end
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2016-2017 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
|
7
|
+
# Methods for synthesising and manipulating dictionaries from treebank data.
|
8
|
+
module PROIEL
  # Methods for synthesising and manipulating dictionaries from treebank data.
  module Dictionary
    # Accumulates per-lemma statistics (distribution over sources, paradigm,
    # homographs and valency frames) from one or more sources and serialises
    # the result as XML.
    class Builder
      CURRENT_SCHEMA_VERSION = '3.0'

      # Keys under which token IDs are stored for each valency frame. Tokens
      # with an 'aux' dependent whose part of speech is 'Pk' go under :r, all
      # others under :a. The key name is also emitted as the `flags` attribute.
      TOKEN_PARTITIONS = [:a, :r].freeze

      attr_reader :license
      attr_reader :language
      attr_reader :sources
      attr_reader :lemmata

      def initialize
        @language = nil
        @license = nil
        @sources = []
        @lemmata = {}
        @valency = PROIEL::Valency::Lexicon.new
      end

      # Indexes all tokens of a source. The first source added fixes the
      # language and license of the dictionary; later sources must match.
      #
      # @param source [PROIEL::Source] source to add
      #
      # @raise [ArgumentError] if `source` is not a source or if its language
      #   or license differs from that of previously added sources
      def add_source!(source)
        raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
        raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
        raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

        @language ||= source.language
        @license ||= source.license
        @sources << source

        source.tokens.each { |token| index_token!(token) }

        index_homographs!
      end

      # Writes the dictionary as XML.
      #
      # @param io [#<<] target that the XML markup is appended to
      def to_xml(io)
        builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
        builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
        builder.proiel('export-time': export_time, 'schema-version': CURRENT_SCHEMA_VERSION) do
          builder.dictionary(language: @language) do
            builder.sources do
              @sources.each do |source|
                builder.source(id: source.id, license: source.license)
              end
            end

            builder.lemmata(n: @lemmata.count) do
              # The raw key is a tie-breaker so that keys differing only in
              # case always come out in the same order.
              @lemmata.sort_by { |key, _| [key.downcase, key] }.each do |form, data|
                lemma_to_xml(builder, form, data)
              end
            end
          end
        end
      end

      private

      # Current local time in the same layout as `DateTime#xmlschema`
      # (e.g. 2018-01-07T12:00:00+01:00), using only core `Time`.
      def export_time
        Time.now.strftime('%Y-%m-%dT%H:%M:%S%:z')
      end

      # NOTE(review): `form` is the encoded "lemma,part_of_speech" key, not the
      # bare lemma stored in data[:lemma] - confirm that this is what the
      # schema expects.
      def lemma_to_xml(builder, form, data)
        builder.lemma(form: form, part_of_speech: data[:part_of_speech], n: data[:n]) do
          distribution_to_xml(builder, data)
          glosses_to_xml(builder, data)
          homographs_to_xml(builder, data)
          paradigm_to_xml(builder, data)
          valency_to_xml(builder, data)
        end
      end

      def distribution_to_xml(builder, data)
        builder.distribution do
          data[:distribution].sort_by(&:first).each do |source_id, n|
            builder.source(id: source_id, n: n)
          end
        end
      end

      def glosses_to_xml(builder, data)
        return unless data[:glosses].count > 0

        builder.glosses do
          # TODO
        end
      end

      def homographs_to_xml(builder, data)
        return unless data[:homographs].count > 0

        builder.homographs do
          data[:homographs].each do |homograph|
            builder.lemma form: homograph
          end
        end
      end

      def paradigm_to_xml(builder, data)
        return if data[:paradigm].empty?

        builder.paradigm do
          data[:paradigm].sort_by(&:first).each do |morphology, forms|
            builder.slot1 morphology: morphology do
              forms.sort_by(&:first).each do |form, n|
                builder.slot2 form: form, n: n
              end
            end
          end
        end
      end

      def valency_to_xml(builder, data)
        return if data[:valency].empty?

        builder.valency do
          frames =
            data[:valency].map do |arguments, token_ids|
              { arguments: arguments, tokens: token_ids }
            end

          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
            builder.frame do
              builder.arguments do
                frame[:arguments].each do |argument|
                  builder.argument argument
                end
              end

              TOKEN_PARTITIONS.each do |partition|
                frame_tokens_to_xml(builder, partition, frame[:tokens][partition])
              end
            end
          end
        end
      end

      # Emits one <tokens> element for a partition, unless it has no tokens.
      def frame_tokens_to_xml(builder, partition, token_ids)
        return unless token_ids.count > 0

        builder.tokens flags: partition.to_s, n: token_ids.count do
          token_ids.each do |token_id|
            builder.token id: token_id
          end
        end
      end

      # Records, for every entry, the keys of the other entries that share its
      # lemma. Grouping uses the stored lemma rather than re-splitting the
      # encoded key, so a lemma containing a comma is grouped correctly.
      def index_homographs!
        @lemmata.keys.group_by { |key| @lemmata[key][:lemma] }.each do |_, homographs|
          next unless homographs.count > 1

          homographs.each do |form|
            @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
          end
        end
      end

      # Updates the entry for the token's lemma and part of speech. Tokens
      # without a lemma or a part of speech are ignored.
      def index_token!(token)
        return unless token.lemma && token.part_of_speech

        encoded_lemma = [token.lemma, token.part_of_speech].join(',')

        lemma = (@lemmata[encoded_lemma] ||= {
          lemma: token.lemma,
          part_of_speech: token.part_of_speech,
          distribution: {},
          glosses: {},
          homographs: [],
          paradigm: {},
          n: 0,
          valency: {},
        })

        lemma[:distribution][token.source.id] ||= 0
        lemma[:distribution][token.source.id] += 1

        forms = (lemma[:paradigm][token.morphology] ||= {})
        forms[token.form] ||= 0
        forms[token.form] += 1

        lemma[:n] += 1

        # Only verbal nodes contribute valency frames
        return unless token.part_of_speech.start_with?('V')

        frame = PROIEL::Valency::Arguments.get_argument_frame(token)
        entry = (lemma[:valency][frame] ||= { a: [], r: [] })

        reflexive = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
        entry[reflexive ? :r : :a] << token.id
      end
    end
  end
end
|
data/lib/proiel/div.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -135,5 +135,21 @@ module PROIEL
|
|
135
135
|
end
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
# Returns the aligned div if any.
|
140
|
+
#
|
141
|
+
# @return [Div, NilClass] aligned div
|
142
|
+
def alignment(aligned_source)
  # Without an alignment ID there is nothing to look up.
  return nil unless alignment_id

  aligned_source.treebank.find_div(alignment_id)
end
|
145
|
+
|
146
|
+
# Returns inferred aligned divs if any.
|
147
|
+
#
|
148
|
+
# @return [Array<Div>] inferred aligned divs
|
149
|
+
def inferred_alignment(aligned_source)
  # Collect the aligned sentences inferred for each sentence in this div,
  # then reduce them to the distinct divs that contain them.
  per_sentence = sentences.map { |sentence| sentence.inferred_alignment(aligned_source) }
  aligned_sentences = per_sentence.flatten.compact

  aligned_sentences.map(&:div).uniq
end
|
138
154
|
end
|
139
155
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -16,9 +16,11 @@ module PROIEL
|
|
16
16
|
# Creates a new validator for a PROIEL XML file.
|
17
17
|
#
|
18
18
|
# @param filename [String] name of PROIEL XML file to validate
|
19
|
+
# @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
|
19
20
|
#
|
20
|
-
def initialize(filename)
|
21
|
+
def initialize(filename, aligned_filename = nil)
  # The aligned file is optional; it stays nil when none is given.
  @filename, @aligned_filename = filename, aligned_filename
  @errors = []
end
|
24
26
|
|
@@ -154,6 +156,27 @@ module PROIEL
|
|
154
156
|
end
|
155
157
|
end
|
156
158
|
|
159
|
+
# Pass 5: if div is aligned, sentences and tokens within should belong
|
160
|
+
# to aligned div(s); if sentence aligned, tokens within should belong
|
161
|
+
# to aligned sentence(s). Skip if no alignment_id on source (see pass
|
162
|
+
# 4) or if aligned source not available.
|
163
|
+
if @aligned_filename
|
164
|
+
aligned_tb = PROIEL::Treebank.new
|
165
|
+
aligned_tb.load_from_xml(@aligned_filename)
|
166
|
+
|
167
|
+
tb.sources.each do |source|
|
168
|
+
if source.alignment_id
|
169
|
+
aligned_source = aligned_tb.find_source(source.alignment_id)
|
170
|
+
|
171
|
+
if aligned_source
|
172
|
+
check_alignment_integrity(errors, source, aligned_source)
|
173
|
+
else
|
174
|
+
errors << "Aligned source not available in treebank"
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
157
180
|
# Decide if there were any errors
|
158
181
|
if errors.empty?
|
159
182
|
true
|
@@ -182,6 +205,52 @@ module PROIEL
|
|
182
205
|
errors << "Token #{token.id}: #{attribute_name} is null"
|
183
206
|
end
|
184
207
|
end
|
208
|
+
|
209
|
+
# Checks that the alignments declared on sentences and divs agree with the
# alignments inferred from their tokens, appending a message to `errors` for
# every aligned token whose target does not exist and for every mismatch.
#
# @param errors [Array<String>] list that error messages are appended to
# @param source [Source] source whose alignments are checked
# @param aligned_source [Source] source that `source` is aligned to
def check_alignment_integrity(errors, source, aligned_source)
  source.divs.each do |div|
    target_sentences =
      div.sentences.map do |sentence|
        target_tokens =
          sentence.tokens.select(&:alignment_id).map do |token|
            # Check that target token exists in aligned source
            aligned_token = aligned_source.treebank.find_token(token.alignment_id)

            if aligned_token
              aligned_token
            else
              errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
              nil
            end
          end

        inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq

        if sentence.alignment_id
          # Both lists are sorted as strings. Sorting the inferred IDs in
          # their native type would order them differently from the declared
          # ones if IDs are integers ("10,9" vs "9,10") and report a false
          # mismatch.
          a = sentence.alignment_id.to_s.split(',').sort.join(',')
          i = inferred_target_sentences.map { |s| s.id.to_s }.sort.join(',')

          # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
          if a != i
            errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
          end
        end

        inferred_target_sentences
      end

    inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq

    if div.alignment_id
      # Same string-wise normalisation as for sentences above
      a = div.alignment_id.to_s.split(',').sort.join(',')
      i = inferred_target_divs.map { |d| d.id.to_s }.sort.join(',')

      # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
      if a != i
        errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
      end
    end
  end
end
|
185
254
|
end
|
186
255
|
end
|
187
256
|
end
|
data/lib/proiel/sentence.rb
CHANGED
@@ -116,7 +116,7 @@ module PROIEL
|
|
116
116
|
# @return [String] the printable form of the sentence
|
117
117
|
def printable_form(options = {})
  # Empty tokens have no surface form and are left out.
  visible_tokens = @children.reject(&:is_empty?)

  # The options are forwarded as keyword arguments: the token-level method
  # only accepts keywords, and passing the hash positionally raises
  # ArgumentError on Ruby 3.
  [presentation_before,
   visible_tokens.map { |t| t.printable_form(**options) },
   presentation_after].compact.join
end
|
122
122
|
|
@@ -217,5 +217,21 @@ module PROIEL
|
|
217
217
|
def tokens
|
218
218
|
@children.to_enum
|
219
219
|
end
|
220
|
+
|
221
|
+
# Returns the aligned sentence if any.
|
222
|
+
#
|
223
|
+
# @return [Sentence, NilClass] aligned sentence
|
224
|
+
def alignment(aligned_source)
  # Without an alignment ID there is nothing to look up.
  return nil unless alignment_id

  aligned_source.treebank.find_sentence(alignment_id)
end
|
227
|
+
|
228
|
+
# Returns inferred aligned sentences if any.
|
229
|
+
#
|
230
|
+
# @return [Array<Sentence>] inferred aligned sentences
|
231
|
+
def inferred_alignment(aligned_source)
  # Only tokens that carry an alignment ID can contribute a target.
  aligned_tokens = tokens.select(&:alignment_id)
  targets = aligned_tokens.map { |token| token.alignment(aligned_source) }

  # Reduce the target tokens to the distinct sentences that contain them.
  targets.flatten.compact.map(&:sentence).uniq
end
|
220
236
|
end
|
221
237
|
end
|
data/lib/proiel/token.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -160,12 +160,13 @@ module PROIEL
|
|
160
160
|
# Returns the printable form of the token with any presentation data.
|
161
161
|
#
|
162
162
|
# @param custom_token_formatter [Lambda] formatting function for tokens
|
163
|
+
# which is passed the token as its sole argument
|
163
164
|
#
|
164
165
|
# @return [String] the printable form of the token
|
165
166
|
def printable_form(custom_token_formatter: nil)
|
166
167
|
printable_form =
|
167
168
|
if custom_token_formatter
|
168
|
-
custom_token_formatter.call(
|
169
|
+
custom_token_formatter.call(self)
|
169
170
|
else
|
170
171
|
form
|
171
172
|
end
|
@@ -393,6 +394,13 @@ module PROIEL
|
|
393
394
|
common_ancestors(other_token, inclusive: inclusive).first
|
394
395
|
end
|
395
396
|
|
397
|
+
# Returns the aligned token if any.
|
398
|
+
#
|
399
|
+
# @return [Token, NilClass] aligned token
|
400
|
+
def alignment(aligned_source)
  # Without an alignment ID there is nothing to look up.
  return nil unless alignment_id

  aligned_source.treebank.find_token(alignment_id)
end
|
403
|
+
|
396
404
|
private
|
397
405
|
|
398
406
|
# FIXME: extract this from the header of the PROIEL XML file instead and
|
@@ -0,0 +1,147 @@
|
|
1
|
+
module PROIEL
  module Valency
    # Extraction of argument frames for predicates.
    #
    # Only `get_argument_frame` is intended as the entry point. The remaining
    # methods are helpers; they are singleton methods and therefore callable
    # from outside the module (a bare `private` does not apply to `def self.`
    # methods), and they are left that way so that existing callers keep
    # working.
    module Arguments
      # Coarse classes of major parts of speech (or empty token sorts) used
      # when deciding whether coordinated dependents are equivalent.
      POS_CLASSIFICATION = {
        'R' => :functor,
        'G' => :functor,
        'N' => :nominal,
        'P' => :nominal,
        'A' => :nominal,
        'M' => :nominal,
        'V' => :verbal,
      }.freeze

      # Relations whose dependents count as arguments; 'arg' is unspecific
      # but always an argument.
      ARGUMENT_RELATIONS = %w(obj obl xobj comp narg arg).freeze

      # Relations whose dependents never end up in the frame: non-arguments,
      # relations that are undetermined with respect to argumenthood
      # ('adnom', 'nonsub', 'per'), 'rel', and relations that should not
      # occur below a predicate ('pred', 'parpred', 'voc', 'pid', 'xsub').
      NON_ARGUMENT_RELATIONS = %w(
        aux sub ag adv xadv apos atr part expl
        adnom nonsub per
        rel
        pred parpred voc
        pid xsub
      ).freeze

      # Computes the argument frame of a predicate.
      #
      # @param token [Token] the predicate
      #
      # @return [Array<Hash>] one hash per argument with its relation and
      #   relevant features, sorted by obliqueness
      #
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.get_argument_frame(token)
        frame =
          collect_arguments(token).map do |argument|
            hoisted = hoist_dependents(argument)

            { relation: hoisted.relation }.merge(extract_features(hoisted))
          end

        PROIEL::Valency::Obliqueness.sort_arguments(frame)
      end

      # Collapses dependents based on features.
      #
      # Figures out if all dependents are equivalent for the purposes of
      # argument frames. Typical examples would be coordinated, identical
      # prepositions (which is operationalised as same lemma, same POS class)
      # or coordinated nouns in the same case (which is operationalised as
      # same POS class, same case).
      #
      # @return [Token, NilClass] the first dependent if all are equivalent,
      #   otherwise nil
      def self.collapse_dependents(dependents)
        # Hoist dependents if any of the dependents is a coordinator
        dependents = dependents.map { |d| hoist_dependents(d) }

        classes = dependents.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
        return nil unless classes.length == 1

        distinguishing =
          case classes.first
          when :functor
            dependents.map(&:lemma)
          when :nominal
            dependents.map { |d| d.morphology_hash[:case] }
          when :verbal
            dependents.map { |d| d.morphology_hash[:mood] }
          else
            # Unclassified part of speech
            return nil
          end

        distinguishing.uniq.length == 1 ? dependents.first : nil
      end

      # Hoists the real argument dependents from conjoined arguments. If the
      # conjuncts cannot be collapsed, the coordinator itself is kept.
      def self.hoist_dependents(argument)
        return argument unless argument.part_of_speech == 'C-' || argument.empty_token_sort == 'C'

        # Pick dependents that have the same relation as the coordinator. This
        # eliminates auxiliary elements like particles and repeated
        # conjunctions as well as attributes that scope over all conjuncts.
        conjuncts = argument.dependents.select { |d| d.relation == argument.relation }

        collapse_dependents(conjuncts) || argument
      end

      # Extracts morphosyntactic features that are relevant to the argument
      # frame. Keys are inserted in a fixed order.
      def self.extract_features(argument)
        features = {}

        case argument.part_of_speech_hash[:major]
        when 'G'
          features[:lemma] = argument.lemma
          features[:part_of_speech] = argument.part_of_speech

          # All relevant dependents have the relation PRED.
          mood = governed_feature(argument, 'pred', :mood)
          features[:mood] = mood if mood
        when 'R'
          features[:lemma] = argument.lemma
          features[:part_of_speech] = argument.part_of_speech

          # All relevant dependents have the relation OBL.
          kase = governed_feature(argument, 'obl', :case)
          features[:case] = kase if kase
        when 'V'
          features[:mood] = argument.morphology_hash[:mood] if argument.morphology_hash[:mood]
        when 'D'
          features[:lemma] = argument.lemma
          features[:part_of_speech] = argument.part_of_speech
        when 'P'
          features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]

          if argument.part_of_speech == 'Pk' # reflexive personal pronoun
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech
          end
        else
          features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
        end

        features
      end

      # Returns a morphological feature of the dependents that a functor
      # governs via `relation`, or nil if they cannot be collapsed or lack the
      # feature. There may be multiple dependents and they may be headed by
      # coordinators; `collapse_dependents` takes care of the hoisting.
      def self.governed_feature(argument, relation, feature)
        governed = argument.dependents.select { |d| d.relation == relation }
        head = collapse_dependents(governed)

        head && head.morphology_hash[feature]
      end

      # Determines the arguments of a predicate.
      #
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.collect_arguments(token)
        token.dependents.select do |dependent|
          relation = dependent.relation

          next true if ARGUMENT_RELATIONS.include?(relation)
          next false if NON_ARGUMENT_RELATIONS.include?(relation)

          raise "unknown relation #{relation.inspect}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module PROIEL
  module Valency
    # A valency lexicon: argument frames of verbal tokens, indexed by lemma
    # and part of speech, with the IDs of the tokens instantiating each frame.
    class Lexicon
      # @return [Hash] lemma => part of speech => frame => { a: [...], r: [...] }
      attr_reader :frames

      def initialize
        @source_ids = Set.new
        @source_languages = Set.new
        @frames = {}
      end

      # Generates a valency lexicon from the provided sources. In practice the
      # sources should be in the same language but this is not enforced. This
      # makes it possible to generate a lexicon from sources in closely related
      # languages or dialects.
      #
      # @param source [Source] source to index
      def add_source!(source)
        @source_ids << source.id
        @source_languages << source.language

        source.sentences.each do |sentence|
          find_verbal_nodes(sentence).each do |token|
            frame = PROIEL::Valency::Arguments.get_argument_frame(token)

            by_part_of_speech = (@frames[token.lemma] ||= {})
            by_frame = (by_part_of_speech[token.part_of_speech] ||= {})
            entry = (by_frame[frame] ||= { a: [], r: [] })

            entry[partition_of(token)] << token.id
          end
        end
      end

      # Looks up the frames recorded for a lemma and part of speech.
      #
      # @param lemma [String]
      # @param part_of_speech [String]
      #
      # @return [Array<Hash>] frames as `{ arguments:, tokens: }` hashes sorted
      #   by obliqueness; empty if nothing has been recorded for the lemma and
      #   part of speech
      def lookup(lemma, part_of_speech)
        by_part_of_speech = @frames[lemma]
        by_frame = by_part_of_speech && by_part_of_speech[part_of_speech]
        return [] unless by_frame

        frames =
          by_frame.map do |arguments, token_ids|
            { arguments: arguments, tokens: token_ids }
          end

        PROIEL::Valency::Obliqueness.sort_frames(frames)
      end

      private

      # Tokens with an 'aux' dependent whose part of speech is 'Pk' are filed
      # under :r, all others under :a.
      def partition_of(token)
        reflexive = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }

        reflexive ? :r : :a
      end

      # Find verbal nodes in a sentence
      def find_verbal_nodes(sentence)
        sentence.tokens.select do |token|
          # FIXME: is this test in the proiel library already?
          (token.part_of_speech && token.part_of_speech[/^V/]) || token.empty_token_sort == 'V'
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module PROIEL
  module Valency
    # Ordering of arguments and frames by obliqueness.
    #
    # `sort_frames` and `sort_arguments` are the entry points. The
    # `obliqueness_of_*` methods are helpers; being singleton methods they are
    # callable from outside the module (a bare `private` does not apply to
    # `def self.` methods) and are left that way for compatibility.
    module Obliqueness
      # Relations from least to most oblique. Relations not listed here rank
      # after all listed ones.
      OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze

      # Sorts frames by obliqueness.
      #
      # @param frames [Array<Hash>] frames with an `:arguments` array each
      #
      # @return [Array<Hash>] sorted frames
      def self.sort_frames(frames)
        # Sort frames by obliqueness, then by inspecting them so that we get
        # a stable, reproducible order.
        frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
      end

      # Sorts arguments by obliqueness.
      #
      # @param arguments [Array<Hash>] arguments with a `:relation` and
      #   optionally a `:lemma`
      #
      # @return [Array<Hash>] sorted arguments
      def self.sort_arguments(arguments)
        arguments.sort_by { |argument| obliqueness_of_argument(argument) }
      end

      # Ranks of all arguments, in the order given.
      def self.obliqueness_of_arguments(arguments)
        arguments.map { |argument| obliqueness_of_argument(argument) }
      end

      # Rank of a single argument: within the same relation, an argument with
      # a lemma ranks directly after one without.
      def self.obliqueness_of_argument(argument)
        obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
      end

      # Position of the relation in the hierarchy; unlisted relations get the
      # rank after the last listed one.
      def self.obliqueness_of_relation(relation)
        OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
      end
    end
  end
end
|
data/lib/proiel/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -30,28 +30,28 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: '1.8'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: '1.8'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: sax-machine
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.3
|
47
|
+
version: '1.3'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.3
|
54
|
+
version: '1.3'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: memoist
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,48 +66,62 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.12'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: builder
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.2'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.2'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - "~>"
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: '1.
|
89
|
+
version: '1.15'
|
76
90
|
type: :development
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
94
|
- - "~>"
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: '1.
|
96
|
+
version: '1.15'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rake
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
101
|
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
103
|
+
version: '12.0'
|
90
104
|
type: :development
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
108
|
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
110
|
+
version: '12.0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: rspec
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - "~>"
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version: '3.
|
117
|
+
version: '3.6'
|
104
118
|
type: :development
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version: '3.
|
124
|
+
version: '3.6'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: pry
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,14 +142,14 @@ dependencies:
|
|
128
142
|
requirements:
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0.
|
145
|
+
version: '0.14'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '0.
|
152
|
+
version: '0.14'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
154
|
name: yard
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,7 +178,10 @@ files:
|
|
164
178
|
- bin/setup
|
165
179
|
- lib/proiel.rb
|
166
180
|
- lib/proiel/annotation_schema.rb
|
181
|
+
- lib/proiel/chronology.rb
|
167
182
|
- lib/proiel/citations.rb
|
183
|
+
- lib/proiel/dictionary.rb
|
184
|
+
- lib/proiel/dictionary/builder.rb
|
168
185
|
- lib/proiel/div.rb
|
169
186
|
- lib/proiel/positional_tag.rb
|
170
187
|
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
|
@@ -183,6 +200,10 @@ files:
|
|
183
200
|
- lib/proiel/treebank.rb
|
184
201
|
- lib/proiel/treebank_object.rb
|
185
202
|
- lib/proiel/utils.rb
|
203
|
+
- lib/proiel/valency.rb
|
204
|
+
- lib/proiel/valency/arguments.rb
|
205
|
+
- lib/proiel/valency/lexicon.rb
|
206
|
+
- lib/proiel/valency/obliqueness.rb
|
186
207
|
- lib/proiel/version.rb
|
187
208
|
- lib/proiel/visualization.rb
|
188
209
|
- lib/proiel/visualization/graphviz.rb
|
@@ -201,7 +222,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
201
222
|
requirements:
|
202
223
|
- - ">="
|
203
224
|
- !ruby/object:Gem::Version
|
204
|
-
version: '2.
|
225
|
+
version: '2.2'
|
205
226
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
206
227
|
requirements:
|
207
228
|
- - ">="
|
@@ -209,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
209
230
|
version: '0'
|
210
231
|
requirements: []
|
211
232
|
rubyforge_project:
|
212
|
-
rubygems_version: 2.
|
233
|
+
rubygems_version: 2.7.4
|
213
234
|
signing_key:
|
214
235
|
specification_version: 4
|
215
236
|
summary: A library for working with treebanks using the PROIEL dependency format
|