proiel-cli 1.2.0 → 1.3.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +18 -3
- data/bin/proiel +1 -1
- data/lib/proiel/cli/commands/build.rb +91 -0
- data/lib/proiel/cli/commands/convert.rb +7 -2
- data/lib/proiel/cli/commands/dictionary.rb +46 -0
- data/lib/proiel/cli/commands/info.rb +1 -1
- data/lib/proiel/cli/commands/shell.rb +34 -0
- data/lib/proiel/cli/commands/tokenize.rb +2 -2
- data/lib/proiel/cli/commands/validate.rb +6 -4
- data/lib/proiel/cli/commands/visualize.rb +14 -11
- data/lib/proiel/cli/converters/conll-u/morphology.rb +162 -72
- data/lib/proiel/cli/converters/conll-u/syntax.rb +108 -62
- data/lib/proiel/cli/converters/conll-u.rb +648 -548
- data/lib/proiel/cli/converters/conll-x.rb +67 -52
- data/lib/proiel/cli/converters/lexc.rb +21 -23
- data/lib/proiel/cli/converters/proielxml.rb +173 -132
- data/lib/proiel/cli/converters/text.rb +69 -71
- data/lib/proiel/cli/converters/tiger.rb +110 -114
- data/lib/proiel/cli/converters/tiger2.rb +139 -141
- data/lib/proiel/cli/converters/tnt.rb +19 -15
- data/lib/proiel/cli/version.rb +1 -1
- data/lib/proiel/cli.rb +26 -1
- metadata +43 -58
- data/bin/setup +0 -8
- data/contrib/proiel-tnt-train +0 -15
- data/lib/proiel/cli/commands.rb +0 -28
@@ -1,680 +1,780 @@
 require 'proiel/cli/converters/conll-u/morphology'
 require 'proiel/cli/converters/conll-u/syntax'
 
-
-#
-#
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                error_count += 1
-                STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
-                STDERR.puts e.backtrace.join("\n") unless e.is_a? RuntimeError
-              end
+module PROIEL::Converter
+  # Converter that outputs CoNLL-U.
+  #
+  # This converter relies on certain assumptions about correct linguistic
+  # annotation in order to produce a meaningful representation in CoNLL-U.
+  class CoNLLU
+    class << self
+      def process(tb, options = [])
+        error_count = 0
+        sentence_count = 0
+        tb.sources.each do |source|
+          source.divs.each do |div|
+            div.sentences.each do |sentence|
+              sentence_count += 1
+              n = Sentence.new sentence
+              begin
+                # Do the conversion first to avoid spurious headers if the conversion fails
+                a = n.convert.to_conll
+                puts "# source = #{source.title}, #{div.title}"
+                # using printable_form would give us punctuation, which must then be added to the tree
+                puts "# text = #{sentence.tokens.map(&:form).compact.join(' ')}"
+                puts "# sent_id = #{sentence.id}"
+                puts a
+                puts
+              rescue => e
+                error_count += 1
+                STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
+                STDERR.puts e.backtrace.join("\n") unless e.is_a? RuntimeError
              end
            end
          end
-        STDERR.puts "#{error_count} sentences out of #{sentence_count} could not be converted"
        end
+        STDERR.puts "#{error_count} sentences out of #{sentence_count} could not be converted"
      end
+    end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            end
-          else
-            tks << tk
+    class Sentence
+
+      attr_accessor :tokens
+
+      # initializes a PROIEL::Convert::Sentence from PROIEL::PROIELXML::Sentence
+      def initialize(sentence)
+
+        id_to_number = Hash.new(0) #will return id 0 (i.e. root) for nil
+
+        # initialize array to hold the sentence tokens
+        tks = []
+        # keep track of how many new tokens have been created
+        offset = 0
+
+        sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
+
+          if tk.form =~ /[[:space:]]/
+            subtoks = tk.form.split(/[[:space:]]/)
+
+            subtoks.each_with_index do |subtok, i|
+              tks << PROIEL::Token.new(sentence,
+                (i == 0 ? tk.id : 1000 + offset), # id
+                (i == 0 ? tk.head_id : tk.id), # head_id
+                subtok,
+                # hope the lemmas split the same way as the tokens. Grab the form if you don't find a lemma
+                (tk.lemma.split(/[[:space:]]/)[i] || subtok),
+                tk.part_of_speech, # copy the postag
+                tk.morphology,
+                (i == 0 ? tk.relation : 'fixed'),
+                nil, #empty_token_sort
+                tk.citation_part,
+                (i == 0 ? tk.presentation_before : nil),
+                (i == (subtoks.size - 1) ? tk.presentation_after : nil),
+                (i == 0 ? tk.antecedent_id : nil),
+                (i == 0 ? tk.information_status : nil),
+                (i == 0 ? tk.contrast_group : nil),
+                (i == 0 ? tk.foreign_ids : nil),
+                (i == 0 ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel} ) } : []), # This needs to be given a real slash object for the initialization, although it throws away the info
+                (subtok == subtoks.first ? tk.alignment_id : nil)
+              )
+              offset += 1
             end
+          else
+            tks << tk
           end
+        end
 
-
-        tks.map(&:id).each_with_index.each do |id, i|
-          id_to_number[id] = i + 1
-        end
 
-
-
-        Token.new(id_to_number[t.id],
-          id_to_number[t.head_id],
-          #insert dots in any whitespace inside words and lemmata
-          t.form.to_s.gsub(/[[:space:]]/, '.'),
-          t.lemma.to_s.gsub(/[[:space:]]/, '.'),
-          t.part_of_speech,
-          t.language,
-          t.morphology,
-          t.relation,
-          t.empty_token_sort,
-          t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
-          t.citation_part,
-          self
-        )
-        end
+        tks.map(&:id).each_with_index.each do |id, i|
+          id_to_number[id] = i + 1
         end
 
-
-        restructure_graph!
-        relabel_graph!
-        map_part_of_speech!
-        self
-      end
+        @tokens = tks.map do |t|
 
-
-
+          Token.new(id_to_number[t.id],
+            id_to_number[t.head_id],
+            #insert dots in any whitespace inside words and lemmata
+            t.form.to_s.gsub(/[[:space:]]/, '.'),
+            t.lemma.to_s.gsub(/[[:space:]]/, '.'),
+            t.part_of_speech,
+            t.language,
+            t.morphology,
+            t.relation,
+            t.empty_token_sort,
+            t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
+            t.citation_part,
+            self
+          )
        end
+      end
 
-
-
-
+      def convert
+        restructure_graph!
+        relabel_graph!
+        check_directionality!
+        distribute_conjunctions!
+        map_part_of_speech!
+        self
+      end
 
-
-
+      def distribute_conjunctions!
+        @tokens.select { |t| t.has_conjunct? }.each do |h|
+          conjuncts = h.dependents.select { |d| d.relation == 'conj' }
+          conjunctions = h.dependents.select { |d| d.relation == 'cc' }
+          conjunctions.each do |c|
+            if c.id > h.id
+              new_head = conjuncts.select { |cj| cj.id > c.id }.first
+              c.head_id = new_head.id if new_head
+            end
+          end
        end
+      end
 
-
-
+      def check_directionality!
+        @tokens.select { |t| ['fixed', 'flat:foreign', 'flat:name'].include? t.relation }.each do |f|
+          f.promote!(nil, f.relation) if f.id < f.head.id
        end
-
-
-        @tokens.select { |t| t.head_id == 0 }.sort_by(&:id)
+        @tokens.select { |t| t.relation == 'conj' }.each do |f|
+          raise "conj must go left-to-right (id: #{f.id}, head_id: #{f.head.id}, form: #{f.form}, head_form: #{f.head.form})" if f.id < f.head.id
        end
+      end
 
-
-
-
+      def find_token(identifier)
+        @tokens.select { |t| t.id == identifier }.first
+      end
 
-
-
-
+      def remove_token!(token)
+        @tokens.delete(token)
+      end
 
-
-
-
-
-
-
+      def to_s
+        @tokens.map(&:to_s).join("\n")
+      end
+
+      def count_tokens
+        roots.map(&:count_subgraph).inject(0, :+)
+      end
+
+      def roots
+        @tokens.select { |t| t.head_id == 0 }.sort_by(&:id)
+      end
+
+      def to_graph
+        roots.map(&:to_graph).join("\n")
+      end
+
+      def to_conll
+        @tokens.map(&:to_conll).join("\n")
+      end
+
+      # TODO: this will leave several root nodes in many cases. For now, raise an error
+      def prune_empty_rootnodes!
+        unless (empty_roots = roots.select { |r| r.empty_token_sort == 'V' }).empty?
+          empty_roots.each do |r|
+            # promote xobj to root if there is one
+            xobjs = r.dependents.select { |d| d.relation == 'xobj' }
+            if xobjs.any?
+              new_root = xobjs.first
              new_root.head_id = 0
              new_root.relation = r.relation
              r.dependents.each { |d| d.head_id = new_root.id }
              remove_token! r
            end
-            prune_empty_rootnodes!
          end
+          #prune_empty_rootnodes!
        end
+      end
 
-
-
-
+      def demote_subjunctions!
+        @tokens.select { |t| t.part_of_speech == 'G-' }.each(&:process_subjunction!)
+      end
 
-
+      def demote_parentheticals_and_vocatives!
+        r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
+        if p.any? and r.none?
+          # promote the first vocative/parenthetical to head in case there's nothing else
+          p.first.relation = 'pred'
          r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
-        if p.any? and r.none?
-          # promote the first vocative/parenthetical to head in case there's nothing else
-          p.first.relation = 'pred'
-          r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
-        end
-        raise "No unique root in this tree:\n#{to_graph}" if p.any? and !r.one?
-        p.each { |x| x.head_id = r.first.id }
        end
+        raise "No unique root in this tree:\n#{to_graph}" if p.any? and !r.one?
+        p.each { |x| x.head_id = r.first.id }
+      end
 
-
-
-
+      def relabel_graph!
+        roots.each(&:relabel_graph!)
+      end
 
-
-
-
+      def map_part_of_speech!
+        roots.each(&:map_part_of_speech!)
+      end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def restructure_graph!
+        @tokens.delete_if { |n| n.empty_token_sort == 'P' }
+        @tokens.select(&:preposition?).each(&:process_preposition!)
+        @tokens.select { |t| t.comparison_word? and t.dependents and t.dependents.select { |d| ['sub','obj','obl','comp','adv'].include?(d.relation) }.any? }.each(&:process_comparison!)
+        roots.each(&:change_coordinations!)
+        @tokens.select(&:copula?).each(&:process_copula!)
+        demote_subjunctions!
+        prune_empty_rootnodes!
+        # do ellipses from left to right for proper remnant treatment
+        @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
+        #NB! apos gets overridden by process_comparison so some dislocations are lost
+        @tokens.select { |t| t.relation == 'apos' and t.id < t.head_id }.each(&:process_dislocation!)
+        # DIRTY: remove the rest of the empty nodes by attaching them
+        # to their grandmother with remnant. This is the best way to
+        # do it given the current state of the UDEP scheme, but
+        # revisions will come.
+        roots.each(&:remove_empties!)
+        demote_parentheticals_and_vocatives!
      end
+    end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    class Token
+
+      attr_accessor :head_id
+      attr_accessor :upos
+      attr_reader :relation
+      attr_reader :part_of_speech
+      attr_reader :id
+      attr_reader :lemma
+      attr_reader :language
+      attr_reader :empty_token_sort
+      attr_reader :form
+      attr_reader :citation_part
+
+      def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
+        @id = id
+        @head_id = head_id
+        @form = form
+        @lemma = lemma
+        @baselemma, @variant = @lemma.split('#')
+        @part_of_speech = part_of_speech
+        @language = language
+        @morphology = morphology
+        @relation = relation
+        @empty_token_sort = empty_token_sort
+        @slashes = slashes
+        @sentence = sentence
+        @features = (morphology ? map_morphology(morphology) : '' )
+        @citation_part = 'ref=' + (citation_part ? citation_part : '').gsub(/\s/, '_')
+        @upos = nil
+      end
 
-
-
-
-
+      MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = [
+        :person, :number, :tense, :mood, :voice, :gender, :case,
+        :degree, :strength, :inflection
+      ]
 
-
-
-
-
-
-
-
+      def map_morphology morph
+        res = []
+        for tag in 0..morph.length - 1
+          res << MORPHOLOGY_MAP[MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[tag]][morph[tag]]
+        end
+        res = res.reject {|v| v == 'VerbForm=Part'} if res.include?('VerbForm=PartRes|Tense=Past')
+        res = res.reject {|s| s == 'Strength=Weak' } unless @language == 'got'
+        res = res.map { |s| s == 'Strength=Strong' ? 'Variant=Short' : s } unless @language == 'got'
+        res << 'Polarity=Neg' if ['не.быти','не.бꙑти'].include?(@lemma)
+        res.compact.join('|')
+      end
 
-
-
-
+      def genitive?
+        @morphology =~ /......g.*/
+      end
 
-
-
-
-
+      # returns +true+ if the node is an adjective or an ordinal
+      def adjectival?
+        @part_of_speech == 'A-' or @part_of_speech == 'Mo'
+      end
 
-
-
-
+      def subjunction?
+        @part_of_speech == 'G-'
+      end
 
-
-
-
+      def adverb?
+        @part_of_speech =~ /\AD/
+      end
 
-
-
-
+      def cardinal?
+        @part_of_speech == 'Ma'
+      end
 
-
-
-
-          dependents.any?(&:copula?) or
-          dependents.any? { |d| ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass'].include? d.relation } or
-          root?
-      end
+      def relative?
+        @part_of_speech == 'Pr' or @part_of_speech == 'Dq'
+      end
 
-
-
-
+      def verb?
+        @part_of_speech == 'V-' or @empty_token_sort == 'V'
+      end
 
-
-
-
+      def orphan?
+        relation == 'orphan'
+      end
 
-
-
-
-
-
-
-
-
-
+      # A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb; or it has a subjunction dependent; or it is a relative pronoun/adverb or has a relative pronoun/adverb dependent; or if it is the root (e.g. in a nominal clause)
+      def clausal?
+        (@part_of_speech == 'V-' and !nominalized? and !has_preposition?) or
+          dependents.any?(&:copula?) or
+          dependents.any? { |d| ['sub', 'nsubj','nsubj:outer', 'nsubj:pass', 'csubj', 'csubj:pass'].include? d.relation } or
+          dependents.any?(&:subjunction?) or
+          relative? or
+          dependents.any?(&:relative?) or
+          dependents.any?(&:orphan?) or
+          root?
+      end
 
-
-
-
-
-      def determiner?
-        DETERMINERS.include? @part_of_speech
-      end
+      def conjunction?
+        part_of_speech == 'C-' or @empty_token_sort == 'C'
+      end
 
-
-
-
+      def coordinated?
+        head and head.conjunction? and head.relation == @relation
+      end
 
-
-
-
+      def has_conjunct?
+        dependents.any? { |d| d.relation == 'conj' }
+      end
 
-
-
-
+      # Returns +true+ if the node has an xobj dependent and either 1)
+      # the lemma is copular or 2) the node is empty and has no pid
+      # slash or a pid slash to a node with a copular lemma
+      def copula?
+        @relation == 'cop' or
+          (COPULAR_LEMMATA.include?([lemma, part_of_speech, language].join(',')) or
+          (@empty_token_sort == 'V' and (pid.nil? or pid.is_empty? or COPULAR_LEMMATA.include?([pid.lemma, pid.part_of_speech, pid.language].join(',')))) and
+          dependents.any? { |d| d.relation == 'xobj' } )
+      end
 
-
-
-
+      def has_copula?
+        dependents.any?(&:copula?)
+      end
 
-
-
-
+      def auxiliary?
+        AUXILIARIES.include?([lemma, part_of_speech, language].join(',')) or (part_of_speech == "V-" and relation == 'aux')
+      end
 
-
-
-
+      def comparison_word?
+        COMPARISON_LEMMATA.include?([lemma,part_of_speech,language].join(','))
+      end
 
-
-
-
+      def determiner?
+        DETERMINERS.include? @part_of_speech
+      end
 
-
-
-
+      def ellipsis?
+        @empty_token_sort == 'V'
+      end
 
-
-
-
+      def foreign?
+        @part_of_speech == 'F-'
+      end
 
-
-
-
+      def has_content?
+        @empty_token_sort.nil? or @empty_token_sort == ''
+      end
 
-
-
-
-      end
-      end
-
-      def TAM_particle?
-        @relation == 'aux' and TAM_PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
-      end
-
-      def particle?
-        @relation == 'aux' and PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
-      end
+      def has_subject?
+        dependents.any? { |d| ['sub','nsubj','nsubj:pass','csubj','csubj:pass','nsubj:outer'].include?(d.relation) }
+      end
 
-
-
-
-
-      def preposition?
-        @part_of_speech == 'R-'
-      end
+      def interjection?
+        @part_of_speech == 'I-'
+      end
 
-
-
-
+      def is_empty?
+        !has_content?
+      end
 
-
-
-
+      def deponent?
+        DEPONENTS[@language] and DEPONENTS[@language].match(@lemma)
+      end
 
-
-
-
-            c.relation = rel
-          end
-        end
-        @relation = rel
-      end
+      def mediopassive?
+        (!deponent? and @morphology) ? @morphology[4] =~/[mpe]/ : false
+      end
 
-
-
-
+      def passive?
+        (!deponent? and @morphology) ? @morphology[4] == 'p' : false
+      end
 
-
-
-
+      def negation?
+        NEGATION_LEMMATA.include?([lemma, part_of_speech, language].join(','))
+      end
 
-
-
-
+      def nominal?
+        @part_of_speech =~ /\A[NPM]/ or nominalized?
+      end
 
-
-
-
-          head.conj_head
-        else
-          head
-        end
-      end
+      def long?
+        @morphology[8] == 'w'
+      end
 
-
-
-
-        else
-          nil
-        end
+      def nominalized?
+        dependents.any? do |d|
+          d.determiner? and ['atr', 'aux', 'det'].include? d.relation
        end
+      end
 
-
-
-
-
-
+      def tam_particle?
+        @relation == 'aux' and TAM_PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
+      end
+
+      def particle?
+        @relation == 'aux' and PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
+      end
+
+      def pronominal?
+        @part_of_speech =~ /\AP./ # no evidence that possessives are pronoun/determiner-like
+      end
+
+      def preposition?
+        @part_of_speech == 'R-'
+      end
+
+      def proper_noun?
+        @part_of_speech == 'Ne'
+      end
+
+      def root?
+        @head_id == 0
+      end
+
+      def relation=(rel)
+        if conjunction?
+          dependents.select { |d| d.relation == @relation }.each do |c|
+            c.relation = rel
          end
        end
+        @relation = rel
+      end
 
-
-
-
-          @lemma,
-          @upos,
-          @part_of_speech,
-          format_features(@features),
-          @head_id,
-          (@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
-          '_', # slashes here
-          @citation_part].join("\t")
-      end
+      def count_subgraph
+        dependents.map(&:count_subgraph).inject(0, :+) + (is_empty? ? 0 : 1)
+      end
 
-
-
-
+      def subgraph_set
+        [self] + dependents.map(&:subgraph_set).flatten
+      end
 
-
-
-
+      def left_corner
+        ([self] + dependents).sort_by(&:id).first
+      end
 
-
-
+      def conj_head
+        raise 'Not a conjunct' unless @relation == 'conj'
+        if head.relation == 'conj'
+          head.conj_head
+        else
+          head
        end
+      end
 
-
-
+      def pid
+        if pid = @slashes.select { |t, r| r == 'pid' }.first
+          @sentence.tokens.select { |t| pid.first == t.id}.first
+        else
+          nil
        end
+      end
 
-
-
+      def format_features(features)
+        if features == ''
+          '_'
+        else
+          features.split('|').sort.join('|')
        end
+      end
 
-
-
-
+      def miscellaneous
+        m = @citation_part
+        m += "|LId=#{@variant}" if @variant
+        m
+      end
 
-
-
-
-
-
-
-
-
+      def to_conll
+        [@id,
+         @form,
+         @baselemma.gsub(/не\./,''),
+         @upos,
+         @part_of_speech,
+         format_features(@features),
+         @head_id,
+         (@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
+         '_', # slashes here
+         miscellaneous].join("\t")
+      end
 
-
-
-
-
-
-
-
-
-
-
+      def to_s
+        [@id, @form, @head_id, @relation].join("\t")
+      end
+
+      def to_n
+        [@relation, @id, (@form || @empty_token_sort), (@upos || @part_of_speech) ].join('-')
+      end
+
+      def to_graph(indents = 0)
+        ([("\t" * indents) + (to_n)] + dependents.map { |d| d.to_graph(indents + 1) }).join("\n")
+      end
+
+      def siblings
+        @sentence.tokens.select { |t| t.head_id == @head_id } - [self]
+      end
+
+      def head
+        @sentence.tokens.select { |t| t.id == @head_id }.first
+      end
+
+      def dependents
+        @sentence.tokens.select { |t| t.head_id == @id }.sort_by(&:id)
+      end
+
+      def find_appositive_head
+        raise 'Not an apposition' unless @relation == 'apos'
+        if head.conjunction? and head.relation == 'apos'
+          head.find_appositive_head
+        else
+          head
        end
-
-
-
-
-
-
-
-
-
-
+      end
+
+      def find_postag possible_postags
+        tag, crit, feats = possible_postags.shift
+        if tag.nil?
+          # raise "Found no postag"
+        elsif crit.call self
+          @upos = tag
+          @features += ((@features.empty? ? '' : '|') + feats) if feats
+        else
+          find_postag possible_postags
        end
+      end
 
-
-
-
-
-
-
+      def find_relation possible_relations
+        rel, crit = possible_relations.shift
+        if rel.nil?
+          # raise "Found no relation"
+        elsif crit.call self
+          rel
+        else
+          find_relation possible_relations
        end
+      end
 
-
-
-
-
-
+      def map_part_of_speech!
+        dependents.each(&:map_part_of_speech!)
+        possible_postags = POS_MAP[@part_of_speech]
+        find_postag possible_postags.dup
+        # ugly, but the ugliness comes from UDEP
+        @upos = 'PRON' if @upos == 'DET' and @relation != 'det'
+        @upos = REL_TO_POS[@relation] if @upos == 'X'
+      end
+
+      def relabel_graph!
+        dependents.each(&:relabel_graph!)
+        # TODO: if there are iobjs without an obj among the dependents, one of them should be promoted to obj
+        @relation = map_relation
+        raise "No relation for #{form}" unless @relation
+      end
+
+      def map_relation
+        possible_relations = RELATION_MAPPING[@relation]
+        case possible_relations
+        when String
+          possible_relations
+        when Array
+          x = find_relation possible_relations.dup
+        when nil
+          # do nothing: the token has already changed its relation
+          @relation
+        else
+          raise "Unknown value #{possible_relations.inspect} for #{@relation}"
        end
+      end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+      # attach subjunctions with 'mark' under their verbs and promote
+      # the verb to take over the subjunction's relation. If the verb
+      # is empty, the subjunction stays as head.
+      def process_subjunction!
+        # ignore if the subjunction has no dependents or only conj dependents.
+        # NB: this requires that the function is called *after* processing conjunctions
+        return if dependents.reject { |d| ['conj', 'cc'].include? d.relation }.empty?
+        pred = dependents.select { |d| d.relation == 'pred' }
+        raise "#{pred.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}" unless pred.one?
+        pred = pred.first
+        # promote the subjunction if the verb is empty
+        if pred.is_empty?
+          pred.dependents.each { |d| d.head_id = id }
+          @sentence.remove_token! pred
+        # else demote the subjunction
+        else
+          pred.invert!('mark')
+          # move any remaining discourse children to the new head (note that we need to keep some aux'es to get them as "fixed" dependents
+          dependents.each { |d| d.head_id = pred.id unless (d.relation == 'aux' and ['Px', 'Pr'].include? d.part_of_speech) or d.relation == 'fixed' }
        end
+      end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def process_comparison!
+        cl = dependents.select { |d| ['sub','obj','obl','comp','adv'].include?(d.relation) }
+        head.relation = 'advcl:cmp' if head and head.part_of_speech == 'C-' and head.relation == relation
+        comp = cl.first
+        comp.invert!('mark','advcl:cmp')
+        dependents.each { |d| d.head_id = comp.id }
+      end
+
+      def process_dislocation!
+        self.head_id = head.head_id unless head.root?
+        self.relation = "dislocated"
+      end
+
+      def process_ellipsis!
+        aux = dependents.select(&:auxiliary?).first
+        if aux
+          aux.promote!
+          return
        end
 
-
-
-
-
-
-      end
+        sub = dependents.select { |d| d.relation == 'sub' }.first
+        new_head = find_highest_daughter
+        new_head_sub = new_head.dependents.select { |d| d.relation == 'sub' }.first
+        sub.relation = 'nsubj:outer' if sub and new_head_sub
+        new_head.promote!('orphan')
 
-        new_head = find_highest_daughter
-        new_head.promote!('orphan')
-
        # dependents.each do |d|
-
-
+        #   check if there's a partner with the same relation under the overt node.
+        #   TODO: this isn't really very convincing when it comes to ADVs
        #   if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
        #     partner = partner.find_remnant
        #     d.head_id = partner.id
        #     d.relation = 'remnant'
-
+        #   if there's no partner, just attach under the overt node, preserving the relation
        #   else
        #     d.head_id = overt.id
        #   end
        # end
-
-
+        @sentence.remove_token!(self)
+      end
 
-
-
-
-
-
-      end
+      def find_remnant
+        if r = dependents.select { |d| d.relation == 'remnant' }.first
+          r.find_remnant
+        else
+          self
        end
+      end
 
-
-
-
+      def find_highest_daughter
+        dependents.min_by { |d| OBLIQUENESS_HIERARCHY.find_index(d.map_relation[/[^:]*/]) || 1000 }
+      end
 
-
-
-
-
-
+      def process_copula!
+        predicates = dependents.select { |d| d.relation == 'xobj' }
+        raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}" if predicates.size != 1
+        sub = dependents.select { |d| d.relation == 'sub' }.first
+        new_head = predicates.first
+        new_head_sub = new_head.dependents.select { |d| d.relation == 'sub' }.first
+        sub.relation = 'nsubj:outer' if sub and new_head_sub
+        predicates.first.promote!(nil, 'cop')
+      end
 
-
-
-
+      def has_preposition?
+        dependents.any? { |d| d.preposition? and d.relation == 'case' }
+      end
 
-
-
-
-
-
-
-
+      def process_preposition!
+        raise 'Only prepositions can be processed this way!' unless part_of_speech == 'R-'
+        obliques = dependents.select { |d| d.relation == 'obl' }
+        doublepreps = dependents.select { |d| d.relation == 'aux' and d.preposition? }
+        mods = dependents.select { |d| d.relation != 'obl' and !(d.relation == 'aux' and d.preposition?) }
+        raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}" if obliques.size > 1
+        return if obliques.empty? #shouldn't really happen, but in practice
+        obliques.first.invert!('case') # , "adv")
+        doublepreps.each { |p| p.head_id = obliques.first.id and p.relation = 'case' }
+        mods.each { |m| m.head_id = obliques.first.id }
+      end
 
-
-
-
-
-
-      end
+      def remove_empties!
+        dependents.each(&:remove_empties!)
+        if is_empty?
+          dependents.each { |d| d.head_id = head_id; d.relation = 'remnant' }
+          @sentence.remove_token! self
        end
+      end
 
-
-
-
-
-
+      # Changes coordinations recursively from the bottom of the graph
+      def change_coordinations!
+        dependents.each(&:change_coordinations!)
+        process_coordination! if conjunction?
+      end
 
-
-
-
-
-
-
-
-      def distribute_shared_modifiers!
-        raise "Can only distribute over a conjunction!" unless conjunction?
-        conjuncts, modifiers = dependents.reject { |d| d.relation == 'aux' }.partition { |d| d.relation == @relation or (d.relation == 'adv' and @relation == 'xadv') }
-        first_conjunct = conjuncts.shift
-        raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
-        raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}" if first_conjunct.conjunction? and first_conjunct.dependents.empty?
-        modifiers.each do |m|
-          m.head_id = first_conjunct.id
-          conjuncts.each { |c| c.add_slash! [m.id, m.relation] }
-        end
-      end
+      def process_coordination!
+        raise 'Only coordinations can be processed this way!' unless conjunction?
+        return if dependents.reject { |d| d.relation == 'aux' }.empty?
+        distribute_shared_modifiers!
+        dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id }.first.promote!('conj', 'cc')
+      end
 
-
-
+      def distribute_shared_modifiers!
+        raise 'Can only distribute over a conjunction!' unless conjunction?
+        conjuncts, modifiers = dependents.reject { |d| d.relation == 'aux' }.partition { |d| d.relation == @relation or (d.relation == 'adv' and @relation == 'xadv') }
+        first_conjunct = conjuncts.shift
+        raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
+        raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}" if first_conjunct.conjunction? and first_conjunct.dependents.empty?
+        modifiers.each do |m|
+          m.head_id = first_conjunct.id
+          conjuncts.each { |c| c.add_slash! [m.id, m.relation] }
        end
+      end
 
-
-
-
-
-
-
-
-
+      def add_slash!(slash)
+        @slashes << slash
+      end
+
+      # Inverts the direction of a dependency relation. By default the
+      # labels are also swapped, but new relations can be specified
+      # for both the new dependent and the new head.
+      def invert!(new_dependent_relation = nil, new_head_relation = nil)
+        raise 'Cannot promote a token under root!' if @head_id == 0
+        new_dependent_relation ||= @relation
+        new_head_relation ||= head.relation
+        new_head_id = head.head_id
+
+        head.head_id = @id
+        head.relation = new_dependent_relation
+        @head_id = new_head_id
+        self.relation = new_head_relation
+      end
 
+      # promotes a node to its head's place. The node takes over its
+      # former head's relation and all dependents. The new relation
+      # for these dependents can be specified; if it is not, they will
+      # keep their former relation. The former head is made a
+      # dependent of the node (with a specified relation) or,
+      # if it is an empty node, destroyed.
+
+      def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
+        raise 'Cannot promote a token under root!' if @head_id == 0
+        new_head_relation = head.relation
+        new_head_id = head.head_id
+
+        # move all dependents of the former head to the new one
+        siblings.each do |t|
+          t.head_id = @id
+          # ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings), now also includes conj, cc
+          t.relation = new_sibling_relation if (new_sibling_relation and !['aux','conj','cc'].include?(t.relation))
+        end
+
+        # remove the former head if it was empty
+        if head.is_empty?
+          @sentence.remove_token!(head)
+        # else make it a dependent of the new head
+        else
          head.head_id = @id
          head.relation = new_dependent_relation
-          @head_id = new_head_id
-          self.relation = new_head_relation
        end
 
-
-      #
-      #
-
-      # dependent of the node (with a specified relation) or,
-      # if it is an empty node, destroyed.
-
-      def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
-        raise "Cannot promote a token under root!" if @head_id == 0
-        new_head_relation = head.relation
-        new_head_id = head.head_id
-
-        # move all dependents of the former head to the new one
-        siblings.each do |t|
-          t.head_id = @id
-          # ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings)
-          t.relation = new_sibling_relation if (new_sibling_relation and t.relation != 'aux')
-        end
-
-        # remove the former head if it was empty
-        if head.is_empty?
-          @sentence.remove_token!(head)
-        # else make it a dependent of the new head
-        else
-          head.head_id = @id
-          head.relation = new_dependent_relation
-        end
-
-        @head_id = new_head_id
-        # don't use relation=, as we don't want this relation to be
-        # copied down a tree of conjunctions
-        @relation = new_head_relation
-      end
+        @head_id = new_head_id
+        # don't use relation=, as we don't want this relation to be
+        # copied down a tree of conjunctions
+        @relation = new_head_relation
      end
    end
  end