proiel-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
data/lib/proiel/cli/commands/tokenize.rb
@@ -0,0 +1,165 @@
module PROIEL
  module Commands
    class Tokenize < Command
      class << self
        def init_with_program(prog)
          prog.command(:tokenize) do |c|
            c.syntax 'tokenize'
            c.description 'Tokenize raw text'
            c.syntax '[options] filename'

            c.action { |args, options| process(args, options) }
          end
        end

        def process(args, options)
          if args.empty?
            STDERR.puts 'Missing filename. Use --help for more information.'
            exit 1
          end

          if args.length > 1
            STDERR.puts 'Too many filenames. Use --help for more information.'
            exit 1
          end

          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
          builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

          filename = args.first

          File.open(filename, 'r') do |file|
            header = read_header(file)
            body = read_body(file)

            builder.proiel('export-time' => header.export_time, 'schema-version' => '2.0') do
              builder.source(id: header.id, language: header.language) do
                builder.title header.title
                builder.author header.author
                builder.tag!('citation-part', header.citation_part)

                tokenize(builder, body)
              end
            end
          end
        end

        def tokenize(builder, body)
          citation_part = nil

          body.each_with_index do |sd_body, i|
            builder.div(title: sd_body[:title]) do
              sd_body[:contents].split(/(@[^ ]+|§[^ ]+)/).map do |s|
                if s[0] == '§' or s[0] == '@'
                  s
                else
                  # It's sensible to place the break not immediately after probable
                  # sentence-breaking punctuation like periods and question marks, but
                  # after the punctuation mark and characters typically used in pairs,
                  # like brackets and apostrophes.
                  s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')
                end
              end.join.split('|').each_with_index do |s_body, j|
                builder.sentence(status_tag: 'unannotated') do
                  leftover_before = ''

                  # Preserve linebreaks in the text.
                  s_body.gsub!(/\s*[\n\r]/, "\u2028")

                  s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/).each do |(before, form, after)|
                    case form
                    when /^@(.*)$/
                      leftover_before += before unless before.nil?
                      leftover_before += $1
                      leftover_before += after unless after.nil?
                    when /^§(.*)$/
                      leftover_before += before unless before.nil?
                      citation_part = $1
                      leftover_before += after unless after.nil?
                    else
                      before = leftover_before + before
                      leftover_before = ''

                      attrs = { citation_part: citation_part, form: form }
                      attrs[:presentation_before] = before unless before == ''
                      attrs[:presentation_after] = after unless after == ''

                      builder.token(attrs)
                    end
                  end
                end
              end
            end
          end
        end

        VALID_METADATA_FIELDS =
          %w(title author citation_part language id

             principal funder distributor distributor_address date
             license license_url
             reference_system
             editor editorial_note
             annotator reviewer

             electronic_text_editor electronic_text_title
             electronic_text_version
             electronic_text_publisher electronic_text_place electronic_text_date
             electronic_text_original_url
             electronic_text_license electronic_text_license_url

             printed_text_editor printed_text_title
             printed_text_edition
             printed_text_publisher printed_text_place printed_text_date)

        def read_header(f)
          f.rewind

          OpenStruct.new.tap do |hdr|
            # We expect a header first, each line starting with %, and we
            # assume that the header ends with the first line that does
            # not start with %.
            f.each_line do |l|
              l.chomp!

              case l
              when /^%/
                field, value = l.sub(/^%\s*/, '').split(/\s*=\s*/, 2)

                case field
                when 'id', 'export_time', *VALID_METADATA_FIELDS
                  hdr[field] = value.strip
                else
                  STDERR.puts "Invalid header field #{field}. Ignoring.".yellow
                end
              else
                break
              end
            end
          end
        end

        def read_body(f)
          f.rewind

          Array.new.tap do |bdy|
            f.each_line do |l|
              case l
              when /^%/
                # Ignore header
              when /^\s*$/
                # Ignore empty lines
              when /^#/
                # New source division started
                bdy << { title: l.sub(/^#/, '').strip, contents: '' }
              else
                bdy << { title: '', contents: '' } if bdy.empty?
                bdy.last[:contents] += l
              end
            end
          end
        end
      end
    end
  end
end
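
The heart of the tokenizer is the pair of regular expressions above: one inserts a provisional sentence break after sentence-final punctuation (and any trailing brackets, quotes or daggers), the other scans each resulting sentence for word forms, @-markers and § citation markers. The following is a minimal, self-contained sketch of just that splitting step, outside the gem and without the XML building; the sample string is invented for illustration.

# Minimal sketch (not part of the gem): the sentence-splitting heuristic and
# token scan used by Tokenize#tokenize above, applied to an invented sample.
text = 'In principio erat Verbum. §1.2 Et Verbum erat apud Deum.'

sentences = text.split(/(@[^ ]+|§[^ ]+)/).map { |s|
  if s[0] == '§' or s[0] == '@'
    s                                          # leave citation and @-markers untouched
  else
    s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')  # provisional break after final punctuation
  end
}.join.split('|')

sentences.each_with_index do |s_body, j|
  puts "sentence #{j + 1}:"
  s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/) do |before, form, after|
    puts [form, before.inspect, after.inspect].join("\t")
  end
end

Inside the gem the same split feeds the builder.sentence and builder.token calls, so that each § marker becomes the citation_part attribute of the tokens that follow it, and the surrounding punctuation ends up in presentation_before and presentation_after.
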
data/lib/proiel/cli/commands/validate.rb
@@ -0,0 +1,42 @@
module PROIEL
  module Commands
    class Validate < Command
      class << self
        def init_with_program(prog)
          prog.command(:validate) do |c|
            c.syntax 'validate'
            c.description 'Validate input data'
            c.action { |args, options| process(args, options) }
          end
        end

        def process(args, options)
          if args.empty?
            STDERR.puts 'Missing filename(s). Use --help for more information.'
            exit 1
          end

          @schemas = {}

          args.each do |filename|
            v = PROIEL::PROIELXML::Validator.new(filename)

            if v.valid?
              puts "#{filename} is valid".green

              exit 0
            else
              puts "#{filename} is invalid".red

              v.errors.each do |error|
                puts "* #{error}"
              end

              exit 1
            end
          end
        end
      end
    end
  end
end
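
The validate command is a thin wrapper around the validator shipped with the proiel library. A minimal sketch of the same check outside the CLI, assuming the proiel gem is installed and using a placeholder filename, could look like this; the command above additionally colours the output and sets the process exit code.

# Minimal sketch (not part of the gem): validating a PROIEL XML file directly
# with the proiel library, as the Validate command does for each filename.
require 'proiel'

filename = 'treebank.xml' # placeholder
v = PROIEL::PROIELXML::Validator.new(filename)

if v.valid?
  puts "#{filename} is valid"
else
  puts "#{filename} is invalid"
  v.errors.each { |error| puts "* #{error}" }
end
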
data/lib/proiel/cli/converters/conll-u.rb
@@ -0,0 +1,589 @@
require 'proiel/cli/converters/conll-u/morphology'
require 'proiel/cli/converters/conll-u/syntax'

module PROIEL
  module Converter
    class CoNLLU
      class << self
        def process(tb, options = [])
          error_count = 0
          sentence_count = 0
          tb.sources.each do |source|
            source.divs.each do |div|
              div.sentences.each do |sentence|
                sentence_count += 1
                n = Sentence.new sentence
                # Unlike other conversions, this one has to rely on
                # certain assumptions about correct linguistic
                # annotation in order to produce a meaningful
                # representation in CoNLL-U.
                begin
                  puts n.convert.to_conll
                  puts
                rescue => e
                  error_count += 1
                  STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
                end
              end
            end
          end
          STDERR.puts "#{error_count} sentences out of #{sentence_count} could not be converted"
        end
      end

      class Sentence

        attr_accessor :tokens

        # Initializes a PROIEL::Convert::Sentence from a PROIEL::PROIELXML::Sentence.
        def initialize(sentence)
          id_to_number = Hash.new(0) # will return id 0 (i.e. root) for nil

          tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }

          tk.map(&:id).each_with_index.each do |id, i|
            id_to_number[id] = i + 1
          end

          @tokens = tk.map do |t|
            Token.new(id_to_number[t.id],
                      id_to_number[t.head_id],
                      t.form.to_s.gsub(/[[:space:]]/, '.'),
                      t.lemma.to_s.gsub(/[[:space:]]/, '.'),
                      t.part_of_speech,
                      t.language,
                      t.morphology,
                      t.relation,
                      t.empty_token_sort,
                      t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
                      self
                     )
          end
        end

        def convert
          restructure_graph!
          relabel_graph!
          map_part_of_speech!
          self
        end

        def find_token(identifier)
          @tokens.select { |t| t.id == identifier }.first
        end

        def remove_token!(token)
          @tokens.delete(token)
        end

        def to_s
          @tokens.map(&:to_s).join("\n")
        end

        def count_tokens
          roots.map(&:count_subgraph).inject(0, :+)
        end

        def roots
          @tokens.select { |t| t.head_id == 0 }.sort_by(&:id)
        end

        def to_graph
          roots.map(&:to_graph).join("\n")
        end

        def to_conll
          @tokens.map(&:to_conll).join("\n")
        end

        # TODO: this will leave several root nodes in many cases. For now, raise an error.
        def prune_empty_rootnodes!
          unless (empty_roots = roots.select { |r| r.empty_token_sort == 'V' }).empty?
            empty_roots.each do |r|
              # promote the first dependent to root
              new_root = r.dependents.first
              new_root.head_id = 0
              new_root.relation = r.relation
              r.dependents.each { |d| d.head_id = new_root.id }
              remove_token! r
            end
            prune_empty_rootnodes!
          end
        end

        def demote_subjunctions!
          @tokens.select { |t| t.part_of_speech == 'G-' }.each(&:process_subjunction!)
        end

        def demote_parentheticals_and_vocatives!
          r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
          if p.any? and r.none?
            # promote the first vocative/parenthetical to head in case there's nothing else
            p.first.relation = 'pred'
            r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
          end
          raise "No unique root in this tree:\n#{to_graph}" if p.any? and !r.one?
          p.each { |x| x.head_id = r.first.id }
        end

        def relabel_graph!
          roots.each(&:relabel_graph!)
        end

        def map_part_of_speech!
          roots.each(&:map_part_of_speech!)
        end

        def restructure_graph!
          @tokens.delete_if { |n| n.empty_token_sort == 'P' }
          @tokens.select(&:preposition?).each(&:process_preposition!)
          roots.each(&:change_coordinations!)
          @tokens.select(&:copula?).each(&:process_copula!)
          prune_empty_rootnodes!
          # do ellipses from left to right for proper remnant treatment
          @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
          demote_subjunctions!
          # DIRTY: remove the rest of the empty nodes by attaching them
          # to their grandmother with remnant. This is the best way to
          # do it given the current state of the UDEP scheme, but
          # revisions will come.
          roots.each(&:remove_empties!)
          demote_parentheticals_and_vocatives!
        end
      end

      class Token

        attr_accessor :head_id
        attr_accessor :upos
        attr_reader :relation
        attr_reader :part_of_speech
        attr_reader :id
        attr_reader :lemma
        attr_reader :language
        attr_reader :empty_token_sort
        attr_reader :form

        def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, sentence)
          @id = id
          @head_id = head_id
          @form = form
          @lemma = lemma
          @part_of_speech = part_of_speech
          @language = language
          @morphology = morphology
          @relation = relation
          @empty_token_sort = empty_token_sort
          @slashes = slashes
          @sentence = sentence
          @features = (morphology ? map_morphology(morphology) : '')
          @upos = nil
        end

        MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = [
          :person, :number, :tense, :mood, :voice, :gender, :case,
          :degree, :strength, :inflection
        ]

        def map_morphology(morph)
          res = []
          for tag in 0..morph.length - 1
            res << MORPHOLOGY_MAP[MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[tag]][morph[tag]]
          end
          res.compact.join('|')
        end

        # Returns +true+ if the node is an adjective or an ordinal.
        def adjectival?
          @part_of_speech == 'A-' or @part_of_speech == 'Mo'
        end

        def adverb?
          @part_of_speech =~ /\AD/
        end

        def cardinal?
          @part_of_speech == 'Ma'
        end

        # A node is clausal if it is a verb and not nominalized; or it has a
        # copula dependent; or it has a subject (e.g. in an absolute
        # construction without a verb); or if it is the root (e.g. in a
        # nominal clause).
        def clausal?
          (@part_of_speech == 'V-' and !nominalized?) or
            dependents.any?(&:copula?) or
            dependents.any? { |d| ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass'].include? d.relation } or
            root?
        end

        def conjunction?
          part_of_speech == 'C-' or @empty_token_sort == 'C'
        end

        def coordinated?
          head and head.conjunction? and head.relation == @relation
        end

        # Returns +true+ if the node has an xobj dependent and either 1)
        # the lemma is copular or 2) the node is empty and has no pid
        # slash or a pid slash to a node with a copular lemma.
        def copula?
          @relation == 'cop' or
            (COPULAR_LEMMATA.include?([lemma, part_of_speech, language].join(',')) or
             (@empty_token_sort == 'V' and (pid.nil? or pid.is_empty? or COPULAR_LEMMATA.include?([pid.lemma, pid.part_of_speech, pid.language].join(',')))) and
             dependents.any? { |d| d.relation == 'xobj' })
        end

        def determiner?
          DETERMINERS.include? @part_of_speech
        end

        def ellipsis?
          @empty_token_sort == 'V'
        end

        def foreign?
          @part_of_speech == 'F-'
        end

        def has_content?
          @empty_token_sort.nil? or @empty_token_sort == ''
        end

        def interjection?
          @part_of_speech == 'I-'
        end

        def is_empty?
          !has_content?
        end

        def mediopassive?
          @morphology[4] =~ /[mpe]/
        end

        def negation?
          NEGATION_LEMMATA.include?([lemma, part_of_speech, language].join(','))
        end

        def nominal?
          @part_of_speech =~ /\A[NPM]/ or nominalized?
        end

        def nominalized?
          dependents.any? do |d|
            d.determiner? and ['atr', 'aux', 'det'].include? d.relation
          end
        end

        def particle?
          @relation == 'aux' and PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
        end

        def passive?
          @morphology[4] == 'p'
        end

        def preposition?
          @part_of_speech == 'R-'
        end

        def proper_noun?
          @part_of_speech == 'Ne'
        end

        def root?
          @head_id == 0
        end

        def relation=(rel)
          if conjunction?
            dependents.select { |d| d.relation == @relation }.each do |c|
              c.relation = rel
            end
          end
          @relation = rel
        end

        def count_subgraph
          dependents.map(&:count_subgraph).inject(0, :+) + (is_empty? ? 0 : 1)
        end

        def subgraph_set
          [self] + dependents.map(&:subgraph_set).flatten
        end

        def left_corner
          ([self] + dependents).sort_by(&:id).first
        end

        def conj_head
          raise "Not a conjunct" unless @relation == 'conj'
          if head.relation == 'conj'
            head.conj_head
          else
            head
          end
        end

        def pid
          if pid = @slashes.select { |t, r| r == 'pid' }.first
            @sentence.tokens.select { |t| pid.first == t.id }.first
          else
            nil
          end
        end

        def format_features(features)
          if features == ''
            '_'
          else
            features.split("|").sort.join("|")
          end
        end

        def to_conll
          [@id,
           @form,
           @lemma,
           @upos,
           @part_of_speech,
           format_features(@features),
           @head_id,
           (@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc.
           '_', # slashes here
           '_'].join("\t")
        end

        def to_s
          [@id, @form, @head_id, @relation].join("\t")
        end

        def to_n
          [@relation, @id, (@form || @empty_token_sort), (@upos || @part_of_speech)].join('-')
        end

        def to_graph(indents = 0)
          ([("\t" * indents) + to_n] + dependents.map { |d| d.to_graph(indents + 1) }).join("\n")
        end

        def siblings
          @sentence.tokens.select { |t| t.head_id == @head_id } - [self]
        end

        def head
          @sentence.tokens.select { |t| t.id == @head_id }.first
        end

        def dependents
          @sentence.tokens.select { |t| t.head_id == @id }.sort_by(&:id)
        end

        def find_appositive_head
          raise "Not an apposition" unless @relation == 'apos'
          if head.conjunction? and head.relation == 'apos'
            head.find_appositive_head
          else
            head
          end
        end

        def find_relation(possible_relations)
          rel, crit = possible_relations.shift
          if rel.nil?
            # raise "Found no relation"
          elsif crit.call self
            @relation = rel
          else
            find_relation possible_relations
          end
        end

        def map_part_of_speech!
          dependents.each(&:map_part_of_speech!)
          @upos = POS_MAP[@part_of_speech].first
          raise "No match found for pos #{part_of_speech.inspect}" unless @upos
          if feat = POS_MAP[@part_of_speech][1]
            @features += ((@features.empty? ? '' : '|') + feat)
          end
          # ugly, but the ugliness comes from UDEP
          @upos = 'ADJ' if @upos == 'DET' and @relation != 'det'
        end

        def relabel_graph!
          dependents.each(&:relabel_graph!)
          possible_relations = RELATION_MAPPING[@relation]
          case possible_relations
          when String
            @relation = possible_relations
          when Array
            find_relation possible_relations.dup
          when nil
            # do nothing: the token has already changed its relation
          else
            raise "Unknown value #{possible_relations.inspect} for #{@relation}"
          end
        end

        # Attach subjunctions with 'mark' under their verbs and promote
        # the verb to take over the subjunction's relation. If the verb
        # is empty, the subjunction stays as head.
        def process_subjunction!
          # ignore if the subjunction has no dependents or only conj dependents.
          # NB: this requires that the function is called *after* processing conjunctions
          return if dependents.reject { |d| ['conj', 'cc'].include? d.relation }.empty?
          pred = dependents.select { |d| d.relation == 'pred' }
          raise "#{pred.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}" unless pred.one?
          pred = pred.first
          # promote the subjunction if the verb is empty
          if pred.is_empty?
            pred.dependents.each { |d| d.head_id = id }
            @sentence.remove_token! pred
          # else demote the subjunction
          else
            pred.invert!('mark')
          end
        end

        # TODO: process "implicit pid" through APOS chain too
        def process_ellipsis!
          # First we find the corresponding overt token.
          # If there's an explicit pid slash, we'll grab that one.
          if pid and !subgraph_set.include?(pid)
            overt = pid
          # otherwise, try a conjunct
          elsif @relation == 'conj'
            overt = conj_head
          elsif @relation == 'apos'
            overt = find_appositive_head
          else
            return
          end

          dependents.each do |d|
            # check if there's a partner with the same relation under the overt node.
            # TODO: this isn't really very convincing when it comes to ADVs
            if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first # inserted p != self
              partner = partner.find_remnant
              d.head_id = partner.id
              d.relation = 'remnant'
            # if there's no partner, just attach under the overt node, preserving the relation
            else
              d.head_id = overt.id
            end
          end
          @sentence.remove_token!(self)
        end

        def find_remnant
          if r = dependents.select { |d| d.relation == 'remnant' }.first
            r.find_remnant
          else
            self
          end
        end

        def process_copula!
          predicates = dependents.select { |d| d.relation == 'xobj' }
          raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}" if predicates.size != 1
          predicates.first.promote!(nil, 'cop')
        end

        def process_preposition!
          raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'
          obliques = dependents.select { |d| d.relation == 'obl' }
          raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}" if obliques.size > 1
          return if obliques.empty? # shouldn't really happen, but in practice
          obliques.first.invert!("case") # , "adv")
        end

        def remove_empties!
          dependents.each(&:remove_empties!)
          if is_empty?
            dependents.each { |d| d.head_id = head_id; d.relation = 'remnant' }
            @sentence.remove_token! self
          end
        end

        # Changes coordinations recursively from the bottom of the graph.
        def change_coordinations!
          dependents.each(&:change_coordinations!)
          process_coordination! if conjunction?
        end

        def process_coordination!
          raise "Only coordinations can be processed this way!" unless conjunction?
          return if dependents.reject { |d| d.relation == 'aux' }.empty?
          distribute_shared_modifiers!
          dependents.reject { |d| d.relation == 'aux' }.first.promote!("conj", "cc")
        end

        def distribute_shared_modifiers!
          raise "Can only distribute over a conjunction!" unless conjunction?
          conjuncts, modifiers = dependents.reject { |d| d.relation == 'aux' }.partition { |d| d.relation == @relation or (d.relation == 'adv' and @relation == 'xadv') }
          first_conjunct = conjuncts.shift
          raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
          raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}" if first_conjunct.conjunction? and first_conjunct.dependents.empty?
          modifiers.each do |m|
            m.head_id = first_conjunct.id
            conjuncts.each { |c| c.add_slash! [m.id, m.relation] }
          end
        end

        def add_slash!(slash)
          @slashes << slash
        end

        # Inverts the direction of a dependency relation. By default the
        # labels are also swapped, but new relations can be specified
        # for both the new dependent and the new head.
        def invert!(new_dependent_relation = nil, new_head_relation = nil)
          raise "Cannot promote a token under root!" if @head_id == 0
          new_dependent_relation ||= @relation
          new_head_relation ||= head.relation
          new_head_id = head.head_id

          head.head_id = @id
          head.relation = new_dependent_relation
          @head_id = new_head_id
          self.relation = new_head_relation
        end

        # Promotes a node to its head's place. The node takes over its
        # former head's relation and all dependents. The new relation
        # for these dependents can be specified; if it is not, they will
        # keep their former relation. The former head is made a
        # dependent of the node (with a specified relation) or,
        # if it is an empty node, destroyed.
        def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
          raise "Cannot promote a token under root!" if @head_id == 0
          new_head_relation = head.relation
          new_head_id = head.head_id

          # move all dependents of the former head to the new one
          siblings.each do |t|
            t.head_id = @id
            # ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings)
            t.relation = new_sibling_relation if (new_sibling_relation and t.relation != 'aux')
          end

          # remove the former head if it was empty
          if head.is_empty?
            @sentence.remove_token!(head)
          # else make it a dependent of the new head
          else
            head.head_id = @id
            head.relation = new_dependent_relation
          end

          @head_id = new_head_id
          # don't use relation=, as we don't want this relation to be
          # copied down a tree of conjunctions
          @relation = new_head_relation
        end
      end
    end
  end
end
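
Each converted token is serialized by Token#to_conll as ten tab-separated CoNLL-U columns: ID, FORM, LEMMA, UPOS (via POS_MAP), XPOS (the original PROIEL part-of-speech tag), FEATS (via MORPHOLOGY_MAP), HEAD, DEPREL, and '_' placeholders for DEPS and MISC. The sketch below shows one plausible way to drive the converter directly; it assumes the proiel library's Treebank API (Treebank#load_from_xml), a placeholder input file, and that the converter file is on the load path. This is presumably roughly what the convert command in data/lib/proiel/cli/commands/convert.rb does behind the CLI.

# Minimal sketch (not part of the gem): running the CoNLL-U converter over a
# treebank loaded with the proiel library. 'treebank.xml' is a placeholder.
require 'proiel'
require 'proiel/cli/converters/conll-u'

tb = PROIEL::Treebank.new
tb.load_from_xml('treebank.xml')

# Writes one CoNLL-U block per sentence to STDOUT; sentences that cannot be
# converted are reported on STDERR, as in CoNLLU.process above.
PROIEL::Converter::CoNLLU.process(tb)
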