proiel-cli 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
@@ -0,0 +1,165 @@
|
|
1
|
+
module PROIEL
|
2
|
+
module Commands
|
3
|
+
class Tokenize < Command
|
4
|
+
class << self
|
5
|
+
# Registers the +tokenize+ subcommand on +prog+.
#
# The usage string is set once and includes the command name; a second
# +syntax+ call would replace the first rather than extend it.
#
# prog - the program object the subcommand is attached to (presumably a
#        Mercenary program; confirm against the caller).
def init_with_program(prog)
  prog.command(:tokenize) do |c|
    c.syntax 'tokenize [options] filename'
    c.description 'Tokenize raw text'

    c.action { |args, options| process(args, options) }
  end
end
|
14
|
+
|
15
|
+
# Entry point of the +tokenize+ command: reads a single raw-text file and
# writes the tokenized result as XML to STDOUT.
#
# Exits with status 1 (after a message on STDERR) when no filename, more
# than one filename, or an unreadable filename is given.
#
# args    - list of positional command-line arguments; exactly one
#           filename is expected.
# options - parsed command-line options (not used here).
def process(args, options)
  if args.empty?
    STDERR.puts 'Missing filename. Use --help for more information.'
    exit 1
  end

  if args.length > 1
    STDERR.puts 'Too many filenames. Use --help for more information.'
    exit 1
  end

  filename = args.first

  # Fail before anything is written to STDOUT so that a bad filename never
  # leaves a dangling XML declaration in the output.
  unless File.file?(filename) && File.readable?(filename)
    STDERR.puts "Cannot read #{filename}. Use --help for more information."
    exit 1
  end

  File.open(filename, 'r') do |file|
    header = read_header(file)
    body = read_body(file)

    # The builder is created only after the input has been parsed, so
    # output starts once the whole file has been read successfully.
    builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
    builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

    builder.proiel('export-time' => header.export_time, 'schema-version' => '2.0') do
      builder.source(id: header.id, language: header.language) do
        builder.title header.title
        builder.author header.author
        builder.tag!('citation-part', header.citation_part)

        tokenize(builder, body)
      end
    end
  end
end
|
46
|
+
|
47
|
+
# Emits one <div> per entry of +body+, splits its text into sentences and
# tokens, and writes them through +builder+.
#
# Markup conventions visible in the text:
# * a chunk starting with "§" sets the citation part for the tokens that
#   follow (it carries over across sentences and divisions);
# * a chunk starting with "@" is not a token; its text (without the "@")
#   is prepended to the presentation_before of the next token;
# * "|" is used internally as the sentence separator.
#
# builder - object responding to div, sentence and token.
# body    - array of hashes with :title and :contents, as produced by
#           read_body.
def tokenize(builder, body)
  citation_part = nil

  body.each do |sd_body|
    builder.div(title: sd_body[:title]) do
      marked = sd_body[:contents].split(/(@[^ ]+|§[^ ]+)/).map do |s|
        if s.start_with?('§', '@')
          # Markup chunks are passed through untouched.
          s
        else
          # It's sensible to place the break not immediately after probable
          # sentence-breaking punctuation like periods and question marks, but
          # after the punctuation mark and characters typically used in pairs,
          # like brackets and apostrophes.
          s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')
        end
      end.join

      marked.split('|').each do |raw_sentence|
        builder.sentence(status_tag: 'unannotated') do
          leftover_before = ''

          # Preserve linebreaks in the text.
          s_body = raw_sentence.gsub(/\s*[\n\r]/, "\u2028")

          # All three groups always participate in a match, so before and
          # after are strings (possibly empty), never nil.
          s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/).each do |(before, form, after)|
            case form
            when /^@(.*)$/
              leftover_before += before + Regexp.last_match(1) + after
            when /^§(.*)$/
              citation_part = Regexp.last_match(1)
              leftover_before += before + after
            else
              presentation_before = leftover_before + before
              leftover_before = ''

              attrs = { citation_part: citation_part, form: form }
              attrs[:presentation_before] = presentation_before unless presentation_before == ''
              attrs[:presentation_after] = after unless after == ''

              builder.token(attrs)
            end
          end
        end
      end
    end
  end
end
|
95
|
+
|
96
|
+
VALID_METADATA_FIELDS =
|
97
|
+
%w(title author citation_part language id
|
98
|
+
|
99
|
+
principal funder distributor distributor_address date
|
100
|
+
license license_url
|
101
|
+
reference_system
|
102
|
+
editor editorial_note
|
103
|
+
annotator reviewer
|
104
|
+
|
105
|
+
electronic_text_editor electronic_text_title
|
106
|
+
electronic_text_version
|
107
|
+
electronic_text_publisher electronic_text_place electronic_text_date
|
108
|
+
electronic_text_original_url
|
109
|
+
electronic_text_license electronic_text_license_url
|
110
|
+
|
111
|
+
printed_text_editor printed_text_title
|
112
|
+
printed_text_edition
|
113
|
+
printed_text_publisher printed_text_place printed_text_date)
|
114
|
+
|
115
|
+
# Reads the metadata header from the start of +f+.
#
# The header is the leading run of lines that start with "%", each of the
# form "% field = value". Reading stops at the first line that does not
# start with "%". Unknown fields, and known fields given without "= value",
# are reported on STDERR and skipped.
#
# f - a rewindable stream supporting each_line.
#
# Returns an OpenStruct with one attribute per accepted header field.
def read_header(f)
  f.rewind

  OpenStruct.new.tap do |hdr|
    f.each_line do |raw|
      l = raw.chomp

      # The header ends with the first line that does not start with %.
      break unless l.start_with?('%')

      field, value = l.sub(/^%\s*/, '').split(/\s*=\s*/, 2)

      case field
      when 'id', 'export_time', *VALID_METADATA_FIELDS
        if value.nil?
          # A line such as "% author" has no "=", so there is nothing to
          # store; calling strip on nil would crash here.
          STDERR.puts "Missing value for header field #{field}. Ignoring.".yellow
        else
          hdr[field] = value.strip
        end
      else
        STDERR.puts "Invalid header field #{field}. Ignoring.".yellow
      end
    end
  end
end
|
141
|
+
|
142
|
+
# Reads the text body of +f+ and groups it into source divisions.
#
# Lines starting with "%" and blank lines are skipped wherever they occur.
# A line starting with "#" opens a new division titled with the rest of
# the line. Every other line is appended verbatim (newline included) to the
# current division; text before the first "#" line goes into a division
# with an empty title.
#
# f - a rewindable stream supporting each_line.
#
# Returns an array of hashes with the keys :title and :contents.
def read_body(f)
  f.rewind

  f.each_line.with_object([]) do |l, bdy|
    case l
    when /^%/, /^\s*$/
      # Ignore header lines and empty lines.
    when /^#/
      # New source division started.
      bdy << { title: l.sub(/^#/, '').strip, contents: ''.dup }
    else
      bdy << { title: '', contents: ''.dup } if bdy.empty?
      # Append in place rather than rebuilding the string for every line.
      bdy.last[:contents] << l
    end
  end
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module PROIEL
|
2
|
+
module Commands
|
3
|
+
class Validate < Command
|
4
|
+
class << self
|
5
|
+
def init_with_program(prog)
|
6
|
+
prog.command(:validate) do |c|
|
7
|
+
c.syntax 'validate'
|
8
|
+
c.description 'Validate input data'
|
9
|
+
c.action { |args, options| process(args, options) }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Entry point of the +validate+ command: validates every file named in
# +args+ and reports the result for each on STDOUT.
#
# All files are checked before the process exits. The exit status is 0
# when every file is valid and 1 when at least one is invalid, or when no
# filename was given.
#
# args    - list of filenames to validate.
# options - parsed command-line options (not used here).
def process(args, options)
  if args.empty?
    STDERR.puts 'Missing filename(s). Use --help for more information.'
    exit 1
  end

  all_valid = true

  args.each do |filename|
    v = PROIEL::PROIELXML::Validator.new(filename)

    if v.valid?
      puts "#{filename} is valid".green
    else
      all_valid = false

      puts "#{filename} is invalid".red

      v.errors.each do |error|
        puts "* #{error}"
      end
    end
  end

  # Exit only after the loop so that later files are not silently skipped.
  exit(all_valid ? 0 : 1)
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,589 @@
|
|
1
|
+
require 'proiel/cli/converters/conll-u/morphology'
|
2
|
+
require 'proiel/cli/converters/conll-u/syntax'
|
3
|
+
|
4
|
+
module PROIEL
|
5
|
+
module Converter
|
6
|
+
class CoNLLU
|
7
|
+
class << self
|
8
|
+
# Converts every sentence of every division of every source in +tb+ and
# prints the result to standard output, one blank line after each sentence.
#
# A sentence whose conversion raises is reported on STDERR and counted; a
# summary line with the failure count is written to STDERR at the end.
#
# tb      - object exposing sources -> divs -> sentences.
# options - not used here.
def process(tb, options = [])
  failed = 0
  seen = 0

  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        seen += 1
        converter = Sentence.new(sentence)

        # Unlike other conversions, this one has to rely on certain
        # assumptions about correct linguistic annotation in order to
        # produce a meaningful representation in CoNLL-U, so failures
        # are expected and reported per sentence.
        begin
          puts converter.convert.to_conll
          puts
        rescue => error
          failed += 1
          STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{error}"
        end
      end
    end
  end

  STDERR.puts "#{failed} sentences out of #{seen} could not be converted"
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class Sentence
|
35
|
+
|
36
|
+
attr_accessor :tokens
|
37
|
+
|
38
|
+
# Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence
|
39
|
+
def initialize(sentence)
|
40
|
+
|
41
|
+
id_to_number = Hash.new(0) #will return id 0 (i.e. root) for nil
|
42
|
+
|
43
|
+
tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }
|
44
|
+
|
45
|
+
tk.map(&:id).each_with_index.each do |id, i|
|
46
|
+
id_to_number[id] = i + 1
|
47
|
+
end
|
48
|
+
|
49
|
+
@tokens = tk.map do |t|
|
50
|
+
Token.new(id_to_number[t.id],
|
51
|
+
id_to_number[t.head_id],
|
52
|
+
t.form.to_s.gsub(/[[:space:]]/, '.'),
|
53
|
+
t.lemma.to_s.gsub(/[[:space:]]/, '.'),
|
54
|
+
t.part_of_speech,
|
55
|
+
t.language,
|
56
|
+
t.morphology,
|
57
|
+
t.relation,
|
58
|
+
t.empty_token_sort,
|
59
|
+
t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
|
60
|
+
self
|
61
|
+
)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def convert
|
66
|
+
restructure_graph!
|
67
|
+
relabel_graph!
|
68
|
+
map_part_of_speech!
|
69
|
+
self
|
70
|
+
end
|
71
|
+
|
72
|
+
# Returns the first token whose id equals +identifier+, or nil when there
# is none. Stops scanning at the first match.
def find_token(identifier)
  @tokens.find { |t| t.id == identifier }
end
|
75
|
+
|
76
|
+
def remove_token!(token)
|
77
|
+
@tokens.delete(token)
|
78
|
+
end
|
79
|
+
|
80
|
+
def to_s
|
81
|
+
@tokens.map(&:to_s).join("\n")
|
82
|
+
end
|
83
|
+
|
84
|
+
def count_tokens
|
85
|
+
roots.map(&:count_subgraph).inject(0, :+)
|
86
|
+
end
|
87
|
+
|
88
|
+
def roots
|
89
|
+
@tokens.select { |t| t.head_id == 0 }.sort_by(&:id)
|
90
|
+
end
|
91
|
+
|
92
|
+
def to_graph
|
93
|
+
roots.map(&:to_graph).join("\n")
|
94
|
+
end
|
95
|
+
|
96
|
+
def to_conll
|
97
|
+
@tokens.map(&:to_conll).join("\n")
|
98
|
+
end
|
99
|
+
|
100
|
+
# TODO: this will leave several root nodes in many cases. For now, raise an error
|
101
|
+
# Removes every root whose empty_token_sort is 'V', repeating until no
# such root is left.
#
# For each removed root, its first dependent becomes a root and takes
# over the removed root's relation; the remaining dependents are attached
# under that promoted node. An empty root with no dependents is simply
# removed.
#
# Returns nil.
def prune_empty_rootnodes!
  loop do
    empty_roots = roots.select { |r| r.empty_token_sort == 'V' }
    break if empty_roots.empty?

    empty_roots.each do |r|
      # promote the first dependent to root (if there is one)
      new_root = r.dependents.first

      if new_root
        new_root.head_id = 0
        new_root.relation = r.relation
        # dependents is recomputed here, so it no longer contains new_root
        r.dependents.each { |d| d.head_id = new_root.id }
      end

      remove_token! r
    end
  end
end
|
114
|
+
|
115
|
+
def demote_subjunctions!
|
116
|
+
@tokens.select { |t| t.part_of_speech == 'G-' }.each(&:process_subjunction!)
|
117
|
+
end
|
118
|
+
|
119
|
+
def demote_parentheticals_and_vocatives!
|
120
|
+
r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
|
121
|
+
if p.any? and r.none?
|
122
|
+
# promote the first vocative/parenthetical to head in case there's nothing else
|
123
|
+
p.first.relation = 'pred'
|
124
|
+
r, p = roots.partition { |n| !['voc', 'parpred'].include? n.relation }
|
125
|
+
end
|
126
|
+
raise "No unique root in this tree:\n#{to_graph}" if p.any? and !r.one?
|
127
|
+
p.each { |x| x.head_id = r.first.id }
|
128
|
+
end
|
129
|
+
|
130
|
+
def relabel_graph!
|
131
|
+
roots.each(&:relabel_graph!)
|
132
|
+
end
|
133
|
+
|
134
|
+
def map_part_of_speech!
|
135
|
+
roots.each(&:map_part_of_speech!)
|
136
|
+
end
|
137
|
+
|
138
|
+
def restructure_graph!
|
139
|
+
@tokens.delete_if { |n| n.empty_token_sort == 'P' }
|
140
|
+
@tokens.select(&:preposition?).each(&:process_preposition!)
|
141
|
+
roots.each(&:change_coordinations!)
|
142
|
+
@tokens.select(&:copula?).each(&:process_copula!)
|
143
|
+
prune_empty_rootnodes!
|
144
|
+
# do ellipses from left to right for proper remnant treatment
|
145
|
+
@tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
|
146
|
+
demote_subjunctions!
|
147
|
+
# DIRTY: remove the rest of the empty nodes by attaching them
|
148
|
+
# to their grandmother with remnant. This is the best way to
|
149
|
+
# do it given the current state of the UDEP scheme, but
|
150
|
+
# revisions will come.
|
151
|
+
roots.each(&:remove_empties!)
|
152
|
+
demote_parentheticals_and_vocatives!
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
class Token
|
157
|
+
|
158
|
+
attr_accessor :head_id
|
159
|
+
attr_accessor :upos
|
160
|
+
attr_reader :relation
|
161
|
+
attr_reader :part_of_speech
|
162
|
+
attr_reader :id
|
163
|
+
attr_reader :lemma
|
164
|
+
attr_reader :language
|
165
|
+
attr_reader :empty_token_sort
|
166
|
+
attr_reader :form
|
167
|
+
|
168
|
+
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, sentence)
|
169
|
+
@id = id
|
170
|
+
@head_id = head_id
|
171
|
+
@form = form
|
172
|
+
@lemma = lemma
|
173
|
+
@part_of_speech = part_of_speech
|
174
|
+
@language = language
|
175
|
+
@morphology = morphology
|
176
|
+
@relation = relation
|
177
|
+
@empty_token_sort = empty_token_sort
|
178
|
+
@slashes = slashes
|
179
|
+
@sentence = sentence
|
180
|
+
@features = (morphology ? map_morphology(morphology) : '' )
|
181
|
+
@upos = nil
|
182
|
+
end
|
183
|
+
|
184
|
+
MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = [
|
185
|
+
:person, :number, :tense, :mood, :voice, :gender, :case,
|
186
|
+
:degree, :strength, :inflection
|
187
|
+
]
|
188
|
+
|
189
|
+
# Translates a positional morphology tag into a "|"-joined feature string.
#
# Position i of +morph+ is looked up in MORPHOLOGY_MAP under the field
# named by MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[i]. Positions beyond the
# known sequence, fields missing from the map, and values without a
# mapping contribute nothing.
#
# morph - indexable tag (responds to length and []), one value per position.
#
# Returns a String, empty when nothing could be mapped.
def map_morphology(morph)
  features = (0...morph.length).map do |position|
    field = MORPHOLOGY_POSITIONAL_TAG_SEQUENCE[position]
    next if field.nil?

    mapping = MORPHOLOGY_MAP[field]
    mapping && mapping[morph[position]]
  end

  features.compact.join('|')
end
|
196
|
+
|
197
|
+
# returns +true+ if the node is an adjective or an ordinal
|
198
|
+
def adjectival?
|
199
|
+
@part_of_speech == 'A-' or @part_of_speech == 'Mo'
|
200
|
+
end
|
201
|
+
|
202
|
+
def adverb?
|
203
|
+
@part_of_speech =~ /\AD/
|
204
|
+
end
|
205
|
+
|
206
|
+
def cardinal?
|
207
|
+
@part_of_speech == 'Ma'
|
208
|
+
end
|
209
|
+
|
210
|
+
# A node is clausal if it is a verb and not nominalized; or it has a copula dependent; or it has a subject (e.g. in an absolute construction without a verb); or if it is the root (e.g. in a nominal clause)
|
211
|
+
def clausal?
|
212
|
+
(@part_of_speech == 'V-' and !nominalized?) or
|
213
|
+
dependents.any?(&:copula?) or
|
214
|
+
dependents.any? { |d| ['sub', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass'].include? d.relation } or
|
215
|
+
root?
|
216
|
+
end
|
217
|
+
|
218
|
+
def conjunction?
|
219
|
+
part_of_speech == 'C-' or @empty_token_sort == 'C'
|
220
|
+
end
|
221
|
+
|
222
|
+
def coordinated?
|
223
|
+
head and head.conjunction? and head.relation == @relation
|
224
|
+
end
|
225
|
+
|
226
|
+
# Returns +true+ if the node has an xobj dependent and either 1)
|
227
|
+
# the lemma is copular or 2) the node is empty and has no pid
|
228
|
+
# slash or a pid slash to a node with a copular lemma
|
229
|
+
# True when the node's relation is already 'cop', or when the node has an
# xobj dependent and is a copula candidate. A node is a candidate when its
# "lemma,part_of_speech,language" key is in COPULAR_LEMMATA, or when it is
# an empty 'V' node whose pid target is absent, is itself empty, or has a
# copular key.
def copula?
  return true if @relation == 'cop'

  candidate = COPULAR_LEMMATA.include?([lemma, part_of_speech, language].join(','))

  if !candidate && @empty_token_sort == 'V'
    antecedent = pid
    candidate = antecedent.nil? ||
                antecedent.is_empty? ||
                COPULAR_LEMMATA.include?([antecedent.lemma, antecedent.part_of_speech, antecedent.language].join(','))
  end

  # The xobj requirement applies to both kinds of candidate.
  candidate && dependents.any? { |d| d.relation == 'xobj' }
end
|
235
|
+
|
236
|
+
def determiner?
|
237
|
+
DETERMINERS.include? @part_of_speech
|
238
|
+
end
|
239
|
+
|
240
|
+
def ellipsis?
|
241
|
+
@empty_token_sort == 'V'
|
242
|
+
end
|
243
|
+
|
244
|
+
def foreign?
|
245
|
+
@part_of_speech == 'F-'
|
246
|
+
end
|
247
|
+
|
248
|
+
def has_content?
|
249
|
+
@empty_token_sort.nil? or @empty_token_sort == ''
|
250
|
+
end
|
251
|
+
|
252
|
+
def interjection?
|
253
|
+
@part_of_speech == 'I-'
|
254
|
+
end
|
255
|
+
|
256
|
+
def is_empty?
|
257
|
+
!has_content?
|
258
|
+
end
|
259
|
+
|
260
|
+
# True when the fifth position of the morphology tag (index 4; :voice in
# MORPHOLOGY_POSITIONAL_TAG_SEQUENCE) is one of m, p or e. Tokens without
# a morphology tag are never mediopassive.
def mediopassive?
  !!(@morphology && @morphology[4] =~ /[mpe]/)
end
|
263
|
+
|
264
|
+
def negation?
|
265
|
+
NEGATION_LEMMATA.include?([lemma, part_of_speech, language].join(','))
|
266
|
+
end
|
267
|
+
|
268
|
+
def nominal?
|
269
|
+
@part_of_speech =~ /\A[NPM]/ or nominalized?
|
270
|
+
end
|
271
|
+
|
272
|
+
def nominalized?
|
273
|
+
dependents.any? do |d|
|
274
|
+
d.determiner? and ['atr', 'aux', 'det'].include? d.relation
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
278
|
+
def particle?
|
279
|
+
@relation == 'aux' and PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
|
280
|
+
end
|
281
|
+
|
282
|
+
# True when the fifth position of the morphology tag (index 4; :voice in
# MORPHOLOGY_POSITIONAL_TAG_SEQUENCE) is 'p'. Tokens without a morphology
# tag are never passive.
def passive?
  !@morphology.nil? && @morphology[4] == 'p'
end
|
285
|
+
|
286
|
+
def preposition?
|
287
|
+
@part_of_speech == 'R-'
|
288
|
+
end
|
289
|
+
|
290
|
+
def proper_noun?
|
291
|
+
@part_of_speech == 'Ne'
|
292
|
+
end
|
293
|
+
|
294
|
+
def root?
|
295
|
+
@head_id == 0
|
296
|
+
end
|
297
|
+
|
298
|
+
def relation=(rel)
|
299
|
+
if conjunction?
|
300
|
+
dependents.select { |d| d.relation == @relation }.each do |c|
|
301
|
+
c.relation = rel
|
302
|
+
end
|
303
|
+
end
|
304
|
+
@relation = rel
|
305
|
+
end
|
306
|
+
|
307
|
+
def count_subgraph
|
308
|
+
dependents.map(&:count_subgraph).inject(0, :+) + (is_empty? ? 0 : 1)
|
309
|
+
end
|
310
|
+
|
311
|
+
def subgraph_set
|
312
|
+
[self] + dependents.map(&:subgraph_set).flatten
|
313
|
+
end
|
314
|
+
|
315
|
+
def left_corner
|
316
|
+
([self] + dependents).sort_by(&:id).first
|
317
|
+
end
|
318
|
+
|
319
|
+
def conj_head
|
320
|
+
raise "Not a conjunct" unless @relation == 'conj'
|
321
|
+
if head.relation == 'conj'
|
322
|
+
head.conj_head
|
323
|
+
else
|
324
|
+
head
|
325
|
+
end
|
326
|
+
end
|
327
|
+
|
328
|
+
# Returns the token targeted by this token's first 'pid' slash, looked up
# by id among the sentence's tokens; nil when there is no such slash or no
# token with that id.
def pid
  slash = @slashes.select { |_target, relation| relation == 'pid' }.first
  return nil unless slash

  target_id = slash.first
  @sentence.tokens.select { |t| target_id == t.id }.first
end
|
335
|
+
|
336
|
+
# Renders a feature string for output: "_" when there are no features,
# otherwise the "|"-separated features in sorted order.
def format_features(features)
  return '_' if features == ''

  sorted = features.split("|").sort
  sorted.join("|")
end
|
343
|
+
|
344
|
+
def to_conll
|
345
|
+
[@id,
|
346
|
+
@form,
|
347
|
+
@lemma,
|
348
|
+
@upos,
|
349
|
+
@part_of_speech,
|
350
|
+
format_features(@features),
|
351
|
+
@head_id,
|
352
|
+
(@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
|
353
|
+
'_', # slashes here
|
354
|
+
'_'].join("\t")
|
355
|
+
end
|
356
|
+
|
357
|
+
def to_s
|
358
|
+
[@id, @form, @head_id, @relation].join("\t")
|
359
|
+
end
|
360
|
+
|
361
|
+
# Short "-"-joined label for the node: relation, id, form and tag. It is
# used by to_graph and in error messages.
#
# The form slot falls back to the empty-token sort when the form is nil
# or empty; the tag slot prefers the mapped upos over the original part
# of speech.
def to_n
  label = @form.to_s.empty? ? @empty_token_sort : @form
  [@relation, @id, label, (@upos || @part_of_speech)].join('-')
end
|
364
|
+
|
365
|
+
def to_graph(indents = 0)
|
366
|
+
([("\t" * indents) + (to_n)] + dependents.map { |d| d.to_graph(indents + 1) }).join("\n")
|
367
|
+
end
|
368
|
+
|
369
|
+
def siblings
|
370
|
+
@sentence.tokens.select { |t| t.head_id == @head_id } - [self]
|
371
|
+
end
|
372
|
+
|
373
|
+
# Returns the token in the same sentence whose id equals this token's
# head_id, or nil when there is none (e.g. for a root, whose head_id is 0).
# Stops scanning at the first match.
def head
  @sentence.tokens.find { |t| t.id == @head_id }
end
|
376
|
+
|
377
|
+
def dependents
|
378
|
+
@sentence.tokens.select { |t| t.head_id == @id }.sort_by(&:id)
|
379
|
+
end
|
380
|
+
|
381
|
+
def find_appositive_head
|
382
|
+
raise "Not an apposition" unless @relation == 'apos'
|
383
|
+
if head.conjunction? and head.relation == 'apos'
|
384
|
+
head.find_appositive_head
|
385
|
+
else
|
386
|
+
head
|
387
|
+
end
|
388
|
+
end
|
389
|
+
|
390
|
+
# Picks the first relation in +possible_relations+ whose criterion accepts
# this token and assigns it to @relation.
#
# possible_relations - ordered list of [relation, criterion] pairs, where
#                      criterion responds to call and receives the token.
#                      The list is not modified. An entry with a nil
#                      relation ends the search.
#
# Returns the relation that was assigned, or nil when none matched (in
# which case @relation is left untouched).
def find_relation(possible_relations)
  possible_relations.each do |rel, crit|
    return nil if rel.nil?
    return @relation = rel if crit.call(self)
  end

  nil
end
|
400
|
+
|
401
|
+
# Sets @upos for this token and, recursively, for all its dependents.
#
# POS_MAP entries are read as [upos, extra_feature]; when an extra feature
# is present it is appended to @features.
#
# Raises RuntimeError when the part of speech has no entry (or no upos)
# in POS_MAP.
def map_part_of_speech!
  dependents.each(&:map_part_of_speech!)

  # Look the entry up once and check it before dereferencing, so that an
  # unknown tag produces the descriptive error below instead of a
  # NoMethodError on nil.
  entry = POS_MAP[@part_of_speech]
  @upos = entry && entry.first
  raise "No match found for pos #{@part_of_speech.inspect}" unless @upos

  if feat = entry[1]
    @features += ((@features.empty? ? '' : '|') + feat)
  end

  # ugly, but the ugliness comes from UDEP
  @upos = 'ADJ' if @upos == 'DET' and @relation != 'det'
end
|
411
|
+
|
412
|
+
def relabel_graph!
|
413
|
+
dependents.each(&:relabel_graph!)
|
414
|
+
possible_relations = RELATION_MAPPING[@relation]
|
415
|
+
case possible_relations
|
416
|
+
when String
|
417
|
+
@relation = possible_relations
|
418
|
+
when Array
|
419
|
+
find_relation possible_relations.dup
|
420
|
+
when nil
|
421
|
+
# do nothing: the token has already changed its relation
|
422
|
+
else
|
423
|
+
raise "Unknown value #{possible_relations.inspect} for #{@relation}"
|
424
|
+
end
|
425
|
+
end
|
426
|
+
|
427
|
+
# attach subjunctions with 'mark' under their verbs and promote
|
428
|
+
# the verb to take over the subjunction's relation. If the verb
|
429
|
+
# is empty, the subjunction stays as head.
|
430
|
+
def process_subjunction!
|
431
|
+
# ignore if the subjunction has no dependents or only conj dependents.
|
432
|
+
# NB: this requires that the function is called *after* processing conjunctions
|
433
|
+
return if dependents.reject { |d| ['conj', 'cc'].include? d.relation }.empty?
|
434
|
+
pred = dependents.select { |d| d.relation == 'pred' }
|
435
|
+
raise "#{pred.size} PREDs under the subjunction #{to_n}:\n#{@sentence.to_graph}" unless pred.one?
|
436
|
+
pred = pred.first
|
437
|
+
# promote the subjunction if the verb is empty
|
438
|
+
if pred.is_empty?
|
439
|
+
pred.dependents.each { |d| d.head_id = id }
|
440
|
+
@sentence.remove_token! pred
|
441
|
+
# else demote the subjunction
|
442
|
+
else
|
443
|
+
pred.invert!('mark')
|
444
|
+
end
|
445
|
+
end
|
446
|
+
|
447
|
+
|
448
|
+
|
449
|
+
# TODO: process "implicit pid" through APOS chain too
|
450
|
+
def process_ellipsis!
|
451
|
+
# First we find the corresponding overt token.
|
452
|
+
# If there's an explicit pid slash, we'll grab that one.
|
453
|
+
if pid and !subgraph_set.include?(pid)
|
454
|
+
overt = pid
|
455
|
+
# otherwise, try a conjunct
|
456
|
+
elsif @relation == 'conj'
|
457
|
+
overt = conj_head
|
458
|
+
elsif @relation == 'apos'
|
459
|
+
overt = find_appositive_head
|
460
|
+
else
|
461
|
+
return
|
462
|
+
end
|
463
|
+
|
464
|
+
dependents.each do |d|
|
465
|
+
# check if there's a partner with the same relation under the overt node.
|
466
|
+
# TODO: this isn't really very convincing when it comes to ADVs
|
467
|
+
if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
|
468
|
+
partner = partner.find_remnant
|
469
|
+
d.head_id = partner.id
|
470
|
+
d.relation = 'remnant'
|
471
|
+
# if there's no partner, just attach under the overt node, preserving the relation
|
472
|
+
else
|
473
|
+
d.head_id = overt.id
|
474
|
+
end
|
475
|
+
end
|
476
|
+
@sentence.remove_token!(self)
|
477
|
+
end
|
478
|
+
|
479
|
+
def find_remnant
|
480
|
+
if r = dependents.select { |d| d.relation == 'remnant' }.first
|
481
|
+
r.find_remnant
|
482
|
+
else
|
483
|
+
self
|
484
|
+
end
|
485
|
+
end
|
486
|
+
|
487
|
+
def process_copula!
|
488
|
+
predicates = dependents.select { |d| d.relation == 'xobj' }
|
489
|
+
raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}" if predicates.size != 1
|
490
|
+
predicates.first.promote!(nil, 'cop')
|
491
|
+
end
|
492
|
+
|
493
|
+
def process_preposition!
|
494
|
+
raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'
|
495
|
+
obliques = dependents.select { |d| d.relation == 'obl' }
|
496
|
+
raise "#{obliques.size} oblique dependents under #{to_n}\n#{to_graph}" if obliques.size > 1
|
497
|
+
return if obliques.empty? #shouldn't really happen, but in practice
|
498
|
+
obliques.first.invert!("case") # , "adv")
|
499
|
+
end
|
500
|
+
|
501
|
+
def remove_empties!
|
502
|
+
dependents.each(&:remove_empties!)
|
503
|
+
if is_empty?
|
504
|
+
dependents.each { |d| d.head_id = head_id; d.relation = 'remnant' }
|
505
|
+
@sentence.remove_token! self
|
506
|
+
end
|
507
|
+
end
|
508
|
+
|
509
|
+
# Changes coordinations recursively from the bottom of the graph
|
510
|
+
def change_coordinations!
|
511
|
+
dependents.each(&:change_coordinations!)
|
512
|
+
process_coordination! if conjunction?
|
513
|
+
end
|
514
|
+
|
515
|
+
def process_coordination!
|
516
|
+
raise "Only coordinations can be processed this way!" unless conjunction?
|
517
|
+
return if dependents.reject { |d| d.relation == 'aux' }.empty?
|
518
|
+
distribute_shared_modifiers!
|
519
|
+
dependents.reject { |d| d.relation == 'aux' }.first.promote!("conj", "cc")
|
520
|
+
end
|
521
|
+
|
522
|
+
def distribute_shared_modifiers!
|
523
|
+
raise "Can only distribute over a conjunction!" unless conjunction?
|
524
|
+
conjuncts, modifiers = dependents.reject { |d| d.relation == 'aux' }.partition { |d| d.relation == @relation or (d.relation == 'adv' and @relation == 'xadv') }
|
525
|
+
first_conjunct = conjuncts.shift
|
526
|
+
raise "No first conjunct under #{to_n}\n#{to_graph}" unless first_conjunct
|
527
|
+
raise "The first conjunct is a misannotated conjunction in #{to_n}\n#{to_graph}" if first_conjunct.conjunction? and first_conjunct.dependents.empty?
|
528
|
+
modifiers.each do |m|
|
529
|
+
m.head_id = first_conjunct.id
|
530
|
+
conjuncts.each { |c| c.add_slash! [m.id, m.relation] }
|
531
|
+
end
|
532
|
+
end
|
533
|
+
|
534
|
+
def add_slash!(slash)
|
535
|
+
@slashes << slash
|
536
|
+
end
|
537
|
+
|
538
|
+
# Inverts the direction of a dependency relation. By default the
|
539
|
+
# labels are also swapped, but new relations can be specified
|
540
|
+
# for both the new dependent and the new head.
|
541
|
+
def invert!(new_dependent_relation = nil, new_head_relation = nil)
|
542
|
+
raise "Cannot promote a token under root!" if @head_id == 0
|
543
|
+
new_dependent_relation ||= @relation
|
544
|
+
new_head_relation ||= head.relation
|
545
|
+
new_head_id = head.head_id
|
546
|
+
|
547
|
+
head.head_id = @id
|
548
|
+
head.relation = new_dependent_relation
|
549
|
+
@head_id = new_head_id
|
550
|
+
self.relation = new_head_relation
|
551
|
+
end
|
552
|
+
|
553
|
+
# promotes a node to its head's place. The node takes over its
|
554
|
+
# former head's relation and all dependents. The new relation
|
555
|
+
# for these dependents can be specified; if it is not, they will
|
556
|
+
# keep their former relation. The former head is made a
|
557
|
+
# dependent of the node (with a specified relation) or,
|
558
|
+
# if it is an empty node, destroyed.
|
559
|
+
|
560
|
+
def promote!(new_sibling_relation = nil, new_dependent_relation = 'aux')
|
561
|
+
raise "Cannot promote a token under root!" if @head_id == 0
|
562
|
+
new_head_relation = head.relation
|
563
|
+
new_head_id = head.head_id
|
564
|
+
|
565
|
+
# move all dependents of the former head to the new one
|
566
|
+
siblings.each do |t|
|
567
|
+
t.head_id = @id
|
568
|
+
# ugly hack to avoid overwriting the aux relation here (aux siblings aren't really siblings)
|
569
|
+
t.relation = new_sibling_relation if (new_sibling_relation and t.relation != 'aux')
|
570
|
+
end
|
571
|
+
|
572
|
+
# remove the former head if it was empty
|
573
|
+
if head.is_empty?
|
574
|
+
@sentence.remove_token!(head)
|
575
|
+
# else make it a dependent of the new head
|
576
|
+
else
|
577
|
+
head.head_id = @id
|
578
|
+
head.relation = new_dependent_relation
|
579
|
+
end
|
580
|
+
|
581
|
+
@head_id = new_head_id
|
582
|
+
# don't use relation=, as we don't want this relation to be
|
583
|
+
# copied down a tree of conjunctions
|
584
|
+
@relation = new_head_relation
|
585
|
+
end
|
586
|
+
end
|
587
|
+
end
|
588
|
+
end
|
589
|
+
end
|