proiel-cli 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +21 -15
- data/lib/proiel/cli/converters/conll-u.rb +137 -48
- data/lib/proiel/cli/converters/conll-u/morphology.rb +43 -31
- data/lib/proiel/cli/converters/conll-u/syntax.rb +26 -19
- data/lib/proiel/cli/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98027bdd669bde3fa19db4f6159e3b36d5024f1f
|
4
|
+
data.tar.gz: 4e6019add4436629e2488fc03d19f3d040c48149
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 301a294838fb2ee805bcabbdb5425d39815749c8d4ccea9e8f7883ccaee4fee09ffb0cd45ad8ea0bef06cfb7b80a241b4acb87d224079101863be2d8f0614c7a
|
7
|
+
data.tar.gz: ca1bbaadca8be10e714984ab8ae1f460e6c61d3e5c7b09882c8819e95aadcccc152c69e53e1bd0072be6fc30491c422a84c6886fbd4f7f203b17cfa517f92310
|
data/README.md
CHANGED
@@ -12,25 +12,31 @@ gem install proiel-cli
|
|
12
12
|
|
13
13
|
## Using the command-line interface
|
14
14
|
|
15
|
-
|
16
|
-
`proiel info`, for example, displays metadata and some brief statistics, and
|
17
|
-
`proiel convert conll` converts the treebank to CoNLL format. Use `proiel
|
18
|
-
--help` for further examples and usage instructions.
|
15
|
+
This gem includes a command-line utility, `proiel`, which solves various routine tasks involving PROIEL-style treebanks.
|
19
16
|
|
20
|
-
|
17
|
+
`proiel info`, for example, displays metadata and some brief statistics, and `proiel convert conll` converts the treebank to CoNLL format. Use `proiel --help` for further examples and usage instructions.
|
21
18
|
|
22
|
-
|
19
|
+
To use the `visualize` command you will need to have [graphviz](http://graphviz.org) installed. On macOS you can use [Homebrew](https://brew.sh/) for this:
|
23
20
|
|
24
|
-
|
25
|
-
|
21
|
+
```shell
|
22
|
+
brew install graphviz
|
23
|
+
```
|
26
24
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
25
|
+
Make sure that the `dot` command is available in the path:
|
26
|
+
|
27
|
+
```shell
|
28
|
+
$ which dot
|
29
|
+
/usr/local/bin/dot
|
30
|
+
```
|
32
31
|
|
33
32
|
## Contributing
|
34
33
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub
|
36
|
-
|
34
|
+
Bug reports and pull requests are welcome on [GitHub](https://github.com/proiel/proiel-cli/issues).
|
35
|
+
|
36
|
+
## Development
|
37
|
+
|
38
|
+
To contribute to development, check out the git repository from [GitHub](https://github.com/proiel/proiel-cli) and run `bin/setup` to install all development dependencies. Then run `rake` to run the tests.
|
39
|
+
|
40
|
+
To install a development version of this gem, run `bundle exec rake install`.
|
41
|
+
|
42
|
+
To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the gem to [rubygems.org](https://rubygems.org).
|
@@ -1,6 +1,12 @@
|
|
1
1
|
require 'proiel/cli/converters/conll-u/morphology'
|
2
2
|
require 'proiel/cli/converters/conll-u/syntax'
|
3
3
|
|
4
|
+
# Unlike other conversions, this one has to rely on
|
5
|
+
# certain assumptions about correct linguistic
|
6
|
+
# annotation in order to produce a meaningful
|
7
|
+
# representation in CoNLL-U
|
8
|
+
|
9
|
+
|
4
10
|
module PROIEL
|
5
11
|
module Converter
|
6
12
|
class CoNLLU
|
@@ -13,16 +19,19 @@ module PROIEL
|
|
13
19
|
div.sentences.each do |sentence|
|
14
20
|
sentence_count += 1
|
15
21
|
n = Sentence.new sentence
|
16
|
-
# Unlike other conversions, this one has to rely on
|
17
|
-
# certain assumptions about correct linguistic
|
18
|
-
# annotation in order to producea meaningful
|
19
|
-
# representation in CoNLL-U
|
20
22
|
begin
|
21
|
-
|
23
|
+
# Do the conversion first to avoid spurious headers if the conversion fails
|
24
|
+
a = n.convert.to_conll
|
25
|
+
puts "# source = #{source.title}, #{div.title}"
|
26
|
+
# using printable_form would give us punctuation, which must then be added to the tree
|
27
|
+
puts "# text = #{sentence.tokens.map(&:form).compact.join(' ')}"
|
28
|
+
puts "# sent_id = #{sentence.id}"
|
29
|
+
puts a
|
22
30
|
puts
|
23
31
|
rescue => e
|
24
32
|
error_count += 1
|
25
33
|
STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
|
34
|
+
STDERR.puts e.backtrace.join("\n") unless e.is_a? RuntimeError
|
26
35
|
end
|
27
36
|
end
|
28
37
|
end
|
@@ -40,16 +49,55 @@ module PROIEL
|
|
40
49
|
|
41
50
|
id_to_number = Hash.new(0) #will return id 0 (i.e. root) for nil
|
42
51
|
|
43
|
-
|
52
|
+
# initialize array to hold the sentence tokens
|
53
|
+
tks = []
|
54
|
+
# keep track of how many new tokens have been created
|
55
|
+
offset = 0
|
44
56
|
|
45
|
-
|
57
|
+
sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
|
58
|
+
|
59
|
+
if tk.form =~ /[[:space:]]/
|
60
|
+
subtoks = tk.form.split(/[[:space:]]/)
|
61
|
+
|
62
|
+
subtoks.each_with_index do |subtok, i|
|
63
|
+
tks << PROIEL::Token.new(sentence,
|
64
|
+
(i == 0 ? tk.id : 1000 + offset), # id
|
65
|
+
(i == 0 ? tk.head_id : tk.id), # head_id
|
66
|
+
subtok,
|
67
|
+
# hope the lemmas split the same way as the tokens. Grab the form if you don't find a lemma
|
68
|
+
(tk.lemma.split(/[[:space:]]/)[i] || subtok),
|
69
|
+
tk.part_of_speech, # copy the postag
|
70
|
+
tk.morphology,
|
71
|
+
(i == 0 ? tk.relation : "flat"),
|
72
|
+
nil, #empty_token_sort
|
73
|
+
tk.citation_part,
|
74
|
+
(i == 0 ? tk.presentation_before : nil),
|
75
|
+
(i == (subtoks.size - 1) ? tk.presentation_after : nil),
|
76
|
+
(i == 0 ? tk.antecedent_id : nil),
|
77
|
+
(i == 0 ? tk.information_status : nil),
|
78
|
+
(i == 0 ? tk.contrast_group : nil),
|
79
|
+
(i == 0 ? tk.foreign_ids : nil),
|
80
|
+
(i == 0 ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel} ) } : []), # This needs to be given a real slash object for the initialization, although it throws away the info
|
81
|
+
(subtok == subtoks.first ? tk.alignment_id : nil)
|
82
|
+
)
|
83
|
+
offset += 1
|
84
|
+
end
|
85
|
+
else
|
86
|
+
tks << tk
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
tks.map(&:id).each_with_index.each do |id, i|
|
46
92
|
id_to_number[id] = i + 1
|
47
93
|
end
|
48
94
|
|
49
|
-
@tokens =
|
95
|
+
@tokens = tks.map do |t|
|
96
|
+
|
50
97
|
Token.new(id_to_number[t.id],
|
51
98
|
id_to_number[t.head_id],
|
52
|
-
|
99
|
+
#insert dots in any whitespace inside words and lemmata
|
100
|
+
t.form.to_s.gsub(/[[:space:]]/, '.'),
|
53
101
|
t.lemma.to_s.gsub(/[[:space:]]/, '.'),
|
54
102
|
t.part_of_speech,
|
55
103
|
t.language,
|
@@ -197,11 +245,19 @@ module PROIEL
|
|
197
245
|
res.compact.join('|')
|
198
246
|
end
|
199
247
|
|
248
|
+
def genitive?
|
249
|
+
@morphology =~ /......g.*/
|
250
|
+
end
|
251
|
+
|
200
252
|
# returns +true+ if the node is an adjective or an ordinal
|
201
253
|
def adjectival?
|
202
254
|
@part_of_speech == 'A-' or @part_of_speech == 'Mo'
|
203
255
|
end
|
204
256
|
|
257
|
+
def subjunction?
|
258
|
+
@part_of_speech == 'G-'
|
259
|
+
end
|
260
|
+
|
205
261
|
def adverb?
|
206
262
|
@part_of_speech =~ /\AD/
|
207
263
|
end
|
@@ -236,6 +292,10 @@ module PROIEL
|
|
236
292
|
dependents.any? { |d| d.relation == 'xobj' } )
|
237
293
|
end
|
238
294
|
|
295
|
+
def auxiliary?
|
296
|
+
AUXILIARIES.include?([lemma, part_of_speech, language].join(','))
|
297
|
+
end
|
298
|
+
|
239
299
|
def determiner?
|
240
300
|
DETERMINERS.include? @part_of_speech
|
241
301
|
end
|
@@ -260,8 +320,16 @@ module PROIEL
|
|
260
320
|
!has_content?
|
261
321
|
end
|
262
322
|
|
323
|
+
def deponent?
|
324
|
+
DEPONENTS[@language] and DEPONENTS[@language].match(@lemma)
|
325
|
+
end
|
326
|
+
|
263
327
|
def mediopassive?
|
264
|
-
@morphology[4] =~/[mpe]/
|
328
|
+
(!deponent? and @morphology) ? @morphology[4] =~/[mpe]/ : false
|
329
|
+
end
|
330
|
+
|
331
|
+
def passive?
|
332
|
+
(!deponent? and @morphology) ? @morphology[4] == 'p' : false
|
265
333
|
end
|
266
334
|
|
267
335
|
def negation?
|
@@ -277,15 +345,19 @@ module PROIEL
|
|
277
345
|
d.determiner? and ['atr', 'aux', 'det'].include? d.relation
|
278
346
|
end
|
279
347
|
end
|
280
|
-
|
348
|
+
|
349
|
+
def TAM_particle?
|
350
|
+
@relation == 'aux' and TAM_PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
|
351
|
+
end
|
352
|
+
|
281
353
|
def particle?
|
282
354
|
@relation == 'aux' and PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
|
283
355
|
end
|
284
356
|
|
285
|
-
def
|
286
|
-
@
|
357
|
+
def pronominal?
|
358
|
+
@part_of_speech =~ /\AP[^st]/ # no evidence that possessives are pronoun/determiner-like
|
287
359
|
end
|
288
|
-
|
360
|
+
|
289
361
|
def preposition?
|
290
362
|
@part_of_speech == 'R-'
|
291
363
|
end
|
@@ -343,7 +415,7 @@ module PROIEL
|
|
343
415
|
features.split("|").sort.join("|")
|
344
416
|
end
|
345
417
|
end
|
346
|
-
|
418
|
+
|
347
419
|
def to_conll
|
348
420
|
[@id,
|
349
421
|
@form,
|
@@ -390,12 +462,24 @@ module PROIEL
|
|
390
462
|
end
|
391
463
|
end
|
392
464
|
|
465
|
+
def find_postag possible_postags
|
466
|
+
tag, crit, feats = possible_postags.shift
|
467
|
+
if tag.nil?
|
468
|
+
# raise "Found no postag"
|
469
|
+
elsif crit.call self
|
470
|
+
@upos = tag
|
471
|
+
@features += ((@features.empty? ? '' : '|') + feats) if feats
|
472
|
+
else
|
473
|
+
find_postag possible_postags
|
474
|
+
end
|
475
|
+
end
|
476
|
+
|
393
477
|
def find_relation possible_relations
|
394
478
|
rel, crit = possible_relations.shift
|
395
479
|
if rel.nil?
|
396
480
|
# raise "Found no relation"
|
397
481
|
elsif crit.call self
|
398
|
-
|
482
|
+
rel
|
399
483
|
else
|
400
484
|
find_relation possible_relations
|
401
485
|
end
|
@@ -403,25 +487,29 @@ module PROIEL
|
|
403
487
|
|
404
488
|
def map_part_of_speech!
|
405
489
|
dependents.each(&:map_part_of_speech!)
|
406
|
-
|
407
|
-
|
408
|
-
if feat = POS_MAP[@part_of_speech][1]
|
409
|
-
@features += ((@features.empty? ? '' : '|') + feat)
|
410
|
-
end
|
490
|
+
possible_postags = POS_MAP[@part_of_speech]
|
491
|
+
find_postag possible_postags.dup
|
411
492
|
# ugly, but the ugliness comes from UDEP
|
412
493
|
@upos = 'ADJ' if @upos == 'DET' and @relation != 'det'
|
413
494
|
end
|
414
495
|
|
415
496
|
def relabel_graph!
|
416
497
|
dependents.each(&:relabel_graph!)
|
498
|
+
# TODO: if there are iobjs without an obj among the dependents, one of them should be promoted to obj
|
499
|
+
@relation = map_relation
|
500
|
+
raise "No relation for #{form}" unless @relation
|
501
|
+
end
|
502
|
+
|
503
|
+
def map_relation
|
417
504
|
possible_relations = RELATION_MAPPING[@relation]
|
418
505
|
case possible_relations
|
419
506
|
when String
|
420
|
-
|
507
|
+
possible_relations
|
421
508
|
when Array
|
422
|
-
find_relation possible_relations.dup
|
509
|
+
x = find_relation possible_relations.dup
|
423
510
|
when nil
|
424
|
-
|
511
|
+
# do nothing: the token has already changed its relation
|
512
|
+
@relation
|
425
513
|
else
|
426
514
|
raise "Unknown value #{possible_relations.inspect} for #{@relation}"
|
427
515
|
end
|
@@ -447,35 +535,28 @@ module PROIEL
|
|
447
535
|
end
|
448
536
|
end
|
449
537
|
|
450
|
-
|
451
|
-
|
452
|
-
# TODO: process "implicit pid" through APOS chain too
|
453
538
|
def process_ellipsis!
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
overt = pid
|
458
|
-
# otherwise, try a conjunct
|
459
|
-
elsif @relation == 'conj'
|
460
|
-
overt = conj_head
|
461
|
-
elsif @relation == 'apos'
|
462
|
-
overt = find_appositive_head
|
463
|
-
else
|
539
|
+
aux = dependents.select(&:auxiliary?).first
|
540
|
+
if aux
|
541
|
+
aux.promote!
|
464
542
|
return
|
465
543
|
end
|
466
544
|
|
467
|
-
|
545
|
+
new_head = find_highest_daughter
|
546
|
+
new_head.promote!('orphan')
|
547
|
+
|
548
|
+
# dependents.each do |d|
|
468
549
|
# check if there's a partner with the same relation under the overt node.
|
469
550
|
# TODO: this isn't really very convincing when it comes to ADVs
|
470
|
-
if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
|
471
|
-
partner = partner.find_remnant
|
472
|
-
d.head_id = partner.id
|
473
|
-
d.relation = 'remnant'
|
551
|
+
# if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
|
552
|
+
# partner = partner.find_remnant
|
553
|
+
# d.head_id = partner.id
|
554
|
+
# d.relation = 'remnant'
|
474
555
|
# if there's no partner, just attach under the overt node, preserving the relation
|
475
|
-
else
|
476
|
-
d.head_id = overt.id
|
477
|
-
end
|
478
|
-
end
|
556
|
+
# else
|
557
|
+
# d.head_id = overt.id
|
558
|
+
# end
|
559
|
+
# end
|
479
560
|
@sentence.remove_token!(self)
|
480
561
|
end
|
481
562
|
|
@@ -487,12 +568,20 @@ module PROIEL
|
|
487
568
|
end
|
488
569
|
end
|
489
570
|
|
571
|
+
def find_highest_daughter
|
572
|
+
dependents.min_by { |d| OBLIQUENESS_HIERARCHY.find_index(d.map_relation[/[^:]*/]) || 1000 }
|
573
|
+
end
|
574
|
+
|
490
575
|
def process_copula!
|
491
576
|
predicates = dependents.select { |d| d.relation == 'xobj' }
|
492
577
|
raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}" if predicates.size != 1
|
493
578
|
predicates.first.promote!(nil, 'cop')
|
494
579
|
end
|
495
580
|
|
581
|
+
def has_preposition?
|
582
|
+
dependents.any? { |d| d.preposition? and d.relation == "case" }
|
583
|
+
end
|
584
|
+
|
496
585
|
def process_preposition!
|
497
586
|
raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'
|
498
587
|
obliques = dependents.select { |d| d.relation == 'obl' }
|
@@ -519,9 +608,9 @@ module PROIEL
|
|
519
608
|
raise "Only coordinations can be processed this way!" unless conjunction?
|
520
609
|
return if dependents.reject { |d| d.relation == 'aux' }.empty?
|
521
610
|
distribute_shared_modifiers!
|
522
|
-
dependents.reject { |d| d.relation == 'aux' }.first.promote!("conj", "cc")
|
611
|
+
dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id }.first.promote!("conj", "cc")
|
523
612
|
end
|
524
|
-
|
613
|
+
|
525
614
|
def distribute_shared_modifiers!
|
526
615
|
raise "Can only distribute over a conjunction!" unless conjunction?
|
527
616
|
conjuncts, modifiers = dependents.reject { |d| d.relation == 'aux' }.partition { |d| d.relation == @relation or (d.relation == 'adv' and @relation == 'xadv') }
|
@@ -2,8 +2,13 @@
|
|
2
2
|
module PROIEL
|
3
3
|
module Converter
|
4
4
|
class CoNLLU
|
5
|
+
|
6
|
+
# try to guess deponency based on the lemma
|
7
|
+
DEPONENTS = { 'lat' => /r\Z/,
|
8
|
+
'grc' => /ομαι\Z/ }
|
5
9
|
COPULAR_LEMMATA = ['sum,V-,lat', 'εἰμί#1,V-,grc']
|
6
|
-
|
10
|
+
AUXILIARIES = COPULAR_LEMMATA + []
|
11
|
+
DETERMINERS = ['S-', 'Pd', 'Px']
|
7
12
|
NEGATION_LEMMATA = ['non,Df,lat', 'ne,Df,lat',
|
8
13
|
'μή,Df,grc',
|
9
14
|
'μήγε,Df,grc',
|
@@ -35,8 +40,10 @@ module PROIEL
|
|
35
40
|
'nibai#2,Df,got',
|
36
41
|
'nih,Df,got',
|
37
42
|
]
|
38
|
-
|
39
|
-
|
43
|
+
|
44
|
+
TAM_PARTICLE_LEMMATA = ['ἄν,Df,grc',
|
45
|
+
]
|
46
|
+
|
40
47
|
PARTICLE_LEMMATA = [ 'at,Df,lat',
|
41
48
|
'atque,Df,lat',
|
42
49
|
'autem,Df,lat',
|
@@ -59,7 +66,6 @@ module PROIEL
|
|
59
66
|
'tunc,Df,lat',
|
60
67
|
'vero,Df,lat',
|
61
68
|
'ἅμα,Df,grc',
|
62
|
-
'ἄν,Df,grc',
|
63
69
|
'ἀνά,Df,grc',
|
64
70
|
'ἆρα,Df,grc',
|
65
71
|
'ἄραγε,Df,grc',
|
@@ -137,33 +143,39 @@ module PROIEL
|
|
137
143
|
|
138
144
|
POS_MAP =
|
139
145
|
{
|
140
|
-
'A-' => ['ADJ'],
|
141
|
-
'
|
142
|
-
'
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
'
|
147
|
-
'
|
148
|
-
'
|
149
|
-
'
|
150
|
-
'I-' => ['INTJ'],
|
151
|
-
'
|
152
|
-
'
|
153
|
-
'
|
154
|
-
'
|
155
|
-
'
|
156
|
-
'
|
157
|
-
'
|
158
|
-
'
|
159
|
-
'
|
160
|
-
|
161
|
-
'
|
162
|
-
'
|
163
|
-
'
|
164
|
-
'
|
165
|
-
'
|
166
|
-
'
|
146
|
+
'A-' => [['ADJ', lambda { |x| true } ]],
|
147
|
+
'C-' => [['CCONJ', lambda { |x| true } ]],
|
148
|
+
'Df' => [['AUX', lambda(&:TAM_particle?)],
|
149
|
+
['ADV', lambda(&:negation?), "Polarity=Neg"],
|
150
|
+
['ADV', lambda { |x| true } ]
|
151
|
+
],
|
152
|
+
'Dq' => [['ADV', lambda { |x| true }, "PronType=Rel"]],
|
153
|
+
'Du' => [['ADV', lambda { |x| true }, "PronType=Int"]],
|
154
|
+
'F-' => [['X', lambda { |x| true } ]],
|
155
|
+
'G-' => [['SCONJ', lambda { |x| true } ]],
|
156
|
+
'I-' => [['INTJ', lambda { |x| true } ]],
|
157
|
+
'Ma' => [['NUM', lambda { |x| true } ]],
|
158
|
+
'Mo' => [['ADJ', lambda { |x| true } ]],
|
159
|
+
'N-' => [['SCONJ', lambda { |x| true } ]], #irrelevant for our purposes
|
160
|
+
'Nb' => [['NOUN', lambda { |x| true } ]],
|
161
|
+
'Ne' => [['PROPN', lambda { |x| true } ]],
|
162
|
+
'Pc' => [['PRON', lambda { |x| true }, "PronType=Rcp"]],
|
163
|
+
'Pd' => [['DET', lambda { |x| true } ]],
|
164
|
+
'Pi' => [['PRON', lambda { |x| true }, "PronType=Int"]],
|
165
|
+
'Pk' => [['AUX', lambda { |x| x.relation == 'aux' }],
|
166
|
+
['PRON', lambda { |x| true }, "PronType=Prs|Reflex=Yes"]],
|
167
|
+
'Pp' => [['PRON', lambda { |x| true }, "PronType=Prs"]],
|
168
|
+
'Pr' => [['PRON', lambda { |x| true }, "PronType=Rel"]],
|
169
|
+
'Ps' => [['ADJ', lambda { |x| true }, "Poss=Yes"]], ### NB no evidence for a pronominal/determiner-like nature here
|
170
|
+
'Pt' => [['ADJ', lambda { |x| true }, "Poss=Yes|Reflex=Yes" ]], ### NB no evidence for a pronominal/determiner-like nature here
|
171
|
+
'Px' => [['DET', lambda { |x| true } ]],
|
172
|
+
'Py' => [['PRON', lambda { |x| true } ]],
|
173
|
+
'R-' => [['ADP', lambda { |x| true } ]],
|
174
|
+
'V-' => [['AUX', lambda(&:auxiliary?)],
|
175
|
+
['VERB', lambda { |x| true } ]],
|
176
|
+
'S-' => [['DET', lambda { |x| true }, "Definite=Def|PronType=Dem"]], # (we only have definite articles)
|
177
|
+
'X-' => [['X', lambda { |x| true } ]]
|
178
|
+
}
|
167
179
|
|
168
180
|
MORPHOLOGY_MAP = {
|
169
181
|
:person => {'1' => 'Person=1',
|
@@ -1,16 +1,19 @@
|
|
1
1
|
module PROIEL
|
2
2
|
module Converter
|
3
3
|
class CoNLLU
|
4
|
+
|
5
|
+
OBLIQUENESS_HIERARCHY = ["nsubj", "obj", "iobj", "obl", "advmod", "csubj", "xcomp", "ccomp", "advcl"]
|
6
|
+
|
4
7
|
RELATION_MAPPING = {
|
5
8
|
"adnom" => "dep",
|
6
9
|
"adv" => [["advcl", lambda(&:clausal?) ],
|
7
10
|
["advmod", lambda { |x| x.adverb? or x.preposition? } ],
|
8
11
|
["advmod", lambda(&:adjectival?) ], # adjective for adverb
|
9
|
-
["
|
12
|
+
["obl", lambda(&:nominal?) ],
|
10
13
|
["advmod", lambda { |x| true } ],
|
11
14
|
],
|
12
|
-
"ag" => "
|
13
|
-
"apos" => [["name", lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ],
|
15
|
+
"ag" => "obl:agent", # add :agent" once defined
|
16
|
+
"apos" => [["flat:name", lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ],
|
14
17
|
["appos", lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.nominal? } ],
|
15
18
|
["acl", lambda { |x| x.clausal? and x.head and x.head.nominal? } ], # add :relcl ?
|
16
19
|
# what to do about sentential appositions?
|
@@ -19,44 +22,48 @@ module PROIEL
|
|
19
22
|
],
|
20
23
|
"arg" => "dep",
|
21
24
|
"atr" => [["nummod", lambda(&:cardinal?) ],
|
22
|
-
["
|
25
|
+
["det", lambda { |x| x.pronominal? and !(!x.genitive? and x.head and x.head.genitive?) } ], #TODO check
|
26
|
+
["nmod", lambda(&:nominal?) ],
|
23
27
|
["acl", lambda { |x| x.clausal? } ], # add :relcl?
|
24
28
|
["advmod", lambda { |x| x.head and x.head.clausal? } ],
|
25
|
-
["det", lambda(&:determiner?) ],
|
26
29
|
["amod", lambda { |x| true } ], #default
|
27
30
|
],
|
28
31
|
"aux" => [["det", lambda(&:determiner?) ],
|
29
|
-
["
|
30
|
-
["aux", lambda(&:clausal?) ],
|
31
|
-
["
|
32
|
+
["aux:pass", lambda { |x| x.clausal? and x.head.passive? } ],
|
33
|
+
["aux", lambda(&:clausal?) ], #v2 probably want the modal particle an to go here too in
|
34
|
+
["advmod", lambda(&:negation?) ],
|
32
35
|
["discourse", lambda { |x| x.particle? or x.interjection? } ],
|
33
|
-
|
36
|
+
# include subjunctions that are aux here; (root sentences with subjunction)
|
37
|
+
["advmod", lambda { |x| x.adjectival? or x.adverb? or x.subjunction? } ],
|
34
38
|
["cc", lambda(&:conjunction?) ],
|
35
|
-
["foreign", lambda(&:foreign?) ],
|
39
|
+
["flat:foreign", lambda(&:foreign?) ],
|
36
40
|
# We need some more distinctions to get Gothic and Armenian. Introduce language in the treebank? (Read from xml)
|
37
|
-
["mark", lambda { |x| ['
|
41
|
+
["mark", lambda { |x| ['R-'].include? x.part_of_speech } ], #'R-' as infinitive marker in Gothic
|
42
|
+
["aux", lambda { |x| ['Pk' ].include? x.part_of_speech } ], #reflexive as valency reducer
|
38
43
|
['amod', lambda { |x| x.preposition? } ], # Armenian DOM
|
39
|
-
['
|
44
|
+
['fixed', lambda { |x| ['Px', 'Pr'].include? x.part_of_speech } ], # NB there are a lot of bogus annotations with 'Px'
|
40
45
|
|
41
46
|
# MISANNOTATION IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps'
|
42
47
|
],
|
43
|
-
"comp" => [['
|
48
|
+
"comp" => [['csubj:pass', lambda { |x| x.head and x.head.passive? } ],
|
44
49
|
['csubj', lambda { |x| x.head and x.head.copula? } ],
|
45
50
|
['ccomp', lambda { |x| true } ],
|
46
51
|
],
|
47
52
|
"expl" => "expl",
|
48
53
|
"narg" => [['acl', lambda(&:clausal?) ],
|
49
|
-
['nmod', lambda(&:nominal?) ],
|
54
|
+
['nmod', lambda(&:nominal?) ],
|
50
55
|
['nmod', lambda(&:adjectival?) ], # nominalized in this function
|
51
56
|
['nmod', lambda { |x| true } ],
|
52
57
|
],
|
53
58
|
"nonsub" => "dep",
|
54
|
-
"obj" => "
|
55
|
-
"obl" => [
|
56
|
-
["
|
59
|
+
"obj" => "obj:dir",
|
60
|
+
"obl" => [# normally a preposition will be subordinate to its noun, this captures adverbial use of prepositions
|
61
|
+
["advmod", lambda { |x| x.adverb? or x.preposition? } ],
|
62
|
+
["obl", lambda { |x| x.has_preposition? } ],
|
63
|
+
["iobj", lambda(&:nominal?) ],# if nominal (NB check for presence of article!) TODO: should be "obj" if the verb is monovalent (even by elision)
|
57
64
|
["iobj", lambda(&:adjectival?) ], # OBL adjectives are nominalized
|
58
65
|
["advcl", lambda(&:clausal?) ], # this seems to happen with ad libros legendos etc. but check closer!
|
59
|
-
["iobj", lambda { |x| true } ],
|
66
|
+
["iobj", lambda { |x| true } ],
|
60
67
|
],
|
61
68
|
"parpred" => "parataxis",
|
62
69
|
"part" => "nmod",
|
@@ -66,7 +73,7 @@ module PROIEL
|
|
66
73
|
["ERROR", lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }],
|
67
74
|
],
|
68
75
|
"rel" => "acl", # add :relcl?
|
69
|
-
"sub" => [["
|
76
|
+
"sub" => [["nsubj:pass", lambda { |x| x.head and x.head.passive? } ],
|
70
77
|
["nsubj", lambda { |x| true }],
|
71
78
|
],
|
72
79
|
"voc" => "vocative",
|
data/lib/proiel/cli/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-03-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: builder
|
@@ -250,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
250
250
|
version: '0'
|
251
251
|
requirements: []
|
252
252
|
rubyforge_project:
|
253
|
-
rubygems_version: 2.
|
253
|
+
rubygems_version: 2.6.8
|
254
254
|
signing_key:
|
255
255
|
specification_version: 4
|
256
256
|
summary: A command-line interface for working with PROIEL treebanks
|