proiel-cli 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -15
- data/lib/proiel/cli/converters/conll-u.rb +137 -48
- data/lib/proiel/cli/converters/conll-u/morphology.rb +43 -31
- data/lib/proiel/cli/converters/conll-u/syntax.rb +26 -19
- data/lib/proiel/cli/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98027bdd669bde3fa19db4f6159e3b36d5024f1f
|
4
|
+
data.tar.gz: 4e6019add4436629e2488fc03d19f3d040c48149
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 301a294838fb2ee805bcabbdb5425d39815749c8d4ccea9e8f7883ccaee4fee09ffb0cd45ad8ea0bef06cfb7b80a241b4acb87d224079101863be2d8f0614c7a
|
7
|
+
data.tar.gz: ca1bbaadca8be10e714984ab8ae1f460e6c61d3e5c7b09882c8819e95aadcccc152c69e53e1bd0072be6fc30491c422a84c6886fbd4f7f203b17cfa517f92310
|
data/README.md
CHANGED
@@ -12,25 +12,31 @@ gem install proiel-cli
|
|
12
12
|
|
13
13
|
## Using the command-line interface
|
14
14
|
|
15
|
-
|
16
|
-
`proiel info`, for example, displays metadata and some brief statistics, and
|
17
|
-
`proiel convert conll` converts the treebank to CoNLL format. Use `proiel
|
18
|
-
--help` for further examples and usage instructions.
|
15
|
+
This gem includes a command-line utility, `proiel`, which solves various routine tasks involving PROIEL-style treebanks.
|
19
16
|
|
20
|
-
|
17
|
+
`proiel info`, for example, displays metadata and some brief statistics, and `proiel convert conll` converts the treebank to CoNLL format. Use `proiel --help` for further examples and usage instructions.
|
21
18
|
|
22
|
-
|
19
|
+
To use the `visualize` command you will need to have [graphviz](http://graphviz.org) installed. On macOS you can use [Homebrew](https://brew.sh/) for this:
|
23
20
|
|
24
|
-
|
25
|
-
|
21
|
+
```shell
|
22
|
+
brew install graphviz
|
23
|
+
```
|
26
24
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
25
|
+
Make sure that the `dot` command is available in the path:
|
26
|
+
|
27
|
+
```shell
|
28
|
+
$ which dot
|
29
|
+
/usr/local/bin/dot
|
30
|
+
```
|
32
31
|
|
33
32
|
## Contributing
|
34
33
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub
|
36
|
-
|
34
|
+
Bug reports and pull requests are welcome on [GitHub](https://github.com/proiel/proiel-cli/issues).
|
35
|
+
|
36
|
+
## Development
|
37
|
+
|
38
|
+
To contribute to development, check out the git repository from [GitHub](https://github.com/proiel/proiel-cli) and run `bin/setup` to install all development dependencies. Then run `rake` to run the tests.
|
39
|
+
|
40
|
+
To install a development version of this gem, run `bundle exec rake install`.
|
41
|
+
|
42
|
+
To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the gem to [rubygems.org](https://rubygems.org).
|
@@ -1,6 +1,12 @@
|
|
1
1
|
require 'proiel/cli/converters/conll-u/morphology'
|
2
2
|
require 'proiel/cli/converters/conll-u/syntax'
|
3
3
|
|
4
|
+
# Unlike other conversions, this one has to rely on
|
5
|
+
# certain assumptions about correct linguistic
|
6
|
+
# annotation in order to produce a meaningful
|
7
|
+
# representation in CoNLL-U
|
8
|
+
|
9
|
+
|
4
10
|
module PROIEL
|
5
11
|
module Converter
|
6
12
|
class CoNLLU
|
@@ -13,16 +19,19 @@ module PROIEL
|
|
13
19
|
div.sentences.each do |sentence|
|
14
20
|
sentence_count += 1
|
15
21
|
n = Sentence.new sentence
|
16
|
-
# Unlike other conversions, this one has to rely on
|
17
|
-
# certain assumptions about correct linguistic
|
18
|
-
# annotation in order to producea meaningful
|
19
|
-
# representation in CoNLL-U
|
20
22
|
begin
|
21
|
-
|
23
|
+
# Do the conversion first to avoid spurious headers if the conversion fails
|
24
|
+
a = n.convert.to_conll
|
25
|
+
puts "# source = #{source.title}, #{div.title}"
|
26
|
+
# using printable_form would give us punctuation, which must then be added to the tree
|
27
|
+
puts "# text = #{sentence.tokens.map(&:form).compact.join(' ')}"
|
28
|
+
puts "# sent_id = #{sentence.id}"
|
29
|
+
puts a
|
22
30
|
puts
|
23
31
|
rescue => e
|
24
32
|
error_count += 1
|
25
33
|
STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
|
34
|
+
STDERR.puts e.backtrace.join("\n") unless e.is_a? RuntimeError
|
26
35
|
end
|
27
36
|
end
|
28
37
|
end
|
@@ -40,16 +49,55 @@ module PROIEL
|
|
40
49
|
|
41
50
|
id_to_number = Hash.new(0) #will return id 0 (i.e. root) for nil
|
42
51
|
|
43
|
-
|
52
|
+
# initialize array to hold the sentence tokens
|
53
|
+
tks = []
|
54
|
+
# keep track of how many new tokens have been created
|
55
|
+
offset = 0
|
44
56
|
|
45
|
-
|
57
|
+
sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
|
58
|
+
|
59
|
+
if tk.form =~ /[[:space:]]/
|
60
|
+
subtoks = tk.form.split(/[[:space:]]/)
|
61
|
+
|
62
|
+
subtoks.each_with_index do |subtok, i|
|
63
|
+
tks << PROIEL::Token.new(sentence,
|
64
|
+
(i == 0 ? tk.id : 1000 + offset), # id
|
65
|
+
(i == 0 ? tk.head_id : tk.id), # head_id
|
66
|
+
subtok,
|
67
|
+
# hope the lemmas split the same way as the tokens. Grab the form if you don't find a lemma
|
68
|
+
(tk.lemma.split(/[[:space:]]/)[i] || subtok),
|
69
|
+
tk.part_of_speech, # copy the postag
|
70
|
+
tk.morphology,
|
71
|
+
(i == 0 ? tk.relation : "flat"),
|
72
|
+
nil, #empty_token_sort
|
73
|
+
tk.citation_part,
|
74
|
+
(i == 0 ? tk.presentation_before : nil),
|
75
|
+
(i == (subtoks.size - 1) ? tk.presentation_after : nil),
|
76
|
+
(i == 0 ? tk.antecedent_id : nil),
|
77
|
+
(i == 0 ? tk.information_status : nil),
|
78
|
+
(i == 0 ? tk.contrast_group : nil),
|
79
|
+
(i == 0 ? tk.foreign_ids : nil),
|
80
|
+
(i == 0 ? tk.slashes.map { |rel, target| PROIEL::PROIELXML::Reader::Slash.new({:'target_id' => target, :relation => rel} ) } : []), # This needs to be given a real slash object for the initialization, although it throws away the info
|
81
|
+
(subtok == subtoks.first ? tk.alignment_id : nil)
|
82
|
+
)
|
83
|
+
offset += 1
|
84
|
+
end
|
85
|
+
else
|
86
|
+
tks << tk
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
tks.map(&:id).each_with_index.each do |id, i|
|
46
92
|
id_to_number[id] = i + 1
|
47
93
|
end
|
48
94
|
|
49
|
-
@tokens =
|
95
|
+
@tokens = tks.map do |t|
|
96
|
+
|
50
97
|
Token.new(id_to_number[t.id],
|
51
98
|
id_to_number[t.head_id],
|
52
|
-
|
99
|
+
#insert dots in any whitespace inside words and lemmata
|
100
|
+
t.form.to_s.gsub(/[[:space:]]/, '.'),
|
53
101
|
t.lemma.to_s.gsub(/[[:space:]]/, '.'),
|
54
102
|
t.part_of_speech,
|
55
103
|
t.language,
|
@@ -197,11 +245,19 @@ module PROIEL
|
|
197
245
|
res.compact.join('|')
|
198
246
|
end
|
199
247
|
|
248
|
+
def genitive?
|
249
|
+
@morphology =~ /......g.*/
|
250
|
+
end
|
251
|
+
|
200
252
|
# returns +true+ if the node is an adjective or an ordinal
|
201
253
|
def adjectival?
|
202
254
|
@part_of_speech == 'A-' or @part_of_speech == 'Mo'
|
203
255
|
end
|
204
256
|
|
257
|
+
def subjunction?
|
258
|
+
@part_of_speech == 'G-'
|
259
|
+
end
|
260
|
+
|
205
261
|
def adverb?
|
206
262
|
@part_of_speech =~ /\AD/
|
207
263
|
end
|
@@ -236,6 +292,10 @@ module PROIEL
|
|
236
292
|
dependents.any? { |d| d.relation == 'xobj' } )
|
237
293
|
end
|
238
294
|
|
295
|
+
def auxiliary?
|
296
|
+
AUXILIARIES.include?([lemma, part_of_speech, language].join(','))
|
297
|
+
end
|
298
|
+
|
239
299
|
def determiner?
|
240
300
|
DETERMINERS.include? @part_of_speech
|
241
301
|
end
|
@@ -260,8 +320,16 @@ module PROIEL
|
|
260
320
|
!has_content?
|
261
321
|
end
|
262
322
|
|
323
|
+
def deponent?
|
324
|
+
DEPONENTS[@language] and DEPONENTS[@language].match(@lemma)
|
325
|
+
end
|
326
|
+
|
263
327
|
def mediopassive?
|
264
|
-
@morphology[4] =~/[mpe]/
|
328
|
+
(!deponent? and @morphology) ? @morphology[4] =~/[mpe]/ : false
|
329
|
+
end
|
330
|
+
|
331
|
+
def passive?
|
332
|
+
(!deponent? and @morphology) ? @morphology[4] == 'p' : false
|
265
333
|
end
|
266
334
|
|
267
335
|
def negation?
|
@@ -277,15 +345,19 @@ module PROIEL
|
|
277
345
|
d.determiner? and ['atr', 'aux', 'det'].include? d.relation
|
278
346
|
end
|
279
347
|
end
|
280
|
-
|
348
|
+
|
349
|
+
def TAM_particle?
|
350
|
+
@relation == 'aux' and TAM_PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
|
351
|
+
end
|
352
|
+
|
281
353
|
def particle?
|
282
354
|
@relation == 'aux' and PARTICLE_LEMMATA.include?([lemma, part_of_speech, language].join(','))
|
283
355
|
end
|
284
356
|
|
285
|
-
def
|
286
|
-
@
|
357
|
+
def pronominal?
|
358
|
+
@part_of_speech =~ /\AP[^st]/ # no evidence that possessives are pronoun/determiner-like
|
287
359
|
end
|
288
|
-
|
360
|
+
|
289
361
|
def preposition?
|
290
362
|
@part_of_speech == 'R-'
|
291
363
|
end
|
@@ -343,7 +415,7 @@ module PROIEL
|
|
343
415
|
features.split("|").sort.join("|")
|
344
416
|
end
|
345
417
|
end
|
346
|
-
|
418
|
+
|
347
419
|
def to_conll
|
348
420
|
[@id,
|
349
421
|
@form,
|
@@ -390,12 +462,24 @@ module PROIEL
|
|
390
462
|
end
|
391
463
|
end
|
392
464
|
|
465
|
+
def find_postag possible_postags
|
466
|
+
tag, crit, feats = possible_postags.shift
|
467
|
+
if tag.nil?
|
468
|
+
# raise "Found no postag"
|
469
|
+
elsif crit.call self
|
470
|
+
@upos = tag
|
471
|
+
@features += ((@features.empty? ? '' : '|') + feats) if feats
|
472
|
+
else
|
473
|
+
find_postag possible_postags
|
474
|
+
end
|
475
|
+
end
|
476
|
+
|
393
477
|
def find_relation possible_relations
|
394
478
|
rel, crit = possible_relations.shift
|
395
479
|
if rel.nil?
|
396
480
|
# raise "Found no relation"
|
397
481
|
elsif crit.call self
|
398
|
-
|
482
|
+
rel
|
399
483
|
else
|
400
484
|
find_relation possible_relations
|
401
485
|
end
|
@@ -403,25 +487,29 @@ module PROIEL
|
|
403
487
|
|
404
488
|
def map_part_of_speech!
|
405
489
|
dependents.each(&:map_part_of_speech!)
|
406
|
-
|
407
|
-
|
408
|
-
if feat = POS_MAP[@part_of_speech][1]
|
409
|
-
@features += ((@features.empty? ? '' : '|') + feat)
|
410
|
-
end
|
490
|
+
possible_postags = POS_MAP[@part_of_speech]
|
491
|
+
find_postag possible_postags.dup
|
411
492
|
# ugly, but the ugliness comes from UDEP
|
412
493
|
@upos = 'ADJ' if @upos == 'DET' and @relation != 'det'
|
413
494
|
end
|
414
495
|
|
415
496
|
def relabel_graph!
|
416
497
|
dependents.each(&:relabel_graph!)
|
498
|
+
# TODO: if there are iobjs without an obj among the dependents, one of them should be promoted to obj
|
499
|
+
@relation = map_relation
|
500
|
+
raise "No relation for #{form}" unless @relation
|
501
|
+
end
|
502
|
+
|
503
|
+
def map_relation
|
417
504
|
possible_relations = RELATION_MAPPING[@relation]
|
418
505
|
case possible_relations
|
419
506
|
when String
|
420
|
-
|
507
|
+
possible_relations
|
421
508
|
when Array
|
422
|
-
find_relation possible_relations.dup
|
509
|
+
x = find_relation possible_relations.dup
|
423
510
|
when nil
|
424
|
-
|
511
|
+
# do nothing: the token has already changed its relation
|
512
|
+
@relation
|
425
513
|
else
|
426
514
|
raise "Unknown value #{possible_relations.inspect} for #{@relation}"
|
427
515
|
end
|
@@ -447,35 +535,28 @@ module PROIEL
|
|
447
535
|
end
|
448
536
|
end
|
449
537
|
|
450
|
-
|
451
|
-
|
452
|
-
# TODO: process "implicit pid" through APOS chain too
|
453
538
|
def process_ellipsis!
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
overt = pid
|
458
|
-
# otherwise, try a conjunct
|
459
|
-
elsif @relation == 'conj'
|
460
|
-
overt = conj_head
|
461
|
-
elsif @relation == 'apos'
|
462
|
-
overt = find_appositive_head
|
463
|
-
else
|
539
|
+
aux = dependents.select(&:auxiliary?).first
|
540
|
+
if aux
|
541
|
+
aux.promote!
|
464
542
|
return
|
465
543
|
end
|
466
544
|
|
467
|
-
|
545
|
+
new_head = find_highest_daughter
|
546
|
+
new_head.promote!('orphan')
|
547
|
+
|
548
|
+
# dependents.each do |d|
|
468
549
|
# check if there's a partner with the same relation under the overt node.
|
469
550
|
# TODO: this isn't really very convincing when it comes to ADVs
|
470
|
-
if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
|
471
|
-
partner = partner.find_remnant
|
472
|
-
d.head_id = partner.id
|
473
|
-
d.relation = 'remnant'
|
551
|
+
# if partner = overt.dependents.select { |p| p != self and p.relation == d.relation }.first #inserted p != self
|
552
|
+
# partner = partner.find_remnant
|
553
|
+
# d.head_id = partner.id
|
554
|
+
# d.relation = 'remnant'
|
474
555
|
# if there's no partner, just attach under the overt node, preserving the relation
|
475
|
-
else
|
476
|
-
d.head_id = overt.id
|
477
|
-
end
|
478
|
-
end
|
556
|
+
# else
|
557
|
+
# d.head_id = overt.id
|
558
|
+
# end
|
559
|
+
# end
|
479
560
|
@sentence.remove_token!(self)
|
480
561
|
end
|
481
562
|
|
@@ -487,12 +568,20 @@ module PROIEL
|
|
487
568
|
end
|
488
569
|
end
|
489
570
|
|
571
|
+
def find_highest_daughter
|
572
|
+
dependents.min_by { |d| OBLIQUENESS_HIERARCHY.find_index(d.map_relation[/[^:]*/]) || 1000 }
|
573
|
+
end
|
574
|
+
|
490
575
|
def process_copula!
|
491
576
|
predicates = dependents.select { |d| d.relation == 'xobj' }
|
492
577
|
raise "#{predicates.size} predicates under #{to_n}\n#{to_graph}" if predicates.size != 1
|
493
578
|
predicates.first.promote!(nil, 'cop')
|
494
579
|
end
|
495
580
|
|
581
|
+
def has_preposition?
|
582
|
+
dependents.any? { |d| d.preposition? and d.relation == "case" }
|
583
|
+
end
|
584
|
+
|
496
585
|
def process_preposition!
|
497
586
|
raise "Only prepositions can be processed this way!" unless part_of_speech == 'R-'
|
498
587
|
obliques = dependents.select { |d| d.relation == 'obl' }
|
@@ -519,9 +608,9 @@ module PROIEL
|
|
519
608
|
raise "Only coordinations can be processed this way!" unless conjunction?
|
520
609
|
return if dependents.reject { |d| d.relation == 'aux' }.empty?
|
521
610
|
distribute_shared_modifiers!
|
522
|
-
dependents.reject { |d| d.relation == 'aux' }.first.promote!("conj", "cc")
|
611
|
+
dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id }.first.promote!("conj", "cc")
|
523
612
|
end
|
524
|
-
|
613
|
+
|
525
614
|
def distribute_shared_modifiers!
|
526
615
|
raise "Can only distribute over a conjunction!" unless conjunction?
|
527
616
|
conjuncts, modifiers = dependents.reject { |d| d.relation == 'aux' }.partition { |d| d.relation == @relation or (d.relation == 'adv' and @relation == 'xadv') }
|
@@ -2,8 +2,13 @@
|
|
2
2
|
module PROIEL
|
3
3
|
module Converter
|
4
4
|
class CoNLLU
|
5
|
+
|
6
|
+
# try to guess deponency based on the lemma
|
7
|
+
DEPONENTS = { 'lat' => /r\Z/,
|
8
|
+
'grc' => /ομαι\Z/ }
|
5
9
|
COPULAR_LEMMATA = ['sum,V-,lat', 'εἰμί#1,V-,grc']
|
6
|
-
|
10
|
+
AUXILIARIES = COPULAR_LEMMATA + []
|
11
|
+
DETERMINERS = ['S-', 'Pd', 'Px']
|
7
12
|
NEGATION_LEMMATA = ['non,Df,lat', 'ne,Df,lat',
|
8
13
|
'μή,Df,grc',
|
9
14
|
'μήγε,Df,grc',
|
@@ -35,8 +40,10 @@ module PROIEL
|
|
35
40
|
'nibai#2,Df,got',
|
36
41
|
'nih,Df,got',
|
37
42
|
]
|
38
|
-
|
39
|
-
|
43
|
+
|
44
|
+
TAM_PARTICLE_LEMMATA = ['ἄν,Df,grc',
|
45
|
+
]
|
46
|
+
|
40
47
|
PARTICLE_LEMMATA = [ 'at,Df,lat',
|
41
48
|
'atque,Df,lat',
|
42
49
|
'autem,Df,lat',
|
@@ -59,7 +66,6 @@ module PROIEL
|
|
59
66
|
'tunc,Df,lat',
|
60
67
|
'vero,Df,lat',
|
61
68
|
'ἅμα,Df,grc',
|
62
|
-
'ἄν,Df,grc',
|
63
69
|
'ἀνά,Df,grc',
|
64
70
|
'ἆρα,Df,grc',
|
65
71
|
'ἄραγε,Df,grc',
|
@@ -137,33 +143,39 @@ module PROIEL
|
|
137
143
|
|
138
144
|
POS_MAP =
|
139
145
|
{
|
140
|
-
'A-' => ['ADJ'],
|
141
|
-
'
|
142
|
-
'
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
'
|
147
|
-
'
|
148
|
-
'
|
149
|
-
'
|
150
|
-
'I-' => ['INTJ'],
|
151
|
-
'
|
152
|
-
'
|
153
|
-
'
|
154
|
-
'
|
155
|
-
'
|
156
|
-
'
|
157
|
-
'
|
158
|
-
'
|
159
|
-
'
|
160
|
-
|
161
|
-
'
|
162
|
-
'
|
163
|
-
'
|
164
|
-
'
|
165
|
-
'
|
166
|
-
'
|
146
|
+
'A-' => [['ADJ', lambda { |x| true } ]],
|
147
|
+
'C-' => [['CCONJ', lambda { |x| true } ]],
|
148
|
+
'Df' => [['AUX', lambda(&:TAM_particle?)],
|
149
|
+
['ADV', lambda(&:negation?), "Polarity=Neg"],
|
150
|
+
['ADV', lambda { |x| true } ]
|
151
|
+
],
|
152
|
+
'Dq' => [['ADV', lambda { |x| true }, "PronType=Rel"]],
|
153
|
+
'Du' => [['ADV', lambda { |x| true }, "PronType=Int"]],
|
154
|
+
'F-' => [['X', lambda { |x| true } ]],
|
155
|
+
'G-' => [['SCONJ', lambda { |x| true } ]],
|
156
|
+
'I-' => [['INTJ', lambda { |x| true } ]],
|
157
|
+
'Ma' => [['NUM', lambda { |x| true } ]],
|
158
|
+
'Mo' => [['ADJ', lambda { |x| true } ]],
|
159
|
+
'N-' => [['SCONJ', lambda { |x| true } ]], #irrelevant for our purposes
|
160
|
+
'Nb' => [['NOUN', lambda { |x| true } ]],
|
161
|
+
'Ne' => [['PROPN', lambda { |x| true } ]],
|
162
|
+
'Pc' => [['PRON', lambda { |x| true }, "PronType=Rcp"]],
|
163
|
+
'Pd' => [['DET', lambda { |x| true } ]],
|
164
|
+
'Pi' => [['PRON', lambda { |x| true }, "PronType=Int"]],
|
165
|
+
'Pk' => [['AUX', lambda { |x| x.relation == 'aux' }],
|
166
|
+
['PRON', lambda { |x| true }, "PronType=Prs|Reflex=Yes"]],
|
167
|
+
'Pp' => [['PRON', lambda { |x| true }, "PronType=Prs"]],
|
168
|
+
'Pr' => [['PRON', lambda { |x| true }, "PronType=Rel"]],
|
169
|
+
'Ps' => [['ADJ', lambda { |x| true }, "Poss=Yes"]], ### NB no evidence for a pronominal/determiner-like nature here
|
170
|
+
'Pt' => [['ADJ', lambda { |x| true }, "Poss=Yes|Reflex=Yes" ]], ### NB no evidence for a pronominal/determiner-like nature here
|
171
|
+
'Px' => [['DET', lambda { |x| true } ]],
|
172
|
+
'Py' => [['PRON', lambda { |x| true } ]],
|
173
|
+
'R-' => [['ADP', lambda { |x| true } ]],
|
174
|
+
'V-' => [['AUX', lambda(&:auxiliary?)],
|
175
|
+
['VERB', lambda { |x| true } ]],
|
176
|
+
'S-' => [['DET', lambda { |x| true }, "Definite=Def|PronType=Dem"]], # (we only have definite articles)
|
177
|
+
'X-' => [['X', lambda { |x| true } ]]
|
178
|
+
}
|
167
179
|
|
168
180
|
MORPHOLOGY_MAP = {
|
169
181
|
:person => {'1' => 'Person=1',
|
@@ -1,16 +1,19 @@
|
|
1
1
|
module PROIEL
|
2
2
|
module Converter
|
3
3
|
class CoNLLU
|
4
|
+
|
5
|
+
OBLIQUENESS_HIERARCHY = ["nsubj", "obj", "iobj", "obl", "advmod", "csubj", "xcomp", "ccomp", "advcl"]
|
6
|
+
|
4
7
|
RELATION_MAPPING = {
|
5
8
|
"adnom" => "dep",
|
6
9
|
"adv" => [["advcl", lambda(&:clausal?) ],
|
7
10
|
["advmod", lambda { |x| x.adverb? or x.preposition? } ],
|
8
11
|
["advmod", lambda(&:adjectival?) ], # adjective for adverb
|
9
|
-
["
|
12
|
+
["obl", lambda(&:nominal?) ],
|
10
13
|
["advmod", lambda { |x| true } ],
|
11
14
|
],
|
12
|
-
"ag" => "
|
13
|
-
"apos" => [["name", lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ],
|
15
|
+
"ag" => "obl:agent", # add :agent" once defined
|
16
|
+
"apos" => [["flat:name", lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ],
|
14
17
|
["appos", lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.nominal? } ],
|
15
18
|
["acl", lambda { |x| x.clausal? and x.head and x.head.nominal? } ], # add :relcl ?
|
16
19
|
# what to do about sentential appositions?
|
@@ -19,44 +22,48 @@ module PROIEL
|
|
19
22
|
],
|
20
23
|
"arg" => "dep",
|
21
24
|
"atr" => [["nummod", lambda(&:cardinal?) ],
|
22
|
-
["
|
25
|
+
["det", lambda { |x| x.pronominal? and !(!x.genitive? and x.head and x.head.genitive?) } ], #TODO check
|
26
|
+
["nmod", lambda(&:nominal?) ],
|
23
27
|
["acl", lambda { |x| x.clausal? } ], # add :relcl?
|
24
28
|
["advmod", lambda { |x| x.head and x.head.clausal? } ],
|
25
|
-
["det", lambda(&:determiner?) ],
|
26
29
|
["amod", lambda { |x| true } ], #default
|
27
30
|
],
|
28
31
|
"aux" => [["det", lambda(&:determiner?) ],
|
29
|
-
["
|
30
|
-
["aux", lambda(&:clausal?) ],
|
31
|
-
["
|
32
|
+
["aux:pass", lambda { |x| x.clausal? and x.head.passive? } ],
|
33
|
+
["aux", lambda(&:clausal?) ], #v2 probably want the modal particle an to go here too in
|
34
|
+
["advmod", lambda(&:negation?) ],
|
32
35
|
["discourse", lambda { |x| x.particle? or x.interjection? } ],
|
33
|
-
|
36
|
+
# include subjunctions that are aux here; (root sentences with subjunction)
|
37
|
+
["advmod", lambda { |x| x.adjectival? or x.adverb? or x.subjunction? } ],
|
34
38
|
["cc", lambda(&:conjunction?) ],
|
35
|
-
["foreign", lambda(&:foreign?) ],
|
39
|
+
["flat:foreign", lambda(&:foreign?) ],
|
36
40
|
# We need some more distinctions to get Gothic and Armenian. Introduce language in the treebank? (Read from xml)
|
37
|
-
["mark", lambda { |x| ['
|
41
|
+
["mark", lambda { |x| ['R-'].include? x.part_of_speech } ], #'R-' as infinitive marker in Gothic
|
42
|
+
["aux", lambda { |x| ['Pk' ].include? x.part_of_speech } ], #reflexive as valency reducer
|
38
43
|
['amod', lambda { |x| x.preposition? } ], # Armenian DOM
|
39
|
-
['
|
44
|
+
['fixed', lambda { |x| ['Px', 'Pr'].include? x.part_of_speech } ], # NB there are a lot of bogus annotations with 'Px'
|
40
45
|
|
41
46
|
# MISANNOTATION IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps'
|
42
47
|
],
|
43
|
-
"comp" => [['
|
48
|
+
"comp" => [['csubj:pass', lambda { |x| x.head and x.head.passive? } ],
|
44
49
|
['csubj', lambda { |x| x.head and x.head.copula? } ],
|
45
50
|
['ccomp', lambda { |x| true } ],
|
46
51
|
],
|
47
52
|
"expl" => "expl",
|
48
53
|
"narg" => [['acl', lambda(&:clausal?) ],
|
49
|
-
['nmod', lambda(&:nominal?) ],
|
54
|
+
['nmod', lambda(&:nominal?) ],
|
50
55
|
['nmod', lambda(&:adjectival?) ], # nominaliezed in this function
|
51
56
|
['nmod', lambda { |x| true } ],
|
52
57
|
],
|
53
58
|
"nonsub" => "dep",
|
54
|
-
"obj" => "
|
55
|
-
"obl" => [
|
56
|
-
["
|
59
|
+
"obj" => "obj:dir",
|
60
|
+
"obl" => [# normally a preposition will be subordinate to its noun, this captures adverbial use of prepositions
|
61
|
+
["advmod", lambda { |x| x.adverb? or x.preposition? } ],
|
62
|
+
["obl", lambda { |x| x.has_preposition? } ],
|
63
|
+
["iobj", lambda(&:nominal?) ],# if nominal (NB check for presence of article!) TODO: should be "obj" if the verb is monovalent (even by elision)
|
57
64
|
["iobj", lambda(&:adjectival?) ], # OBL adjectives are nominalized
|
58
65
|
["advcl", lambda(&:clausal?) ], # this seems to happen with ad libros legendos etc. but check closer!
|
59
|
-
["iobj", lambda { |x| true } ],
|
66
|
+
["iobj", lambda { |x| true } ],
|
60
67
|
],
|
61
68
|
"parpred" => "parataxis",
|
62
69
|
"part" => "nmod",
|
@@ -66,7 +73,7 @@ module PROIEL
|
|
66
73
|
["ERROR", lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }],
|
67
74
|
],
|
68
75
|
"rel" => "acl", # add :relcl?
|
69
|
-
"sub" => [["
|
76
|
+
"sub" => [["nsubj:pass", lambda { |x| x.head and x.head.passive? } ],
|
70
77
|
["nsubj", lambda { |x| true }],
|
71
78
|
],
|
72
79
|
"voc" => "vocative",
|
data/lib/proiel/cli/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-03-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: builder
|
@@ -250,7 +250,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
250
250
|
version: '0'
|
251
251
|
requirements: []
|
252
252
|
rubyforge_project:
|
253
|
-
rubygems_version: 2.
|
253
|
+
rubygems_version: 2.6.8
|
254
254
|
signing_key:
|
255
255
|
specification_version: 4
|
256
256
|
summary: A command-line interface for working with PROIEL treebanks
|