proiel-cli 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +18 -3
- data/bin/proiel +1 -1
- data/lib/proiel/cli/commands/build.rb +91 -0
- data/lib/proiel/cli/commands/convert.rb +7 -2
- data/lib/proiel/cli/commands/dictionary.rb +46 -0
- data/lib/proiel/cli/commands/info.rb +1 -1
- data/lib/proiel/cli/commands/shell.rb +34 -0
- data/lib/proiel/cli/commands/tokenize.rb +2 -2
- data/lib/proiel/cli/commands/validate.rb +6 -4
- data/lib/proiel/cli/commands/visualize.rb +14 -11
- data/lib/proiel/cli/converters/conll-u/morphology.rb +162 -72
- data/lib/proiel/cli/converters/conll-u/syntax.rb +108 -62
- data/lib/proiel/cli/converters/conll-u.rb +648 -548
- data/lib/proiel/cli/converters/conll-x.rb +67 -52
- data/lib/proiel/cli/converters/lexc.rb +21 -23
- data/lib/proiel/cli/converters/proielxml.rb +173 -132
- data/lib/proiel/cli/converters/text.rb +69 -71
- data/lib/proiel/cli/converters/tiger.rb +110 -114
- data/lib/proiel/cli/converters/tiger2.rb +139 -141
- data/lib/proiel/cli/converters/tnt.rb +19 -15
- data/lib/proiel/cli/version.rb +1 -1
- data/lib/proiel/cli.rb +26 -1
- metadata +43 -58
- data/bin/setup +0 -8
- data/contrib/proiel-tnt-train +0 -15
- data/lib/proiel/cli/commands.rb +0 -28
|
@@ -1,192 +1,190 @@
|
|
|
1
|
-
module PROIEL
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
tb.
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
write_sentence(builder, sentence)
|
|
20
|
-
end
|
|
1
|
+
module PROIEL::Converter
|
|
2
|
+
class Tiger2
|
|
3
|
+
SCHEMA_FILE = File.join('tiger2', 'Tiger2.xsd')
|
|
4
|
+
|
|
5
|
+
class << self
|
|
6
|
+
def process(tb, _)
|
|
7
|
+
selected_features = [] # TODO
|
|
8
|
+
@features = selected_features.map { |f| [f, 'FREC'] }
|
|
9
|
+
|
|
10
|
+
builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
|
|
11
|
+
builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
|
|
12
|
+
|
|
13
|
+
tb.sources.each do |source|
|
|
14
|
+
@hack = tb.annotation_schema
|
|
15
|
+
write_source(builder, source, tb) do
|
|
16
|
+
source.divs.each do |div|
|
|
17
|
+
div.sentences.each do |sentence|
|
|
18
|
+
write_sentence(builder, sentence)
|
|
21
19
|
end
|
|
22
20
|
end
|
|
23
21
|
end
|
|
24
22
|
end
|
|
23
|
+
end
|
|
25
24
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
declare_annotation(builder, @features,
|
|
43
|
-
tb.annotation_schema)
|
|
25
|
+
def write_source(builder, s, tb)
|
|
26
|
+
builder.corpus('xml:id' => s.id,
|
|
27
|
+
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
|
|
28
|
+
'xsi:schemaLocation' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/ http://korpling.german.hu-berlin.de/tiger2/V2.0.5/Tiger2.xsd',
|
|
29
|
+
'xmlns:tiger2' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/',
|
|
30
|
+
'xmlns' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/') do
|
|
31
|
+
builder.head do
|
|
32
|
+
builder.meta do
|
|
33
|
+
builder.name(s.title)
|
|
34
|
+
builder.author('The PROIEL project')
|
|
35
|
+
builder.date(s.export_time.strftime('%F %T %z'))
|
|
36
|
+
builder.description
|
|
37
|
+
builder.format
|
|
38
|
+
builder.history
|
|
44
39
|
end
|
|
45
40
|
|
|
46
|
-
builder
|
|
47
|
-
|
|
48
|
-
|
|
41
|
+
declare_annotation(builder, @features,
|
|
42
|
+
tb.annotation_schema)
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
builder.body do
|
|
46
|
+
yield builder
|
|
49
47
|
end
|
|
50
48
|
end
|
|
49
|
+
end
|
|
51
50
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
51
|
+
def declare_annotation(builder, features, annotation_schema)
|
|
52
|
+
builder.annotation do
|
|
53
|
+
features.each do |name, domain|
|
|
54
|
+
# FIXME: we may want to list possible values for some of these
|
|
55
|
+
builder.feature(name: name, domain: domain)
|
|
56
|
+
end
|
|
58
57
|
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
builder.edgelabel do
|
|
59
|
+
builder.value(name: '--')
|
|
61
60
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
end
|
|
61
|
+
annotation_schema.primary_relations.each do |tag, features|
|
|
62
|
+
builder.value({ name: tag }, features.summary)
|
|
65
63
|
end
|
|
64
|
+
end
|
|
66
65
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
end
|
|
66
|
+
builder.secedgelabel do
|
|
67
|
+
annotation_schema.secondary_relations.each do |tag, features|
|
|
68
|
+
builder.value({name: tag }, features.summary)
|
|
71
69
|
end
|
|
72
70
|
end
|
|
73
71
|
end
|
|
72
|
+
end
|
|
74
73
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
74
|
+
def declare_edgelabels(builder)
|
|
75
|
+
builder.feature(name: 'label', type: 'prim', domain: 'edge') do
|
|
76
|
+
declare_primary_edges(builder)
|
|
77
|
+
end
|
|
79
78
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
79
|
+
builder.feature(name: 'label', type: 'sec', domain: 'edge') do
|
|
80
|
+
declare_secedges(builder)
|
|
81
|
+
end
|
|
83
82
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
end
|
|
83
|
+
builder.feature(name: 'label', type: 'coref', domain: 'edge') do
|
|
84
|
+
builder.value(name: 'antecedent')
|
|
85
|
+
builder.value(name: 'inference')
|
|
88
86
|
end
|
|
87
|
+
end
|
|
89
88
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
end
|
|
89
|
+
def write_sentence(builder, s)
|
|
90
|
+
builder.s('xml:id' => "s#{s.id}") do
|
|
91
|
+
builder.graph(root: "s#{s.id}_root") do
|
|
92
|
+
write_terminals(builder, s)
|
|
93
|
+
write_nonterminals(builder, s)
|
|
96
94
|
end
|
|
97
95
|
end
|
|
96
|
+
end
|
|
98
97
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
end
|
|
98
|
+
def write_terminals(builder, s)
|
|
99
|
+
builder.terminals do
|
|
100
|
+
s.tokens.each do |t|
|
|
101
|
+
builder.t(token_attrs(t, 'T').merge({ 'xml:id' => "w#{t.id}"}))
|
|
104
102
|
end
|
|
105
103
|
end
|
|
104
|
+
end
|
|
106
105
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
else
|
|
123
|
-
attrs[name] = t.pos
|
|
124
|
-
end
|
|
125
|
-
when *MORPHOLOGICAL_FEATURES
|
|
126
|
-
attrs[name] = name.to_s.split("_").map { |a| t.morphology_hash[a.to_sym] || '-' }.join
|
|
106
|
+
def token_attrs(t, type)
|
|
107
|
+
attrs = {}
|
|
108
|
+
|
|
109
|
+
@features.each do |name, domain|
|
|
110
|
+
if domain == 'FREC' or domain == type
|
|
111
|
+
case name
|
|
112
|
+
when :word, :cat
|
|
113
|
+
attrs[name] = t.pro? ? "PRO-#{t.relation.upcase}" : t.form
|
|
114
|
+
when *@semantic_features
|
|
115
|
+
attrs[name] = t.sem_tags_to_hash[attr]
|
|
116
|
+
when :lemma
|
|
117
|
+
attrs[name] = t.lemma
|
|
118
|
+
when :pos
|
|
119
|
+
if t.empty_token_sort
|
|
120
|
+
attrs[name] = t.empty_token_sort + '-'
|
|
127
121
|
else
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
122
|
+
attrs[name] = t.pos
|
|
123
|
+
end
|
|
124
|
+
when *MORPHOLOGICAL_FEATURES
|
|
125
|
+
attrs[name] = name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
|
|
126
|
+
else
|
|
127
|
+
if t.respond_to?(name)
|
|
128
|
+
attrs[name] = t.send(name)
|
|
129
|
+
else
|
|
130
|
+
raise "Do not know how to get required attribute #{name}"
|
|
133
131
|
end
|
|
134
|
-
attrs[name] ||= "--"
|
|
135
132
|
end
|
|
133
|
+
attrs[name] ||= '--'
|
|
136
134
|
end
|
|
137
|
-
|
|
138
|
-
attrs
|
|
139
135
|
end
|
|
140
136
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
137
|
+
attrs
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
def write_nonterminals(builder, s)
|
|
141
|
+
builder.nonterminals do
|
|
142
|
+
# Add an empty root node
|
|
143
|
+
h = @features.select { |_, domain| ['FREC', 'NT'].include?(domain) }.map { |name, _| [name, '--'] }.to_h
|
|
144
|
+
h['xml:id'] = "s#{s.id}_root"
|
|
146
145
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
end
|
|
146
|
+
builder.nt(h) do
|
|
147
|
+
s.tokens.reject { |t| t.head or t.pro? }.each do |t|
|
|
148
|
+
builder.edge(idref: "p#{t.id}", label: t.relation)
|
|
151
149
|
end
|
|
150
|
+
end
|
|
152
151
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
152
|
+
# Add other NTs
|
|
153
|
+
s.tokens.each do |t|
|
|
154
|
+
builder.nt(token_attrs(t, 'NT').merge('xml:id' => "p#{t.id}")) do
|
|
155
|
+
# Add an edge to the correspoding terminal node
|
|
156
|
+
builder.edge(idref: "w#{t.id}", label: '--')
|
|
158
157
|
|
|
159
|
-
|
|
160
|
-
|
|
158
|
+
# Add primary dependency edges
|
|
159
|
+
t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }
|
|
161
160
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
end
|
|
161
|
+
# Add secondary dependency edges
|
|
162
|
+
t.slashes.each do |relation, target_id|
|
|
163
|
+
builder.secedge(idref: "p#{target_id}", label: relation)
|
|
166
164
|
end
|
|
167
165
|
end
|
|
168
166
|
end
|
|
169
167
|
end
|
|
168
|
+
end
|
|
170
169
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
def write_edges(t, builder)
|
|
176
|
-
# Add an edge between this node and the correspoding terminal node unless
|
|
177
|
-
# this is not a morphtaggable node.
|
|
178
|
-
builder.edge('tiger2:type' => "prim", 'tiger2:target' => "w#{t.id}", :label => '--') if t.is_morphtaggable? or t.empty_token_sort == 'P'
|
|
170
|
+
def write_root_edge(t, builder)
|
|
171
|
+
builder.edge('tiger2:type' => 'prim', 'tiger2:target' => "p#{t.id}", :label => t.relation.tag)
|
|
172
|
+
end
|
|
179
173
|
|
|
180
|
-
|
|
181
|
-
|
|
174
|
+
def write_edges(t, builder)
|
|
175
|
+
# Add an edge between this node and the correspoding terminal node unless
|
|
176
|
+
# this is not a morphtaggable node.
|
|
177
|
+
builder.edge('tiger2:type' => 'prim', 'tiger2:target' => "w#{t.id}", :label => '--') if t.is_morphtaggable? or t.empty_token_sort == 'P'
|
|
182
178
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
builder.edge('tiger2:type' => "sec", 'tiger2:target' => "p#{se.slashee_id}", :label => se.relation.tag)
|
|
186
|
-
end
|
|
179
|
+
# Add primary dependency edges including empty pro tokens if we are exporting info structure as well
|
|
180
|
+
t.dependents.each { |d| builder.edge('tiger2:type' => 'prim', 'tiger2:target' => "p#{d.id}", :label => d.relation.tag) }
|
|
187
181
|
|
|
188
|
-
|
|
182
|
+
# Add secondary dependency edges
|
|
183
|
+
get_slashes(t).each do |se|
|
|
184
|
+
builder.edge('tiger2:type' => 'sec', 'tiger2:target' => "p#{se.slashee_id}", :label => se.relation.tag)
|
|
189
185
|
end
|
|
186
|
+
|
|
187
|
+
builder.edge('tiger2:type' => 'coref', 'tiger2:target' => t.antecedent_id, :label => (t.information_status_tag == 'acc_inf' ? 'inference' : 'antecedent'))
|
|
190
188
|
end
|
|
191
189
|
end
|
|
192
190
|
end
|
|
@@ -1,26 +1,30 @@
|
|
|
1
|
-
module PROIEL
|
|
2
|
-
|
|
3
|
-
class
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
puts '--'
|
|
1
|
+
module PROIEL::Converter
|
|
2
|
+
class TNT
|
|
3
|
+
class << self
|
|
4
|
+
def process(tb, options)
|
|
5
|
+
tb.sources.each do |source|
|
|
6
|
+
puts "%% Source #{source.id}"
|
|
7
|
+
puts '--'
|
|
9
8
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
9
|
+
source.divs.each do |div|
|
|
10
|
+
div.sentences.each do |sentence|
|
|
11
|
+
puts "%% Sentence #{sentence.id}"
|
|
12
|
+
sentence.tokens.each do |token|
|
|
13
|
+
if options['pos'] or options['morphology']
|
|
14
|
+
unless token.form.nil? or token.pos.nil?
|
|
15
15
|
if options['morphology']
|
|
16
|
-
|
|
16
|
+
unless token.morphology.nil?
|
|
17
|
+
puts [token.form, token.pos + token.morphology].join("\t")
|
|
18
|
+
end
|
|
17
19
|
else
|
|
18
20
|
puts [token.form, token.pos].join("\t")
|
|
19
21
|
end
|
|
20
22
|
end
|
|
23
|
+
else
|
|
24
|
+
puts token.form
|
|
21
25
|
end
|
|
22
|
-
puts '--'
|
|
23
26
|
end
|
|
27
|
+
puts '--'
|
|
24
28
|
end
|
|
25
29
|
end
|
|
26
30
|
end
|
data/lib/proiel/cli/version.rb
CHANGED
data/lib/proiel/cli.rb
CHANGED
|
@@ -1,4 +1,29 @@
|
|
|
1
|
+
require 'builder'
|
|
2
|
+
require 'colorize'
|
|
3
|
+
require 'mercenary'
|
|
4
|
+
require 'proiel'
|
|
5
|
+
require 'pry'
|
|
1
6
|
require 'ruby-progressbar'
|
|
2
7
|
|
|
3
8
|
require 'proiel/cli/version'
|
|
4
|
-
|
|
9
|
+
|
|
10
|
+
module PROIEL
|
|
11
|
+
class Command
|
|
12
|
+
class << self
|
|
13
|
+
def subclasses
|
|
14
|
+
@subclasses ||= []
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def inherited(base)
|
|
18
|
+
subclasses << base
|
|
19
|
+
super(base)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
module Converter; end
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
Dir[File.join(File.dirname(__FILE__), 'cli', '{commands,converters}', '*.rb')].sort.each do |f|
|
|
28
|
+
require f
|
|
29
|
+
end
|