proiel-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
@@ -0,0 +1,99 @@
|
|
1
|
+
module PROIEL
  module Converter
    # Converter that writes a treebank to standard output as plain text.
    #
    # For every source a short metadata header is printed (lines starting
    # with `%`), followed by each div introduced by a `# title` line. The
    # text of a div is reassembled from the presentation strings and token
    # forms, with a `§citation` marker inserted whenever the citation of
    # the tokens changes.
    class Text
      class << self
        # Prints all sources of +tb+ to standard output.
        #
        # tb      - object responding to `sources`; each source responds to
        #           `id`, `export_time`, `title`, `author`, `citation_part`,
        #           `language` and `divs`.
        # options - Hash; when `options['diffable']` is truthy each div is
        #           printed with one citation unit per line, otherwise as
        #           running text wrapped at 80 characters.
        def process(tb, options)
          tb.sources.each do |source|
            puts "% id = #{source.id}"
            puts "% export_time = #{source.export_time}"
            puts "% title = #{source.title}"
            puts "% author = #{source.author}"
            puts "% citation_part = #{source.citation_part}"
            puts "% language = #{source.language}"

            source.divs.each do |div|
              puts
              puts "# #{div.title}"
              puts

              if options['diffable']
                print_diffable_div(div)
              else
                print_formatted_div(div)
              end
            end
          end
        end

        # Prints +div+ as running text. Runs of spaces are collapsed and
        # lines longer than 80 characters are broken at whitespace.
        def print_formatted_div(div)
          # A private, mutable buffer: appending with << never touches the
          # strings owned by the div, sentences or tokens.
          text = ''.dup
          text << div.presentation_before.to_s

          current_citation = nil

          div.sentences.each do |sentence|
            text << sentence.presentation_before.to_s

            sentence.tokens.each do |token|
              next unless token.has_content?

              citation = token.citation_part

              # A token without a citation inherits the running one instead
              # of crashing on nil.gsub.
              if citation && citation != current_citation
                text << citation_marker(citation)
                current_citation = citation
              end

              text << token_text(token)
            end

            text << sentence.presentation_after.to_s
          end

          text << div.presentation_after.to_s

          wrapped = text.strip.gsub(/ +/, ' ').split("\n").map do |line|
            line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
          end

          puts wrapped.join("\n")
        end

        # Prints +div+ with a line break before every citation change so
        # that each output line starts with its `§citation` marker. No
        # whitespace normalisation or wrapping is applied.
        def print_diffable_div(div)
          current_citation = nil

          line = ''.dup
          # Presentation text seen since the last content token; it is
          # emitted after the citation marker of the next content token.
          pending = div.presentation_before || ''

          div.sentences.each do |sentence|
            # += (not <<) so that the div's own string is never mutated.
            pending += sentence.presentation_before || ''

            sentence.tokens.each do |token|
              next unless token.has_content?

              citation = token.citation_part

              # A token without a citation inherits the running one instead
              # of crashing on nil.gsub.
              if citation && citation != current_citation
                puts line unless line.empty?
                line = citation_marker(citation)
                current_citation = citation
              end

              line << pending << token_text(token)
              pending = ''
            end

            line << sentence.presentation_after.to_s
          end

          line << div.presentation_after.to_s

          puts line unless line.empty?
        end

        private

        # Builds the `§citation ` marker; whitespace inside the citation is
        # replaced by underscores so the marker stays a single word.
        def citation_marker(citation)
          "§#{citation.gsub(/\s+/, '_')} "
        end

        # Joins the presentation text around a token with its form,
        # skipping the parts that are nil.
        def token_text(token)
          [token.presentation_before, token.form, token.presentation_after].compact.join
        end
      end
    end
  end
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'builder'
|
2
|
+
|
3
|
+
module PROIEL
  module Converter
    # Converter for the TigerXML format
    # (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
    # in the variant used by VISL under the name 'TIGER dependency format'
    # (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
    #
    # The XML is written to STDOUT through Builder::XmlMarkup. Every token
    # is emitted twice: as a terminal (`w<id>`) and as a non-terminal
    # (`p<id>`) that carries the dependency edges.
    class Tiger
      SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')

      # Each name is split on "_" and the parts are looked up individually
      # in the token's morphology hash (see feature_value).
      MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection).freeze
      OTHER_FEATURES = %i(lemma pos information_status antecedent_id word).freeze

      class << self
        # Writes every source of +tb+ to STDOUT as a `corpus` element.
        #
        # tb      - object responding to `sources` and `annotation_schema`.
        # options - accepted for interface compatibility; not used here.
        def process(tb, options)
          selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
          # Feature name => domain; 'FREC' features are emitted on both
          # terminals and non-terminals (see token_attrs).
          @features = selected_features.map { |f| [f, 'FREC'] }.to_h

          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
          builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

          # The schema is the same for all sources; write_source reads it
          # from this instance variable.
          @hack = tb.annotation_schema

          tb.sources.each do |source|
            write_source(builder, source) do
              source.divs.each do |div|
                div.sentences.each do |sentence|
                  write_sentence(builder, sentence)
                end
              end
            end
          end
        end

        # Writes the `corpus` element for source +s+: a head with the
        # title and the annotation declaration, then a body whose content
        # is produced by the given block.
        def write_source(builder, s)
          builder.corpus(id: s.id) do
            builder.head do
              builder.meta do
                builder.name(s.title)
              end

              declare_annotation(builder, @features, @hack)
            end

            builder.body do
              yield
            end
          end
        end

        # Writes the `annotation` element: one `feature` per entry of
        # +features+ and the primary/secondary relation tags of
        # +annotation_schema+ as edge labels.
        def declare_annotation(builder, features, annotation_schema)
          builder.annotation do
            features.each do |name, domain|
              # FIXME: we may want to list possible values for some of these
              builder.feature(name: name, domain: domain)
            end

            builder.edgelabel do
              # '--' labels the edge from a non-terminal to its terminal.
              builder.value(name: '--')

              annotation_schema.primary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end

            builder.secedgelabel do
              annotation_schema.secondary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end
          end
        end

        # Returns a Hash of feature name => value for token +t+, limited to
        # the features whose domain is 'FREC' or equals +type+ ('T' or
        # 'NT'). Missing values are written as "--".
        #
        # Raises RuntimeError when a feature is neither handled explicitly
        # nor available as a method on the token.
        def token_attrs(s, t, type)
          attrs = {}

          @features.each do |name, domain|
            next unless domain == 'FREC' || domain == type

            attrs[name] = feature_value(t, name) || '--'
          end

          attrs
        end

        # Writes one `t` element per token of sentence +s+.
        def write_terminals(builder, s)
          builder.terminals do
            s.tokens.each do |t|
              builder.t(token_attrs(s, t, 'T').merge(id: "w#{t.id}"))
            end
          end
        end

        # Writes the `nonterminals` element for sentence +s+: an artificial
        # root node followed by one `nt` per token.
        def write_nonterminals(builder, s)
          builder.nonterminals do
            # Add an empty root node
            root_features = @features.select { |_, domain| %w(FREC NT).include?(domain) }
            root_attrs = root_features.map { |name, _| [name, '--'] }.to_h
            root_attrs[:id] = "s#{s.id}_root"

            builder.nt(root_attrs) do
              # The root points at every token that has no head and is not
              # a pro token.
              s.tokens.reject { |t| t.head || t.pro? }.each do |t|
                builder.edge(idref: "p#{t.id}", label: t.relation)
              end
            end

            # Add other NTs
            s.tokens.each do |t|
              builder.nt(token_attrs(s, t, 'NT').merge(id: "p#{t.id}")) do
                # Add an edge to the corresponding terminal node
                builder.edge(idref: "w#{t.id}", label: '--')

                # Add primary dependency edges
                t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }

                # Add secondary dependency edges
                t.slashes.each do |relation, target_id|
                  builder.secedge(idref: "p#{target_id}", label: relation)
                end
              end
            end
          end
        end

        # Writes the `s` element for sentence +s+ with its graph rooted in
        # the artificial `s<id>_root` node.
        def write_sentence(builder, s)
          builder.s(id: "s#{s.id}") do
            builder.graph(root: "s#{s.id}_root") do
              write_terminals(builder, s)
              write_nonterminals(builder, s)
            end
          end
        end

        private

        # Returns the raw value of feature +name+ for token +t+, or nil if
        # the token has none.
        def feature_value(t, name)
          case name
          when :word, :cat
            t.pro? ? "PRO-#{t.relation.upcase}" : t.form
          when *Array(@semantic_features)
            # @semantic_features is not set anywhere in this class, so this
            # branch is inactive unless it is assigned from outside.
            t.sem_tags_to_hash[name]
          when :lemma
            t.lemma
          when :pos
            t.empty_token_sort ? t.empty_token_sort + '-' : t.pos
          when *MORPHOLOGICAL_FEATURES
            # e.g. :person_number => morphology_hash[:person] followed by
            # morphology_hash[:number], with '-' for each missing part.
            name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
          else
            raise "Do not know how to get required attribute #{name}" unless t.respond_to?(name)

            t.send(name)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
module PROIEL
  module Converter
    # Converter that writes a treebank to STDOUT as XML using the
    # namespace and schema location of Tiger2 V2.0.5 (see write_source).
    #
    # Feature selection is still a TODO: `process` currently selects no
    # token features, so terminals and non-terminals only carry their ids
    # and edges.
    class Tiger2
      SCHEMA_FILE = File.join('tiger2', 'Tiger2.xsd')

      # Needed by token_attrs; without a constant visible from this class
      # the morphology branch raises NameError. Each name is split on "_"
      # and the parts are looked up in the token's morphology hash.
      MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection).freeze

      class << self
        # Writes every source of +tb+ to STDOUT as a `corpus` element.
        #
        # tb      - object responding to `sources` and `annotation_schema`.
        # options - accepted for interface compatibility; not used here.
        def process(tb, options)
          # Loaded lazily because this file has no top-level require for
          # the builder gem.
          require 'builder'

          selected_features = [] # TODO
          # Feature name => domain; 'FREC' features are emitted on both
          # terminals and non-terminals (see token_attrs).
          @features = selected_features.map { |f| [f, 'FREC'] }.to_h

          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
          builder.instruct! :xml, version: "1.0", encoding: "UTF-8"

          tb.sources.each do |source|
            write_source(builder, source, tb) do
              source.divs.each do |div|
                div.sentences.each do |sentence|
                  write_sentence(builder, sentence)
                end
              end
            end
          end
        end

        # Writes the `corpus` element for source +s+: a head with metadata
        # and the annotation declaration taken from +tb+, then a body
        # whose content is produced by the given block (which is yielded
        # the builder).
        def write_source(builder, s, tb)
          builder.corpus('xml:id' => s.id,
                         'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
                         'xsi:schemaLocation' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/ http://korpling.german.hu-berlin.de/tiger2/V2.0.5/Tiger2.xsd',
                         'xmlns:tiger2' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/',
                         'xmlns' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/') do
            builder.head do
              builder.meta do
                builder.name(s.title)
                builder.author('The PROIEL project')
                builder.date(s.export_time.strftime("%F %T %z"))
                # The following three elements are written empty.
                builder.description
                builder.format
                builder.history
              end

              declare_annotation(builder, @features, tb.annotation_schema)
            end

            builder.body do
              yield builder
            end
          end
        end

        # Writes the `annotation` element: one `feature` per entry of
        # +features+ and the primary/secondary relation tags of
        # +annotation_schema+ as edge labels.
        def declare_annotation(builder, features, annotation_schema)
          builder.annotation do
            features.each do |name, domain|
              # FIXME: we may want to list possible values for some of these
              builder.feature(name: name, domain: domain)
            end

            builder.edgelabel do
              # '--' labels the edge from a non-terminal to its terminal.
              builder.value(name: '--')

              annotation_schema.primary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end

            builder.secedgelabel do
              annotation_schema.secondary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end
          end
        end

        # Declares edge labels as `feature` elements of domain "edge".
        #
        # NOTE(review): nothing in this class calls this method, and
        # declare_primary_edges / declare_secedges are not defined here, so
        # calling it as-is fails; looks like unfinished work, confirm
        # before relying on it.
        def declare_edgelabels(builder)
          builder.feature(name: "label", type: "prim", domain: "edge") do
            declare_primary_edges(builder)
          end

          builder.feature(name: "label", type: "sec", domain: "edge") do
            declare_secedges(builder)
          end

          builder.feature(name: "label", type: "coref", domain: "edge") do
            builder.value(name: "antecedent")
            builder.value(name: "inference")
          end
        end

        # Writes the `s` element for sentence +s+ with its graph rooted in
        # the artificial `s<id>_root` node.
        def write_sentence(builder, s)
          builder.s('xml:id' => "s#{s.id}") do
            builder.graph(root: "s#{s.id}_root") do
              write_terminals(builder, s)
              write_nonterminals(builder, s)
            end
          end
        end

        # Writes one `t` element per token of sentence +s+.
        def write_terminals(builder, s)
          builder.terminals do
            s.tokens.each do |t|
              builder.t(token_attrs(s, t, 'T').merge('xml:id' => "w#{t.id}"))
            end
          end
        end

        # Returns a Hash of feature name => value for token +t+, limited to
        # the features whose domain is 'FREC' or equals +type+ ('T' or
        # 'NT'). Missing values are written as "--".
        #
        # Raises RuntimeError when a feature is neither handled explicitly
        # nor available as a method on the token.
        def token_attrs(s, t, type)
          attrs = {}

          @features.each do |name, domain|
            next unless domain == 'FREC' || domain == type

            attrs[name] = feature_value(t, name) || '--'
          end

          attrs
        end

        # Writes the `nonterminals` element for sentence +s+: an artificial
        # root node followed by one `nt` per token.
        def write_nonterminals(builder, s)
          builder.nonterminals do
            # Add an empty root node
            root_features = @features.select { |_, domain| %w(FREC NT).include?(domain) }
            root_attrs = root_features.map { |name, _| [name, '--'] }.to_h
            root_attrs['xml:id'] = "s#{s.id}_root"

            builder.nt(root_attrs) do
              # The root points at every token that has no head and is not
              # a pro token.
              s.tokens.reject { |t| t.head || t.pro? }.each do |t|
                builder.edge(idref: "p#{t.id}", label: t.relation)
              end
            end

            # Add other NTs
            s.tokens.each do |t|
              builder.nt(token_attrs(s, t, 'NT').merge('xml:id' => "p#{t.id}")) do
                # Add an edge to the corresponding terminal node
                builder.edge(idref: "w#{t.id}", label: '--')

                # Add primary dependency edges
                t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }

                # Add secondary dependency edges
                t.slashes.each do |relation, target_id|
                  builder.secedge(idref: "p#{target_id}", label: relation)
                end
              end
            end
          end
        end

        # Writes a primary edge pointing at the non-terminal of +t+ using
        # `tiger2:`-prefixed attributes.
        #
        # NOTE(review): not called from anywhere in this class.
        def write_root_edge(t, builder)
          builder.edge('tiger2:type' => "prim", 'tiger2:target' => "p#{t.id}", :label => t.relation.tag)
        end

        # Writes the terminal, primary, secondary and coreference edges of
        # token +t+ using `tiger2:`-prefixed attributes.
        #
        # NOTE(review): not called from anywhere in this class, and
        # get_slashes is not defined here, so calling it as-is fails;
        # looks like unfinished work, confirm before relying on it.
        def write_edges(t, builder)
          # Add an edge between this node and the corresponding terminal node unless
          # this is not a morphtaggable node.
          if t.is_morphtaggable? || t.empty_token_sort == 'P'
            builder.edge('tiger2:type' => "prim", 'tiger2:target' => "w#{t.id}", :label => '--')
          end

          # Add primary dependency edges including empty pro tokens if we are exporting info structure as well
          t.dependents.each { |d| builder.edge('tiger2:type' => "prim", 'tiger2:target' => "p#{d.id}", :label => d.relation.tag) }

          # Add secondary dependency edges
          get_slashes(t).each do |se|
            builder.edge('tiger2:type' => "sec", 'tiger2:target' => "p#{se.slashee_id}", :label => se.relation.tag)
          end

          builder.edge('tiger2:type' => "coref", 'tiger2:target' => t.antecedent_id, :label => (t.information_status_tag == 'acc_inf' ? "inference" : "antecedent") )
        end

        private

        # Returns the raw value of feature +name+ for token +t+, or nil if
        # the token has none.
        def feature_value(t, name)
          case name
          when :word, :cat
            t.pro? ? "PRO-#{t.relation.upcase}" : t.form
          when *Array(@semantic_features)
            # @semantic_features is not set anywhere in this class, so this
            # branch is inactive unless it is assigned from outside.
            t.sem_tags_to_hash[name]
          when :lemma
            t.lemma
          when :pos
            t.empty_token_sort ? t.empty_token_sort + '-' : t.pos
          when *MORPHOLOGICAL_FEATURES
            # e.g. :person_number => morphology_hash[:person] followed by
            # morphology_hash[:number], with '-' for each missing part.
            name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
          else
            raise "Do not know how to get required attribute #{name}" unless t.respond_to?(name)

            t.send(name)
          end
        end
      end
    end
  end
end
|