proiel-cli 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +11 -3
- data/bin/proiel +1 -1
- data/lib/proiel/cli/commands/build.rb +91 -0
- data/lib/proiel/cli/commands/convert.rb +7 -2
- data/lib/proiel/cli/commands/dictionary.rb +46 -0
- data/lib/proiel/cli/commands/info.rb +1 -1
- data/lib/proiel/cli/commands/shell.rb +34 -0
- data/lib/proiel/cli/commands/tokenize.rb +2 -2
- data/lib/proiel/cli/commands/validate.rb +1 -1
- data/lib/proiel/cli/commands/visualize.rb +14 -11
- data/lib/proiel/cli/converters/conll-u/morphology.rb +162 -72
- data/lib/proiel/cli/converters/conll-u/syntax.rb +108 -62
- data/lib/proiel/cli/converters/conll-u.rb +648 -548
- data/lib/proiel/cli/converters/conll-x.rb +67 -52
- data/lib/proiel/cli/converters/lexc.rb +21 -23
- data/lib/proiel/cli/converters/proielxml.rb +173 -132
- data/lib/proiel/cli/converters/text.rb +69 -71
- data/lib/proiel/cli/converters/tiger.rb +110 -114
- data/lib/proiel/cli/converters/tiger2.rb +139 -141
- data/lib/proiel/cli/converters/tnt.rb +19 -15
- data/lib/proiel/cli/version.rb +1 -1
- data/lib/proiel/cli.rb +26 -1
- metadata +43 -58
- data/bin/setup +0 -8
- data/contrib/proiel-tnt-train +0 -15
- data/lib/proiel/cli/commands.rb +0 -28
|
@@ -1,98 +1,96 @@
|
|
|
1
|
-
module PROIEL
|
|
2
|
-
|
|
3
|
-
class
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
print_formatted_div(div)
|
|
23
|
-
end
|
|
1
|
+
module PROIEL::Converter
|
|
2
|
+
class Text
|
|
3
|
+
class << self
|
|
4
|
+
def process(tb, options)
|
|
5
|
+
tb.sources.each do |source|
|
|
6
|
+
puts "% id = #{source.id}"
|
|
7
|
+
puts "% export_time = #{source.export_time}"
|
|
8
|
+
puts "% title = #{source.title}"
|
|
9
|
+
puts "% author = #{source.author}"
|
|
10
|
+
puts "% citation_part = #{source.citation_part}"
|
|
11
|
+
puts "% language = #{source.language}"
|
|
12
|
+
|
|
13
|
+
source.divs.each do |div|
|
|
14
|
+
puts
|
|
15
|
+
puts "# #{div.title}"
|
|
16
|
+
puts
|
|
17
|
+
|
|
18
|
+
if options['diffable']
|
|
19
|
+
print_diffable_div(div)
|
|
20
|
+
else
|
|
21
|
+
print_formatted_div(div)
|
|
24
22
|
end
|
|
25
23
|
end
|
|
26
24
|
end
|
|
25
|
+
end
|
|
27
26
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
current_citation = nil
|
|
27
|
+
def print_formatted_div(div)
|
|
28
|
+
p = ''
|
|
29
|
+
p += div.presentation_before unless div.presentation_before.nil?
|
|
33
30
|
|
|
34
|
-
|
|
35
|
-
p += sentence.presentation_before unless sentence.presentation_before.nil?
|
|
31
|
+
current_citation = nil
|
|
36
32
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
new_citation = token.citation_part
|
|
33
|
+
div.sentences.each do |sentence|
|
|
34
|
+
p += sentence.presentation_before unless sentence.presentation_before.nil?
|
|
40
35
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
end
|
|
36
|
+
sentence.tokens.each do |token|
|
|
37
|
+
if token.has_content?
|
|
38
|
+
new_citation = token.citation_part
|
|
45
39
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
40
|
+
if current_citation != new_citation
|
|
41
|
+
p += "§#{new_citation.gsub(/\s+/, '_')} "
|
|
42
|
+
current_citation = new_citation
|
|
49
43
|
end
|
|
50
|
-
end
|
|
51
44
|
|
|
52
|
-
|
|
45
|
+
p += [token.presentation_before,
|
|
46
|
+
token.form,
|
|
47
|
+
token.presentation_after].compact.join
|
|
48
|
+
end
|
|
53
49
|
end
|
|
54
50
|
|
|
55
|
-
p +=
|
|
51
|
+
p += sentence.presentation_after unless sentence.presentation_after.nil?
|
|
52
|
+
end
|
|
56
53
|
|
|
57
|
-
|
|
58
|
-
line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
|
|
59
|
-
end * "\n"
|
|
54
|
+
p += div.presentation_after unless div.presentation_after.nil?
|
|
60
55
|
|
|
61
|
-
|
|
62
|
-
|
|
56
|
+
p = p.strip.gsub(/ +/, ' ').split("\n").collect do |line|
|
|
57
|
+
line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
|
|
58
|
+
end * "\n"
|
|
63
59
|
|
|
64
|
-
|
|
65
|
-
|
|
60
|
+
puts p
|
|
61
|
+
end
|
|
66
62
|
|
|
67
|
-
|
|
68
|
-
|
|
63
|
+
def print_diffable_div(div)
|
|
64
|
+
current_citation = nil
|
|
69
65
|
|
|
70
|
-
|
|
71
|
-
|
|
66
|
+
p = ''
|
|
67
|
+
pb = div.presentation_before || ''
|
|
72
68
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
if current_citation != token.citation_part
|
|
76
|
-
puts p unless p.empty?
|
|
77
|
-
p = "§#{token.citation_part.gsub(/\s+/, '_')} "
|
|
78
|
-
current_citation = token.citation_part
|
|
79
|
-
end
|
|
69
|
+
div.sentences.each do |sentence|
|
|
70
|
+
pb += sentence.presentation_before || ''
|
|
80
71
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
72
|
+
sentence.tokens.each do |token|
|
|
73
|
+
if token.has_content?
|
|
74
|
+
if current_citation != token.citation_part
|
|
75
|
+
puts p unless p.empty?
|
|
76
|
+
p = "§#{token.citation_part.gsub(/\s+/, '_')} "
|
|
77
|
+
current_citation = token.citation_part
|
|
86
78
|
end
|
|
87
|
-
end
|
|
88
79
|
|
|
89
|
-
|
|
80
|
+
p += [pb,
|
|
81
|
+
token.presentation_before,
|
|
82
|
+
token.form,
|
|
83
|
+
token.presentation_after].compact.join
|
|
84
|
+
pb = ''
|
|
85
|
+
end
|
|
90
86
|
end
|
|
91
87
|
|
|
92
|
-
p +=
|
|
93
|
-
|
|
94
|
-
puts p unless p.empty?
|
|
88
|
+
p += sentence.presentation_after unless sentence.presentation_after.nil?
|
|
95
89
|
end
|
|
90
|
+
|
|
91
|
+
p += div.presentation_after unless div.presentation_after.nil?
|
|
92
|
+
|
|
93
|
+
puts p unless p.empty?
|
|
96
94
|
end
|
|
97
95
|
end
|
|
98
96
|
end
|
|
@@ -1,154 +1,150 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
source.divs.each do |div|
|
|
27
|
-
div.sentences.each do |sentence|
|
|
28
|
-
write_sentence(builder, sentence)
|
|
29
|
-
end
|
|
1
|
+
module PROIEL::Converter
|
|
2
|
+
# Converter for the TigerXML format
|
|
3
|
+
# (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
|
|
4
|
+
# in the variant used by VISL under the name 'TIGER dependency format'
|
|
5
|
+
# (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
|
|
6
|
+
class Tiger
|
|
7
|
+
SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')
|
|
8
|
+
|
|
9
|
+
MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection)
|
|
10
|
+
OTHER_FEATURES = %i(lemma pos information_status antecedent_id word)
|
|
11
|
+
|
|
12
|
+
class << self
|
|
13
|
+
def process(tb, _)
|
|
14
|
+
selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
|
|
15
|
+
@features = selected_features.map { |f| [f, 'FREC'] }.to_h
|
|
16
|
+
|
|
17
|
+
builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
|
|
18
|
+
builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
|
|
19
|
+
|
|
20
|
+
tb.sources.each do |source|
|
|
21
|
+
@hack = tb.annotation_schema
|
|
22
|
+
write_source(builder, source) do
|
|
23
|
+
source.divs.each do |div|
|
|
24
|
+
div.sentences.each do |sentence|
|
|
25
|
+
write_sentence(builder, sentence)
|
|
30
26
|
end
|
|
31
27
|
end
|
|
32
28
|
end
|
|
33
29
|
end
|
|
30
|
+
end
|
|
34
31
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
declare_annotation(builder, @features, @hack)
|
|
32
|
+
def write_source(builder, s)
|
|
33
|
+
builder.corpus(id: s.id) do
|
|
34
|
+
builder.head do
|
|
35
|
+
builder.meta do
|
|
36
|
+
builder.name(s.title)
|
|
43
37
|
end
|
|
44
38
|
|
|
45
|
-
builder
|
|
46
|
-
|
|
47
|
-
|
|
39
|
+
declare_annotation(builder, @features, @hack)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
builder.body do
|
|
43
|
+
yield
|
|
48
44
|
end
|
|
49
45
|
end
|
|
46
|
+
end
|
|
50
47
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
48
|
+
def declare_annotation(builder, features, annotation_schema)
|
|
49
|
+
builder.annotation do
|
|
50
|
+
features.each do |name, domain|
|
|
51
|
+
# FIXME: we may want to list possible values for some of these
|
|
52
|
+
builder.feature(name: name, domain: domain)
|
|
53
|
+
end
|
|
57
54
|
|
|
58
|
-
|
|
59
|
-
|
|
55
|
+
builder.edgelabel do
|
|
56
|
+
builder.value(name: '--')
|
|
60
57
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
end
|
|
58
|
+
annotation_schema.primary_relations.each do |tag, features|
|
|
59
|
+
builder.value({ name: tag }, features.summary)
|
|
64
60
|
end
|
|
61
|
+
end
|
|
65
62
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
end
|
|
63
|
+
builder.secedgelabel do
|
|
64
|
+
annotation_schema.secondary_relations.each do |tag, features|
|
|
65
|
+
builder.value({name: tag }, features.summary)
|
|
70
66
|
end
|
|
71
67
|
end
|
|
72
68
|
end
|
|
69
|
+
end
|
|
73
70
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
71
|
+
def token_attrs(t, type)
|
|
72
|
+
attrs = {}
|
|
73
|
+
|
|
74
|
+
@features.each do |name, domain|
|
|
75
|
+
if domain == 'FREC' or domain == type
|
|
76
|
+
case name
|
|
77
|
+
when :word, :cat
|
|
78
|
+
attrs[name] = t.pro? ? "PRO-#{t.relation.upcase}" : t.form
|
|
79
|
+
when *@semantic_features
|
|
80
|
+
attrs[name] = t.sem_tags_to_hash[attr]
|
|
81
|
+
when :lemma
|
|
82
|
+
attrs[name] = t.lemma
|
|
83
|
+
when :pos
|
|
84
|
+
if t.empty_token_sort
|
|
85
|
+
attrs[name] = t.empty_token_sort + '-'
|
|
86
|
+
else
|
|
87
|
+
attrs[name] = t.pos
|
|
88
|
+
end
|
|
89
|
+
when *MORPHOLOGICAL_FEATURES
|
|
90
|
+
attrs[name] = name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
|
|
91
|
+
else
|
|
92
|
+
if t.respond_to?(name)
|
|
93
|
+
attrs[name] = t.send(name)
|
|
94
94
|
else
|
|
95
|
-
|
|
96
|
-
attrs[name] = t.send(name)
|
|
97
|
-
else
|
|
98
|
-
raise "Do not know how to get required attribute #{name}"
|
|
99
|
-
end
|
|
95
|
+
raise "Do not know how to get required attribute #{name}"
|
|
100
96
|
end
|
|
101
|
-
attrs[name] ||= "--"
|
|
102
97
|
end
|
|
98
|
+
attrs[name] ||= '--'
|
|
103
99
|
end
|
|
104
|
-
|
|
105
|
-
attrs
|
|
106
100
|
end
|
|
107
101
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
102
|
+
attrs
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def write_terminals(builder, s)
|
|
106
|
+
builder.terminals do
|
|
107
|
+
s.tokens.each do |t|
|
|
108
|
+
builder.t(token_attrs(t, 'T').merge({ id: "w#{t.id}"}))
|
|
113
109
|
end
|
|
114
110
|
end
|
|
111
|
+
end
|
|
115
112
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
113
|
+
def write_nonterminals(builder, s)
|
|
114
|
+
builder.nonterminals do
|
|
115
|
+
# Add an empty root node
|
|
116
|
+
h = @features.select { |_, domain| ['FREC', 'NT'].include?(domain) }.map { |name, _| [name, '--'] }.to_h
|
|
117
|
+
h[:id] = "s#{s.id}_root"
|
|
121
118
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
end
|
|
119
|
+
builder.nt(h) do
|
|
120
|
+
s.tokens.reject { |t| t.head or t.pro? }.each do |t|
|
|
121
|
+
builder.edge(idref: "p#{t.id}", label: t.relation)
|
|
126
122
|
end
|
|
123
|
+
end
|
|
127
124
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
125
|
+
# Add other NTs
|
|
126
|
+
s.tokens.each do |t|
|
|
127
|
+
builder.nt(token_attrs(t, 'NT').merge(id: "p#{t.id}")) do
|
|
128
|
+
# Add an edge to the correspoding terminal node
|
|
129
|
+
builder.edge(idref: "w#{t.id}", label: '--')
|
|
133
130
|
|
|
134
|
-
|
|
135
|
-
|
|
131
|
+
# Add primary dependency edges
|
|
132
|
+
t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }
|
|
136
133
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
end
|
|
134
|
+
# Add secondary dependency edges
|
|
135
|
+
t.slashes.each do |relation, target_id|
|
|
136
|
+
builder.secedge(idref: "p#{target_id}", label: relation)
|
|
141
137
|
end
|
|
142
138
|
end
|
|
143
139
|
end
|
|
144
140
|
end
|
|
141
|
+
end
|
|
145
142
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
end
|
|
143
|
+
def write_sentence(builder, s)
|
|
144
|
+
builder.s(id: "s#{s.id}") do
|
|
145
|
+
builder.graph(root: "s#{s.id}_root") do
|
|
146
|
+
write_terminals(builder, s)
|
|
147
|
+
write_nonterminals(builder, s)
|
|
152
148
|
end
|
|
153
149
|
end
|
|
154
150
|
end
|