proiel-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
module PROIEL
  module Converter
    # Converter that prints a treebank as plain text on standard output.
    #
    # Each source is introduced by a block of "% key = value" header lines and
    # each div by a "# title" heading. The text of a div is printed either as
    # wrapped running text or, when the 'diffable' option is set, as one line
    # per citation unit.
    class Text
      class << self
        # Prints every source in the treebank.
        #
        # @param tb      treebank object responding to #sources
        # @param options hash-like object; a truthy 'diffable' entry selects
        #                the one-line-per-citation layout
        def process(tb, options)
          tb.sources.each do |source|
            puts "% id = #{source.id}"
            puts "% export_time = #{source.export_time}"
            puts "% title = #{source.title}"
            puts "% author = #{source.author}"
            puts "% citation_part = #{source.citation_part}"
            puts "% language = #{source.language}"

            source.divs.each do |div|
              puts
              puts "# #{div.title}"
              puts

              if options['diffable']
                print_diffable_div(div)
              else
                print_formatted_div(div)
              end
            end
          end
        end

        # Prints a div as running text. A citation marker is inserted whenever
        # the citation changes, runs of spaces are collapsed, and lines longer
        # than 80 characters are re-wrapped.
        #
        # @param div object responding to #presentation_before,
        #            #presentation_after and #sentences
        def print_formatted_div(div)
          # Our own mutable buffer, so appending never touches the strings
          # owned by the div, its sentences or its tokens.
          text = ''.dup
          text << div.presentation_before unless div.presentation_before.nil?

          current_citation = nil

          div.sentences.each do |sentence|
            text << sentence.presentation_before unless sentence.presentation_before.nil?

            sentence.tokens.each do |token|
              next unless token.has_content?

              new_citation = token.citation_part

              if current_citation != new_citation
                text << citation_marker(new_citation)
                current_citation = new_citation
              end

              text << [token.presentation_before,
                       token.form,
                       token.presentation_after].compact.join
            end

            text << sentence.presentation_after unless sentence.presentation_after.nil?
          end

          text << div.presentation_after unless div.presentation_after.nil?

          wrapped = text.strip.gsub(/ +/, ' ').split("\n").map do |line|
            line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
          end

          puts wrapped.join("\n")
        end

        # Prints a div with one output line per citation unit, so that two
        # exports can be compared line by line. No whitespace normalisation or
        # wrapping is applied.
        #
        # @param div object responding to #presentation_before,
        #            #presentation_after and #sentences
        def print_diffable_div(div)
          current_citation = nil

          line = ''.dup

          # Presentation text seen before the next content token; it is
          # emitted after the citation marker of that token rather than at the
          # end of the previous line. Built with += so that the strings owned
          # by the div and its sentences are never mutated.
          pending = div.presentation_before || ''

          div.sentences.each do |sentence|
            pending += sentence.presentation_before || ''

            sentence.tokens.each do |token|
              next unless token.has_content?

              if current_citation != token.citation_part
                # Flush the finished citation unit and start a new line.
                puts line unless line.empty?
                line = citation_marker(token.citation_part).dup
                current_citation = token.citation_part
              end

              line << [pending,
                       token.presentation_before,
                       token.form,
                       token.presentation_after].compact.join
              pending = ''
            end

            line << sentence.presentation_after unless sentence.presentation_after.nil?
          end

          line << div.presentation_after unless div.presentation_after.nil?

          puts line unless line.empty?
        end

        private

        # Builds the marker that introduces a citation unit, with whitespace
        # in the citation replaced by underscores. A missing citation produces
        # no marker instead of raising NoMethodError on nil.
        def citation_marker(citation)
          return '' if citation.nil?

          "§#{citation.gsub(/\s+/, '_')} "
        end
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'builder'
2
+
3
module PROIEL
  module Converter
    # Converter for the TigerXML format
    # (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
    # in the variant used by VISL under the name 'TIGER dependency format'
    # (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
    #
    # The XML is written to STDOUT through a Builder::XmlMarkup instance.
    # The converter keeps its working state in class-level instance variables
    # (@features, @hack) that are assigned by .process.
    class Tiger
      SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')

      # Features whose value is assembled from the token's morphology hash;
      # each name is split on "_" into the morphology fields it combines.
      MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection).freeze
      OTHER_FEATURES = %i(lemma pos information_status antecedent_id word).freeze

      class << self
        # Writes the whole treebank as one <corpus> element per source.
        #
        # @param tb      treebank responding to #sources and #annotation_schema
        # @param options currently unused by this converter
        def process(tb, options)
          selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
          # Every selected feature is declared with the domain 'FREC'.
          @features = selected_features.map { |f| [f, 'FREC'] }.to_h

          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
          builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'

          tb.sources.each do |source|
            # write_source reads the annotation schema from @hack.
            @hack = tb.annotation_schema
            write_source(builder, source) do
              source.divs.each do |div|
                div.sentences.each do |sentence|
                  write_sentence(builder, sentence)
                end
              end
            end
          end
        end

        # Writes a <corpus> element with its <head>, then yields inside
        # <body> so the caller can emit the sentences.
        #
        # Relies on @features and @hack having been set by .process.
        def write_source(builder, s)
          builder.corpus(id: s.id) do
            builder.head do
              builder.meta do
                builder.name(s.title)
              end

              declare_annotation(builder, @features, @hack)
            end

            builder.body do
              yield
            end
          end
        end

        # Writes the <annotation> header: one <feature> per entry of
        # +features+, followed by the primary and secondary edge labels taken
        # from the annotation schema.
        #
        # @param features          enumerable of [name, domain] pairs
        # @param annotation_schema object responding to #primary_relations and
        #                          #secondary_relations, each yielding
        #                          [tag, object-with-#summary] pairs
        def declare_annotation(builder, features, annotation_schema)
          builder.annotation do
            features.each do |name, domain|
              # FIXME: we may want to list possible values for some of these
              builder.feature(name: name, domain: domain)
            end

            builder.edgelabel do
              builder.value(name: '--')

              annotation_schema.primary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end

            builder.secedgelabel do
              annotation_schema.secondary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end
          end
        end

        # Collects the attribute hash for a token.
        #
        # Only features whose domain is 'FREC' or equal to +type+ are
        # included; a feature without a value is written as "--".
        #
        # @param s    the sentence (unused, kept for interface compatibility)
        # @param t    the token
        # @param type node type being written, 'T' or 'NT'
        # @return [Hash] feature name => value
        # @raise [RuntimeError] if a feature has no known accessor on the token
        def token_attrs(s, t, type)
          @features.each_with_object({}) do |(name, domain), attrs|
            next unless domain == 'FREC' || domain == type

            attrs[name] = feature_value(t, name) || '--'
          end
        end

        # Writes the <terminals> section with one <t> per token.
        def write_terminals(builder, s)
          builder.terminals do
            s.tokens.each do |t|
              builder.t(token_attrs(s, t, 'T').merge(id: "w#{t.id}"))
            end
          end
        end

        # Writes the <nonterminals> section: a synthetic root node followed by
        # one <nt> per token carrying its dependency edges.
        def write_nonterminals(builder, s)
          builder.nonterminals do
            # Add an empty root node
            root_attrs = @features
                         .select { |_, domain| ['FREC', 'NT'].include?(domain) }
                         .map { |name, _| [name, '--'] }
                         .to_h
            root_attrs[:id] = "s#{s.id}_root"

            builder.nt(root_attrs) do
              # Tokens with a head, and pro tokens, are not attached to the root.
              s.tokens.reject { |t| t.head || t.pro? }.each do |t|
                builder.edge(idref: "p#{t.id}", label: t.relation)
              end
            end

            # Add other NTs
            s.tokens.each do |t|
              builder.nt(token_attrs(s, t, 'NT').merge(id: "p#{t.id}")) do
                # Add an edge to the corresponding terminal node
                builder.edge(idref: "w#{t.id}", label: '--')

                # Add primary dependency edges
                t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }

                # Add secondary dependency edges
                t.slashes.each do |relation, target_id|
                  builder.secedge(idref: "p#{target_id}", label: relation)
                end
              end
            end
          end
        end

        # Writes one <s> element containing the sentence graph.
        def write_sentence(builder, s)
          builder.s(id: "s#{s.id}") do
            builder.graph(root: "s#{s.id}_root") do
              write_terminals(builder, s)
              write_nonterminals(builder, s)
            end
          end
        end

        private

        # Looks up the raw value of a single feature on a token; may return
        # nil, which token_attrs turns into "--".
        def feature_value(t, name)
          case name
          when :word, :cat
            t.pro? ? "PRO-#{t.relation.upcase}" : t.form
          when *@semantic_features
            # @semantic_features is not assigned anywhere in this class, so
            # unless it is set from outside the splat is empty and this branch
            # never matches. The hash is indexed by the feature name.
            t.sem_tags_to_hash[name]
          when :lemma
            t.lemma
          when :pos
            t.empty_token_sort ? t.empty_token_sort + '-' : t.pos
          when *MORPHOLOGICAL_FEATURES
            # E.g. :case_number => morphology_hash[:case] + morphology_hash[:number],
            # with '-' standing in for a missing field.
            name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
          else
            raise "Do not know how to get required attribute #{name}" unless t.respond_to?(name)

            t.send(name)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,193 @@
1
module PROIEL
  module Converter
    # Converter that writes a treebank as XML in the Tiger2 namespace
    # (see the schema location declared in .write_source).
    #
    # The XML is written to STDOUT through a Builder::XmlMarkup instance.
    # The converter keeps its working state in class-level instance variables
    # (@features, @hack) that are assigned by .process.
    class Tiger2
      SCHEMA_FILE = File.join('tiger2', 'Tiger2.xsd')

      # Features whose value is assembled from the token's morphology hash;
      # each name is split on "_" into the morphology fields it combines.
      # Defined here because token_attrs refers to this constant and it is
      # not otherwise resolvable from within this class.
      MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection).freeze

      class << self
        # Writes the whole treebank as one <corpus> element per source.
        #
        # @param tb      treebank responding to #sources and #annotation_schema
        # @param options currently unused by this converter
        def process(tb, options)
          selected_features = [] # TODO
          # List of [feature name, domain] pairs; empty until features are selected.
          @features = selected_features.map { |f| [f, 'FREC'] }

          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
          builder.instruct! :xml, version: "1.0", encoding: "UTF-8"

          tb.sources.each do |source|
            @hack = tb.annotation_schema
            write_source(builder, source, tb) do
              source.divs.each do |div|
                div.sentences.each do |sentence|
                  write_sentence(builder, sentence)
                end
              end
            end
          end
        end

        # Writes a <corpus> element with namespace declarations and its
        # <head>, then yields the builder inside <body> so the caller can
        # emit the sentences.
        #
        # @param s  source responding to #id, #title and #export_time
        # @param tb treebank supplying the annotation schema
        def write_source(builder, s, tb)
          builder.corpus('xml:id' => s.id,
                         'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
                         'xsi:schemaLocation' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/ http://korpling.german.hu-berlin.de/tiger2/V2.0.5/Tiger2.xsd',
                         'xmlns:tiger2' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/',
                         'xmlns' => 'http://korpling.german.hu-berlin.de/tiger2/V2.0.5/') do
            builder.head do
              builder.meta do
                builder.name(s.title)
                builder.author('The PROIEL project')
                builder.date(s.export_time.strftime("%F %T %z"))
                # These three elements are written without content.
                builder.description
                builder.format
                builder.history
              end

              declare_annotation(builder, @features,
                                 tb.annotation_schema)
            end

            builder.body do
              yield builder
            end
          end
        end

        # Writes the <annotation> header: one <feature> per entry of
        # +features+, followed by the primary and secondary edge labels taken
        # from the annotation schema.
        #
        # @param features          enumerable of [name, domain] pairs
        # @param annotation_schema object responding to #primary_relations and
        #                          #secondary_relations, each yielding
        #                          [tag, object-with-#summary] pairs
        def declare_annotation(builder, features, annotation_schema)
          builder.annotation do
            features.each do |name, domain|
              # FIXME: we may want to list possible values for some of these
              builder.feature(name: name, domain: domain)
            end

            builder.edgelabel do
              builder.value(name: '--')

              annotation_schema.primary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end

            builder.secedgelabel do
              annotation_schema.secondary_relations.each do |tag, relation|
                builder.value({ name: tag }, relation.summary)
              end
            end
          end
        end

        # Declares "label" features for primary, secondary and coreference
        # edges.
        #
        # NOTE(review): nothing in this class calls this method, and
        # declare_primary_edges / declare_secedges are not defined in the
        # code visible here; confirm they exist before wiring it in.
        def declare_edgelabels(builder)
          builder.feature(name: "label", type: "prim", domain: "edge") do
            declare_primary_edges(builder)
          end

          builder.feature(name: "label", type: "sec", domain: "edge") do
            declare_secedges(builder)
          end

          builder.feature(name: "label", type: "coref", domain: "edge") do
            builder.value(name: "antecedent")
            builder.value(name: "inference")
          end
        end

        # Writes one <s> element containing the sentence graph.
        def write_sentence(builder, s)
          builder.s('xml:id' => "s#{s.id}") do
            builder.graph(root: "s#{s.id}_root") do
              write_terminals(builder, s)
              write_nonterminals(builder, s)
            end
          end
        end

        # Writes the <terminals> section with one <t> per token.
        def write_terminals(builder, s)
          builder.terminals do
            s.tokens.each do |t|
              builder.t(token_attrs(s, t, 'T').merge('xml:id' => "w#{t.id}"))
            end
          end
        end

        # Collects the attribute hash for a token.
        #
        # Only features whose domain is 'FREC' or equal to +type+ are
        # included; a feature without a value is written as "--".
        #
        # @param s    the sentence (unused, kept for interface compatibility)
        # @param t    the token
        # @param type node type being written, 'T' or 'NT'
        # @return [Hash] feature name => value
        # @raise [RuntimeError] if a feature has no known accessor on the token
        def token_attrs(s, t, type)
          @features.each_with_object({}) do |(name, domain), attrs|
            next unless domain == 'FREC' || domain == type

            attrs[name] = feature_value(t, name) || '--'
          end
        end

        # Writes the <nonterminals> section: a synthetic root node followed by
        # one <nt> per token carrying its dependency edges.
        def write_nonterminals(builder, s)
          builder.nonterminals do
            # Add an empty root node
            root_attrs = @features
                         .select { |_, domain| ['FREC', 'NT'].include?(domain) }
                         .map { |name, _| [name, '--'] }
                         .to_h
            root_attrs['xml:id'] = "s#{s.id}_root"

            builder.nt(root_attrs) do
              # Tokens with a head, and pro tokens, are not attached to the root.
              s.tokens.reject { |t| t.head || t.pro? }.each do |t|
                builder.edge(idref: "p#{t.id}", label: t.relation)
              end
            end

            # Add other NTs
            s.tokens.each do |t|
              builder.nt(token_attrs(s, t, 'NT').merge('xml:id' => "p#{t.id}")) do
                # Add an edge to the corresponding terminal node
                builder.edge(idref: "w#{t.id}", label: '--')

                # Add primary dependency edges
                t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }

                # Add secondary dependency edges
                t.slashes.each do |relation, target_id|
                  builder.secedge(idref: "p#{target_id}", label: relation)
                end
              end
            end
          end
        end

        # Writes a primary edge pointing at the nonterminal of token +t+ using
        # tiger2-namespaced attributes.
        #
        # NOTE(review): not called from anywhere in this class as shown.
        def write_root_edge(t, builder)
          builder.edge('tiger2:type' => "prim", 'tiger2:target' => "p#{t.id}", :label => t.relation.tag)
        end

        # Writes all edges of token +t+ using tiger2-namespaced attributes.
        #
        # NOTE(review): not called from anywhere in this class as shown, and
        # get_slashes is not defined in the code visible here; confirm before
        # relying on this method.
        def write_edges(t, builder)
          # Add an edge between this node and the corresponding terminal node
          # if the token is morphtaggable or is an empty token of sort 'P'.
          if t.is_morphtaggable? || t.empty_token_sort == 'P'
            builder.edge('tiger2:type' => "prim", 'tiger2:target' => "w#{t.id}", :label => '--')
          end

          # Add primary dependency edges including empty pro tokens if we are exporting info structure as well
          t.dependents.each { |d| builder.edge('tiger2:type' => "prim", 'tiger2:target' => "p#{d.id}", :label => d.relation.tag) }

          # Add secondary dependency edges
          get_slashes(t).each do |se|
            builder.edge('tiger2:type' => "sec", 'tiger2:target' => "p#{se.slashee_id}", :label => se.relation.tag)
          end

          # The coreference edge is labelled "inference" when the information
          # status tag is 'acc_inf' and "antecedent" otherwise.
          builder.edge('tiger2:type' => "coref", 'tiger2:target' => t.antecedent_id, :label => (t.information_status_tag == 'acc_inf' ? "inference" : "antecedent") )
        end

        private

        # Looks up the raw value of a single feature on a token; may return
        # nil, which token_attrs turns into "--".
        def feature_value(t, name)
          case name
          when :word, :cat
            t.pro? ? "PRO-#{t.relation.upcase}" : t.form
          when *@semantic_features
            # @semantic_features is not assigned anywhere in this class, so
            # unless it is set from outside the splat is empty and this branch
            # never matches. The hash is indexed by the feature name.
            t.sem_tags_to_hash[name]
          when :lemma
            t.lemma
          when :pos
            t.empty_token_sort ? t.empty_token_sort + "-" : t.pos
          when *MORPHOLOGICAL_FEATURES
            # E.g. :case_number => morphology_hash[:case] + morphology_hash[:number],
            # with '-' standing in for a missing field.
            name.to_s.split("_").map { |a| t.morphology_hash[a.to_sym] || '-' }.join
          else
            raise "Do not know how to get required attribute #{name}" unless t.respond_to?(name)

            t.send(name)
          end
        end
      end
    end
  end
end