proiel-cli 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,98 +1,96 @@
1
- module PROIEL
2
- module Converter
3
- class Text
4
- class << self
5
- def process(tb, options)
6
- tb.sources.each do |source|
7
- puts "% id = #{source.id}"
8
- puts "% export_time = #{source.export_time}"
9
- puts "% title = #{source.title}"
10
- puts "% author = #{source.author}"
11
- puts "% citation_part = #{source.citation_part}"
12
- puts "% language = #{source.language}"
13
-
14
- source.divs.each do |div|
15
- puts
16
- puts "# #{div.title}"
17
- puts
18
-
19
- if options['diffable']
20
- print_diffable_div(div)
21
- else
22
- print_formatted_div(div)
23
- end
1
+ module PROIEL::Converter
2
+ class Text
3
+ class << self
4
+ def process(tb, options)
5
+ tb.sources.each do |source|
6
+ puts "% id = #{source.id}"
7
+ puts "% export_time = #{source.export_time}"
8
+ puts "% title = #{source.title}"
9
+ puts "% author = #{source.author}"
10
+ puts "% citation_part = #{source.citation_part}"
11
+ puts "% language = #{source.language}"
12
+
13
+ source.divs.each do |div|
14
+ puts
15
+ puts "# #{div.title}"
16
+ puts
17
+
18
+ if options['diffable']
19
+ print_diffable_div(div)
20
+ else
21
+ print_formatted_div(div)
24
22
  end
25
23
  end
26
24
  end
25
+ end
27
26
 
28
- def print_formatted_div(div)
29
- p = ''
30
- p += div.presentation_before unless div.presentation_before.nil?
31
-
32
- current_citation = nil
27
+ def print_formatted_div(div)
28
+ p = ''
29
+ p += div.presentation_before unless div.presentation_before.nil?
33
30
 
34
- div.sentences.each do |sentence|
35
- p += sentence.presentation_before unless sentence.presentation_before.nil?
31
+ current_citation = nil
36
32
 
37
- sentence.tokens.each do |token|
38
- if token.has_content?
39
- new_citation = token.citation_part
33
+ div.sentences.each do |sentence|
34
+ p += sentence.presentation_before unless sentence.presentation_before.nil?
40
35
 
41
- if current_citation != new_citation
42
- p += "§#{new_citation.gsub(/\s+/, '_')} "
43
- current_citation = new_citation
44
- end
36
+ sentence.tokens.each do |token|
37
+ if token.has_content?
38
+ new_citation = token.citation_part
45
39
 
46
- p += [token.presentation_before,
47
- token.form,
48
- token.presentation_after].compact.join
40
+ if current_citation != new_citation
41
+ p += "§#{new_citation.gsub(/\s+/, '_')} "
42
+ current_citation = new_citation
49
43
  end
50
- end
51
44
 
52
- p += sentence.presentation_after unless sentence.presentation_after.nil?
45
+ p += [token.presentation_before,
46
+ token.form,
47
+ token.presentation_after].compact.join
48
+ end
53
49
  end
54
50
 
55
- p += div.presentation_after unless div.presentation_after.nil?
51
+ p += sentence.presentation_after unless sentence.presentation_after.nil?
52
+ end
56
53
 
57
- p = p.strip.gsub(/ +/, ' ').split("\n").collect do |line|
58
- line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
59
- end * "\n"
54
+ p += div.presentation_after unless div.presentation_after.nil?
60
55
 
61
- puts p
62
- end
56
+ p = p.strip.gsub(/ +/, ' ').split("\n").collect do |line|
57
+ line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
58
+ end * "\n"
63
59
 
64
- def print_diffable_div(div)
65
- current_citation = nil
60
+ puts p
61
+ end
66
62
 
67
- p = ''
68
- pb = div.presentation_before || ''
63
+ def print_diffable_div(div)
64
+ current_citation = nil
69
65
 
70
- div.sentences.each do |sentence|
71
- pb += sentence.presentation_before || ''
66
+ p = ''
67
+ pb = div.presentation_before || ''
72
68
 
73
- sentence.tokens.each do |token|
74
- if token.has_content?
75
- if current_citation != token.citation_part
76
- puts p unless p.empty?
77
- p = "§#{token.citation_part.gsub(/\s+/, '_')} "
78
- current_citation = token.citation_part
79
- end
69
+ div.sentences.each do |sentence|
70
+ pb += sentence.presentation_before || ''
80
71
 
81
- p += [pb,
82
- token.presentation_before,
83
- token.form,
84
- token.presentation_after].compact.join
85
- pb = ''
72
+ sentence.tokens.each do |token|
73
+ if token.has_content?
74
+ if current_citation != token.citation_part
75
+ puts p unless p.empty?
76
+ p = "§#{token.citation_part.gsub(/\s+/, '_')} "
77
+ current_citation = token.citation_part
86
78
  end
87
- end
88
79
 
89
- p += sentence.presentation_after unless sentence.presentation_after.nil?
80
+ p += [pb,
81
+ token.presentation_before,
82
+ token.form,
83
+ token.presentation_after].compact.join
84
+ pb = ''
85
+ end
90
86
  end
91
87
 
92
- p += div.presentation_after unless div.presentation_after.nil?
93
-
94
- puts p unless p.empty?
88
+ p += sentence.presentation_after unless sentence.presentation_after.nil?
95
89
  end
90
+
91
+ p += div.presentation_after unless div.presentation_after.nil?
92
+
93
+ puts p unless p.empty?
96
94
  end
97
95
  end
98
96
  end
@@ -1,154 +1,150 @@
1
- require 'builder'
2
-
3
- module PROIEL
4
- module Converter
5
- # Converter for the TigerXML format
6
- # (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
7
- # in the variant used by VISL under the name 'TIGER dependency format'
8
- # (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
9
- class Tiger
10
- SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')
11
-
12
- MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection)
13
- OTHER_FEATURES = %i(lemma pos information_status antecedent_id word)
14
-
15
- class << self
16
- def process(tb, options)
17
- selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
18
- @features = selected_features.map { |f| [f, 'FREC'] }.to_h
19
-
20
- builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
21
- builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
22
-
23
- tb.sources.each do |source|
24
- @hack = tb.annotation_schema
25
- write_source(builder, source) do
26
- source.divs.each do |div|
27
- div.sentences.each do |sentence|
28
- write_sentence(builder, sentence)
29
- end
1
+ module PROIEL::Converter
2
+ # Converter for the TigerXML format
3
+ # (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
4
+ # in the variant used by VISL under the name 'TIGER dependency format'
5
+ # (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
6
+ class Tiger
7
+ SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')
8
+
9
+ MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection)
10
+ OTHER_FEATURES = %i(lemma pos information_status antecedent_id word)
11
+
12
+ class << self
13
+ def process(tb, _)
14
+ selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
15
+ @features = selected_features.map { |f| [f, 'FREC'] }.to_h
16
+
17
+ builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
18
+ builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
19
+
20
+ tb.sources.each do |source|
21
+ @hack = tb.annotation_schema
22
+ write_source(builder, source) do
23
+ source.divs.each do |div|
24
+ div.sentences.each do |sentence|
25
+ write_sentence(builder, sentence)
30
26
  end
31
27
  end
32
28
  end
33
29
  end
30
+ end
34
31
 
35
- def write_source(builder, s)
36
- builder.corpus(id: s.id) do
37
- builder.head do
38
- builder.meta do
39
- builder.name(s.title)
40
- end
41
-
42
- declare_annotation(builder, @features, @hack)
32
+ def write_source(builder, s)
33
+ builder.corpus(id: s.id) do
34
+ builder.head do
35
+ builder.meta do
36
+ builder.name(s.title)
43
37
  end
44
38
 
45
- builder.body do
46
- yield
47
- end
39
+ declare_annotation(builder, @features, @hack)
40
+ end
41
+
42
+ builder.body do
43
+ yield
48
44
  end
49
45
  end
46
+ end
50
47
 
51
- def declare_annotation(builder, features, annotation_schema)
52
- builder.annotation do
53
- features.each do |name, domain|
54
- # FIXME: we may want to list possible values for some of these
55
- builder.feature(name: name, domain: domain)
56
- end
48
+ def declare_annotation(builder, features, annotation_schema)
49
+ builder.annotation do
50
+ features.each do |name, domain|
51
+ # FIXME: we may want to list possible values for some of these
52
+ builder.feature(name: name, domain: domain)
53
+ end
57
54
 
58
- builder.edgelabel do
59
- builder.value(name: '--')
55
+ builder.edgelabel do
56
+ builder.value(name: '--')
60
57
 
61
- annotation_schema.primary_relations.each do |tag, features|
62
- builder.value({ name: tag }, features.summary)
63
- end
58
+ annotation_schema.primary_relations.each do |tag, features|
59
+ builder.value({ name: tag }, features.summary)
64
60
  end
61
+ end
65
62
 
66
- builder.secedgelabel do
67
- annotation_schema.secondary_relations.each do |tag, features|
68
- builder.value({name: tag }, features.summary)
69
- end
63
+ builder.secedgelabel do
64
+ annotation_schema.secondary_relations.each do |tag, features|
65
+ builder.value({name: tag }, features.summary)
70
66
  end
71
67
  end
72
68
  end
69
+ end
73
70
 
74
- def token_attrs(s, t, type)
75
- attrs = {}
76
-
77
- @features.each do |name, domain|
78
- if domain == 'FREC' or domain == type
79
- case name
80
- when :word, :cat
81
- attrs[name] = t.pro? ? "PRO-#{t.relation.upcase}" : t.form
82
- when *@semantic_features
83
- attrs[name] = t.sem_tags_to_hash[attr]
84
- when :lemma
85
- attrs[name] = t.lemma
86
- when :pos
87
- if t.empty_token_sort
88
- attrs[name] = t.empty_token_sort + "-"
89
- else
90
- attrs[name] = t.pos
91
- end
92
- when *MORPHOLOGICAL_FEATURES
93
- attrs[name] = name.to_s.split("_").map { |a| t.morphology_hash[a.to_sym] || '-' }.join
71
+ def token_attrs(t, type)
72
+ attrs = {}
73
+
74
+ @features.each do |name, domain|
75
+ if domain == 'FREC' or domain == type
76
+ case name
77
+ when :word, :cat
78
+ attrs[name] = t.pro? ? "PRO-#{t.relation.upcase}" : t.form
79
+ when *@semantic_features
80
+ attrs[name] = t.sem_tags_to_hash[attr]
81
+ when :lemma
82
+ attrs[name] = t.lemma
83
+ when :pos
84
+ if t.empty_token_sort
85
+ attrs[name] = t.empty_token_sort + '-'
86
+ else
87
+ attrs[name] = t.pos
88
+ end
89
+ when *MORPHOLOGICAL_FEATURES
90
+ attrs[name] = name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
91
+ else
92
+ if t.respond_to?(name)
93
+ attrs[name] = t.send(name)
94
94
  else
95
- if t.respond_to?(name)
96
- attrs[name] = t.send(name)
97
- else
98
- raise "Do not know how to get required attribute #{name}"
99
- end
95
+ raise "Do not know how to get required attribute #{name}"
100
96
  end
101
- attrs[name] ||= "--"
102
97
  end
98
+ attrs[name] ||= '--'
103
99
  end
104
-
105
- attrs
106
100
  end
107
101
 
108
- def write_terminals(builder, s)
109
- builder.terminals do
110
- s.tokens.each do |t|
111
- builder.t(token_attrs(s, t, 'T').merge({ id: "w#{t.id}"}))
112
- end
102
+ attrs
103
+ end
104
+
105
+ def write_terminals(builder, s)
106
+ builder.terminals do
107
+ s.tokens.each do |t|
108
+ builder.t(token_attrs(t, 'T').merge({ id: "w#{t.id}"}))
113
109
  end
114
110
  end
111
+ end
115
112
 
116
- def write_nonterminals(builder, s)
117
- builder.nonterminals do
118
- # Add an empty root node
119
- h = @features.select { |_, domain| ['FREC', 'NT'].include?(domain) }.map { |name, _| [name, '--'] }.to_h
120
- h[:id] = "s#{s.id}_root"
113
+ def write_nonterminals(builder, s)
114
+ builder.nonterminals do
115
+ # Add an empty root node
116
+ h = @features.select { |_, domain| ['FREC', 'NT'].include?(domain) }.map { |name, _| [name, '--'] }.to_h
117
+ h[:id] = "s#{s.id}_root"
121
118
 
122
- builder.nt(h) do
123
- s.tokens.reject { |t| t.head or t.pro? }.each do |t|
124
- builder.edge(idref: "p#{t.id}", label: t.relation)
125
- end
119
+ builder.nt(h) do
120
+ s.tokens.reject { |t| t.head or t.pro? }.each do |t|
121
+ builder.edge(idref: "p#{t.id}", label: t.relation)
126
122
  end
123
+ end
127
124
 
128
- # Add other NTs
129
- s.tokens.each do |t|
130
- builder.nt(token_attrs(s, t, 'NT').merge(id: "p#{t.id}")) do
131
- # Add an edge to the correspoding terminal node
132
- builder.edge(idref: "w#{t.id}", label: '--')
125
+ # Add other NTs
126
+ s.tokens.each do |t|
127
+ builder.nt(token_attrs(t, 'NT').merge(id: "p#{t.id}")) do
128
+ # Add an edge to the correspoding terminal node
129
+ builder.edge(idref: "w#{t.id}", label: '--')
133
130
 
134
- # Add primary dependency edges
135
- t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }
131
+ # Add primary dependency edges
132
+ t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }
136
133
 
137
- # Add secondary dependency edges
138
- t.slashes.each do |relation, target_id|
139
- builder.secedge(idref: "p#{target_id}", label: relation)
140
- end
134
+ # Add secondary dependency edges
135
+ t.slashes.each do |relation, target_id|
136
+ builder.secedge(idref: "p#{target_id}", label: relation)
141
137
  end
142
138
  end
143
139
  end
144
140
  end
141
+ end
145
142
 
146
- def write_sentence(builder, s)
147
- builder.s(id: "s#{s.id}") do
148
- builder.graph(root: "s#{s.id}_root") do
149
- write_terminals(builder, s)
150
- write_nonterminals(builder, s)
151
- end
143
+ def write_sentence(builder, s)
144
+ builder.s(id: "s#{s.id}") do
145
+ builder.graph(root: "s#{s.id}_root") do
146
+ write_terminals(builder, s)
147
+ write_nonterminals(builder, s)
152
148
  end
153
149
  end
154
150
  end