proiel-cli 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,64 +1,79 @@
1
- module PROIEL
2
- module Converter
3
- # This converts to the CoNLL-X format as described on http://ilk.uvt.nl/conll/#dataformat.
4
- class CoNLLX
5
- class << self
6
- def process(tb, options)
7
- tb.sources.each do |source|
8
- source.divs.each do |div|
9
- div.sentences.each do |sentence|
10
- id_to_number = {}
11
-
12
- # Do not care about prodrop tokens
13
- tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }
14
-
15
- # Renumber to make the sequence continguous after prodrop tokens where left out
16
- tk.map(&:id).each_with_index.each do |id, i|
17
- id_to_number[id] = i + 1
18
- end
1
+ module PROIEL::Converter
2
+ # Converter that outputs the CoNLL-X format as described on
3
+ # http://ilk.uvt.nl/conll/#dataformat.
4
+ #
5
+ # The conversion removes empty tokens. PRO tokens are completely ignored,
6
+ # while null C and null V tokens are eliminated by attaching their
7
+ # dependents to the first non-null ancestor and labelling them with a
8
+ # concatenation of dependency relations.
9
+ #
10
+ # Sequences of whitespace in forms and lemmas are represented by '.'.
11
+ class CoNLLX
12
+ class << self
13
+ def process(tb, _)
14
+ tb.sources.each do |source|
15
+ source.sentences.each do |sentence|
16
+ process_sentence(tb, sentence)
17
+ end
18
+ end
19
+ end
19
20
 
20
- id_to_token = tk.inject({}) { |h, t| h.merge({t.id => t}) }
21
+ def process_sentence(tb, sentence)
22
+ tokens = sentence.tokens
21
23
 
22
- tk.each do |token|
23
- unless token.is_empty?
24
- this_number = id_to_number[token.id]
25
- head_number, relation = find_lexical_head_and_relation(id_to_number, id_to_token, token)
26
- form = token.form.gsub(/[[:space:]]/, '.')
27
- lemma = token.lemma.gsub(/[[:space:]]/, '.')
28
- pos_major = token.part_of_speech_hash[:major]
29
- pos_full = token.part_of_speech
30
- morphology = format_morphology(token)
24
+ # Generate 1-based contiguous numbering of overt tokens with
25
+ # null V and null C tokens appended at the end. We do this
26
+ # manually to ensure that the numbering is correct whatever the
27
+ # sequence is in the treebank.
28
+ id_map = Hash.new { |h, k| h[k] = h.keys.length + 1 }
29
+ tokens.select(&:has_content?).each { |t| id_map[t] } # these blocks have side-effects
30
+ tokens.reject(&:has_content?).reject(&:pro?).each { |t| id_map[t] }
31
31
 
32
- puts [this_number, form, lemma, pos_major, pos_full,
33
- morphology, head_number, relation, "_", "_"].join("\t")
34
- end
35
- end
32
+ # Iterate overt tokens and print one formatted line per token.
33
+ tokens.select(&:has_content?).each do |token|
34
+ this_number = id_map[token]
35
+ head_number, relation = find_lexical_head_and_relation(id_map, tb, token)
36
+ form = format_text(token.form)
37
+ lemma = format_text(token.lemma)
38
+ pos_major, pos_full = format_pos(token)
39
+ morphology = format_morphology(token)
36
40
 
37
- puts
38
- end
39
- end
40
- end
41
+ puts [this_number, form, lemma, pos_major, pos_full,
42
+ morphology, head_number, relation, '_', '_'].join("\t")
41
43
  end
42
44
 
43
- def format_morphology(token)
44
- token.morphology_hash.map do |k, v|
45
- # Remove inflection tag unless when set to inflecting
46
- if k == :inflection and v =='i'
47
- nil
48
- else
49
- "#{k.upcase[0..3]}#{v}"
50
- end
51
- end.compact.join('|')
52
- end
45
+ # Separate sentences by an empty line.
46
+ puts
47
+ end
53
48
 
54
- def find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '')
55
- if t.is_root?
56
- [0, rel + t.relation] # FIXME: may be empty token anyway
57
- elsif id_to_token[t.head_id].has_content?
58
- [id_to_number[t.head_id], rel + t.relation]
49
+ def format_text(s)
50
+ s.gsub(/[[:space:]]+/, '.')
51
+ end
52
+
53
+ def format_pos(token)
54
+ [token.part_of_speech_hash[:major], token.part_of_speech]
55
+ end
56
+
57
+ def format_morphology(token)
58
+ token.morphology_hash.map do |k, v|
59
+ # Remove inflection tag except when set to inflecting
60
+ if k == :inflection and v =='i'
61
+ nil
59
62
  else
60
- find_lexical_head_and_relation(id_to_number, id_to_token, id_to_token[t.head_id], rel + "#{t.relation}(#{id_to_number[t.head_id]})")
63
+ "#{k.upcase[0..3]}#{v}"
61
64
  end
65
+ end.compact.join('|')
66
+ end
67
+
68
+ def find_lexical_head_and_relation(id_map, tb, t, rel = '')
69
+ new_relation = rel + t.relation
70
+
71
+ if t.is_root?
72
+ [0, new_relation]
73
+ elsif t.head.has_content?
74
+ [id_map[t.head], new_relation]
75
+ else
76
+ find_lexical_head_and_relation(id_map, tb, t.head, "#{new_relation}(#{id_map[t.head]})")
62
77
  end
63
78
  end
64
79
  end
@@ -1,33 +1,31 @@
1
- module PROIEL
2
- module Converter
3
- # This converts part of speech and morphology to a lexc file.
4
- class Lexc
5
- class << self
6
- def process(tb, options)
7
- lexicon = {}
1
+ module PROIEL::Converter
2
+ # Converter that outputs a lexc file with part of speech and morphology.
3
+ class Lexc
4
+ class << self
5
+ def process(tb, options)
6
+ lexicon = {}
8
7
 
9
- tb.sources.each do |source|
10
- source.divs.each do |div|
11
- div.sentences.each do |sentence|
12
- sentence.tokens.each do |token|
13
- unless token.is_empty?
14
- lexicon[token.form] ||= []
15
- if options['morphology']
16
- lexicon[token.form] << [token.lemma, [token.part_of_speech, token.morphology].join].join(',')
17
- else
18
- lexicon[token.form] << [token.lemma, token.part_of_speech].join(',')
19
- end
8
+ tb.sources.each do |source|
9
+ source.divs.each do |div|
10
+ div.sentences.each do |sentence|
11
+ sentence.tokens.each do |token|
12
+ unless token.is_empty?
13
+ lexicon[token.form] ||= []
14
+ if options['morphology']
15
+ lexicon[token.form] << [token.lemma, [token.part_of_speech, token.morphology].join].join(',')
16
+ else
17
+ lexicon[token.form] << [token.lemma, token.part_of_speech].join(',')
20
18
  end
21
19
  end
22
20
  end
23
21
  end
24
22
  end
23
+ end
25
24
 
26
- puts "LEXICON Root"
27
- lexicon.sort.each do |form, tags|
28
- tags.sort.uniq.each do |tag|
29
- puts " %s:%s #;" % [tag, form]
30
- end
25
+ puts 'LEXICON Root'
26
+ lexicon.sort.each do |form, tags|
27
+ tags.sort.uniq.each do |tag|
28
+ puts ' %s:%s #;' % [tag, form]
31
29
  end
32
30
  end
33
31
  end
@@ -1,168 +1,209 @@
1
- module PROIEL
2
- module Converter
3
- class PROIELXML
4
- class << self
5
- def process(tb, options)
6
- builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
7
- builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
8
- builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
9
- builder.annotation do
10
- builder.relations do
11
- tb.annotation_schema.relation_tags.each do |tag, value|
12
- attrs = { tag: tag }
13
- attrs.merge!(grab_features(value, %i(summary primary secondary)))
14
- builder.value(attrs)
15
- end
1
+ module PROIEL::Converter
2
+ # Converter that outputs PROIEL XML. This is primarily useful for filtering,
3
+ # merging or splitting PROIEL XML data. It is also useful for "upgrading"
4
+ # PROIEL XML to a new version or for testing round tripping of data.
5
+ class PROIELXML
6
+ class << self
7
+ def process(tb, options)
8
+ builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
9
+ builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
10
+ builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
11
+ builder.annotation do
12
+ builder.relations do
13
+ tb.annotation_schema.relation_tags.each do |tag, value|
14
+ attrs = { tag: tag }
15
+ attrs.merge!(grab_features(value, %i(summary primary secondary)))
16
+ builder.value(attrs)
16
17
  end
18
+ end
17
19
 
18
- builder.tag! 'parts-of-speech' do
19
- tb.annotation_schema.part_of_speech_tags.each do |tag, value|
20
- attrs = { tag: tag }
21
- attrs.merge!(grab_features(value, %i(summary)))
22
- builder.value(attrs)
23
- end
20
+ builder.tag! 'parts-of-speech' do
21
+ tb.annotation_schema.part_of_speech_tags.each do |tag, value|
22
+ attrs = { tag: tag }
23
+ attrs.merge!(grab_features(value, %i(summary)))
24
+ builder.value(attrs)
24
25
  end
26
+ end
25
27
 
26
- builder.morphology do
27
- tb.annotation_schema.morphology_tags.each do |cat_tag, cat_values|
28
- builder.field(tag: cat_tag) do
29
- cat_values.each do |tag, value|
30
- attrs = { tag: tag }
31
- attrs.merge!(grab_features(value, %i(summary)))
32
- builder.value(attrs)
33
- end
28
+ builder.morphology do
29
+ tb.annotation_schema.morphology_tags.each do |cat_tag, cat_values|
30
+ builder.field(tag: cat_tag) do
31
+ cat_values.each do |tag, value|
32
+ attrs = { tag: tag }
33
+ attrs.merge!(grab_features(value, %i(summary)))
34
+ builder.value(attrs)
34
35
  end
35
36
  end
36
37
  end
38
+ end
37
39
 
38
- builder.tag! 'information-statuses' do
39
- tb.annotation_schema.information_status_tags.each do |tag, value|
40
- attrs = { tag: tag }
41
- attrs.merge!(grab_features(value, %i(summary)))
42
- builder.value(attrs)
43
- end
40
+ builder.tag! 'information-statuses' do
41
+ tb.annotation_schema.information_status_tags.each do |tag, value|
42
+ attrs = { tag: tag }
43
+ attrs.merge!(grab_features(value, %i(summary)))
44
+ builder.value(attrs)
44
45
  end
45
46
  end
47
+ end
46
48
 
47
- tb.sources.each do |source|
48
- mandatory_features = %i(id language)
49
- optional_features = []
50
- optional_features += %i(alignment_id) unless options['remove-alignments']
49
+ tb.sources.each do |source|
50
+ next if options['remove-unaligned-sources'] and source.alignment_id.nil?
51
51
 
52
- builder.source(grab_features(source, mandatory_features, optional_features)) do
53
- PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
54
- builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
55
- end
52
+ mandatory_features = %i(id language)
53
+ optional_features = []
54
+ optional_features += %i(alignment_id) unless options['remove-alignments']
56
55
 
57
- source.divs.each do |div|
58
- if include_div?(div, options)
59
- mandatory_features = %i()
60
-
61
- optional_features = []
62
- optional_features += %i(presentation_before presentation_after)
63
- optional_features += %i(alignment_id) unless options['remove-alignments']
64
-
65
- builder.div(grab_features(div, mandatory_features, optional_features)) do
66
- builder.title div.title if div.title
67
-
68
- div.sentences.each do |sentence|
69
- if include_sentence?(sentence, options)
70
- mandatory_features = %i(id)
71
-
72
- optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
73
- optional_features += %i(status) unless options['remove-status']
74
- optional_features += %i(presentation_before presentation_after)
75
- optional_features += %i(alignment_id) unless options['remove-alignments']
76
- optional_features += %i(annotated_at) unless options['remove-annotator']
77
- optional_features += %i(reviewed_at) unless options['remove-reviewer']
78
- optional_features += %i(annotated_by) unless options['remove-annotator']
79
- optional_features += %i(reviewed_by) unless options['remove-reviewer']
80
-
81
- builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
82
- sentence.tokens.each do |token|
83
- next if token.empty_token_sort == 'P' and options['remove-information-structure']
84
- next if token.empty_token_sort == 'C' and options['remove-syntax']
85
- next if token.empty_token_sort == 'V' and options['remove-syntax']
86
-
87
- mandatory_features = %i(id)
88
-
89
- optional_features = %i(citation_part)
90
- optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
91
- optional_features += %i(head_id relation) unless options['remove-syntax']
92
- optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']
93
-
94
- unless token.is_empty?
95
- mandatory_features << :form
96
- optional_features += %i(presentation_before presentation_after foreign_ids)
97
- else
98
- mandatory_features << :empty_token_sort
99
- end
100
-
101
- optional_features += %i(alignment_id) unless options['remove-alignments']
102
-
103
- attrs = grab_features(token, mandatory_features, optional_features)
104
-
105
- unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
106
- builder.token(attrs) do
107
- token.slashes.each do |relation, target_id|
108
- builder.slash(:"target-id" => target_id, relation: relation)
109
- end
110
- end
111
- else
112
- unless options['remove-syntax'] and token.is_empty?
113
- builder.token(attrs)
114
- end
115
- end
116
- end
117
- end
118
- end
119
- end
120
- end
121
- end
56
+ builder.source(grab_features(source, mandatory_features, optional_features)) do
57
+ PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
58
+ builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
59
+ end
60
+
61
+ source.divs.each do |div|
62
+ if include_div?(div, options)
63
+
64
+ overrides = {
65
+ div: {},
66
+ sentence: {},
67
+ token: {}
68
+ }
69
+
70
+ process_div(builder, tb, source, div, options, overrides)
122
71
  end
123
72
  end
124
73
  end
125
74
  end
126
75
  end
76
+ end
77
+
78
+ def include_div?(div, options)
79
+ if options['remove-empty-divs']
80
+ div.sentences.any? { |sentence| include_sentence?(sentence, options) }
81
+ else
82
+ true
83
+ end
84
+ end
85
+
86
+ def include_sentence?(sentence, options)
87
+ case sentence.status
88
+ when :reviewed
89
+ not options['remove-reviewed'] and not options['remove-annotated']
90
+ when :annotated
91
+ not options['remove-not-reviewed'] and not options['remove-annotated']
92
+ else
93
+ not options['remove-not-reviewed'] and not options['remove-not-annotated']
94
+ end
95
+ end
96
+
97
+ def include_token?(token, options)
98
+ if options['remove-syntax'] and (token.empty_token_sort == 'C' or token.empty_token_sort == 'V')
99
+ false
100
+ elsif token.empty_token_sort == 'P' and options['remove-information-structure']
101
+ false
102
+ else
103
+ true
104
+ end
105
+ end
106
+
107
+ def process_div(builder, tb, source, div, options, overrides)
108
+ mandatory_features = %i()
127
109
 
128
- def include_div?(div, options)
129
- if options['remove-empty-divs']
130
- div.sentences.any? { |sentence| include_sentence?(sentence, options) }
131
- else
132
- true
110
+ optional_features = []
111
+ optional_features += %i(presentation_before presentation_after)
112
+ optional_features += %i(id alignment_id) unless options['remove-alignments']
113
+
114
+ if options['infer-alignments'] and source.alignment_id
115
+ aligned_source = tb.find_source(source.alignment_id)
116
+ # FIXME: how to behave here? overwrite existing? what if nil? how to deal with multiple aligned divs?
117
+ overrides[:div][:alignment_id] = div.alignment_id || div.inferred_alignment(aligned_source).map(&:id).join(',')
118
+ end
119
+
120
+ builder.div(grab_features(div, mandatory_features, optional_features, overrides[:div])) do
121
+ builder.title div.title if div.title
122
+
123
+ div.sentences.select do |sentence|
124
+ include_sentence?(sentence, options)
125
+ end.each do |sentence|
126
+ process_sentence(builder, tb, sentence, options, overrides)
133
127
  end
134
128
  end
129
+ end
135
130
 
136
- def include_sentence?(sentence, options)
137
- case sentence.status
138
- when :reviewed
139
- true
140
- when :annotated
141
- not options['remove-not-reviewed']
142
- else
143
- not options['remove-not-reviewed'] and not options['remove-not-annotated']
131
+ def process_sentence(builder, tb, sentence, options, overrides)
132
+ mandatory_features = %i(id)
133
+
134
+ optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
135
+ optional_features += %i(status) unless options['remove-status']
136
+ optional_features += %i(presentation_before presentation_after)
137
+ optional_features += %i(alignment_id) unless options['remove-alignments']
138
+ optional_features += %i(annotated_at) unless options['remove-annotator']
139
+ optional_features += %i(reviewed_at) unless options['remove-reviewer']
140
+ optional_features += %i(annotated_by) unless options['remove-annotator']
141
+ optional_features += %i(reviewed_by) unless options['remove-reviewer']
142
+
143
+ builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
144
+ sentence.tokens.select do |token|
145
+ include_token?(token, options)
146
+ end.each do |token|
147
+ process_token(builder, tb, token, options, overrides)
144
148
  end
145
149
  end
150
+ end
146
151
 
147
- def grab_features(obj, mandatory_features, optional_features = [])
148
- attrs = {}
152
+ def process_token(builder, tb, token, options, overrides)
153
+ mandatory_features = %i(id)
149
154
 
150
- mandatory_features.each do |f|
151
- v = obj.send(f)
155
+ optional_features = %i(citation_part)
156
+ optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
157
+ optional_features += %i(head_id relation) unless options['remove-syntax']
158
+ optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']
152
159
 
153
- attrs[f.to_s.gsub('_', '-')] = v
154
- end
160
+ unless token.is_empty?
161
+ mandatory_features << :form
162
+ optional_features += %i(presentation_before presentation_after foreign_ids)
163
+ else
164
+ mandatory_features << :empty_token_sort
165
+ end
166
+
167
+ if options['remove-not-reviewed'] or options['remove-not-annotated'] or options['remove-annotated'] or options['remove-annotated']
168
+ overrides[:token][:antecedent_id] =
169
+ (token.antecedent_id and include_sentence?(tb.find_token(token.antecedent_id.to_i).sentence, options)) ? token.antecedent_id : nil
170
+ end
171
+
172
+ optional_features += %i(alignment_id) unless options['remove-alignments']
155
173
 
156
- optional_features.each do |f|
157
- v = obj.send(f)
174
+ attrs = grab_features(token, mandatory_features, optional_features, overrides[:token])
158
175
 
159
- if v and v.to_s != ''
160
- attrs[f.to_s.gsub('_', '-')] = v
176
+ unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
177
+ builder.token(attrs) do
178
+ token.slashes.each do |relation, target_id|
179
+ builder.slash(:"target-id" => target_id, relation: relation)
161
180
  end
162
181
  end
182
+ else
183
+ unless options['remove-syntax'] and token.is_empty?
184
+ builder.token(attrs)
185
+ end
186
+ end
187
+ end
163
188
 
164
- attrs
189
+ def grab_features(obj, mandatory_features, optional_features = [], overrides = {})
190
+ attrs = {}
191
+
192
+ mandatory_features.each do |f|
193
+ v = overrides.key?(f) ? overrides[f] : obj.send(f)
194
+
195
+ attrs[f.to_s.gsub('_', '-')] = v
165
196
  end
197
+
198
+ optional_features.each do |f|
199
+ v = overrides.key?(f) ? overrides[f] : obj.send(f)
200
+
201
+ if v and v.to_s != ''
202
+ attrs[f.to_s.gsub('_', '-')] = v
203
+ end
204
+ end
205
+
206
+ attrs
166
207
  end
167
208
  end
168
209
  end