pils 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ # encoding: utf-8
2
+ #
3
+ # (c) 2019 Peter Menke
4
+ #
5
+ # This file is part of pils
6
+ # ("Programming in linguistic seminars").
7
+ #
8
+ # pils is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # pils is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with pils. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ module Pils
22
+ module Tcf
23
# An annotation that attaches geographic information to the tokens it
# covers. The token-collecting behaviour is inherited from
# Pils::Tcf::Annotation; this subclass only adds the geographic fields.
class GeoAnnotation < Pils::Tcf::Annotation
  # Latitude of the annotated location.
  attr_accessor :lat

  # Longitude of the annotated location.
  attr_accessor :lon

  # Altitude of the annotated location.
  attr_accessor :alt

  # Continent of the annotated location.
  attr_accessor :continent
end
28
+ end
29
+ end
@@ -0,0 +1,31 @@
1
+ # encoding: utf-8
2
+ #
3
+ # (c) 2019 Peter Menke
4
+ #
5
+ # This file is part of pils
6
+ # ("Programming in linguistic seminars").
7
+ #
8
+ # pils is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # pils is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with pils. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ module Pils
22
+ module Tcf
23
+
24
# An annotation that marks the tokens it covers as a named entity.
# The token-collecting behaviour is inherited from Pils::Tcf::Annotation;
# this subclass only adds the entity category.
class NamedEntityAnnotation < Pils::Tcf::Annotation
  # Category (class label) of the named entity.
  attr_accessor :category
end
29
+ end
30
+
31
+ end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+ #
3
+ # (c) 2019 Peter Menke
4
+ #
5
+ # This file is part of pils
6
+ # ("Programming in linguistic seminars").
7
+ #
8
+ # pils is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # pils is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with pils. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ module Pils
22
+ module Tcf
23
+
24
# A sentence inside a TCF document: an ordered list of tokens plus
# optional links to the neighbouring sentences.
class Sentence < BoundedElement
  # Tokens belonging to this sentence, in document order.
  attr_accessor :tokens

  # Neighbouring sentences; both stay nil until they are assigned.
  attr_accessor :previous_sentence, :next_sentence

  # tcf_document:: the document object this sentence belongs to
  # xml_element::  the XML element this sentence was created from
  def initialize(tcf_document, xml_element)
    @tcf_document = tcf_document
    @xml_element = xml_element
    @tokens = []
    @previous_sentence = nil
    @next_sentence = nil
  end

  # Number of tokens in this sentence.
  def token_length
    @tokens.size
  end

  # Sum of the lengths of all tokens in this sentence.
  # NOTE(review): relies on every token responding to +length+; Token does
  # not define it itself, so it presumably comes from BoundedElement -- confirm.
  def character_length
    @tokens.sum { |token| token.length }
  end
end
46
+ end
47
+ end
@@ -0,0 +1,296 @@
1
+ # encoding: utf-8
2
+ #
3
+ # (c) 2019 Peter Menke
4
+ #
5
+ # This file is part of pils
6
+ # ("Programming in linguistic seminars").
7
+ #
8
+ # pils is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # pils is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with pils. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ module Pils
22
+ module Tcf
23
+
24
# In-memory representation of a TCF (Text Corpus Format) document.
#
# The XML is parsed with Nokogiri and converted into Ruby objects:
# tokens, sentences, named-entity annotations, geo annotations and a
# map of dependency relations.
class TcfDocument
  # Namespace binding used by all text-corpus XPath queries.
  TEXTCORPUS_NS = { 'tc' => 'http://www.dspin.de/data/textcorpus' }.freeze

  # tokens::          all Token objects in document order
  # sentences::       all Sentence objects in document order
  # named_entities::  all NamedEntityAnnotation objects
  # geo_annotations:: all GeoAnnotation objects
  # dependency_map::  Hash of [dependent_token, governor_token] => function label
  # id_map::          Hash of XML token ID => Token
  # token_map::       Hash of Token => XML token ID
  attr_reader :tokens, :sentences, :named_entities, :geo_annotations,
              :dependency_map, :id_map, :token_map

  # Parses +io+ and builds all Ruby structures.
  #
  # io::   anything Nokogiri::XML accepts (IO or String)
  # opts:: optional settings; +:primary_text+ is the text against which
  #        character offsets are calculated when tokens carry no
  #        start/end attributes (defaults to the document's own text).
  def initialize(io, opts = {})
    @doc = Nokogiri::XML(io)
    @tokens = []
    @sentences = []
    @named_entities = []
    @geo_annotations = []
    @id_map = {}
    @token_map = {}
    @dependency_map = {}

    process_tokens
    # Offsets are only computed when at least one token lacks boundaries.
    # A missing :primary_text yields nil, which selects the document text.
    calculate_character_offsets(opts[:primary_text]) unless @tokens.all?(&:boundaries?)

    process_sentences
    process_pos
    process_lemma
    process_named_entities
    process_geo_annotations
    process_dependencies
  end

  # Assigns [start, end) character boundaries to every token by searching
  # for its form in the primary text, scanning left to right.
  #
  # primary_text:: text to search in; the document's own text when nil.
  #
  # Tokens without a form, and tokens whose form cannot be found after
  # the current position, are left untouched.
  def calculate_character_offsets(primary_text = nil)
    effective_primary_text = primary_text.nil? ? text : primary_text
    cursor = 0
    tokens.each do |token|
      form = token.form
      # TODO: decide what to do when a token has no form.
      next if form.nil?

      start_index = effective_primary_text.index(form, cursor)
      # TODO: decide what to do when the token cannot be found.
      next unless start_index

      end_index = start_index + form.length
      token.boundaries = [start_index, end_index]
      cursor = end_index
    end
  end

  # The string form of the first text node below tc:text (memoized).
  # NOTE(review): +to_s+ on an XML text node may return the serialized
  # (entity-escaped) form rather than the plain content -- confirm that
  # this matches the unescaped token forms used for offset calculation.
  def text
    @text ||= @doc.xpath('.//tc:text/text()', 'wl' => 'http://www.dspin.de/data', 'tc' => 'http://www.dspin.de/data/textcorpus').first.to_s
  end

  # Appends a named-entity annotation to the document.
  def store_named_entity(named_entity_object)
    @named_entities << named_entity_object
  end

  # XML elements of all sentences (memoized).
  def xml_sentences
    @xml_sentences ||= @doc.xpath('//tc:sentences/tc:sentence', TEXTCORPUS_NS)
  end

  # XML elements of all tokens (memoized).
  def xml_tokens
    @xml_tokens ||= @doc.xpath('//tc:tokens/tc:token', TEXTCORPUS_NS)
  end

  # XML elements of all named entities (memoized).
  def xml_named_entities
    @xml_named_entities ||= @doc.xpath('//tc:namedEntities/tc:entity', TEXTCORPUS_NS)
  end

  # XML elements of all geo points (memoized).
  def xml_geo_annotations
    @xml_geo_annotations ||= @doc.xpath('//tc:geo/tc:gpoint', TEXTCORPUS_NS)
  end

  # XML elements of all dependency relations (memoized).
  def xml_dependencies
    @xml_dependencies ||= @doc.xpath('//tc:depparsing/tc:parse/tc:dependency', TEXTCORPUS_NS)
  end

  # TODO: add deep support for sentences and related tokens.

  # Builds a Token for +xml_token+. Boundaries are taken from the
  # element's start/end attributes when both are present.
  def new_token(doc, xml_token)
    token_object = Pils::Tcf::Token.new(doc, xml_token)
    if xml_token.has_attribute?('start') && xml_token.has_attribute?('end')
      token_object.boundaries = [xml_token['start'].to_i, xml_token['end'].to_i]
    end
    token_object
  end

  # Registers a token in the token list and in both lookup maps.
  def store_token(token_object, xml_token)
    @tokens << token_object
    @id_map[xml_token['ID']] = token_object
    @token_map[token_object] = xml_token['ID']
  end

  # Builds a Sentence for +xml_sentence+ and attaches every known token
  # listed in its tokenIDs attribute. Unknown IDs are skipped; a missing
  # attribute yields a sentence without tokens.
  def new_sentence(doc, xml_sentence)
    sentence_object = Pils::Tcf::Sentence.new(doc, xml_sentence)
    sentence_object.tokens.concat(tokens_for_ids(xml_sentence['tokenIDs']))
    sentence_object
  end

  # Registers a sentence in the sentence list.
  # TODO: also maintain an ID map as soon as one is needed.
  def store_sentence(sentence_object, xml_sentence)
    @sentences << sentence_object
  end

  # Token registered under +xml_id+, or nil.
  def token_for_id(xml_id)
    @id_map[xml_id]
  end

  # XML ID under which +token+ was registered, or nil.
  def id_for_token(token)
    @token_map[token]
  end

  # Sets previous_token / next_token on every token according to
  # document order (nil at either end).
  def link_tokens!
    @tokens.each_with_index do |token, i|
      token.previous_token = i.zero? ? nil : @tokens[i - 1]
      token.next_token = @tokens[i + 1] # nil past the last element
    end
  end

  # Sets previous_sentence / next_sentence on every sentence according
  # to document order (nil at either end).
  def link_sentences!
    @sentences.each_with_index do |sentence, i|
      sentence.previous_sentence = i.zero? ? nil : @sentences[i - 1]
      sentence.next_sentence = @sentences[i + 1] # nil past the last element
    end
  end

  private

  # Resolves a whitespace-separated list of token IDs to the known Token
  # objects. A nil list and unknown IDs are tolerated and skipped.
  def tokens_for_ids(id_list)
    id_list.to_s.split.map { |id| @id_map[id] }.compact
  end

  # Creates and registers a Token for every token element.
  def process_tokens
    xml_tokens.each do |xml_token|
      store_token(new_token(@doc, xml_token), xml_token)
    end
  end

  # Creates and registers a Sentence for every sentence element.
  def process_sentences
    xml_sentences.each do |xml_sentence|
      store_sentence(new_sentence(@doc, xml_sentence), xml_sentence)
    end
  end

  # Copies part-of-speech tags onto the tokens they reference. A tag
  # may reference several tokens; each of them receives the tag.
  def process_pos
    @doc.xpath('//tc:POStags/tc:tag', TEXTCORPUS_NS).each do |tag|
      value = tag.text
      next unless value

      tokens_for_ids(tag['tokenIDs']).each { |token| token.pos = value }
    end
  end

  # Copies lemmas onto the tokens they reference. A lemma may reference
  # several tokens; each of them receives the lemma.
  def process_lemma
    @doc.xpath('//tc:lemmas/tc:lemma', TEXTCORPUS_NS).each do |lemma|
      value = lemma.text
      next unless value

      tokens_for_ids(lemma['tokenIDs']).each { |token| token.lemma = value }
    end
  end

  # Builds a NamedEntityAnnotation for every entity element.
  def process_named_entities
    xml_named_entities.each do |entity|
      annotation = Pils::Tcf::NamedEntityAnnotation.new(@doc)
      annotation.category = entity['class']
      tokens_for_ids(entity['tokenIDs']).each { |token| annotation << token }
      store_named_entity(annotation)
    end
  end

  # Builds a GeoAnnotation for every gpoint element. Missing numeric
  # attributes become 0.0 because nil.to_f is 0.0.
  def process_geo_annotations
    xml_geo_annotations.each do |point|
      geo = Pils::Tcf::GeoAnnotation.new(@doc)
      geo.lat = point['lat'].to_f
      geo.lon = point['lon'].to_f
      geo.alt = point['alt'].to_f
      geo.continent = point['continent']
      tokens_for_ids(point['tokenIDs']).each { |token| geo << token }
      @geo_annotations << geo
    end
  end

  # Fills the dependency map from elements such as
  #   <tc:dependency depIDs="t_4" func="ROOT"/>
  #   <tc:dependency govIDs="t_4" depIDs="t_2" func="aux"/>
  # Root entries (those without govIDs) are currently not recorded.
  def process_dependencies
    xml_dependencies.each do |dep|
      next unless dep.has_attribute?('govIDs')

      dependent = token_for_id(dep['depIDs'])
      governor = token_for_id(dep['govIDs'])
      @dependency_map[[dependent, governor]] = dep['func']
    end
  end
end
293
+
294
+ end
295
+ end
296
+
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+ #
3
+ # (c) 2019 Peter Menke
4
+ #
5
+ # This file is part of pils
6
+ # ("Programming in linguistic seminars").
7
+ #
8
+ # pils is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # pils is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with pils. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ module Pils
22
+ module Tcf
23
+
24
# A single token of a TCF document, optionally enriched with a
# part-of-speech tag, a lemma and links to the neighbouring tokens.
class Token < BoundedElement
  # Part-of-speech tag and lemma; nil until assigned.
  attr_accessor :pos, :lemma

  # Neighbouring tokens; nil until assigned.
  attr_accessor :previous_token, :next_token

  # tcf_document:: the document object this token belongs to
  # xml_element::  the XML element this token was created from
  def initialize(tcf_document, xml_element)
    @tcf_document = tcf_document
    @xml_element = xml_element
    @pos = nil
    @lemma = nil
    @previous_token = nil
    @next_token = nil
  end

  # The surface form of the token: the text of its XML element with HTML
  # entities unescaped. The result is memoized.
  # NOTE(review): if the parser's +text+ already decodes entities, this
  # unescapes a second time -- confirm this is intended.
  def form
    @form ||= CGI.unescapeHTML(@xml_element.text)
  end

  # True when a part-of-speech tag has been assigned.
  def pos?
    !pos.nil?
  end

  # True when a lemma has been assigned.
  def lemma?
    !lemma.nil?
  end
end
50
+
51
+ end
52
+ end