pils 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/pils +3 -0
- data/lib/pils.rb +64 -0
- data/lib/pils/de.rb +24 -0
- data/lib/pils/de/skeleton.rb +207 -0
- data/lib/pils/de/small.rb +128 -0
- data/lib/pils/parsing.rb +31 -0
- data/lib/pils/parsing/cat.rb +62 -0
- data/lib/pils/parsing/grammar.rb +47 -0
- data/lib/pils/parsing/lexicon.rb +100 -0
- data/lib/pils/parsing/parser.rb +310 -0
- data/lib/pils/parsing/rule.rb +43 -0
- data/lib/pils/parsing/tree.rb +147 -0
- data/lib/pils/parsing/wordform.rb +44 -0
- data/lib/pils/structures.rb +7 -0
- data/lib/pils/structures/avm.rb +98 -0
- data/lib/pils/tcf.rb +37 -0
- data/lib/pils/tcf/annotation.rb +42 -0
- data/lib/pils/tcf/bounded_element.rb +46 -0
- data/lib/pils/tcf/geo_annotation.rb +29 -0
- data/lib/pils/tcf/named_entity_annotation.rb +31 -0
- data/lib/pils/tcf/sentence.rb +47 -0
- data/lib/pils/tcf/tcf_document.rb +296 -0
- data/lib/pils/tcf/token.rb +52 -0
- data/lib/pils/tcf/transform/transformer.rb +468 -0
- data/lib/pils/version.rb +3 -0
- data/pils-0.1.2.gem +0 -0
- data/pils.gemspec +41 -0
- data/tasks/testing.rake +23 -0
- metadata +128 -0
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# (c) 2019 Peter Menke
|
4
|
+
#
|
5
|
+
# This file is part of pils
|
6
|
+
# ("Programming in linguistic seminars").
|
7
|
+
#
|
8
|
+
# pils is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# pils is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with pils. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
module Pils
  module Tcf

    # Annotation that attaches geographic data to the elements it covers.
    #
    # The class adds plain read/write attributes on top of
    # Pils::Tcf::Annotation; it performs no validation or conversion itself.
    class GeoAnnotation < Pils::Tcf::Annotation

      # Latitude of the annotated location.
      attr_accessor :lat

      # Longitude of the annotated location.
      attr_accessor :lon

      # Altitude of the annotated location.
      attr_accessor :alt

      # Continent value of the annotated location.
      attr_accessor :continent

    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# (c) 2019 Peter Menke
|
4
|
+
#
|
5
|
+
# This file is part of pils
|
6
|
+
# ("Programming in linguistic seminars").
|
7
|
+
#
|
8
|
+
# pils is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# pils is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with pils. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
module Pils
  module Tcf

    # Annotation that marks the elements it covers as a named entity.
    #
    # Apart from what Pils::Tcf::Annotation provides, it only carries the
    # entity category as a plain read/write attribute.
    class NamedEntityAnnotation < Pils::Tcf::Annotation

      # Category (class label) of the named entity.
      attr_accessor :category

    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# (c) 2019 Peter Menke
|
4
|
+
#
|
5
|
+
# This file is part of pils
|
6
|
+
# ("Programming in linguistic seminars").
|
7
|
+
#
|
8
|
+
# pils is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# pils is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with pils. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
module Pils
  module Tcf

    # A sentence of a TCF document: an ordered list of tokens plus links to
    # the neighbouring sentences.
    class Sentence < BoundedElement

      # Tokens that make up this sentence, in document order.
      attr_accessor :tokens

      # Neighbouring sentences; both stay nil until they are assigned from
      # outside (e.g. by TcfDocument#link_sentences!).
      attr_accessor :previous_sentence, :next_sentence

      # @param tcf_document the document this sentence belongs to
      # @param xml_element  the XML element this sentence was built from
      def initialize(tcf_document, xml_element)
        @tcf_document = tcf_document
        @xml_element = xml_element
        @tokens = []
        @previous_sentence = nil
        @next_sentence = nil
      end

      # @return [Integer] number of tokens in this sentence
      def token_length
        @tokens.size
      end

      # @return the sum of +length+ over all tokens (0 for an empty sentence)
      # NOTE(review): Token#length is not defined in this file; presumably it
      # comes from BoundedElement - confirm.
      def character_length
        @tokens.sum { |token| token.length }
      end

    end

  end
end
|
@@ -0,0 +1,296 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# (c) 2019 Peter Menke
|
4
|
+
#
|
5
|
+
# This file is part of pils
|
6
|
+
# ("Programming in linguistic seminars").
|
7
|
+
#
|
8
|
+
# pils is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# pils is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with pils. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
module Pils
  module Tcf

    # Object view of a TCF XML document.
    #
    # On construction the XML is parsed with Nokogiri and turned into Ruby
    # objects: tokens, sentences, named-entity annotations, geo annotations
    # and a map of dependency relations. POS tags and lemmas are written
    # onto the tokens they reference.
    class TcfDocument

      # Namespace binding shared by every XPath query in this class.
      TC_NAMESPACE = { 'tc' => 'http://www.dspin.de/data/textcorpus' }.freeze

      # Token objects in document order.
      attr_reader :tokens

      # Sentence objects in document order.
      attr_reader :sentences

      # NamedEntityAnnotation objects in document order.
      attr_reader :named_entities

      # GeoAnnotation objects in document order.
      attr_reader :geo_annotations

      # Hash mapping XML token IDs to token objects.
      attr_reader :id_map

      # Hash mapping token objects to their XML token IDs.
      attr_reader :token_map

      # Hash mapping [dependent_token, governor_token] to the dependency
      # function label.
      attr_reader :dependency_map

      # @param io   XML source handed to Nokogiri::XML
      # @param opts [Hash] optional settings; +:primary_text+ supplies the
      #   text used to compute character offsets when the tokens do not
      #   all carry boundaries (defaults to the document's own text)
      def initialize(io, opts = {})
        @doc = Nokogiri::XML(io)
        # TODO add a method that reads the XML into Ruby structures
        @tokens = []
        @sentences = []
        @named_entities = []
        @geo_annotations = []
        @id_map = {}
        @token_map = {}
        @dependency_map = {}

        process_tokens
        # Offsets are only computed when at least one token lacks them.
        # A missing :primary_text yields nil, which falls back to #text.
        calculate_character_offsets(opts[:primary_text]) unless @tokens.all? { |t| t.boundaries? }

        process_sentences
        process_pos
        process_lemma
        process_named_entities
        process_geo_annotations
        process_dependencies
      end

      # Assigns [start, end] character boundaries to every token by
      # searching for each token form in the text, left to right. The
      # search for a token starts where the previous match ended.
      #
      # Tokens without a form, and tokens whose form cannot be found, are
      # left untouched.
      #
      # @param primary_text [String, nil] text to search; nil means #text
      def calculate_character_offsets(primary_text = nil)
        source = primary_text.nil? ? text : primary_text
        cursor = 0
        tokens.each do |token|
          form = token.form
          # TODO Problem: what to do when token has no form?
          next if form.nil?

          start = source.index(form, cursor)
          # TODO: what to do when the token cannot be found?
          next unless start

          cursor = start + form.length
          token.boundaries = [start, cursor]
        end
      end

      # @return [String] content of the first text node below a tc:text
      #   element, or "" when there is none (memoized)
      def text
        @text ||= @doc.xpath('.//tc:text/text()', TC_NAMESPACE).first.to_s
      end

      # Appends a named-entity annotation to #named_entities.
      def store_named_entity(named_entity_object)
        @named_entities << named_entity_object
      end

      # @return the tc:sentence elements of the document (memoized)
      def xml_sentences
        @xml_sentences ||= @doc.xpath('//tc:sentences/tc:sentence', TC_NAMESPACE)
      end

      # @return the tc:token elements of the document (memoized)
      def xml_tokens
        @xml_tokens ||= @doc.xpath('//tc:tokens/tc:token', TC_NAMESPACE)
      end

      # @return the tc:entity elements of the document (memoized)
      def xml_named_entities
        @xml_named_entities ||= @doc.xpath('//tc:namedEntities/tc:entity', TC_NAMESPACE)
      end

      # @return the tc:gpoint elements of the document (memoized)
      def xml_geo_annotations
        @xml_geo_annotations ||= @doc.xpath('//tc:geo/tc:gpoint', TC_NAMESPACE)
      end

      # @return the tc:dependency elements of the document (memoized)
      def xml_dependencies
        @xml_dependencies ||= @doc.xpath('//tc:depparsing/tc:parse/tc:dependency', TC_NAMESPACE)
      end

      # TODO add deep support for sentences and related tokens

      # Builds a Token for +xml_token+. When the element carries both a
      # +start+ and an +end+ attribute they become the token's boundaries.
      #
      # @return [Pils::Tcf::Token]
      def new_token(doc, xml_token)
        token = Pils::Tcf::Token.new(doc, xml_token)
        if xml_token.has_attribute?('start') && xml_token.has_attribute?('end')
          token.boundaries = [xml_token['start'].to_i, xml_token['end'].to_i]
        end
        token
      end

      # Registers a token in #tokens and in both ID lookup maps.
      def store_token(token_object, xml_token)
        @tokens << token_object
        @id_map[xml_token['ID']] = token_object
        @token_map[token_object] = xml_token['ID']
      end

      # Builds a Sentence for +xml_sentence+ and fills it with the tokens
      # listed in its +tokenIDs+ attribute. Unknown IDs are skipped; a
      # missing attribute yields a sentence without tokens.
      #
      # @return [Pils::Tcf::Sentence]
      def new_sentence(doc, xml_sentence)
        sentence = Pils::Tcf::Sentence.new(doc, xml_sentence)
        sentence.tokens.concat(resolve_tokens(xml_sentence['tokenIDs']))
        sentence
      end

      # Appends a sentence to #sentences. +xml_sentence+ is currently
      # unused.
      def store_sentence(sentence_object, xml_sentence)
        @sentences << sentence_object
        #TODO as soon as we need such a map.
      end

      # @return the token registered under +xml_id+, or nil
      def token_for_id(xml_id)
        @id_map[xml_id]
      end

      # @return the XML ID registered for +token+, or nil
      def id_for_token(token)
        @token_map[token]
      end

      # Sets previous_token / next_token on every token according to its
      # position in #tokens (nil at either end).
      def link_tokens!
        @tokens.each_with_index do |token, index|
          token.previous_token = index.zero? ? nil : @tokens[index - 1]
          token.next_token = @tokens[index + 1]
        end
      end

      # Sets previous_sentence / next_sentence on every sentence according
      # to its position in #sentences (nil at either end).
      def link_sentences!
        @sentences.each_with_index do |sentence, index|
          sentence.previous_sentence = index.zero? ? nil : @sentences[index - 1]
          sentence.next_sentence = @sentences[index + 1]
        end
      end

      private

      # Turns a whitespace-separated ID list into the matching token
      # objects. nil input and unknown IDs are tolerated and contribute
      # nothing, so callers never receive nil entries.
      def resolve_tokens(id_list)
        id_list.to_s.split.map { |id| @id_map[id] }.compact
      end

      # Yields (token, node text) for every token referenced by the
      # +tokenIDs+ attribute of each node matching +xpath+.
      def each_token_value(xpath)
        @doc.xpath(xpath, TC_NAMESPACE).each do |node|
          value = node.text
          next unless value

          resolve_tokens(node['tokenIDs']).each { |token| yield token, value }
        end
      end

      # Creates and stores one Token per tc:token element.
      def process_tokens
        xml_tokens.each do |xml_token|
          store_token(new_token(@doc, xml_token), xml_token)
        end
      end

      # Creates and stores one Sentence per tc:sentence element.
      def process_sentences
        xml_sentences.each do |xml_sentence|
          store_sentence(new_sentence(@doc, xml_sentence), xml_sentence)
        end
      end

      # Writes each POS tag onto the token(s) it references.
      def process_pos
        each_token_value('//tc:POStags/tc:tag') { |token, tag| token.pos = tag }
      end

      # Writes each lemma onto the token(s) it references.
      def process_lemma
        each_token_value('//tc:lemmas/tc:lemma') { |token, lemma| token.lemma = lemma }
      end

      # Builds one NamedEntityAnnotation per tc:entity element, covering
      # the tokens that could be resolved from its +tokenIDs+.
      def process_named_entities
        xml_named_entities.each do |entity|
          annotation = Pils::Tcf::NamedEntityAnnotation.new(@doc)
          annotation.category = entity['class']
          resolve_tokens(entity['tokenIDs']).each { |token| annotation << token }
          @named_entities << annotation
        end
      end

      # Builds one GeoAnnotation per tc:gpoint element, covering the tokens
      # that could be resolved from its +tokenIDs+. Missing numeric
      # attributes become 0.0 because nil.to_f is 0.0.
      def process_geo_annotations
        xml_geo_annotations.each do |point|
          geo = Pils::Tcf::GeoAnnotation.new(@doc)
          geo.lat = point['lat'].to_f
          geo.lon = point['lon'].to_f
          geo.alt = point['alt'].to_f
          geo.continent = point['continent']
          resolve_tokens(point['tokenIDs']).each { |token| geo << token }
          @geo_annotations << geo
        end
      end

      # Fills #dependency_map from the tc:dependency elements, e.g.
      #   <tc:dependency depIDs="t_4" func="ROOT"/>
      #   <tc:dependency govIDs="t_4" depIDs="t_2" func="aux"/>
      # Elements without +govIDs+ (root entries) are not recorded.
      def process_dependencies
        xml_dependencies.each do |dependency|
          next unless dependency.has_attribute?('govIDs')

          dependent = token_for_id(dependency['depIDs'])
          governor = token_for_id(dependency['govIDs'])
          @dependency_map[[dependent, governor]] = dependency['func']
        end
      end

    end

  end
end
|
296
|
+
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
# (c) 2019 Peter Menke
|
4
|
+
#
|
5
|
+
# This file is part of pils
|
6
|
+
# ("Programming in linguistic seminars").
|
7
|
+
#
|
8
|
+
# pils is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# pils is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with pils. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
module Pils
  module Tcf

    # A single token of a TCF document, optionally enriched with a POS tag,
    # a lemma and links to the neighbouring tokens.
    class Token < BoundedElement

      # Part-of-speech tag and lemma; nil until assigned.
      attr_accessor :pos, :lemma

      # Neighbouring tokens; nil until assigned from outside
      # (e.g. by TcfDocument#link_tokens!).
      attr_accessor :previous_token, :next_token

      # @param tcf_document the document this token belongs to
      # @param xml_element  the XML element this token was built from
      def initialize(tcf_document, xml_element)
        @tcf_document = tcf_document
        @xml_element = xml_element
        @pos = nil
        @lemma = nil
        @previous_token = nil
        @next_token = nil
      end

      # Surface form of the token: the text of the XML element with HTML
      # entities unescaped (memoized).
      # NOTE(review): if the XML parser already decodes entities in +text+,
      # this unescapes a second time - confirm that is intended.
      def form
        @form ||= CGI.unescapeHTML(@xml_element.text)
      end

      # @return [Boolean] true when a POS tag has been assigned
      def pos?
        !pos.nil?
      end

      # @return [Boolean] true when a lemma has been assigned
      def lemma?
        !lemma.nil?
      end

    end

  end
end
|