pils 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,468 @@
1
+ # encoding: utf-8
2
+ #
3
+ # (c) 2019 Peter Menke
4
+ #
5
+ # This file is part of pils
6
+ # ("Programming in linguistic seminars").
7
+ #
8
+ # pils is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # pils is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with pils. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ module Pils
22
+ module Tcf
23
+ module Transform
24
+
25
+ class Transformer
26
+
27
+
28
# Maps a part-of-speech tag to a form that can be used as an identifier.
#
# Tags that contain at least one word character are returned untouched.
# Pure punctuation tags are looked up in a fixed table and replaced by a
# descriptive name; punctuation tags missing from the table are returned
# unchanged.
#
# @param pos [String] the part-of-speech tag
# @return [String] the tag itself or its descriptive replacement
def self.encode_pos(pos)
  # Anything with a word character in it is passed through as-is.
  return pos if pos =~ /\w+/

  punctuation_names = {
    ',' => '_COMMA',
    '.' => '_FULLSTOP',
    ':' => '_COLON',
    '(' => '_LPAREN',
    ')' => '_RPAREN',
    '``' => 'QUOT',
  }

  # Unknown punctuation falls back to the original tag.
  punctuation_names.fetch(pos, pos)
end
44
+
45
# Creates a transformer for one TCF document.
#
# @param tcf_doc the TCF document to transform (stored as given)
# @param rdf_opts [Hash] options; the keys read here are :base_uri,
#   :tagger_label and :tagger_uri
def initialize(tcf_doc, rdf_opts)
  @tcf_doc = tcf_doc
  @rdf_opts = rdf_opts

  # @uri_base is only assigned when a base URI was actually supplied.
  @uri_base = rdf_opts[:base_uri] if rdf_opts.key?(:base_uri)

  # The tagger settings are always assigned and default to nil.
  @tagger_label = rdf_opts.key?(:tagger_label) ? rdf_opts[:tagger_label] : nil
  @tagger_uri = rdf_opts.key?(:tagger_uri) ? rdf_opts[:tagger_uri] : nil
end
63
+
64
+
65
# Builds a URI whose fragment is a "char=from,to" character range.
#
# @param base the part of the URI before the '#'
# @param from start offset of the range
# @param to end offset of the range (callers in this class also pass '')
# @return the value produced by RDF::URI for "<base>#char=<from>,<to>"
def char_uri(base, from, to)
  fragment = "char=#{from},#{to}"
  RDF::URI("#{base}##{fragment}")
end
68
+
69
# Builds a URI from a base and a fragment identifier.
#
# @param base the part of the URI before the '#'
# @param suffix the fragment placed after the '#'
# @return the value produced by RDF::URI for "<base>#<suffix>"
def twopart_uri(base, suffix)
  joined = format('%s#%s', base, suffix)
  RDF::URI(joined)
end
72
+
73
# Runs the transformation selected by +mode+.
#
# @param mode [Symbol] :plain (default), :noprov or :modularized
# @return the graph built by the selected transformation, or nil when
#   +mode+ is none of the three known values
def transform(mode = :plain)
  case mode
  when :plain then transform_plain
  when :noprov then transform_noprov
  when :modularized then transform_modularized
  end
end
78
+
79
# Base URI used for every resource generated by this transformer.
#
# @return the configured base URI, or a fixed example.org placeholder
#   when none was configured
def uri_base
  return @uri_base if @uri_base

  'http://example.org/tcf2nif/example.txt'
end
82
+
83
# URI identifying the text converter; a fixed, hard-coded handle.
#
# @return the value produced by RDF::URI for that handle
def text_converter_uri
  handle = 'http://hdl.handle.net/11858/00-1778-0000-0004-BA56-7'
  RDF::URI(handle)
end
86
+
87
# URI identifying the tokenizer; a fixed, hard-coded handle (the same
# handle that #text_converter_uri returns).
#
# TODO: make it possible to use a custom tokenizer.
#
# @return the value produced by RDF::URI for that handle
def tokenizer_uri
  handle = 'http://hdl.handle.net/11858/00-1778-0000-0004-BA56-7'
  RDF::URI(handle)
end
92
+
93
# URI identifying the part-of-speech tagger.
#
# Uses @tagger_uri when it is set; otherwise falls back to a fixed,
# hard-coded handle.
#
# @return the value produced by RDF::URI for the chosen address
def pos_tagger_uri
  RDF::URI(@tagger_uri || 'http://hdl.handle.net/11858/00-247C-0000-0007-3739-5')
end
101
+
102
# URI of the tokenization activity: the base URI with the fragment
# "TokenizationActivity".
def tokenization_activity_uri
  fragment = 'TokenizationActivity'
  twopart_uri(uri_base, fragment)
end
105
+
106
# URI of the POS tagging activity.
#
# The fragment is "PosTaggingActivity", with @tagger_label appended when
# a label is set, so that activities of differently labelled taggers get
# different URIs.
def pos_tagging_activity_uri
  fragment = @tagger_label ? "PosTaggingActivity#{@tagger_label}" : 'PosTaggingActivity'
  twopart_uri(uri_base, fragment)
end
114
+
115
# URI of the named-entity tagging activity: the base URI with the
# fragment "NeTaggingActivity".
def ne_tagging_activity_uri
  fragment = 'NeTaggingActivity'
  twopart_uri(uri_base, fragment)
end
118
+
119
# URI of the geo tagging activity: the base URI with the fragment
# "GeoTaggingActivity".
def geo_tagging_activity_uri
  fragment = 'GeoTaggingActivity'
  twopart_uri(uri_base, fragment)
end
122
+
123
# URI of the dependency parsing activity: the base URI with the fragment
# "DependencyParsingActivity".
def dep_parsing_activity_uri
  fragment = 'DependencyParsingActivity'
  twopart_uri(uri_base, fragment)
end
126
+
127
# Timestamp attached to tokenization results.
#
# @return an xsd:dateTime literal with a fixed, hard-coded value
def tokenization_activity_time
  timestamp = '2015-07-09T14:01:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
130
+
131
# Timestamp attached to POS tagging results.
#
# @return an xsd:dateTime literal with a fixed, hard-coded value
def pos_tagging_activity_time
  timestamp = '2015-07-09T14:02:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
134
+
135
# Timestamp attached to named-entity tagging results.
#
# @return an xsd:dateTime literal with a fixed, hard-coded value
def ne_tagging_activity_time
  timestamp = '2015-07-09T14:03:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
138
+
139
# Timestamp attached to geo tagging results.
#
# @return an xsd:dateTime literal with a fixed, hard-coded value
def geo_tagging_activity_time
  timestamp = '2015-07-09T14:04:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
142
+
143
# Timestamp attached to dependency parsing results.
#
# @return an xsd:dateTime literal with a fixed, hard-coded value
def dep_parsing_activity_time
  timestamp = '2015-07-09T14:05:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
146
+
147
# Builds the NIF graph for the TCF document.
#
# The graph always contains the context (whole text), one resource per
# token, POS/lemma data for tokens that have it, and the dependencies.
#
# @param reify [Boolean] when false (default), POS, lemma and dependency
#   data are attached directly to the token resources and named-entity and
#   geo annotations are added. When true, POS/lemma/dependency data are
#   emitted as separate annotation resources (dependencies with PROV
#   triples) and the method returns BEFORE named entities and geo
#   annotations are processed.
# @return [RDF::Graph] the populated graph
def transform_noprov(reify=false)
  graph = RDF::Graph.new

  # URI of the whole document text ("char=0," -- open-ended range).
  context_uri = char_uri(uri_base, 0, '')

  # Representation of the whole primary text.
  # NOTE(review): the language tag is hard-coded to :en.
  graph << [ context_uri, RDF.type, NIF.String ]
  graph << [ context_uri, RDF.type, NIF.Context ]
  graph << [ context_uri, RDF.type, NIF.RFC5147String ]
  graph << [ context_uri, NIF.isString, RDF::Literal.new(@tcf_doc.text, lang: :en) ]
  graph << [ context_uri, NIF.beginIndex, RDF::Literal.new(0, datatype: RDF::XSD.nonNegativeInteger) ]
  graph << [ context_uri, NIF.endIndex, RDF::Literal.new(@tcf_doc.text.length, datatype: RDF::XSD.nonNegativeInteger) ]

  # URI covering the full character span of a group of tokens.
  # min_by/max_by are required here: Enumerable#min/#max with a block
  # expect a two-argument comparator, so `min { |t| t.begin_index }`
  # does not select the token with the smallest index.
  span_uri = lambda do |tokens|
    first_char = tokens.min_by(&:begin_index).begin_index
    last_char = tokens.max_by(&:end_index).end_index
    char_uri(uri_base, first_char, last_char)
  end

  # Representation of the single tokens.
  @tcf_doc.tokens.each_with_index do |token, i|
    token_uri = char_uri(uri_base, token.begin_index, token.end_index)
    graph << [ token_uri, NIF.referenceContext, context_uri ]
    graph << [ token_uri, RDF.type, NIF.String ]
    graph << [ token_uri, RDF.type, NIF.Word ]
    graph << [ token_uri, RDF.type, NIF.RFC5147String ]
    graph << [ token_uri, NIF.beginIndex, RDF::Literal.new(token.begin_index, datatype: RDF::XSD.nonNegativeInteger) ]
    graph << [ token_uri, NIF.endIndex, RDF::Literal.new(token.end_index, datatype: RDF::XSD.nonNegativeInteger) ]
    graph << [ token_uri, NIF.anchorOf, RDF::Literal.new(token.form, datatype: RDF::XSD.string) ]

    # Link to the neighbouring tokens where they exist.
    if token.previous_token
      graph << [ token_uri, NIF.previousWord, char_uri(uri_base, token.previous_token.begin_index, token.previous_token.end_index) ]
    end
    if token.next_token
      graph << [ token_uri, NIF.nextWord, char_uri(uri_base, token.next_token.begin_index, token.next_token.end_index) ]
    end

    # POS and lemma triples are produced by the nif_pos / nif_lemma helpers.
    nif_pos(token, i, reify).each { |trip| graph << trip } if token.pos?
    nif_lemma(token, i, reify).each { |trip| graph << trip } if token.lemma?
  end

  # Dependencies: each key of the map is a (dependent, governor) pair,
  # the value is the relation label. Annotation numbering starts at 1.
  dep_count = 0
  @tcf_doc.dependency_map.each do |key, value|
    dep = key.first
    gov = key.last
    dep_count += 1
    dep_uri = char_uri(uri_base, dep.begin_index, dep.end_index)
    gov_uri = char_uri(uri_base, gov.begin_index, gov.end_index)
    if reify
      anno_uri = twopart_uri(uri_base, "Dep#{dep_count}")
      graph << [dep_uri, NIF.annotation, anno_uri]
      graph << [anno_uri, NIF.dependency, gov_uri]
      graph << [anno_uri, NIF.dependencyRelationType, RDF::Literal.new(value)]
      graph << [anno_uri, PROV.wasGeneratedBy, dep_parsing_activity_uri]
      graph << [anno_uri, PROV.wasDerivedFrom, dep_uri]
      graph << [anno_uri, PROV.wasDerivedFrom, gov_uri]
      graph << [anno_uri, PROV.generatedAtTime, dep_parsing_activity_time]
    else
      graph << [dep_uri, NIF.dependency, gov_uri]
      graph << [dep_uri, NIF.dependencyRelationType, RDF::Literal.new(value)]
    end
  end

  # NOTE(review): in reify mode the NE category and the geo coordinates
  # below are never added to the graph -- confirm this is intended.
  return graph if reify

  # Named entities: the annotation is attached to the string spanning all
  # tokens of the entity (for a single token this is the token itself).
  @tcf_doc.named_entities.each_with_index do |ne, i|
    current_uri = span_uri.call(ne.tokens)
    anno_uri = twopart_uri(uri_base, "ne#{i}")
    graph << [current_uri, NIF::annotation, anno_uri]
    graph << [anno_uri, RDF.type, NIF.String]
    graph << [anno_uri, NIF.taNerdCoreClassRef, NERD[ne.category.capitalize] ]
  end

  # Geolocations, attached to the string spanning the annotated tokens.
  @tcf_doc.geo_annotations.each_with_index do |geo, i|
    current_uri = span_uri.call(geo.tokens)
    graph << [current_uri, RDF.type, NIF.String]
    anno_uri = twopart_uri(uri_base, "geo#{i}")

    graph << [current_uri, NIF::annotation, anno_uri]
    graph << [anno_uri, Pils::Tcf::GEO.lat, geo.lat]
    graph << [anno_uri, Pils::Tcf::GEO.long, geo.lon]
    graph << [anno_uri, Pils::Tcf::GEO.alt, geo.alt]
    graph << [anno_uri, RDF::URI('http://example.org/tcf2nif/continent'), geo.continent]
  end

  graph
end
259
+
260
# Builds the reified NIF graph and adds provenance (PROV) triples.
#
# Starts from transform_noprov(true) and then adds, in this order:
# provenance for every token, then for every named-entity annotation,
# then for every geo annotation. For NE and geo annotations each covered
# token is additionally linked to the annotation via NIF.annotation.
#
# @return the graph returned by transform_noprov(true), extended in place
def transform_plain
  graph = transform_noprov(true)
  text_uri = char_uri(uri_base, 0, '')
  prov = Pils::Tcf::PROV

  # Every token was generated by the tokenization step from the text.
  @tcf_doc.tokens.each do |token|
    token_uri = char_uri(uri_base, token.begin_index, token.end_index)
    graph << [token_uri, prov.wasGeneratedBy, tokenization_activity_uri]
    graph << [token_uri, prov.wasDerivedFrom, text_uri]
    graph << [token_uri, prov.generatedAtTime, tokenization_activity_time]
  end

  # Named entities and geo annotations follow the same pattern; only the
  # source collection, the URI prefix and the activity helpers differ.
  [
    [:named_entities, 'ne', :ne_tagging_activity_uri, :ne_tagging_activity_time],
    [:geo_annotations, 'geo', :geo_tagging_activity_uri, :geo_tagging_activity_time]
  ].each do |collection, prefix, activity_uri, activity_time|
    @tcf_doc.public_send(collection).each_with_index do |annotation, idx|
      anno_uri = twopart_uri(uri_base, "#{prefix}#{idx}")
      graph << [anno_uri, prov.wasGeneratedBy, send(activity_uri)]
      annotation.tokens.each do |tok|
        graph << [anno_uri, prov.wasDerivedFrom, char_uri(uri_base, tok.begin_index, tok.end_index)]
        graph << [char_uri(uri_base, tok.begin_index, tok.end_index), NIF.annotation, anno_uri]
      end
      graph << [anno_uri, prov.generatedAtTime, send(activity_time)]
    end
  end

  graph
end
311
+
312
# Builds a NIF graph in which every resource is assigned to a module
# (primary text, tokenization, POS, lemma, NE, geo, dependency) through
# MOND.belongsToModule. No PROV triples are emitted.
#
# @return [RDF::Graph] the populated graph
def transform_modularized()
  graph = RDF::Graph.new

  # URI of the whole document text ("char=0," -- open-ended range).
  context_uri = char_uri(uri_base, 0, '')

  # Module URIs.
  # TODO: make this configurable; custom URIs and custom metadata are
  # needed here.
  pri_module_uri = twopart_uri(uri_base, 'PrimaryTextModule')
  tok_module_uri = twopart_uri(uri_base, 'TokenizationModule')
  pos_module_uri = twopart_uri(uri_base, 'PosModule')
  lem_module_uri = twopart_uri(uri_base, 'LemmaModule')
  ner_module_uri = twopart_uri(uri_base, 'NeModule')
  geo_module_uri = twopart_uri(uri_base, 'GeoModule')
  dep_module_uri = twopart_uri(uri_base, 'DependencyModule')

  # NOTE(review): only these four modules are declared as MOND.Module;
  # the NE, geo and dependency modules are referenced below but never
  # typed -- confirm whether that is intended.
  module_uris = [pri_module_uri, tok_module_uri, pos_module_uri, lem_module_uri]

  module_uris.each do |u|
    graph << [u, RDF.type, MOND.Module ]
    # NOTE(review): uri_base is passed as returned by #uri_base, not
    # wrapped in RDF::URI -- verify how the graph interprets it.
    graph << [u, MOND.belongsToDocument, uri_base ]
  end

  # Token types are declared once on the module instead of per token.
  graph << [ tok_module_uri, MOND.propagateType, NIF.String ]
  graph << [ tok_module_uri, MOND.propagateType, NIF.Word ]
  graph << [ tok_module_uri, MOND.propagateType, NIF.RFC5147String ]

  # Representation of the whole primary text, assigned to its own module.
  # NOTE(review): the language tag is hard-coded to :en.
  graph << [ context_uri, RDF.type, NIF.String ]
  graph << [ context_uri, RDF.type, NIF.Context ]
  graph << [ context_uri, RDF.type, NIF.RFC5147String ]
  graph << [ context_uri, NIF.isString, RDF::Literal.new(@tcf_doc.text, lang: :en) ]
  graph << [ context_uri, NIF.beginIndex, RDF::Literal.new(0, datatype: RDF::XSD.nonNegativeInteger) ]
  graph << [ context_uri, NIF.endIndex, RDF::Literal.new(@tcf_doc.text.length, datatype: RDF::XSD.nonNegativeInteger) ]
  graph << [ context_uri, MOND.belongsToModule, pri_module_uri ]

  # URI covering the full character span of a group of tokens.
  # min_by/max_by are required here: Enumerable#min/#max with a block
  # expect a two-argument comparator, so `min { |t| t.begin_index }`
  # does not select the token with the smallest index.
  span_uri = lambda do |tokens|
    first_char = tokens.min_by(&:begin_index).begin_index
    last_char = tokens.max_by(&:end_index).end_index
    char_uri(uri_base, first_char, last_char)
  end

  # Tokens, with POS and lemma annotations numbered independently from 1.
  poscounter = 1
  lemcounter = 1
  @tcf_doc.tokens.each do |token|
    token_uri = char_uri(uri_base, token.begin_index, token.end_index)
    graph << [ token_uri, NIF.referenceContext, context_uri ]
    graph << [ token_uri, NIF.beginIndex, RDF::Literal.new(token.begin_index, datatype: RDF::XSD.nonNegativeInteger) ]
    graph << [ token_uri, NIF.endIndex, RDF::Literal.new(token.end_index, datatype: RDF::XSD.nonNegativeInteger) ]
    graph << [ token_uri, NIF.anchorOf, RDF::Literal.new(token.form, datatype: RDF::XSD.string) ]
    if token.pos?
      # Punctuation tags are replaced by descriptive names first.
      pos = Transformer.encode_pos(token.pos)
      pos_anno_uri = twopart_uri(uri_base, "Pos#{poscounter}")
      graph << [ token_uri, NIF.annotation, pos_anno_uri ]
      graph << [ pos_anno_uri, NIF.oliaLink, Pils::Tcf::PENN[pos] ]
      graph << [ pos_anno_uri, MOND.belongsToModule, pos_module_uri ]
      poscounter += 1
    end
    if token.lemma?
      lemma_anno_uri = twopart_uri(uri_base, "Lemma#{lemcounter}")
      graph << [ token_uri, NIF.annotation, lemma_anno_uri ]
      graph << [ lemma_anno_uri, NIF.lemma, RDF::Literal.new(token.lemma) ]
      graph << [ lemma_anno_uri, MOND.belongsToModule, lem_module_uri ]
      lemcounter += 1
    end
    graph << [ token_uri, MOND.belongsToModule, tok_module_uri ]
  end

  # Named entities: the annotation is attached to the string spanning all
  # tokens of the entity (for a single token this is the token itself).
  @tcf_doc.named_entities.each_with_index do |ne, i|
    current_uri = span_uri.call(ne.tokens)
    anno_uri = twopart_uri(uri_base, "NE#{i}")
    graph << [current_uri, NIF::annotation, anno_uri]
    graph << [anno_uri, RDF.type, NIF.String]
    graph << [anno_uri, MOND.belongsToModule, ner_module_uri ]
    graph << [anno_uri, NIF.taNerdCoreClassRef, NERD[ne.category.capitalize] ]
  end

  # Geolocations, attached to the string spanning the annotated tokens.
  @tcf_doc.geo_annotations.each_with_index do |geo, i|
    current_uri = span_uri.call(geo.tokens)
    graph << [current_uri, RDF.type, NIF.String]
    anno_uri = twopart_uri(uri_base, "Geo#{i}")

    graph << [current_uri, NIF::annotation, anno_uri]
    graph << [anno_uri, MOND.belongsToModule, geo_module_uri ]
    graph << [anno_uri, Pils::Tcf::GEO.lat, geo.lat]
    graph << [anno_uri, Pils::Tcf::GEO.long, geo.lon]
    graph << [anno_uri, Pils::Tcf::GEO.alt, geo.alt]
    graph << [anno_uri, RDF::URI('http://example.org/tcf2nif/continent'), geo.continent]
  end

  # Dependencies: each key of the map is a (dependent, governor) pair,
  # the value is the relation label. Annotation numbering starts at 1.
  d = 0
  @tcf_doc.dependency_map.each do |key, value|
    dep = key.first
    gov = key.last
    d += 1
    tok_uri = char_uri(uri_base, dep.begin_index, dep.end_index)
    anno_uri = twopart_uri(uri_base, "Dep#{d}")
    graph << [tok_uri, NIF.annotation, anno_uri]
    graph << [anno_uri, NIF.dependency, char_uri(uri_base, gov.begin_index, gov.end_index)]
    graph << [anno_uri, NIF.dependencyRelationType, RDF::Literal.new(value)]
    graph << [anno_uri, MOND.belongsToModule, dep_module_uri ]
  end
  graph
end
424
+
425
# Produces the triples describing the part of speech of one token.
#
# @param token the token; its begin_index, end_index and pos are read
# @param index number used in the annotation URI when reifying
# @param reify [Boolean] when false, a single triple linking the token
#   directly to its tag is returned; when true, a separate annotation
#   resource ("Pos<label><index>" or "Pos<index>") is described together
#   with its PROV triples
# @param tagset vocabulary indexed with the encoded tag
# @return [Array<Array>] the triples as three-element arrays
def nif_pos(token, index, reify=false, tagset=Pils::Tcf::PENN)
  subject = char_uri(uri_base, token.begin_index, token.end_index)
  tag = tagset[Transformer.encode_pos(token.pos)]

  # Non-reified form: attach the tag straight to the token.
  return [[subject, NIF.oliaLink, tag]] unless reify

  # The tagger label, if any, keeps annotations of different taggers apart.
  fragment = @tagger_label ? "Pos#{@tagger_label}#{index}" : "Pos#{index}"
  anno_uri = twopart_uri(uri_base, fragment)

  [
    [subject, NIF.annotation, anno_uri],
    [anno_uri, NIF.oliaLink, tag],
    [anno_uri, PROV.wasGeneratedBy, pos_tagging_activity_uri],
    [anno_uri, PROV.wasDerivedFrom, subject],
    [anno_uri, PROV.generatedAtTime, pos_tagging_activity_time]
  ]
end
445
+
446
# Produces the triples describing the lemma of one token.
#
# @param token the token; its begin_index, end_index and lemma are read
# @param index number used in the annotation URI when reifying
# @param reify [Boolean] when false, a single triple attaching the lemma
#   directly to the token is returned; when true, a separate annotation
#   resource ("Lemma<index>") is described together with its PROV triples
# @return [Array<Array>] the triples as three-element arrays
def nif_lemma(token, index, reify=false)
  subject = char_uri(uri_base, token.begin_index, token.end_index)

  # Non-reified form: attach the lemma straight to the token.
  unless reify
    return [[subject, NIF.lemma, RDF::Literal.new(token.lemma, datatype: RDF::XSD.string)]]
  end

  anno_uri = twopart_uri(uri_base, "Lemma#{index}")
  lemma_literal = RDF::Literal.new(token.lemma, datatype: RDF::XSD.string)

  # Provenance uses the POS tagging activity and its timestamp.
  [
    [subject, NIF.annotation, anno_uri],
    [anno_uri, NIF.lemma, lemma_literal],
    [anno_uri, PROV.wasGeneratedBy, pos_tagging_activity_uri],
    [anno_uri, PROV.wasDerivedFrom, subject],
    [anno_uri, PROV.generatedAtTime, pos_tagging_activity_time]
  ]
end
462
+
463
+
464
+ end
465
+
466
+ end
467
+ end
468
+ end