tcf2nif 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/.gitlab-ci.yml +25 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +165 -0
- data/README.md +112 -0
- data/Rakefile +6 -0
- data/Tcf2Nif.ipynb +197 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/exe/convpar +7 -0
- data/exe/createturtle +32 -0
- data/exe/tcf2nif +29 -0
- data/exe/txt2tcf +30 -0
- data/lib/tcf2nif.rb +55 -0
- data/lib/tcf2nif/annotation.rb +39 -0
- data/lib/tcf2nif/bounded_element.rb +43 -0
- data/lib/tcf2nif/geo_annotation.rb +26 -0
- data/lib/tcf2nif/named_entity_annotation.rb +28 -0
- data/lib/tcf2nif/tcf_document.rb +228 -0
- data/lib/tcf2nif/token.rb +47 -0
- data/lib/tcf2nif/transformer.rb +352 -0
- data/lib/tcf2nif/version.rb +3 -0
- data/tcf2nif.gemspec +40 -0
- metadata +200 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # A single token of a TCF document. It keeps a reference to the owning
  # document and to the XML element it was created from, and can carry a
  # part-of-speech tag and a lemma once those have been assigned.
  # NOTE(review): begin_index/end_index are used on tokens elsewhere in the
  # gem but are not defined here - presumably inherited from BoundedElement.
  class Token < BoundedElement

    # Part-of-speech tag and lemma; both are nil until a caller sets them.
    attr_accessor :pos, :lemma

    # tcf_document:: the document object this token belongs to
    # xml_element::  the XML element of the token; #form reads its text
    def initialize(tcf_document, xml_element)
      @tcf_document = tcf_document
      @xml_element = xml_element
      @pos = nil
      @lemma = nil
    end

    # Returns the text of the underlying XML element. The value is cached
    # after the first truthy result.
    def form
      return @form if @form
      @form = @xml_element.text
    end

    # True when a part-of-speech tag has been assigned.
    def pos?
      !pos.nil?
    end

    # True when a lemma has been assigned.
    def lemma?
      !lemma.nil?
    end

  end

end
|
@@ -0,0 +1,352 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif
|
21
|
+
|
22
|
+
class Transformer
|
23
|
+
|
24
|
+
# Creates a transformer for one TCF document.
#
# tcf_doc::  the document whose text, tokens and annotations are converted
# rdf_opts:: stored in @rdf_opts; no other method of this class reads it
def initialize(tcf_doc, rdf_opts)
  @tcf_doc, @rdf_opts = tcf_doc, rdf_opts
end
|
28
|
+
|
29
|
+
|
30
|
+
# Builds a URI of the form "<base>#char=<from>,<to>".
# Callers in this class pass '' as +to+ for the whole-text context, which
# yields an open-ended "#char=0," fragment.
def char_uri(base, from, to)
  fragment = "char=#{from},#{to}"
  RDF::URI("#{base}##{fragment}")
end
|
33
|
+
|
34
|
+
# Builds a URI by appending "#<suffix>" to +base+.
def twopart_uri(base, suffix)
  joined = "#{base}##{suffix}"
  RDF::URI(joined)
end
|
37
|
+
|
38
|
+
# Dispatches to one of the transformation variants.
#
# mode:: :plain (default), :noprov or :modularized
#
# Returns the graph built by the selected variant, or nil when +mode+ is
# none of the three known values.
def transform(mode = :plain)
  case mode
  when :plain       then transform_plain
  when :noprov      then transform_noprov
  when :modularized then transform_modularized
  end
end
|
43
|
+
|
44
|
+
# Base URI (a hard-coded example.org address) from which every character
# range URI and fragment URI in this class is derived.
def uri_base
  base = 'http://example.org/tcf2nif/example.txt'
  base
end
|
47
|
+
|
48
|
+
# Fixed handle URI for the text converter.
# Not referenced by any other method of this class.
def text_converter_uri
  handle = 'http://hdl.handle.net/11858/00-1778-0000-0004-BA56-7'
  RDF::URI(handle)
end
|
51
|
+
|
52
|
+
# Fixed handle URI for the tokenizer.
# NOTE(review): this is the same handle that text_converter_uri returns -
# confirm whether both tools really share one identifier.
def tokenizer_uri
  handle = 'http://hdl.handle.net/11858/00-1778-0000-0004-BA56-7'
  RDF::URI(handle)
end
|
55
|
+
|
56
|
+
# Fixed handle URI for the part-of-speech tagger.
# Not referenced by any other method of this class.
def pos_tagger_uri
  handle = 'http://hdl.handle.net/11858/00-247C-0000-0007-3739-5'
  RDF::URI(handle)
end
|
59
|
+
|
60
|
+
|
61
|
+
# URI "<uri_base>#TokenizationActivity", used as the PROV activity that
# generated the tokens.
def tokenization_activity_uri
  activity = 'TokenizationActivity'
  twopart_uri(uri_base, activity)
end
|
64
|
+
|
65
|
+
# URI "<uri_base>#PosTaggingActivity", used as the PROV activity for
# reified POS and lemma annotations.
def pos_tagging_activity_uri
  activity = 'PosTaggingActivity'
  twopart_uri(uri_base, activity)
end
|
68
|
+
|
69
|
+
# URI "<uri_base>#NeTaggingActivity", used as the PROV activity for
# named-entity annotations.
def ne_tagging_activity_uri
  activity = 'NeTaggingActivity'
  twopart_uri(uri_base, activity)
end
|
72
|
+
|
73
|
+
# URI "<uri_base>#GeoTaggingActivity", used as the PROV activity for geo
# annotations.
def geo_tagging_activity_uri
  activity = 'GeoTaggingActivity'
  twopart_uri(uri_base, activity)
end
|
76
|
+
|
77
|
+
# URI "<uri_base>#DependencyParsingActivity", used as the PROV activity for
# reified dependency annotations.
def dep_parsing_activity_uri
  activity = 'DependencyParsingActivity'
  twopart_uri(uri_base, activity)
end
|
80
|
+
|
81
|
+
# Hard-coded xsd:dateTime literal attached to tokens as their
# PROV generatedAtTime value.
def tokenization_activity_time
  timestamp = '2015-07-09T14:01:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
|
84
|
+
|
85
|
+
# Hard-coded xsd:dateTime literal used as the PROV generatedAtTime value of
# reified POS and lemma annotations.
def pos_tagging_activity_time
  timestamp = '2015-07-09T14:02:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
|
88
|
+
|
89
|
+
# Hard-coded xsd:dateTime literal used as the PROV generatedAtTime value of
# named-entity annotations.
def ne_tagging_activity_time
  timestamp = '2015-07-09T14:03:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
|
92
|
+
|
93
|
+
# Hard-coded xsd:dateTime literal used as the PROV generatedAtTime value of
# geo annotations.
def geo_tagging_activity_time
  timestamp = '2015-07-09T14:04:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
|
96
|
+
|
97
|
+
# Hard-coded xsd:dateTime literal used as the PROV generatedAtTime value of
# reified dependency annotations.
def dep_parsing_activity_time
  timestamp = '2015-07-09T14:05:00'
  RDF::Literal.new(timestamp, datatype: RDF::XSD.dateTime)
end
|
100
|
+
|
101
|
+
# Builds the NIF graph for the document: the whole-text context, one
# resource per token (with POS and lemma data where present) and the
# dependency relations.
#
# reify:: when false (default), POS, lemma and dependency data are attached
#         directly to the token URIs, and named-entity and geo annotations
#         are appended afterwards. When true, nif_pos/nif_lemma are asked
#         for reified annotations, every dependency becomes its own
#         "Dep<n>" annotation node (numbered from 1) carrying PROV triples,
#         and the graph is returned BEFORE the named-entity and geo
#         sections run.
#
# Returns the populated RDF::Graph.
def transform_noprov(reify=false)
  graph = RDF::Graph.new
  index_type = RDF::XSD.nonNegativeInteger

  # URI standing for the complete primary text (open-ended character range).
  context_uri = char_uri(uri_base, 0, '')

  # Representation of the whole primary text.
  graph << [context_uri, RDF.type, NIF.String]
  graph << [context_uri, RDF.type, NIF.Context]
  graph << [context_uri, RDF.type, NIF.RFC5147String]
  graph << [context_uri, NIF.isString, RDF::Literal.new(@tcf_doc.text, lang: :en)]
  graph << [context_uri, NIF.beginIndex, RDF::Literal.new(0, datatype: index_type)]
  graph << [context_uri, NIF.endIndex, RDF::Literal.new(@tcf_doc.text.length, datatype: index_type)]

  # Representation of the single tokens.
  @tcf_doc.tokens.each_with_index do |token, i|
    token_uri = char_uri(uri_base, token.begin_index, token.end_index)
    graph << [token_uri, NIF.referenceContext, context_uri]
    graph << [token_uri, RDF.type, NIF.String]
    graph << [token_uri, RDF.type, NIF.Word]
    graph << [token_uri, RDF.type, NIF.RFC5147String]
    graph << [token_uri, NIF.beginIndex, RDF::Literal.new(token.begin_index, datatype: index_type)]
    graph << [token_uri, NIF.endIndex, RDF::Literal.new(token.end_index, datatype: index_type)]
    graph << [token_uri, NIF.anchorOf, RDF::Literal.new(token.form, datatype: RDF::XSD.string)]

    # POS data is only added when the tag contains at least one word character.
    # TODO Tokens must be checked whether they contain strange characters!
    if token.pos? && token.pos =~ /\w+/
      nif_pos(token, i, reify).each { |triple| graph << triple }
    end

    # Lemma data is added whenever a lemma is present.
    if token.lemma?
      nif_lemma(token, i, reify).each { |triple| graph << triple }
    end
  end

  # Dependencies: each key of the map is a [dependent, governor] pair and
  # the value is the relation label.
  dep_count = 0
  @tcf_doc.dependency_map.each do |key, value|
    dep = key.first
    gov = key.last
    dep_count += 1
    dep_uri = char_uri(uri_base, dep.begin_index, dep.end_index)
    gov_uri = char_uri(uri_base, gov.begin_index, gov.end_index)
    relation = RDF::Literal.new(value)

    if reify
      anno_uri = twopart_uri(uri_base, "Dep#{dep_count}")
      graph << [dep_uri, NIF.annotation, anno_uri]
      graph << [anno_uri, NIF.dependency, gov_uri]
      graph << [anno_uri, NIF.dependencyRelationType, relation]
      graph << [anno_uri, PROV.wasGeneratedBy, dep_parsing_activity_uri]
      graph << [anno_uri, PROV.wasDerivedFrom, dep_uri]
      graph << [anno_uri, PROV.wasDerivedFrom, gov_uri]
      graph << [anno_uri, PROV.generatedAtTime, dep_parsing_activity_time]
    else
      graph << [dep_uri, NIF.dependency, gov_uri]
      graph << [dep_uri, NIF.dependencyRelationType, relation]
    end
  end

  return graph if reify

  # Named entities: the annotated string spans from the smallest begin index
  # to the largest end index of the entity's tokens. min_by/max_by compare
  # by the index itself; the former one-argument block given to min/max was
  # treated as a comparator and simply picked the first/last token, which is
  # only correct when the tokens happen to be in document order.
  @tcf_doc.named_entities.each_with_index do |ne, i|
    first_char = ne.tokens.min_by(&:begin_index).begin_index
    last_char = ne.tokens.max_by(&:end_index).end_index
    current_uri = char_uri(uri_base, first_char, last_char)
    anno_uri = twopart_uri(uri_base, "ne#{i}")
    graph << [current_uri, NIF.annotation, anno_uri]
    graph << [anno_uri, RDF.type, NIF.String]
    graph << [anno_uri, NIF.taNerdCoreClassRef, NERD[ne.category.capitalize]]
  end

  # Geolocations: same span computation as for named entities.
  @tcf_doc.geo_annotations.each_with_index do |geo, i|
    first_char = geo.tokens.min_by(&:begin_index).begin_index
    last_char = geo.tokens.max_by(&:end_index).end_index
    current_uri = char_uri(uri_base, first_char, last_char)
    graph << [current_uri, RDF.type, NIF.String]
    anno_uri = twopart_uri(uri_base, "geo#{i}")

    graph << [current_uri, NIF.annotation, anno_uri]
    graph << [anno_uri, Tcf2Nif::GEO.lat, geo.lat]
    graph << [anno_uri, Tcf2Nif::GEO.long, geo.lon]
    graph << [anno_uri, Tcf2Nif::GEO.alt, geo.alt]
    graph << [anno_uri, RDF::URI('http://example.org/tcf2nif/continent'), geo.continent]
  end

  graph
end
|
212
|
+
|
213
|
+
# Produces the default ("plain") graph: the reified output of
# transform_noprov(true) extended with PROV triples for tokens, named
# entities and geo annotations.
#
# Returns the graph obtained from transform_noprov after the extra triples
# have been appended.
def transform_plain
  graph = transform_noprov(true)
  text_uri = char_uri(uri_base, 0, '')

  # Each token is generated by the tokenization activity and derived from
  # the whole-text context.
  @tcf_doc.tokens.each do |token|
    token_uri = char_uri(uri_base, token.begin_index, token.end_index)
    [
      [Tcf2Nif::PROV.wasGeneratedBy, tokenization_activity_uri],
      [Tcf2Nif::PROV.wasDerivedFrom, text_uri],
      [Tcf2Nif::PROV.generatedAtTime, tokenization_activity_time]
    ].each { |predicate, object| graph << [token_uri, predicate, object] }
  end

  # Named entities: provenance for the annotation node plus a link from
  # every participating token to that node.
  @tcf_doc.named_entities.each_with_index do |ne, i|
    anno_uri = twopart_uri(uri_base, "ne#{i}")
    graph << [anno_uri, Tcf2Nif::PROV.wasGeneratedBy, ne_tagging_activity_uri]
    ne.tokens.each do |tok|
      tok_uri = char_uri(uri_base, tok.begin_index, tok.end_index)
      graph << [anno_uri, Tcf2Nif::PROV.wasDerivedFrom, tok_uri]
      graph << [tok_uri, NIF.annotation, anno_uri]
    end
    graph << [anno_uri, Tcf2Nif::PROV.generatedAtTime, ne_tagging_activity_time]
  end

  # Geo annotations follow the same pattern with the geo tagging activity.
  @tcf_doc.geo_annotations.each_with_index do |geo, i|
    anno_uri = twopart_uri(uri_base, "geo#{i}")
    graph << [anno_uri, Tcf2Nif::PROV.wasGeneratedBy, geo_tagging_activity_uri]
    geo.tokens.each do |tok|
      tok_uri = char_uri(uri_base, tok.begin_index, tok.end_index)
      graph << [anno_uri, Tcf2Nif::PROV.wasDerivedFrom, tok_uri]
      graph << [tok_uri, NIF.annotation, anno_uri]
    end
    graph << [anno_uri, Tcf2Nif::PROV.generatedAtTime, geo_tagging_activity_time]
  end

  graph
end
|
264
|
+
|
265
|
+
# Builds a graph in which resources are assigned to MOND modules.
#
# Four module resources are declared (primary text, tokenization, POS,
# lemma); only the primary-text and tokenization modules receive members
# here. Tokens carry no rdf:type triples of their own - the tokenization
# module lists the three NIF types via MOND.propagateType instead.
# NOTE(review): uri_base is passed as a plain String object in the
# belongsToDocument triples, not wrapped in RDF::URI - confirm that is intended.
#
# Returns the populated RDF::Graph.
def transform_modularized()
  graph = RDF::Graph.new
  index_type = RDF::XSD.nonNegativeInteger

  # URI standing for the complete primary text.
  context_uri = char_uri(uri_base, 0, '')

  # Module resources.
  pri_module_uri = twopart_uri(uri_base, 'PrimaryTextModule')
  tok_module_uri = twopart_uri(uri_base, 'TokenizationModule')
  pos_module_uri = twopart_uri(uri_base, 'PosModule')
  lem_module_uri = twopart_uri(uri_base, 'LemmaModule')

  [pri_module_uri, tok_module_uri, pos_module_uri, lem_module_uri].each do |module_uri|
    graph << [module_uri, RDF.type, MOND.Module]
    graph << [module_uri, MOND.belongsToDocument, uri_base]
  end

  # Types that members of the tokenization module are meant to receive.
  [NIF.String, NIF.Word, NIF.RFC5147String].each do |nif_type|
    graph << [tok_module_uri, MOND.propagateType, nif_type]
  end

  # The whole primary text, assigned to the primary-text module.
  [NIF.String, NIF.Context, NIF.RFC5147String].each do |nif_type|
    graph << [context_uri, RDF.type, nif_type]
  end
  graph << [context_uri, NIF.isString, RDF::Literal.new(@tcf_doc.text, lang: :en)]
  graph << [context_uri, NIF.beginIndex, RDF::Literal.new(0, datatype: index_type)]
  graph << [context_uri, NIF.endIndex, RDF::Literal.new(@tcf_doc.text.length, datatype: index_type)]
  graph << [context_uri, MOND.belongsToModule, pri_module_uri]

  # The single tokens, assigned to the tokenization module.
  @tcf_doc.tokens.each do |token|
    token_uri = char_uri(uri_base, token.begin_index, token.end_index)
    graph << [token_uri, NIF.referenceContext, context_uri]
    graph << [token_uri, NIF.beginIndex, RDF::Literal.new(token.begin_index, datatype: index_type)]
    graph << [token_uri, NIF.endIndex, RDF::Literal.new(token.end_index, datatype: index_type)]
    graph << [token_uri, NIF.anchorOf, RDF::Literal.new(token.form, datatype: RDF::XSD.string)]
    graph << [token_uri, MOND.belongsToModule, tok_module_uri]
  end

  graph
end
|
314
|
+
|
315
|
+
# Returns the triples (as an array of 3-element arrays) that describe the
# part-of-speech tag of +token+.
#
# token::  token whose character range forms the subject URI
# index::  used to name the annotation node "Pos<index>" when reifying
# reify::  false - a single oliaLink triple on the token itself;
#          true  - an annotation node linked from the token, carrying the
#          oliaLink plus PROV generation/derivation/time triples
# tagset:: object indexed with the token's POS tag to obtain the link target
def nif_pos(token, index, reify=false, tagset=Tcf2Nif::PENN)
  token_uri = char_uri(uri_base, token.begin_index, token.end_index)
  tag_ref = tagset[token.pos]
  return [[token_uri, NIF.oliaLink, tag_ref]] unless reify

  anno_uri = twopart_uri(uri_base, "Pos#{index}")
  [
    [token_uri, NIF.annotation, anno_uri],
    [anno_uri, NIF.oliaLink, tag_ref],
    [anno_uri, PROV.wasGeneratedBy, pos_tagging_activity_uri],
    [anno_uri, PROV.wasDerivedFrom, token_uri],
    [anno_uri, PROV.generatedAtTime, pos_tagging_activity_time]
  ]
end
|
331
|
+
|
332
|
+
# Returns the triples (as an array of 3-element arrays) that describe the
# lemma of +token+.
#
# token:: token whose character range forms the subject URI
# index:: used to name the annotation node "Lemma<index>" when reifying
# reify:: false - a single NIF.lemma triple on the token itself;
#         true  - an annotation node linked from the token, carrying the
#         lemma plus PROV generation/derivation/time triples
# NOTE(review): the reified form attributes the lemma to the POS tagging
# activity and its timestamp - confirm that no separate lemmatization
# activity is intended.
def nif_lemma(token, index, reify=false)
  token_uri = char_uri(uri_base, token.begin_index, token.end_index)
  lemma_literal = RDF::Literal.new(token.lemma, datatype: RDF::XSD.string)
  return [[token_uri, NIF.lemma, lemma_literal]] unless reify

  anno_uri = twopart_uri(uri_base, "Lemma#{index}")
  [
    [token_uri, NIF.annotation, anno_uri],
    [anno_uri, NIF.lemma, lemma_literal],
    [anno_uri, PROV.wasGeneratedBy, pos_tagging_activity_uri],
    [anno_uri, PROV.wasDerivedFrom, token_uri],
    [anno_uri, PROV.generatedAtTime, pos_tagging_activity_time]
  ]
end
|
348
|
+
|
349
|
+
|
350
|
+
end
|
351
|
+
|
352
|
+
end
|
data/tcf2nif.gemspec
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for tcf2nif. The gem's lib directory is put on the load
# path first so that the version constant can be required from the source tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'tcf2nif/version'

Gem::Specification.new do |spec|
  spec.name     = 'tcf2nif'
  spec.version  = Tcf2Nif::VERSION
  spec.authors  = ['Peter Menke']
  spec.email    = ['pmenke@googlemail.com']

  spec.summary     = 'A small NLP data converter from the TCF to the NIF format'
  spec.description = 'tcf2nif converts NLP data from the TCF format (used by WebLicht) to the RDF-based NIF format.'
  spec.homepage    = 'http://github.com/pmenke/tcf2nif'

  # Specifications without a metadata field are rejected outright. Setting
  # 'allowed_push_host' (left disabled below) would restrict where the gem
  # may be pushed.
  unless spec.respond_to?(:metadata)
    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
  end
  # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"

  # Package every file tracked by git except test, spec and feature files.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Development-only dependencies.
  spec.add_development_dependency 'bundler', '~> 1.9'
  spec.add_development_dependency 'rake', '~> 10.0'
  %w[rspec spork simplecov].each do |gem_name|
    spec.add_development_dependency gem_name
  end

  # Runtime dependencies; trollop is left without a version constraint.
  spec.add_runtime_dependency 'rdf', '~> 1.1'
  spec.add_runtime_dependency 'rdf-turtle', '~> 1.1'
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
  spec.add_runtime_dependency 'trollop'

end
|
metadata
ADDED
@@ -0,0 +1,200 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tcf2nif
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Peter Menke
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-10-30 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.9'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: spork
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rdf
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.1'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.1'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rdf-turtle
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.1'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.1'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: nokogiri
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.6'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.6'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: trollop
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
description: tcf2nif converts NLP data from the TCF format (used by WebLicht) to the
|
140
|
+
RDF-based NIF format.
|
141
|
+
email:
|
142
|
+
- pmenke@googlemail.com
|
143
|
+
executables:
|
144
|
+
- convpar
|
145
|
+
- createturtle
|
146
|
+
- tcf2nif
|
147
|
+
- txt2tcf
|
148
|
+
extensions: []
|
149
|
+
extra_rdoc_files: []
|
150
|
+
files:
|
151
|
+
- ".gitignore"
|
152
|
+
- ".gitlab-ci.yml"
|
153
|
+
- ".rspec"
|
154
|
+
- ".travis.yml"
|
155
|
+
- CODE_OF_CONDUCT.md
|
156
|
+
- Gemfile
|
157
|
+
- LICENSE.txt
|
158
|
+
- README.md
|
159
|
+
- Rakefile
|
160
|
+
- Tcf2Nif.ipynb
|
161
|
+
- bin/console
|
162
|
+
- bin/setup
|
163
|
+
- exe/convpar
|
164
|
+
- exe/createturtle
|
165
|
+
- exe/tcf2nif
|
166
|
+
- exe/txt2tcf
|
167
|
+
- lib/tcf2nif.rb
|
168
|
+
- lib/tcf2nif/annotation.rb
|
169
|
+
- lib/tcf2nif/bounded_element.rb
|
170
|
+
- lib/tcf2nif/geo_annotation.rb
|
171
|
+
- lib/tcf2nif/named_entity_annotation.rb
|
172
|
+
- lib/tcf2nif/tcf_document.rb
|
173
|
+
- lib/tcf2nif/token.rb
|
174
|
+
- lib/tcf2nif/transformer.rb
|
175
|
+
- lib/tcf2nif/version.rb
|
176
|
+
- tcf2nif.gemspec
|
177
|
+
homepage: http://github.com/pmenke/tcf2nif
|
178
|
+
licenses: []
|
179
|
+
metadata: {}
|
180
|
+
post_install_message:
|
181
|
+
rdoc_options: []
|
182
|
+
require_paths:
|
183
|
+
- lib
|
184
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
185
|
+
requirements:
|
186
|
+
- - ">="
|
187
|
+
- !ruby/object:Gem::Version
|
188
|
+
version: '0'
|
189
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
190
|
+
requirements:
|
191
|
+
- - ">="
|
192
|
+
- !ruby/object:Gem::Version
|
193
|
+
version: '0'
|
194
|
+
requirements: []
|
195
|
+
rubyforge_project:
|
196
|
+
rubygems_version: 2.4.6
|
197
|
+
signing_key:
|
198
|
+
specification_version: 4
|
199
|
+
summary: A small NLP data converter from the TCF to the NIF format
|
200
|
+
test_files: []
|