tcf2nif 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Interactive console for experimenting with the tcf2nif gem.
# Loads the bundle and the gem itself, then starts an IRB session.
# Fixtures or other initialization code can be added before the
# session is started.

%w[bundler/setup tcf2nif].each { |library| require library }

# A different console can be used instead of IRB. For Pry, add pry to the
# Gemfile and replace the two lines below with:
#   require "pry"
#   Pry.start

require "irb"
IRB.start
@@ -0,0 +1,7 @@
1
#!/bin/bash
# Prepares a checkout for development by installing the gem's dependencies
# through Bundler.

# Abort on the first failing command (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -e -u -o pipefail

# Split words on newline and tab only.
IFS=$'\n\t'

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,7 @@
1
#!/usr/bin/env bash
# Starts the three tagged rspec converter runs as background jobs. Each job
# writes its combined stdout/stderr to its own log file below ./spec/logs.
# The script does not wait for the jobs; they keep running after it returns.

log_dir=./spec/logs

# The redirections below fail (and the affected rspec run never starts) when
# the log directory is missing, so make sure it exists first.
mkdir -p "$log_dir"

echo "Running parallel converters"

for tag in phanplain screwplain misplain; do
  rspec -t "$tag" > "$log_dir/$tag.log" 2>&1 &
done
@@ -0,0 +1,32 @@
1
#! /usr/bin/env bash
# Converts the example conversion results to Turtle with rapper, declaring
# the namespace prefixes used in the output.
# All paths are relative to the current working directory.

# Serializer features that declare the namespace prefixes for the Turtle output.
prefix_features=(
  -f 'xmlns:ex="http://example.org/tcf2nif/example.txt#"'
  -f 'xmlns:xsd="http://www.w3.org/2001/XMLSchema#"'
  -f 'xmlns:prov="http://www.w3.org/ns/prov#"'
  -f 'xmlns:mond="http://petermenke.de/mond#"'
  -f 'xmlns:nif="http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"'
)

# to_turtle INPUT OUTPUT
# Parses INPUT as N-Triples and writes it to OUTPUT as Turtle.
to_turtle() {
  rapper "$1" -i ntriples -o turtle "${prefix_features[@]}" > "$2"
}

# Stop when a directory is missing; otherwise the .ttl files would be written
# relative to whatever directory the script happens to be in.
cd ../spec/out/plainprov || exit 1
to_turtle ./phantom.nt ./phantom.ttl
to_turtle ./screw.nt ./screw.ttl
to_turtle ./miserables.nt ./miserables.ttl

cd ../modularized || exit 1
# Despite its .n3 extension this file is read with the N-Triples parser.
to_turtle ./phantom.n3 ./phantom.ttl
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby

# Command line tool that reads a TCF file (generated by WebLicht), transforms
# it into a NIF graph and writes that graph to the output file.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates NIF documents from a TCF file (generated by WebLicht)'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :format, "The output file format", type: :string, default: 'n3'
end
# Echo the parsed options.
puts opts

# Both file names are required; show the help text when one is missing.
unless opts[:input_file] && opts[:output_file] && opts[:format]
  Trollop::educate
  exit(1)
end

# Block form of File.open so the input handle is closed as soon as the
# document has been built (TcfDocument parses the XML in its constructor).
tcf_doc = File.open(opts[:input_file], 'r') do |tcf_file|
  Tcf2Nif::TcfDocument.new(tcf_file)
end

graph = Tcf2Nif::Transformer.new(tcf_doc, {}).transform(:noprov)

# NOTE(review): the :format option is accepted but not used here; the output
# is always written as N-Triples. Confirm whether that is intended.
RDF::Writer.open(opts[:output_file], :format => :ntriples) do |writer|
  writer << RDF::Repository.new do |repo|
    repo << graph
  end
end
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby

# Command line tool that sends a plain text file and a chain description to
# the WebLicht WaaS service via curl and stores the response in the output
# file. The API token is taken from --token or, failing that, from the
# WAAS_TOKEN environment variable.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates TCF files via the WebLicht service from a plain text file'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :token, "The WebLicht WaaS API token", type: :string
  opt :chain, "The chain description XML document", type: :string
end

# Input file, output file and chain description are all required.
unless opts[:input_file] && opts[:output_file] && opts[:chain]
  Trollop::educate
  exit(1)
end

waas_token = opts.token || ENV['WAAS_TOKEN']
abort 'No WaaS API token given: use --token or set WAAS_TOKEN' unless waas_token

service_uri = "https://weblicht.sfs.uni-tuebingen.de/WaaS/api/1.0/chain/process"

# Show the request that is about to be made, without revealing the token.
puts "curl -X POST -F chains=@#{opts[:chain]} -F content=@#{opts[:input_file]} -F apikey=[REDACTED] #{service_uri} > #{opts[:output_file]}"

# Passing the arguments as a list runs curl directly instead of through a
# shell, so file names or tokens containing shell metacharacters cannot
# change the command. The :out option redirects curl's stdout to the file.
curl_args = [
  '-X', 'POST',
  '-F', "chains=@#{opts[:chain]}",
  '-F', "content=@#{opts[:input_file]}",
  '-F', "apikey=#{waas_token}",
  service_uri
]
succeeded = system('curl', *curl_args, out: opts[:output_file])

# system returns nil when curl could not be started and false when it
# exited with a non-zero status; report either as a failure.
exit(1) unless succeeded
@@ -0,0 +1,55 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
+ require 'nokogiri'
21
+ require 'rdf'
22
+ require 'rdf/turtle'
23
+ require "tcf2nif/version"
24
+ require "tcf2nif/bounded_element"
25
+ require "tcf2nif/annotation"
26
+ require "tcf2nif/token"
27
+ require "tcf2nif/named_entity_annotation"
28
+ require "tcf2nif/geo_annotation"
29
+ require "tcf2nif/tcf_document"
30
+ require "tcf2nif/transformer"
31
+
32
# Top-level namespace of the tcf2nif gem. Defines the RDF vocabularies the
# gem refers to, a table of standard prefixes, and a helper that returns the
# gem's root directory.
module Tcf2Nif

  # One RDF vocabulary per namespace IRI.
  NIF  = RDF::Vocabulary.new("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
  PENN = RDF::Vocabulary.new("http://purl.org/olia/penn.owl#")
  NERD = RDF::Vocabulary.new("http://nerd.eurecom.fr/ontology#")
  GEO  = RDF::Vocabulary.new("http://www.w3.org/2003/01/geo/wgs84_pos#")
  PROV = RDF::Vocabulary.new("http://www.w3.org/ns/prov#")
  MOND = RDF::Vocabulary.new("http://petermenke.de/mond#")

  # Maps prefix names to vocabularies. PROV is defined above but is not
  # listed in this table.
  STANDARD_PREFIXES = {
    :nif  => NIF,
    :rdfs => RDF::RDFS,
    :xsd  => RDF::XSD,
    :penn => PENN,
    :geo  => GEO,
    :nerd => NERD,
    :mond => MOND
  }

  class << self
    # Returns the directory two levels above this file as an absolute path.
    def root
      File.expand_path(File.join('..', '..'), __FILE__)
    end
  end

end
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Base class for annotations that refer to a list of tokens.
  class Annotation

    # The tokens added so far, in insertion order.
    attr_reader :tokens

    # Creates an annotation with an empty token list.
    #
    # @param tcf_document the document this annotation belongs to; it is
    #   stored but not used by this class itself
    def initialize(tcf_document)
      @tokens = []
      @tcf_document = tcf_document
    end

    # Appends a token to the annotation.
    #
    # @param token the token to add
    # @return [Array] the annotation's token list, including the new token
    def <<(token)
      @tokens.push(token)
    end

  end

end
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # An element that covers a span described by a begin index and an end index.
  class BoundedElement

    attr_accessor :begin_index, :end_index

    # Sets both indices at once.
    #
    # @param new_boundaries an object responding to +first+ and +last+
    #   (for example a two-element array); +first+ becomes the begin index
    #   and +last+ the end index
    def boundaries=(new_boundaries)
      @begin_index = new_boundaries.first
      @end_index = new_boundaries.last
    end

    # Tells whether both indices are set.
    #
    # @return the end index when both indices are set, otherwise a falsy value
    def boundaries?
      return @begin_index unless @begin_index

      @end_index
    end

    # @return the distance between the end index and the begin index
    def length
      span_end = end_index
      span_start = begin_index
      span_end - span_start
    end

  end

end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Annotation that carries geographic attributes (latitude, longitude,
  # altitude and continent) in addition to its tokens.
  class GeoAnnotation < Tcf2Nif::Annotation

    attr_accessor :lat
    attr_accessor :lon
    attr_accessor :alt
    attr_accessor :continent

  end

end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Annotation that marks its tokens as a named entity of a given category.
  class NamedEntityAnnotation < Tcf2Nif::Annotation

    # The category of the named entity.
    attr_reader :category
    attr_writer :category

  end

end
@@ -0,0 +1,228 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Wraps a TCF XML document and exposes its tokens, named entities, geo
  # annotations and dependency relations as Ruby objects. All layers are
  # read while the object is constructed.
  class TcfDocument

    # Namespace prefixes used in the XPath expressions of this class.
    XML_NAMESPACES = {
      'wl' => 'http://www.dspin.de/data',
      'tc' => 'http://www.dspin.de/data/textcorpus'
    }.freeze

    # tokens          - all tokens in document order
    # named_entities  - the NamedEntityAnnotation objects that were read
    # geo_annotations - the GeoAnnotation objects that were read
    # id_map          - XML token ID => token
    # token_map       - token => XML token ID
    # dependency_map  - [dependent token, governor token] => function label
    attr_reader :tokens, :named_entities, :geo_annotations,
                :id_map, :token_map, :dependency_map

    # Parses the XML and reads all supported annotation layers.
    #
    # @param io the XML source, passed on to Nokogiri::XML
    def initialize(io)
      @doc = Nokogiri::XML(io)
      @tokens = []
      @named_entities = []
      @geo_annotations = []
      @id_map = {}
      @token_map = {}
      @dependency_map = {}

      process_tokens
      # Character offsets are only derived from the text when at least one
      # token did not bring its own start/end attributes.
      calculate_character_offsets unless @tokens.all?(&:boundaries?)

      process_pos
      process_lemma

      process_named_entities
      process_geo_annotations
      process_dependencies
    end

    # Assigns boundaries to every token by searching each token form in the
    # text, starting at the end of the previous token.
    #
    # @raise [ArgumentError] when a token form does not occur in the
    #   remaining text
    def calculate_character_offsets
      char_index = 0
      tokens.each do |token|
        new_index = text.index(token.form, char_index)
        unless new_index
          raise ArgumentError,
                "token #{token.form.inspect} not found in text at or after offset #{char_index}"
        end

        new_offset = new_index + token.form.length
        token.boundaries = [new_index, new_offset]
        char_index = new_offset
      end
    end

    # @return [String] the first text node below the tc:text element
    #   (an empty string when there is none); memoized
    def text
      @text ||= @doc.xpath('.//tc:text/text()', XML_NAMESPACES).first.to_s
    end

    # Adds a named entity annotation to the document.
    def store_named_entity(named_entity_object)
      @named_entities << named_entity_object
    end

    # The XML nodes of the individual layers (memoized).

    def xml_sentences
      @xml_sentences ||= @doc.xpath('//tc:sentences/tc:sentence', XML_NAMESPACES)
    end

    def xml_tokens
      @xml_tokens ||= @doc.xpath('//tc:tokens/tc:token', XML_NAMESPACES)
    end

    def xml_named_entities
      @xml_named_entities ||= @doc.xpath('//tc:namedEntities/tc:entity', XML_NAMESPACES)
    end

    def xml_geo_annotations
      @xml_geo_annotations ||= @doc.xpath('//tc:geo/tc:gpoint', XML_NAMESPACES)
    end

    def xml_dependencies
      @xml_dependencies ||= @doc.xpath('//tc:depparsing/tc:parse/tc:dependency', XML_NAMESPACES)
    end

    # TODO add deep support for sentences and related tokens

    # Builds a token for an XML token node. When the node has both a start
    # and an end attribute, they are used as the token's boundaries.
    #
    # @param doc passed through to Tcf2Nif::Token.new
    # @param xml_token the XML node of the token
    # @return [Tcf2Nif::Token] the new token (not yet stored)
    def new_token(doc, xml_token)
      token_object = Tcf2Nif::Token.new(doc, xml_token)
      if xml_token.has_attribute?('start') && xml_token.has_attribute?('end')
        token_object.boundaries = [xml_token['start'].to_i, xml_token['end'].to_i]
      end
      token_object
    end

    # Appends a token and registers it under the ID attribute of its XML
    # node, in both lookup directions.
    def store_token(token_object, xml_token)
      @tokens << token_object
      @id_map[xml_token['ID']] = token_object
      @token_map[token_object] = xml_token['ID']
    end

    # @return the token stored under the given XML ID, or nil
    def token_for_id(xml_id)
      @id_map[xml_id]
    end

    # @return the XML ID of the given token, or nil
    def id_for_token(token)
      @token_map[token]
    end

    private

    # Resolves the whitespace-separated tokenIDs attribute of a node to the
    # tokens it refers to. A missing attribute yields an empty list, and IDs
    # that match no stored token are skipped.
    def referenced_tokens(node)
      node['tokenIDs'].to_s.split.map { |xml_id| token_for_id(xml_id) }.compact
    end

    def process_tokens
      xml_tokens.each do |xml_token|
        store_token(new_token(@doc, xml_token), xml_token)
      end
    end

    # Copies each POS tag to every token the tag refers to.
    def process_pos
      @doc.xpath('//tc:POStags/tc:tag', XML_NAMESPACES).each do |tag|
        referenced_tokens(tag).each { |token| token.pos = tag.text }
      end
    end

    # Copies each lemma to every token the lemma refers to.
    def process_lemma
      @doc.xpath('//tc:lemmas/tc:lemma', XML_NAMESPACES).each do |lemma|
        referenced_tokens(lemma).each { |token| token.lemma = lemma.text }
      end
    end

    # NOTE(review): annotations receive the Nokogiri document, although the
    # parameter of Annotation#initialize is named tcf_document. Confirm
    # whether `self` was intended.
    def process_named_entities
      xml_named_entities.each do |ent|
        nato = Tcf2Nif::NamedEntityAnnotation.new(@doc)
        nato.category = ent['class']
        referenced_tokens(ent).each { |token| nato << token }
        @named_entities << nato
      end
    end

    def process_geo_annotations
      xml_geo_annotations.each do |anno|
        geo = Tcf2Nif::GeoAnnotation.new(@doc)
        geo.lat = anno['lat'].to_f
        geo.lon = anno['lon'].to_f
        geo.alt = anno['alt'].to_f
        geo.continent = anno['continent']
        referenced_tokens(anno).each { |token| geo << token }
        @geo_annotations << geo
      end
    end

    # Records the function label of every dependency that has a governor:
    #   <tc:dependency govIDs="t_4" depIDs="t_2" func="aux"/>
    # Root dependencies carry no govIDs attribute and are not recorded:
    #   <tc:dependency depIDs="t_4" func="ROOT"/>
    def process_dependencies
      xml_dependencies.each do |dep|
        next unless dep.has_attribute?('govIDs')

        dep_token = token_for_id(dep['depIDs'])
        gov_token = token_for_id(dep['govIDs'])
        @dependency_map[[dep_token, gov_token]] = dep['func']
      end
    end

  end

end