tcf2nif 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Development console: loads the bundle and the tcf2nif gem, then drops
# into an interactive IRB session.
require "bundler/setup"
require "tcf2nif"

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier.
#
# A different console may be used instead. For Pry, add pry to the
# Gemfile and replace the two lines below with:
#
#   require "pry"
#   Pry.start

require "irb"
IRB.start
@@ -0,0 +1,7 @@
1
#!/bin/bash
# Development setup: installs the gem dependencies through Bundler.

# Strict mode: stop on the first failing command, treat unset variables
# as errors, and let a pipeline fail when any of its stages fails.
set -o errexit
set -o nounset
set -o pipefail

# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,7 @@
1
#!/usr/bin/env bash
# Runs the three "plain" converter spec groups concurrently. Each group is
# selected by its rspec tag and writes stdout and stderr to its own log file
# below ./spec/logs.

echo "Running parallel converters"

# The redirections below fail with "No such file or directory" when the log
# directory does not exist yet, so create it up front.
mkdir -p ./spec/logs

for tag in phanplain screwplain misplain; do
  rspec -t "$tag" > "./spec/logs/${tag}.log" 2>&1 &
done

# Block until every background run has finished, so that the log files are
# complete by the time this script returns.
wait
@@ -0,0 +1,32 @@
1
#! /usr/bin/env bash
# Converts the generated output files to Turtle with rapper.
# The directories are relative to the current working directory, exactly as
# before; the script must be started from a directory where
# ../spec/out/plainprov resolves.

# Namespace declarations handed to the Turtle serializer, shared by every
# conversion below.
prefix_opts=(
  -f 'xmlns:ex="http://example.org/tcf2nif/example.txt#"'
  -f 'xmlns:xsd="http://www.w3.org/2001/XMLSchema#"'
  -f 'xmlns:prov="http://www.w3.org/ns/prov#"'
  -f 'xmlns:mond="http://petermenke.de/mond#"'
  -f 'xmlns:nif="http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"'
)

# to_turtle INPUT OUTPUT
# Parses INPUT as N-Triples and writes the Turtle serialization to OUTPUT.
to_turtle() {
  rapper "$1" -i ntriples -o turtle "${prefix_opts[@]}" > "$2"
}

# Abort when a directory is missing: otherwise the redirections would create
# or truncate .ttl files in whatever directory the script happens to be in.
cd ../spec/out/plainprov || exit 1
to_turtle ./phantom.nt ./phantom.ttl
to_turtle ./screw.nt ./screw.ttl
to_turtle ./miserables.nt ./miserables.ttl

cd ../modularized || exit 1
to_turtle ./phantom.n3 ./phantom.ttl
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby

# Command line converter: parses a TCF file, transforms it into an RDF graph
# and writes that graph to the output file as N-Triples.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates NIF documents from a TCF file (generated by WebLicht)'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The TCF file to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :format, "The output file format", type: :string, default: 'n3'
end
# Echo the parsed options before doing any work.
puts opts

# All three options are required; show the help text and fail otherwise.
unless opts[:input_file] && opts[:output_file] && opts[:format]
  Trollop::educate
  exit(1)
end

# TcfDocument parses the XML inside its constructor, so the block form can
# close the file handle as soon as the document object exists.
tcf_doc = File.open(opts[:input_file], 'r') do |tcf_file|
  Tcf2Nif::TcfDocument.new(tcf_file)
end

graph = Tcf2Nif::Transformer.new(tcf_doc, {}).transform(:noprov)

# NOTE(review): --format is accepted but not used here; the writer is always
# opened with :ntriples. Confirm whether the option should select the
# serialization.
RDF::Writer.open(opts[:output_file], :format => :ntriples) do |writer|
  writer << RDF::Repository.new do |repo|
    repo << graph
  end
end
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby

# Command line helper: sends a plain text file and a chain description to the
# WebLicht WaaS service via curl and stores the response in the output file.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'shellwords'
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates TCF files via the WebLicht service from a plain text file'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :token, "The WebLicht WaaS API token", type: :string
  opt :chain, "The chain description XML document", type: :string
end

# Input, output and chain are required; show the help text and fail otherwise.
unless opts[:input_file] && opts[:output_file] && opts[:chain]
  Trollop::educate
  exit(1)
end

# The token given on the command line wins over the WAAS_TOKEN environment
# variable. Without any token the request cannot be authorized, so stop early
# instead of sending an empty key.
waas_token = opts[:token] || ENV['WAAS_TOKEN']
if waas_token.nil? || waas_token.empty?
  abort 'No WebLicht API token given: use --token or set WAAS_TOKEN'
end

service_uri = "https://weblicht.sfs.uni-tuebingen.de/WaaS/api/1.0/chain/process"

# Show an equivalent shell command. User-supplied parts are escaped so the
# printed line stays copy-and-paste safe for paths with spaces or shell
# metacharacters.
puts "curl -X POST -F chains=@#{Shellwords.escape(opts[:chain])} " \
     "-F content=@#{Shellwords.escape(opts[:input_file])} " \
     "-F apikey=#{Shellwords.escape(waas_token)} #{service_uri} " \
     "> #{Shellwords.escape(opts[:output_file])}"

# Run curl with an argument vector instead of a shell string: no word
# splitting or command injection through file names or the token. The `out:`
# option redirects curl's standard output into the output file.
curl_command = [
  'curl', '-X', 'POST',
  '-F', "chains=@#{opts[:chain]}",
  '-F', "content=@#{opts[:input_file]}",
  '-F', "apikey=#{waas_token}",
  service_uri
]
succeeded = system(*curl_command, out: opts[:output_file])

# Report a failed or unavailable curl through the exit status.
exit(1) unless succeeded
@@ -0,0 +1,55 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
+ require 'nokogiri'
21
+ require 'rdf'
22
+ require 'rdf/turtle'
23
+ require "tcf2nif/version"
24
+ require "tcf2nif/bounded_element"
25
+ require "tcf2nif/annotation"
26
+ require "tcf2nif/token"
27
+ require "tcf2nif/named_entity_annotation"
28
+ require "tcf2nif/geo_annotation"
29
+ require "tcf2nif/tcf_document"
30
+ require "tcf2nif/transformer"
31
+
32
module Tcf2Nif

  # RDF vocabularies referenced by the gem, one constant per namespace URI.
  NIF  = RDF::Vocabulary.new("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
  PENN = RDF::Vocabulary.new("http://purl.org/olia/penn.owl#")
  NERD = RDF::Vocabulary.new("http://nerd.eurecom.fr/ontology#")
  GEO  = RDF::Vocabulary.new("http://www.w3.org/2003/01/geo/wgs84_pos#")
  PROV = RDF::Vocabulary.new("http://www.w3.org/ns/prov#")
  MOND = RDF::Vocabulary.new("http://petermenke.de/mond#")

  # Maps prefix names to their vocabularies. PROV is defined above but is not
  # part of this table.
  STANDARD_PREFIXES = {
    :nif  => NIF,
    :rdfs => RDF::RDFS,
    :xsd  => RDF::XSD,
    :penn => PENN,
    :geo  => GEO,
    :nerd => NERD,
    :mond => MOND
  }

  class << self
    # Returns the absolute path of the directory two levels above this file.
    def root
      File.expand_path(File.join('..', '..'), __FILE__)
    end
  end

end
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # An annotation that collects the tokens it applies to.
  class Annotation

    # The tokens added so far, in insertion order.
    attr_reader :tokens

    # tcf_document: the document object this annotation was created for.
    # It is stored but not exposed through a reader.
    def initialize(tcf_document)
      @tokens = []
      @tcf_document = tcf_document
    end

    # Appends +token+ to the annotation and returns the token list.
    def <<(token)
      @tokens.push(token)
    end

  end

end
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # An element with a begin index and an end index.
  class BoundedElement

    attr_accessor :begin_index, :end_index

    # Sets both indices at once from the first and the last member of
    # +new_boundaries+ (for instance a two-element array).
    def boundaries=(new_boundaries)
      @begin_index, @end_index = new_boundaries.first, new_boundaries.last
    end

    # Truthy only when both indices are set. The result is the end index in
    # that case, otherwise the unset (nil or false) begin or end value.
    def boundaries?
      return @begin_index unless @begin_index

      @end_index
    end

    # Difference between the end index and the begin index.
    def length
      stop = end_index
      start = begin_index
      stop - start
    end

  end

end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Annotation that additionally carries a latitude, a longitude, an
  # altitude and a continent value.
  class GeoAnnotation < Tcf2Nif::Annotation

    attr_accessor :lat
    attr_accessor :lon
    attr_accessor :alt
    attr_accessor :continent

  end

end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Annotation that additionally carries a category value.
  class NamedEntityAnnotation < Tcf2Nif::Annotation

    attr_accessor(:category)

  end

end
@@ -0,0 +1,228 @@
1
+ # encoding: utf-8
2
+ # This file is part of the tcf2nif gem.
3
+ # Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
4
+ # http://www.sfb673.org
5
+ #
6
+ # tcf2nif is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as
8
+ # published by the Free Software Foundation, either version 3 of
9
+ # the License, or (at your option) any later version.
10
+ #
11
+ # tcf2nif is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with tcf2nif. If not, see
18
+ # <http://www.gnu.org/licenses/>.
19
+
20
module Tcf2Nif

  # Object model of a TCF XML document. The constructor parses the XML and
  # builds token objects, part-of-speech and lemma values, named entity and
  # geo annotations, and a map of dependency relations.
  class TcfDocument

    # Namespace binding used by the XPath queries on the text corpus layers.
    TC_NAMESPACE = { 'tc' => 'http://www.dspin.de/data/textcorpus' }.freeze

    # Token objects in document order.
    attr_reader :tokens
    # NamedEntityAnnotation and GeoAnnotation objects in document order.
    attr_reader :named_entities, :geo_annotations
    # XML ID => token object, and token object => XML ID.
    attr_reader :id_map, :token_map
    # [dependent token, governor token] => dependency function label.
    attr_reader :dependency_map

    # io: anything Nokogiri::XML accepts as input. The XML is parsed and
    # fully processed here, so the caller may close +io+ afterwards.
    def initialize(io)
      @doc = Nokogiri::XML(io)
      @tokens = []
      @named_entities = []
      @geo_annotations = []
      @id_map = {}
      @token_map = {}
      @dependency_map = {}

      process_tokens
      # Derive offsets from the text unless every token already has them.
      calculate_character_offsets unless @tokens.all? { |t| t.boundaries? }

      process_pos
      process_lemma

      process_named_entities
      process_geo_annotations
      process_dependencies
    end

    # Assigns begin/end character offsets to every token by searching for each
    # token form in the document text, starting where the previous token
    # ended.
    #
    # Raises ArgumentError when a token form cannot be found, which means
    # that the token layer and the text layer do not agree.
    def calculate_character_offsets
      cursor = 0
      tokens.each do |token|
        start = text.index(token.form, cursor)
        if start.nil?
          raise ArgumentError,
                "token form #{token.form.inspect} not found in the document " \
                "text at or after character #{cursor}"
        end
        stop = start + token.form.length
        token.boundaries = [start, stop]
        cursor = stop
      end
    end

    # The first text node of the tc:text element as a string (memoized).
    # NOTE(review): +to_s+ on the text node presumably yields its serialized
    # form, i.e. with XML entities still escaped - confirm whether the
    # unescaped content is what the offset calculation needs.
    def text
      @text ||= @doc.xpath('.//tc:text/text()', 'wl' => 'http://www.dspin.de/data', 'tc' => 'http://www.dspin.de/data/textcorpus').first.to_s
    end

    # Appends an already built named entity object to the entity list.
    def store_named_entity(named_entity_object)
      @named_entities << named_entity_object
    end

    # Memoized node sets for the individual annotation layers.

    def xml_sentences
      @xml_sentences ||= select_nodes('//tc:sentences/tc:sentence')
    end

    def xml_tokens
      @xml_tokens ||= select_nodes('//tc:tokens/tc:token')
    end

    def xml_named_entities
      @xml_named_entities ||= select_nodes('//tc:namedEntities/tc:entity')
    end

    def xml_geo_annotations
      @xml_geo_annotations ||= select_nodes('//tc:geo/tc:gpoint')
    end

    def xml_dependencies
      @xml_dependencies ||= select_nodes('//tc:depparsing/tc:parse/tc:dependency')
    end

    # TODO add deep support for sentences and related tokens

    # Builds a token object for +xml_token+. Boundaries are taken from the
    # 'start' and 'end' attributes when both are present.
    def new_token(doc, xml_token)
      token_object = Tcf2Nif::Token.new(doc, xml_token)
      if xml_token.has_attribute?('start') && xml_token.has_attribute?('end')
        token_object.boundaries = [xml_token['start'].to_i, xml_token['end'].to_i]
      end
      token_object
    end

    # Registers a token in the token list and in both lookup tables, keyed by
    # the 'ID' attribute of its XML element.
    def store_token(token_object, xml_token)
      @tokens << token_object
      @id_map[xml_token['ID']] = token_object
      @token_map[token_object] = xml_token['ID']
    end

    # Token object registered under +xml_id+, or nil.
    def token_for_id(xml_id)
      @id_map[xml_id]
    end

    # XML ID under which +token+ was registered, or nil.
    def id_for_token(token)
      @token_map[token]
    end

    private

    # Evaluates +path+ against the document with the text corpus namespace.
    def select_nodes(path)
      @doc.xpath(path, TC_NAMESPACE)
    end

    # Resolves a whitespace separated list of token IDs, as found in a
    # 'tokenIDs' attribute, to token objects. A missing attribute yields an
    # empty list; IDs that are not registered are skipped.
    def tokens_for_ids(id_list)
      id_list.to_s.split.map { |id| token_for_id(id) }.compact
    end

    def process_tokens
      xml_tokens.each do |xml_token|
        store_token(new_token(@doc, xml_token), xml_token)
      end
    end

    # Copies each POS tag value to every token its 'tokenIDs' refer to.
    def process_pos
      select_nodes('//tc:POStags/tc:tag').each do |tag|
        value = tag.text
        next unless value

        tokens_for_ids(tag['tokenIDs']).each { |token| token.pos = value }
      end
    end

    # Copies each lemma value to every token its 'tokenIDs' refer to.
    def process_lemma
      select_nodes('//tc:lemmas/tc:lemma').each do |lemma|
        value = lemma.text
        next unless value

        tokens_for_ids(lemma['tokenIDs']).each { |token| token.lemma = value }
      end
    end

    # Builds one NamedEntityAnnotation per entity element, with the category
    # taken from its 'class' attribute.
    def process_named_entities
      xml_named_entities.each do |ent|
        entity = Tcf2Nif::NamedEntityAnnotation.new(@doc)
        entity.category = ent['class']
        tokens_for_ids(ent['tokenIDs']).each { |token| entity << token }
        @named_entities << entity
      end
    end

    # Builds one GeoAnnotation per gpoint element. Latitude, longitude and
    # altitude are converted with to_f.
    def process_geo_annotations
      xml_geo_annotations.each do |anno|
        geo = Tcf2Nif::GeoAnnotation.new(@doc)
        geo.lat = anno['lat'].to_f
        geo.lon = anno['lon'].to_f
        geo.alt = anno['alt'].to_f
        geo.continent = anno['continent']
        tokens_for_ids(anno['tokenIDs']).each { |token| geo << token }
        @geo_annotations << geo
      end
    end

    # Records the function label of every dependency that has a governor:
    #   <tc:dependency govIDs="t_4" depIDs="t_2" func="aux"/>
    # Root dependencies have no 'govIDs' attribute and are not recorded:
    #   <tc:dependency depIDs="t_4" func="ROOT"/>
    def process_dependencies
      xml_dependencies.each do |dep|
        next unless dep.has_attribute?('govIDs')

        dependent = token_for_id(dep['depIDs'])
        governor = token_for_id(dep['govIDs'])
        @dependency_map[[dependent, governor]] = dep['func']
      end
    end

  end

end