tcf2nif 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/.gitlab-ci.yml +25 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +165 -0
- data/README.md +112 -0
- data/Rakefile +6 -0
- data/Tcf2Nif.ipynb +197 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/exe/convpar +7 -0
- data/exe/createturtle +32 -0
- data/exe/tcf2nif +29 -0
- data/exe/txt2tcf +30 -0
- data/lib/tcf2nif.rb +55 -0
- data/lib/tcf2nif/annotation.rb +39 -0
- data/lib/tcf2nif/bounded_element.rb +43 -0
- data/lib/tcf2nif/geo_annotation.rb +26 -0
- data/lib/tcf2nif/named_entity_annotation.rb +28 -0
- data/lib/tcf2nif/tcf_document.rb +228 -0
- data/lib/tcf2nif/token.rb +47 -0
- data/lib/tcf2nif/transformer.rb +352 -0
- data/lib/tcf2nif/version.rb +3 -0
- data/tcf2nif.gemspec +40 -0
- metadata +200 -0
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the bundle and the tcf2nif gem, then opens an
# interactive IRB session. Fixtures or other initialization code can be added
# below the requires to make experimenting with the gem easier.
require 'bundler/setup'
require 'tcf2nif'

# A different console may be used instead. For Pry, add pry to the Gemfile
# and enable the two lines below:
# require "pry"
# Pry.start

require 'irb'
IRB.start
|
data/bin/setup
ADDED
data/exe/convpar
ADDED
data/exe/createturtle
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#! /usr/bin/env bash
# Converts the N-Triples example outputs below ../spec/out into Turtle using
# rapper. All paths are relative to the current working directory, exactly as
# before, so the script is expected to be started from a sibling directory of
# spec/.

# Serialises one N-Triples file as Turtle, declaring the namespace prefixes
# used by the generated graphs.
#   $1 - input file (parsed as N-Triples)
#   $2 - output file (Turtle)
to_turtle() {
  rapper "$1" -i ntriples -o turtle \
    -f 'xmlns:ex="http://example.org/tcf2nif/example.txt#"' \
    -f 'xmlns:xsd="http://www.w3.org/2001/XMLSchema#"' \
    -f 'xmlns:prov="http://www.w3.org/ns/prov#"' \
    -f 'xmlns:mond="http://petermenke.de/mond#"' \
    -f 'xmlns:nif="http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"' \
    > "$2"
}

# Stop if the directory is missing; otherwise rapper would run in the wrong
# place and leave empty .ttl files behind.
cd ../spec/out/plainprov || exit 1
to_turtle ./phantom.nt ./phantom.ttl
to_turtle ./screw.nt ./screw.ttl
to_turtle ./miserables.nt ./miserables.ttl

cd ../modularized || exit 1
# The .n3 file is still read with the N-Triples parser, as in the original script.
to_turtle ./phantom.n3 ./phantom.ttl
|
data/exe/tcf2nif
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line front end: reads a TCF file, transforms it into an RDF graph
# with Tcf2Nif::Transformer and writes the graph to the output file.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates NIF documents from a TCF file (generated by WebLicht)'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :format, "The output file format", type: :string, default: 'n3'
end
puts opts

if opts[:input_file] && opts[:output_file] && opts[:format]
  # The block form closes the input file as soon as the graph has been built;
  # previously the handle stayed open until the process exited.
  graph = File.open(opts[:input_file], 'r') do |tcf_file|
    tcf_doc = Tcf2Nif::TcfDocument.new(tcf_file)
    Tcf2Nif::Transformer.new(tcf_doc, {}).transform(:noprov)
  end

  # NOTE(review): the --format option is only checked for presence; the
  # output is always serialised as N-Triples, as before.
  RDF::Writer.open(opts[:output_file], :format => :ntriples) do |writer|
    writer << RDF::Repository.new do |repo|
      repo << graph
    end
  end
else
  # NOTE(review): Trollop::educate presumably terminates the process itself,
  # which would make the exit(1) below unreachable -- confirm against Trollop.
  Trollop::educate
  exit(1)
end
|
data/exe/txt2tcf
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line front end: uploads a plain text file and a chain description
# to the WebLicht WaaS service with curl and stores the response (a TCF
# document) in the output file.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates TCF files via the WebLicht service from a plain text file'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :token, "The WebLicht WaaS API token", type: :string
  opt :chain, "The chain description XML document", type: :string
end

if opts[:input_file] && opts[:output_file] && opts[:chain]
  # The token given on the command line wins; otherwise fall back to the
  # WAAS_TOKEN environment variable.
  waas_token = opts[:token] || ENV['WAAS_TOKEN']
  service_uri = "https://weblicht.sfs.uni-tuebingen.de/WaaS/api/1.0/chain/process"

  chain_field = "chains=@#{opts[:chain]}"
  content_field = "content=@#{opts[:input_file]}"

  # Echo the command for the user, but never print the API token itself.
  puts "curl -X POST -F #{chain_field} -F #{content_field} -F apikey=*** #{service_uri} > #{opts[:output_file]}"

  # Pass curl an argument vector instead of a shell string, so that file
  # names or tokens containing shell metacharacters cannot inject commands.
  # The :out option redirects curl's stdout into the output file (created or
  # truncated), replacing the former shell redirection.
  curl_ok = system('curl', '-X', 'POST',
                   '-F', chain_field,
                   '-F', content_field,
                   '-F', "apikey=#{waas_token}",
                   service_uri,
                   out: opts[:output_file])

  # Report a failed or missing curl through the exit status instead of
  # silently ignoring it.
  exit(1) unless curl_ok
else
  Trollop::educate
  exit(1)
end
|
data/lib/tcf2nif.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
require 'nokogiri'
|
21
|
+
require 'rdf'
|
22
|
+
require 'rdf/turtle'
|
23
|
+
require "tcf2nif/version"
|
24
|
+
require "tcf2nif/bounded_element"
|
25
|
+
require "tcf2nif/annotation"
|
26
|
+
require "tcf2nif/token"
|
27
|
+
require "tcf2nif/named_entity_annotation"
|
28
|
+
require "tcf2nif/geo_annotation"
|
29
|
+
require "tcf2nif/tcf_document"
|
30
|
+
require "tcf2nif/transformer"
|
31
|
+
|
32
|
+
module Tcf2Nif

  # RDF vocabularies referenced by the gem.
  NIF  = RDF::Vocabulary.new("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
  PENN = RDF::Vocabulary.new("http://purl.org/olia/penn.owl#")
  NERD = RDF::Vocabulary.new("http://nerd.eurecom.fr/ontology#")
  GEO  = RDF::Vocabulary.new("http://www.w3.org/2003/01/geo/wgs84_pos#")
  PROV = RDF::Vocabulary.new("http://www.w3.org/ns/prov#")
  MOND = RDF::Vocabulary.new("http://petermenke.de/mond#")

  # Prefix name => vocabulary table. PROV is defined above but is not part
  # of this table.
  STANDARD_PREFIXES = {
    :nif  => NIF,
    :rdfs => RDF::RDFS,
    :xsd  => RDF::XSD,
    :penn => PENN,
    :geo  => GEO,
    :nerd => NERD,
    :mond => MOND
  }

  class << self
    # Absolute path of the directory two levels above this file.
    def root
      File.expand_path(File.join('..', '..'), __FILE__)
    end
  end

end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Base class for annotations that span one or more tokens.
  class Annotation

    # The tokens collected so far, in insertion order.
    attr_reader :tokens

    # tcf_document - the document object this annotation belongs to. It is
    #                only stored; this class never reads it back.
    def initialize(tcf_document)
      @tcf_document = tcf_document
      @tokens = []
    end

    # Appends a token to the annotation.
    # Returns the token array, so calls can be chained like Array#<<.
    def <<(token)
      @tokens << token
    end

  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # An element with a begin and an end index.
  class BoundedElement

    attr_accessor :begin_index, :end_index

    # Sets both indices at once.
    # new_boundaries - any object responding to #first and #last, e.g.
    #                  [begin, end] or a Range.
    def boundaries=(new_boundaries)
      @begin_index = new_boundaries.first
      @end_index = new_boundaries.last
    end

    # Returns true when both indices are set, false otherwise. It used to
    # leak nil or the end index; every result keeps its former truthiness.
    def boundaries?
      @begin_index && @end_index ? true : false
    end

    # Difference between end and begin index.
    # Raises NoMethodError while either index is still unset.
    def length
      end_index - begin_index
    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif
  # Token annotation that additionally carries geographic attributes.
  class GeoAnnotation < Annotation

    # Latitude and longitude.
    attr_accessor :lat
    attr_accessor :lon

    # Altitude and continent.
    attr_accessor :alt
    attr_accessor :continent

  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Token annotation that additionally carries a named-entity category.
  class NamedEntityAnnotation < Annotation

    # Reader for the entity category.
    def category
      @category
    end

    # Writer for the entity category.
    def category=(value)
      @category = value
    end

  end

end
|
@@ -0,0 +1,228 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Ruby-side view of a TCF XML document. On construction the XML is parsed
  # and its tokens, POS tags, lemmas, named entities, geo annotations and
  # dependencies are read into plain Ruby collections.
  class TcfDocument

    # Namespace prefixes registered for every XPath query.
    NAMESPACES = {
      'wl' => 'http://www.dspin.de/data',
      'tc' => 'http://www.dspin.de/data/textcorpus'
    }.freeze

    # tokens          - Array of token objects in document order
    # named_entities  - Array of NamedEntityAnnotation
    # geo_annotations - Array of GeoAnnotation
    # id_map          - Hash: XML token ID => token object
    # token_map       - Hash: token object => XML token ID
    # dependency_map  - Hash: [dependent token, governor token] => function label
    attr_reader :tokens, :named_entities, :geo_annotations,
                :id_map, :token_map, :dependency_map

    # io - an IO or String containing the TCF XML; it is handed to
    #      Nokogiri::XML as is.
    def initialize(io)
      @doc = Nokogiri::XML(io)
      # TODO add a method that reads the XML into Ruby structures
      @tokens = []
      @named_entities = []
      @geo_annotations = []
      @id_map = {}
      @token_map = {}
      @dependency_map = {}

      process_tokens
      # Offsets are computed from the text unless every token already has them.
      calculate_character_offsets unless @tokens.all?(&:boundaries?)

      process_pos
      process_lemma

      process_named_entities
      process_geo_annotations
      process_dependencies
    end

    # Assigns character boundaries to every token by searching for each
    # token form in the document text, starting where the previous token
    # ended. Boundaries that were already set are overwritten.
    #
    # Raises RuntimeError naming the token when a form cannot be found.
    # This used to surface as an opaque NoMethodError on nil.
    def calculate_character_offsets
      cursor = 0
      tokens.each do |token|
        start = text.index(token.form, cursor)
        unless start
          raise "Token #{token.form.inspect} not found in document text at or after offset #{cursor}"
        end
        cursor = start + token.form.length
        token.boundaries = [start, cursor]
      end
    end

    # String form of the first text node below tc:text, memoized. Returns
    # an empty string when there is no such node.
    # NOTE(review): #to_s on a Nokogiri text node presumably yields the
    # serialised form with XML entities still escaped -- confirm that this
    # matches Token#form, otherwise the offsets are shifted.
    def text
      @text ||= @doc.xpath('.//tc:text/text()', NAMESPACES).first.to_s
    end

    # Adds an already built named entity annotation to the document.
    def store_named_entity(named_entity_object)
      @named_entities << named_entity_object
    end

    # The xml_* readers return the matching XML nodes, memoized.

    def xml_sentences
      # /wl:D-Spin/tc:TextCorpus[1]/tc:text[1]
      @xml_sentences ||= @doc.xpath('//tc:sentences/tc:sentence', NAMESPACES)
    end

    def xml_tokens
      @xml_tokens ||= @doc.xpath('//tc:tokens/tc:token', NAMESPACES)
    end

    def xml_named_entities
      @xml_named_entities ||= @doc.xpath('//tc:namedEntities/tc:entity', NAMESPACES)
    end

    def xml_geo_annotations
      @xml_geo_annotations ||= @doc.xpath('//tc:geo/tc:gpoint', NAMESPACES)
    end

    def xml_dependencies
      @xml_dependencies ||= @doc.xpath('//tc:depparsing/tc:parse/tc:dependency', NAMESPACES)
    end

    # TODO add deep support for sentences and related tokens

    # Builds a token object for xml_token. Boundaries are taken from the
    # 'start' and 'end' attributes when both are present.
    def new_token(doc, xml_token)
      token_object = Tcf2Nif::Token.new(doc, xml_token)
      if xml_token.has_attribute?('start') && xml_token.has_attribute?('end')
        token_object.boundaries = [xml_token['start'].to_i, xml_token['end'].to_i]
      end
      token_object
    end

    # Registers a token in the token list and in both lookup tables.
    def store_token(token_object, xml_token)
      @tokens << token_object
      @id_map[xml_token['ID']] = token_object
      @token_map[token_object] = xml_token['ID']
    end

    # Token object for an XML token ID, or nil when the ID is unknown.
    def token_for_id(xml_id)
      @id_map[xml_id]
    end

    # XML token ID for a token object, or nil when the token is unknown.
    def id_for_token(token)
      @token_map[token]
    end

    private

    def process_tokens
      xml_tokens.each do |xml_token|
        store_token(new_token(@doc, xml_token), xml_token)
      end
    end

    def process_pos
      assign_token_attribute('//tc:POStags/tc:tag', :pos=)
    end

    def process_lemma
      assign_token_attribute('//tc:lemmas/tc:lemma', :lemma=)
    end

    # Copies the text of every node matching xpath onto the tokens listed
    # in its tokenIDs attribute, using the given setter. The ID list is
    # split on whitespace, like the entity and geo processors do.
    def assign_token_attribute(xpath, setter)
      @doc.xpath(xpath, NAMESPACES).each do |node|
        value = node.text
        tokens_for_ids(node['tokenIDs']).each do |token|
          token.public_send(setter, value)
        end
      end
    end

    # Resolves a whitespace separated ID list to token objects. A missing
    # list yields an empty array, and unknown IDs are dropped so that no
    # nil ends up in an annotation.
    def tokens_for_ids(id_list)
      id_list.to_s.split.map { |xml_id| token_for_id(xml_id) }.compact
    end

    # NOTE(review): annotations receive the raw XML document (@doc), not
    # this TcfDocument -- confirm that this is what Annotation expects.
    def process_named_entities
      xml_named_entities.each do |entity|
        annotation = Tcf2Nif::NamedEntityAnnotation.new(@doc)
        annotation.category = entity['class']
        tokens_for_ids(entity['tokenIDs']).each { |token| annotation << token }
        @named_entities << annotation
      end
    end

    def process_geo_annotations
      xml_geo_annotations.each do |point|
        geo = Tcf2Nif::GeoAnnotation.new(@doc)
        geo.lat = point['lat'].to_f
        geo.lon = point['lon'].to_f
        geo.alt = point['alt'].to_f
        geo.continent = point['continent']
        tokens_for_ids(point['tokenIDs']).each { |token| geo << token }
        @geo_annotations << geo
      end
    end

    # Records the function label of every dependency that names a governor:
    #   <tc:dependency govIDs="t_4" depIDs="t_2" func="aux"/>
    # Root dependencies carry no govIDs and are skipped:
    #   <tc:dependency depIDs="t_4" func="ROOT"/>
    def process_dependencies
      xml_dependencies.each do |dep|
        next unless dep.has_attribute?('govIDs')

        dependent = token_for_id(dep['depIDs'])
        governor = token_for_id(dep['govIDs'])
        @dependency_map[[dependent, governor]] = dep['func']
      end
    end

  end

end
|