tcf2nif 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/.gitlab-ci.yml +25 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +165 -0
- data/README.md +112 -0
- data/Rakefile +6 -0
- data/Tcf2Nif.ipynb +197 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/exe/convpar +7 -0
- data/exe/createturtle +32 -0
- data/exe/tcf2nif +29 -0
- data/exe/txt2tcf +30 -0
- data/lib/tcf2nif.rb +55 -0
- data/lib/tcf2nif/annotation.rb +39 -0
- data/lib/tcf2nif/bounded_element.rb +43 -0
- data/lib/tcf2nif/geo_annotation.rb +26 -0
- data/lib/tcf2nif/named_entity_annotation.rb +28 -0
- data/lib/tcf2nif/tcf_document.rb +228 -0
- data/lib/tcf2nif/token.rb +47 -0
- data/lib/tcf2nif/transformer.rb +352 -0
- data/lib/tcf2nif/version.rb +3 -0
- data/tcf2nif.gemspec +40 -0
- metadata +200 -0
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Interactive console with the tcf2nif gem preloaded.
require 'bundler/setup'
require 'tcf2nif'

# Fixtures or other initialization code can be placed here to make
# experimenting with the gem easier.
#
# To use Pry instead of IRB, add pry to the Gemfile and replace the two
# lines at the bottom with:
#   require "pry"
#   Pry.start

require 'irb'
IRB.start
|
data/bin/setup
ADDED
data/exe/convpar
ADDED
data/exe/createturtle
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#! /usr/bin/env bash
# Converts the example N-Triples outputs below spec/out into Turtle with the
# `rapper` command line tool.
#
# All paths are relative to the current working directory: the script has to
# be started from a directory that has `spec` as a sibling (for example exe/).

# Namespace declarations passed to rapper (one -f option each) for every
# conversion.
PREFIX_FEATURES=(
  -f 'xmlns:ex="http://example.org/tcf2nif/example.txt#"'
  -f 'xmlns:xsd="http://www.w3.org/2001/XMLSchema#"'
  -f 'xmlns:prov="http://www.w3.org/ns/prov#"'
  -f 'xmlns:mond="http://petermenke.de/mond#"'
  -f 'xmlns:nif="http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"'
)

# to_turtle SOURCE TARGET
# Reads ./SOURCE as N-Triples and writes the Turtle serialization to ./TARGET.
to_turtle() {
  rapper "./$1" -i ntriples -o turtle "${PREFIX_FEATURES[@]}" > "./$2"
}

# Stop if the directory is missing; otherwise the redirections below would
# create or truncate .ttl files in whatever directory we happen to be in.
cd ../spec/out/plainprov || exit 1
to_turtle phantom.nt phantom.ttl
to_turtle screw.nt screw.ttl
to_turtle miserables.nt miserables.ttl

cd ../modularized || exit 1
# The input carries an .n3 extension but is still parsed with -i ntriples.
to_turtle phantom.n3 phantom.ttl
|
data/exe/tcf2nif
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line front end: reads a TCF file, transforms it with
# Tcf2Nif::Transformer and writes the resulting graph to the output file.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates NIF documents from a TCF file (generated by WebLicht)'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :format, "The output file format", type: :string, default: 'n3'
end
puts opts

# Without both file names there is nothing to do: show the help text.
unless opts[:input_file] && opts[:output_file] && opts[:format]
  Trollop::educate
  exit(1)
end

# The block form closes the input file as soon as the graph has been built,
# also when parsing or transforming raises.
graph = File.open(opts[:input_file], 'r') do |tcf_file|
  tcf_doc = Tcf2Nif::TcfDocument.new(tcf_file)
  Tcf2Nif::Transformer.new(tcf_doc, {}).transform(:noprov)
end

# NOTE(review): the writer format is fixed to :ntriples; the --format option
# is parsed above but not used to select the serialization. Confirm whether
# that is intended before wiring it through.
RDF::Writer.open(opts[:output_file], :format => :ntriples) do |writer|
  writer << RDF::Repository.new do |repo|
    repo << graph
  end
end
|
data/exe/txt2tcf
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line front end: posts a plain text file together with a chain
# description to the WebLicht WaaS endpoint via curl and stores the response
# in the output file.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
require 'tcf2nif'
require 'trollop'

opts = Trollop::options do
  synopsis 'Creates TCF files via the WebLicht service from a plain text file'
  usage "Usage: #{__FILE__} sourcefile [OPTIONS]"
  opt :input_file, "The file containing the plain text document to be converted", type: :string
  opt :output_file, "The file name to be used for the output file", type: :string
  opt :token, "The WebLicht WaaS API token", type: :string
  opt :chain, "The chain description XML document", type: :string
end

# Input, output and chain are all required: show the help text otherwise.
unless opts[:input_file] && opts[:output_file] && opts[:chain]
  Trollop::educate
  exit(1)
end

# The token given on the command line wins over the environment variable.
waas_token = opts[:token] || ENV['WAAS_TOKEN']
abort 'No WaaS API token given: use --token or set WAAS_TOKEN.' unless waas_token

service_uri = "https://weblicht.sfs.uni-tuebingen.de/WaaS/api/1.0/chain/process"

# Echo the equivalent shell command line (unchanged from earlier versions).
puts "curl -X POST -F chains=@#{opts[:chain]} -F content=@#{opts[:input_file]} -F apikey=#{waas_token} #{service_uri} > #{opts[:output_file]}"

# Run curl with an argument vector instead of an interpolated shell string,
# so file names containing spaces or shell metacharacters are passed through
# literally. Standard output is redirected to the output file by Ruby.
curl_command = [
  'curl', '-X', 'POST',
  '-F', "chains=@#{opts[:chain]}",
  '-F', "content=@#{opts[:input_file]}",
  '-F', "apikey=#{waas_token}",
  service_uri
]
exit(1) unless system(*curl_command, out: opts[:output_file])
|
data/lib/tcf2nif.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
require 'nokogiri'
|
21
|
+
require 'rdf'
|
22
|
+
require 'rdf/turtle'
|
23
|
+
require "tcf2nif/version"
|
24
|
+
require "tcf2nif/bounded_element"
|
25
|
+
require "tcf2nif/annotation"
|
26
|
+
require "tcf2nif/token"
|
27
|
+
require "tcf2nif/named_entity_annotation"
|
28
|
+
require "tcf2nif/geo_annotation"
|
29
|
+
require "tcf2nif/tcf_document"
|
30
|
+
require "tcf2nif/transformer"
|
31
|
+
|
32
|
+
module Tcf2Nif

  # RDF vocabularies referenced by the gem, one per namespace URI.
  NIF  = RDF::Vocabulary.new("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
  PENN = RDF::Vocabulary.new("http://purl.org/olia/penn.owl#")
  NERD = RDF::Vocabulary.new("http://nerd.eurecom.fr/ontology#")
  GEO  = RDF::Vocabulary.new("http://www.w3.org/2003/01/geo/wgs84_pos#")
  PROV = RDF::Vocabulary.new("http://www.w3.org/ns/prov#")
  MOND = RDF::Vocabulary.new("http://petermenke.de/mond#")

  # Prefix name => vocabulary table. PROV is defined above but is not part of
  # this table.
  STANDARD_PREFIXES = {
    nif:  NIF,
    rdfs: RDF::RDFS,
    xsd:  RDF::XSD,
    penn: PENN,
    geo:  GEO,
    nerd: NERD,
    mond: MOND
  }

  class << self

    # Absolute path two levels above this file.
    def root
      File.expand_path(File.join('..', '..'), __FILE__)
    end

  end

end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # An annotation that groups an ordered list of tokens.
  class Annotation

    # The tokens collected so far, in insertion order. This is the live
    # internal array, not a copy.
    attr_reader :tokens

    # tcf_document is stored on the instance; nothing in this class reads it
    # back.
    def initialize(tcf_document)
      @tcf_document = tcf_document
      @tokens = []
    end

    # Appends a token to the annotation. Returns the token array (the result
    # of Array#<<), as earlier versions did.
    def <<(token)
      @tokens << token
    end

  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Base class for elements that span the range from begin_index to end_index.
  class BoundedElement

    attr_accessor :begin_index, :end_index

    # Sets both indices at once: begin_index from the first and end_index
    # from the last element of new_boundaries (e.g. a two-element array).
    def boundaries=(new_boundaries)
      @begin_index = new_boundaries.first
      @end_index = new_boundaries.last
    end

    # true when both indices have been set, false otherwise. Always returns a
    # real boolean rather than nil or the end index.
    def boundaries?
      !!(@begin_index && @end_index)
    end

    # Difference between end_index and begin_index. Raises NoMethodError when
    # the boundaries have not been set yet.
    def length
      end_index - begin_index
    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Annotation subclass with readable and writable lat, lon, alt and
  # continent attributes.
  class GeoAnnotation < Annotation

    attr_accessor :lat
    attr_accessor :lon
    attr_accessor :alt
    attr_accessor :continent

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Annotation subclass with a readable and writable category attribute.
  class NamedEntityAnnotation < Annotation

    attr_accessor :category

  end

end
|
@@ -0,0 +1,228 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# This file is part of the tcf2nif gem.
|
3
|
+
# Copyright (c) 2015 Peter Menke, SFB 673, Universität Bielefeld
|
4
|
+
# http://www.sfb673.org
|
5
|
+
#
|
6
|
+
# tcf2nif is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as
|
8
|
+
# published by the Free Software Foundation, either version 3 of
|
9
|
+
# the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# tcf2nif is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with tcf2nif. If not, see
|
18
|
+
# <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
module Tcf2Nif

  # Object view of a TCF XML document: parses the XML once and exposes its
  # tokens, named entities, geo annotations and dependency relations as Ruby
  # objects.
  class TcfDocument

    # Namespace binding used by every XPath query in this class.
    TC_NAMESPACE = { 'tc' => 'http://www.dspin.de/data/textcorpus' }.freeze

    # tokens          - Token objects in document order
    # named_entities  - NamedEntityAnnotation objects
    # geo_annotations - GeoAnnotation objects
    # id_map          - XML token ID => Token
    # token_map       - Token => XML token ID
    # dependency_map  - [dependent Token, governor Token] => func attribute
    attr_reader :tokens, :named_entities, :geo_annotations,
                :id_map, :token_map, :dependency_map

    # Parses the XML read from io and fills all collections. The io is only
    # used during this call.
    #
    # Raises ArgumentError when character offsets have to be computed and a
    # token form cannot be located in the document text.
    def initialize(io)
      @doc = Nokogiri::XML(io)
      # TODO add a method that reads the XML into Ruby structures
      @tokens = []
      @named_entities = []
      @geo_annotations = []
      @id_map = {}
      @token_map = {}
      @dependency_map = {}

      process_tokens
      # Offsets are only derived from the text when at least one token came
      # without start/end attributes.
      calculate_character_offsets unless @tokens.all? { |t| t.boundaries? }

      process_pos
      process_lemma

      process_named_entities
      process_geo_annotations
      process_dependencies
    end

    # Assigns character boundaries to every token by searching each token
    # form in the document text, starting where the previous token ended.
    # Existing boundaries are overwritten.
    #
    # Raises ArgumentError when a form does not occur at or after the current
    # position (previously this surfaced as a NoMethodError on nil).
    def calculate_character_offsets
      cursor = 0
      tokens.each do |token|
        start = text.index(token.form, cursor)
        if start.nil?
          raise ArgumentError,
                "token #{token.form.inspect} not found in document text at or after offset #{cursor}"
        end
        stop = start + token.form.length
        token.boundaries = [start, stop]
        cursor = stop
      end
    end

    # String form of the first text node below tc:text, or "" when there is
    # none. The value is memoized.
    #
    # NOTE(review): `first.to_s` serializes only the first text node and may
    # return XML-escaped characters (e.g. for "&"); confirm that offsets are
    # still right for such texts.
    def text
      @text ||= @doc.xpath('.//tc:text/text()', TC_NAMESPACE).first.to_s
    end

    # Appends an annotation object to named_entities.
    def store_named_entity(named_entity_object)
      @named_entities << named_entity_object
    end

    # The tc:sentence elements (memoized node set).
    def xml_sentences
      # /wl:D-Spin/tc:TextCorpus[1]/tc:text[1]
      @xml_sentences ||= @doc.xpath('//tc:sentences/tc:sentence', TC_NAMESPACE)
    end

    # The tc:token elements (memoized node set).
    def xml_tokens
      @xml_tokens ||= @doc.xpath('//tc:tokens/tc:token', TC_NAMESPACE)
    end

    # The tc:entity elements below tc:namedEntities (memoized node set).
    def xml_named_entities
      @xml_named_entities ||= @doc.xpath('//tc:namedEntities/tc:entity', TC_NAMESPACE)
    end

    # The tc:gpoint elements below tc:geo (memoized node set).
    def xml_geo_annotations
      @xml_geo_annotations ||= @doc.xpath('//tc:geo/tc:gpoint', TC_NAMESPACE)
    end

    # The tc:dependency elements below tc:depparsing/tc:parse (memoized).
    def xml_dependencies
      @xml_dependencies ||= @doc.xpath('//tc:depparsing/tc:parse/tc:dependency', TC_NAMESPACE)
    end

    # TODO add deep support for sentences and related tokens

    # Builds a Token for xml_token. When the element has both a start and an
    # end attribute, they become the token boundaries (converted with to_i).
    def new_token(doc, xml_token)
      token_object = Tcf2Nif::Token.new(doc, xml_token)
      if xml_token.has_attribute?('start') && xml_token.has_attribute?('end')
        token_object.boundaries = [xml_token['start'].to_i, xml_token['end'].to_i]
      end
      token_object
    end

    # Registers a token in the token list and in both lookup tables, keyed by
    # the ID attribute of xml_token.
    def store_token(token_object, xml_token)
      @tokens << token_object
      @id_map[xml_token['ID']] = token_object
      @token_map[token_object] = xml_token['ID']
    end

    # Token registered under xml_id, or nil.
    def token_for_id(xml_id)
      @id_map[xml_id]
    end

    # XML ID under which token was registered, or nil.
    def id_for_token(token)
      @token_map[token]
    end

    private

    # Creates and registers one Token per tc:token element.
    def process_tokens
      xml_tokens.each do |xml_token|
        store_token(new_token(@doc, xml_token), xml_token)
      end
    end

    # Copies the text of each POS tag element to the tokens it references.
    def process_pos
      each_token_value('//tc:POStags/tc:tag') { |token, value| token.pos = value }
    end

    # Copies the text of each lemma element to the tokens it references.
    def process_lemma
      each_token_value('//tc:lemmas/tc:lemma') { |token, value| token.lemma = value }
    end

    # Builds one NamedEntityAnnotation per tc:entity element, with the class
    # attribute as category and the referenced tokens attached.
    def process_named_entities
      xml_named_entities.each do |entity|
        annotation = Tcf2Nif::NamedEntityAnnotation.new(@doc)
        annotation.category = entity['class']
        tokens_for_ids(entity['tokenIDs']).each { |token| annotation << token }
        store_named_entity(annotation)
      end
    end

    # Builds one GeoAnnotation per tc:gpoint element. lat, lon and alt are
    # converted with to_f, so a missing attribute becomes 0.0.
    def process_geo_annotations
      xml_geo_annotations.each do |point|
        annotation = Tcf2Nif::GeoAnnotation.new(@doc)
        annotation.lat = point['lat'].to_f
        annotation.lon = point['lon'].to_f
        annotation.alt = point['alt'].to_f
        annotation.continent = point['continent']
        tokens_for_ids(point['tokenIDs']).each { |token| annotation << token }
        @geo_annotations << annotation
      end
    end

    # Records every dependency that has a governor:
    #   <tc:dependency govIDs="t_4" depIDs="t_2" func="aux"/>
    # Root dependencies (no govIDs attribute) are not recorded:
    #   <tc:dependency depIDs="t_4" func="ROOT"/>
    # Pairs whose token IDs cannot be resolved are skipped instead of being
    # stored with nil members.
    def process_dependencies
      xml_dependencies.each do |dep|
        next unless dep.has_attribute?('govIDs')

        dependent = token_for_id(dep['depIDs'])
        governor = token_for_id(dep['govIDs'])
        next unless dependent && governor

        @dependency_map[[dependent, governor]] = dep['func']
      end
    end

    # Yields (token, element text) for every token referenced by the tokenIDs
    # attribute of each element matching xpath.
    def each_token_value(xpath)
      @doc.xpath(xpath, TC_NAMESPACE).each do |node|
        value = node.text
        tokens_for_ids(node['tokenIDs']).each { |token| yield token, value }
      end
    end

    # Resolves a whitespace-separated ID list to the registered tokens.
    # A missing attribute (nil), blank entries and unknown IDs are ignored.
    def tokens_for_ids(id_list)
      id_list.to_s.split.map { |id| token_for_id(id) }.compact
    end

  end

end
|