treebank-transform 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +46 -0
- data/Rakefile +7 -0
- data/bin/treebank-transform +7 -0
- data/examples/cicero_catilina_sample.xml +6889 -0
- data/lib/treebank/alphabet.rb +11 -0
- data/lib/treebank/elliptic_word.rb +42 -0
- data/lib/treebank/sentence.rb +69 -0
- data/lib/treebank/transform/cli.rb +15 -0
- data/lib/treebank/transform/version.rb +5 -0
- data/lib/treebank/transform.rb +36 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/treebank/transform_spec.rb +327 -0
- data/treebank-transform.gemspec +28 -0
- metadata +149 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
module Treebank
  # Wraps one word node whose +relation+ attribute may encode a reference to an
  # elided head (e.g. "PNOM_ExD0_ADV") and rewrites the node so that it points
  # at an explicit artificial node instead.
  #
  # The word node only has to support +[]+ and +[]=+ with String keys; the
  # sentence has to respond to +elliptic_nodes+ (a Hash-like lookup keyed by the
  # ellipsis identifier) and +add_ellipsis(attrs, string)+ (returning the newly
  # created node).
  class EllipticWord
    # @param word_node [#[], #[]=] the word whose relation/head get rewritten
    # @param sentence  [#elliptic_nodes, #add_ellipsis] owner of the word
    def initialize(word_node, sentence)
      @node = word_node
      @sentence = sentence
    end

    # Resolves the elliptic reference in the word's relation, if there is one.
    #
    # A relation such as "SBJ_ExD2_OBJ_ExD1_ADV" is split into the word's own
    # label ("SBJ"), the ellipsis identifier ("2") and the relation of the
    # elliptic node ("OBJ_ExD1_ADV"). The word is re-attached to the elliptic
    # node with that identifier, which is created on first use.
    #
    # @return [Object, nil] the id written to the word's +head+, or nil when
    #   the relation is absent or carries no elliptic reference (the node is
    #   left untouched in that case)
    def parse_elliptic_head
      # to_s turns a missing relation attribute (nil) into a harmless
      # non-match instead of a NoMethodError.
      match = @node['relation'].to_s.match(regexp)
      return unless match

      label, elliptic_string, elliptic_label = match.captures

      # Reuse an elliptic node already registered under this identifier;
      # otherwise create it as a child of this word's current head.
      head = @sentence.elliptic_nodes[elliptic_string] ||
             create_new_node(@node['head'], elliptic_label, elliptic_string)['id']

      @node['relation'] = label
      @node['head'] = head
    end

    private

    # Asks the sentence for a new artificial node and immediately resolves any
    # further elliptic reference contained in its own relation (chained
    # ellipses such as "OBJ_ExD1_ADV").
    def create_new_node(head, label, string)
      new_node = @sentence.add_ellipsis({
        artificial: 'elliptic',
        head: head,
        relation: label,
      }, string)

      EllipticWord.new(new_node, @sentence).parse_elliptic_head

      new_node
    end

    # Captures: (1) own label, lazily, so that the first "_ExD<n>_" wins,
    # (2) the ellipsis identifier, (3) everything after it.
    def regexp
      /(\w+?)_ExD(\d+)_(.+)/
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
module Treebank
  # Wraps a sentence node and appends artificial word nodes (ellipses) to it,
  # handing out consecutive ids that continue after the sentence's last word.
  class Sentence
    require "treebank/alphabet"

    # Maps an ellipsis identifier (the string passed to #add_ellipsis) to the
    # id of the artificial node created for it.
    attr_reader :elliptic_nodes

    # @param sentence_node the XML node of the sentence; it is queried with
    #   +xpath('word')+ and extended with +add_child+
    def initialize(sentence_node)
      @node = sentence_node
      @last_id = @next_id = last_id
      @elliptic_nodes = {}
    end

    # Appends a new artificial word to the sentence and registers it under
    # +string+ in #elliptic_nodes.
    #
    # @param attrs  [Hash] extra attributes; they override the generated
    #   +id+, +insertion_id+ and +form+ on key collision
    # @param string [String] ellipsis identifier, also used as "[string]" form
    # @return the newly created word node
    def add_ellipsis(attrs, string)
      id = next_id
      all_attrs = {
        id: id,
        insertion_id: get_insertion_id,
        form: "[#{string}]"
      }.merge(attrs)

      new_node = new_word(all_attrs)
      @elliptic_nodes[string] = id

      # Surround the new word with whitespace text nodes so the serialized
      # document keeps one word per line.
      @node.add_child(indent)
      @node.add_child(new_node)
      @node.add_child(new_line)
      new_node
    end

    private

    # Returns a fresh id for an artificial node.
    def next_id
      update_last_id
    end

    # Id of the last original word of the sentence (memoized).
    #
    # Falls back to 0 when the sentence has no word children or the last word
    # carries no id attribute, so that an empty sentence no longer raises a
    # NoMethodError on nil while the wrapper is being constructed.
    def last_id
      return @last_id if @last_id
      last_word = @node.xpath('word').last
      @last_id = last_word ? last_word['id'].to_i : 0
    end

    # Advances the running id counter and returns the new value.
    def update_last_id
      @next_id += 1
    end

    # Letter appended to insertion ids: 'e' for the first artificial node,
    # then whatever Alphabet.next_letter yields for each following one.
    def suffix
      @suffix = @suffix ? Alphabet.next_letter(@suffix) : 'e'
    end

    # Builds an insertion id from the zero-padded (4 digits) id of the last
    # original word plus the next suffix letter, e.g. "0015e".
    def get_insertion_id
      "#{last_id.to_s.rjust(4, '0')}#{suffix}"
    end

    # Creates a detached word node carrying the given attributes.
    def new_word(attrs)
      word = Nokogiri::XML::Node.new('word', @node)
      attrs.each { |k, v| word[k] = v }
      word
    end

    # NOTE(review): the whitespace literals in #indent and #new_line are kept
    # exactly as found; confirm them against the indentation expected in the
    # serialized output.
    def indent
      Nokogiri::XML::Text.new(" ", @node)
    end

    def new_line
      Nokogiri::XML::Text.new("\n ", @node)
    end
  end
end
|
|
69
|
+
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'treebank/transform'
|
|
2
|
+
require 'thor'
|
|
3
|
+
|
|
4
|
+
module Treebank
  class Transform
    # Command line front end built on Thor; exposes the transformation as the
    # "do" command.
    class CLI < Thor

      desc 'do FILE', 'transforms 1.5 Treebanks to the interim Arethusa format'
      # Reads FILE, runs its content through Transform and prints the
      # resulting XML to standard output.
      def do(file)
        xml = File.read(file)
        puts Transform.new(xml).transform
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require "treebank/transform/version"
|
|
2
|
+
require "nokogiri"
|
|
3
|
+
|
|
4
|
+
module Treebank
  require "treebank/sentence"
  require "treebank/elliptic_word"

  # Parses a treebank XML document and rewrites elliptic head references
  # encoded in word relations (e.g. "PNOM_ExD0_ADV") into explicit artificial
  # word nodes.
  class Transform
    # @param doc [String, IO] the XML source handed to Nokogiri::XML
    def initialize(doc)
      @doc = Nokogiri::XML(doc)
    end

    # Applies the transformation in place and serializes the document.
    #
    # @return [String] the resulting XML
    def transform
      transform_elliptic_nodes
      @doc.to_xml(indent: 2)
    end

    private

    # Walks every word of every sentence and lets EllipticWord resolve the
    # ones whose relation mentions an ellipsis.
    def transform_elliptic_nodes
      @doc.xpath('//treebank/sentence').each do |sentence_node|
        sentence = Sentence.new(sentence_node)
        sentence_node.xpath('word').each do |word_node|
          next unless has_elliptic_head(word_node['relation'])

          EllipticWord.new(word_node, sentence).parse_elliptic_head
        end
      end
    end

    # Truthy when the relation label contains an "ExD<n>" marker.
    # to_s makes words without a relation attribute (nil label) simply not
    # match instead of raising NoMethodError.
    def has_elliptic_head(label)
      label.to_s =~ /ExD\d+/
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Spec bootstrap: coverage reporting is configured and started first, then the
# library is put on the load path and required, then RSpec is configured.
require 'simplecov'
require 'coveralls'

Coveralls.wear!

# Report coverage both as local HTML and to Coveralls.
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
  SimpleCov::Formatter::HTMLFormatter,
  Coveralls::SimpleCov::Formatter
]

# The specs themselves are excluded from the coverage figures.
SimpleCov.start { add_filter '/spec/' }

$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
require 'treebank/transform'

RSpec.configure do |rspec|
  # Run only examples tagged :focus, or everything when none are tagged.
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus
end
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Treebank::Transform do
|
|
4
|
+
# Guards against the VERSION constant being nil.
it 'has a version number' do
  expect(Treebank::Transform::VERSION).not_to be_nil
end
|
|
7
|
+
|
|
8
|
+
# Examples taken from http://nlp.perseus.tufts.edu/syntax/treebank/ldt/1.5/data/1999.02.0010.xml
|
|
9
|
+
|
|
10
|
+
let(:tb1) do
|
|
11
|
+
<<EOF
|
|
12
|
+
<?xml version="1.0"?>
|
|
13
|
+
<treebank>
|
|
14
|
+
<sentence id="2" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=1" span="quam0:eludet0">
|
|
15
|
+
<word id="1" form="quam" lemma="quam1" postag="d--------" head="2" relation="ADV"/>
|
|
16
|
+
<word id="2" form="diu" lemma="diu1" postag="d--------" head="8" relation="ADV"/>
|
|
17
|
+
<word id="3" form="etiam" lemma="etiam1" postag="c--------" head="8" relation="AuxY"/>
|
|
18
|
+
<word id="4" form="furor" lemma="furor2" postag="n-s---mn-" head="8" relation="SBJ"/>
|
|
19
|
+
<word id="5" form="iste" lemma="iste1" postag="p-s---mn-" head="4" relation="ATR"/>
|
|
20
|
+
<word id="6" form="tuus" lemma="tuus1" postag="a-s---mn-" head="4" relation="ATR"/>
|
|
21
|
+
<word id="7" form="nos" lemma="nos1" postag="p-p---ma-" head="8" relation="OBJ"/>
|
|
22
|
+
<word id="8" form="eludet" lemma="eludo1" postag="v3sfia---" head="0" relation="PRED"/>
|
|
23
|
+
</sentence>
|
|
24
|
+
</treebank>
|
|
25
|
+
EOF
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
let(:tb2) do
|
|
29
|
+
<<EOF
|
|
30
|
+
<?xml version="1.0"?>
|
|
31
|
+
<treebank>
|
|
32
|
+
<sentence id="126" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=7" span="si4:desinam0">
|
|
33
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="6" relation="AuxC"/>
|
|
34
|
+
<word id="2" form="est" lemma="sum1" postag="v3spia---" head="1" relation="ADV"/>
|
|
35
|
+
<word id="3" form="verus" lemma="verus1" postag="a-s---mn-" head="2" relation="PNOM"/>
|
|
36
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="1" relation="AuxX"/>
|
|
37
|
+
<word id="5" form="ne" lemma="ne1" postag="c--------" head="7" relation="AuxC"/>
|
|
38
|
+
<word id="6" form="opprimar" lemma="opprimo1" postag="v1spsp---" head="5" relation="ExD_CO"/>
|
|
39
|
+
<word id="7" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
|
|
40
|
+
<word id="8" form="sin" lemma="si1" postag="c--------" head="15" relation="AuxC"/>
|
|
41
|
+
<word id="9" form="falsus" lemma="falsus1" postag="a-s---mn-" head="8" relation="PNOM_ExD0_ADV"/>
|
|
42
|
+
<word id="10" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
|
43
|
+
<word id="11" form="ut" lemma="ut1" postag="c--------" head="7" relation="AuxC"/>
|
|
44
|
+
<word id="12" form="tandem" lemma="tandem1" postag="d--------" head="15" relation="AuxY"/>
|
|
45
|
+
<word id="13" form="aliquando" lemma="aliquando1" postag="d--------" head="15" relation="ADV"/>
|
|
46
|
+
<word id="14" form="timere" lemma="timeo1" postag="v--pna---" head="15" relation="OBJ"/>
|
|
47
|
+
<word id="15" form="desinam" lemma="desino1" postag="v1spsa---" head="11" relation="ExD_CO"/>
|
|
48
|
+
</sentence>
|
|
49
|
+
</treebank>
|
|
50
|
+
EOF
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
let(:tb2_result) do
|
|
54
|
+
<<EOF
|
|
55
|
+
<?xml version="1.0"?>
|
|
56
|
+
<treebank>
|
|
57
|
+
<sentence id="126" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=7" span="si4:desinam0">
|
|
58
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="6" relation="AuxC"/>
|
|
59
|
+
<word id="2" form="est" lemma="sum1" postag="v3spia---" head="1" relation="ADV"/>
|
|
60
|
+
<word id="3" form="verus" lemma="verus1" postag="a-s---mn-" head="2" relation="PNOM"/>
|
|
61
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="1" relation="AuxX"/>
|
|
62
|
+
<word id="5" form="ne" lemma="ne1" postag="c--------" head="7" relation="AuxC"/>
|
|
63
|
+
<word id="6" form="opprimar" lemma="opprimo1" postag="v1spsp---" head="5" relation="ExD_CO"/>
|
|
64
|
+
<word id="7" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
|
|
65
|
+
<word id="8" form="sin" lemma="si1" postag="c--------" head="15" relation="AuxC"/>
|
|
66
|
+
<word id="9" form="falsus" lemma="falsus1" postag="a-s---mn-" head="16" relation="PNOM"/>
|
|
67
|
+
<word id="10" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
|
68
|
+
<word id="11" form="ut" lemma="ut1" postag="c--------" head="7" relation="AuxC"/>
|
|
69
|
+
<word id="12" form="tandem" lemma="tandem1" postag="d--------" head="15" relation="AuxY"/>
|
|
70
|
+
<word id="13" form="aliquando" lemma="aliquando1" postag="d--------" head="15" relation="ADV"/>
|
|
71
|
+
<word id="14" form="timere" lemma="timeo1" postag="v--pna---" head="15" relation="OBJ"/>
|
|
72
|
+
<word id="15" form="desinam" lemma="desino1" postag="v1spsa---" head="11" relation="ExD_CO"/>
|
|
73
|
+
<word id="16" insertion_id="0015e" form="[0]" artificial="elliptic" head="8" relation="ADV"/>
|
|
74
|
+
</sentence>
|
|
75
|
+
</treebank>
|
|
76
|
+
EOF
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
let(:tb3) do
|
|
80
|
+
<<EOF
|
|
81
|
+
<?xml version="1.0"?>
|
|
82
|
+
<treebank>
|
|
83
|
+
<sentence id="95" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quid2:vero0">
|
|
84
|
+
<word id="1" form="quid" lemma="quis1" postag="p-s---nn-" head="0" relation="SBJ_ExD0_PRED"/>
|
|
85
|
+
<word id="2" form="vero" lemma="verus1" postag="d--------" head="0" relation="AuxY_ExD0_PRED"/>
|
|
86
|
+
</sentence>
|
|
87
|
+
</treebank>
|
|
88
|
+
EOF
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
let(:tb3_result) do
|
|
92
|
+
<<EOF
|
|
93
|
+
<?xml version="1.0"?>
|
|
94
|
+
<treebank>
|
|
95
|
+
<sentence id="95" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quid2:vero0">
|
|
96
|
+
<word id="1" form="quid" lemma="quis1" postag="p-s---nn-" head="3" relation="SBJ"/>
|
|
97
|
+
<word id="2" form="vero" lemma="verus1" postag="d--------" head="3" relation="AuxY"/>
|
|
98
|
+
<word id="3" insertion_id="0002e" form="[0]" artificial="elliptic" head="0" relation="PRED"/>
|
|
99
|
+
</sentence>
|
|
100
|
+
</treebank>
|
|
101
|
+
EOF
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
let(:tb4) do
|
|
105
|
+
<<EOF
|
|
106
|
+
<?xml version="1.0"?>
|
|
107
|
+
<treebank>
|
|
108
|
+
<sentence id="31" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=2" span="si0:dicat0">
|
|
109
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
|
110
|
+
<word id="2" form="te" lemma="tu1" postag="p-s---ma-" head="7" relation="SBJ"/>
|
|
111
|
+
<word id="3" form="iam" lemma="jam1" postag="d--------" head="7" relation="AuxY"/>
|
|
112
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
|
113
|
+
<word id="5" form="Catilina" lemma="Catilina1" postag="n-s---mv-" head="7" relation="ExD"/>
|
|
114
|
+
<word id="6" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
|
115
|
+
<word id="7" form="comprehendi" lemma="comprehendo1" postag="v--pnp---" head="1" relation="OBJ_ExD0_ADV_CO"/>
|
|
116
|
+
<word id="8" form="," lemma="comma1" postag="u--------" head="16" relation="COORD"/>
|
|
117
|
+
<word id="9" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
|
118
|
+
<word id="10" form="interfici" lemma="interficio1" postag="v--pnp---" head="11" relation="OBJ"/>
|
|
119
|
+
<word id="11" form="iussero" lemma="jubeo1" postag="v1stia---" head="9" relation="ADV_CO"/>
|
|
120
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
|
121
|
+
<word id="13" form="credo" lemma="credo1" postag="v1spia---" head="0" relation="PRED"/>
|
|
122
|
+
<word id="14" form="," lemma="comma1" postag="u--------" head="13" relation="AuxX"/>
|
|
123
|
+
<word id="15" form="erit" lemma="sum1" postag="v3sfia---" head="16" relation="AuxV"/>
|
|
124
|
+
<word id="16" form="verendum" lemma="vereor1" postag="t-spgpna-" head="13" relation="PRED"/>
|
|
125
|
+
<word id="17" form="mihi" lemma="ego1" postag="p-s---md-" head="16" relation="ADV"/>
|
|
126
|
+
<word id="18" form="ne" lemma="ne1" postag="c--------" head="16" relation="AuxC"/>
|
|
127
|
+
<word id="19" form="non" lemma="non1" postag="d--------" head="18" relation="AuxZ_ExD1_ADV"/>
|
|
128
|
+
<word id="20" form="hoc" lemma="hic1" postag="p-s---na-" head="18" relation="SBJ_ExD2_OBJ_ExD1_ADV"/>
|
|
129
|
+
<word id="21" form="potius" lemma="potis1" postag="d--------" head="18" relation="ADV_ExD1_ADV"/>
|
|
130
|
+
<word id="22" form="omnes" lemma="omnis1" postag="a-p---mn-" head="23" relation="ATR"/>
|
|
131
|
+
<word id="23" form="boni" lemma="bonus1" postag="a-p---mn-" head="18" relation="SBJ_ExD1_ADV"/>
|
|
132
|
+
<word id="24" form="serius" lemma="serus1" postag="d--------" head="18" relation="ADV_ExD2_OBJ_ExD1_ADV"/>
|
|
133
|
+
<word id="25" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
|
|
134
|
+
<word id="26" form="me" lemma="ego1" postag="p-s---mb-" head="25" relation="ADV_ExD2_OBJ_ExD1_ADV"/>
|
|
135
|
+
<word id="27" form="quam" lemma="quam1" postag="d--------" head="21" relation="AuxC"/>
|
|
136
|
+
<word id="28" form="quisquam" lemma="quisquam1" postag="p-s---mn-" head="32" relation="SBJ"/>
|
|
137
|
+
<word id="29" form="crudelius" lemma="crudelis1" postag="d--------" head="30" relation="ADV"/>
|
|
138
|
+
<word id="30" form="factum" lemma="facio1" postag="t-srppna-" head="32" relation="OBJ"/>
|
|
139
|
+
<word id="31" form="esse" lemma="sum1" postag="v--pna---" head="30" relation="AuxV"/>
|
|
140
|
+
<word id="32" form="dicat" lemma="dico2" postag="v3spsa---" head="27" relation="ADV"/>
|
|
141
|
+
</sentence>
|
|
142
|
+
</treebank>
|
|
143
|
+
EOF
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
let(:tb4_result) do
|
|
147
|
+
<<EOF
|
|
148
|
+
<?xml version="1.0"?>
|
|
149
|
+
<treebank>
|
|
150
|
+
<sentence id="31" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=2" span="si0:dicat0">
|
|
151
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
|
152
|
+
<word id="2" form="te" lemma="tu1" postag="p-s---ma-" head="7" relation="SBJ"/>
|
|
153
|
+
<word id="3" form="iam" lemma="jam1" postag="d--------" head="7" relation="AuxY"/>
|
|
154
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
|
155
|
+
<word id="5" form="Catilina" lemma="Catilina1" postag="n-s---mv-" head="7" relation="ExD"/>
|
|
156
|
+
<word id="6" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
|
157
|
+
<word id="7" form="comprehendi" lemma="comprehendo1" postag="v--pnp---" head="33" relation="OBJ"/>
|
|
158
|
+
<word id="8" form="," lemma="comma1" postag="u--------" head="16" relation="COORD"/>
|
|
159
|
+
<word id="9" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
|
160
|
+
<word id="10" form="interfici" lemma="interficio1" postag="v--pnp---" head="11" relation="OBJ"/>
|
|
161
|
+
<word id="11" form="iussero" lemma="jubeo1" postag="v1stia---" head="9" relation="ADV_CO"/>
|
|
162
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
|
163
|
+
<word id="13" form="credo" lemma="credo1" postag="v1spia---" head="0" relation="PRED"/>
|
|
164
|
+
<word id="14" form="," lemma="comma1" postag="u--------" head="13" relation="AuxX"/>
|
|
165
|
+
<word id="15" form="erit" lemma="sum1" postag="v3sfia---" head="16" relation="AuxV"/>
|
|
166
|
+
<word id="16" form="verendum" lemma="vereor1" postag="t-spgpna-" head="13" relation="PRED"/>
|
|
167
|
+
<word id="17" form="mihi" lemma="ego1" postag="p-s---md-" head="16" relation="ADV"/>
|
|
168
|
+
<word id="18" form="ne" lemma="ne1" postag="c--------" head="16" relation="AuxC"/>
|
|
169
|
+
<word id="19" form="non" lemma="non1" postag="d--------" head="34" relation="AuxZ"/>
|
|
170
|
+
<word id="20" form="hoc" lemma="hic1" postag="p-s---na-" head="35" relation="SBJ"/>
|
|
171
|
+
<word id="21" form="potius" lemma="potis1" postag="d--------" head="34" relation="ADV"/>
|
|
172
|
+
<word id="22" form="omnes" lemma="omnis1" postag="a-p---mn-" head="23" relation="ATR"/>
|
|
173
|
+
<word id="23" form="boni" lemma="bonus1" postag="a-p---mn-" head="34" relation="SBJ"/>
|
|
174
|
+
<word id="24" form="serius" lemma="serus1" postag="d--------" head="35" relation="ADV"/>
|
|
175
|
+
<word id="25" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
|
|
176
|
+
<word id="26" form="me" lemma="ego1" postag="p-s---mb-" head="35" relation="ADV"/>
|
|
177
|
+
<word id="27" form="quam" lemma="quam1" postag="d--------" head="21" relation="AuxC"/>
|
|
178
|
+
<word id="28" form="quisquam" lemma="quisquam1" postag="p-s---mn-" head="32" relation="SBJ"/>
|
|
179
|
+
<word id="29" form="crudelius" lemma="crudelis1" postag="d--------" head="30" relation="ADV"/>
|
|
180
|
+
<word id="30" form="factum" lemma="facio1" postag="t-srppna-" head="32" relation="OBJ"/>
|
|
181
|
+
<word id="31" form="esse" lemma="sum1" postag="v--pna---" head="30" relation="AuxV"/>
|
|
182
|
+
<word id="32" form="dicat" lemma="dico2" postag="v3spsa---" head="27" relation="ADV"/>
|
|
183
|
+
<word id="33" insertion_id="0032e" form="[0]" artificial="elliptic" head="1" relation="ADV_CO"/>
|
|
184
|
+
<word id="34" insertion_id="0032f" form="[1]" artificial="elliptic" head="18" relation="ADV"/>
|
|
185
|
+
<word id="35" insertion_id="0032g" form="[2]" artificial="elliptic" head="34" relation="OBJ"/>
|
|
186
|
+
</sentence>
|
|
187
|
+
</treebank>
|
|
188
|
+
EOF
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
let(:tb5) do
|
|
192
|
+
<<EOF
|
|
193
|
+
<?xml version="1.0"?>
|
|
194
|
+
<treebank>
|
|
195
|
+
<sentence id="93" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quae1:afuit0">
|
|
196
|
+
<word id="1" form="quae" lemma="qui1" postag="p-s---fn-" head="2" relation="ATR" />
|
|
197
|
+
<word id="2" form="libido" lemma="libido1" postag="n-s---fn-" head="12" relation="SBJ_ExD0_PRED_CO" />
|
|
198
|
+
<word id="3" form="ab" lemma="ab1" postag="r--------" head="12" relation="AuxP" />
|
|
199
|
+
<word id="4" form="oculis" lemma="oculus1" postag="n-p---mb-" head="3" relation="ADV_ExD0_PRED_CO" />
|
|
200
|
+
<word id="5" form="," lemma="comma1" postag="u--------" head="12" relation="AuxX" />
|
|
201
|
+
<word id="6" form="quod" lemma="qui1" postag="p-s---nn-" head="7" relation="ATR" />
|
|
202
|
+
<word id="7" form="facinus" lemma="facinus1" postag="n-s---nn-" head="12" relation="SBJ_ExD1_PRED_CO" />
|
|
203
|
+
<word id="8" form="a" lemma="ab1" postag="r--------" head="12" relation="AuxP_ExD1_PRED_CO" />
|
|
204
|
+
<word id="9" form="manibus" lemma="manus1" postag="n-p---fb-" head="8" relation="ADV" />
|
|
205
|
+
<word id="10" form="umquam" lemma="umquam1" postag="d--------" head="12" relation="ADV_ExD1_PRED_CO" />
|
|
206
|
+
<word id="11" form="tuis" lemma="tuus1" postag="a-p---fb-" head="9" relation="ATR" />
|
|
207
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="0" relation="COORD" />
|
|
208
|
+
<word id="13" form="quod" lemma="qui1" postag="p-s---nn-" head="14" relation="ATR" />
|
|
209
|
+
<word id="14" form="flagitium" lemma="flagitium1" postag="n-s---nn-" head="18" relation="SBJ" />
|
|
210
|
+
<word id="15" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP" />
|
|
211
|
+
<word id="16" form="toto" lemma="totus1" postag="a-s---nb-" head="17" relation="ATR" />
|
|
212
|
+
<word id="17" form="corpore" lemma="corpus1" postag="n-s---nb-" head="15" relation="ADV" />
|
|
213
|
+
<word id="18" form="afuit" lemma="Asum1" postag="v3sria---" head="12" relation="PRED_CO" />
|
|
214
|
+
</sentence>
|
|
215
|
+
<sentence id="94" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="cui0:praetulisti0">
|
|
216
|
+
<word id="1" form="cui" lemma="qui1" postag="p-s---md-" head="3" relation="ATR" />
|
|
217
|
+
<word id="2" form="tu" lemma="tu1" postag="p-s---mn-" head="17" relation="SBJ" />
|
|
218
|
+
<word id="3" form="adulescentulo" lemma="adulescentulus1" postag="n-s---md-" head="17" relation="OBJ" />
|
|
219
|
+
<word id="4" form="quem" lemma="qui1" postag="p-s---ma-" head="7" relation="OBJ" />
|
|
220
|
+
<word id="5" form="corruptelarum" lemma="corruptela1" postag="n-p---fg-" head="6" relation="ATR" />
|
|
221
|
+
<word id="6" form="inlecebris" lemma="illecebra1" postag="n-p---fb-" head="7" relation="ADV" />
|
|
222
|
+
<word id="7" form="inretisses" lemma="irretio" postag="v2slsa---" head="3" relation="ATR" />
|
|
223
|
+
<word id="8" form="non" lemma="non1" postag="d--------" head="13" relation="AuxZ" />
|
|
224
|
+
<word id="9" form="aut" lemma="aut1" postag="c--------" head="13" relation="AuxY" />
|
|
225
|
+
<word id="10" form="ad" lemma="ad1" postag="r--------" head="13" relation="AuxP_ExD0_PRED_CO" />
|
|
226
|
+
<word id="11" form="audaciam" lemma="audacia1" postag="n-s---fa-" head="10" relation="ADV" />
|
|
227
|
+
<word id="12" form="ferrum" lemma="ferrum1" postag="n-s---na-" head="13" relation="OBJ_ExD0_PRED_CO" />
|
|
228
|
+
<word id="13" form="aut" lemma="aut1" postag="c--------" head="17" relation="COORD" />
|
|
229
|
+
<word id="14" form="ad" lemma="ad1" postag="r--------" head="17" relation="AuxP" />
|
|
230
|
+
<word id="15" form="libidinem" lemma="libido1" postag="n-s---fa-" head="14" relation="ADV" />
|
|
231
|
+
<word id="16" form="facem" lemma="fax1" postag="n-s---fa-" head="13" relation="OBJ" />
|
|
232
|
+
<word id="17" form="praetulisti" lemma="praefero1" postag="v2sria---" head="0" relation="PRED_CO" />
|
|
233
|
+
</sentence>
|
|
234
|
+
</treebank>
|
|
235
|
+
EOF
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
let(:tb5_result) do
|
|
239
|
+
<<EOF
|
|
240
|
+
<?xml version="1.0"?>
|
|
241
|
+
<treebank>
|
|
242
|
+
<sentence id="93" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quae1:afuit0">
|
|
243
|
+
<word id="1" form="quae" lemma="qui1" postag="p-s---fn-" head="2" relation="ATR"/>
|
|
244
|
+
<word id="2" form="libido" lemma="libido1" postag="n-s---fn-" head="19" relation="SBJ"/>
|
|
245
|
+
<word id="3" form="ab" lemma="ab1" postag="r--------" head="12" relation="AuxP"/>
|
|
246
|
+
<word id="4" form="oculis" lemma="oculus1" postag="n-p---mb-" head="19" relation="ADV"/>
|
|
247
|
+
<word id="5" form="," lemma="comma1" postag="u--------" head="12" relation="AuxX"/>
|
|
248
|
+
<word id="6" form="quod" lemma="qui1" postag="p-s---nn-" head="7" relation="ATR"/>
|
|
249
|
+
<word id="7" form="facinus" lemma="facinus1" postag="n-s---nn-" head="20" relation="SBJ"/>
|
|
250
|
+
<word id="8" form="a" lemma="ab1" postag="r--------" head="20" relation="AuxP"/>
|
|
251
|
+
<word id="9" form="manibus" lemma="manus1" postag="n-p---fb-" head="8" relation="ADV"/>
|
|
252
|
+
<word id="10" form="umquam" lemma="umquam1" postag="d--------" head="20" relation="ADV"/>
|
|
253
|
+
<word id="11" form="tuis" lemma="tuus1" postag="a-p---fb-" head="9" relation="ATR"/>
|
|
254
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
|
|
255
|
+
<word id="13" form="quod" lemma="qui1" postag="p-s---nn-" head="14" relation="ATR"/>
|
|
256
|
+
<word id="14" form="flagitium" lemma="flagitium1" postag="n-s---nn-" head="18" relation="SBJ"/>
|
|
257
|
+
<word id="15" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
|
|
258
|
+
<word id="16" form="toto" lemma="totus1" postag="a-s---nb-" head="17" relation="ATR"/>
|
|
259
|
+
<word id="17" form="corpore" lemma="corpus1" postag="n-s---nb-" head="15" relation="ADV"/>
|
|
260
|
+
<word id="18" form="afuit" lemma="Asum1" postag="v3sria---" head="12" relation="PRED_CO"/>
|
|
261
|
+
<word id="19" insertion_id="0018e" form="[0]" artificial="elliptic" head="12" relation="PRED_CO"/>
|
|
262
|
+
<word id="20" insertion_id="0018f" form="[1]" artificial="elliptic" head="12" relation="PRED_CO"/>
|
|
263
|
+
</sentence>
|
|
264
|
+
<sentence id="94" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="cui0:praetulisti0">
|
|
265
|
+
<word id="1" form="cui" lemma="qui1" postag="p-s---md-" head="3" relation="ATR"/>
|
|
266
|
+
<word id="2" form="tu" lemma="tu1" postag="p-s---mn-" head="17" relation="SBJ"/>
|
|
267
|
+
<word id="3" form="adulescentulo" lemma="adulescentulus1" postag="n-s---md-" head="17" relation="OBJ"/>
|
|
268
|
+
<word id="4" form="quem" lemma="qui1" postag="p-s---ma-" head="7" relation="OBJ"/>
|
|
269
|
+
<word id="5" form="corruptelarum" lemma="corruptela1" postag="n-p---fg-" head="6" relation="ATR"/>
|
|
270
|
+
<word id="6" form="inlecebris" lemma="illecebra1" postag="n-p---fb-" head="7" relation="ADV"/>
|
|
271
|
+
<word id="7" form="inretisses" lemma="irretio" postag="v2slsa---" head="3" relation="ATR"/>
|
|
272
|
+
<word id="8" form="non" lemma="non1" postag="d--------" head="13" relation="AuxZ"/>
|
|
273
|
+
<word id="9" form="aut" lemma="aut1" postag="c--------" head="13" relation="AuxY"/>
|
|
274
|
+
<word id="10" form="ad" lemma="ad1" postag="r--------" head="18" relation="AuxP"/>
|
|
275
|
+
<word id="11" form="audaciam" lemma="audacia1" postag="n-s---fa-" head="10" relation="ADV"/>
|
|
276
|
+
<word id="12" form="ferrum" lemma="ferrum1" postag="n-s---na-" head="18" relation="OBJ"/>
|
|
277
|
+
<word id="13" form="aut" lemma="aut1" postag="c--------" head="17" relation="COORD"/>
|
|
278
|
+
<word id="14" form="ad" lemma="ad1" postag="r--------" head="17" relation="AuxP"/>
|
|
279
|
+
<word id="15" form="libidinem" lemma="libido1" postag="n-s---fa-" head="14" relation="ADV"/>
|
|
280
|
+
<word id="16" form="facem" lemma="fax1" postag="n-s---fa-" head="13" relation="OBJ"/>
|
|
281
|
+
<word id="17" form="praetulisti" lemma="praefero1" postag="v2sria---" head="0" relation="PRED_CO"/>
|
|
282
|
+
<word id="18" insertion_id="0017e" form="[0]" artificial="elliptic" head="13" relation="PRED_CO"/>
|
|
283
|
+
</sentence>
|
|
284
|
+
</treebank>
|
|
285
|
+
EOF
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
# Every example feeds one XML fixture through Transform#transform and compares
# the serialized output with the matching expectation fixture.
describe "#transform" do
  it "returns the document when there is nothing to transform" do
    expect(Treebank::Transform.new(tb1).transform).to eq tb1
  end

  context "with a single simple ellipsis" do
    it "inserts a new elliptic node and updates the head" do
      expect(Treebank::Transform.new(tb2).transform).to eq tb2_result
    end
  end

  context "when multiple token are children of the same ellipsis" do
    it "inserts a new elliptic node and updates the head" do
      expect(Treebank::Transform.new(tb3).transform).to eq tb3_result
    end
  end

  context "when ellipses are chained" do
    it "does as it should" do
      expect(Treebank::Transform.new(tb4).transform).to eq tb4_result
    end
  end

  context "with multiple sentences in a document" do
    it "does as it should" do
      expect(Treebank::Transform.new(tb5).transform).to eq tb5_result
    end
  end
end
|
|
327
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# coding: utf-8
# Gem specification for treebank-transform. The version constant is loaded
# from lib/, which is therefore put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'treebank/transform/version'

Gem::Specification.new do |gem|
  gem.name          = "treebank-transform"
  gem.version       = Treebank::Transform::VERSION
  gem.authors       = ["LFDM"]
  gem.email         = ["1986gh@gmail.com"]
  gem.summary       = 'Transforms Perseus Treebank files'
  gem.description   = gem.summary
  gem.homepage      = ""
  gem.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # selected from that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.6"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "simplecov", "~> 0.7"

  gem.add_dependency "thor"
  gem.add_dependency "nokogiri"
end
|