treebank-transform 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +46 -0
- data/Rakefile +7 -0
- data/bin/treebank-transform +7 -0
- data/examples/cicero_catilina_sample.xml +6889 -0
- data/lib/treebank/alphabet.rb +11 -0
- data/lib/treebank/elliptic_word.rb +42 -0
- data/lib/treebank/sentence.rb +69 -0
- data/lib/treebank/transform/cli.rb +15 -0
- data/lib/treebank/transform/version.rb +5 -0
- data/lib/treebank/transform.rb +36 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/treebank/transform_spec.rb +327 -0
- data/treebank-transform.gemspec +28 -0
- metadata +149 -0
@@ -0,0 +1,42 @@
|
|
1
|
+
module Treebank
  # Wraps one <word> node whose 'relation' label encodes an ellipsis
  # (e.g. "PNOM_ExD0_ADV") and rewrites it so that the word points at an
  # explicit elliptic node of its sentence instead.
  class EllipticWord
    # Splits an elliptic relation label into three captures:
    #   1. the word's own label                 ("PNOM")
    #   2. the digits that name the ellipsis    ("0")
    #   3. the label given to the elliptic node ("ADV"); it may itself contain
    #      further "_ExD<n>_" segments when ellipses are chained.
    ELLIPTIC_RELATION = /(\w+?)_ExD(\d+)_(.+)/

    # word_node - object answering #[] and #[]= for the 'relation' and 'head'
    #             attributes.
    # sentence  - object answering #elliptic_nodes (a lookup from ellipsis
    #             name to node id) and #add_ellipsis(attrs, string).
    def initialize(word_node, sentence)
      @node = word_node
      @sentence = sentence
    end

    # Rewrites the node's 'relation' and 'head' when the relation is elliptic.
    #
    # The relation is reduced to the word's own label and the head is set to
    # the id of the elliptic node, which is created on first use and looked up
    # in the sentence afterwards.
    #
    # Returns the new head value, or nil when the relation is missing or does
    # not describe an ellipsis (the node is left untouched in that case).
    def parse_elliptic_head
      # #to_s guards against words that carry no 'relation' attribute at all.
      match = regexp.match(@node['relation'].to_s)
      return unless match

      label, elliptic_string, elliptic_label = match.captures

      head = @sentence.elliptic_nodes[elliptic_string] ||
             create_new_node(@node['head'], elliptic_label, elliptic_string)['id']

      @node['relation'] = label
      @node['head'] = head
    end

    private

    # Asks the sentence for a new elliptic node that takes over the original
    # head and the elliptic label, then resolves that node in turn, because its
    # label may itself reference another ellipsis.
    #
    # Returns the node handed back by the sentence.
    def create_new_node(head, label, string)
      attrs = { artificial: 'elliptic', head: head, relation: label }
      new_node = @sentence.add_ellipsis(attrs, string)

      EllipticWord.new(new_node, @sentence).parse_elliptic_head

      new_node
    end

    # Pattern used to recognise and split elliptic relation labels.
    def regexp
      ELLIPTIC_RELATION
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Treebank
  # Wraps a <sentence> node and appends artificial <word> nodes that stand in
  # for ellipses, keeping track of which ellipsis already has a node.
  class Sentence
    require "treebank/alphabet"

    # Hash mapping the name of an ellipsis (the string passed to
    # #add_ellipsis) to the id assigned to its node.
    attr_reader :elliptic_nodes

    # sentence_node - the sentence's XML node; it must answer #xpath and
    #                 #add_child.
    def initialize(sentence_node)
      @node = sentence_node
      # Ids for new nodes continue after the id of the last existing word.
      @next_id = last_id
      @elliptic_nodes = {}
    end

    # Appends a new <word> node for the ellipsis named +string+.
    #
    # attrs  - Hash of extra attributes; merged over the generated id,
    #          insertion_id and form, so it wins on key clashes.
    # string - name of the ellipsis; used as the form ("[string]") and as the
    #          key in #elliptic_nodes.
    #
    # Returns the newly created node.
    def add_ellipsis(attrs, string)
      id = next_id
      generated = {
        id: id,
        insertion_id: get_insertion_id,
        form: "[#{string}]"
      }

      new_node = new_word(generated.merge(attrs))
      # Registered before the caller inspects the node any further, so later
      # lookups for the same ellipsis find it.
      @elliptic_nodes[string] = id

      [indent, new_node, new_line].each { |child| @node.add_child(child) }
      new_node
    end

    private

    # Returns the next unused word id.
    def next_id
      update_last_id
    end

    # Id of the last <word> present when the sentence was wrapped (memoized).
    # Falls back to 0 for a sentence without words, or when the last word has
    # no id, instead of raising.
    def last_id
      @last_id ||= begin
        last_word = @node.xpath('word').last
        last_word ? last_word['id'].to_i : 0
      end
    end

    # Advances the id counter and returns the new value.
    def update_last_id
      @next_id += 1
    end

    # Letter appended to the insertion id: 'e' on first use, afterwards
    # whatever Alphabet.next_letter yields for the previous one.
    def suffix
      @suffix = @suffix ? Alphabet.next_letter(@suffix) : 'e'
    end

    # Builds the insertion id: the last original word id, zero-padded to four
    # digits, followed by the current suffix letter (e.g. "0015e").
    def get_insertion_id
      "#{last_id.to_s.rjust(4, '0')}#{suffix}"
    end

    # Creates a detached <word> node carrying the given attributes.
    def new_word(attrs)
      word = Nokogiri::XML::Node.new('word', @node)
      attrs.each { |k, v| word[k] = v }
      word
    end

    # Text node placed before an inserted word.
    # NOTE(review): whitespace literal kept exactly as found; confirm it
    # matches the indentation used by the treebank documents.
    def indent
      Nokogiri::XML::Text.new(" ", @node)
    end

    # Text node placed after an inserted word.
    # NOTE(review): whitespace literal kept exactly as found; confirm it
    # matches the indentation used by the treebank documents.
    def new_line
      Nokogiri::XML::Text.new("\n ", @node)
    end
  end
end
|
69
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'treebank/transform'
|
2
|
+
require 'thor'
|
3
|
+
|
4
|
+
module Treebank
  class Transform
    # Thor-based command line front end for Treebank::Transform.
    class CLI < Thor
      desc 'do FILE', 'transforms 1.5 Treebanks to the interim Arethusa format'
      # Reads FILE, runs the transformation and prints the resulting XML to
      # standard output.
      def do(file)
        xml = File.read(file)
        puts Transform.new(xml).transform
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require "treebank/transform/version"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
module Treebank
  require "treebank/sentence"
  require "treebank/elliptic_word"

  # Parses a treebank XML document and makes its ellipses explicit: words
  # whose relation label contains an "ExD<n>" marker are re-attached to
  # artificial elliptic nodes.
  class Transform
    # doc - the treebank XML, in any form accepted by Nokogiri::XML.
    def initialize(doc)
      @doc = Nokogiri::XML(doc)
    end

    # Rewrites all elliptic words in place.
    #
    # Returns the whole document serialized as XML.
    def transform
      transform_elliptic_nodes
      @doc.to_xml(indent: 2)
    end

    private

    # Visits every word of every sentence and lets EllipticWord rewrite those
    # whose relation carries an ellipsis marker. The word list is taken before
    # any node is added, so the loop only sees the original words.
    def transform_elliptic_nodes
      @doc.xpath('//treebank/sentence').each do |sentence_node|
        sentence = Sentence.new(sentence_node)
        sentence_node.xpath('word').each do |word_node|
          next unless has_elliptic_head(word_node['relation'])

          EllipticWord.new(word_node, sentence).parse_elliptic_head
        end
      end
    end

    # True when +label+ contains "ExD" followed by at least one digit.
    # A missing relation attribute (nil) counts as not elliptic instead of
    # raising NoMethodError.
    def has_elliptic_head(label)
      !(label.to_s =~ /ExD\d+/).nil?
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Coverage tooling has to be loaded and started before the library itself.
require 'simplecov'
require 'coveralls'

Coveralls.wear!

# Report coverage both as local HTML and to Coveralls.
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
  SimpleCov::Formatter::HTMLFormatter,
  Coveralls::SimpleCov::Formatter
]

# The specs themselves are excluded from the coverage figures.
SimpleCov.start { add_filter '/spec/' }

# Make lib/ loadable without installing the gem.
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
require 'treebank/transform'

RSpec.configure do |rspec|
  # Examples tagged :focus run alone; with no such tag, everything runs.
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus
end
|
@@ -0,0 +1,327 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Treebank::Transform do
|
4
|
+
# The gem must expose a VERSION constant.
it 'has a version number' do
  expect(Treebank::Transform::VERSION).not_to be_nil
end
|
7
|
+
|
8
|
+
# Examples taken from http://nlp.perseus.tufts.edu/syntax/treebank/ldt/1.5/data/1999.02.0010.xml
|
9
|
+
|
10
|
+
let(:tb1) do
|
11
|
+
<<EOF
|
12
|
+
<?xml version="1.0"?>
|
13
|
+
<treebank>
|
14
|
+
<sentence id="2" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=1" span="quam0:eludet0">
|
15
|
+
<word id="1" form="quam" lemma="quam1" postag="d--------" head="2" relation="ADV"/>
|
16
|
+
<word id="2" form="diu" lemma="diu1" postag="d--------" head="8" relation="ADV"/>
|
17
|
+
<word id="3" form="etiam" lemma="etiam1" postag="c--------" head="8" relation="AuxY"/>
|
18
|
+
<word id="4" form="furor" lemma="furor2" postag="n-s---mn-" head="8" relation="SBJ"/>
|
19
|
+
<word id="5" form="iste" lemma="iste1" postag="p-s---mn-" head="4" relation="ATR"/>
|
20
|
+
<word id="6" form="tuus" lemma="tuus1" postag="a-s---mn-" head="4" relation="ATR"/>
|
21
|
+
<word id="7" form="nos" lemma="nos1" postag="p-p---ma-" head="8" relation="OBJ"/>
|
22
|
+
<word id="8" form="eludet" lemma="eludo1" postag="v3sfia---" head="0" relation="PRED"/>
|
23
|
+
</sentence>
|
24
|
+
</treebank>
|
25
|
+
EOF
|
26
|
+
end
|
27
|
+
|
28
|
+
let(:tb2) do
|
29
|
+
<<EOF
|
30
|
+
<?xml version="1.0"?>
|
31
|
+
<treebank>
|
32
|
+
<sentence id="126" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=7" span="si4:desinam0">
|
33
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="6" relation="AuxC"/>
|
34
|
+
<word id="2" form="est" lemma="sum1" postag="v3spia---" head="1" relation="ADV"/>
|
35
|
+
<word id="3" form="verus" lemma="verus1" postag="a-s---mn-" head="2" relation="PNOM"/>
|
36
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="1" relation="AuxX"/>
|
37
|
+
<word id="5" form="ne" lemma="ne1" postag="c--------" head="7" relation="AuxC"/>
|
38
|
+
<word id="6" form="opprimar" lemma="opprimo1" postag="v1spsp---" head="5" relation="ExD_CO"/>
|
39
|
+
<word id="7" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
|
40
|
+
<word id="8" form="sin" lemma="si1" postag="c--------" head="15" relation="AuxC"/>
|
41
|
+
<word id="9" form="falsus" lemma="falsus1" postag="a-s---mn-" head="8" relation="PNOM_ExD0_ADV"/>
|
42
|
+
<word id="10" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
43
|
+
<word id="11" form="ut" lemma="ut1" postag="c--------" head="7" relation="AuxC"/>
|
44
|
+
<word id="12" form="tandem" lemma="tandem1" postag="d--------" head="15" relation="AuxY"/>
|
45
|
+
<word id="13" form="aliquando" lemma="aliquando1" postag="d--------" head="15" relation="ADV"/>
|
46
|
+
<word id="14" form="timere" lemma="timeo1" postag="v--pna---" head="15" relation="OBJ"/>
|
47
|
+
<word id="15" form="desinam" lemma="desino1" postag="v1spsa---" head="11" relation="ExD_CO"/>
|
48
|
+
</sentence>
|
49
|
+
</treebank>
|
50
|
+
EOF
|
51
|
+
end
|
52
|
+
|
53
|
+
let(:tb2_result) do
|
54
|
+
<<EOF
|
55
|
+
<?xml version="1.0"?>
|
56
|
+
<treebank>
|
57
|
+
<sentence id="126" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=7" span="si4:desinam0">
|
58
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="6" relation="AuxC"/>
|
59
|
+
<word id="2" form="est" lemma="sum1" postag="v3spia---" head="1" relation="ADV"/>
|
60
|
+
<word id="3" form="verus" lemma="verus1" postag="a-s---mn-" head="2" relation="PNOM"/>
|
61
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="1" relation="AuxX"/>
|
62
|
+
<word id="5" form="ne" lemma="ne1" postag="c--------" head="7" relation="AuxC"/>
|
63
|
+
<word id="6" form="opprimar" lemma="opprimo1" postag="v1spsp---" head="5" relation="ExD_CO"/>
|
64
|
+
<word id="7" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
|
65
|
+
<word id="8" form="sin" lemma="si1" postag="c--------" head="15" relation="AuxC"/>
|
66
|
+
<word id="9" form="falsus" lemma="falsus1" postag="a-s---mn-" head="16" relation="PNOM"/>
|
67
|
+
<word id="10" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
68
|
+
<word id="11" form="ut" lemma="ut1" postag="c--------" head="7" relation="AuxC"/>
|
69
|
+
<word id="12" form="tandem" lemma="tandem1" postag="d--------" head="15" relation="AuxY"/>
|
70
|
+
<word id="13" form="aliquando" lemma="aliquando1" postag="d--------" head="15" relation="ADV"/>
|
71
|
+
<word id="14" form="timere" lemma="timeo1" postag="v--pna---" head="15" relation="OBJ"/>
|
72
|
+
<word id="15" form="desinam" lemma="desino1" postag="v1spsa---" head="11" relation="ExD_CO"/>
|
73
|
+
<word id="16" insertion_id="0015e" form="[0]" artificial="elliptic" head="8" relation="ADV"/>
|
74
|
+
</sentence>
|
75
|
+
</treebank>
|
76
|
+
EOF
|
77
|
+
end
|
78
|
+
|
79
|
+
let(:tb3) do
|
80
|
+
<<EOF
|
81
|
+
<?xml version="1.0"?>
|
82
|
+
<treebank>
|
83
|
+
<sentence id="95" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quid2:vero0">
|
84
|
+
<word id="1" form="quid" lemma="quis1" postag="p-s---nn-" head="0" relation="SBJ_ExD0_PRED"/>
|
85
|
+
<word id="2" form="vero" lemma="verus1" postag="d--------" head="0" relation="AuxY_ExD0_PRED"/>
|
86
|
+
</sentence>
|
87
|
+
</treebank>
|
88
|
+
EOF
|
89
|
+
end
|
90
|
+
|
91
|
+
let(:tb3_result) do
|
92
|
+
<<EOF
|
93
|
+
<?xml version="1.0"?>
|
94
|
+
<treebank>
|
95
|
+
<sentence id="95" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quid2:vero0">
|
96
|
+
<word id="1" form="quid" lemma="quis1" postag="p-s---nn-" head="3" relation="SBJ"/>
|
97
|
+
<word id="2" form="vero" lemma="verus1" postag="d--------" head="3" relation="AuxY"/>
|
98
|
+
<word id="3" insertion_id="0002e" form="[0]" artificial="elliptic" head="0" relation="PRED"/>
|
99
|
+
</sentence>
|
100
|
+
</treebank>
|
101
|
+
EOF
|
102
|
+
end
|
103
|
+
|
104
|
+
let(:tb4) do
|
105
|
+
<<EOF
|
106
|
+
<?xml version="1.0"?>
|
107
|
+
<treebank>
|
108
|
+
<sentence id="31" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=2" span="si0:dicat0">
|
109
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
110
|
+
<word id="2" form="te" lemma="tu1" postag="p-s---ma-" head="7" relation="SBJ"/>
|
111
|
+
<word id="3" form="iam" lemma="jam1" postag="d--------" head="7" relation="AuxY"/>
|
112
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
113
|
+
<word id="5" form="Catilina" lemma="Catilina1" postag="n-s---mv-" head="7" relation="ExD"/>
|
114
|
+
<word id="6" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
115
|
+
<word id="7" form="comprehendi" lemma="comprehendo1" postag="v--pnp---" head="1" relation="OBJ_ExD0_ADV_CO"/>
|
116
|
+
<word id="8" form="," lemma="comma1" postag="u--------" head="16" relation="COORD"/>
|
117
|
+
<word id="9" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
118
|
+
<word id="10" form="interfici" lemma="interficio1" postag="v--pnp---" head="11" relation="OBJ"/>
|
119
|
+
<word id="11" form="iussero" lemma="jubeo1" postag="v1stia---" head="9" relation="ADV_CO"/>
|
120
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
121
|
+
<word id="13" form="credo" lemma="credo1" postag="v1spia---" head="0" relation="PRED"/>
|
122
|
+
<word id="14" form="," lemma="comma1" postag="u--------" head="13" relation="AuxX"/>
|
123
|
+
<word id="15" form="erit" lemma="sum1" postag="v3sfia---" head="16" relation="AuxV"/>
|
124
|
+
<word id="16" form="verendum" lemma="vereor1" postag="t-spgpna-" head="13" relation="PRED"/>
|
125
|
+
<word id="17" form="mihi" lemma="ego1" postag="p-s---md-" head="16" relation="ADV"/>
|
126
|
+
<word id="18" form="ne" lemma="ne1" postag="c--------" head="16" relation="AuxC"/>
|
127
|
+
<word id="19" form="non" lemma="non1" postag="d--------" head="18" relation="AuxZ_ExD1_ADV"/>
|
128
|
+
<word id="20" form="hoc" lemma="hic1" postag="p-s---na-" head="18" relation="SBJ_ExD2_OBJ_ExD1_ADV"/>
|
129
|
+
<word id="21" form="potius" lemma="potis1" postag="d--------" head="18" relation="ADV_ExD1_ADV"/>
|
130
|
+
<word id="22" form="omnes" lemma="omnis1" postag="a-p---mn-" head="23" relation="ATR"/>
|
131
|
+
<word id="23" form="boni" lemma="bonus1" postag="a-p---mn-" head="18" relation="SBJ_ExD1_ADV"/>
|
132
|
+
<word id="24" form="serius" lemma="serus1" postag="d--------" head="18" relation="ADV_ExD2_OBJ_ExD1_ADV"/>
|
133
|
+
<word id="25" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
|
134
|
+
<word id="26" form="me" lemma="ego1" postag="p-s---mb-" head="25" relation="ADV_ExD2_OBJ_ExD1_ADV"/>
|
135
|
+
<word id="27" form="quam" lemma="quam1" postag="d--------" head="21" relation="AuxC"/>
|
136
|
+
<word id="28" form="quisquam" lemma="quisquam1" postag="p-s---mn-" head="32" relation="SBJ"/>
|
137
|
+
<word id="29" form="crudelius" lemma="crudelis1" postag="d--------" head="30" relation="ADV"/>
|
138
|
+
<word id="30" form="factum" lemma="facio1" postag="t-srppna-" head="32" relation="OBJ"/>
|
139
|
+
<word id="31" form="esse" lemma="sum1" postag="v--pna---" head="30" relation="AuxV"/>
|
140
|
+
<word id="32" form="dicat" lemma="dico2" postag="v3spsa---" head="27" relation="ADV"/>
|
141
|
+
</sentence>
|
142
|
+
</treebank>
|
143
|
+
EOF
|
144
|
+
end
|
145
|
+
|
146
|
+
let(:tb4_result) do
|
147
|
+
<<EOF
|
148
|
+
<?xml version="1.0"?>
|
149
|
+
<treebank>
|
150
|
+
<sentence id="31" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=2" span="si0:dicat0">
|
151
|
+
<word id="1" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
152
|
+
<word id="2" form="te" lemma="tu1" postag="p-s---ma-" head="7" relation="SBJ"/>
|
153
|
+
<word id="3" form="iam" lemma="jam1" postag="d--------" head="7" relation="AuxY"/>
|
154
|
+
<word id="4" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
155
|
+
<word id="5" form="Catilina" lemma="Catilina1" postag="n-s---mv-" head="7" relation="ExD"/>
|
156
|
+
<word id="6" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
|
157
|
+
<word id="7" form="comprehendi" lemma="comprehendo1" postag="v--pnp---" head="33" relation="OBJ"/>
|
158
|
+
<word id="8" form="," lemma="comma1" postag="u--------" head="16" relation="COORD"/>
|
159
|
+
<word id="9" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
|
160
|
+
<word id="10" form="interfici" lemma="interficio1" postag="v--pnp---" head="11" relation="OBJ"/>
|
161
|
+
<word id="11" form="iussero" lemma="jubeo1" postag="v1stia---" head="9" relation="ADV_CO"/>
|
162
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
|
163
|
+
<word id="13" form="credo" lemma="credo1" postag="v1spia---" head="0" relation="PRED"/>
|
164
|
+
<word id="14" form="," lemma="comma1" postag="u--------" head="13" relation="AuxX"/>
|
165
|
+
<word id="15" form="erit" lemma="sum1" postag="v3sfia---" head="16" relation="AuxV"/>
|
166
|
+
<word id="16" form="verendum" lemma="vereor1" postag="t-spgpna-" head="13" relation="PRED"/>
|
167
|
+
<word id="17" form="mihi" lemma="ego1" postag="p-s---md-" head="16" relation="ADV"/>
|
168
|
+
<word id="18" form="ne" lemma="ne1" postag="c--------" head="16" relation="AuxC"/>
|
169
|
+
<word id="19" form="non" lemma="non1" postag="d--------" head="34" relation="AuxZ"/>
|
170
|
+
<word id="20" form="hoc" lemma="hic1" postag="p-s---na-" head="35" relation="SBJ"/>
|
171
|
+
<word id="21" form="potius" lemma="potis1" postag="d--------" head="34" relation="ADV"/>
|
172
|
+
<word id="22" form="omnes" lemma="omnis1" postag="a-p---mn-" head="23" relation="ATR"/>
|
173
|
+
<word id="23" form="boni" lemma="bonus1" postag="a-p---mn-" head="34" relation="SBJ"/>
|
174
|
+
<word id="24" form="serius" lemma="serus1" postag="d--------" head="35" relation="ADV"/>
|
175
|
+
<word id="25" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
|
176
|
+
<word id="26" form="me" lemma="ego1" postag="p-s---mb-" head="35" relation="ADV"/>
|
177
|
+
<word id="27" form="quam" lemma="quam1" postag="d--------" head="21" relation="AuxC"/>
|
178
|
+
<word id="28" form="quisquam" lemma="quisquam1" postag="p-s---mn-" head="32" relation="SBJ"/>
|
179
|
+
<word id="29" form="crudelius" lemma="crudelis1" postag="d--------" head="30" relation="ADV"/>
|
180
|
+
<word id="30" form="factum" lemma="facio1" postag="t-srppna-" head="32" relation="OBJ"/>
|
181
|
+
<word id="31" form="esse" lemma="sum1" postag="v--pna---" head="30" relation="AuxV"/>
|
182
|
+
<word id="32" form="dicat" lemma="dico2" postag="v3spsa---" head="27" relation="ADV"/>
|
183
|
+
<word id="33" insertion_id="0032e" form="[0]" artificial="elliptic" head="1" relation="ADV_CO"/>
|
184
|
+
<word id="34" insertion_id="0032f" form="[1]" artificial="elliptic" head="18" relation="ADV"/>
|
185
|
+
<word id="35" insertion_id="0032g" form="[2]" artificial="elliptic" head="34" relation="OBJ"/>
|
186
|
+
</sentence>
|
187
|
+
</treebank>
|
188
|
+
EOF
|
189
|
+
end
|
190
|
+
|
191
|
+
let(:tb5) do
|
192
|
+
<<EOF
|
193
|
+
<?xml version="1.0"?>
|
194
|
+
<treebank>
|
195
|
+
<sentence id="93" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quae1:afuit0">
|
196
|
+
<word id="1" form="quae" lemma="qui1" postag="p-s---fn-" head="2" relation="ATR" />
|
197
|
+
<word id="2" form="libido" lemma="libido1" postag="n-s---fn-" head="12" relation="SBJ_ExD0_PRED_CO" />
|
198
|
+
<word id="3" form="ab" lemma="ab1" postag="r--------" head="12" relation="AuxP" />
|
199
|
+
<word id="4" form="oculis" lemma="oculus1" postag="n-p---mb-" head="3" relation="ADV_ExD0_PRED_CO" />
|
200
|
+
<word id="5" form="," lemma="comma1" postag="u--------" head="12" relation="AuxX" />
|
201
|
+
<word id="6" form="quod" lemma="qui1" postag="p-s---nn-" head="7" relation="ATR" />
|
202
|
+
<word id="7" form="facinus" lemma="facinus1" postag="n-s---nn-" head="12" relation="SBJ_ExD1_PRED_CO" />
|
203
|
+
<word id="8" form="a" lemma="ab1" postag="r--------" head="12" relation="AuxP_ExD1_PRED_CO" />
|
204
|
+
<word id="9" form="manibus" lemma="manus1" postag="n-p---fb-" head="8" relation="ADV" />
|
205
|
+
<word id="10" form="umquam" lemma="umquam1" postag="d--------" head="12" relation="ADV_ExD1_PRED_CO" />
|
206
|
+
<word id="11" form="tuis" lemma="tuus1" postag="a-p---fb-" head="9" relation="ATR" />
|
207
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="0" relation="COORD" />
|
208
|
+
<word id="13" form="quod" lemma="qui1" postag="p-s---nn-" head="14" relation="ATR" />
|
209
|
+
<word id="14" form="flagitium" lemma="flagitium1" postag="n-s---nn-" head="18" relation="SBJ" />
|
210
|
+
<word id="15" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP" />
|
211
|
+
<word id="16" form="toto" lemma="totus1" postag="a-s---nb-" head="17" relation="ATR" />
|
212
|
+
<word id="17" form="corpore" lemma="corpus1" postag="n-s---nb-" head="15" relation="ADV" />
|
213
|
+
<word id="18" form="afuit" lemma="Asum1" postag="v3sria---" head="12" relation="PRED_CO" />
|
214
|
+
</sentence>
|
215
|
+
<sentence id="94" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="cui0:praetulisti0">
|
216
|
+
<word id="1" form="cui" lemma="qui1" postag="p-s---md-" head="3" relation="ATR" />
|
217
|
+
<word id="2" form="tu" lemma="tu1" postag="p-s---mn-" head="17" relation="SBJ" />
|
218
|
+
<word id="3" form="adulescentulo" lemma="adulescentulus1" postag="n-s---md-" head="17" relation="OBJ" />
|
219
|
+
<word id="4" form="quem" lemma="qui1" postag="p-s---ma-" head="7" relation="OBJ" />
|
220
|
+
<word id="5" form="corruptelarum" lemma="corruptela1" postag="n-p---fg-" head="6" relation="ATR" />
|
221
|
+
<word id="6" form="inlecebris" lemma="illecebra1" postag="n-p---fb-" head="7" relation="ADV" />
|
222
|
+
<word id="7" form="inretisses" lemma="irretio" postag="v2slsa---" head="3" relation="ATR" />
|
223
|
+
<word id="8" form="non" lemma="non1" postag="d--------" head="13" relation="AuxZ" />
|
224
|
+
<word id="9" form="aut" lemma="aut1" postag="c--------" head="13" relation="AuxY" />
|
225
|
+
<word id="10" form="ad" lemma="ad1" postag="r--------" head="13" relation="AuxP_ExD0_PRED_CO" />
|
226
|
+
<word id="11" form="audaciam" lemma="audacia1" postag="n-s---fa-" head="10" relation="ADV" />
|
227
|
+
<word id="12" form="ferrum" lemma="ferrum1" postag="n-s---na-" head="13" relation="OBJ_ExD0_PRED_CO" />
|
228
|
+
<word id="13" form="aut" lemma="aut1" postag="c--------" head="17" relation="COORD" />
|
229
|
+
<word id="14" form="ad" lemma="ad1" postag="r--------" head="17" relation="AuxP" />
|
230
|
+
<word id="15" form="libidinem" lemma="libido1" postag="n-s---fa-" head="14" relation="ADV" />
|
231
|
+
<word id="16" form="facem" lemma="fax1" postag="n-s---fa-" head="13" relation="OBJ" />
|
232
|
+
<word id="17" form="praetulisti" lemma="praefero1" postag="v2sria---" head="0" relation="PRED_CO" />
|
233
|
+
</sentence>
|
234
|
+
</treebank>
|
235
|
+
EOF
|
236
|
+
end
|
237
|
+
|
238
|
+
let(:tb5_result) do
|
239
|
+
<<EOF
|
240
|
+
<?xml version="1.0"?>
|
241
|
+
<treebank>
|
242
|
+
<sentence id="93" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quae1:afuit0">
|
243
|
+
<word id="1" form="quae" lemma="qui1" postag="p-s---fn-" head="2" relation="ATR"/>
|
244
|
+
<word id="2" form="libido" lemma="libido1" postag="n-s---fn-" head="19" relation="SBJ"/>
|
245
|
+
<word id="3" form="ab" lemma="ab1" postag="r--------" head="12" relation="AuxP"/>
|
246
|
+
<word id="4" form="oculis" lemma="oculus1" postag="n-p---mb-" head="19" relation="ADV"/>
|
247
|
+
<word id="5" form="," lemma="comma1" postag="u--------" head="12" relation="AuxX"/>
|
248
|
+
<word id="6" form="quod" lemma="qui1" postag="p-s---nn-" head="7" relation="ATR"/>
|
249
|
+
<word id="7" form="facinus" lemma="facinus1" postag="n-s---nn-" head="20" relation="SBJ"/>
|
250
|
+
<word id="8" form="a" lemma="ab1" postag="r--------" head="20" relation="AuxP"/>
|
251
|
+
<word id="9" form="manibus" lemma="manus1" postag="n-p---fb-" head="8" relation="ADV"/>
|
252
|
+
<word id="10" form="umquam" lemma="umquam1" postag="d--------" head="20" relation="ADV"/>
|
253
|
+
<word id="11" form="tuis" lemma="tuus1" postag="a-p---fb-" head="9" relation="ATR"/>
|
254
|
+
<word id="12" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
|
255
|
+
<word id="13" form="quod" lemma="qui1" postag="p-s---nn-" head="14" relation="ATR"/>
|
256
|
+
<word id="14" form="flagitium" lemma="flagitium1" postag="n-s---nn-" head="18" relation="SBJ"/>
|
257
|
+
<word id="15" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
|
258
|
+
<word id="16" form="toto" lemma="totus1" postag="a-s---nb-" head="17" relation="ATR"/>
|
259
|
+
<word id="17" form="corpore" lemma="corpus1" postag="n-s---nb-" head="15" relation="ADV"/>
|
260
|
+
<word id="18" form="afuit" lemma="Asum1" postag="v3sria---" head="12" relation="PRED_CO"/>
|
261
|
+
<word id="19" insertion_id="0018e" form="[0]" artificial="elliptic" head="12" relation="PRED_CO"/>
|
262
|
+
<word id="20" insertion_id="0018f" form="[1]" artificial="elliptic" head="12" relation="PRED_CO"/>
|
263
|
+
</sentence>
|
264
|
+
<sentence id="94" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="cui0:praetulisti0">
|
265
|
+
<word id="1" form="cui" lemma="qui1" postag="p-s---md-" head="3" relation="ATR"/>
|
266
|
+
<word id="2" form="tu" lemma="tu1" postag="p-s---mn-" head="17" relation="SBJ"/>
|
267
|
+
<word id="3" form="adulescentulo" lemma="adulescentulus1" postag="n-s---md-" head="17" relation="OBJ"/>
|
268
|
+
<word id="4" form="quem" lemma="qui1" postag="p-s---ma-" head="7" relation="OBJ"/>
|
269
|
+
<word id="5" form="corruptelarum" lemma="corruptela1" postag="n-p---fg-" head="6" relation="ATR"/>
|
270
|
+
<word id="6" form="inlecebris" lemma="illecebra1" postag="n-p---fb-" head="7" relation="ADV"/>
|
271
|
+
<word id="7" form="inretisses" lemma="irretio" postag="v2slsa---" head="3" relation="ATR"/>
|
272
|
+
<word id="8" form="non" lemma="non1" postag="d--------" head="13" relation="AuxZ"/>
|
273
|
+
<word id="9" form="aut" lemma="aut1" postag="c--------" head="13" relation="AuxY"/>
|
274
|
+
<word id="10" form="ad" lemma="ad1" postag="r--------" head="18" relation="AuxP"/>
|
275
|
+
<word id="11" form="audaciam" lemma="audacia1" postag="n-s---fa-" head="10" relation="ADV"/>
|
276
|
+
<word id="12" form="ferrum" lemma="ferrum1" postag="n-s---na-" head="18" relation="OBJ"/>
|
277
|
+
<word id="13" form="aut" lemma="aut1" postag="c--------" head="17" relation="COORD"/>
|
278
|
+
<word id="14" form="ad" lemma="ad1" postag="r--------" head="17" relation="AuxP"/>
|
279
|
+
<word id="15" form="libidinem" lemma="libido1" postag="n-s---fa-" head="14" relation="ADV"/>
|
280
|
+
<word id="16" form="facem" lemma="fax1" postag="n-s---fa-" head="13" relation="OBJ"/>
|
281
|
+
<word id="17" form="praetulisti" lemma="praefero1" postag="v2sria---" head="0" relation="PRED_CO"/>
|
282
|
+
<word id="18" insertion_id="0017e" form="[0]" artificial="elliptic" head="13" relation="PRED_CO"/>
|
283
|
+
</sentence>
|
284
|
+
</treebank>
|
285
|
+
EOF
|
286
|
+
end
|
287
|
+
|
288
|
+
# Each example feeds one fixture through Treebank::Transform#transform and
# compares the serialized XML with the matching expected fixture.
describe "#transform" do
  it "returns the document when there is nothing to transform" do
    expect(Treebank::Transform.new(tb1).transform).to eq tb1
  end

  context "with a single simple ellipsis" do
    it "inserts a new elliptic node and updates the head" do
      expect(Treebank::Transform.new(tb2).transform).to eq tb2_result
    end
  end

  context "when multiple token are children of the same ellipsis" do
    it "inserts a new elliptic node and updates the head" do
      expect(Treebank::Transform.new(tb3).transform).to eq tb3_result
    end
  end

  context "when ellipses are chained" do
    it "does as it should" do
      expect(Treebank::Transform.new(tb4).transform).to eq tb4_result
    end
  end

  context "with multiple sentences in a document" do
    it "does as it should" do
      expect(Treebank::Transform.new(tb5).transform).to eq tb5_result
    end
  end
end
|
327
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'treebank/transform/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
  # Identity and metadata
  gem.name          = "treebank-transform"
  gem.version       = Treebank::Transform::VERSION
  gem.authors       = ["LFDM"]
  gem.email         = ["1986gh@gmail.com"]
  gem.summary       = "Transforms Perseus Treebank files"
  gem.description   = gem.summary
  gem.homepage      = ""
  gem.license       = "MIT"

  # Package contents are taken from the files tracked by git.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Needed only for developing and testing the gem.
  gem.add_development_dependency "bundler", "~> 1.6"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "simplecov", "~> 0.7"

  # Needed at runtime.
  gem.add_dependency "thor"
  gem.add_dependency "nokogiri"
end
|