treebank-transform 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
module Treebank
  # Generates consecutive lowercase alphabetic labels ("a", "b", ..., "z",
  # "aa", "ab", ...).
  module Alphabet
    # The 26 lowercase ASCII letters in order. next_letter no longer needs
    # it; it is kept so existing references to the constant keep working.
    ALPHABET = ("a".."z").to_a.freeze

    # Returns the label that follows +letter+.
    #
    # For "a".."y" this is simply the next letter. Once the single letters
    # are used up the label grows instead of becoming nil: "z" is followed
    # by "aa", "az" by "ba", "zz" by "aaa".
    #
    # @param letter [String] a non-empty label made of the characters a-z
    # @return [String] the successor label (a new String)
    # @raise [ArgumentError] if +letter+ is not a non-empty a-z String
    def self.next_letter(letter)
      unless letter.is_a?(String) && letter =~ /\A[a-z]+\z/
        raise ArgumentError, "expected a lowercase a-z label, got #{letter.inspect}"
      end

      # String#succ carries over exactly like a base-26 counter.
      letter.succ
    end
  end
end
@@ -0,0 +1,42 @@
1
module Treebank
  # Wraps a single word node whose +relation+ may encode an elided head,
  # e.g. "SBJ_ExD2_OBJ_ExD1_ADV", and rewrites it to point at an explicit
  # elliptic node that is created in the owning sentence on demand.
  #
  # The word node only has to respond to +[]+ and +[]=+ with the string
  # keys 'relation', 'head' and 'id'. The sentence has to respond to
  # +elliptic_nodes+ (a Hash keyed by the ellipsis number as a String) and
  # +add_ellipsis(attrs, string)+.
  class EllipticWord
    # Splits a relation into: the word's own label, the ellipsis number,
    # and the remaining label that belongs to the elliptic node (which may
    # itself contain further "_ExD<n>_" parts).
    ELLIPSIS_PATTERN = /(\w+?)_ExD(\d+)_(.+)/

    # @param word_node [#[], #[]=] the word to inspect and rewrite
    # @param sentence  [#elliptic_nodes, #add_ellipsis] the owning sentence
    def initialize(word_node, sentence)
      @node = word_node
      @sentence = sentence
    end

    # Rewrites the word when its relation encodes an ellipsis:
    # 'relation' becomes the word's own label and 'head' becomes the id of
    # the elliptic node, which is created first if the sentence has not
    # registered one for that ellipsis number yet.
    #
    # A word without an ellipsis in its relation is left untouched; this
    # includes a word that has no relation at all (nil is treated like an
    # empty relation instead of raising NoMethodError).
    #
    # @return [Object, nil] the new head id, or nil when nothing was changed
    def parse_elliptic_head
      match = regexp.match(@node['relation'].to_s)
      return unless match

      label, elliptic_string, elliptic_label = match.captures

      # Reuse an already registered elliptic node; otherwise create one with
      # this word's current head and take over its id.
      head = @sentence.elliptic_nodes[elliptic_string] ||
             create_new_node(@node['head'], elliptic_label, elliptic_string)['id']

      @node['relation'] = label
      @node['head'] = head
    end

    private

    # Adds an elliptic node to the sentence and immediately resolves its own
    # relation, so chained ellipses are unfolded recursively.
    def create_new_node(head, label, string)
      new_node = @sentence.add_ellipsis({
        artificial: 'elliptic',
        head: head,
        relation: label,
      }, string)

      EllipticWord.new(new_node, @sentence).parse_elliptic_head

      new_node
    end

    def regexp
      ELLIPSIS_PATTERN
    end
  end
end
@@ -0,0 +1,69 @@
1
module Treebank
  # Wraps one sentence node of a treebank document and appends artificial
  # elliptic word nodes to it.
  class Sentence
    require "treebank/alphabet"

    # Ids of the elliptic nodes added so far, keyed by the string that was
    # passed to add_ellipsis.
    attr_reader :elliptic_nodes

    # @param sentence_node [Nokogiri::XML::Node] the sentence element; its
    #   'word' children are looked up via xpath
    def initialize(sentence_node)
      @node = sentence_node
      # Both counters start at the id of the last word present at
      # construction time. @last_id is never changed afterwards (it feeds
      # insertion_id); @next_id grows with every added node.
      @last_id = @next_id = last_id
      @elliptic_nodes = {}
    end

    # Creates a new word element and appends it to the sentence node.
    #
    # The element gets, in this order, an +id+ (the next free integer), an
    # +insertion_id+ (the original last word id left-padded with zeros to
    # four digits plus a letter suffix) and a +form+ of "[string]",
    # followed by everything in +attrs+. Values in +attrs+ win when a key
    # is given twice.
    #
    # @param attrs  [Hash] additional attributes for the new element
    # @param string [String] key under which the new id is registered in
    #   elliptic_nodes; also used for the form
    # @return [Nokogiri::XML::Node] the newly added element
    def add_ellipsis(attrs, string)
      id = next_id
      all_attrs = {
        id: id,
        insertion_id: get_insertion_id,
        form: "[#{string}]"
      }.merge(attrs)

      new_node = new_word(all_attrs)
      @elliptic_nodes[string] = id

      # Surround the element with whitespace text nodes so the serialized
      # document keeps its line layout.
      @node.add_child(indent)
      @node.add_child(new_node)
      @node.add_child(new_line)
      new_node
    end

    private

    def next_id
      update_last_id
    end

    # Id of the last word child, memoized. A sentence without any word
    # child (or whose last word carries no id attribute) yields 0 instead
    # of raising NoMethodError.
    def last_id
      return @last_id if @last_id
      last_word = @node.xpath('word').last
      @last_id = last_word ? last_word['id'].to_i : 0
    end

    # Despite its name this advances @next_id and returns the new value;
    # @last_id stays untouched.
    def update_last_id
      @next_id += 1
    end

    # Letter suffix for insertion ids: 'e' for the first added node, then
    # whatever Alphabet.next_letter returns for the previous suffix.
    def suffix
      @suffix = @suffix ? Alphabet.next_letter(@suffix) : 'e'
    end

    def get_insertion_id
      "#{last_id.to_s.rjust(4, '0')}#{suffix}"
    end

    def new_word(attrs)
      word = Nokogiri::XML::Node.new('word', @node)
      attrs.each { |k, v| word[k] = v }
      word
    end

    # NOTE(review): the whitespace inside the two literals below is copied
    # as it appears in the reviewed source; confirm the exact widths against
    # the expected output in the specs.
    def indent
      Nokogiri::XML::Text.new(" ", @node)
    end

    def new_line
      Nokogiri::XML::Text.new("\n ", @node)
    end
  end
end
69
+
@@ -0,0 +1,15 @@
1
+ require 'treebank/transform'
2
+ require 'thor'
3
+
4
module Treebank
  class Transform
    # Thor-based command line front end exposing a single `do` command.
    class CLI < Thor
      desc 'do FILE', 'transforms 1.5 Treebanks to the interim Arethusa format'
      # Reads FILE, runs it through Transform and prints the resulting
      # document to standard output.
      def do(file)
        source = File.read(file)
        puts Transform.new(source).transform
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Treebank
  class Transform
    # Release number of the gem.
    VERSION = '0.0.1'
  end
end
@@ -0,0 +1,36 @@
1
+ require "treebank/transform/version"
2
+ require "nokogiri"
3
+
4
module Treebank
  require "treebank/sentence"
  require "treebank/elliptic_word"

  # Parses a treebank XML document and rewrites every word whose relation
  # encodes an ellipsis ("..._ExD<n>_...").
  class Transform
    # @param doc [String, IO] the XML source, handed to Nokogiri::XML
    def initialize(doc)
      @doc = Nokogiri::XML(doc)
    end

    # Rewrites the elliptic words in place and serializes the document.
    #
    # @return [String] the document as XML
    def transform
      transform_elliptic_nodes
      @doc.to_xml(indent: 2)
    end

    private

    # Visits every word of every //treebank/sentence and lets EllipticWord
    # rewrite the ones that carry an ellipsis. One Sentence wrapper is
    # shared by all words of a sentence so they can reuse elliptic nodes.
    def transform_elliptic_nodes
      @doc.xpath('//treebank/sentence').each do |sentence_node|
        sentence = Sentence.new(sentence_node)
        sentence_node.xpath('word').each do |word_node|
          next unless has_elliptic_head(word_node['relation'])

          EllipticWord.new(word_node, sentence).parse_elliptic_head
        end
      end
    end

    # True when +label+ contains an ellipsis marker ("ExD" followed by at
    # least one digit). A missing relation (nil) counts as "no ellipsis"
    # instead of raising NoMethodError.
    def has_elliptic_head(label)
      !(label.to_s =~ /ExD\d+/).nil?
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'simplecov'
2
+ require 'coveralls'
3
+
4
# Coverage has to be configured before the library under test is required
# further down; keep the order of the statements in this file.
Coveralls.wear!

# Use both the HTML formatter and the Coveralls formatter.
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
  SimpleCov::Formatter::HTMLFormatter,
  Coveralls::SimpleCov::Formatter
]

# The specs themselves are excluded from the coverage figures.
SimpleCov.start { add_filter '/spec/' }

# Make lib/ (relative to this file) loadable, then load the gem.
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
require 'treebank/transform'

RSpec.configure do |rspec|
  # Run only examples tagged :focus; if nothing is tagged, run everything.
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus
end
@@ -0,0 +1,327 @@
1
+ require 'spec_helper'
2
+
3
+ describe Treebank::Transform do
4
# Sanity check: the gem defines its version constant.
it 'has a version number' do
  version = Treebank::Transform::VERSION
  expect(version).not_to be nil
end
7
+
8
+ # Examples taken from http://nlp.perseus.tufts.edu/syntax/treebank/ldt/1.5/data/1999.02.0010.xml
9
+
10
+ let(:tb1) do
11
+ <<EOF
12
+ <?xml version="1.0"?>
13
+ <treebank>
14
+ <sentence id="2" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=1" span="quam0:eludet0">
15
+ <word id="1" form="quam" lemma="quam1" postag="d--------" head="2" relation="ADV"/>
16
+ <word id="2" form="diu" lemma="diu1" postag="d--------" head="8" relation="ADV"/>
17
+ <word id="3" form="etiam" lemma="etiam1" postag="c--------" head="8" relation="AuxY"/>
18
+ <word id="4" form="furor" lemma="furor2" postag="n-s---mn-" head="8" relation="SBJ"/>
19
+ <word id="5" form="iste" lemma="iste1" postag="p-s---mn-" head="4" relation="ATR"/>
20
+ <word id="6" form="tuus" lemma="tuus1" postag="a-s---mn-" head="4" relation="ATR"/>
21
+ <word id="7" form="nos" lemma="nos1" postag="p-p---ma-" head="8" relation="OBJ"/>
22
+ <word id="8" form="eludet" lemma="eludo1" postag="v3sfia---" head="0" relation="PRED"/>
23
+ </sentence>
24
+ </treebank>
25
+ EOF
26
+ end
27
+
28
+ let(:tb2) do
29
+ <<EOF
30
+ <?xml version="1.0"?>
31
+ <treebank>
32
+ <sentence id="126" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=7" span="si4:desinam0">
33
+ <word id="1" form="si" lemma="si1" postag="c--------" head="6" relation="AuxC"/>
34
+ <word id="2" form="est" lemma="sum1" postag="v3spia---" head="1" relation="ADV"/>
35
+ <word id="3" form="verus" lemma="verus1" postag="a-s---mn-" head="2" relation="PNOM"/>
36
+ <word id="4" form="," lemma="comma1" postag="u--------" head="1" relation="AuxX"/>
37
+ <word id="5" form="ne" lemma="ne1" postag="c--------" head="7" relation="AuxC"/>
38
+ <word id="6" form="opprimar" lemma="opprimo1" postag="v1spsp---" head="5" relation="ExD_CO"/>
39
+ <word id="7" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
40
+ <word id="8" form="sin" lemma="si1" postag="c--------" head="15" relation="AuxC"/>
41
+ <word id="9" form="falsus" lemma="falsus1" postag="a-s---mn-" head="8" relation="PNOM_ExD0_ADV"/>
42
+ <word id="10" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
43
+ <word id="11" form="ut" lemma="ut1" postag="c--------" head="7" relation="AuxC"/>
44
+ <word id="12" form="tandem" lemma="tandem1" postag="d--------" head="15" relation="AuxY"/>
45
+ <word id="13" form="aliquando" lemma="aliquando1" postag="d--------" head="15" relation="ADV"/>
46
+ <word id="14" form="timere" lemma="timeo1" postag="v--pna---" head="15" relation="OBJ"/>
47
+ <word id="15" form="desinam" lemma="desino1" postag="v1spsa---" head="11" relation="ExD_CO"/>
48
+ </sentence>
49
+ </treebank>
50
+ EOF
51
+ end
52
+
53
+ let(:tb2_result) do
54
+ <<EOF
55
+ <?xml version="1.0"?>
56
+ <treebank>
57
+ <sentence id="126" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=7" span="si4:desinam0">
58
+ <word id="1" form="si" lemma="si1" postag="c--------" head="6" relation="AuxC"/>
59
+ <word id="2" form="est" lemma="sum1" postag="v3spia---" head="1" relation="ADV"/>
60
+ <word id="3" form="verus" lemma="verus1" postag="a-s---mn-" head="2" relation="PNOM"/>
61
+ <word id="4" form="," lemma="comma1" postag="u--------" head="1" relation="AuxX"/>
62
+ <word id="5" form="ne" lemma="ne1" postag="c--------" head="7" relation="AuxC"/>
63
+ <word id="6" form="opprimar" lemma="opprimo1" postag="v1spsp---" head="5" relation="ExD_CO"/>
64
+ <word id="7" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
65
+ <word id="8" form="sin" lemma="si1" postag="c--------" head="15" relation="AuxC"/>
66
+ <word id="9" form="falsus" lemma="falsus1" postag="a-s---mn-" head="16" relation="PNOM"/>
67
+ <word id="10" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
68
+ <word id="11" form="ut" lemma="ut1" postag="c--------" head="7" relation="AuxC"/>
69
+ <word id="12" form="tandem" lemma="tandem1" postag="d--------" head="15" relation="AuxY"/>
70
+ <word id="13" form="aliquando" lemma="aliquando1" postag="d--------" head="15" relation="ADV"/>
71
+ <word id="14" form="timere" lemma="timeo1" postag="v--pna---" head="15" relation="OBJ"/>
72
+ <word id="15" form="desinam" lemma="desino1" postag="v1spsa---" head="11" relation="ExD_CO"/>
73
+ <word id="16" insertion_id="0015e" form="[0]" artificial="elliptic" head="8" relation="ADV"/>
74
+ </sentence>
75
+ </treebank>
76
+ EOF
77
+ end
78
+
79
+ let(:tb3) do
80
+ <<EOF
81
+ <?xml version="1.0"?>
82
+ <treebank>
83
+ <sentence id="95" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quid2:vero0">
84
+ <word id="1" form="quid" lemma="quis1" postag="p-s---nn-" head="0" relation="SBJ_ExD0_PRED"/>
85
+ <word id="2" form="vero" lemma="verus1" postag="d--------" head="0" relation="AuxY_ExD0_PRED"/>
86
+ </sentence>
87
+ </treebank>
88
+ EOF
89
+ end
90
+
91
+ let(:tb3_result) do
92
+ <<EOF
93
+ <?xml version="1.0"?>
94
+ <treebank>
95
+ <sentence id="95" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quid2:vero0">
96
+ <word id="1" form="quid" lemma="quis1" postag="p-s---nn-" head="3" relation="SBJ"/>
97
+ <word id="2" form="vero" lemma="verus1" postag="d--------" head="3" relation="AuxY"/>
98
+ <word id="3" insertion_id="0002e" form="[0]" artificial="elliptic" head="0" relation="PRED"/>
99
+ </sentence>
100
+ </treebank>
101
+ EOF
102
+ end
103
+
104
+ let(:tb4) do
105
+ <<EOF
106
+ <?xml version="1.0"?>
107
+ <treebank>
108
+ <sentence id="31" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=2" span="si0:dicat0">
109
+ <word id="1" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
110
+ <word id="2" form="te" lemma="tu1" postag="p-s---ma-" head="7" relation="SBJ"/>
111
+ <word id="3" form="iam" lemma="jam1" postag="d--------" head="7" relation="AuxY"/>
112
+ <word id="4" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
113
+ <word id="5" form="Catilina" lemma="Catilina1" postag="n-s---mv-" head="7" relation="ExD"/>
114
+ <word id="6" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
115
+ <word id="7" form="comprehendi" lemma="comprehendo1" postag="v--pnp---" head="1" relation="OBJ_ExD0_ADV_CO"/>
116
+ <word id="8" form="," lemma="comma1" postag="u--------" head="16" relation="COORD"/>
117
+ <word id="9" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
118
+ <word id="10" form="interfici" lemma="interficio1" postag="v--pnp---" head="11" relation="OBJ"/>
119
+ <word id="11" form="iussero" lemma="jubeo1" postag="v1stia---" head="9" relation="ADV_CO"/>
120
+ <word id="12" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
121
+ <word id="13" form="credo" lemma="credo1" postag="v1spia---" head="0" relation="PRED"/>
122
+ <word id="14" form="," lemma="comma1" postag="u--------" head="13" relation="AuxX"/>
123
+ <word id="15" form="erit" lemma="sum1" postag="v3sfia---" head="16" relation="AuxV"/>
124
+ <word id="16" form="verendum" lemma="vereor1" postag="t-spgpna-" head="13" relation="PRED"/>
125
+ <word id="17" form="mihi" lemma="ego1" postag="p-s---md-" head="16" relation="ADV"/>
126
+ <word id="18" form="ne" lemma="ne1" postag="c--------" head="16" relation="AuxC"/>
127
+ <word id="19" form="non" lemma="non1" postag="d--------" head="18" relation="AuxZ_ExD1_ADV"/>
128
+ <word id="20" form="hoc" lemma="hic1" postag="p-s---na-" head="18" relation="SBJ_ExD2_OBJ_ExD1_ADV"/>
129
+ <word id="21" form="potius" lemma="potis1" postag="d--------" head="18" relation="ADV_ExD1_ADV"/>
130
+ <word id="22" form="omnes" lemma="omnis1" postag="a-p---mn-" head="23" relation="ATR"/>
131
+ <word id="23" form="boni" lemma="bonus1" postag="a-p---mn-" head="18" relation="SBJ_ExD1_ADV"/>
132
+ <word id="24" form="serius" lemma="serus1" postag="d--------" head="18" relation="ADV_ExD2_OBJ_ExD1_ADV"/>
133
+ <word id="25" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
134
+ <word id="26" form="me" lemma="ego1" postag="p-s---mb-" head="25" relation="ADV_ExD2_OBJ_ExD1_ADV"/>
135
+ <word id="27" form="quam" lemma="quam1" postag="d--------" head="21" relation="AuxC"/>
136
+ <word id="28" form="quisquam" lemma="quisquam1" postag="p-s---mn-" head="32" relation="SBJ"/>
137
+ <word id="29" form="crudelius" lemma="crudelis1" postag="d--------" head="30" relation="ADV"/>
138
+ <word id="30" form="factum" lemma="facio1" postag="t-srppna-" head="32" relation="OBJ"/>
139
+ <word id="31" form="esse" lemma="sum1" postag="v--pna---" head="30" relation="AuxV"/>
140
+ <word id="32" form="dicat" lemma="dico2" postag="v3spsa---" head="27" relation="ADV"/>
141
+ </sentence>
142
+ </treebank>
143
+ EOF
144
+ end
145
+
146
+ let(:tb4_result) do
147
+ <<EOF
148
+ <?xml version="1.0"?>
149
+ <treebank>
150
+ <sentence id="31" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=2" span="si0:dicat0">
151
+ <word id="1" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
152
+ <word id="2" form="te" lemma="tu1" postag="p-s---ma-" head="7" relation="SBJ"/>
153
+ <word id="3" form="iam" lemma="jam1" postag="d--------" head="7" relation="AuxY"/>
154
+ <word id="4" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
155
+ <word id="5" form="Catilina" lemma="Catilina1" postag="n-s---mv-" head="7" relation="ExD"/>
156
+ <word id="6" form="," lemma="comma1" postag="u--------" head="5" relation="AuxX"/>
157
+ <word id="7" form="comprehendi" lemma="comprehendo1" postag="v--pnp---" head="33" relation="OBJ"/>
158
+ <word id="8" form="," lemma="comma1" postag="u--------" head="16" relation="COORD"/>
159
+ <word id="9" form="si" lemma="si1" postag="c--------" head="8" relation="AuxC"/>
160
+ <word id="10" form="interfici" lemma="interficio1" postag="v--pnp---" head="11" relation="OBJ"/>
161
+ <word id="11" form="iussero" lemma="jubeo1" postag="v1stia---" head="9" relation="ADV_CO"/>
162
+ <word id="12" form="," lemma="comma1" postag="u--------" head="8" relation="AuxX"/>
163
+ <word id="13" form="credo" lemma="credo1" postag="v1spia---" head="0" relation="PRED"/>
164
+ <word id="14" form="," lemma="comma1" postag="u--------" head="13" relation="AuxX"/>
165
+ <word id="15" form="erit" lemma="sum1" postag="v3sfia---" head="16" relation="AuxV"/>
166
+ <word id="16" form="verendum" lemma="vereor1" postag="t-spgpna-" head="13" relation="PRED"/>
167
+ <word id="17" form="mihi" lemma="ego1" postag="p-s---md-" head="16" relation="ADV"/>
168
+ <word id="18" form="ne" lemma="ne1" postag="c--------" head="16" relation="AuxC"/>
169
+ <word id="19" form="non" lemma="non1" postag="d--------" head="34" relation="AuxZ"/>
170
+ <word id="20" form="hoc" lemma="hic1" postag="p-s---na-" head="35" relation="SBJ"/>
171
+ <word id="21" form="potius" lemma="potis1" postag="d--------" head="34" relation="ADV"/>
172
+ <word id="22" form="omnes" lemma="omnis1" postag="a-p---mn-" head="23" relation="ATR"/>
173
+ <word id="23" form="boni" lemma="bonus1" postag="a-p---mn-" head="34" relation="SBJ"/>
174
+ <word id="24" form="serius" lemma="serus1" postag="d--------" head="35" relation="ADV"/>
175
+ <word id="25" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
176
+ <word id="26" form="me" lemma="ego1" postag="p-s---mb-" head="35" relation="ADV"/>
177
+ <word id="27" form="quam" lemma="quam1" postag="d--------" head="21" relation="AuxC"/>
178
+ <word id="28" form="quisquam" lemma="quisquam1" postag="p-s---mn-" head="32" relation="SBJ"/>
179
+ <word id="29" form="crudelius" lemma="crudelis1" postag="d--------" head="30" relation="ADV"/>
180
+ <word id="30" form="factum" lemma="facio1" postag="t-srppna-" head="32" relation="OBJ"/>
181
+ <word id="31" form="esse" lemma="sum1" postag="v--pna---" head="30" relation="AuxV"/>
182
+ <word id="32" form="dicat" lemma="dico2" postag="v3spsa---" head="27" relation="ADV"/>
183
+ <word id="33" insertion_id="0032e" form="[0]" artificial="elliptic" head="1" relation="ADV_CO"/>
184
+ <word id="34" insertion_id="0032f" form="[1]" artificial="elliptic" head="18" relation="ADV"/>
185
+ <word id="35" insertion_id="0032g" form="[2]" artificial="elliptic" head="34" relation="OBJ"/>
186
+ </sentence>
187
+ </treebank>
188
+ EOF
189
+ end
190
+
191
+ let(:tb5) do
192
+ <<EOF
193
+ <?xml version="1.0"?>
194
+ <treebank>
195
+ <sentence id="93" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quae1:afuit0">
196
+ <word id="1" form="quae" lemma="qui1" postag="p-s---fn-" head="2" relation="ATR" />
197
+ <word id="2" form="libido" lemma="libido1" postag="n-s---fn-" head="12" relation="SBJ_ExD0_PRED_CO" />
198
+ <word id="3" form="ab" lemma="ab1" postag="r--------" head="12" relation="AuxP" />
199
+ <word id="4" form="oculis" lemma="oculus1" postag="n-p---mb-" head="3" relation="ADV_ExD0_PRED_CO" />
200
+ <word id="5" form="," lemma="comma1" postag="u--------" head="12" relation="AuxX" />
201
+ <word id="6" form="quod" lemma="qui1" postag="p-s---nn-" head="7" relation="ATR" />
202
+ <word id="7" form="facinus" lemma="facinus1" postag="n-s---nn-" head="12" relation="SBJ_ExD1_PRED_CO" />
203
+ <word id="8" form="a" lemma="ab1" postag="r--------" head="12" relation="AuxP_ExD1_PRED_CO" />
204
+ <word id="9" form="manibus" lemma="manus1" postag="n-p---fb-" head="8" relation="ADV" />
205
+ <word id="10" form="umquam" lemma="umquam1" postag="d--------" head="12" relation="ADV_ExD1_PRED_CO" />
206
+ <word id="11" form="tuis" lemma="tuus1" postag="a-p---fb-" head="9" relation="ATR" />
207
+ <word id="12" form="," lemma="comma1" postag="u--------" head="0" relation="COORD" />
208
+ <word id="13" form="quod" lemma="qui1" postag="p-s---nn-" head="14" relation="ATR" />
209
+ <word id="14" form="flagitium" lemma="flagitium1" postag="n-s---nn-" head="18" relation="SBJ" />
210
+ <word id="15" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP" />
211
+ <word id="16" form="toto" lemma="totus1" postag="a-s---nb-" head="17" relation="ATR" />
212
+ <word id="17" form="corpore" lemma="corpus1" postag="n-s---nb-" head="15" relation="ADV" />
213
+ <word id="18" form="afuit" lemma="Asum1" postag="v3sria---" head="12" relation="PRED_CO" />
214
+ </sentence>
215
+ <sentence id="94" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="cui0:praetulisti0">
216
+ <word id="1" form="cui" lemma="qui1" postag="p-s---md-" head="3" relation="ATR" />
217
+ <word id="2" form="tu" lemma="tu1" postag="p-s---mn-" head="17" relation="SBJ" />
218
+ <word id="3" form="adulescentulo" lemma="adulescentulus1" postag="n-s---md-" head="17" relation="OBJ" />
219
+ <word id="4" form="quem" lemma="qui1" postag="p-s---ma-" head="7" relation="OBJ" />
220
+ <word id="5" form="corruptelarum" lemma="corruptela1" postag="n-p---fg-" head="6" relation="ATR" />
221
+ <word id="6" form="inlecebris" lemma="illecebra1" postag="n-p---fb-" head="7" relation="ADV" />
222
+ <word id="7" form="inretisses" lemma="irretio" postag="v2slsa---" head="3" relation="ATR" />
223
+ <word id="8" form="non" lemma="non1" postag="d--------" head="13" relation="AuxZ" />
224
+ <word id="9" form="aut" lemma="aut1" postag="c--------" head="13" relation="AuxY" />
225
+ <word id="10" form="ad" lemma="ad1" postag="r--------" head="13" relation="AuxP_ExD0_PRED_CO" />
226
+ <word id="11" form="audaciam" lemma="audacia1" postag="n-s---fa-" head="10" relation="ADV" />
227
+ <word id="12" form="ferrum" lemma="ferrum1" postag="n-s---na-" head="13" relation="OBJ_ExD0_PRED_CO" />
228
+ <word id="13" form="aut" lemma="aut1" postag="c--------" head="17" relation="COORD" />
229
+ <word id="14" form="ad" lemma="ad1" postag="r--------" head="17" relation="AuxP" />
230
+ <word id="15" form="libidinem" lemma="libido1" postag="n-s---fa-" head="14" relation="ADV" />
231
+ <word id="16" form="facem" lemma="fax1" postag="n-s---fa-" head="13" relation="OBJ" />
232
+ <word id="17" form="praetulisti" lemma="praefero1" postag="v2sria---" head="0" relation="PRED_CO" />
233
+ </sentence>
234
+ </treebank>
235
+ EOF
236
+ end
237
+
238
+ let(:tb5_result) do
239
+ <<EOF
240
+ <?xml version="1.0"?>
241
+ <treebank>
242
+ <sentence id="93" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="quae1:afuit0">
243
+ <word id="1" form="quae" lemma="qui1" postag="p-s---fn-" head="2" relation="ATR"/>
244
+ <word id="2" form="libido" lemma="libido1" postag="n-s---fn-" head="19" relation="SBJ"/>
245
+ <word id="3" form="ab" lemma="ab1" postag="r--------" head="12" relation="AuxP"/>
246
+ <word id="4" form="oculis" lemma="oculus1" postag="n-p---mb-" head="19" relation="ADV"/>
247
+ <word id="5" form="," lemma="comma1" postag="u--------" head="12" relation="AuxX"/>
248
+ <word id="6" form="quod" lemma="qui1" postag="p-s---nn-" head="7" relation="ATR"/>
249
+ <word id="7" form="facinus" lemma="facinus1" postag="n-s---nn-" head="20" relation="SBJ"/>
250
+ <word id="8" form="a" lemma="ab1" postag="r--------" head="20" relation="AuxP"/>
251
+ <word id="9" form="manibus" lemma="manus1" postag="n-p---fb-" head="8" relation="ADV"/>
252
+ <word id="10" form="umquam" lemma="umquam1" postag="d--------" head="20" relation="ADV"/>
253
+ <word id="11" form="tuis" lemma="tuus1" postag="a-p---fb-" head="9" relation="ATR"/>
254
+ <word id="12" form="," lemma="comma1" postag="u--------" head="0" relation="COORD"/>
255
+ <word id="13" form="quod" lemma="qui1" postag="p-s---nn-" head="14" relation="ATR"/>
256
+ <word id="14" form="flagitium" lemma="flagitium1" postag="n-s---nn-" head="18" relation="SBJ"/>
257
+ <word id="15" form="a" lemma="ab1" postag="r--------" head="18" relation="AuxP"/>
258
+ <word id="16" form="toto" lemma="totus1" postag="a-s---nb-" head="17" relation="ATR"/>
259
+ <word id="17" form="corpore" lemma="corpus1" postag="n-s---nb-" head="15" relation="ADV"/>
260
+ <word id="18" form="afuit" lemma="Asum1" postag="v3sria---" head="12" relation="PRED_CO"/>
261
+ <word id="19" insertion_id="0018e" form="[0]" artificial="elliptic" head="12" relation="PRED_CO"/>
262
+ <word id="20" insertion_id="0018f" form="[1]" artificial="elliptic" head="12" relation="PRED_CO"/>
263
+ </sentence>
264
+ <sentence id="94" document_id="Perseus:text:1999.02.0010" subdoc="text=Catil.:Speech=1:chapter=6" span="cui0:praetulisti0">
265
+ <word id="1" form="cui" lemma="qui1" postag="p-s---md-" head="3" relation="ATR"/>
266
+ <word id="2" form="tu" lemma="tu1" postag="p-s---mn-" head="17" relation="SBJ"/>
267
+ <word id="3" form="adulescentulo" lemma="adulescentulus1" postag="n-s---md-" head="17" relation="OBJ"/>
268
+ <word id="4" form="quem" lemma="qui1" postag="p-s---ma-" head="7" relation="OBJ"/>
269
+ <word id="5" form="corruptelarum" lemma="corruptela1" postag="n-p---fg-" head="6" relation="ATR"/>
270
+ <word id="6" form="inlecebris" lemma="illecebra1" postag="n-p---fb-" head="7" relation="ADV"/>
271
+ <word id="7" form="inretisses" lemma="irretio" postag="v2slsa---" head="3" relation="ATR"/>
272
+ <word id="8" form="non" lemma="non1" postag="d--------" head="13" relation="AuxZ"/>
273
+ <word id="9" form="aut" lemma="aut1" postag="c--------" head="13" relation="AuxY"/>
274
+ <word id="10" form="ad" lemma="ad1" postag="r--------" head="18" relation="AuxP"/>
275
+ <word id="11" form="audaciam" lemma="audacia1" postag="n-s---fa-" head="10" relation="ADV"/>
276
+ <word id="12" form="ferrum" lemma="ferrum1" postag="n-s---na-" head="18" relation="OBJ"/>
277
+ <word id="13" form="aut" lemma="aut1" postag="c--------" head="17" relation="COORD"/>
278
+ <word id="14" form="ad" lemma="ad1" postag="r--------" head="17" relation="AuxP"/>
279
+ <word id="15" form="libidinem" lemma="libido1" postag="n-s---fa-" head="14" relation="ADV"/>
280
+ <word id="16" form="facem" lemma="fax1" postag="n-s---fa-" head="13" relation="OBJ"/>
281
+ <word id="17" form="praetulisti" lemma="praefero1" postag="v2sria---" head="0" relation="PRED_CO"/>
282
+ <word id="18" insertion_id="0017e" form="[0]" artificial="elliptic" head="13" relation="PRED_CO"/>
283
+ </sentence>
284
+ </treebank>
285
+ EOF
286
+ end
287
+
288
# Each example feeds one of the tbN fixtures through a fresh transformer
# and compares the serialized output with the matching expected document.
describe "#transform" do
  it "returns the document when there is nothing to transform" do
    expect(Treebank::Transform.new(tb1).transform).to eq tb1
  end

  context "with a single simple ellipsis" do
    it "inserts a new elliptic node and updates the head" do
      expect(Treebank::Transform.new(tb2).transform).to eq tb2_result
    end
  end

  context "when multiple token are children of the same ellipsis" do
    it "inserts a new elliptic node and updates the head" do
      expect(Treebank::Transform.new(tb3).transform).to eq tb3_result
    end
  end

  context "when ellipses are chained" do
    it "does as it should" do
      expect(Treebank::Transform.new(tb4).transform).to eq tb4_result
    end
  end

  context "with multiple sentences in a document" do
    it "does as it should" do
      expect(Treebank::Transform.new(tb5).transform).to eq tb5_result
    end
  end
end
327
+ end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'treebank/transform/version'
5
+
6
# Gem metadata for treebank-transform.
Gem::Specification.new do |gem|
  gem.name = "treebank-transform"
  gem.version = Treebank::Transform::VERSION
  gem.authors = ["LFDM"]
  gem.email = ["1986gh@gmail.com"]
  gem.summary = "Transforms Perseus Treebank files"
  # The description is deliberately the same text as the summary.
  gem.description = gem.summary
  gem.homepage = ""
  gem.license = "MIT"

  # Package every file tracked by git; executables and test files are
  # picked out of that list by path prefix.
  tracked = `git ls-files -z`.split("\x0")
  gem.files = tracked
  gem.executables = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Needed for development and testing only.
  gem.add_development_dependency("bundler", "~> 1.6")
  gem.add_development_dependency("rake", "~> 10.0")
  gem.add_development_dependency("rspec")
  gem.add_development_dependency("simplecov", "~> 0.7")

  # Needed at runtime.
  gem.add_dependency("thor")
  gem.add_dependency("nokogiri")
end