llt-segmenter 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/lib/llt/segmenter.rb +5 -1
- data/lib/llt/segmenter/version.rb +1 -1
- data/spec/lib/llt/segmenter_spec.rb +94 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3575aa6f3f997afa19250f04c5d01e881279db0a
|
4
|
+
data.tar.gz: e07b2452c0ed737ebddadc4b61c097b48cbcc6b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 36fdb7ee87f64279f678b1f96741a9bdfc19003c2c190d4a7982a3213455802c8b49b752adca3fcd0d732a2d50d17f96001944180bdd778fc651a69b0606ebe8
|
7
|
+
data.tar.gz: 65237979c9a8e3cec56c65ef3707e947347debdfc6c381bd427b4961c6fc72b195a90c745a5cf229d8ea2ae477c74ef362b84c5ed6e43cc8da19eabd9c9005e6
|
data/Gemfile
CHANGED
data/lib/llt/segmenter.rb
CHANGED
@@ -127,6 +127,7 @@ module LLT
|
|
127
127
|
|
128
128
|
sentence = scan_until_next_sentence(scanner, sentences)
|
129
129
|
|
130
|
+
|
130
131
|
raise if scanner.pos == loop_guard
|
131
132
|
|
132
133
|
if @xml
|
@@ -194,8 +195,11 @@ module LLT
|
|
194
195
|
def rescue_no_delimiters(sentences, scanner)
|
195
196
|
if sentences.any?
|
196
197
|
# broken off texts
|
197
|
-
scanner.scan_until(
|
198
|
+
scanner.scan_until(/\Z/)
|
198
199
|
else
|
200
|
+
if scanner.eos? && @xml
|
201
|
+
return ''
|
202
|
+
end
|
199
203
|
# try a simple newline as delimiter, if there was no delimiter
|
200
204
|
scanner.reset
|
201
205
|
@sentence_closer = /\n/
|
@@ -158,6 +158,100 @@ describe LLT::Segmenter do
|
|
158
158
|
sentences = segmenter.segment(txt, xml: true)
|
159
159
|
sentences.should have(1).item
|
160
160
|
end
|
161
|
+
|
162
|
+
it "doesn't fall for complex documents" do
|
163
|
+
txt = <<-EOF
|
164
|
+
<tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
|
165
|
+
<tei:text xml:lang="grc">
|
166
|
+
<tei:body>
|
167
|
+
<tei:div type="line">
|
168
|
+
<milestone ed="P" unit="para"/>μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος</tei:div>
|
169
|
+
</tei:body>
|
170
|
+
</tei:text>
|
171
|
+
</tei:TEI>
|
172
|
+
EOF
|
173
|
+
sentences = segmenter.segment(txt, xml: true)
|
174
|
+
sentences.should have(1).item
|
175
|
+
end
|
176
|
+
|
177
|
+
it "doesn't fall for complex documents II" do
|
178
|
+
txt = <<-EOF
|
179
|
+
<tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
|
180
|
+
<tei:text xml:lang="grc">
|
181
|
+
<tei:body>
|
182
|
+
<tei:div type="line">
|
183
|
+
<milestone ed="P" unit="para"/>Arma virum. Test.</tei:div>
|
184
|
+
</tei:body>
|
185
|
+
</tei:text>
|
186
|
+
</tei:TEI>
|
187
|
+
EOF
|
188
|
+
sentences = segmenter.segment(txt, xml: true)
|
189
|
+
sentences.should have(2).item
|
190
|
+
end
|
191
|
+
|
192
|
+
it "doesn't fall for complex documents III" do
|
193
|
+
txt = <<-EOF
|
194
|
+
<tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
|
195
|
+
<tei:text xml:lang="grc">
|
196
|
+
<tei:body>
|
197
|
+
<tei:div type="line">
|
198
|
+
<milestone ed="P" unit="para"/>Arma virum. Test</tei:div>
|
199
|
+
</tei:body>
|
200
|
+
</tei:text>
|
201
|
+
</tei:TEI>
|
202
|
+
EOF
|
203
|
+
sentences = segmenter.segment(txt, xml: true)
|
204
|
+
sentences.should have(2).item
|
205
|
+
end
|
206
|
+
|
207
|
+
it "doesn't fall for complex documents IV" do
|
208
|
+
txt = <<-EOF
|
209
|
+
<TEI xmlns="http://www.tei-c.org/ns/1.0">
|
210
|
+
<text xml:lang="grc">
|
211
|
+
<body>
|
212
|
+
<div1 type="Book" n="1">
|
213
|
+
<l n="1">
|
214
|
+
<milestone ed="P" unit="para"/>
|
215
|
+
μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος
|
216
|
+
</l>
|
217
|
+
</div1>
|
218
|
+
<div1 type="Book" n="1">
|
219
|
+
<l n="2">οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,</l>
|
220
|
+
</div1>
|
221
|
+
<div1 type="Book" n="1">
|
222
|
+
<l n="3">πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν</l>
|
223
|
+
</div1>
|
224
|
+
<div1 type="Book" n="1">
|
225
|
+
<l n="4">ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν</l>
|
226
|
+
</div1>
|
227
|
+
<div1 type="Book" n="1">
|
228
|
+
<l n="5">οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,</l>
|
229
|
+
</div1>
|
230
|
+
<div1 type="Book" n="1">
|
231
|
+
<l n="6">ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε</l>
|
232
|
+
</div1>
|
233
|
+
<div1 type="Book" n="1">
|
234
|
+
<l n="7">Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.</l>
|
235
|
+
</div1>
|
236
|
+
<div1 type="Book" n="1">
|
237
|
+
<l n="8">
|
238
|
+
<milestone ed="P" unit="Para"/>
|
239
|
+
τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;
|
240
|
+
</l>
|
241
|
+
</div1>
|
242
|
+
<div1 type="Book" n="1">
|
243
|
+
<l n="9">Λητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς</l>
|
244
|
+
</div1>
|
245
|
+
<div1 type="Book" n="1">
|
246
|
+
<l n="10">νοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,</l>
|
247
|
+
</div1>
|
248
|
+
</body>
|
249
|
+
</text>
|
250
|
+
</TEI>
|
251
|
+
EOF
|
252
|
+
sentences = segmenter.segment(txt, xml: true)
|
253
|
+
sentences.should have(4).item
|
254
|
+
end
|
161
255
|
end
|
162
256
|
|
163
257
|
context "with xml escaped characters" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
157
|
version: '0'
|
158
158
|
requirements: []
|
159
159
|
rubyforge_project:
|
160
|
-
rubygems_version: 2.2.
|
160
|
+
rubygems_version: 2.2.2
|
161
161
|
signing_key:
|
162
162
|
specification_version: 4
|
163
163
|
summary: Segments text into sentences
|