llt-segmenter 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/lib/llt/segmenter.rb +5 -1
- data/lib/llt/segmenter/version.rb +1 -1
- data/spec/lib/llt/segmenter_spec.rb +94 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3575aa6f3f997afa19250f04c5d01e881279db0a
|
4
|
+
data.tar.gz: e07b2452c0ed737ebddadc4b61c097b48cbcc6b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 36fdb7ee87f64279f678b1f96741a9bdfc19003c2c190d4a7982a3213455802c8b49b752adca3fcd0d732a2d50d17f96001944180bdd778fc651a69b0606ebe8
|
7
|
+
data.tar.gz: 65237979c9a8e3cec56c65ef3707e947347debdfc6c381bd427b4961c6fc72b195a90c745a5cf229d8ea2ae477c74ef362b84c5ed6e43cc8da19eabd9c9005e6
|
data/Gemfile
CHANGED
data/lib/llt/segmenter.rb
CHANGED
@@ -127,6 +127,7 @@ module LLT
|
|
127
127
|
|
128
128
|
sentence = scan_until_next_sentence(scanner, sentences)
|
129
129
|
|
130
|
+
|
130
131
|
raise if scanner.pos == loop_guard
|
131
132
|
|
132
133
|
if @xml
|
@@ -194,8 +195,11 @@ module LLT
|
|
194
195
|
def rescue_no_delimiters(sentences, scanner)
|
195
196
|
if sentences.any?
|
196
197
|
# broken off texts
|
197
|
-
scanner.scan_until(
|
198
|
+
scanner.scan_until(/\Z/)
|
198
199
|
else
|
200
|
+
if scanner.eos? && @xml
|
201
|
+
return ''
|
202
|
+
end
|
199
203
|
# try a simple newline as delimiter, if there was no delimiter
|
200
204
|
scanner.reset
|
201
205
|
@sentence_closer = /\n/
|
data/spec/lib/llt/segmenter_spec.rb
CHANGED
@@ -158,6 +158,100 @@ describe LLT::Segmenter do
|
|
158
158
|
sentences = segmenter.segment(txt, xml: true)
|
159
159
|
sentences.should have(1).item
|
160
160
|
end
|
161
|
+
|
162
|
+
it "doesn't fall for complex documents" do
|
163
|
+
txt = <<-EOF
|
164
|
+
<tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
|
165
|
+
<tei:text xml:lang="grc">
|
166
|
+
<tei:body>
|
167
|
+
<tei:div type="line">
|
168
|
+
<milestone ed="P" unit="para"/>μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος</tei:div>
|
169
|
+
</tei:body>
|
170
|
+
</tei:text>
|
171
|
+
</tei:TEI>
|
172
|
+
EOF
|
173
|
+
sentences = segmenter.segment(txt, xml: true)
|
174
|
+
sentences.should have(1).item
|
175
|
+
end
|
176
|
+
|
177
|
+
it "doesn't fall for complex documents II" do
|
178
|
+
txt = <<-EOF
|
179
|
+
<tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
|
180
|
+
<tei:text xml:lang="grc">
|
181
|
+
<tei:body>
|
182
|
+
<tei:div type="line">
|
183
|
+
<milestone ed="P" unit="para"/>Arma virum. Test.</tei:div>
|
184
|
+
</tei:body>
|
185
|
+
</tei:text>
|
186
|
+
</tei:TEI>
|
187
|
+
EOF
|
188
|
+
sentences = segmenter.segment(txt, xml: true)
|
189
|
+
sentences.should have(2).item
|
190
|
+
end
|
191
|
+
|
192
|
+
it "doesn't fall for complex documents III" do
|
193
|
+
txt = <<-EOF
|
194
|
+
<tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
|
195
|
+
<tei:text xml:lang="grc">
|
196
|
+
<tei:body>
|
197
|
+
<tei:div type="line">
|
198
|
+
<milestone ed="P" unit="para"/>Arma virum. Test</tei:div>
|
199
|
+
</tei:body>
|
200
|
+
</tei:text>
|
201
|
+
</tei:TEI>
|
202
|
+
EOF
|
203
|
+
sentences = segmenter.segment(txt, xml: true)
|
204
|
+
sentences.should have(2).item
|
205
|
+
end
|
206
|
+
|
207
|
+
it "doesn't fall for complex documents IV" do
|
208
|
+
txt = <<-EOF
|
209
|
+
<TEI xmlns="http://www.tei-c.org/ns/1.0">
|
210
|
+
<text xml:lang="grc">
|
211
|
+
<body>
|
212
|
+
<div1 type="Book" n="1">
|
213
|
+
<l n="1">
|
214
|
+
<milestone ed="P" unit="para"/>
|
215
|
+
μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος
|
216
|
+
</l>
|
217
|
+
</div1>
|
218
|
+
<div1 type="Book" n="1">
|
219
|
+
<l n="2">οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,</l>
|
220
|
+
</div1>
|
221
|
+
<div1 type="Book" n="1">
|
222
|
+
<l n="3">πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν</l>
|
223
|
+
</div1>
|
224
|
+
<div1 type="Book" n="1">
|
225
|
+
<l n="4">ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν</l>
|
226
|
+
</div1>
|
227
|
+
<div1 type="Book" n="1">
|
228
|
+
<l n="5">οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,</l>
|
229
|
+
</div1>
|
230
|
+
<div1 type="Book" n="1">
|
231
|
+
<l n="6">ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε</l>
|
232
|
+
</div1>
|
233
|
+
<div1 type="Book" n="1">
|
234
|
+
<l n="7">Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.</l>
|
235
|
+
</div1>
|
236
|
+
<div1 type="Book" n="1">
|
237
|
+
<l n="8">
|
238
|
+
<milestone ed="P" unit="Para"/>
|
239
|
+
τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;
|
240
|
+
</l>
|
241
|
+
</div1>
|
242
|
+
<div1 type="Book" n="1">
|
243
|
+
<l n="9">Λητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς</l>
|
244
|
+
</div1>
|
245
|
+
<div1 type="Book" n="1">
|
246
|
+
<l n="10">νοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,</l>
|
247
|
+
</div1>
|
248
|
+
</body>
|
249
|
+
</text>
|
250
|
+
</TEI>
|
251
|
+
EOF
|
252
|
+
sentences = segmenter.segment(txt, xml: true)
|
253
|
+
sentences.should have(4).item
|
254
|
+
end
|
161
255
|
end
|
162
256
|
|
163
257
|
context "with xml escaped characters" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
157
|
version: '0'
|
158
158
|
requirements: []
|
159
159
|
rubyforge_project:
|
160
|
-
rubygems_version: 2.2.
|
160
|
+
rubygems_version: 2.2.2
|
161
161
|
signing_key:
|
162
162
|
specification_version: 4
|
163
163
|
summary: Segments text into sentences
|