llt-segmenter 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f6c90686915f9e02706f88b650695a4b40aa6867
4
- data.tar.gz: 67ed2ade8bb50c0419a82f18b40eeef031c7a243
3
+ metadata.gz: 3575aa6f3f997afa19250f04c5d01e881279db0a
4
+ data.tar.gz: e07b2452c0ed737ebddadc4b61c097b48cbcc6b5
5
5
  SHA512:
6
- metadata.gz: e38adead709637f4520233ead966d000c332b8b356f4e2d827ae40b3ce31d6e2a0e6969762b57393bcb02c9a9b647e1e617a5bd81ab20712663ac525de7a5062
7
- data.tar.gz: 50fedc9e52883f8458a437a6f6ff6252bbe6e1f6d78cdfb59d5b78873a27da3dfc40481a1008cf7278493196041cc42a5beaa40be2faef9071e5f36aad946769
6
+ metadata.gz: 36fdb7ee87f64279f678b1f96741a9bdfc19003c2c190d4a7982a3213455802c8b49b752adca3fcd0d732a2d50d17f96001944180bdd778fc651a69b0606ebe8
7
+ data.tar.gz: 65237979c9a8e3cec56c65ef3707e947347debdfc6c381bd427b4961c6fc72b195a90c745a5cf229d8ea2ae477c74ef362b84c5ed6e43cc8da19eabd9c9005e6
data/Gemfile CHANGED
@@ -2,7 +2,6 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in llt-segmenter.gemspec
4
4
  gemspec
5
- gem 'pry'
6
5
 
7
6
  gem 'coveralls', require: false
8
7
 
@@ -127,6 +127,7 @@ module LLT
127
127
 
128
128
  sentence = scan_until_next_sentence(scanner, sentences)
129
129
 
130
+
130
131
  raise if scanner.pos == loop_guard
131
132
 
132
133
  if @xml
@@ -194,8 +195,11 @@ module LLT
194
195
  def rescue_no_delimiters(sentences, scanner)
195
196
  if sentences.any?
196
197
  # broken off texts
197
- scanner.scan_until(/$/)
198
+ scanner.scan_until(/\Z/)
198
199
  else
200
+ if scanner.eos? && @xml
201
+ return ''
202
+ end
199
203
  # try a simple newline as delimiter, if there was no delimiter
200
204
  scanner.reset
201
205
  @sentence_closer = /\n/
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Segmenter
3
- VERSION = "0.0.5"
3
+ VERSION = "0.0.6"
4
4
  end
5
5
  end
@@ -158,6 +158,100 @@ describe LLT::Segmenter do
158
158
  sentences = segmenter.segment(txt, xml: true)
159
159
  sentences.should have(1).item
160
160
  end
161
+
162
+ it "doesn't fall for complex documents" do
163
+ txt = <<-EOF
164
+ <tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
165
+ <tei:text xml:lang="grc">
166
+ <tei:body>
167
+ <tei:div type="line">
168
+ <milestone ed="P" unit="para"/>μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος</tei:div>
169
+ </tei:body>
170
+ </tei:text>
171
+ </tei:TEI>
172
+ EOF
173
+ sentences = segmenter.segment(txt, xml: true)
174
+ sentences.should have(1).item
175
+ end
176
+
177
+ it "doesn't fall for complex documents II" do
178
+ txt = <<-EOF
179
+ <tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
180
+ <tei:text xml:lang="grc">
181
+ <tei:body>
182
+ <tei:div type="line">
183
+ <milestone ed="P" unit="para"/>Arma virum. Test.</tei:div>
184
+ </tei:body>
185
+ </tei:text>
186
+ </tei:TEI>
187
+ EOF
188
+ sentences = segmenter.segment(txt, xml: true)
189
+ sentences.should have(2).item
190
+ end
191
+
192
+ it "doesn't fall for complex documents III" do
193
+ txt = <<-EOF
194
+ <tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
195
+ <tei:text xml:lang="grc">
196
+ <tei:body>
197
+ <tei:div type="line">
198
+ <milestone ed="P" unit="para"/>Arma virum. Test</tei:div>
199
+ </tei:body>
200
+ </tei:text>
201
+ </tei:TEI>
202
+ EOF
203
+ sentences = segmenter.segment(txt, xml: true)
204
+ sentences.should have(2).item
205
+ end
206
+
207
+ it "doesn't fall for complex documents IV" do
208
+ txt = <<-EOF
209
+ <TEI xmlns="http://www.tei-c.org/ns/1.0">
210
+ <text xml:lang="grc">
211
+ <body>
212
+ <div1 type="Book" n="1">
213
+ <l n="1">
214
+ <milestone ed="P" unit="para"/>
215
+ μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος
216
+ </l>
217
+ </div1>
218
+ <div1 type="Book" n="1">
219
+ <l n="2">οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,</l>
220
+ </div1>
221
+ <div1 type="Book" n="1">
222
+ <l n="3">πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν</l>
223
+ </div1>
224
+ <div1 type="Book" n="1">
225
+ <l n="4">ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν</l>
226
+ </div1>
227
+ <div1 type="Book" n="1">
228
+ <l n="5">οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,</l>
229
+ </div1>
230
+ <div1 type="Book" n="1">
231
+ <l n="6">ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε</l>
232
+ </div1>
233
+ <div1 type="Book" n="1">
234
+ <l n="7">Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.</l>
235
+ </div1>
236
+ <div1 type="Book" n="1">
237
+ <l n="8">
238
+ <milestone ed="P" unit="Para"/>
239
+ τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;
240
+ </l>
241
+ </div1>
242
+ <div1 type="Book" n="1">
243
+ <l n="9">Λητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς</l>
244
+ </div1>
245
+ <div1 type="Book" n="1">
246
+ <l n="10">νοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,</l>
247
+ </div1>
248
+ </body>
249
+ </text>
250
+ </TEI>
251
+ EOF
252
+ sentences = segmenter.segment(txt, xml: true)
253
+ sentences.should have(4).item
254
+ end
161
255
  end
162
256
 
163
257
  context "with xml escaped characters" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-11 00:00:00.000000000 Z
11
+ date: 2014-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
157
  version: '0'
158
158
  requirements: []
159
159
  rubyforge_project:
160
- rubygems_version: 2.2.0
160
+ rubygems_version: 2.2.2
161
161
  signing_key:
162
162
  specification_version: 4
163
163
  summary: Segments text into sentences