llt-segmenter 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f6c90686915f9e02706f88b650695a4b40aa6867
4
- data.tar.gz: 67ed2ade8bb50c0419a82f18b40eeef031c7a243
3
+ metadata.gz: 3575aa6f3f997afa19250f04c5d01e881279db0a
4
+ data.tar.gz: e07b2452c0ed737ebddadc4b61c097b48cbcc6b5
5
5
  SHA512:
6
- metadata.gz: e38adead709637f4520233ead966d000c332b8b356f4e2d827ae40b3ce31d6e2a0e6969762b57393bcb02c9a9b647e1e617a5bd81ab20712663ac525de7a5062
7
- data.tar.gz: 50fedc9e52883f8458a437a6f6ff6252bbe6e1f6d78cdfb59d5b78873a27da3dfc40481a1008cf7278493196041cc42a5beaa40be2faef9071e5f36aad946769
6
+ metadata.gz: 36fdb7ee87f64279f678b1f96741a9bdfc19003c2c190d4a7982a3213455802c8b49b752adca3fcd0d732a2d50d17f96001944180bdd778fc651a69b0606ebe8
7
+ data.tar.gz: 65237979c9a8e3cec56c65ef3707e947347debdfc6c381bd427b4961c6fc72b195a90c745a5cf229d8ea2ae477c74ef362b84c5ed6e43cc8da19eabd9c9005e6
data/Gemfile CHANGED
@@ -2,7 +2,6 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in llt-segmenter.gemspec
4
4
  gemspec
5
- gem 'pry'
6
5
 
7
6
  gem 'coveralls', require: false
8
7
 
@@ -127,6 +127,7 @@ module LLT
127
127
 
128
128
  sentence = scan_until_next_sentence(scanner, sentences)
129
129
 
130
+
130
131
  raise if scanner.pos == loop_guard
131
132
 
132
133
  if @xml
@@ -194,8 +195,11 @@ module LLT
194
195
  def rescue_no_delimiters(sentences, scanner)
195
196
  if sentences.any?
196
197
  # broken off texts
197
- scanner.scan_until(/$/)
198
+ scanner.scan_until(/\Z/)
198
199
  else
200
+ if scanner.eos? && @xml
201
+ return ''
202
+ end
199
203
  # try a simple newline as delimiter, if there was no delimiter
200
204
  scanner.reset
201
205
  @sentence_closer = /\n/
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Segmenter
3
- VERSION = "0.0.5"
3
+ VERSION = "0.0.6"
4
4
  end
5
5
  end
@@ -158,6 +158,100 @@ describe LLT::Segmenter do
158
158
  sentences = segmenter.segment(txt, xml: true)
159
159
  sentences.should have(1).item
160
160
  end
161
+
162
+ it "doesn't fall for complex documents" do
163
+ txt = <<-EOF
164
+ <tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
165
+ <tei:text xml:lang="grc">
166
+ <tei:body>
167
+ <tei:div type="line">
168
+ <milestone ed="P" unit="para"/>μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος</tei:div>
169
+ </tei:body>
170
+ </tei:text>
171
+ </tei:TEI>
172
+ EOF
173
+ sentences = segmenter.segment(txt, xml: true)
174
+ sentences.should have(1).item
175
+ end
176
+
177
+ it "doesn't fall for complex documents II" do
178
+ txt = <<-EOF
179
+ <tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
180
+ <tei:text xml:lang="grc">
181
+ <tei:body>
182
+ <tei:div type="line">
183
+ <milestone ed="P" unit="para"/>Arma virum. Test.</tei:div>
184
+ </tei:body>
185
+ </tei:text>
186
+ </tei:TEI>
187
+ EOF
188
+ sentences = segmenter.segment(txt, xml: true)
189
+ sentences.should have(2).item
190
+ end
191
+
192
+ it "doesn't fall for complex documents III" do
193
+ txt = <<-EOF
194
+ <tei:TEI xmlns:tei="http://www.tei-c.org/ns/1.0">
195
+ <tei:text xml:lang="grc">
196
+ <tei:body>
197
+ <tei:div type="line">
198
+ <milestone ed="P" unit="para"/>Arma virum. Test</tei:div>
199
+ </tei:body>
200
+ </tei:text>
201
+ </tei:TEI>
202
+ EOF
203
+ sentences = segmenter.segment(txt, xml: true)
204
+ sentences.should have(2).item
205
+ end
206
+
207
+ it "doesn't fall for complex documents IV" do
208
+ txt = <<-EOF
209
+ <TEI xmlns="http://www.tei-c.org/ns/1.0">
210
+ <text xml:lang="grc">
211
+ <body>
212
+ <div1 type="Book" n="1">
213
+ <l n="1">
214
+ <milestone ed="P" unit="para"/>
215
+ μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος
216
+ </l>
217
+ </div1>
218
+ <div1 type="Book" n="1">
219
+ <l n="2">οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,</l>
220
+ </div1>
221
+ <div1 type="Book" n="1">
222
+ <l n="3">πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν</l>
223
+ </div1>
224
+ <div1 type="Book" n="1">
225
+ <l n="4">ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν</l>
226
+ </div1>
227
+ <div1 type="Book" n="1">
228
+ <l n="5">οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,</l>
229
+ </div1>
230
+ <div1 type="Book" n="1">
231
+ <l n="6">ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε</l>
232
+ </div1>
233
+ <div1 type="Book" n="1">
234
+ <l n="7">Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.</l>
235
+ </div1>
236
+ <div1 type="Book" n="1">
237
+ <l n="8">
238
+ <milestone ed="P" unit="Para"/>
239
+ τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;
240
+ </l>
241
+ </div1>
242
+ <div1 type="Book" n="1">
243
+ <l n="9">Λητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς</l>
244
+ </div1>
245
+ <div1 type="Book" n="1">
246
+ <l n="10">νοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,</l>
247
+ </div1>
248
+ </body>
249
+ </text>
250
+ </TEI>
251
+ EOF
252
+ sentences = segmenter.segment(txt, xml: true)
253
+ sentences.should have(4).item
254
+ end
161
255
  end
162
256
 
163
257
  context "with xml escaped characters" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-11 00:00:00.000000000 Z
11
+ date: 2014-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
157
  version: '0'
158
158
  requirements: []
159
159
  rubyforge_project:
160
- rubygems_version: 2.2.0
160
+ rubygems_version: 2.2.2
161
161
  signing_key:
162
162
  specification_version: 4
163
163
  summary: Segments text into sentences