llt-tokenizer 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd490c0611dc9cd5ed2f2aad95631b500e7e7d35
4
- data.tar.gz: fe8162ded4cecfda383f3919576ec19ca14f5a38
3
+ metadata.gz: 6ea54a34c5f463b52600e854cd9e9084a5599af7
4
+ data.tar.gz: 13cfe38eb69014a5d855e4e0d44a9d54991bc855
5
5
  SHA512:
6
- metadata.gz: a8a90a133ccd0c27fbbd20e64df0a8e560ad183010ae55716362e0909c737a6d5359c51a647da387ddade71a3bd50545381c899cd3fcea095001ef0e27c42483
7
- data.tar.gz: 2d81412a66b473206d23aec3128d3321ff736f3a9d5ab8fca0349bcc347e8d692b3ee33d0fd2c67f87f031335113c1a54caef80b5aa0a4eaca4c5bd2320d6778
6
+ metadata.gz: 7fcdbcfc60c63cbec28a0aa37044cddb22280ec4f4862c39bf3abc658619f273185589714c5d4ac27eb4c7a881c350047a499dba68c43e932e0cd18b02b8670d
7
+ data.tar.gz: 1c5bf2c479b0ff8e49358a1d53e5bb597ab5157d6981d65541eec6189e95d8fb57467bc62fd406d86ed1b4b473139238438cc589a4b2b150ec51798f2a837942
@@ -152,7 +152,7 @@ module LLT
152
152
 
153
153
  WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
154
154
  WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especially for adverbs
155
- WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
155
+ WORDS_ENDING_WITH_VE = /^()$/i # formerly had neve and sive, which we split now
156
156
 
157
157
  # laetusque to -que laetus
158
158
  # in eoque to -que in eo
@@ -169,7 +169,7 @@ module LLT
169
169
  ENCLITICS = %w{ que ne ve c }
170
170
  def split_enklitika_and_change_their_position
171
171
  split_with_force
172
- split_nec_and_oute
172
+ split_frequent_enclitics # like latin c, ve or greek te, de
173
173
  make_frequent_corrections
174
174
  end
175
175
 
@@ -202,22 +202,24 @@ module LLT
202
202
  "#{@enclitics_marker}#{val}"
203
203
  end
204
204
 
205
- def split_nec_and_oute
206
- nec_indices = []
207
- oute_indices = []
205
+ ENCLITICS_MAP = {
206
+ /^(nec)$/i => 'c',
207
+ /^(ne|se)u$/i => 'u',
208
+ /^(nisi)$/i => 'si',
209
+ /^(οὐ|μή|εἰ)τε$/i => 'τε',
210
+ /^(οὐ|μή)δε$/i => 'δε',
211
+ }
212
+ def split_frequent_enclitics
213
+ container = []
208
214
  @worker.each_with_index do |token, i|
209
- case token
210
- when /^nec$/i
211
- token.slice!(-1)
212
- nec_indices << (i + nec_indices.size + @shift_range)
213
- when /^οὐτε$/i
214
- token.slice!(-2, 2)
215
- oute_indices << (i + oute_indices.size + @shift_range)
215
+ ENCLITICS_MAP.each do |regex, encl|
216
+ if token.match(regex)
217
+ token.slice!(-encl.length, encl.length)
218
+ container << [encl, (i + container.size + @shift_range)]
219
+ end
216
220
  end
217
221
  end
218
-
219
- nec_indices.each { |i| @worker.insert(i, enclitic('c')) }
220
- oute_indices.each { |i| @worker.insert(i, enclitic('τε')) }
222
+ container.each { |encl, i|@worker.insert(i, enclitic(encl)) }
221
223
  end
222
224
 
223
225
  def make_frequent_corrections
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.7"
3
+ VERSION = "0.0.8"
4
4
  end
5
5
  end
@@ -242,14 +242,14 @@ describe LLT::Tokenizer do
242
242
 
243
243
  context "when confronted with -ve" do
244
244
  examples = {
245
- 'sive' => 'sive',
245
+ 'sive' => '-ve si',
246
+ 'neve' => '-ve ne',
246
247
  'pluresve' => '-ve plures',
247
248
  'aestive' => 'aestive',
248
249
  'serve' => 'serve',
249
250
  'suave' => 'suave',
250
251
  'vive' => 'vive',
251
252
  'move' => 'move',
252
- 'neve' => 'neve',
253
253
  'cive' => 'cive',
254
254
  'Iove' => 'Iove',
255
255
  }
@@ -261,9 +261,49 @@ describe LLT::Tokenizer do
261
261
  end
262
262
  end
263
263
 
264
+ context "when confronted with -u" do
265
+ examples = {
266
+ 'seu' => '-u se',
267
+ 'neu' => '-u ne'
268
+ }
269
+
270
+ examples.each do |example, expected|
271
+ it "transforms #{example} to #{expected}" do
272
+ enklitika_test(example).should be_transformed_to expected
273
+ end
274
+ end
275
+ end
276
+
277
+ context "when confronted with -si" do
278
+ examples = {
279
+ 'nisi' => '-si ni'
280
+ }
281
+
282
+ examples.each do |example, expected|
283
+ it "transforms #{example} to #{expected}" do
284
+ enklitika_test(example).should be_transformed_to expected
285
+ end
286
+ end
287
+ end
288
+
264
289
  context "when confronted with -τε" do
265
290
  examples = {
266
- 'οὐτε' => '-τε οὐ'
291
+ 'οὐτε' => '-τε οὐ',
292
+ 'μήτε' => '-τε μή',
293
+ 'εἰτε' => '-τε εἰ'
294
+ }
295
+
296
+ examples.each do |example, expected|
297
+ it "transforms #{example} to #{expected}" do
298
+ enklitika_test(example).should be_transformed_to expected
299
+ end
300
+ end
301
+ end
302
+
303
+ context "when confronted with -δε" do
304
+ examples = {
305
+ 'οὐδε' => '-δε οὐ',
306
+ 'μήδε' => '-δε μή'
267
307
  }
268
308
 
269
309
  examples.each do |example, expected|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-17 00:00:00.000000000 Z
11
+ date: 2014-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler