llt-tokenizer 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd490c0611dc9cd5ed2f2aad95631b500e7e7d35
4
- data.tar.gz: fe8162ded4cecfda383f3919576ec19ca14f5a38
3
+ metadata.gz: 6ea54a34c5f463b52600e854cd9e9084a5599af7
4
+ data.tar.gz: 13cfe38eb69014a5d855e4e0d44a9d54991bc855
5
5
  SHA512:
6
- metadata.gz: a8a90a133ccd0c27fbbd20e64df0a8e560ad183010ae55716362e0909c737a6d5359c51a647da387ddade71a3bd50545381c899cd3fcea095001ef0e27c42483
7
- data.tar.gz: 2d81412a66b473206d23aec3128d3321ff736f3a9d5ab8fca0349bcc347e8d692b3ee33d0fd2c67f87f031335113c1a54caef80b5aa0a4eaca4c5bd2320d6778
6
+ metadata.gz: 7fcdbcfc60c63cbec28a0aa37044cddb22280ec4f4862c39bf3abc658619f273185589714c5d4ac27eb4c7a881c350047a499dba68c43e932e0cd18b02b8670d
7
+ data.tar.gz: 1c5bf2c479b0ff8e49358a1d53e5bb597ab5157d6981d65541eec6189e95d8fb57467bc62fd406d86ed1b4b473139238438cc589a4b2b150ec51798f2a837942
@@ -152,7 +152,7 @@ module LLT
152
152
 
153
153
  WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
154
154
  WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especially for adverbs
155
- WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
155
+ WORDS_ENDING_WITH_VE = /^()$/i # formerly had neve and sive, which we split now
156
156
 
157
157
  # laetusque to -que laetus
158
158
  # in eoque to -que in eo
@@ -169,7 +169,7 @@ module LLT
169
169
  ENCLITICS = %w{ que ne ve c }
170
170
  def split_enklitika_and_change_their_position
171
171
  split_with_force
172
- split_nec_and_oute
172
+ split_frequent_enclitics # like Latin c, ve or Greek te, de
173
173
  make_frequent_corrections
174
174
  end
175
175
 
@@ -202,22 +202,24 @@ module LLT
202
202
  "#{@enclitics_marker}#{val}"
203
203
  end
204
204
 
205
- def split_nec_and_oute
206
- nec_indices = []
207
- oute_indices = []
205
+ ENCLITICS_MAP = {
206
+ /^(nec)$/i => 'c',
207
+ /^(ne|se)u$/i => 'u',
208
+ /^(nisi)$/i => 'si',
209
+ /^(οὐ|μή|εἰ)τε$/i => 'τε',
210
+ /^(οὐ|μή)δε$/i => 'δε',
211
+ }
212
+ def split_frequent_enclitics
213
+ container = []
208
214
  @worker.each_with_index do |token, i|
209
- case token
210
- when /^nec$/i
211
- token.slice!(-1)
212
- nec_indices << (i + nec_indices.size + @shift_range)
213
- when /^οὐτε$/i
214
- token.slice!(-2, 2)
215
- oute_indices << (i + oute_indices.size + @shift_range)
215
+ ENCLITICS_MAP.each do |regex, encl|
216
+ if token.match(regex)
217
+ token.slice!(-encl.length, encl.length)
218
+ container << [encl, (i + container.size + @shift_range)]
219
+ end
216
220
  end
217
221
  end
218
-
219
- nec_indices.each { |i| @worker.insert(i, enclitic('c')) }
220
- oute_indices.each { |i| @worker.insert(i, enclitic('τε')) }
222
+ container.each { |encl, i| @worker.insert(i, enclitic(encl)) }
221
223
  end
222
224
 
223
225
  def make_frequent_corrections
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.7"
3
+ VERSION = "0.0.8"
4
4
  end
5
5
  end
@@ -242,14 +242,14 @@ describe LLT::Tokenizer do
242
242
 
243
243
  context "when confronted with -ve" do
244
244
  examples = {
245
- 'sive' => 'sive',
245
+ 'sive' => '-ve si',
246
+ 'neve' => '-ve ne',
246
247
  'pluresve' => '-ve plures',
247
248
  'aestive' => 'aestive',
248
249
  'serve' => 'serve',
249
250
  'suave' => 'suave',
250
251
  'vive' => 'vive',
251
252
  'move' => 'move',
252
- 'neve' => 'neve',
253
253
  'cive' => 'cive',
254
254
  'Iove' => 'Iove',
255
255
  }
@@ -261,9 +261,49 @@ describe LLT::Tokenizer do
261
261
  end
262
262
  end
263
263
 
264
+ context "when confronted with -u" do
265
+ examples = {
266
+ 'seu' => '-u se',
267
+ 'neu' => '-u ne'
268
+ }
269
+
270
+ examples.each do |example, expected|
271
+ it "transforms #{example} to #{expected}" do
272
+ enklitika_test(example).should be_transformed_to expected
273
+ end
274
+ end
275
+ end
276
+
277
+ context "when confronted with -si" do
278
+ examples = {
279
+ 'nisi' => '-si ni'
280
+ }
281
+
282
+ examples.each do |example, expected|
283
+ it "transforms #{example} to #{expected}" do
284
+ enklitika_test(example).should be_transformed_to expected
285
+ end
286
+ end
287
+ end
288
+
264
289
  context "when confronted with -τε" do
265
290
  examples = {
266
- 'οὐτε' => '-τε οὐ'
291
+ 'οὐτε' => '-τε οὐ',
292
+ 'μήτε' => '-τε μή',
293
+ 'εἰτε' => '-τε εἰ'
294
+ }
295
+
296
+ examples.each do |example, expected|
297
+ it "transforms #{example} to #{expected}" do
298
+ enklitika_test(example).should be_transformed_to expected
299
+ end
300
+ end
301
+ end
302
+
303
+ context "when confronted with -δε" do
304
+ examples = {
305
+ 'οὐδε' => '-δε οὐ',
306
+ 'μήδε' => '-δε μή'
267
307
  }
268
308
 
269
309
  examples.each do |example, expected|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-17 00:00:00.000000000 Z
11
+ date: 2014-08-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler