llt-tokenizer 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/llt/tokenizer.rb +17 -15
- data/lib/llt/tokenizer/version.rb +1 -1
- data/spec/lib/llt/tokenizer_spec.rb +43 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6ea54a34c5f463b52600e854cd9e9084a5599af7
|
4
|
+
data.tar.gz: 13cfe38eb69014a5d855e4e0d44a9d54991bc855
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7fcdbcfc60c63cbec28a0aa37044cddb22280ec4f4862c39bf3abc658619f273185589714c5d4ac27eb4c7a881c350047a499dba68c43e932e0cd18b02b8670d
|
7
|
+
data.tar.gz: 1c5bf2c479b0ff8e49358a1d53e5bb597ab5157d6981d65541eec6189e95d8fb57467bc62fd406d86ed1b4b473139238438cc589a4b2b150ec51798f2a837942
|
data/lib/llt/tokenizer.rb
CHANGED
@@ -152,7 +152,7 @@ module LLT
|
|
152
152
|
|
153
153
|
WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
|
154
154
|
WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especiialy for adverbs
|
155
|
-
WORDS_ENDING_WITH_VE = /^(neve|sive)$/i
|
155
|
+
WORDS_ENDING_WITH_VE = /^()$/i # formerly had neve and sive, which we split now
|
156
156
|
|
157
157
|
# laetusque to -que laetus
|
158
158
|
# in eoque to -que in eo
|
@@ -169,7 +169,7 @@ module LLT
|
|
169
169
|
ENCLITICS = %w{ que ne ve c }
|
170
170
|
def split_enklitika_and_change_their_position
|
171
171
|
split_with_force
|
172
|
-
|
172
|
+
split_frequent_enclitics # like latin c, ve or greek te, de
|
173
173
|
make_frequent_corrections
|
174
174
|
end
|
175
175
|
|
@@ -202,22 +202,24 @@ module LLT
|
|
202
202
|
"#{@enclitics_marker}#{val}"
|
203
203
|
end
|
204
204
|
|
205
|
-
|
206
|
-
|
207
|
-
|
205
|
+
ENCLITICS_MAP = {
|
206
|
+
/^(nec)$/i => 'c',
|
207
|
+
/^(ne|se)u$/i => 'u',
|
208
|
+
/^(nisi)$/i => 'si',
|
209
|
+
/^(οὐ|μή|εἰ)τε$/i => 'τε',
|
210
|
+
/^(οὐ|μή)δε$/i => 'δε',
|
211
|
+
}
|
212
|
+
def split_frequent_enclitics
|
213
|
+
container = []
|
208
214
|
@worker.each_with_index do |token, i|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
token.slice!(-2, 2)
|
215
|
-
oute_indices << (i + oute_indices.size + @shift_range)
|
215
|
+
ENCLITICS_MAP.each do |regex, encl|
|
216
|
+
if token.match(regex)
|
217
|
+
token.slice!(-encl.length, encl.length)
|
218
|
+
container << [encl, (i + container.size + @shift_range)]
|
219
|
+
end
|
216
220
|
end
|
217
221
|
end
|
218
|
-
|
219
|
-
nec_indices.each { |i| @worker.insert(i, enclitic('c')) }
|
220
|
-
oute_indices.each { |i| @worker.insert(i, enclitic('τε')) }
|
222
|
+
container.each { |encl, i|@worker.insert(i, enclitic(encl)) }
|
221
223
|
end
|
222
224
|
|
223
225
|
def make_frequent_corrections
|
@@ -242,14 +242,14 @@ describe LLT::Tokenizer do
|
|
242
242
|
|
243
243
|
context "when confronted with -ve" do
|
244
244
|
examples = {
|
245
|
-
'sive' => 'sive',
|
245
|
+
'sive' => '-ve si',
|
246
|
+
'neve' => '-ve ne',
|
246
247
|
'pluresve' => '-ve plures',
|
247
248
|
'aestive' => 'aestive',
|
248
249
|
'serve' => 'serve',
|
249
250
|
'suave' => 'suave',
|
250
251
|
'vive' => 'vive',
|
251
252
|
'move' => 'move',
|
252
|
-
'neve' => 'neve',
|
253
253
|
'cive' => 'cive',
|
254
254
|
'Iove' => 'Iove',
|
255
255
|
}
|
@@ -261,9 +261,49 @@ describe LLT::Tokenizer do
|
|
261
261
|
end
|
262
262
|
end
|
263
263
|
|
264
|
+
context "when confronted with -u" do
|
265
|
+
examples = {
|
266
|
+
'seu' => '-u se',
|
267
|
+
'neu' => '-u ne'
|
268
|
+
}
|
269
|
+
|
270
|
+
examples.each do |example, expected|
|
271
|
+
it "transforms #{example} to #{expected}" do
|
272
|
+
enklitika_test(example).should be_transformed_to expected
|
273
|
+
end
|
274
|
+
end
|
275
|
+
end
|
276
|
+
|
277
|
+
context "when confronted with -si" do
|
278
|
+
examples = {
|
279
|
+
'nisi' => '-si ni'
|
280
|
+
}
|
281
|
+
|
282
|
+
examples.each do |example, expected|
|
283
|
+
it "transforms #{example} to #{expected}" do
|
284
|
+
enklitika_test(example).should be_transformed_to expected
|
285
|
+
end
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
264
289
|
context "when confronted with -τε" do
|
265
290
|
examples = {
|
266
|
-
'οὐτε' => '-τε οὐ'
|
291
|
+
'οὐτε' => '-τε οὐ',
|
292
|
+
'μήτε' => '-τε μή',
|
293
|
+
'εἰτε' => '-τε εἰ'
|
294
|
+
}
|
295
|
+
|
296
|
+
examples.each do |example, expected|
|
297
|
+
it "transforms #{example} to #{expected}" do
|
298
|
+
enklitika_test(example).should be_transformed_to expected
|
299
|
+
end
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
context "when confronted with -δε" do
|
304
|
+
examples = {
|
305
|
+
'οὐδε' => '-δε οὐ',
|
306
|
+
'μήδε' => '-δε μή'
|
267
307
|
}
|
268
308
|
|
269
309
|
examples.each do |example, expected|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|