llt-tokenizer 0.0.7 → 0.0.8
This diff shows the changes between the publicly released versions of the package as they appear in its public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/llt/tokenizer.rb +17 -15
- data/lib/llt/tokenizer/version.rb +1 -1
- data/spec/lib/llt/tokenizer_spec.rb +43 -3
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6ea54a34c5f463b52600e854cd9e9084a5599af7
+  data.tar.gz: 13cfe38eb69014a5d855e4e0d44a9d54991bc855
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7fcdbcfc60c63cbec28a0aa37044cddb22280ec4f4862c39bf3abc658619f273185589714c5d4ac27eb4c7a881c350047a499dba68c43e932e0cd18b02b8670d
+  data.tar.gz: 1c5bf2c479b0ff8e49358a1d53e5bb597ab5157d6981d65541eec6189e95d8fb57467bc62fd406d86ed1b4b473139238438cc589a4b2b150ec51798f2a837942
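The two digests cover the metadata.gz and data.tar.gz members inside the .gem archive, which is a plain tar file. A minimal sketch of how one could recompute them against a downloaded copy; the archive name and the assumption that both members sit in the current directory after untarring are illustrative:

```ruby
# Sketch: recompute the digests recorded in checksums.yaml.
# Assumes `tar -xf llt-tokenizer-0.0.8.gem` has been run beforehand,
# leaving metadata.gz and data.tar.gz next to this script.
require 'digest'

%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member}:"
  puts "  SHA1:   #{Digest::SHA1.file(member).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```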
data/lib/llt/tokenizer.rb CHANGED

@@ -152,7 +152,7 @@ module LLT
 
     WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
     WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especiialy for adverbs
-    WORDS_ENDING_WITH_VE = /^(
+    WORDS_ENDING_WITH_VE = /^()$/i # formerly had neve and sive, which we split now
 
     # laetusque to -que laetus
     # in eoque to -que in eo

@@ -169,7 +169,7 @@ module LLT
     ENCLITICS = %w{ que ne ve c }
     def split_enklitika_and_change_their_position
       split_with_force
-
+      split_frequent_enclitics # like latin c, ve or greek te, de
       make_frequent_corrections
     end
 

@@ -202,22 +202,24 @@ module LLT
       "#{@enclitics_marker}#{val}"
     end
 
-
-
-
+    ENCLITICS_MAP = {
+      /^(nec)$/i => 'c',
+      /^(ne|se)u$/i => 'u',
+      /^(nisi)$/i => 'si',
+      /^(οὐ|μή|εἰ)τε$/i => 'τε',
+      /^(οὐ|μή)δε$/i => 'δε',
+    }
+    def split_frequent_enclitics
+      container = []
       @worker.each_with_index do |token, i|
-
-
-
-
-
-        token.slice!(-2, 2)
-        oute_indices << (i + oute_indices.size + @shift_range)
+        ENCLITICS_MAP.each do |regex, encl|
+          if token.match(regex)
+            token.slice!(-encl.length, encl.length)
+            container << [encl, (i + container.size + @shift_range)]
+          end
         end
       end
-
-      nec_indices.each { |i| @worker.insert(i, enclitic('c')) }
-      oute_indices.each { |i| @worker.insert(i, enclitic('τε')) }
+      container.each { |encl, i|@worker.insert(i, enclitic(encl)) }
     end
 
     def make_frequent_corrections
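The core of this release is the new ENCLITICS_MAP together with split_frequent_enclitics, which folds the previous nec and oute special cases into one data-driven pass: each token is matched against every pattern, a matching enclitic is sliced off the end of the token, and its target index is recorded so the detached particle can later be inserted in front of its host word; the container.size term accounts for the fact that every earlier insertion shifts the following positions by one. A stripped-down, self-contained sketch of that behaviour, with the gem's @shift_range bookkeeping and @enclitics_marker formatting reduced to a plain '-' prefix:

```ruby
# Minimal standalone sketch of the new map-driven splitting; not the gem's code.
ENCLITICS_MAP = {
  /^(nec)$/i        => 'c',
  /^(ne|se)u$/i     => 'u',
  /^(nisi)$/i       => 'si',
  /^(οὐ|μή|εἰ)τε$/i => 'τε',
  /^(οὐ|μή)δε$/i    => 'δε',
}

def split_frequent_enclitics(tokens)
  container = []
  tokens.each_with_index do |token, i|
    ENCLITICS_MAP.each do |regex, encl|
      if token.match(regex)
        token.slice!(-encl.length, encl.length)  # 'nisi' -> 'ni', 'οὐτε' -> 'οὐ'
        container << [encl, i + container.size]  # index where the particle goes
      end
    end
  end
  # Every earlier insertion shifts later targets by one; the container.size
  # offset recorded above compensates for that.
  container.each { |encl, idx| tokens.insert(idx, "-#{encl}") }
  tokens
end

p split_frequent_enclitics(%w[nec dominus nisi servus])
# => ["-c", "ne", "dominus", "-si", "ni", "servus"]
```

Keeping the patterns in a single hash makes a new enclitic a one-line addition plus a spec example, which is exactly how the Greek τε and δε cases come in here.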
data/spec/lib/llt/tokenizer_spec.rb CHANGED

@@ -242,14 +242,14 @@ describe LLT::Tokenizer do
 
     context "when confronted with -ve" do
       examples = {
-        'sive' => '
+        'sive' => '-ve si',
+        'neve' => '-ve ne',
         'pluresve' => '-ve plures',
         'aestive' => 'aestive',
         'serve' => 'serve',
         'suave' => 'suave',
         'vive' => 'vive',
         'move' => 'move',
-        'neve' => 'neve',
         'cive' => 'cive',
         'Iove' => 'Iove',
       }

@@ -261,9 +261,49 @@ describe LLT::Tokenizer do
       end
     end
 
+    context "when confronted with -u" do
+      examples = {
+        'seu' => '-u se',
+        'neu' => '-u ne'
+      }
+
+      examples.each do |example, expected|
+        it "transforms #{example} to #{expected}" do
+          enklitika_test(example).should be_transformed_to expected
+        end
+      end
+    end
+
+    context "when confronted with -si" do
+      examples = {
+        'nisi' => '-si ni'
+      }
+
+      examples.each do |example, expected|
+        it "transforms #{example} to #{expected}" do
+          enklitika_test(example).should be_transformed_to expected
+        end
+      end
+    end
+
     context "when confronted with -τε" do
       examples = {
-        'οὐτε' => '-τε οὐ'
+        'οὐτε' => '-τε οὐ',
+        'μήτε' => '-τε μή',
+        'εἰτε' => '-τε εἰ'
+      }
+
+      examples.each do |example, expected|
+        it "transforms #{example} to #{expected}" do
+          enklitika_test(example).should be_transformed_to expected
+        end
+      end
+    end
+
+    context "when confronted with -δε" do
+      examples = {
+        'οὐδε' => '-δε οὐ',
+        'μήδε' => '-δε μή'
       }
 
       examples.each do |example, expected|
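Taken together, the new contexts pin down the intended splits: seu and neu lose -u, nisi loses -si, and the Greek particles τε and δε are detached exactly like the Latin enclitics. The enklitika_test helper and the be_transformed_to matcher live in the gem's spec support code and are not shown in this diff; as a rough, hypothetical smoke test of the same expectations, assuming the gem's public entry point is LLT::Tokenizer.new plus a #tokenize call whose tokens stringify to the forms used in the spec:

```ruby
# Hypothetical end-to-end check; the constructor, #tokenize and #to_s calls
# are assumptions about the public API, not code from this release.
require 'llt/tokenizer'

tokenizer = LLT::Tokenizer.new
{
  'sive' => '-ve si',
  'seu'  => '-u se',
  'nisi' => '-si ni',
  'οὐτε' => '-τε οὐ'
}.each do |word, expected|
  actual = tokenizer.tokenize(word).map(&:to_s).join(' ')
  puts "#{word}: got '#{actual}', expected '#{expected}'"
end
```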
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llt-tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - LFDM
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-
+date: 2014-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler