llt-tokenizer 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/lib/llt/tokenizer.rb +10 -8
- data/lib/llt/tokenizer/version.rb +1 -1
- data/spec/lib/llt/tokenizer_spec.rb +17 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
|
4
|
+
data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
|
7
|
+
data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
|
data/Gemfile
CHANGED
@@ -9,6 +9,7 @@ gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
|
|
9
9
|
gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
|
10
10
|
gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
|
11
11
|
gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
|
12
|
+
gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
|
12
13
|
gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
|
13
14
|
|
14
15
|
# Dependencies of db_handler
|
data/lib/llt/tokenizer.rb
CHANGED
@@ -140,8 +140,8 @@ module LLT
|
|
140
140
|
|
141
141
|
######################
|
142
142
|
|
143
|
-
WORDS_ENDING_WITH_QUE = /^([qc]u[
|
144
|
-
WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene)$/i
|
143
|
+
WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
|
144
|
+
WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
|
145
145
|
WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
|
146
146
|
|
147
147
|
# laetusque to -que laetus
|
@@ -151,6 +151,10 @@ module LLT
|
|
151
151
|
# but
|
152
152
|
#
|
153
153
|
# uterque, institutione, sive et al. remain
|
154
|
+
#
|
155
|
+
# iuvene might come as a surprise in these lists - it's a hack, but
|
156
|
+
# special because it has ve and ne - both would get split. Such words
|
157
|
+
# might be so rare that we postpone proper handling for now
|
154
158
|
|
155
159
|
ENCLITICS = %w{ que ne ve c }
|
156
160
|
def split_enklitika_and_change_their_position
|
@@ -205,8 +209,8 @@ module LLT
|
|
205
209
|
# # TODO 27.11.13 14:15 by LFDM
|
206
210
|
# Implement caching here
|
207
211
|
ne_corrections
|
208
|
-
que_corrections
|
209
212
|
ve_corrections
|
213
|
+
que_corrections
|
210
214
|
end
|
211
215
|
|
212
216
|
def que_corrections
|
@@ -243,8 +247,7 @@ module LLT
|
|
243
247
|
entries = []
|
244
248
|
entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
|
245
249
|
entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
|
246
|
-
entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /
|
247
|
-
entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/ # flumi-ne agmi-ne
|
250
|
+
entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
|
248
251
|
entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
|
249
252
|
entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
|
250
253
|
|
@@ -272,11 +275,11 @@ module LLT
|
|
272
275
|
entries = []
|
273
276
|
entries += lookup(orig_el + 'v', :adjective, :stem, 1)
|
274
277
|
entries += lookup(orig_el + 'v', :adjective, :stem, 3)
|
275
|
-
entries += lookup(orig_el + 'v', :noun, :stem, [2, 5])
|
278
|
+
entries += lookup(orig_el + 'v', :noun, :stem, [2, 33, 5])
|
279
|
+
entries += lookup(orig_el + 'v', :persona, :stem, 3)
|
276
280
|
entries += lookup(orig_el + 've', :verb, :pr, 2)
|
277
281
|
entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists
|
278
282
|
|
279
|
-
|
280
283
|
if entries.any?
|
281
284
|
corrections << i - corrections.size
|
282
285
|
end
|
@@ -346,7 +349,6 @@ module LLT
|
|
346
349
|
ABBR_NAME_WITH_DOT = /^(#{NAMES_PIPED})\.$/
|
347
350
|
ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
|
348
351
|
PUNCT_ITSELF = Regexp.new("^(?:#{PUNCTUATION.source})$")
|
349
|
-
XML_TAG = /<\/?.+?>/
|
350
352
|
|
351
353
|
def create_tokens
|
352
354
|
# call #to_a is to retrieve (and align) optional metrical data
|
@@ -165,13 +165,23 @@ describe LLT::Tokenizer do
|
|
165
165
|
"denique" => "denique",
|
166
166
|
"itaque" => "itaque",
|
167
167
|
"plerumque" => "plerumque",
|
168
|
+
"plerosque" => "plerosque",
|
169
|
+
"plerique" => "plerique",
|
170
|
+
"plerarumque" => "plerarumque",
|
168
171
|
"quaque" => "quaque",
|
169
172
|
"quemque" => "quemque",
|
170
173
|
"undique" => "undique",
|
171
174
|
"uterque" => "uterque",
|
172
175
|
"utriusque" => "utriusque",
|
173
176
|
"utcumque" => "utcumque",
|
177
|
+
"usque" => "usque",
|
174
178
|
"bonus laetusque et latus altusque" => "bonus -que laetus et latus -que altus",
|
179
|
+
"quantumcumque" => "quantumcumque",
|
180
|
+
"quantulacumque" => "quantulacumque",
|
181
|
+
"unusquisque" => "unusquisque",
|
182
|
+
"quisque" => "quisque",
|
183
|
+
"quaeque" => "quaeque",
|
184
|
+
"uniuscuiusque" => "uniuscuiusque"
|
175
185
|
}
|
176
186
|
|
177
187
|
examples.each do |example, expected|
|
@@ -189,11 +199,15 @@ describe LLT::Tokenizer do
|
|
189
199
|
# frequent patterns in third declension nouns
|
190
200
|
"ratione" => "ratione",
|
191
201
|
"magnitudine" => "magnitudine",
|
202
|
+
"iactatione" => "iactatione",
|
192
203
|
"Platone" => "Platone",
|
193
204
|
"libidine" => "libidine",
|
194
205
|
"Solone" => "Solone",
|
195
206
|
"homine" => "homine",
|
196
207
|
"flumine" => "flumine",
|
208
|
+
"fine" => "fine",
|
209
|
+
"iuvene" => "iuvene",
|
210
|
+
"sanguine" => "sanguine",
|
197
211
|
|
198
212
|
# frequent patterns in third declension adjective
|
199
213
|
"commune" => "commune",
|
@@ -232,7 +246,9 @@ describe LLT::Tokenizer do
|
|
232
246
|
'suave' => 'suave',
|
233
247
|
'vive' => 'vive',
|
234
248
|
'move' => 'move',
|
235
|
-
'neve' => 'neve'
|
249
|
+
'neve' => 'neve',
|
250
|
+
'cive' => 'cive',
|
251
|
+
'Iove' => 'Iove',
|
236
252
|
}
|
237
253
|
|
238
254
|
examples.each do |example, expected|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|