llt-tokenizer 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/lib/llt/tokenizer.rb +10 -8
- data/lib/llt/tokenizer/version.rb +1 -1
- data/spec/lib/llt/tokenizer_spec.rb +17 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
|
4
|
+
data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
|
7
|
+
data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
|
data/Gemfile
CHANGED
@@ -9,6 +9,7 @@ gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
|
|
9
9
|
gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
|
10
10
|
gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
|
11
11
|
gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
|
12
|
+
gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
|
12
13
|
gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
|
13
14
|
|
14
15
|
# Dependencies of db_handler
|
data/lib/llt/tokenizer.rb
CHANGED
@@ -140,8 +140,8 @@ module LLT
|
|
140
140
|
|
141
141
|
######################
|
142
142
|
|
143
|
-
WORDS_ENDING_WITH_QUE = /^([qc]u[
|
144
|
-
WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene)$/i
|
143
|
+
WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
|
144
|
+
WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
|
145
145
|
WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
|
146
146
|
|
147
147
|
# laetusque to -que laetus
|
@@ -151,6 +151,10 @@ module LLT
|
|
151
151
|
# but
|
152
152
|
#
|
153
153
|
# uterque, institutione, sive et al. remain
|
154
|
+
#
|
155
|
+
# iuvene might come as a surprise in these lists - it's a hack, but
|
156
|
+
# special because it has ve and ne - both would get split. Such words
|
157
|
+
# might be so rare that we postpone proper handling for now
|
154
158
|
|
155
159
|
ENCLITICS = %w{ que ne ve c }
|
156
160
|
def split_enklitika_and_change_their_position
|
@@ -205,8 +209,8 @@ module LLT
|
|
205
209
|
# # TODO 27.11.13 14:15 by LFDM
|
206
210
|
# Implement caching here
|
207
211
|
ne_corrections
|
208
|
-
que_corrections
|
209
212
|
ve_corrections
|
213
|
+
que_corrections
|
210
214
|
end
|
211
215
|
|
212
216
|
def que_corrections
|
@@ -243,8 +247,7 @@ module LLT
|
|
243
247
|
entries = []
|
244
248
|
entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
|
245
249
|
entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
|
246
|
-
entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /
|
247
|
-
entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/ # flumi-ne agmi-ne
|
250
|
+
entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
|
248
251
|
entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
|
249
252
|
entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
|
250
253
|
|
@@ -272,11 +275,11 @@ module LLT
|
|
272
275
|
entries = []
|
273
276
|
entries += lookup(orig_el + 'v', :adjective, :stem, 1)
|
274
277
|
entries += lookup(orig_el + 'v', :adjective, :stem, 3)
|
275
|
-
entries += lookup(orig_el + 'v', :noun, :stem, [2, 5])
|
278
|
+
entries += lookup(orig_el + 'v', :noun, :stem, [2, 33, 5])
|
279
|
+
entries += lookup(orig_el + 'v', :persona, :stem, 3)
|
276
280
|
entries += lookup(orig_el + 've', :verb, :pr, 2)
|
277
281
|
entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists
|
278
282
|
|
279
|
-
|
280
283
|
if entries.any?
|
281
284
|
corrections << i - corrections.size
|
282
285
|
end
|
@@ -346,7 +349,6 @@ module LLT
|
|
346
349
|
ABBR_NAME_WITH_DOT = /^(#{NAMES_PIPED})\.$/
|
347
350
|
ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
|
348
351
|
PUNCT_ITSELF = Regexp.new("^(?:#{PUNCTUATION.source})$")
|
349
|
-
XML_TAG = /<\/?.+?>/
|
350
352
|
|
351
353
|
def create_tokens
|
352
354
|
# the call to #to_a is there to retrieve (and align) optional metrical data
|
@@ -165,13 +165,23 @@ describe LLT::Tokenizer do
|
|
165
165
|
"denique" => "denique",
|
166
166
|
"itaque" => "itaque",
|
167
167
|
"plerumque" => "plerumque",
|
168
|
+
"plerosque" => "plerosque",
|
169
|
+
"plerique" => "plerique",
|
170
|
+
"plerarumque" => "plerarumque",
|
168
171
|
"quaque" => "quaque",
|
169
172
|
"quemque" => "quemque",
|
170
173
|
"undique" => "undique",
|
171
174
|
"uterque" => "uterque",
|
172
175
|
"utriusque" => "utriusque",
|
173
176
|
"utcumque" => "utcumque",
|
177
|
+
"usque" => "usque",
|
174
178
|
"bonus laetusque et latus altusque" => "bonus -que laetus et latus -que altus",
|
179
|
+
"quantumcumque" => "quantumcumque",
|
180
|
+
"quantulacumque" => "quantulacumque",
|
181
|
+
"unusquisque" => "unusquisque",
|
182
|
+
"quisque" => "quisque",
|
183
|
+
"quaeque" => "quaeque",
|
184
|
+
"uniuscuiusque" => "uniuscuiusque"
|
175
185
|
}
|
176
186
|
|
177
187
|
examples.each do |example, expected|
|
@@ -189,11 +199,15 @@ describe LLT::Tokenizer do
|
|
189
199
|
# frequent patterns in third declension nouns
|
190
200
|
"ratione" => "ratione",
|
191
201
|
"magnitudine" => "magnitudine",
|
202
|
+
"iactatione" => "iactatione",
|
192
203
|
"Platone" => "Platone",
|
193
204
|
"libidine" => "libidine",
|
194
205
|
"Solone" => "Solone",
|
195
206
|
"homine" => "homine",
|
196
207
|
"flumine" => "flumine",
|
208
|
+
"fine" => "fine",
|
209
|
+
"iuvene" => "iuvene",
|
210
|
+
"sanguine" => "sanguine",
|
197
211
|
|
198
212
|
# frequent patterns in third declension adjective
|
199
213
|
"commune" => "commune",
|
@@ -232,7 +246,9 @@ describe LLT::Tokenizer do
|
|
232
246
|
'suave' => 'suave',
|
233
247
|
'vive' => 'vive',
|
234
248
|
'move' => 'move',
|
235
|
-
'neve' => 'neve'
|
249
|
+
'neve' => 'neve',
|
250
|
+
'cive' => 'cive',
|
251
|
+
'Iove' => 'Iove',
|
236
252
|
}
|
237
253
|
|
238
254
|
examples.each do |example, expected|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|