llt-tokenizer 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3e46c64430c5caec2c91a4bb82b4322a9b02579a
4
- data.tar.gz: 2f746527437f1bdbd683e0033a2597300f7635d1
3
+ metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
4
+ data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
5
5
  SHA512:
6
- metadata.gz: 7366d89f3b48de21d266368690cfb927dbb408df562f5b677d87c632f3c5081972d87fd0be2c2da74149936915e03c10873bd881a33fe4198ea766a1d3197279
7
- data.tar.gz: cd55bd5af33d9cf077228ccb08559ad2fea87329d2982bf32fcdb31f11cc851255e9b26ed315058858ad44f722c535378d26018a0dec1f5d0e2982c1d96b47c5
6
+ metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
7
+ data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
data/Gemfile CHANGED
@@ -9,6 +9,7 @@ gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
9
9
  gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
10
10
  gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
11
11
  gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
12
+ gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
12
13
  gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
13
14
 
14
15
  # Dependencies of db_handler
data/lib/llt/tokenizer.rb CHANGED
@@ -140,8 +140,8 @@ module LLT
140
140
 
141
141
  ######################
142
142
 
143
- WORDS_ENDING_WITH_QUE = /^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i # neque taken out!
144
- WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene)$/i
143
+ WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
144
+ WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
145
145
  WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
146
146
 
147
147
  # laetusque to -que laetus
@@ -151,6 +151,10 @@ module LLT
151
151
  # but
152
152
  #
153
153
  # uterque, institutione, sive et al. remain
154
+ #
155
+ # iuvene might come as a surprise in these lists - it's a hack, but
156
+ # special because it has ve and ne - both would get split. Such words
157
+ # might be so rare that we postpone proper handling for now
154
158
 
155
159
  ENCLITICS = %w{ que ne ve c }
156
160
  def split_enklitika_and_change_their_position
@@ -205,8 +209,8 @@ module LLT
205
209
  # # TODO 27.11.13 14:15 by LFDM
206
210
  # Implement caching here
207
211
  ne_corrections
208
- que_corrections
209
212
  ve_corrections
213
+ que_corrections
210
214
  end
211
215
 
212
216
  def que_corrections
@@ -243,8 +247,7 @@ module LLT
243
247
  entries = []
244
248
  entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
245
249
  entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
246
- entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /d?i$/ # fortitudi-ne ratio-ne libidi-ne homi-ne
247
- entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/ # flumi-ne agmi-ne
250
+ entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
248
251
  entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
249
252
  entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
250
253
 
@@ -272,11 +275,11 @@ module LLT
272
275
  entries = []
273
276
  entries += lookup(orig_el + 'v', :adjective, :stem, 1)
274
277
  entries += lookup(orig_el + 'v', :adjective, :stem, 3)
275
- entries += lookup(orig_el + 'v', :noun, :stem, [2, 5])
278
+ entries += lookup(orig_el + 'v', :noun, :stem, [2, 33, 5])
279
+ entries += lookup(orig_el + 'v', :persona, :stem, 3)
276
280
  entries += lookup(orig_el + 've', :verb, :pr, 2)
277
281
  entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists
278
282
 
279
-
280
283
  if entries.any?
281
284
  corrections << i - corrections.size
282
285
  end
@@ -346,7 +349,6 @@ module LLT
346
349
  ABBR_NAME_WITH_DOT = /^(#{NAMES_PIPED})\.$/
347
350
  ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
348
351
  PUNCT_ITSELF = Regexp.new("^(?:#{PUNCTUATION.source})$")
349
- XML_TAG = /<\/?.+?>/
350
352
 
351
353
  def create_tokens
352
354
  # calling #to_a retrieves (and aligns) optional metrical data
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.4"
3
+ VERSION = "0.0.5"
4
4
  end
5
5
  end
@@ -165,13 +165,23 @@ describe LLT::Tokenizer do
165
165
  "denique" => "denique",
166
166
  "itaque" => "itaque",
167
167
  "plerumque" => "plerumque",
168
+ "plerosque" => "plerosque",
169
+ "plerique" => "plerique",
170
+ "plerarumque" => "plerarumque",
168
171
  "quaque" => "quaque",
169
172
  "quemque" => "quemque",
170
173
  "undique" => "undique",
171
174
  "uterque" => "uterque",
172
175
  "utriusque" => "utriusque",
173
176
  "utcumque" => "utcumque",
177
+ "usque" => "usque",
174
178
  "bonus laetusque et latus altusque" => "bonus -que laetus et latus -que altus",
179
+ "quantumcumque" => "quantumcumque",
180
+ "quantulacumque" => "quantulacumque",
181
+ "unusquisque" => "unusquisque",
182
+ "quisque" => "quisque",
183
+ "quaeque" => "quaeque",
184
+ "uniuscuiusque" => "uniuscuiusque"
175
185
  }
176
186
 
177
187
  examples.each do |example, expected|
@@ -189,11 +199,15 @@ describe LLT::Tokenizer do
189
199
  # frequent patterns in third declension nouns
190
200
  "ratione" => "ratione",
191
201
  "magnitudine" => "magnitudine",
202
+ "iactatione" => "iactatione",
192
203
  "Platone" => "Platone",
193
204
  "libidine" => "libidine",
194
205
  "Solone" => "Solone",
195
206
  "homine" => "homine",
196
207
  "flumine" => "flumine",
208
+ "fine" => "fine",
209
+ "iuvene" => "iuvene",
210
+ "sanguine" => "sanguine",
197
211
 
198
212
  # frequent patterns in third declension adjective
199
213
  "commune" => "commune",
@@ -232,7 +246,9 @@ describe LLT::Tokenizer do
232
246
  'suave' => 'suave',
233
247
  'vive' => 'vive',
234
248
  'move' => 'move',
235
- 'neve' => 'neve'
249
+ 'neve' => 'neve',
250
+ 'cive' => 'cive',
251
+ 'Iove' => 'Iove',
236
252
  }
237
253
 
238
254
  examples.each do |example, expected|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-08 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler