llt-tokenizer 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3e46c64430c5caec2c91a4bb82b4322a9b02579a
4
- data.tar.gz: 2f746527437f1bdbd683e0033a2597300f7635d1
3
+ metadata.gz: 17d75e05ecdf64cd93e8de239e57652b4a6375ba
4
+ data.tar.gz: b21ab0737044b952a9642a68a9aae48a4d615d4e
5
5
  SHA512:
6
- metadata.gz: 7366d89f3b48de21d266368690cfb927dbb408df562f5b677d87c632f3c5081972d87fd0be2c2da74149936915e03c10873bd881a33fe4198ea766a1d3197279
7
- data.tar.gz: cd55bd5af33d9cf077228ccb08559ad2fea87329d2982bf32fcdb31f11cc851255e9b26ed315058858ad44f722c535378d26018a0dec1f5d0e2982c1d96b47c5
6
+ metadata.gz: afadf19b6eb1a7f45dca79631efa57ceb7359d286523d2979658208decffea5d094fdcd4fcc9b3e02f788390bd3e2eeb28c415f3ad7ea97efcdbdf28602c7d2d
7
+ data.tar.gz: f752b02680802aafb6305490b139dc0b9ca2e3e35e9b48ca2610f2039e57959571ac432e2b7fa3b4b9e1f05abdc54c9ef8e99f42a2927a2ee25294ecad908fd9
data/Gemfile CHANGED
@@ -9,6 +9,7 @@ gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
9
9
  gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
10
10
  gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
11
11
  gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
12
+ gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
12
13
  gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
13
14
 
14
15
  # Dependencies of db_handler
data/lib/llt/tokenizer.rb CHANGED
@@ -140,8 +140,8 @@ module LLT
140
140
 
141
141
  ######################
142
142
 
143
- WORDS_ENDING_WITH_QUE = /^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i # neque taken out!
144
- WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene)$/i
143
+ WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
144
+ WORDS_ENDING_WITH_NE = /^(omne|sine|bene|paene|iuvene)$/i
145
145
  WORDS_ENDING_WITH_VE = /^(sive|neve)$/i
146
146
 
147
147
  # laetusque to -que laetus
@@ -151,6 +151,10 @@ module LLT
151
151
  # but
152
152
  #
153
153
  # uterque, institutione, sive et al. remain
154
+ #
155
+ # iuvene might come as a surprise in these lists - it's a hack, but
156
+ # special because it has ve and ne - both would get split. Such words
157
+ # might be so rare that we postpone proper handling for now
154
158
 
155
159
  ENCLITICS = %w{ que ne ve c }
156
160
  def split_enklitika_and_change_their_position
@@ -205,8 +209,8 @@ module LLT
205
209
  # # TODO 27.11.13 14:15 by LFDM
206
210
  # Implement caching here
207
211
  ne_corrections
208
- que_corrections
209
212
  ve_corrections
213
+ que_corrections
210
214
  end
211
215
 
212
216
  def que_corrections
@@ -243,8 +247,7 @@ module LLT
243
247
  entries = []
244
248
  entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne
245
249
  entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
246
- entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /d?i$/ # fortitudi-ne ratio-ne libidi-ne homi-ne
247
- entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/ # flumi-ne agmi-ne
250
+ entries += lookup(orig_el + "n", :noun, :stem, [3, 33]) if orig_el =~ /[ei]$/ # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne
248
251
  entries += lookup(orig_el + "n", :noun, :stem, 2) # domi-ne
249
252
  entries += lookup(orig_el + "n", :adjective, :stem, [1,3]) # communis commune, or bonus
250
253
 
@@ -272,11 +275,11 @@ module LLT
272
275
  entries = []
273
276
  entries += lookup(orig_el + 'v', :adjective, :stem, 1)
274
277
  entries += lookup(orig_el + 'v', :adjective, :stem, 3)
275
- entries += lookup(orig_el + 'v', :noun, :stem, [2, 5])
278
+ entries += lookup(orig_el + 'v', :noun, :stem, [2, 33, 5])
279
+ entries += lookup(orig_el + 'v', :persona, :stem, 3)
276
280
  entries += lookup(orig_el + 've', :verb, :pr, 2)
277
281
  entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists
278
282
 
279
-
280
283
  if entries.any?
281
284
  corrections << i - corrections.size
282
285
  end
@@ -346,7 +349,6 @@ module LLT
346
349
  ABBR_NAME_WITH_DOT = /^(#{NAMES_PIPED})\.$/
347
350
  ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
348
351
  PUNCT_ITSELF = Regexp.new("^(?:#{PUNCTUATION.source})$")
349
- XML_TAG = /<\/?.+?>/
350
352
 
351
353
  def create_tokens
352
354
  # call #to_a is to retrieve (and align) optional metrical data
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Tokenizer
3
- VERSION = "0.0.4"
3
+ VERSION = "0.0.5"
4
4
  end
5
5
  end
@@ -165,13 +165,23 @@ describe LLT::Tokenizer do
165
165
  "denique" => "denique",
166
166
  "itaque" => "itaque",
167
167
  "plerumque" => "plerumque",
168
+ "plerosque" => "plerosque",
169
+ "plerique" => "plerique",
170
+ "plerarumque" => "plerarumque",
168
171
  "quaque" => "quaque",
169
172
  "quemque" => "quemque",
170
173
  "undique" => "undique",
171
174
  "uterque" => "uterque",
172
175
  "utriusque" => "utriusque",
173
176
  "utcumque" => "utcumque",
177
+ "usque" => "usque",
174
178
  "bonus laetusque et latus altusque" => "bonus -que laetus et latus -que altus",
179
+ "quantumcumque" => "quantumcumque",
180
+ "quantulacumque" => "quantulacumque",
181
+ "unusquisque" => "unusquisque",
182
+ "quisque" => "quisque",
183
+ "quaeque" => "quaeque",
184
+ "uniuscuiusque" => "uniuscuiusque"
175
185
  }
176
186
 
177
187
  examples.each do |example, expected|
@@ -189,11 +199,15 @@ describe LLT::Tokenizer do
189
199
  # frequent patterns in third declension nouns
190
200
  "ratione" => "ratione",
191
201
  "magnitudine" => "magnitudine",
202
+ "iactatione" => "iactatione",
192
203
  "Platone" => "Platone",
193
204
  "libidine" => "libidine",
194
205
  "Solone" => "Solone",
195
206
  "homine" => "homine",
196
207
  "flumine" => "flumine",
208
+ "fine" => "fine",
209
+ "iuvene" => "iuvene",
210
+ "sanguine" => "sanguine",
197
211
 
198
212
  # frequent patterns in third declension adjective
199
213
  "commune" => "commune",
@@ -232,7 +246,9 @@ describe LLT::Tokenizer do
232
246
  'suave' => 'suave',
233
247
  'vive' => 'vive',
234
248
  'move' => 'move',
235
- 'neve' => 'neve'
249
+ 'neve' => 'neve',
250
+ 'cive' => 'cive',
251
+ 'Iove' => 'Iove',
236
252
  }
237
253
 
238
254
  examples.each do |example, expected|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-08 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler