cassiopee 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog +1 -0
- data/lib/cassiopee-mt.rb +11 -10
- data/lib/cassiopee.rb +320 -77
- data/tests/test-suite.rb +51 -4
- metadata +4 -4
data/Changelog
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
v0.1.5 : add CrawlerCache for basic cache management (previous result only), several fixes
|
1
2
|
v0.1.4 : fix 0.1.3 error on index load, add filter_position management in mt
|
2
3
|
v0.1.3 : 09/11 Olivier Sallou
|
3
4
|
add CrawlerMT in cassiopee-mt for multi thread support to speed up the search
|
data/lib/cassiopee-mt.rb
CHANGED
@@ -34,6 +34,7 @@ module CassiopeeMt
|
|
34
34
|
crawler.setLogLevel($log.level)
|
35
35
|
crawler.file_suffix = @file_suffix
|
36
36
|
crawler.loadIndex()
|
37
|
+
crawler.method = method
|
37
38
|
#crawler.file_suffix = @file_suffix+"."+threadId.to_s
|
38
39
|
end
|
39
40
|
|
@@ -55,16 +56,16 @@ module CassiopeeMt
|
|
55
56
|
end
|
56
57
|
nb = len.div(maxthread)
|
57
58
|
(1..maxthread).each do |i|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
59
|
+
crawler = Crawler.new
|
60
|
+
setParams(crawler,i)
|
61
|
+
curmax = min + nb
|
62
|
+
if(i==maxthread)
|
63
|
+
curmax = max
|
64
|
+
end
|
65
|
+
crawler.filter_position(min,curmax)
|
66
|
+
$log.debug("Start new Thread between " << min.to_s << " and " << curmax.to_s)
|
67
|
+
@th[i-1] = Thread.new{ Thread.current["matches"] = crawler.searchExact(pattern) }
|
68
|
+
min = curmax + 1
|
68
69
|
end
|
69
70
|
@th.each {|t| t.join; t["matches"].each { |m| @matches << m }}
|
70
71
|
return @matches
|
data/lib/cassiopee.rb
CHANGED
@@ -34,7 +34,6 @@ module Cassiopee
|
|
34
34
|
# Compute Hamming distance but using a mapping matrix of alphabet ambiguity
|
35
35
|
|
36
36
|
def computeHammingAmbiguous(pattern,hamming,ambiguous)
|
37
|
-
pattern = pattern.downcase
|
38
37
|
nberr = 0
|
39
38
|
(0..(self.length-1)).each do |c|
|
40
39
|
if(!isAmbiguousEqual(pattern[c],self[c],ambiguous))
|
@@ -52,7 +51,6 @@ module Cassiopee
|
|
52
51
|
# Return -1 if max is reached
|
53
52
|
|
54
53
|
def computeHamming(pattern,hamming)
|
55
|
-
pattern = pattern.downcase
|
56
54
|
nberr = 0
|
57
55
|
(0..(self.length-1)).each do |c|
|
58
56
|
if(pattern[c] != self[c])
|
@@ -72,11 +70,9 @@ module Cassiopee
|
|
72
70
|
# Return -1 if max is reached
|
73
71
|
|
74
72
|
def computeLevenshtein(pattern,edit)
|
75
|
-
pattern = pattern.downcase
|
76
73
|
|
77
74
|
distance = Text::Levenshtein.distance(self, pattern)
|
78
75
|
|
79
|
-
|
80
76
|
if(distance>edit)
|
81
77
|
return -1
|
82
78
|
end
|
@@ -91,44 +87,42 @@ module Cassiopee
|
|
91
87
|
|
92
88
|
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
|
93
89
|
|
94
|
-
|
95
|
-
encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
|
90
|
+
encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
|
96
91
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
92
|
+
if Text.encoding_of(self) =~ /^U/i
|
93
|
+
unpack_rule = 'U*'
|
94
|
+
else
|
95
|
+
unpack_rule = 'C*'
|
96
|
+
end
|
102
97
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
98
|
+
s = self.unpack(unpack_rule)
|
99
|
+
t = pattern.unpack(unpack_rule)
|
100
|
+
n = s.length
|
101
|
+
m = t.length
|
102
|
+
return m if (0 == n)
|
103
|
+
return n if (0 == m)
|
109
104
|
|
110
|
-
|
111
|
-
|
105
|
+
d = (0..m).to_a
|
106
|
+
x = nil
|
112
107
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
return x
|
108
|
+
(0...n).each do |i|
|
109
|
+
e = i+1
|
110
|
+
(0...m).each do |j|
|
111
|
+
cost = (isAmbiguousEqual(s[i],t[j],ambiguous)) ? 0 : 1
|
112
|
+
x = [
|
113
|
+
d[j+1] + 1, # insertion
|
114
|
+
e + 1, # deletion
|
115
|
+
d[j] + cost # substitution
|
116
|
+
].min
|
117
|
+
d[j] = e
|
118
|
+
e = x
|
119
|
+
end
|
120
|
+
d[m] = x
|
121
|
+
end
|
122
|
+
if(x>edit)
|
123
|
+
return -1
|
124
|
+
end
|
125
|
+
return x
|
132
126
|
end
|
133
127
|
|
134
128
|
|
@@ -150,6 +144,106 @@ module Cassiopee
|
|
150
144
|
end
|
151
145
|
end
|
152
146
|
|
147
|
+
# Class managing cache of results
|
148
|
+
|
149
|
+
class CrawlerCache
|
150
|
+
|
151
|
+
FILE_CACHE_EXT = ".sfc"
|
152
|
+
|
153
|
+
# Suffix files name/path
|
154
|
+
attr_accessor :file_suffix
|
155
|
+
|
156
|
+
# search exact: 0
|
157
|
+
# hamming : 1
|
158
|
+
# edit : 2
|
159
|
+
attr_accessor :method
|
160
|
+
|
161
|
+
# filter
|
162
|
+
attr_accessor :min_position
|
163
|
+
attr_accessor :max_position
|
164
|
+
|
165
|
+
# max errors
|
166
|
+
attr_accessor :errors
|
167
|
+
|
168
|
+
attr_accessor :cache
|
169
|
+
|
170
|
+
$log = Logger.new(STDOUT)
|
171
|
+
$log.level = Logger::INFO
|
172
|
+
|
173
|
+
def setLogger(userlogger)
|
174
|
+
$log = userlogger
|
175
|
+
end
|
176
|
+
|
177
|
+
def initialize
|
178
|
+
@file_suffix = "crawler"
|
179
|
+
end
|
180
|
+
|
181
|
+
# Loads cache from file
|
182
|
+
def loadCache
|
183
|
+
return Array.new unless File.exists?(@file_suffix+FILE_CACHE_EXT)
|
184
|
+
begin
|
185
|
+
file = Zlib::GzipReader.open(@file_suffix+FILE_CACHE_EXT)
|
186
|
+
rescue Zlib::GzipFile::Error
|
187
|
+
file = File.open(@file_suffix+FILE_CACHE_EXT, 'r')
|
188
|
+
ensure
|
189
|
+
obj = Marshal.load file.read
|
190
|
+
file.close
|
191
|
+
if(method!=obj.method || min_position<obj.min_position || max_position>obj.max_position || errors>obj.errors)
|
192
|
+
return Array.new
|
193
|
+
end
|
194
|
+
return filterCache(obj)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
# Save self to cache, with cache object set from obj
|
199
|
+
def saveCache(obj)
|
200
|
+
self.cache = obj
|
201
|
+
marshal_dump = Marshal.dump(self)
|
202
|
+
sfxpos = File.new(@file_suffix+FILE_CACHE_EXT,'w')
|
203
|
+
sfxpos = Zlib::GzipWriter.new(sfxpos)
|
204
|
+
sfxpos.write marshal_dump
|
205
|
+
sfxpos.close
|
206
|
+
end
|
207
|
+
|
208
|
+
def clearCache
|
209
|
+
File.delete(@file_suffix+FILE_CACHE_EXT) unless !File.exists?(@file_suffix+FILE_CACHE_EXT)
|
210
|
+
end
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
# filter cache according to settings
|
215
|
+
# cacheobject: cache object
|
216
|
+
def filterCache(cacheobject)
|
217
|
+
|
218
|
+
realmatches = Array.new
|
219
|
+
if(cacheobject==nil)
|
220
|
+
return realmatches
|
221
|
+
end
|
222
|
+
|
223
|
+
cacheobject.cache.each do |obj|
|
224
|
+
if(obj[1]>self.errors)
|
225
|
+
next
|
226
|
+
end
|
227
|
+
realpos = Array.new
|
228
|
+
realpos << obj[2][0]
|
229
|
+
(1..obj[2].length-1).each do |i|
|
230
|
+
curpos= obj[2][i]
|
231
|
+
if((curpos<=max_position || max_position==0) && curpos>=min_position)
|
232
|
+
realpos << curpos
|
233
|
+
end
|
234
|
+
end
|
235
|
+
if(realpos.length<=1)
|
236
|
+
next
|
237
|
+
end
|
238
|
+
realmatches << Array[obj[0],obj[1],realpos]
|
239
|
+
|
240
|
+
end
|
241
|
+
return realmatches
|
242
|
+
end
|
243
|
+
|
244
|
+
end
|
245
|
+
|
246
|
+
|
153
247
|
# Base class to index and search through a string
|
154
248
|
|
155
249
|
class Crawler
|
@@ -164,11 +258,30 @@ module Cassiopee
|
|
164
258
|
attr_accessor :use_store
|
165
259
|
# Array of comment characters to skip lines in input sequence file
|
166
260
|
attr_accessor :comments
|
261
|
+
|
262
|
+
# Manage basic cache to store previous match
|
263
|
+
attr_accessor :useCache
|
167
264
|
|
265
|
+
# Method for search: DIRECT (METHOD_DIRECT, referred to as FORCE in the notes below) or SUFFIX (METHOD_SUFFIX)
|
266
|
+
# * SUFFIX loads all suffixes and search through them afterwards, interesting for multiple searches (suffixes are reused)
|
267
|
+
# * FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search
|
268
|
+
# FORCE method does not yet support optimal filters
|
269
|
+
attr_accessor :method
|
270
|
+
|
271
|
+
METHOD_DIRECT = 0
|
272
|
+
METHOD_SUFFIX = 1
|
273
|
+
|
168
274
|
@min_position = 0
|
169
275
|
@max_position = 0
|
170
276
|
|
277
|
+
# Previous position filter
|
278
|
+
@prev_min_position = 0
|
279
|
+
@prev_max_position = 0
|
280
|
+
|
171
281
|
@ambiguous = nil
|
282
|
+
|
283
|
+
@pattern = nil
|
284
|
+
|
172
285
|
|
173
286
|
FILE_SUFFIX_EXT = ".sfx"
|
174
287
|
FILE_SUFFIX_POS = ".sfp"
|
@@ -177,27 +290,39 @@ module Cassiopee
|
|
177
290
|
|
178
291
|
$maxthread = 1
|
179
292
|
|
293
|
+
@cache = nil
|
294
|
+
|
180
295
|
|
181
296
|
$log = Logger.new(STDOUT)
|
182
297
|
$log.level = Logger::INFO
|
183
298
|
|
184
299
|
def initialize
|
185
300
|
@useAmbiguity = false
|
301
|
+
@useCache = false
|
186
302
|
@file_suffix = "crawler"
|
187
|
-
|
303
|
+
|
304
|
+
@method = 0
|
305
|
+
|
306
|
+
@prev_min_position = 0
|
307
|
+
@prev_max_position = 0
|
308
|
+
|
309
|
+
|
188
310
|
@suffix = nil
|
189
311
|
@suffixmd5 = nil
|
190
312
|
@position = 0
|
191
313
|
|
192
314
|
@suffixes = Hash.new
|
193
315
|
|
194
|
-
@matches =
|
316
|
+
@matches = Array.new
|
195
317
|
@curmatch = 0
|
196
318
|
@use_store = false
|
197
319
|
|
198
320
|
@sequence = nil
|
199
321
|
|
200
322
|
@comments = Array["#"]
|
323
|
+
|
324
|
+
@cache = Cassiopee::CrawlerCache.new
|
325
|
+
|
201
326
|
end
|
202
327
|
|
203
328
|
def filterLength
|
@@ -213,7 +338,11 @@ module Cassiopee
|
|
213
338
|
|
214
339
|
def clear
|
215
340
|
@suffixes = Hash.new
|
216
|
-
|
341
|
+
@matches.clear
|
342
|
+
@pattern = nil
|
343
|
+
@prev_max_position = 0
|
344
|
+
@prev_min_position = 0
|
345
|
+
@cache.clearCache()
|
217
346
|
File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
|
218
347
|
end
|
219
348
|
|
@@ -300,37 +429,57 @@ module Cassiopee
|
|
300
429
|
# Filter matches to be between min and max start position
|
301
430
|
# If not using use_store, search speed is improved but existing indexes are cleared
|
302
431
|
# If max=0, then max is string length
|
432
|
+
# Must be called after index creation or load
|
303
433
|
|
304
434
|
def filter_position(min,max)
|
305
435
|
if(!use_store)
|
306
436
|
clear()
|
307
437
|
end
|
438
|
+
@prev_min_position = @min_position
|
439
|
+
@prev_max_position = @max_position
|
308
440
|
@min_position = min
|
309
441
|
@max_position = max
|
310
442
|
end
|
311
443
|
|
312
444
|
# Search exact match
|
313
445
|
|
314
|
-
def searchExact(
|
446
|
+
def searchExact(s)
|
447
|
+
|
315
448
|
if(@useAmbiguity)
|
316
|
-
return searchApproximate(
|
449
|
+
return searchApproximate(s,0)
|
450
|
+
end
|
451
|
+
|
452
|
+
s = s.downcase
|
453
|
+
|
454
|
+
updateCache(0,0)
|
455
|
+
@matches = @cache.loadCache()
|
456
|
+
|
457
|
+
if(@matches.length>0)
|
458
|
+
return cache?(@matches)
|
317
459
|
end
|
318
|
-
|
319
|
-
|
460
|
+
|
461
|
+
#@matches.clear
|
462
|
+
|
463
|
+
@pattern = Digest::MD5.hexdigest(s)
|
464
|
+
|
465
|
+
parseSuffixes(@sequence,s.length,s.length,0,s)
|
320
466
|
|
321
|
-
|
467
|
+
return @matches unless(method == METHOD_SUFFIX)
|
468
|
+
|
322
469
|
# Search required length, compare (compare md5?)
|
323
470
|
# MD5 = 128 bits, easier to compare for large strings
|
324
|
-
|
325
|
-
|
471
|
+
|
472
|
+
|
473
|
+
matchsize = @pattern.length
|
474
|
+
|
326
475
|
@suffixes.each do |md5val,posArray|
|
327
|
-
if (md5val
|
476
|
+
if (isMatchEqual?(md5val))
|
328
477
|
match = Array[md5val, 0, posArray]
|
329
478
|
$log.debug "Match: " << match.inspect
|
330
479
|
@matches << match
|
331
480
|
end
|
332
481
|
end
|
333
|
-
return @matches
|
482
|
+
return cache?(@matches)
|
334
483
|
|
335
484
|
end
|
336
485
|
|
@@ -342,32 +491,48 @@ module Cassiopee
|
|
342
491
|
|
343
492
|
|
344
493
|
def searchApproximate(s,edit)
|
494
|
+
|
345
495
|
if(edit==0 && !@useAmbiguity)
|
346
496
|
return searchExact(s)
|
347
497
|
end
|
348
|
-
|
498
|
+
allowederrors = edit
|
349
499
|
if(edit>=0)
|
350
500
|
useHamming = true
|
351
501
|
minmatchsize = s.length
|
352
502
|
maxmatchsize = s.length
|
503
|
+
updateCache(1,edit)
|
504
|
+
@matches = @cache.loadCache()
|
353
505
|
else
|
354
506
|
useHamming = false
|
355
507
|
edit = edit * (-1)
|
356
508
|
minmatchsize = s.length - edit
|
357
509
|
maxmatchsize = s.length + edit
|
510
|
+
updateCache(2,edit)
|
511
|
+
@matches = @cache.loadCache()
|
358
512
|
end
|
513
|
+
|
514
|
+
if(@matches.length>0)
|
515
|
+
return @matches
|
516
|
+
end
|
517
|
+
|
518
|
+
s = s.downcase
|
359
519
|
|
360
|
-
|
520
|
+
|
521
|
+
#@matches.clear
|
522
|
+
@pattern = Digest::MD5.hexdigest(s)
|
523
|
+
|
524
|
+
parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s)
|
361
525
|
|
362
|
-
|
526
|
+
return cache?(@matches) unless(method == METHOD_SUFFIX)
|
527
|
+
|
528
|
+
|
363
529
|
|
364
|
-
|
365
|
-
|
530
|
+
|
366
531
|
@suffixes.each do |md5val,posArray|
|
367
532
|
if(md5val == SUFFIXLEN)
|
368
533
|
next
|
369
534
|
end
|
370
|
-
if (md5val ==
|
535
|
+
if (md5val == @pattern)
|
371
536
|
filteredPosArray = filter(posArray)
|
372
537
|
match = Array[md5val, 0, filteredPosArray]
|
373
538
|
$log.debug "Match: " << match.inspect
|
@@ -376,20 +541,8 @@ module Cassiopee
|
|
376
541
|
if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize)
|
377
542
|
# Get string
|
378
543
|
seq = extractSuffix(posArray[1],posArray[0])
|
379
|
-
|
380
|
-
|
381
|
-
if(@useAmbiguity && @ambiguous!=nil)
|
382
|
-
errors = seq.computeHammingAmbiguous(s,edit,@ambiguous)
|
383
|
-
else
|
384
|
-
errors = seq.computeHamming(s,edit)
|
385
|
-
end
|
386
|
-
else
|
387
|
-
if(@useAmbiguity && @ambigous!=nil)
|
388
|
-
errors = seq.computeLevenshteinAmbiguous(s,edit,@ambigous)
|
389
|
-
else
|
390
|
-
errors = seq.computeLevenshtein(s,edit)
|
391
|
-
end
|
392
|
-
end
|
544
|
+
errors = isApproximateEqual?(seq,s,useHamming,edit)
|
545
|
+
|
393
546
|
if(errors>=0)
|
394
547
|
filteredPosArray = filter(posArray)
|
395
548
|
match = Array[md5val, errors, filteredPosArray]
|
@@ -401,7 +554,7 @@ module Cassiopee
|
|
401
554
|
|
402
555
|
end
|
403
556
|
|
404
|
-
return @matches
|
557
|
+
return cache?(@matches)
|
405
558
|
end
|
406
559
|
|
407
560
|
# Filter the array of positions with defined position filter
|
@@ -488,14 +641,68 @@ module Cassiopee
|
|
488
641
|
end
|
489
642
|
|
490
643
|
private
|
644
|
+
|
645
|
+
# If cache is used, store results for later retrieval, else return matches directly
|
646
|
+
def cache?(results)
|
647
|
+
if(@useCache)
|
648
|
+
@cache.saveCache(results)
|
649
|
+
end
|
650
|
+
|
651
|
+
return results
|
652
|
+
end
|
653
|
+
|
654
|
+
# Update cache object with current object parameters
|
655
|
+
# * method: 0 -> exact, 1 -> hamming, 2 -> edit
|
656
|
+
def updateCache(method,errors)
|
657
|
+
@cache.file_suffix = @file_suffix
|
658
|
+
@cache.min_position = @min_position
|
659
|
+
@cache.max_position = @max_position
|
660
|
+
@cache.method = method
|
661
|
+
@cache.errors = errors
|
662
|
+
end
|
663
|
+
|
664
|
+
|
665
|
+
# check if md5 is equal to pattern
|
666
|
+
def isMatchEqual?(s)
|
667
|
+
if(@pattern == s)
|
668
|
+
return true
|
669
|
+
end
|
670
|
+
return false
|
671
|
+
end
|
672
|
+
|
673
|
+
# check if string is approximately equal to pattern
|
674
|
+
# s: string to compare
|
675
|
+
# pattern: base pattern used
|
676
|
+
# useHamming: use Hamming or edit distance
|
677
|
+
# edit : allowed errors
|
678
|
+
def isApproximateEqual?(s,pattern,useHamming,edit)
|
679
|
+
errors = -1
|
680
|
+
s.extend(Cassiopee)
|
681
|
+
if(useHamming)
|
682
|
+
if(@useAmbiguity && @ambiguous!=nil)
|
683
|
+
errors = s.computeHammingAmbiguous(pattern,edit,@ambiguous)
|
684
|
+
else
|
685
|
+
errors = s.computeHamming(pattern,edit)
|
686
|
+
end
|
687
|
+
else
|
688
|
+
if(@useAmbiguity && @ambiguous!=nil)
|
689
|
+
errors = s.computeLevenshteinAmbiguous(pattern,edit,@ambigous)
|
690
|
+
else
|
691
|
+
errors = s.computeLevenshtein(pattern,edit)
|
692
|
+
end
|
693
|
+
end
|
694
|
+
end
|
695
|
+
|
696
|
+
|
697
|
+
|
491
698
|
|
492
699
|
# Parse input string
|
493
700
|
#
|
494
701
|
# * creates a suffix file
|
495
702
|
# * creates a suffix position file
|
496
703
|
|
497
|
-
def parseSuffixes(s,minlen,maxlen)
|
498
|
-
|
704
|
+
def parseSuffixes(s,minlen,maxlen,edit=0,pat=nil)
|
705
|
+
|
499
706
|
# Controls
|
500
707
|
if(minlen<=0)
|
501
708
|
minlen = 1
|
@@ -554,6 +761,7 @@ module Cassiopee
|
|
554
761
|
next
|
555
762
|
end
|
556
763
|
changed = true
|
764
|
+
prev_progress = -1
|
557
765
|
(minpos..(maxpos)).each do |j|
|
558
766
|
# if position+length longer than sequence length, skip it
|
559
767
|
if(j+i>=@sequence.length)
|
@@ -562,10 +770,46 @@ module Cassiopee
|
|
562
770
|
@suffix = s[j,i]
|
563
771
|
@suffixmd5 = Digest::MD5.hexdigest(@suffix)
|
564
772
|
@position = j
|
565
|
-
|
566
|
-
|
773
|
+
progress = (@position * 100).div(@sequence.length)
|
774
|
+
if((progress % 10) == 0 && progress > prev_progress)
|
775
|
+
prev_progress = progress
|
776
|
+
$log.debug("progress: " << progress.to_s)
|
777
|
+
end
|
778
|
+
|
779
|
+
if(method==METHOD_DIRECT)
|
780
|
+
|
781
|
+
if(edit==0 && !@useAmbiguity)
|
782
|
+
if(isMatchEqual?(@suffixmd5))
|
783
|
+
errors = 0
|
784
|
+
else
|
785
|
+
errors = -1
|
786
|
+
end
|
787
|
+
else
|
788
|
+
|
789
|
+
if(edit>=0)
|
790
|
+
useHamming = true
|
791
|
+
allowederrors = edit
|
792
|
+
else
|
793
|
+
useHamming = false
|
794
|
+
allowederrors = edit * (-1)
|
795
|
+
end
|
796
|
+
errors = isApproximateEqual?(@suffix,pat,useHamming,allowederrors)
|
797
|
+
end
|
798
|
+
|
799
|
+
|
800
|
+
if(errors>=0)
|
801
|
+
match = Array[@suffixmd5, errors, Array[i,j]]
|
802
|
+
$log.debug "Match: " << match.inspect
|
803
|
+
@matches << match
|
804
|
+
end
|
805
|
+
|
806
|
+
|
807
|
+
|
808
|
+
else
|
809
|
+
nbSuffix += addSuffix(@suffixmd5, @position,i)
|
810
|
+
end
|
567
811
|
end
|
568
|
-
$log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s)
|
812
|
+
$log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s) unless method==METHOD_DIRECT
|
569
813
|
end
|
570
814
|
|
571
815
|
|
@@ -615,7 +859,6 @@ module Cassiopee
|
|
615
859
|
input = line.downcase.chomp
|
616
860
|
skip = false
|
617
861
|
comments.each do |c|
|
618
|
-
$log.debug("skip line ?" << c << " == " << input[0])
|
619
862
|
if(input[0] == c[0])
|
620
863
|
# Line start with a comment char, skip it
|
621
864
|
$log.debug("skip line")
|
data/tests/test-suite.rb
CHANGED
@@ -9,13 +9,13 @@ class TestCrawler < Test::Unit::TestCase
|
|
9
9
|
|
10
10
|
def test_exactsearch
|
11
11
|
crawler = Cassiopee::Crawler.new
|
12
|
-
crawler.setLogLevel(Logger::
|
12
|
+
#crawler.setLogLevel(Logger::DEBUG)
|
13
13
|
crawler.indexString('my sample example')
|
14
14
|
matches = crawler.searchExact('ampl')
|
15
|
-
assert_equal(
|
15
|
+
assert_equal(2,matches.length)
|
16
16
|
# Minus 1, because first element is len of match
|
17
|
-
match = crawler.next()
|
18
|
-
assert_equal(2,match[2].length-1)
|
17
|
+
#match = crawler.next()
|
18
|
+
#assert_equal(2,match[2].length-1)
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_ambiguous
|
@@ -41,6 +41,23 @@ class TestCrawler < Test::Unit::TestCase
|
|
41
41
|
assert_equal(1,matches.length)
|
42
42
|
end
|
43
43
|
|
44
|
+
def test_directmethod
|
45
|
+
crawler = Cassiopee::Crawler.new
|
46
|
+
crawler.method = Cassiopee::Crawler::METHOD_DIRECT
|
47
|
+
crawler.indexString('my sample example')
|
48
|
+
matches = crawler.searchApproximate('ebampl',1)
|
49
|
+
assert_equal(1,matches.length)
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
def test_suffixmethod
|
54
|
+
crawler = Cassiopee::Crawler.new
|
55
|
+
crawler.method = Cassiopee::Crawler::METHOD_SUFFIX
|
56
|
+
crawler.indexString('my sample example')
|
57
|
+
matches = crawler.searchApproximate('ebampl',1)
|
58
|
+
assert_equal(1,matches.length)
|
59
|
+
end
|
60
|
+
|
44
61
|
def test_multithreadsearch
|
45
62
|
crawler = CassiopeeMt::CrawlerMt.new
|
46
63
|
crawler.maxthread=3
|
@@ -49,6 +66,36 @@ class TestCrawler < Test::Unit::TestCase
|
|
49
66
|
assert_equal(1,matches.length)
|
50
67
|
end
|
51
68
|
|
69
|
+
def test_cache
|
70
|
+
|
71
|
+
crawler = Cassiopee::Crawler.new
|
72
|
+
crawler.indexString('my sample example')
|
73
|
+
matches = crawler.searchApproximate('ebampl',-1)
|
74
|
+
|
75
|
+
cache = Cassiopee::CrawlerCache.new
|
76
|
+
cache.method = 2
|
77
|
+
cache.min_position = 0
|
78
|
+
cache.max_position = 0
|
79
|
+
cache.errors = 1
|
80
|
+
cache.saveCache(matches)
|
81
|
+
|
82
|
+
cache = Cassiopee::CrawlerCache.new
|
83
|
+
cache.method = 2
|
84
|
+
cache.min_position = 0
|
85
|
+
cache.max_position = 0
|
86
|
+
cache.errors = 1
|
87
|
+
cachematches = cache.loadCache
|
88
|
+
assert_equal(1,cachematches.length)
|
89
|
+
|
90
|
+
cache = Cassiopee::CrawlerCache.new
|
91
|
+
cache.method = 2
|
92
|
+
cache.min_position = 0
|
93
|
+
cache.max_position = 0
|
94
|
+
cache.errors = 2
|
95
|
+
cachematches = cache.loadCache
|
96
|
+
assert_equal(0,cachematches.length)
|
97
|
+
|
98
|
+
end
|
52
99
|
end
|
53
100
|
|
54
101
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassiopee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 5
|
10
|
+
version: 0.1.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Olivier Sallou
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-09-
|
18
|
+
date: 2011-09-20 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|