cassiopee 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +1 -0
- data/lib/cassiopee-mt.rb +11 -10
- data/lib/cassiopee.rb +320 -77
- data/tests/test-suite.rb +51 -4
- metadata +4 -4
data/Changelog
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
v0.1.5 : add CrawlerCache for basic cache management (previous result only), several fixes
|
1
2
|
v0.1.4 : fix 0.1.3 error on index load, add filter_position management in mt
|
2
3
|
v0.1.3 : 09/11 Olivier Sallou
|
3
4
|
add CrawlerMT in cassiopee-mt for multi thread support to speed up the search
|
data/lib/cassiopee-mt.rb
CHANGED
@@ -34,6 +34,7 @@ module CassiopeeMt
|
|
34
34
|
crawler.setLogLevel($log.level)
|
35
35
|
crawler.file_suffix = @file_suffix
|
36
36
|
crawler.loadIndex()
|
37
|
+
crawler.method = method
|
37
38
|
#crawler.file_suffix = @file_suffix+"."+threadId.to_s
|
38
39
|
end
|
39
40
|
|
@@ -55,16 +56,16 @@ module CassiopeeMt
|
|
55
56
|
end
|
56
57
|
nb = len.div(maxthread)
|
57
58
|
(1..maxthread).each do |i|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
59
|
+
crawler = Crawler.new
|
60
|
+
setParams(crawler,i)
|
61
|
+
curmax = min + nb
|
62
|
+
if(i==maxthread)
|
63
|
+
curmax = max
|
64
|
+
end
|
65
|
+
crawler.filter_position(min,curmax)
|
66
|
+
$log.debug("Start new Thread between " << min.to_s << " and " << curmax.to_s)
|
67
|
+
@th[i-1] = Thread.new{ Thread.current["matches"] = crawler.searchExact(pattern) }
|
68
|
+
min = curmax + 1
|
68
69
|
end
|
69
70
|
@th.each {|t| t.join; t["matches"].each { |m| @matches << m }}
|
70
71
|
return @matches
|
data/lib/cassiopee.rb
CHANGED
@@ -34,7 +34,6 @@ module Cassiopee
|
|
34
34
|
# Compute Hamming distance but using a mapping matrix of alphabet ambiguity
|
35
35
|
|
36
36
|
def computeHammingAmbiguous(pattern,hamming,ambiguous)
|
37
|
-
pattern = pattern.downcase
|
38
37
|
nberr = 0
|
39
38
|
(0..(self.length-1)).each do |c|
|
40
39
|
if(!isAmbiguousEqual(pattern[c],self[c],ambiguous))
|
@@ -52,7 +51,6 @@ module Cassiopee
|
|
52
51
|
# Return -1 if max is reached
|
53
52
|
|
54
53
|
def computeHamming(pattern,hamming)
|
55
|
-
pattern = pattern.downcase
|
56
54
|
nberr = 0
|
57
55
|
(0..(self.length-1)).each do |c|
|
58
56
|
if(pattern[c] != self[c])
|
@@ -72,11 +70,9 @@ module Cassiopee
|
|
72
70
|
# Return -1 if max is reached
|
73
71
|
|
74
72
|
def computeLevenshtein(pattern,edit)
|
75
|
-
pattern = pattern.downcase
|
76
73
|
|
77
74
|
distance = Text::Levenshtein.distance(self, pattern)
|
78
75
|
|
79
|
-
|
80
76
|
if(distance>edit)
|
81
77
|
return -1
|
82
78
|
end
|
@@ -91,44 +87,42 @@ module Cassiopee
|
|
91
87
|
|
92
88
|
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
|
93
89
|
|
94
|
-
|
95
|
-
encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
|
90
|
+
encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
|
96
91
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
92
|
+
if Text.encoding_of(self) =~ /^U/i
|
93
|
+
unpack_rule = 'U*'
|
94
|
+
else
|
95
|
+
unpack_rule = 'C*'
|
96
|
+
end
|
102
97
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
98
|
+
s = self.unpack(unpack_rule)
|
99
|
+
t = pattern.unpack(unpack_rule)
|
100
|
+
n = s.length
|
101
|
+
m = t.length
|
102
|
+
return m if (0 == n)
|
103
|
+
return n if (0 == m)
|
109
104
|
|
110
|
-
|
111
|
-
|
105
|
+
d = (0..m).to_a
|
106
|
+
x = nil
|
112
107
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
return x
|
108
|
+
(0...n).each do |i|
|
109
|
+
e = i+1
|
110
|
+
(0...m).each do |j|
|
111
|
+
cost = (isAmbiguousEqual(s[i],t[j],ambiguous)) ? 0 : 1
|
112
|
+
x = [
|
113
|
+
d[j+1] + 1, # insertion
|
114
|
+
e + 1, # deletion
|
115
|
+
d[j] + cost # substitution
|
116
|
+
].min
|
117
|
+
d[j] = e
|
118
|
+
e = x
|
119
|
+
end
|
120
|
+
d[m] = x
|
121
|
+
end
|
122
|
+
if(x>edit)
|
123
|
+
return -1
|
124
|
+
end
|
125
|
+
return x
|
132
126
|
end
|
133
127
|
|
134
128
|
|
@@ -150,6 +144,106 @@ module Cassiopee
|
|
150
144
|
end
|
151
145
|
end
|
152
146
|
|
147
|
+
# Class managing cache of results
|
148
|
+
|
149
|
+
class CrawlerCache
|
150
|
+
|
151
|
+
FILE_CACHE_EXT = ".sfc"
|
152
|
+
|
153
|
+
# Suffix files name/path
|
154
|
+
attr_accessor :file_suffix
|
155
|
+
|
156
|
+
# search exact: 0
|
157
|
+
# hamming : 1
|
158
|
+
# edit : 2
|
159
|
+
attr_accessor :method
|
160
|
+
|
161
|
+
# filter
|
162
|
+
attr_accessor :min_position
|
163
|
+
attr_accessor :max_position
|
164
|
+
|
165
|
+
# max errors
|
166
|
+
attr_accessor :errors
|
167
|
+
|
168
|
+
attr_accessor :cache
|
169
|
+
|
170
|
+
$log = Logger.new(STDOUT)
|
171
|
+
$log.level = Logger::INFO
|
172
|
+
|
173
|
+
def setLogger(userlogger)
|
174
|
+
$log = userlogger
|
175
|
+
end
|
176
|
+
|
177
|
+
def initialize
|
178
|
+
@file_suffix = "crawler"
|
179
|
+
end
|
180
|
+
|
181
|
+
# Loads cache from file
|
182
|
+
def loadCache
|
183
|
+
return Array.new unless File.exists?(@file_suffix+FILE_CACHE_EXT)
|
184
|
+
begin
|
185
|
+
file = Zlib::GzipReader.open(@file_suffix+FILE_CACHE_EXT)
|
186
|
+
rescue Zlib::GzipFile::Error
|
187
|
+
file = File.open(@file_suffix+FILE_CACHE_EXT, 'r')
|
188
|
+
ensure
|
189
|
+
obj = Marshal.load file.read
|
190
|
+
file.close
|
191
|
+
if(method!=obj.method || min_position<obj.min_position || max_position>obj.max_position || errors>obj.errors)
|
192
|
+
return Array.new
|
193
|
+
end
|
194
|
+
return filterCache(obj)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
# Save self to cache, with cache object set from obj
|
199
|
+
def saveCache(obj)
|
200
|
+
self.cache = obj
|
201
|
+
marshal_dump = Marshal.dump(self)
|
202
|
+
sfxpos = File.new(@file_suffix+FILE_CACHE_EXT,'w')
|
203
|
+
sfxpos = Zlib::GzipWriter.new(sfxpos)
|
204
|
+
sfxpos.write marshal_dump
|
205
|
+
sfxpos.close
|
206
|
+
end
|
207
|
+
|
208
|
+
def clearCache
|
209
|
+
File.delete(@file_suffix+FILE_CACHE_EXT) unless !File.exists?(@file_suffix+FILE_CACHE_EXT)
|
210
|
+
end
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
# filter cache according to settings
|
215
|
+
# obj: cache object
|
216
|
+
def filterCache(cacheobject)
|
217
|
+
|
218
|
+
realmatches = Array.new
|
219
|
+
if(cacheobject==nil)
|
220
|
+
return realmatches
|
221
|
+
end
|
222
|
+
|
223
|
+
cacheobject.cache.each do |obj|
|
224
|
+
if(obj[1]>self.errors)
|
225
|
+
next
|
226
|
+
end
|
227
|
+
realpos = Array.new
|
228
|
+
realpos << obj[2][0]
|
229
|
+
(1..obj[2].length-1).each do |i|
|
230
|
+
curpos= obj[2][i]
|
231
|
+
if((curpos<=max_position || max_position==0) && curpos>=min_position)
|
232
|
+
realpos << curpos
|
233
|
+
end
|
234
|
+
end
|
235
|
+
if(realpos.length<=1)
|
236
|
+
next
|
237
|
+
end
|
238
|
+
realmatches << Array[obj[0],obj[1],realpos]
|
239
|
+
|
240
|
+
end
|
241
|
+
return realmatches
|
242
|
+
end
|
243
|
+
|
244
|
+
end
|
245
|
+
|
246
|
+
|
153
247
|
# Base class to index and search through a string
|
154
248
|
|
155
249
|
class Crawler
|
@@ -164,11 +258,30 @@ module Cassiopee
|
|
164
258
|
attr_accessor :use_store
|
165
259
|
# Array of comment characters to skip lines in input sequence file
|
166
260
|
attr_accessor :comments
|
261
|
+
|
262
|
+
# Manage basic cache to store previous match
|
263
|
+
attr_accessor :useCache
|
167
264
|
|
265
|
+
# Search method: FORCE (METHOD_DIRECT) or SUFFIX (METHOD_SUFFIX)
|
266
|
+
# * SUFFIX loads all suffixes and search through them afterwards, interesting for multiple searches (suffixes are reused)
|
267
|
+
# * FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search
|
268
|
+
# FORCE method does not yet support optimal filters
|
269
|
+
attr_accessor :method
|
270
|
+
|
271
|
+
METHOD_DIRECT = 0
|
272
|
+
METHOD_SUFFIX = 1
|
273
|
+
|
168
274
|
@min_position = 0
|
169
275
|
@max_position = 0
|
170
276
|
|
277
|
+
# Previous position filter
|
278
|
+
@prev_min_position = 0
|
279
|
+
@prev_max_position = 0
|
280
|
+
|
171
281
|
@ambiguous = nil
|
282
|
+
|
283
|
+
@pattern = nil
|
284
|
+
|
172
285
|
|
173
286
|
FILE_SUFFIX_EXT = ".sfx"
|
174
287
|
FILE_SUFFIX_POS = ".sfp"
|
@@ -177,27 +290,39 @@ module Cassiopee
|
|
177
290
|
|
178
291
|
$maxthread = 1
|
179
292
|
|
293
|
+
@cache = nil
|
294
|
+
|
180
295
|
|
181
296
|
$log = Logger.new(STDOUT)
|
182
297
|
$log.level = Logger::INFO
|
183
298
|
|
184
299
|
def initialize
|
185
300
|
@useAmbiguity = false
|
301
|
+
@useCache = false
|
186
302
|
@file_suffix = "crawler"
|
187
|
-
|
303
|
+
|
304
|
+
@method = 0
|
305
|
+
|
306
|
+
@prev_min_position = 0
|
307
|
+
@prev_max_position = 0
|
308
|
+
|
309
|
+
|
188
310
|
@suffix = nil
|
189
311
|
@suffixmd5 = nil
|
190
312
|
@position = 0
|
191
313
|
|
192
314
|
@suffixes = Hash.new
|
193
315
|
|
194
|
-
@matches =
|
316
|
+
@matches = Array.new
|
195
317
|
@curmatch = 0
|
196
318
|
@use_store = false
|
197
319
|
|
198
320
|
@sequence = nil
|
199
321
|
|
200
322
|
@comments = Array["#"]
|
323
|
+
|
324
|
+
@cache = Cassiopee::CrawlerCache.new
|
325
|
+
|
201
326
|
end
|
202
327
|
|
203
328
|
def filterLength
|
@@ -213,7 +338,11 @@ module Cassiopee
|
|
213
338
|
|
214
339
|
def clear
|
215
340
|
@suffixes = Hash.new
|
216
|
-
|
341
|
+
@matches.clear
|
342
|
+
@pattern = nil
|
343
|
+
@prev_max_position = 0
|
344
|
+
@prev_min_position = 0
|
345
|
+
@cache.clearCache()
|
217
346
|
File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
|
218
347
|
end
|
219
348
|
|
@@ -300,37 +429,57 @@ module Cassiopee
|
|
300
429
|
# Filter matches to be between min and max start position
|
301
430
|
# If not using use_store, search speed is improved but existing indexes are cleared
|
302
431
|
# If max=0, then max is string length
|
432
|
+
# Must be called after index creation or load
|
303
433
|
|
304
434
|
def filter_position(min,max)
|
305
435
|
if(!use_store)
|
306
436
|
clear()
|
307
437
|
end
|
438
|
+
@prev_min_position = @min_position
|
439
|
+
@prev_max_position = @max_position
|
308
440
|
@min_position = min
|
309
441
|
@max_position = max
|
310
442
|
end
|
311
443
|
|
312
444
|
# Search exact match
|
313
445
|
|
314
|
-
def searchExact(
|
446
|
+
def searchExact(s)
|
447
|
+
|
315
448
|
if(@useAmbiguity)
|
316
|
-
return searchApproximate(
|
449
|
+
return searchApproximate(s,0)
|
450
|
+
end
|
451
|
+
|
452
|
+
s = s.downcase
|
453
|
+
|
454
|
+
updateCache(0,0)
|
455
|
+
@matches = @cache.loadCache()
|
456
|
+
|
457
|
+
if(@matches.length>0)
|
458
|
+
return cache?(@matches)
|
317
459
|
end
|
318
|
-
|
319
|
-
|
460
|
+
|
461
|
+
#@matches.clear
|
462
|
+
|
463
|
+
@pattern = Digest::MD5.hexdigest(s)
|
464
|
+
|
465
|
+
parseSuffixes(@sequence,s.length,s.length,0,s)
|
320
466
|
|
321
|
-
|
467
|
+
return @matches unless(method == METHOD_SUFFIX)
|
468
|
+
|
322
469
|
# Search required length, compare (compare md5?)
|
323
470
|
# MD5 = 128 bits, easier to compare for large strings
|
324
|
-
|
325
|
-
|
471
|
+
|
472
|
+
|
473
|
+
matchsize = @pattern.length
|
474
|
+
|
326
475
|
@suffixes.each do |md5val,posArray|
|
327
|
-
if (md5val
|
476
|
+
if (isMatchEqual?(md5val))
|
328
477
|
match = Array[md5val, 0, posArray]
|
329
478
|
$log.debug "Match: " << match.inspect
|
330
479
|
@matches << match
|
331
480
|
end
|
332
481
|
end
|
333
|
-
return @matches
|
482
|
+
return cache?(@matches)
|
334
483
|
|
335
484
|
end
|
336
485
|
|
@@ -342,32 +491,48 @@ module Cassiopee
|
|
342
491
|
|
343
492
|
|
344
493
|
def searchApproximate(s,edit)
|
494
|
+
|
345
495
|
if(edit==0 && !@useAmbiguity)
|
346
496
|
return searchExact(s)
|
347
497
|
end
|
348
|
-
|
498
|
+
allowederrors = edit
|
349
499
|
if(edit>=0)
|
350
500
|
useHamming = true
|
351
501
|
minmatchsize = s.length
|
352
502
|
maxmatchsize = s.length
|
503
|
+
updateCache(1,edit)
|
504
|
+
@matches = @cache.loadCache()
|
353
505
|
else
|
354
506
|
useHamming = false
|
355
507
|
edit = edit * (-1)
|
356
508
|
minmatchsize = s.length - edit
|
357
509
|
maxmatchsize = s.length + edit
|
510
|
+
updateCache(2,edit)
|
511
|
+
@matches = @cache.loadCache()
|
358
512
|
end
|
513
|
+
|
514
|
+
if(@matches.length>0)
|
515
|
+
return @matches
|
516
|
+
end
|
517
|
+
|
518
|
+
s = s.downcase
|
359
519
|
|
360
|
-
|
520
|
+
|
521
|
+
#@matches.clear
|
522
|
+
@pattern = Digest::MD5.hexdigest(s)
|
523
|
+
|
524
|
+
parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s)
|
361
525
|
|
362
|
-
|
526
|
+
return cache?(@matches) unless(method == METHOD_SUFFIX)
|
527
|
+
|
528
|
+
|
363
529
|
|
364
|
-
|
365
|
-
|
530
|
+
|
366
531
|
@suffixes.each do |md5val,posArray|
|
367
532
|
if(md5val == SUFFIXLEN)
|
368
533
|
next
|
369
534
|
end
|
370
|
-
if (md5val ==
|
535
|
+
if (md5val == @pattern)
|
371
536
|
filteredPosArray = filter(posArray)
|
372
537
|
match = Array[md5val, 0, filteredPosArray]
|
373
538
|
$log.debug "Match: " << match.inspect
|
@@ -376,20 +541,8 @@ module Cassiopee
|
|
376
541
|
if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize)
|
377
542
|
# Get string
|
378
543
|
seq = extractSuffix(posArray[1],posArray[0])
|
379
|
-
|
380
|
-
|
381
|
-
if(@useAmbiguity && @ambiguous!=nil)
|
382
|
-
errors = seq.computeHammingAmbiguous(s,edit,@ambiguous)
|
383
|
-
else
|
384
|
-
errors = seq.computeHamming(s,edit)
|
385
|
-
end
|
386
|
-
else
|
387
|
-
if(@useAmbiguity && @ambigous!=nil)
|
388
|
-
errors = seq.computeLevenshteinAmbiguous(s,edit,@ambigous)
|
389
|
-
else
|
390
|
-
errors = seq.computeLevenshtein(s,edit)
|
391
|
-
end
|
392
|
-
end
|
544
|
+
errors = isApproximateEqual?(seq,s,useHamming,edit)
|
545
|
+
|
393
546
|
if(errors>=0)
|
394
547
|
filteredPosArray = filter(posArray)
|
395
548
|
match = Array[md5val, errors, filteredPosArray]
|
@@ -401,7 +554,7 @@ module Cassiopee
|
|
401
554
|
|
402
555
|
end
|
403
556
|
|
404
|
-
return @matches
|
557
|
+
return cache?(@matches)
|
405
558
|
end
|
406
559
|
|
407
560
|
# Filter the array of positions with defined position filter
|
@@ -488,14 +641,68 @@ module Cassiopee
|
|
488
641
|
end
|
489
642
|
|
490
643
|
private
|
644
|
+
|
645
|
+
# If cache is used, store results for later retrieval, else return matches directly
|
646
|
+
def cache?(results)
|
647
|
+
if(@useCache)
|
648
|
+
@cache.saveCache(results)
|
649
|
+
end
|
650
|
+
|
651
|
+
return results
|
652
|
+
end
|
653
|
+
|
654
|
+
# Update cache object with current object parameters
|
655
|
+
# * method: 0 -> exact, 1 -> hamming, 2 -> edit
|
656
|
+
def updateCache(method,errors)
|
657
|
+
@cache.file_suffix = @file_suffix
|
658
|
+
@cache.min_position = @min_position
|
659
|
+
@cache.max_position = @max_position
|
660
|
+
@cache.method = method
|
661
|
+
@cache.errors = errors
|
662
|
+
end
|
663
|
+
|
664
|
+
|
665
|
+
# check if md5 is equal to pattern
|
666
|
+
def isMatchEqual?(s)
|
667
|
+
if(@pattern == s)
|
668
|
+
return true
|
669
|
+
end
|
670
|
+
return false
|
671
|
+
end
|
672
|
+
|
673
|
+
# check if string is approximately equal to pattern
|
674
|
+
# s: string to compare
|
675
|
+
# pattern: base pattern used
|
676
|
+
# useHamming: use Hamming or edit distance
|
677
|
+
# edit : allowed errors
|
678
|
+
def isApproximateEqual?(s,pattern,useHamming,edit)
|
679
|
+
errors = -1
|
680
|
+
s.extend(Cassiopee)
|
681
|
+
if(useHamming)
|
682
|
+
if(@useAmbiguity && @ambiguous!=nil)
|
683
|
+
errors = s.computeHammingAmbiguous(pattern,edit,@ambiguous)
|
684
|
+
else
|
685
|
+
errors = s.computeHamming(pattern,edit)
|
686
|
+
end
|
687
|
+
else
|
688
|
+
if(@useAmbiguity && @ambiguous!=nil)
|
689
|
+
errors = s.computeLevenshteinAmbiguous(pattern,edit,@ambigous)
|
690
|
+
else
|
691
|
+
errors = s.computeLevenshtein(pattern,edit)
|
692
|
+
end
|
693
|
+
end
|
694
|
+
end
|
695
|
+
|
696
|
+
|
697
|
+
|
491
698
|
|
492
699
|
# Parse input string
|
493
700
|
#
|
494
701
|
# * creates a suffix file
|
495
702
|
# * creates a suffix position file
|
496
703
|
|
497
|
-
def parseSuffixes(s,minlen,maxlen)
|
498
|
-
|
704
|
+
def parseSuffixes(s,minlen,maxlen,edit=0,pat=nil)
|
705
|
+
|
499
706
|
# Controls
|
500
707
|
if(minlen<=0)
|
501
708
|
minlen = 1
|
@@ -554,6 +761,7 @@ module Cassiopee
|
|
554
761
|
next
|
555
762
|
end
|
556
763
|
changed = true
|
764
|
+
prev_progress = -1
|
557
765
|
(minpos..(maxpos)).each do |j|
|
558
766
|
# if position+length longer than sequence length, skip it
|
559
767
|
if(j+i>=@sequence.length)
|
@@ -562,10 +770,46 @@ module Cassiopee
|
|
562
770
|
@suffix = s[j,i]
|
563
771
|
@suffixmd5 = Digest::MD5.hexdigest(@suffix)
|
564
772
|
@position = j
|
565
|
-
|
566
|
-
|
773
|
+
progress = (@position * 100).div(@sequence.length)
|
774
|
+
if((progress % 10) == 0 && progress > prev_progress)
|
775
|
+
prev_progress = progress
|
776
|
+
$log.debug("progress: " << progress.to_s)
|
777
|
+
end
|
778
|
+
|
779
|
+
if(method==METHOD_DIRECT)
|
780
|
+
|
781
|
+
if(edit==0 && !@useAmbiguity)
|
782
|
+
if(isMatchEqual?(@suffixmd5))
|
783
|
+
errors = 0
|
784
|
+
else
|
785
|
+
errors = -1
|
786
|
+
end
|
787
|
+
else
|
788
|
+
|
789
|
+
if(edit>=0)
|
790
|
+
useHamming = true
|
791
|
+
allowederrors = edit
|
792
|
+
else
|
793
|
+
useHamming = false
|
794
|
+
allowederrors = edit * (-1)
|
795
|
+
end
|
796
|
+
errors = isApproximateEqual?(@suffix,pat,useHamming,allowederrors)
|
797
|
+
end
|
798
|
+
|
799
|
+
|
800
|
+
if(errors>=0)
|
801
|
+
match = Array[@suffixmd5, errors, Array[i,j]]
|
802
|
+
$log.debug "Match: " << match.inspect
|
803
|
+
@matches << match
|
804
|
+
end
|
805
|
+
|
806
|
+
|
807
|
+
|
808
|
+
else
|
809
|
+
nbSuffix += addSuffix(@suffixmd5, @position,i)
|
810
|
+
end
|
567
811
|
end
|
568
|
-
$log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s)
|
812
|
+
$log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s) unless method==METHOD_DIRECT
|
569
813
|
end
|
570
814
|
|
571
815
|
|
@@ -615,7 +859,6 @@ module Cassiopee
|
|
615
859
|
input = line.downcase.chomp
|
616
860
|
skip = false
|
617
861
|
comments.each do |c|
|
618
|
-
$log.debug("skip line ?" << c << " == " << input[0])
|
619
862
|
if(input[0] == c[0])
|
620
863
|
# Line start with a comment char, skip it
|
621
864
|
$log.debug("skip line")
|
data/tests/test-suite.rb
CHANGED
@@ -9,13 +9,13 @@ class TestCrawler < Test::Unit::TestCase
|
|
9
9
|
|
10
10
|
def test_exactsearch
|
11
11
|
crawler = Cassiopee::Crawler.new
|
12
|
-
crawler.setLogLevel(Logger::
|
12
|
+
#crawler.setLogLevel(Logger::DEBUG)
|
13
13
|
crawler.indexString('my sample example')
|
14
14
|
matches = crawler.searchExact('ampl')
|
15
|
-
assert_equal(
|
15
|
+
assert_equal(2,matches.length)
|
16
16
|
# Minus 1, because first element is len of match
|
17
|
-
match = crawler.next()
|
18
|
-
assert_equal(2,match[2].length-1)
|
17
|
+
#match = crawler.next()
|
18
|
+
#assert_equal(2,match[2].length-1)
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_ambiguous
|
@@ -41,6 +41,23 @@ class TestCrawler < Test::Unit::TestCase
|
|
41
41
|
assert_equal(1,matches.length)
|
42
42
|
end
|
43
43
|
|
44
|
+
def test_directmethod
|
45
|
+
crawler = Cassiopee::Crawler.new
|
46
|
+
crawler.method = Cassiopee::Crawler::METHOD_DIRECT
|
47
|
+
crawler.indexString('my sample example')
|
48
|
+
matches = crawler.searchApproximate('ebampl',1)
|
49
|
+
assert_equal(1,matches.length)
|
50
|
+
end
|
51
|
+
|
52
|
+
|
53
|
+
def test_suffixmethod
|
54
|
+
crawler = Cassiopee::Crawler.new
|
55
|
+
crawler.method = Cassiopee::Crawler::METHOD_SUFFIX
|
56
|
+
crawler.indexString('my sample example')
|
57
|
+
matches = crawler.searchApproximate('ebampl',1)
|
58
|
+
assert_equal(1,matches.length)
|
59
|
+
end
|
60
|
+
|
44
61
|
def test_multithreadsearch
|
45
62
|
crawler = CassiopeeMt::CrawlerMt.new
|
46
63
|
crawler.maxthread=3
|
@@ -49,6 +66,36 @@ class TestCrawler < Test::Unit::TestCase
|
|
49
66
|
assert_equal(1,matches.length)
|
50
67
|
end
|
51
68
|
|
69
|
+
def test_cache
|
70
|
+
|
71
|
+
crawler = Cassiopee::Crawler.new
|
72
|
+
crawler.indexString('my sample example')
|
73
|
+
matches = crawler.searchApproximate('ebampl',-1)
|
74
|
+
|
75
|
+
cache = Cassiopee::CrawlerCache.new
|
76
|
+
cache.method = 2
|
77
|
+
cache.min_position = 0
|
78
|
+
cache.max_position = 0
|
79
|
+
cache.errors = 1
|
80
|
+
cache.saveCache(matches)
|
81
|
+
|
82
|
+
cache = Cassiopee::CrawlerCache.new
|
83
|
+
cache.method = 2
|
84
|
+
cache.min_position = 0
|
85
|
+
cache.max_position = 0
|
86
|
+
cache.errors = 1
|
87
|
+
cachematches = cache.loadCache
|
88
|
+
assert_equal(1,cachematches.length)
|
89
|
+
|
90
|
+
cache = Cassiopee::CrawlerCache.new
|
91
|
+
cache.method = 2
|
92
|
+
cache.min_position = 0
|
93
|
+
cache.max_position = 0
|
94
|
+
cache.errors = 2
|
95
|
+
cachematches = cache.loadCache
|
96
|
+
assert_equal(0,cachematches.length)
|
97
|
+
|
98
|
+
end
|
52
99
|
end
|
53
100
|
|
54
101
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassiopee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 5
|
10
|
+
version: 0.1.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Olivier Sallou
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-09-
|
18
|
+
date: 2011-09-20 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|