cassiopee 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog +1 -0
- data/LICENSE +1 -0
- data/README +2 -0
- data/demo.rb +31 -0
- data/lib/cassiopee.rb +411 -0
- data/tests/test-suite.rb +38 -0
- metadata +72 -0
data/Changelog
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.1.0 : First version
|
data/LICENSE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Software is distributed under LGPL v3
|
data/README
ADDED
data/demo.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'lib/cassiopee')
|
2
|
+
require 'rubygems'
|
3
|
+
require 'logger'
|
4
|
+
|
5
|
+
# Build a crawler with its default settings
crawler = Cassiopee::Crawler.new
#crawler.use_store = true

# Index the text to search in
crawler.indexString('sallou sallu')
# Look for an exact pattern, then run the very same search a second time
# against the data that is already indexed
2.times { crawler.searchExact('llo') }


test = "my string"
# Mix the distance helpers into a plain String
test.extend(Cassiopee)
test.computeDistance('test', 0, 0)
puts "Hamming: #{test.computeHamming("my strigg", 1)}"

puts "Levenshtein: #{test.computeLevenshtein("mystriigg", 3)}"

# Approximate search; a negative value selects the edit distance (here 1)
crawler.searchApproximate("llu", -1)

# Walk through the matches until the crawler has none left
until (match = crawler.next).nil?
  puts "got a match #{match.inspect}"
end
|
data/lib/cassiopee.rb
ADDED
@@ -0,0 +1,411 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'logger'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
module Cassiopee
|
6
|
+
|
7
|
+
# Compute the distance between the receiver (a String extended with this
# module) and pattern, picking the metric from the edit argument.
#
# * edit equal to 0: substitutions only, delegated to
#   computeHamming(pattern, hamming)
# * any other edit: edit distance, delegated to
#   computeLevenshtein(pattern, edit)
#
# Returns the value of the selected routine (-1 if max is reached).

def computeDistance(pattern,hamming,edit)
  return computeLevenshtein(pattern, edit) unless edit == 0

  computeHamming(pattern, hamming)
end
|
18
|
+
|
19
|
+
# Calculate the number of substitutions between the receiver (a String
# extended with this module) and pattern.
#
# The comparison is case-insensitive: both the receiver and the pattern are
# downcased before being compared. When the two strings do not have the same
# length, every position that exists in only one of them counts as an error.
#
# * pattern: String to compare the receiver with
# * hamming: maximum number of errors allowed (converted with to_i)
#
# Returns the number of differing positions, or -1 as soon as that number
# exceeds hamming.

def computeHamming(pattern,hamming)
  max_errors = hamming.to_i
  text = self.downcase
  pattern = pattern.downcase
  nberr = 0
  # Walk the longest of the two strings: an index past the end of the
  # shorter one yields nil there, which never equals a character.
  longest = [text.length, pattern.length].max
  (0...longest).each do |idx|
    next if pattern[idx] == text[idx]
    nberr += 1
    return -1 if nberr > max_errors
  end
  nberr
end
|
36
|
+
|
37
|
+
# Calculate the edit distance (insertions, deletions, substitutions) between
# the receiver (a String extended with this module) and pattern.
#
# The comparison is case-insensitive: both the receiver and the pattern are
# downcased before being compared.
#
# * pattern: String to compare the receiver with
# * edit: maximum distance allowed
#
# Returns the edit distance, or -1 if it is greater than edit.

def computeLevenshtein(pattern,edit)
  text = self.downcase
  pattern = pattern.downcase
  # Only two rows of the dynamic-programming matrix are kept. Rows follow the
  # characters of the receiver, columns follow the characters of the pattern.
  previous = (0..pattern.length).to_a
  (1..text.length).each do |i|
    current = [i]
    (1..pattern.length).each do |j|
      # Bellman's principle of optimality: the best cost of a cell is built
      # from the best costs of its three neighbours.
      # str[index, 1] is used so that a one-character String is compared on
      # every Ruby version.
      cost = (text[i - 1, 1] == pattern[j - 1, 1]) ? 0 : 1
      substitution = previous[j - 1] + cost
      deletion = previous[j] + 1
      insertion = current[j - 1] + 1
      current << [substitution, deletion, insertion].min
    end
    previous = current
  end
  distance = previous[pattern.length]
  return -1 if distance > edit
  distance
end
|
80
|
+
|
81
|
+
# Base class to index and search through a string
#
# The indexed sequence is written to a suffix file (file_suffix + ".sfx").
# Substrings of the lengths required by a search are registered in a Hash
# keyed by the MD5 hex digest of the substring; the value is an Array whose
# first element is the substring length, followed by every position where
# the substring occurs. The special key SUFFIXLEN lists the lengths that
# have already been parsed.
#
# A match is an Array [md5, number of errors, position array].

class Crawler

  # Use alphabet ambiguity (dna/rna) in search
  attr_accessor :useAmbiguity
  # Suffix files name/path
  attr_accessor :file_suffix
  # Max number of threads to use (not yet used)
  attr_accessor :maxthread
  # Use persistent suffix file ?
  attr_accessor :use_store

  FILE_SUFFIX_EXT = ".sfx"
  FILE_SUFFIX_POS = ".sfp"

  SUFFIXLEN = 'suffix_length'

  $maxthread = 1

  $log = Logger.new(STDOUT)
  $log.level = Logger::DEBUG

  def initialize
    @useAmbiguity = false
    @file_suffix = "crawler"

    @suffix = nil
    @suffixmd5 = nil
    @position = 0

    @suffixes = Hash.new

    @matches = nil
    @curmatch = 0
    @use_store = false

    @sequence = nil
  end

  # Clear suffixes in memory

  def clear
    @suffixes = Hash.new
  end

  # Set Logger level (the logger is global, so this applies to every crawler)

  def setLogLevel(level)
    $log.level = level
  end

  # Index an input file
  #
  # The content of the file is downcased and its lines are concatenated.
  # Suffixes kept in memory for a previously indexed sequence are dropped.
  # Returns the indexed sequence.

  def indexFile(f)
    # Parse file, map letters to reduced alphabet
    # Later on, use binary map instead of ascii map
    # Take all suffix, order by length, link to position map on other file
    # Store md5 for easier compare? + 20 bytes per suffix
    clear
    @sequence = readSequence(f)
  end

  # Index an input string
  #
  # The string is downcased, like the content read by indexFile and like the
  # searched patterns, and written to the suffix file. Suffixes kept in
  # memory for a previously indexed sequence are dropped.

  def indexString(s)
    clear
    @sequence = s.downcase
    File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
      data.puts(@sequence)
    end
  end

  # Search exact match
  #
  # Returns the Array of matches (empty when the pattern is not found) and
  # restarts the iteration done with next.

  def searchExact(pattern)
    pattern = pattern.downcase
    parseSuffixes(@sequence,pattern.length,pattern.length)

    @matches = Array.new
    @curmatch = 0
    # Suffixes are keyed by their MD5 digest, so a direct lookup is enough.
    # MD5 = 128 bits, easier to compare for large strings
    matchmd5 = Digest::MD5.hexdigest(pattern)
    posArray = @suffixes[matchmd5]
    recordMatch(matchmd5, 0, posArray) unless posArray.nil?
    return @matches
  end

  # Search an approximate string
  #
  # * support insertion, deletion, substitution
  # * If edit > 0, use Hamming
  # * Else use Levenshtein (with the absolute value of edit)
  # * If edit equals 0, behaves like searchExact
  #
  # Returns the Array of matches and restarts the iteration done with next.

  def searchApproximate(s,edit)
    return searchExact(s) if edit==0

    s = s.downcase
    useHamming = edit>0
    if useHamming
      minmatchsize = s.length
      maxmatchsize = s.length
    else
      edit = edit * (-1)
      minmatchsize = s.length - edit
      maxmatchsize = s.length + edit
    end

    parseSuffixes(@sequence,minmatchsize,maxmatchsize)

    matchmd5 = Digest::MD5.hexdigest(s)

    @matches = Array.new
    @curmatch = 0

    @suffixes.each do |md5val,posArray|
      # Entry listing the parsed lengths, not a suffix
      next if md5val == SUFFIXLEN

      if md5val == matchmd5
        recordMatch(md5val, 0, posArray)
        next
      end

      # posArray[0] is the suffix length, posArray[1] its first position
      next unless posArray[0] >= minmatchsize && posArray[0] <= maxmatchsize

      # Get string
      seq = extractSuffix(posArray[1],posArray[0])
      # Nothing to compare when the suffix could not be read back
      next if seq.nil?
      seq.extend(Cassiopee)
      if useHamming
        errors = seq.computeHamming(s,edit)
      else
        errors = seq.computeLevenshtein(s,edit)
      end
      recordMatch(md5val, errors, posArray) if errors >= 0
    end

    return @matches
  end

  # Extract a suffix from the suffix file
  #
  # * start: position in the suffix file
  # * len: number of bytes to read
  #
  # Returns the String read, or nil if the file could not be read.

  def extractSuffix(start,len)
    # Block form closes the file even when seeking or reading fails
    File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
      file.pos = start
      file.read(len)
    end
  rescue => err
    puts "Exception: #{err}"
    nil
  end

  # Iterates over matches
  #
  # Returns the next match of the last search, or nil when there is no match
  # left (or no search was run yet); the iteration then restarts from the
  # first match.

  def next
    if @matches && @curmatch < @matches.length
      @curmatch = @curmatch + 1
      return @matches[@curmatch-1]
    end
    @curmatch = 0
    return nil
  end

  # Returns a short description holding the number of matches of the last search

  def to_s
    count = @matches.nil? ? 0 : @matches.length
    "{ matches: \"#{count}\" }"
  end

  private

  # Register a match and trace it

  def recordMatch(md5val, errors, posArray)
    match = Array[md5val, errors, posArray]
    $log.debug "Match: #{match.inspect}"
    @matches << match
  end

  # Tell whether every length between minlen and maxlen is listed in suffixlen

  def lengthsLoaded?(suffixlen, minlen, maxlen)
    return false if suffixlen.nil? || suffixlen.empty?
    (minlen..maxlen).all? { |len| suffixlen.include?(len) }
  end

  # Parse input string
  #
  # * registers every substring of s whose length is between minlen and
  #   maxlen (lengths already parsed are skipped)
  # * if use_store is set, loads the suffix position file when the
  #   requested lengths are not in memory, and rewrites it when new
  #   suffixes were added

  def parseSuffixes(s,minlen,maxlen)

    # Controls
    minlen = 1 if minlen <= 0
    maxlen = @sequence.length if maxlen > @sequence.length

    $log.info('Start indexing')
    @suffixes ||= Hash.new
    # Hash in memory already contain suffixes for searched lengths
    suffixlen = @suffixes[SUFFIXLEN]
    loaded = lengthsLoaded?(suffixlen, minlen, maxlen)

    if @use_store
      if loaded
        $log.debug('already in memory, skip file loading')
      else
        # If not already in memory
        @suffixes = loadSuffixes(@file_suffix+FILE_SUFFIX_POS)
        suffixlen = @suffixes[SUFFIXLEN]
      end
    end

    nbSuffix = 0
    changed = false

    # Load suffix between maxlen and minlen
    maxlen.downto(minlen) do |i|
      $log.debug("parse for length #{i}")
      if suffixlen && suffixlen.include?(i)
        $log.debug("length #{i} already parsed")
        next
      end
      changed = true
      # The last window of length i starts at s.length - i
      (0..(s.length-i)).each do |j|
        @suffix = s[j,i]
        @suffixmd5 = Digest::MD5.hexdigest(@suffix)
        @position = j
        nbSuffix += addSuffix(@suffixmd5, @position,i)
      end
      $log.debug("Nb suffix found: #{nbSuffix} for length #{i}")
    end

    if @use_store && changed
      $log.info("Store suffixes")
      # GzipWriter.open closes the underlying file at the end of the block
      Zlib::GzipWriter.open(@file_suffix+FILE_SUFFIX_POS) do |sfxpos|
        sfxpos.write(Marshal.dump(@suffixes))
      end
    end
    $log.info('End of indexing')
  end

  # Add a suffix in Hashmap
  #
  # Returns 1 (one suffix handled).

  def addSuffix(md5val,position,len)
    if @suffixes.has_key?(md5val)
      # Add position
      @suffixes[md5val] << position
    else
      # Add position, write new suffix
      # First elt is size of elt
      @suffixes[md5val] = Array[len, position]
      # Remember each parsed length only once
      lengths = (@suffixes[SUFFIXLEN] ||= Array.new)
      lengths << len unless lengths.include?(len)
    end
    return 1
  end

  # read input file, and concat content
  #
  # Each line is downcased and stripped from its line ending. The same
  # content is written to the suffix file without line separators, so that a
  # position in the sequence is also a position in the suffix file.
  # If the input cannot be read, the error is printed and what was read so
  # far is returned.

  def readSequence(s)
    $log.debug('read input sequence')
    sequence = ''
    begin
      # The input is opened first: a missing input leaves the suffix file untouched
      File.open(s, "r") do |file|
        File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
          file.each_line do |line|
            input = line.downcase.chomp
            sequence << input
            data.print input
          end
        end
      end
    rescue => err
      puts "Exception: #{err}"
    end
    $log.debug('data file created')
    return sequence
  end

  # Load suffix position file in memory
  #
  # Returns an empty Hash when file_name does not exist. A file that is not
  # gzipped is read as a plain file.
  #
  # NOTE: the content is restored with Marshal.load, which must only be used
  # on trusted data; do not point file_suffix to a file of unknown origin.

  def loadSuffixes(file_name)
    return Hash.new unless File.exist?(file_name)
    raw = begin
      Zlib::GzipReader.open(file_name) { |gz| gz.read }
    rescue Zlib::GzipFile::Error
      File.open(file_name, 'rb') { |plain| plain.read }
    end
    Marshal.load(raw)
  end

end
|
410
|
+
|
411
|
+
end
|
data/tests/test-suite.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '../lib/cassiopee')
|
2
|
+
require 'rubygems'
|
3
|
+
require 'logger'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
# Functional checks of Cassiopee::Crawler on a short sample sentence
class TestCrawler < Test::Unit::TestCase

  # An exact search returns a single match entry listing both occurrences
  def test_exactsearch
    searcher = Cassiopee::Crawler.new
    searcher.setLogLevel(Logger::ERROR)
    searcher.indexString('my sample example')
    found = searcher.searchExact('ampl')
    assert_equal(1, found.length)
    first = searcher.next
    # The position array starts with the length of the match, hence the minus 1
    occurrences = first[2].length - 1
    assert_equal(2, occurrences)
  end

  # A positive value searches with substitutions only (Hamming)
  def test_hammingsearch
    searcher = Cassiopee::Crawler.new
    searcher.indexString('my sample example')
    found = searcher.searchApproximate('ebampl', 1)
    assert_equal(1, found.length)
  end

  # A negative value searches with the edit distance (Levenshtein)
  def test_levenshteinsearch
    searcher = Cassiopee::Crawler.new
    searcher.indexString('my sample example')
    found = searcher.searchApproximate('ebampl', -1)
    assert_equal(1, found.length)
  end

end
|
36
|
+
|
37
|
+
|
38
|
+
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cassiopee
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Olivier Sallou
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-09-04 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Cassiopee indexes one String and provides methods to search for exact matches or approximate matches using Hamming and/or edit distance.
|
23
|
+
email: olivier.sallou@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- README
|
32
|
+
- Changelog
|
33
|
+
- LICENSE
|
34
|
+
- demo.rb
|
35
|
+
- lib/cassiopee.rb
|
36
|
+
- tests/test-suite.rb
|
37
|
+
has_rdoc: true
|
38
|
+
homepage: https://github.com/osallou/cassiopee
|
39
|
+
licenses:
|
40
|
+
- LGPL-3
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options:
|
43
|
+
- --main lib/cassiopee.rb
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
none: false
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
hash: 3
|
52
|
+
segments:
|
53
|
+
- 0
|
54
|
+
version: "0"
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 3
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.3.7
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Cassiopee indexes strings and provides exact or approximate search.
|
71
|
+
test_files:
|
72
|
+
- tests/test-suite.rb
|