cassiopee 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +4 -1
- data/README +1 -1
- data/bin/cassie.rb +139 -0
- data/demo.rb +1 -1
- data/lib/cassiopee.rb +78 -40
- metadata +21 -5
data/Changelog
CHANGED
data/README
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
Search an exact or approximate word (hamming or edit distance) in a string.
|
2
|
-
Support cache
|
2
|
+
Support index cache with incremental update for later searches
|
data/bin/cassie.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require File.join(File.dirname(__FILE__), '../lib/cassiopee')
|
3
|
+
require 'optparse'
|
4
|
+
require 'logger'
|
5
|
+
|
6
|
+
options = {}
|
7
|
+
|
8
|
+
optparse = OptionParser.new do|opts|
|
9
|
+
# Set a banner, displayed at the top
|
10
|
+
# of the help screen.
|
11
|
+
opts.banner = "Usage: cassie.rb [options]"
|
12
|
+
|
13
|
+
options[:verbose] = false
|
14
|
+
opts.on( '-v', '--verbose', 'Output more information' ) do
|
15
|
+
options[:verbose] = true
|
16
|
+
end
|
17
|
+
|
18
|
+
options[:filter] = nil
|
19
|
+
opts.on( '-f', '--filter FILTER', 'Filter matches between min and max positions ex. 100-150' ) do |filter|
|
20
|
+
options[:filter] = filter
|
21
|
+
end
|
22
|
+
|
23
|
+
options[:file] = nil
|
24
|
+
opts.on( '-i', '--index FILE', 'File to index' ) do |file|
|
25
|
+
options[:file] = file
|
26
|
+
end
|
27
|
+
|
28
|
+
options[:fpattern] = nil
|
29
|
+
opts.on( '--fpattern FILE', 'File with pattern' ) do |file|
|
30
|
+
options[:fpattern] = file
|
31
|
+
end
|
32
|
+
|
33
|
+
options[:pattern] = nil
|
34
|
+
opts.on( '-p', '--pattern PATTERN', 'Search pattern' ) do |file|
|
35
|
+
options[:pattern] = file
|
36
|
+
end
|
37
|
+
|
38
|
+
options[:store] = nil
|
39
|
+
opts.on( '-s', '--store FILE', 'Store index to file' ) do |file|
|
40
|
+
options[:store] = file
|
41
|
+
end
|
42
|
+
|
43
|
+
options[:name] = nil
|
44
|
+
opts.on( '-n', '--name NAME', 'name of index, default [crawler]' ) do |name|
|
45
|
+
options[:name] = name
|
46
|
+
end
|
47
|
+
|
48
|
+
options[:exact] = false
|
49
|
+
opts.on( '-x', '--exact', 'Do exact search (default)' ) do
|
50
|
+
options[:exact] = true
|
51
|
+
end
|
52
|
+
|
53
|
+
options[:error] = 0
|
54
|
+
opts.on( '-m', '--hamming ERROR', 'Maximum number of error to search with Hamming distance' ) do |error|
|
55
|
+
options[:error] = error
|
56
|
+
end
|
57
|
+
|
58
|
+
opts.on( '-e', '--edit ERROR', 'Maximum number of error to search with edit(levenshtein) distance' ) do |error|
|
59
|
+
options[:error] = error * (-1)
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
opts.on( '-h', '--help', 'Display this screen' ) do
|
64
|
+
puts opts
|
65
|
+
exit
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
optparse.parse!
|
71
|
+
|
72
|
+
if(options[:file]==nil)
|
73
|
+
puts "Error, input file is missing, use -h option for usage"
|
74
|
+
exit
|
75
|
+
elif(options[:verbose])
|
76
|
+
puts "Input sequence: " << options[:file].to_s
|
77
|
+
end
|
78
|
+
|
79
|
+
if(options[:fpattern]==nil && options[:pattern]==nil)
|
80
|
+
puts "Error, pattern is missing, use -h option for usage"
|
81
|
+
exit
|
82
|
+
end
|
83
|
+
|
84
|
+
|
85
|
+
if(options[:error]==0)
|
86
|
+
options[:exact] = true
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
crawler = Cassiopee::Crawler.new
|
91
|
+
crawler.setLogLevel(Logger::INFO)
|
92
|
+
if(options[:store])
|
93
|
+
crawler.use_store = true
|
94
|
+
end
|
95
|
+
if(options[:name]!=nil)
|
96
|
+
crawler.file_suffix = options[:name]
|
97
|
+
end
|
98
|
+
if(options[:filter]!=nil)
|
99
|
+
positions = options[:filter].split('-')
|
100
|
+
crawler.filter_position(positions[0],positions[1])
|
101
|
+
end
|
102
|
+
|
103
|
+
# String to index
|
104
|
+
crawler.indexFile(options[:file])
|
105
|
+
|
106
|
+
matches = nil
|
107
|
+
|
108
|
+
if(options[:fpattern]==nil)
|
109
|
+
pattern = options[:pattern]
|
110
|
+
else
|
111
|
+
pattern = ''
|
112
|
+
file = File.new(options[:fpattern], "r")
|
113
|
+
while (line = file.gets)
|
114
|
+
input = line.downcase.chomp
|
115
|
+
pattern << input
|
116
|
+
end
|
117
|
+
file.close
|
118
|
+
if(pattern.length==0)
|
119
|
+
puts "Error pattern file is empty"
|
120
|
+
exit
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
if(options[:verbose])
|
125
|
+
puts "Search pattern " << pattern
|
126
|
+
end
|
127
|
+
|
128
|
+
if(options[:exact])
|
129
|
+
puts "Search exact" unless !options[:verbose]
|
130
|
+
matches = crawler.searchExact(pattern)
|
131
|
+
else
|
132
|
+
puts "Search approximate" unless !options[:verbose]
|
133
|
+
matches = crawler.searchApproximate(pattern,options[:errors])
|
134
|
+
end
|
135
|
+
|
136
|
+
# Go through matches
|
137
|
+
while((match = crawler.next())!=nil)
|
138
|
+
puts "Match: " << match.inspect
|
139
|
+
end
|
data/demo.rb
CHANGED
@@ -23,7 +23,7 @@ puts "Hamming: " << test.computeHamming("my strigg",1).to_s
|
|
23
23
|
puts "Levenshtein: " << test.computeLevenshtein("mystriigg",3).to_s
|
24
24
|
|
25
25
|
# Approximate search, edit distance = 1
|
26
|
-
crawler.searchApproximate("llu",-
|
26
|
+
crawler.searchApproximate("llu",-2)
|
27
27
|
|
28
28
|
# Go through matches
|
29
29
|
while((match = crawler.next())!=nil)
|
data/lib/cassiopee.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'digest/md5'
|
2
2
|
require 'logger'
|
3
3
|
require 'zlib'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'text'
|
4
6
|
|
5
7
|
module Cassiopee
|
6
8
|
|
@@ -33,6 +35,7 @@ module Cassiopee
|
|
33
35
|
end
|
34
36
|
return nberr
|
35
37
|
end
|
38
|
+
|
36
39
|
|
37
40
|
# Calculate the edit distance between string and pattern
|
38
41
|
# Extend a String
|
@@ -40,41 +43,14 @@ module Cassiopee
|
|
40
43
|
|
41
44
|
def computeLevenshtein(pattern,edit)
|
42
45
|
pattern = pattern.downcase
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
matrix[1][i]=i
|
49
|
-
end
|
50
|
-
c=0
|
51
|
-
p=1
|
52
|
-
(1..(self.length)).each do |i|
|
53
|
-
c = i.modulo(2)
|
54
|
-
p = (i+1).modulo(2)
|
55
|
-
matrix[c][0] = i
|
56
|
-
(1..(pattern.length)).each do |j|
|
57
|
-
# Bellman's principle of optimality
|
58
|
-
weight = 0
|
59
|
-
if(pattern[i-1] != self[j-1])
|
60
|
-
weight = 1
|
61
|
-
end
|
62
|
-
weight = matrix[p][j-1] + weight
|
63
|
-
if(weight > matrix[p][j] +1)
|
64
|
-
weight = matrix[p][j] +1
|
65
|
-
end
|
66
|
-
if(weight > matrix[c][j-1] +1)
|
67
|
-
weight = matrix[c][j-1] +1
|
68
|
-
end
|
69
|
-
matrix[c][j] = weight
|
70
|
-
end
|
71
|
-
end
|
72
|
-
p = c
|
73
|
-
c = (c + 1).modulo(2)
|
74
|
-
if(matrix[p][pattern.length]>edit)
|
46
|
+
|
47
|
+
distance = Text::Levenshtein.distance(self, pattern)
|
48
|
+
|
49
|
+
|
50
|
+
if(distance>edit)
|
75
51
|
return -1
|
76
52
|
end
|
77
|
-
return
|
53
|
+
return distance
|
78
54
|
|
79
55
|
end
|
80
56
|
|
@@ -91,6 +67,9 @@ module Cassiopee
|
|
91
67
|
# Use persistent suffix file ?
|
92
68
|
attr_accessor :use_store
|
93
69
|
|
70
|
+
@min_position = 0
|
71
|
+
@max_position = 0
|
72
|
+
|
94
73
|
FILE_SUFFIX_EXT = ".sfx"
|
95
74
|
FILE_SUFFIX_POS = ".sfp"
|
96
75
|
|
@@ -119,9 +98,11 @@ module Cassiopee
|
|
119
98
|
end
|
120
99
|
|
121
100
|
# Clear suffixes in memory
|
101
|
+
# If using use_store, clear the store too
|
122
102
|
|
123
103
|
def clear
|
124
104
|
@suffixes = Hash.new
|
105
|
+
File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
|
125
106
|
end
|
126
107
|
|
127
108
|
# Set Logger level
|
@@ -131,6 +112,7 @@ module Cassiopee
|
|
131
112
|
end
|
132
113
|
|
133
114
|
# Index an input file
|
115
|
+
# Clear existing indexes
|
134
116
|
|
135
117
|
def indexFile(f)
|
136
118
|
# Parse file, map letters to reduced alphabet
|
@@ -138,18 +120,34 @@ module Cassiopee
|
|
138
120
|
# Take all suffix, order by length, link to position map on other file
|
139
121
|
# Store md5 for easier compare? + 20 bytes per suffix
|
140
122
|
@sequence = readSequence(f)
|
141
|
-
|
123
|
+
clear()
|
124
|
+
@min_position = 0
|
125
|
+
@max_position = 0
|
142
126
|
end
|
143
127
|
|
144
128
|
# Index an input string
|
129
|
+
# Clear existing indexes
|
145
130
|
|
146
131
|
def indexString(s)
|
147
132
|
@sequence = s
|
148
133
|
File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
|
149
134
|
data.puts(@sequence)
|
150
135
|
end
|
151
|
-
|
152
|
-
|
136
|
+
clear()
|
137
|
+
@min_position = 0
|
138
|
+
@max_position = 0
|
139
|
+
end
|
140
|
+
|
141
|
+
# Filter matches to be between min and max start position
|
142
|
+
# If not using use_store, search speed is improved but existing indexes are cleared
|
143
|
+
# If max=0, then max is string length
|
144
|
+
|
145
|
+
def filter_position(min,max)
|
146
|
+
if(!use_store)
|
147
|
+
clear()
|
148
|
+
end
|
149
|
+
@min_position = min
|
150
|
+
@max_position = max
|
153
151
|
end
|
154
152
|
|
155
153
|
# Search exact match
|
@@ -208,7 +206,8 @@ module Cassiopee
|
|
208
206
|
next
|
209
207
|
end
|
210
208
|
if (md5val == matchmd5)
|
211
|
-
|
209
|
+
filteredPosArray = filter(posArray)
|
210
|
+
match = Array[md5val, 0, filteredPosArray]
|
212
211
|
$log.debug "Match: " << match.inspect
|
213
212
|
@matches << match
|
214
213
|
else
|
@@ -222,7 +221,8 @@ module Cassiopee
|
|
222
221
|
errors = seq.computeLevenshtein(s,edit)
|
223
222
|
end
|
224
223
|
if(errors>=0)
|
225
|
-
|
224
|
+
filteredPosArray = filter(posArray)
|
225
|
+
match = Array[md5val, errors, filteredPosArray]
|
226
226
|
$log.debug "Match: " << match.inspect
|
227
227
|
@matches << match
|
228
228
|
end
|
@@ -234,6 +234,29 @@ module Cassiopee
|
|
234
234
|
return @matches
|
235
235
|
end
|
236
236
|
|
237
|
+
# Filter the array of positions with defined position filter
|
238
|
+
|
239
|
+
def filter(posArray)
|
240
|
+
$log.debug("filter the position with " << @min_position.to_s << " and " << @max_position.to_s)
|
241
|
+
if(@min_position==0 && @max_position==0)
|
242
|
+
return posArray
|
243
|
+
end
|
244
|
+
filteredArray = Array.new
|
245
|
+
i = 0
|
246
|
+
posArray.each do |pos|
|
247
|
+
if(i==0)
|
248
|
+
# First element of array is match length
|
249
|
+
filteredArray << pos
|
250
|
+
end
|
251
|
+
if(i>0 && pos>=@min_position && pos<=@max_position)
|
252
|
+
filteredArray << pos
|
253
|
+
end
|
254
|
+
i +=1
|
255
|
+
end
|
256
|
+
return filteredArray
|
257
|
+
end
|
258
|
+
|
259
|
+
|
237
260
|
# Extract a suffix from suffix file based on md5 match
|
238
261
|
|
239
262
|
def extractSuffix(start,len)
|
@@ -283,6 +306,17 @@ module Cassiopee
|
|
283
306
|
maxlen = @sequence.length
|
284
307
|
end
|
285
308
|
|
309
|
+
if(!use_store)
|
310
|
+
minpos = @min_position
|
311
|
+
if(@max_position==0)
|
312
|
+
maxpos = @sequence.length
|
313
|
+
else
|
314
|
+
maxpos = @max_position
|
315
|
+
end
|
316
|
+
else
|
317
|
+
minpos = 0
|
318
|
+
maxpos = @sequence.length - minlen
|
319
|
+
end
|
286
320
|
|
287
321
|
suffixlen = nil
|
288
322
|
$log.info('Start indexing')
|
@@ -322,11 +356,15 @@ module Cassiopee
|
|
322
356
|
next
|
323
357
|
end
|
324
358
|
changed = true
|
325
|
-
(
|
359
|
+
(minpos..(maxpos)).each do |j|
|
360
|
+
# if position+length longer than sequence length, skip it
|
361
|
+
if(j+i>=@sequence.length)
|
362
|
+
next
|
363
|
+
end
|
326
364
|
@suffix = s[j,i]
|
327
365
|
@suffixmd5 = Digest::MD5.hexdigest(@suffix)
|
328
366
|
@position = j
|
329
|
-
|
367
|
+
$log.debug("add "+@suffix+" at pos "+@position.to_s)
|
330
368
|
nbSuffix += addSuffix(@suffixmd5, @position,i)
|
331
369
|
end
|
332
370
|
$log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s)
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassiopee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Olivier Sallou
|
@@ -17,8 +17,23 @@ cert_chain: []
|
|
17
17
|
|
18
18
|
date: 2011-09-04 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
|
-
dependencies:
|
21
|
-
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: text
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 2
|
33
|
+
- 0
|
34
|
+
version: 0.2.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
22
37
|
description: Cassiopee indexes one String and provides methods to search for exact or approximate matches with Hamming and/or edit distance.
|
23
38
|
email: olivier.sallou@gmail.com
|
24
39
|
executables: []
|
@@ -33,6 +48,7 @@ files:
|
|
33
48
|
- LICENSE
|
34
49
|
- demo.rb
|
35
50
|
- lib/cassiopee.rb
|
51
|
+
- bin/cassie.rb
|
36
52
|
- tests/test-suite.rb
|
37
53
|
has_rdoc: true
|
38
54
|
homepage: https://github.com/osallou/cassiopee
|