cassiopee 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog +4 -1
- data/README +1 -1
- data/bin/cassie.rb +139 -0
- data/demo.rb +1 -1
- data/lib/cassiopee.rb +78 -40
- metadata +21 -5
data/Changelog
CHANGED
data/README
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
Search an exact or approximate word (hamming or edit distance) in a string.
|
2
|
-
Support cache
|
2
|
+
Support index cache with incremental update for later searches
|
data/bin/cassie.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for Cassiopee.
#
# Reads a sequence file, then searches it for a pattern given either on the
# command line (-p) or in a file (--fpattern). The search is exact by default,
# or approximate when a maximum Hamming (-m) or edit (-e) distance is given.
# Matches are printed one per line.
#
# Exit status is 0 on success and 1 on a usage or input error.
require File.join(File.dirname(__FILE__), '../lib/cassiopee')
require 'optparse'
require 'logger'

# Defaults for every option; the parser below overrides them.
options = {
  :verbose  => false,
  :filter   => nil,
  :file     => nil,
  :fpattern => nil,
  :pattern  => nil,
  :store    => nil,
  :name     => nil,
  :exact    => false,
  # 0 means exact search, > 0 is a Hamming distance, < 0 is an edit distance.
  :error    => 0
}

optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: cassie.rb [options]"

  opts.on('-v', '--verbose', 'Output more information') do
    options[:verbose] = true
  end

  opts.on('-f', '--filter FILTER', 'Filter matches between min and max positions ex. 100-150') do |filter|
    options[:filter] = filter
  end

  opts.on('-i', '--index FILE', 'File to index') do |file|
    options[:file] = file
  end

  opts.on('--fpattern FILE', 'File with pattern') do |file|
    options[:fpattern] = file
  end

  opts.on('-p', '--pattern PATTERN', 'Search pattern') do |pattern|
    options[:pattern] = pattern
  end

  # NOTE(review): the FILE argument is only used as an on/off switch below;
  # the store location comes from the crawler's file_suffix (see -n).
  opts.on('-s', '--store FILE', 'Store index to file') do |file|
    options[:store] = file
  end

  opts.on('-n', '--name NAME', 'name of index, default [crawler]') do |name|
    options[:name] = name
  end

  opts.on('-x', '--exact', 'Do exact search (default)') do
    options[:exact] = true
  end

  # Integer coercion: without it the value stays a String, which never equals
  # 0 and cannot be negated for the edit-distance convention.
  opts.on('-m', '--hamming ERROR', Integer, 'Maximum number of error to search with Hamming distance') do |error|
    options[:error] = error.abs
  end

  opts.on('-e', '--edit ERROR', Integer, 'Maximum number of error to search with edit(levenshtein) distance') do |error|
    # Edit distance is passed to the crawler as a negative number.
    options[:error] = -error.abs
  end

  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

begin
  optparse.parse!
rescue OptionParser::ParseError => e
  puts "Error, #{e.message}, use -h option for usage"
  exit 1
end

if options[:file].nil?
  puts "Error, input file is missing, use -h option for usage"
  exit 1
elsif options[:verbose]
  puts "Input sequence: #{options[:file]}"
end

if options[:fpattern].nil? && options[:pattern].nil?
  puts "Error, pattern is missing, use -h option for usage"
  exit 1
end

# Validate the position filter early so a malformed value fails before indexing.
filter_bounds = nil
if options[:filter]
  filter_bounds = options[:filter].match(/\A(\d+)-(\d+)\z/)
  if filter_bounds.nil?
    puts "Error, filter must be given as min-max, ex. 100-150"
    exit 1
  end
end

# No error budget means an exact search.
options[:exact] = true if options[:error] == 0

crawler = Cassiopee::Crawler.new
crawler.setLogLevel(Logger::INFO)
crawler.use_store = true if options[:store]
crawler.file_suffix = options[:name] unless options[:name].nil?

# String to index
crawler.indexFile(options[:file])

# The filter must be applied after indexFile, which resets both bounds to 0.
if filter_bounds
  crawler.filter_position(filter_bounds[1].to_i, filter_bounds[2].to_i)
end

if options[:fpattern].nil?
  pattern = options[:pattern]
else
  # Concatenate all lines of the pattern file, lower-cased, without newlines.
  pattern = File.readlines(options[:fpattern]).map { |line| line.downcase.chomp }.join
  if pattern.empty?
    puts "Error pattern file is empty"
    exit 1
  end
end

puts "Search pattern #{pattern}" if options[:verbose]

if options[:exact]
  puts "Search exact" if options[:verbose]
  crawler.searchExact(pattern)
else
  puts "Search approximate" if options[:verbose]
  crawler.searchApproximate(pattern, options[:error])
end

# Go through matches
until (match = crawler.next).nil?
  puts "Match: #{match.inspect}"
end
|
data/demo.rb
CHANGED
@@ -23,7 +23,7 @@ puts "Hamming: " << test.computeHamming("my strigg",1).to_s
|
|
23
23
|
puts "Levenshtein: " << test.computeLevenshtein("mystriigg",3).to_s
|
24
24
|
|
25
25
|
# Approximate search, edit distance = 2
|
26
|
-
crawler.searchApproximate("llu",-
|
26
|
+
crawler.searchApproximate("llu",-2)
|
27
27
|
|
28
28
|
# Go through matches
|
29
29
|
while((match = crawler.next())!=nil)
|
data/lib/cassiopee.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'digest/md5'
|
2
2
|
require 'logger'
|
3
3
|
require 'zlib'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'text'
|
4
6
|
|
5
7
|
module Cassiopee
|
6
8
|
|
@@ -33,6 +35,7 @@ module Cassiopee
|
|
33
35
|
end
|
34
36
|
return nberr
|
35
37
|
end
|
38
|
+
|
36
39
|
|
37
40
|
# Calculate the edit distance between string and pattern
|
38
41
|
# Extend a String
|
@@ -40,41 +43,14 @@ module Cassiopee
|
|
40
43
|
|
41
44
|
def computeLevenshtein(pattern,edit)
|
42
45
|
pattern = pattern.downcase
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
matrix[1][i]=i
|
49
|
-
end
|
50
|
-
c=0
|
51
|
-
p=1
|
52
|
-
(1..(self.length)).each do |i|
|
53
|
-
c = i.modulo(2)
|
54
|
-
p = (i+1).modulo(2)
|
55
|
-
matrix[c][0] = i
|
56
|
-
(1..(pattern.length)).each do |j|
|
57
|
-
# Bellman's principle of optimality
|
58
|
-
weight = 0
|
59
|
-
if(pattern[i-1] != self[j-1])
|
60
|
-
weight = 1
|
61
|
-
end
|
62
|
-
weight = matrix[p][j-1] + weight
|
63
|
-
if(weight > matrix[p][j] +1)
|
64
|
-
weight = matrix[p][j] +1
|
65
|
-
end
|
66
|
-
if(weight > matrix[c][j-1] +1)
|
67
|
-
weight = matrix[c][j-1] +1
|
68
|
-
end
|
69
|
-
matrix[c][j] = weight
|
70
|
-
end
|
71
|
-
end
|
72
|
-
p = c
|
73
|
-
c = (c + 1).modulo(2)
|
74
|
-
if(matrix[p][pattern.length]>edit)
|
46
|
+
|
47
|
+
distance = Text::Levenshtein.distance(self, pattern)
|
48
|
+
|
49
|
+
|
50
|
+
if(distance>edit)
|
75
51
|
return -1
|
76
52
|
end
|
77
|
-
return
|
53
|
+
return distance
|
78
54
|
|
79
55
|
end
|
80
56
|
|
@@ -91,6 +67,9 @@ module Cassiopee
|
|
91
67
|
# Use persistent suffix file ?
|
92
68
|
attr_accessor :use_store
|
93
69
|
|
70
|
+
@min_position = 0
|
71
|
+
@max_position = 0
|
72
|
+
|
94
73
|
FILE_SUFFIX_EXT = ".sfx"
|
95
74
|
FILE_SUFFIX_POS = ".sfp"
|
96
75
|
|
@@ -119,9 +98,11 @@ module Cassiopee
|
|
119
98
|
end
|
120
99
|
|
121
100
|
# Clear suffixes in memory
|
101
|
+
# If using use_store, clear the store too
|
122
102
|
|
123
103
|
# Reset the in-memory suffix table and delete the persisted position file
# (@file_suffix + FILE_SUFFIX_POS) if it exists on disk.
def clear
@suffixes = Hash.new
position_file = @file_suffix + FILE_SUFFIX_POS
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
File.delete(position_file) if File.exist?(position_file)
end
|
126
107
|
|
127
108
|
# Set Logger level
|
@@ -131,6 +112,7 @@ module Cassiopee
|
|
131
112
|
end
|
132
113
|
|
133
114
|
# Index an input file
|
115
|
+
# Clear existing indexes
|
134
116
|
|
135
117
|
def indexFile(f)
|
136
118
|
# Parse file, map letters to reduced alphabet
|
@@ -138,18 +120,34 @@ module Cassiopee
|
|
138
120
|
# Take all suffix, order by length, link to position map on other file
|
139
121
|
# Store md5 for easier compare? + 20 bytes per suffix
|
140
122
|
@sequence = readSequence(f)
|
141
|
-
|
123
|
+
clear()
|
124
|
+
@min_position = 0
|
125
|
+
@max_position = 0
|
142
126
|
end
|
143
127
|
|
144
128
|
# Index an input string
|
129
|
+
# Clear existing indexes
|
145
130
|
|
146
131
|
def indexString(s)
|
147
132
|
@sequence = s
|
148
133
|
File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
|
149
134
|
data.puts(@sequence)
|
150
135
|
end
|
151
|
-
|
152
|
-
|
136
|
+
clear()
|
137
|
+
@min_position = 0
|
138
|
+
@max_position = 0
|
139
|
+
end
|
140
|
+
|
141
|
+
# Filter matches to be between min and max start position
|
142
|
+
# If not using use_store, search speed is improved but existing indexes are cleared
|
143
|
+
# If max=0, then max is string length
|
144
|
+
|
145
|
+
# Restrict later matches to start positions between min and max.
#
# min and max may be Integers or numeric Strings (for example values taken
# from a command line); nil is treated as 0. They are stored as Integers so
# that position comparisons do not fail on mixed types.
# When use_store is false the existing index is cleared first.
# Raises ArgumentError (from Integer()) if a bound is not numeric.
def filter_position(min,max)
lower = Integer(min || 0)
upper = Integer(max || 0)
clear unless use_store
@min_position = lower
@max_position = upper
end
|
154
152
|
|
155
153
|
# Search exact match
|
@@ -208,7 +206,8 @@ module Cassiopee
|
|
208
206
|
next
|
209
207
|
end
|
210
208
|
if (md5val == matchmd5)
|
211
|
-
|
209
|
+
filteredPosArray = filter(posArray)
|
210
|
+
match = Array[md5val, 0, filteredPosArray]
|
212
211
|
$log.debug "Match: " << match.inspect
|
213
212
|
@matches << match
|
214
213
|
else
|
@@ -222,7 +221,8 @@ module Cassiopee
|
|
222
221
|
errors = seq.computeLevenshtein(s,edit)
|
223
222
|
end
|
224
223
|
if(errors>=0)
|
225
|
-
|
224
|
+
filteredPosArray = filter(posArray)
|
225
|
+
match = Array[md5val, errors, filteredPosArray]
|
226
226
|
$log.debug "Match: " << match.inspect
|
227
227
|
@matches << match
|
228
228
|
end
|
@@ -234,6 +234,29 @@ module Cassiopee
|
|
234
234
|
return @matches
|
235
235
|
end
|
236
236
|
|
237
|
+
# Filter the array of positions with defined position filter
|
238
|
+
|
239
|
+
# Apply the position filter (@min_position / @max_position) to posArray.
#
# posArray holds the match length as its first element, followed by start
# positions. The first element is always kept; a position is kept when it is
# at or above the minimum and, if a maximum is set, at or below it.
# A maximum of 0 means "no upper bound". When both bounds are 0 the input
# array itself is returned untouched.
def filter(posArray)
$log.debug("filter the position with #{@min_position} and #{@max_position}")
# to_i tolerates nil (never set) and numeric Strings as bounds.
lower = @min_position.to_i
upper = @max_position.to_i
return posArray if lower == 0 && upper == 0
return [] if posArray.empty?

kept = posArray.drop(1).select do |pos|
  pos >= lower && (upper == 0 || pos <= upper)
end
[posArray.first] + kept
end
|
258
|
+
|
259
|
+
|
237
260
|
# Extract a suffix from suffix file based on md5 match
|
238
261
|
|
239
262
|
def extractSuffix(start,len)
|
@@ -283,6 +306,17 @@ module Cassiopee
|
|
283
306
|
maxlen = @sequence.length
|
284
307
|
end
|
285
308
|
|
309
|
+
if(!use_store)
|
310
|
+
minpos = @min_position
|
311
|
+
if(@max_position==0)
|
312
|
+
maxpos = @sequence.length
|
313
|
+
else
|
314
|
+
maxpos = @max_position
|
315
|
+
end
|
316
|
+
else
|
317
|
+
minpos = 0
|
318
|
+
maxpos = @sequence.length - minlen
|
319
|
+
end
|
286
320
|
|
287
321
|
suffixlen = nil
|
288
322
|
$log.info('Start indexing')
|
@@ -322,11 +356,15 @@ module Cassiopee
|
|
322
356
|
next
|
323
357
|
end
|
324
358
|
changed = true
|
325
|
-
(
|
359
|
+
(minpos..(maxpos)).each do |j|
|
360
|
+
# if position+length longer than sequence length, skip it
|
361
|
+
if(j+i>=@sequence.length)
|
362
|
+
next
|
363
|
+
end
|
326
364
|
@suffix = s[j,i]
|
327
365
|
@suffixmd5 = Digest::MD5.hexdigest(@suffix)
|
328
366
|
@position = j
|
329
|
-
|
367
|
+
$log.debug("add "+@suffix+" at pos "+@position.to_s)
|
330
368
|
nbSuffix += addSuffix(@suffixmd5, @position,i)
|
331
369
|
end
|
332
370
|
$log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s)
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassiopee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Olivier Sallou
|
@@ -17,8 +17,23 @@ cert_chain: []
|
|
17
17
|
|
18
18
|
date: 2011-09-04 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
|
-
dependencies:
|
21
|
-
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: text
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 2
|
33
|
+
- 0
|
34
|
+
version: 0.2.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
22
37
|
description: Cassiopee indexes one String and provides methods to search for exact matches or approximate matches with Hamming and/or edit distance.
|
23
38
|
email: olivier.sallou@gmail.com
|
24
39
|
executables: []
|
@@ -33,6 +48,7 @@ files:
|
|
33
48
|
- LICENSE
|
34
49
|
- demo.rb
|
35
50
|
- lib/cassiopee.rb
|
51
|
+
- bin/cassie.rb
|
36
52
|
- tests/test-suite.rb
|
37
53
|
has_rdoc: true
|
38
54
|
homepage: https://github.com/osallou/cassiopee
|