cassiopee 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. data/Changelog +4 -1
  2. data/README +1 -1
  3. data/bin/cassie.rb +139 -0
  4. data/demo.rb +1 -1
  5. data/lib/cassiopee.rb +78 -40
  6. metadata +21 -5
data/Changelog CHANGED
@@ -1 +1,4 @@
1
- v0.1.0 : First version
1
+ v0.1.1 : fix #1, add filter option
2
+ 08/09/11 Olivier Sallou
3
+ v0.1.0 : First version
4
+ Olivier Sallou
data/README CHANGED
@@ -1,2 +1,2 @@
1
1
  Search for an exact or approximate word (Hamming or edit distance) in a string.
2
- Support cache mechanism and REST interface.
2
+ Support index cache with incremental update for later searches
data/bin/cassie.rb ADDED
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env ruby
2
+ require File.join(File.dirname(__FILE__), '../lib/cassiopee')
3
+ require 'optparse'
4
+ require 'logger'
5
+
6
+ options = {}
7
+
8
+ optparse = OptionParser.new do|opts|
9
+ # Set a banner, displayed at the top
10
+ # of the help screen.
11
+ opts.banner = "Usage: cassie.rb [options]"
12
+
13
+ options[:verbose] = false
14
+ opts.on( '-v', '--verbose', 'Output more information' ) do
15
+ options[:verbose] = true
16
+ end
17
+
18
+ options[:filter] = nil
19
+ opts.on( '-f', '--filter FILTER', 'Filter matches between min and max positions ex. 100-150' ) do |filter|
20
+ options[:filter] = filter
21
+ end
22
+
23
+ options[:file] = nil
24
+ opts.on( '-i', '--index FILE', 'File to index' ) do |file|
25
+ options[:file] = file
26
+ end
27
+
28
+ options[:fpattern] = nil
29
+ opts.on( '--fpattern FILE', 'File with pattern' ) do |file|
30
+ options[:fpattern] = file
31
+ end
32
+
33
+ options[:pattern] = nil
34
+ opts.on( '-p', '--pattern PATTERN', 'Search pattern' ) do |file|
35
+ options[:pattern] = file
36
+ end
37
+
38
+ options[:store] = nil
39
+ opts.on( '-s', '--store FILE', 'Store index to file' ) do |file|
40
+ options[:store] = file
41
+ end
42
+
43
+ options[:name] = nil
44
+ opts.on( '-n', '--name NAME', 'name of index, default [crawler]' ) do |name|
45
+ options[:name] = name
46
+ end
47
+
48
+ options[:exact] = false
49
+ opts.on( '-x', '--exact', 'Do exact search (default)' ) do
50
+ options[:exact] = true
51
+ end
52
+
53
+ options[:error] = 0
54
+ opts.on( '-m', '--hamming ERROR', 'Maximum number of error to search with Hamming distance' ) do |error|
55
+ options[:error] = error
56
+ end
57
+
58
+ opts.on( '-e', '--edit ERROR', 'Maximum number of error to search with edit(levenshtein) distance' ) do |error|
59
+ options[:error] = error * (-1)
60
+ end
61
+
62
+
63
+ opts.on( '-h', '--help', 'Display this screen' ) do
64
+ puts opts
65
+ exit
66
+ end
67
+
68
+ end
69
+
70
+ optparse.parse!
71
+
72
+ if(options[:file]==nil)
73
+ puts "Error, input file is missing, use -h option for usage"
74
+ exit
75
+ elif(options[:verbose])
76
+ puts "Input sequence: " << options[:file].to_s
77
+ end
78
+
79
+ if(options[:fpattern]==nil && options[:pattern]==nil)
80
+ puts "Error, pattern is missing, use -h option for usage"
81
+ exit
82
+ end
83
+
84
+
85
+ if(options[:error]==0)
86
+ options[:exact] = true
87
+ end
88
+
89
+
90
+ crawler = Cassiopee::Crawler.new
91
+ crawler.setLogLevel(Logger::INFO)
92
+ if(options[:store])
93
+ crawler.use_store = true
94
+ end
95
+ if(options[:name]!=nil)
96
+ crawler.file_suffix = options[:name]
97
+ end
98
+ if(options[:filter]!=nil)
99
+ positions = options[:filter].split('-')
100
+ crawler.filter_position(positions[0],positions[1])
101
+ end
102
+
103
+ # String to index
104
+ crawler.indexFile(options[:file])
105
+
106
+ matches = nil
107
+
108
+ if(options[:fpattern]==nil)
109
+ pattern = options[:pattern]
110
+ else
111
+ pattern = ''
112
+ file = File.new(options[:fpattern], "r")
113
+ while (line = file.gets)
114
+ input = line.downcase.chomp
115
+ pattern << input
116
+ end
117
+ file.close
118
+ if(pattern.length==0)
119
+ puts "Error pattern file is empty"
120
+ exit
121
+ end
122
+ end
123
+
124
+ if(options[:verbose])
125
+ puts "Search pattern " << pattern
126
+ end
127
+
128
+ if(options[:exact])
129
+ puts "Search exact" unless !options[:verbose]
130
+ matches = crawler.searchExact(pattern)
131
+ else
132
+ puts "Search approximate" unless !options[:verbose]
133
+ matches = crawler.searchApproximate(pattern,options[:errors])
134
+ end
135
+
136
+ # Go through matches
137
+ while((match = crawler.next())!=nil)
138
+ puts "Match: " << match.inspect
139
+ end
data/demo.rb CHANGED
@@ -23,7 +23,7 @@ puts "Hamming: " << test.computeHamming("my strigg",1).to_s
23
23
  puts "Levenshtein: " << test.computeLevenshtein("mystriigg",3).to_s
24
24
 
25
25
  # Approximate search, edit distance = 1
26
- crawler.searchApproximate("llu",-1)
26
+ crawler.searchApproximate("llu",-2)
27
27
 
28
28
  # Go through matches
29
29
  while((match = crawler.next())!=nil)
data/lib/cassiopee.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require 'digest/md5'
2
2
  require 'logger'
3
3
  require 'zlib'
4
+ require 'rubygems'
5
+ require 'text'
4
6
 
5
7
  module Cassiopee
6
8
 
@@ -33,6 +35,7 @@ module Cassiopee
33
35
  end
34
36
  return nberr
35
37
  end
38
+
36
39
 
37
40
  # Calculate the edit distance between string and pattern
38
41
  # Extend a String
@@ -40,41 +43,14 @@ module Cassiopee
40
43
 
41
44
  def computeLevenshtein(pattern,edit)
42
45
  pattern = pattern.downcase
43
- matrix= Array.new(2)
44
- matrix[0] = Array.new(pattern.length+1)
45
- matrix[1] = Array.new(pattern.length+1)
46
- (0..(pattern.length)).each do |i|
47
- matrix[0][i]=i
48
- matrix[1][i]=i
49
- end
50
- c=0
51
- p=1
52
- (1..(self.length)).each do |i|
53
- c = i.modulo(2)
54
- p = (i+1).modulo(2)
55
- matrix[c][0] = i
56
- (1..(pattern.length)).each do |j|
57
- # Bellman's principle of optimality
58
- weight = 0
59
- if(pattern[i-1] != self[j-1])
60
- weight = 1
61
- end
62
- weight = matrix[p][j-1] + weight
63
- if(weight > matrix[p][j] +1)
64
- weight = matrix[p][j] +1
65
- end
66
- if(weight > matrix[c][j-1] +1)
67
- weight = matrix[c][j-1] +1
68
- end
69
- matrix[c][j] = weight
70
- end
71
- end
72
- p = c
73
- c = (c + 1).modulo(2)
74
- if(matrix[p][pattern.length]>edit)
46
+
47
+ distance = Text::Levenshtein.distance(self, pattern)
48
+
49
+
50
+ if(distance>edit)
75
51
  return -1
76
52
  end
77
- return matrix[p][pattern.length]
53
+ return distance
78
54
 
79
55
  end
80
56
 
@@ -91,6 +67,9 @@ module Cassiopee
91
67
  # Use persistent suffix file ?
92
68
  attr_accessor :use_store
93
69
 
70
+ @min_position = 0
71
+ @max_position = 0
72
+
94
73
  FILE_SUFFIX_EXT = ".sfx"
95
74
  FILE_SUFFIX_POS = ".sfp"
96
75
 
@@ -119,9 +98,11 @@ module Cassiopee
119
98
  end
120
99
 
121
100
  # Clear suffixes in memory
101
+ # If using use_store, clear the store too
122
102
 
123
103
  def clear
124
104
  @suffixes = Hash.new
105
+ File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
125
106
  end
126
107
 
127
108
  # Set Logger level
@@ -131,6 +112,7 @@ module Cassiopee
131
112
  end
132
113
 
133
114
  # Index an input file
115
+ # Clear existing indexes
134
116
 
135
117
  def indexFile(f)
136
118
  # Parse file, map letters to reduced alphabet
@@ -138,18 +120,34 @@ module Cassiopee
138
120
  # Take all suffix, order by length, link to position map on other file
139
121
  # Store md5 for easier compare? + 20 bytes per suffix
140
122
  @sequence = readSequence(f)
141
-
123
+ clear()
124
+ @min_position = 0
125
+ @max_position = 0
142
126
  end
143
127
 
144
128
  # Index an input string
129
+ # Clear existing indexes
145
130
 
146
131
  def indexString(s)
147
132
  @sequence = s
148
133
  File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
149
134
  data.puts(@sequence)
150
135
  end
151
-
152
-
136
+ clear()
137
+ @min_position = 0
138
+ @max_position = 0
139
+ end
140
+
141
+ # Filter matches to be between min and max start position
142
+ # If not using use_store, search speed is improved but existing indexes are cleared
143
+ # If max=0, then max is string length
144
+
145
+ def filter_position(min,max)
146
+ if(!use_store)
147
+ clear()
148
+ end
149
+ @min_position = min
150
+ @max_position = max
153
151
  end
154
152
 
155
153
  # Search exact match
@@ -208,7 +206,8 @@ module Cassiopee
208
206
  next
209
207
  end
210
208
  if (md5val == matchmd5)
211
- match = Array[md5val, 0, posArray]
209
+ filteredPosArray = filter(posArray)
210
+ match = Array[md5val, 0, filteredPosArray]
212
211
  $log.debug "Match: " << match.inspect
213
212
  @matches << match
214
213
  else
@@ -222,7 +221,8 @@ module Cassiopee
222
221
  errors = seq.computeLevenshtein(s,edit)
223
222
  end
224
223
  if(errors>=0)
225
- match = Array[md5val, errors, posArray]
224
+ filteredPosArray = filter(posArray)
225
+ match = Array[md5val, errors, filteredPosArray]
226
226
  $log.debug "Match: " << match.inspect
227
227
  @matches << match
228
228
  end
@@ -234,6 +234,29 @@ module Cassiopee
234
234
  return @matches
235
235
  end
236
236
 
237
+ # Filter the array of positions with defined position filter
238
+
239
+ def filter(posArray)
240
+ $log.debug("filter the position with " << @min_position.to_s << " and " << @max_position.to_s)
241
+ if(@min_position==0 && @max_position==0)
242
+ return posArray
243
+ end
244
+ filteredArray = Array.new
245
+ i = 0
246
+ posArray.each do |pos|
247
+ if(i==0)
248
+ # First element of the array is the match length
249
+ filteredArray << pos
250
+ end
251
+ if(i>0 && pos>=@min_position && pos<=@max_position)
252
+ filteredArray << pos
253
+ end
254
+ i +=1
255
+ end
256
+ return filteredArray
257
+ end
258
+
259
+
237
260
  # Extract a suffix from the suffix file based on md5 match
238
261
 
239
262
  def extractSuffix(start,len)
@@ -283,6 +306,17 @@ module Cassiopee
283
306
  maxlen = @sequence.length
284
307
  end
285
308
 
309
+ if(!use_store)
310
+ minpos = @min_position
311
+ if(@max_position==0)
312
+ maxpos = @sequence.length
313
+ else
314
+ maxpos = @max_position
315
+ end
316
+ else
317
+ minpos = 0
318
+ maxpos = @sequence.length - minlen
319
+ end
286
320
 
287
321
  suffixlen = nil
288
322
  $log.info('Start indexing')
@@ -322,11 +356,15 @@ module Cassiopee
322
356
  next
323
357
  end
324
358
  changed = true
325
- (0..(s.length-maxlen)).each do |j|
359
+ (minpos..(maxpos)).each do |j|
360
+ # if position+length longer than sequence length, skip it
361
+ if(j+i>=@sequence.length)
362
+ next
363
+ end
326
364
  @suffix = s[j,i]
327
365
  @suffixmd5 = Digest::MD5.hexdigest(@suffix)
328
366
  @position = j
329
- #$log.debug("add "+@suffix+" at pos "+@position.to_s)
367
+ $log.debug("add "+@suffix+" at pos "+@position.to_s)
330
368
  nbSuffix += addSuffix(@suffixmd5, @position,i)
331
369
  end
332
370
  $log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cassiopee
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 0
10
- version: 0.1.0
9
+ - 1
10
+ version: 0.1.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Olivier Sallou
@@ -17,8 +17,23 @@ cert_chain: []
17
17
 
18
18
  date: 2011-09-04 00:00:00 +02:00
19
19
  default_executable:
20
- dependencies: []
21
-
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: text
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 0
32
+ - 2
33
+ - 0
34
+ version: 0.2.0
35
+ type: :runtime
36
+ version_requirements: *id001
22
37
  description: Cassiopee index one String and provide methods to search exact match or approximate matches with Hammming and/or edit distance.
23
38
  email: olivier.sallou@gmail.com
24
39
  executables: []
@@ -33,6 +48,7 @@ files:
33
48
  - LICENSE
34
49
  - demo.rb
35
50
  - lib/cassiopee.rb
51
+ - bin/cassie.rb
36
52
  - tests/test-suite.rb
37
53
  has_rdoc: true
38
54
  homepage: https://github.com/osallou/cassiopee