cassiopee 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. data/Changelog +4 -1
  2. data/README +1 -1
  3. data/bin/cassie.rb +139 -0
  4. data/demo.rb +1 -1
  5. data/lib/cassiopee.rb +78 -40
  6. metadata +21 -5
data/Changelog CHANGED
@@ -1 +1,4 @@
1
- v0.1.0 : First version
1
+ v0.1.1 : fix #1, add filter option
2
+ 08/09/11 Olivier Sallou
3
+ v0.1.0 : First version
4
+ Olivier Sallou
data/README CHANGED
@@ -1,2 +1,2 @@
1
1
  Search an exact or approximate word (hamming or edit distance) in a string.
2
- Support cache mechanism and REST interface.
2
+ Support index cache with incremental update for later searches
data/bin/cassie.rb ADDED
@@ -0,0 +1,139 @@
1
#!/usr/bin/env ruby
# Command-line front end for the Cassiopee crawler: indexes a sequence file,
# then prints every exact or approximate match of a pattern.
#
# Exit status is 0 on success and 1 when the command line is unusable
# (missing input file, missing/empty pattern, invalid option).
require File.join(File.dirname(__FILE__), '../lib/cassiopee')
require 'optparse'
require 'logger'

# Default values. :error holds the error budget handed to the crawler:
# a Hamming budget is stored as given, an edit-distance budget is stored negated.
options = {
  :verbose  => false,
  :filter   => nil,
  :file     => nil,
  :fpattern => nil,
  :pattern  => nil,
  :store    => nil,
  :name     => nil,
  :exact    => false,
  :error    => 0
}

optparse = OptionParser.new do |opts|
  # Banner displayed at the top of the help screen.
  opts.banner = "Usage: cassie.rb [options]"

  opts.on('-v', '--verbose', 'Output more information') do
    options[:verbose] = true
  end

  opts.on('-f', '--filter FILTER', 'Filter matches between min and max positions ex. 100-150') do |filter|
    options[:filter] = filter
  end

  opts.on('-i', '--index FILE', 'File to index') do |file|
    options[:file] = file
  end

  opts.on('--fpattern FILE', 'File with pattern') do |file|
    options[:fpattern] = file
  end

  opts.on('-p', '--pattern PATTERN', 'Search pattern') do |pattern|
    options[:pattern] = pattern
  end

  # NOTE(review): the FILE argument is only used as an on/off flag below;
  # the index name itself comes from --name.
  opts.on('-s', '--store FILE', 'Store index to file') do |file|
    options[:store] = file
  end

  opts.on('-n', '--name NAME', 'name of index, default [crawler]') do |name|
    options[:name] = name
  end

  opts.on('-x', '--exact', 'Do exact search (default)') do
    options[:exact] = true
  end

  # Integer makes OptionParser convert and validate the argument, so the
  # crawler receives a number instead of the raw command-line string.
  opts.on('-m', '--hamming ERROR', Integer, 'Maximum number of error to search with Hamming distance') do |error|
    options[:error] = error
  end

  # Edit distance is passed to the crawler as a negative number.
  opts.on('-e', '--edit ERROR', Integer, 'Maximum number of error to search with edit(levenshtein) distance') do |error|
    options[:error] = -error
  end

  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

begin
  optparse.parse!
rescue OptionParser::ParseError => e
  puts "Error, #{e.message}, use -h option for usage"
  exit 1
end

if options[:file].nil?
  puts "Error, input file is missing, use -h option for usage"
  exit 1
end
puts "Input sequence: #{options[:file]}" if options[:verbose]

if options[:fpattern].nil? && options[:pattern].nil?
  puts "Error, pattern is missing, use -h option for usage"
  exit 1
end

# Without an error budget the search is exact, even when -x was not given.
options[:exact] = true if options[:error] == 0

crawler = Cassiopee::Crawler.new
crawler.setLogLevel(Logger::INFO)
crawler.use_store = true if options[:store]
crawler.file_suffix = options[:name] unless options[:name].nil?
unless options[:filter].nil?
  # "100-150" -> 100 and 150; a missing bound becomes 0.
  min, max = options[:filter].split('-')
  crawler.filter_position(min.to_i, max.to_i)
end

# String to index
crawler.indexFile(options[:file])

if options[:fpattern].nil?
  pattern = options[:pattern]
else
  # The pattern is the concatenation of all lines, lower-cased, without line ends.
  pattern = File.readlines(options[:fpattern]).map { |line| line.downcase.chomp }.join
  if pattern.empty?
    puts "Error pattern file is empty"
    exit 1
  end
end

puts "Search pattern #{pattern}" if options[:verbose]

if options[:exact]
  puts "Search exact" if options[:verbose]
  crawler.searchExact(pattern)
else
  puts "Search approximate" if options[:verbose]
  crawler.searchApproximate(pattern, options[:error])
end

# Go through matches
until (match = crawler.next()).nil?
  puts "Match: #{match.inspect}"
end
data/demo.rb CHANGED
@@ -23,7 +23,7 @@ puts "Hamming: " << test.computeHamming("my strigg",1).to_s
23
23
  puts "Levenshtein: " << test.computeLevenshtein("mystriigg",3).to_s
24
24
 
25
25
  # Approximate search, edit distance = 2 (passed as a negative value)
26
- crawler.searchApproximate("llu",-1)
26
+ crawler.searchApproximate("llu",-2)
27
27
 
28
28
  # Go through matches
29
29
  while((match = crawler.next())!=nil)
data/lib/cassiopee.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require 'digest/md5'
2
2
  require 'logger'
3
3
  require 'zlib'
4
+ require 'rubygems'
5
+ require 'text'
4
6
 
5
7
  module Cassiopee
6
8
 
@@ -33,6 +35,7 @@ module Cassiopee
33
35
  end
34
36
  return nberr
35
37
  end
38
+
36
39
 
37
40
  # Calculate the edit distance between string and pattern
38
41
  # Extend a String
@@ -40,41 +43,14 @@ module Cassiopee
40
43
 
41
44
  def computeLevenshtein(pattern,edit)
42
45
  pattern = pattern.downcase
43
- matrix= Array.new(2)
44
- matrix[0] = Array.new(pattern.length+1)
45
- matrix[1] = Array.new(pattern.length+1)
46
- (0..(pattern.length)).each do |i|
47
- matrix[0][i]=i
48
- matrix[1][i]=i
49
- end
50
- c=0
51
- p=1
52
- (1..(self.length)).each do |i|
53
- c = i.modulo(2)
54
- p = (i+1).modulo(2)
55
- matrix[c][0] = i
56
- (1..(pattern.length)).each do |j|
57
- # Bellman's principle of optimality
58
- weight = 0
59
- if(pattern[i-1] != self[j-1])
60
- weight = 1
61
- end
62
- weight = matrix[p][j-1] + weight
63
- if(weight > matrix[p][j] +1)
64
- weight = matrix[p][j] +1
65
- end
66
- if(weight > matrix[c][j-1] +1)
67
- weight = matrix[c][j-1] +1
68
- end
69
- matrix[c][j] = weight
70
- end
71
- end
72
- p = c
73
- c = (c + 1).modulo(2)
74
- if(matrix[p][pattern.length]>edit)
46
+
47
+ distance = Text::Levenshtein.distance(self, pattern)
48
+
49
+
50
+ if(distance>edit)
75
51
  return -1
76
52
  end
77
- return matrix[p][pattern.length]
53
+ return distance
78
54
 
79
55
  end
80
56
 
@@ -91,6 +67,9 @@ module Cassiopee
91
67
  # Use persistent suffix file ?
92
68
  attr_accessor :use_store
93
69
 
70
+ @min_position = 0
71
+ @max_position = 0
72
+
94
73
  FILE_SUFFIX_EXT = ".sfx"
95
74
  FILE_SUFFIX_POS = ".sfp"
96
75
 
@@ -119,9 +98,11 @@ module Cassiopee
119
98
  end
120
99
 
121
100
  # Clear suffixes in memory
101
+ # If using use_store, clear the store too
122
102
 
123
103
  def clear
124
104
  @suffixes = Hash.new
105
+ File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS)
125
106
  end
126
107
 
127
108
  # Set Logger level
@@ -131,6 +112,7 @@ module Cassiopee
131
112
  end
132
113
 
133
114
  # Index an input file
115
+ # Clear existing indexes
134
116
 
135
117
  def indexFile(f)
136
118
  # Parse file, map letters to reduced alphabet
@@ -138,18 +120,34 @@ module Cassiopee
138
120
  # Take all suffix, order by length, link to position map on other file
139
121
  # Store md5 for easier compare? + 20 bytes per suffix
140
122
  @sequence = readSequence(f)
141
-
123
+ clear()
124
+ @min_position = 0
125
+ @max_position = 0
142
126
  end
143
127
 
144
128
  # Index an input string
129
+ # Clear existing indexes
145
130
 
146
131
  def indexString(s)
147
132
  @sequence = s
148
133
  File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data|
149
134
  data.puts(@sequence)
150
135
  end
151
-
152
-
136
+ clear()
137
+ @min_position = 0
138
+ @max_position = 0
139
+ end
140
+
141
+ # Filter matches to be between min and max start position
142
+ # If not using use_store, search speed is improved but existing indexes are cleared
143
+ # If max=0, then max is string length
144
+
145
# Restrict matches to start positions between +min+ and +max+.
#
# min, max - bounds as Integers or numeric Strings (e.g. the two halves of a
#            "100-150" command-line value); nil counts as 0. A max of 0
#            means "up to the end of the sequence".
#
# When use_store is not set, the existing in-memory index is cleared.
def filter_position(min,max)
  clear() unless use_store
  # Stored as Integers so later comparisons with positions are numeric.
  @min_position = min.to_i
  @max_position = max.to_i
end
154
152
 
155
153
  # Search exact match
@@ -208,7 +206,8 @@ module Cassiopee
208
206
  next
209
207
  end
210
208
  if (md5val == matchmd5)
211
- match = Array[md5val, 0, posArray]
209
+ filteredPosArray = filter(posArray)
210
+ match = Array[md5val, 0, filteredPosArray]
212
211
  $log.debug "Match: " << match.inspect
213
212
  @matches << match
214
213
  else
@@ -222,7 +221,8 @@ module Cassiopee
222
221
  errors = seq.computeLevenshtein(s,edit)
223
222
  end
224
223
  if(errors>=0)
225
- match = Array[md5val, errors, posArray]
224
+ filteredPosArray = filter(posArray)
225
+ match = Array[md5val, errors, filteredPosArray]
226
226
  $log.debug "Match: " << match.inspect
227
227
  @matches << match
228
228
  end
@@ -234,6 +234,29 @@ module Cassiopee
234
234
  return @matches
235
235
  end
236
236
 
237
+ # Filter the array of positions with defined position filter
238
+
239
# Apply the position filter to one match's position array.
#
# posArray - array whose first element is the match length, followed by the
#            start positions of the match.
#
# Returns posArray itself when no filter is set (both bounds 0). Otherwise
# returns a new array holding the length entry plus every position that is
# >= the minimum and, when a maximum is set, <= the maximum. A maximum of 0
# means there is no upper bound.
def filter(posArray)
  # to_i tolerates bounds that are unset (nil) or were given as Strings.
  min = @min_position.to_i
  max = @max_position.to_i
  $log.debug("filter the position with #{min} and #{max}")
  return posArray if min == 0 && max == 0

  filteredArray = []
  posArray.each_with_index do |pos, i|
    # First elt of array is match length: always kept
    in_window = i == 0 || (pos >= min && (max == 0 || pos <= max))
    filteredArray << pos if in_window
  end
  filteredArray
end
258
+
259
+
237
260
  # Extract a suffix from the suffix file based on md5 match
238
261
 
239
262
  def extractSuffix(start,len)
@@ -283,6 +306,17 @@ module Cassiopee
283
306
  maxlen = @sequence.length
284
307
  end
285
308
 
309
+ if(!use_store)
310
+ minpos = @min_position
311
+ if(@max_position==0)
312
+ maxpos = @sequence.length
313
+ else
314
+ maxpos = @max_position
315
+ end
316
+ else
317
+ minpos = 0
318
+ maxpos = @sequence.length - minlen
319
+ end
286
320
 
287
321
  suffixlen = nil
288
322
  $log.info('Start indexing')
@@ -322,11 +356,15 @@ module Cassiopee
322
356
  next
323
357
  end
324
358
  changed = true
325
- (0..(s.length-maxlen)).each do |j|
359
+ (minpos..(maxpos)).each do |j|
360
+ # if position+length longer than sequence length, skip it
361
+ if(j+i>=@sequence.length)
362
+ next
363
+ end
326
364
  @suffix = s[j,i]
327
365
  @suffixmd5 = Digest::MD5.hexdigest(@suffix)
328
366
  @position = j
329
- #$log.debug("add "+@suffix+" at pos "+@position.to_s)
367
+ $log.debug("add "+@suffix+" at pos "+@position.to_s)
330
368
  nbSuffix += addSuffix(@suffixmd5, @position,i)
331
369
  end
332
370
  $log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cassiopee
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 0
10
- version: 0.1.0
9
+ - 1
10
+ version: 0.1.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Olivier Sallou
@@ -17,8 +17,23 @@ cert_chain: []
17
17
 
18
18
  date: 2011-09-04 00:00:00 +02:00
19
19
  default_executable:
20
- dependencies: []
21
-
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: text
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 0
32
+ - 2
33
+ - 0
34
+ version: 0.2.0
35
+ type: :runtime
36
+ version_requirements: *id001
22
37
  description: Cassiopee indexes a String and provides methods to search for exact or approximate matches using Hamming and/or edit distance.
23
38
  email: olivier.sallou@gmail.com
24
39
  executables: []
@@ -33,6 +48,7 @@ files:
33
48
  - LICENSE
34
49
  - demo.rb
35
50
  - lib/cassiopee.rb
51
+ - bin/cassie.rb
36
52
  - tests/test-suite.rb
37
53
  has_rdoc: true
38
54
  homepage: https://github.com/osallou/cassiopee