cassiopee 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (7) hide show
  1. data/Changelog +1 -0
  2. data/LICENSE +1 -0
  3. data/README +2 -0
  4. data/demo.rb +31 -0
  5. data/lib/cassiopee.rb +411 -0
  6. data/tests/test-suite.rb +38 -0
  7. metadata +72 -0
data/Changelog ADDED
@@ -0,0 +1 @@
1
+ v0.1.0 : First version
data/LICENSE ADDED
@@ -0,0 +1 @@
1
+ Software is distributed under LGPL v3
data/README ADDED
@@ -0,0 +1,2 @@
1
+ Search an exact or approximate word (hamming or edit distance) in a string.
2
+ Supports a cache mechanism and a REST interface.
data/demo.rb ADDED
@@ -0,0 +1,31 @@
1
+ require File.join(File.dirname(__FILE__), 'lib/cassiopee')
2
+ require 'rubygems'
3
+ require 'logger'
4
+
5
# Instantiate a new crawler
crawler = Cassiopee::Crawler.new
#crawler.use_store = true

# Index a string, then look for an exact pattern in it
crawler.indexString('sallou sallu')
crawler.searchExact('llo')
# Same search again: the suffixes indexed by the first call are reused
crawler.searchExact('llo')


# Mix the distance algorithms into a plain String
test = "my string"
test.extend(Cassiopee)
test.computeDistance('test', 0, 0)
hamming_result = test.computeHamming("my strigg", 1)
puts "Hamming: #{hamming_result}"

levenshtein_result = test.computeLevenshtein("mystriigg", 3)
puts "Levenshtein: #{levenshtein_result}"

# Approximate search; a negative value selects the edit distance (here 1)
crawler.searchApproximate("llu", -1)

# Walk through the matches until the iterator is exhausted
loop do
  match = crawler.next
  break if match.nil?

  puts "got a match #{match.inspect}"
end
data/lib/cassiopee.rb ADDED
@@ -0,0 +1,411 @@
1
+ require 'digest/md5'
2
+ require 'logger'
3
+ require 'zlib'
4
+
5
+ module Cassiopee
6
+
7
# Compute the Hamming or the edit (Levenshtein) distance between this
# String and +pattern+. The module is meant to be mixed into a String,
# e.g. <tt>str.extend(Cassiopee)</tt>.
#
# * +pattern+ - string to compare with
# * +hamming+ - maximum number of substitutions, used when +edit+ is 0
# * +edit+    - maximum edit distance; a non-zero value selects Levenshtein
#
# +edit+ is coerced with +to_i+ (as computeHamming does for +hamming+), so
# nil or a numeric string such as "0" selects the right algorithm instead
# of failing later in an Integer comparison.
#
# Returns the distance, or -1 if the maximum is exceeded.
def computeDistance(pattern, hamming, edit)
  max_edit = edit.to_i
  return computeHamming(pattern, hamming) if max_edit == 0

  computeLevenshtein(pattern, max_edit)
end
18
+
19
# Count the substitutions between this String and +pattern+.
# The module is meant to be mixed into a String.
#
# * +pattern+ - string to compare with; it is downcased before comparing
#   (the receiver is compared as is)
# * +hamming+ - maximum number of mismatches allowed, coerced with +to_i+
#
# Positions are compared up to the longer of the two strings, so every
# character without a counterpart counts as one mismatch. Comparing only
# the receiver's length would silently ignore the tail of a longer pattern.
#
# Returns the number of mismatches, or -1 as soon as the maximum is exceeded.
def computeHamming(pattern, hamming)
  pattern = pattern.downcase
  max_errors = hamming.to_i
  compared = [self.length, pattern.length].max
  nberr = 0
  compared.times do |idx|
    next if pattern[idx] == self[idx]

    nberr += 1
    return -1 if nberr > max_errors
  end
  nberr
end
36
+
37
# Compute the edit (Levenshtein) distance between this String and +pattern+.
# The module is meant to be mixed into a String.
#
# * +pattern+ - string to compare with; it is downcased before comparing
#   (the receiver is compared as is)
# * +edit+    - maximum distance allowed, coerced with +to_i+
#
# Only two rows of the dynamic-programming matrix are kept: +previous+
# holds the costs for the first i-1 characters of the receiver and
# +current+ the costs for the first i characters.
#
# Returns the distance, or -1 if the maximum is exceeded.
def computeLevenshtein(pattern, edit)
  pattern = pattern.downcase
  max_edits = edit.to_i
  # Row 0: turning an empty prefix into pattern[0, j] costs j insertions.
  previous = (0..pattern.length).to_a

  (1..self.length).each do |i|
    # Column 0: dropping the first i characters of the receiver.
    current = [i]
    (1..pattern.length).each do |j|
      # i walks the receiver and j walks the pattern; indexing them the
      # other way round gives wrong costs as soon as the lengths differ.
      substitution = previous[j - 1] + (self[i - 1] == pattern[j - 1] ? 0 : 1)
      deletion = previous[j] + 1
      insertion = current[j - 1] + 1
      current << [substitution, deletion, insertion].min
    end
    # The minimum of a row never decreases from one row to the next, so
    # the final distance cannot come back under the limit.
    return -1 if current.min > max_edits

    previous = current
  end

  distance = previous.last
  distance > max_edits ? -1 : distance
end
80
+
81
# Base class to index and search through a string.
#
# A sequence is loaded with indexString or indexFile. Searches then index,
# on demand, every substring ("suffix") of the requested lengths in a Hash:
#
#   md5 of the substring => [length, position, position, ...]
#
# plus one bookkeeping entry, SUFFIXLEN => [lengths already indexed].
# A match is an Array [md5, number_of_errors, [length, position, ...]].
class Crawler

  # Use alphabet ambiguity (dna/rna) in search (not read by this class)
  attr_accessor :useAmbiguity
  # Suffix files name/path (FILE_SUFFIX_EXT / FILE_SUFFIX_POS are appended)
  attr_accessor :file_suffix
  # Max number of threads to use (not yet used)
  attr_accessor :maxthread
  # Use persistent suffix file ?
  attr_accessor :use_store

  # Extension of the file holding the indexed sequence
  FILE_SUFFIX_EXT = ".sfx"
  # Extension of the file holding the marshalled, gzipped suffix Hash
  FILE_SUFFIX_POS = ".sfp"

  # Key of the bookkeeping entry listing the indexed lengths
  SUFFIXLEN = 'suffix_length'

  $maxthread = 1

  $log = Logger.new(STDOUT)
  $log.level = Logger::DEBUG

  def initialize
    @useAmbiguity = false
    @file_suffix = "crawler"

    # Scratch state describing the last substring indexed
    @suffix = nil
    @suffixmd5 = nil
    @position = 0

    @suffixes = Hash.new

    @matches = nil
    @curmatch = 0
    @use_store = false

    @sequence = nil
  end

  # Clear suffixes in memory
  def clear
    @suffixes = Hash.new
  end

  # Set Logger level
  def setLogLevel(level)
    $log.level = level
  end

  # Index an input file: lines are downcased, chomped and concatenated.
  #
  # Suffixes held in memory belong to the previous sequence and are dropped.
  # NOTE(review): a position file written with use_store for another
  # sequence is not invalidated here.
  def indexFile(f)
    resetSearchState
    @sequence = readSequence(f)
  end

  # Index an input string.
  #
  # The string is downcased, like file input, because search patterns are
  # always downcased. Suffixes held in memory belong to the previous
  # sequence and are dropped.
  def indexString(s)
    resetSearchState
    @sequence = s.downcase
    File.open(@file_suffix + FILE_SUFFIX_EXT, 'w') do |data|
      data.puts(@sequence)
    end
  end

  # Search exact match.
  #
  # Returns the Array of matches (empty, or a single match listing every
  # position of +pattern+).
  def searchExact(pattern)
    pattern = pattern.downcase
    parseSuffixes(@sequence, pattern.length, pattern.length)

    @curmatch = 0
    @matches = Array.new
    # Substrings are keyed by their md5, so a direct lookup is enough.
    matchmd5 = Digest::MD5.hexdigest(pattern)
    positions = @suffixes[matchmd5]
    if positions
      match = Array[matchmd5, 0, positions]
      $log.debug "Match: #{match.inspect}"
      @matches << match
    end
    @matches
  end

  # Search an approximate string
  #
  # * edit == 0 : exact search
  # * edit > 0  : Hamming distance (substitutions only), at most +edit+ errors
  # * edit < 0  : Levenshtein distance (insertion, deletion, substitution),
  #   at most +edit.abs+ errors
  #
  # Returns the Array of matches.
  def searchApproximate(s, edit)
    return searchExact(s) if edit == 0

    s = s.downcase
    useHamming = edit > 0
    if useHamming
      minmatchsize = s.length
      maxmatchsize = s.length
    else
      edit = -edit
      minmatchsize = s.length - edit
      maxmatchsize = s.length + edit
    end

    parseSuffixes(@sequence, minmatchsize, maxmatchsize)

    matchmd5 = Digest::MD5.hexdigest(s)

    @curmatch = 0
    @matches = Array.new

    @suffixes.each do |md5val, posArray|
      next if md5val == SUFFIXLEN

      if md5val == matchmd5
        errors = 0
      else
        next unless posArray[0] >= minmatchsize && posArray[0] <= maxmatchsize

        # Take the candidate from the in-memory sequence the positions were
        # computed on; offsets in the suffix file are byte offsets and need
        # not line up with character positions.
        seq = @sequence[posArray[1], posArray[0]]
        next if seq.nil?

        seq.extend(Cassiopee)
        errors = if useHamming
                   seq.computeHamming(s, edit)
                 else
                   seq.computeLevenshtein(s, edit)
                 end
        next if errors < 0
      end

      match = Array[md5val, errors, posArray]
      $log.debug "Match: #{match.inspect}"
      @matches << match
    end

    @matches
  end

  # Extract a suffix of +len+ bytes at offset +start+ from the suffix file.
  #
  # Returns nil (after printing the error) if the file cannot be read.
  def extractSuffix(start, len)
    File.open(@file_suffix + FILE_SUFFIX_EXT, "r") do |file|
      file.pos = start
      file.read(len)
    end
  rescue => err
    puts "Exception: #{err}"
    nil
  end

  # Iterates over matches.
  #
  # Returns the next match of the last search, or nil once exhausted (the
  # iteration then restarts from the first match) or if no search was run.
  def next
    if @matches && @curmatch < @matches.length
      @curmatch += 1
      return @matches[@curmatch - 1]
    end
    @curmatch = 0
    nil
  end

  # Short description holding the number of matches of the last search.
  def to_s
    count = @matches ? @matches.length : 0
    "{ matches: \"#{count}\" }"
  end

  private

  # Forget everything derived from the previously indexed sequence.
  def resetSearchState
    clear
    @matches = nil
    @curmatch = 0
  end

  # Parse input string
  #
  # * indexes every substring of +s+ whose length is within minlen..maxlen
  #   and not indexed yet
  # * with use_store, loads / saves the suffix position file
  def parseSuffixes(s, minlen, maxlen)
    # Controls
    minlen = 1 if minlen <= 0
    maxlen = @sequence.length if maxlen > @sequence.length

    $log.info('Start indexing')

    # Does the Hash in memory already contain the searched lengths?
    suffixlen = @suffixes.nil? ? nil : @suffixes[SUFFIXLEN]
    loaded = !suffixlen.nil? && !suffixlen.empty? &&
             (minlen..maxlen).all? { |len| suffixlen.include?(len) }

    if @use_store
      if loaded
        $log.debug('already in memory, skip file loading')
      else
        @suffixes = loadSuffixes(@file_suffix + FILE_SUFFIX_POS)
        suffixlen = @suffixes[SUFFIXLEN]
      end
    end

    nbSuffix = 0
    changed = false

    # Load suffix between maxlen and minlen
    maxlen.downto(minlen) do |i|
      $log.debug("parse for length #{i}")
      if suffixlen && suffixlen.include?(i)
        $log.debug('length ' + i.to_s + 'already parsed')
        next
      end
      changed = true
      # The last substring of length i starts at s.length - i; bounding by
      # maxlen would skip the trailing substrings of the shorter lengths.
      (0..(s.length - i)).each do |j|
        @suffix = s[j, i]
        @suffixmd5 = Digest::MD5.hexdigest(@suffix)
        @position = j
        nbSuffix += addSuffix(@suffixmd5, @position, i)
      end
      $log.debug("Nb suffix found: #{nbSuffix} for length #{i}")
    end

    if @use_store && changed
      $log.info("Store suffixes")
      Zlib::GzipWriter.open(@file_suffix + FILE_SUFFIX_POS) do |sfxpos|
        sfxpos.write(Marshal.dump(@suffixes))
      end
    end
    $log.info('End of indexing')
  end

  # Add a suffix in Hashmap.
  #
  # Returns 1 (one position recorded).
  def addSuffix(md5val, position, len)
    if @suffixes.has_key?(md5val)
      # Add position
      @suffixes[md5val] << position
    else
      # First elt is size of elt
      @suffixes[md5val] = Array[len, position]
      # Record each indexed length once, not once per distinct suffix.
      lengths = (@suffixes[SUFFIXLEN] ||= Array.new)
      lengths << len unless lengths.include?(len)
    end
    1
  end

  # Read input file, and concat content.
  #
  # Each line is downcased and chomped. The same text is written to the
  # suffix file without line breaks, so that offsets in that file follow
  # the positions of the returned sequence. On error the exception is
  # printed and what was read so far is returned.
  def readSequence(s)
    $log.debug('read input sequence')
    sequence = String.new
    begin
      File.open(s, "r") do |file|
        File.open(@file_suffix + FILE_SUFFIX_EXT, 'w') do |data|
          file.each_line do |line|
            input = line.downcase.chomp
            sequence << input
            data.write(input)
          end
        end
      end
    rescue => err
      puts "Exception: #{err}"
    end
    $log.debug('data file created')
    sequence
  end

  # Load suffix position file in memory.
  #
  # The file is read as gzip, falling back to raw content when it is not in
  # gzip format. Returns an empty Hash if +file_name+ does not exist.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects; only load
  # position files written by this class.
  def loadSuffixes(file_name)
    return Hash.new unless File.exist?(file_name)

    raw = begin
      Zlib::GzipReader.open(file_name) { |gz| gz.read }
    rescue Zlib::GzipFile::Error
      File.open(file_name, 'rb') { |plain| plain.read }
    end
    Marshal.load(raw)
  end

end
410
+
411
+ end
@@ -0,0 +1,38 @@
1
+ require File.join(File.dirname(__FILE__), '../lib/cassiopee')
2
+ require 'rubygems'
3
+ require 'logger'
4
+ require 'test/unit'
5
+
6
# Functional tests of Cassiopee::Crawler on a small in-memory sentence.
class TestCrawler < Test::Unit::TestCase

  # An exact search returns a single match listing both occurrences.
  def test_exactsearch
    subject = Cassiopee::Crawler.new
    subject.setLogLevel(Logger::ERROR)
    subject.indexString('my sample example')
    found = subject.searchExact('ampl')
    assert_equal(1, found.length)
    first = subject.next
    # The first element of the position array is the match length,
    # the remaining ones are the positions.
    occurrences = first[2].length - 1
    assert_equal(2, occurrences)
  end

  # A positive limit selects the Hamming distance.
  def test_hammingsearch
    subject = Cassiopee::Crawler.new
    subject.indexString('my sample example')
    found = subject.searchApproximate('ebampl', 1)
    assert_equal(1, found.length)
  end

  # A negative limit selects the Levenshtein distance.
  def test_levenshteinsearch
    subject = Cassiopee::Crawler.new
    subject.indexString('my sample example')
    found = subject.searchApproximate('ebampl', -1)
    assert_equal(1, found.length)
  end

end
36
+
37
+
38
+
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cassiopee
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Olivier Sallou
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-09-04 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Cassiopee indexes one String and provides methods to search for exact or approximate matches with Hamming and/or edit distance.
23
+ email: olivier.sallou@gmail.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - README
32
+ - Changelog
33
+ - LICENSE
34
+ - demo.rb
35
+ - lib/cassiopee.rb
36
+ - tests/test-suite.rb
37
+ has_rdoc: true
38
+ homepage: https://github.com/osallou/cassiopee
39
+ licenses:
40
+ - LGPL-3
41
+ post_install_message:
42
+ rdoc_options:
43
+ - --main lib/cassiopee.rb
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ none: false
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ hash: 3
52
+ segments:
53
+ - 0
54
+ version: "0"
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.7
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Cassiopee index strings and provide exact or approximate search.
71
+ test_files:
72
+ - tests/test-suite.rb