cassiopee 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7) hide show
  1. data/Changelog +1 -0
  2. data/LICENSE +1 -0
  3. data/README +2 -0
  4. data/demo.rb +31 -0
  5. data/lib/cassiopee.rb +411 -0
  6. data/tests/test-suite.rb +38 -0
  7. metadata +72 -0
data/Changelog ADDED
@@ -0,0 +1 @@
1
+ v0.1.0 : First version
data/LICENSE ADDED
@@ -0,0 +1 @@
1
+ Software is distributed under LGPL v3
data/README ADDED
@@ -0,0 +1,2 @@
1
+ Search an exact or approximate word (hamming or edit distance) in a string.
2
+ Support cache mechanism and REST interface.
data/demo.rb ADDED
@@ -0,0 +1,31 @@
1
# Demonstration script for the Cassiopee gem: indexes a short string, runs
# exact and approximate searches and exercises the String distance helpers.
require File.join(File.dirname(__FILE__), 'lib/cassiopee')
require 'rubygems'
require 'logger'

# Instantiate a new crawler; uncomment the next line to keep the suffix
# positions in a file between searches.
crawler = Cassiopee::Crawler.new
#crawler.use_store = true

# String to index
crawler.indexString('sallou sallu')
# Search pattern in indexed string
crawler.searchExact('llo')
# Search it again, this time reusing the suffixes already held in memory
crawler.searchExact('llo')


# Any String can be extended to use the match algorithms directly
test = "my string"
test.extend(Cassiopee)
test.computeDistance('test', 0, 0)
puts "Hamming: #{test.computeHamming("my strigg", 1)}"

puts "Levenshtein: #{test.computeLevenshtein("mystriigg", 3)}"

# Approximate search; a negative value selects edit distance (here 1)
crawler.searchApproximate("llu", -1)

# Go through matches until the iterator is exhausted
until (match = crawler.next).nil?
  puts "got a match #{match.inspect}"
end
data/lib/cassiopee.rb ADDED
@@ -0,0 +1,411 @@
1
+ require 'digest/md5'
2
+ require 'logger'
3
+ require 'zlib'
4
+
5
+ module Cassiopee
6
+
7
# Compute the Hamming or the edit distance between self and pattern.
# Meant to be mixed into a String (String#extend).
#
# pattern:: String compared with self
# hamming:: maximum number of substitutions, used when edit is 0
# edit::    maximum edit distance; any value other than 0 selects Levenshtein
#
# Returns the distance, or -1 if the selected maximum is exceeded.
def computeDistance(pattern, hamming, edit)
  return computeLevenshtein(pattern, edit) unless edit == 0

  computeHamming(pattern, hamming)
end
18
+
19
# Count the substitutions needed to turn self into pattern.
# Meant to be mixed into a String (String#extend).
#
# The pattern is lower-cased before comparison; self is compared as is.
# When both strings do not have the same length, every position that exists
# in only one of them counts as a mismatch, so a longer pattern can no longer
# be reported as an exact match of a shorter string.
#
# pattern:: String compared with self
# hamming:: maximum number of mismatches accepted (converted with to_i)
#
# Returns the number of mismatches, or -1 as soon as the maximum is exceeded.
def computeHamming(pattern, hamming)
  pattern = pattern.downcase
  max_errors = hamming.to_i
  span = [length, pattern.length].max
  nberr = 0
  span.times do |idx|
    next if pattern[idx] == self[idx]

    nberr += 1
    # Stop early: the caller only needs to know that the limit is exceeded
    return -1 if nberr > max_errors
  end
  nberr
end
36
+
37
# Compute the edit (Levenshtein) distance between self and pattern.
# Meant to be mixed into a String (String#extend).
#
# The pattern is lower-cased before comparison; self is compared as is.
# Only two rows of the dynamic-programming matrix are kept: the row of the
# previous character of self and the row being filled.
#
# pattern:: String compared with self
# edit::    maximum distance accepted
#
# Returns the distance, or -1 if it is greater than edit.
def computeLevenshtein(pattern, edit)
  pattern = pattern.downcase
  width = pattern.length
  # Distance between the empty prefix of self and each prefix of pattern
  previous = (0..width).to_a
  (1..length).each do |i|
    # Column 0: dropping the first i characters of self
    current = [i]
    (1..width).each do |j|
      # i indexes self and j indexes pattern; mixing them up yields wrong
      # results as soon as both strings have different lengths
      cost = self[i - 1] == pattern[j - 1] ? 0 : 1
      current << [previous[j - 1] + cost, # substitution or match
                  previous[j] + 1,        # character of self dropped
                  current[j - 1] + 1].min # character of pattern inserted
    end
    previous = current
  end
  distance = previous[width]
  distance > edit ? -1 : distance
end
80
+
81
+ # Base class to index and search through a string
82
+
83
+ class Crawler
84
+
85
+ # Use alphabet ambiguity (dna/rna) in search
86
+ attr_accessor :useAmbiguity
87
+ # Suffix files name/path
88
+ attr_accessor :file_suffix
89
+ # Max number of threads to use (not yet used)
90
+ attr_accessor :maxthread
91
+ # Use persistent suffix file ?
92
+ attr_accessor :use_store
93
+
94
+ FILE_SUFFIX_EXT = ".sfx"
95
+ FILE_SUFFIX_POS = ".sfp"
96
+
97
+ SUFFIXLEN = 'suffix_length'
98
+
99
+ $maxthread = 1
100
+
101
+ $log = Logger.new(STDOUT)
102
+ $log.level = Logger::DEBUG
103
+
104
# Set up a crawler with its default settings and an empty search state.
def initialize
  # Settings exposed through the attribute accessors
  @useAmbiguity, @use_store = false, false
  @file_suffix = "crawler"

  # Scratch values describing the suffix currently being indexed
  @suffix = @suffixmd5 = nil
  @position = 0

  # Suffix index kept in memory
  @suffixes = {}

  # Result of the last search and cursor used to iterate over it
  @matches = nil
  @curmatch = 0

  # Sequence being searched, set when a string or a file is indexed
  @sequence = nil
end
120
+
121
# Drop every suffix held in memory by replacing the index with an empty Hash.
def clear
  @suffixes = {}
end
126
+
127
# Change the level of the logger stored in the global $log.
#
# level:: new level, e.g. Logger::ERROR
def setLogLevel(level)
  logger = $log
  logger.level = level
end
132
+
133
# Index an input file.
#
# The file content is read through readSequence and becomes the sequence
# searched by the crawler. Suffixes cached in memory are dropped because they
# describe the previously indexed sequence; keeping them would make later
# searches report positions of the old content.
# NOTE(review): the suffix position file written when use_store is enabled is
# not invalidated here.
#
# f:: path of the file to index
#
# Returns the sequence that was read.
def indexFile(f)
  @sequence = readSequence(f)
  clear
  @sequence
end
143
+
144
# Index an input string.
#
# The string becomes the sequence searched by the crawler and is written to
# the suffix file (file_suffix + FILE_SUFFIX_EXT). Suffixes cached in memory
# are dropped because they describe the previously indexed sequence; keeping
# them would make later searches report positions of the old content.
# NOTE(review): the suffix position file written when use_store is enabled is
# not invalidated here.
#
# s:: String to index
def indexString(s)
  @sequence = s
  clear
  File.open(@file_suffix + FILE_SUFFIX_EXT, 'w') do |data|
    data.puts(@sequence)
  end
end
154
+
155
# Search the exact occurrences of a pattern in the indexed sequence.
#
# The pattern is lower-cased, the suffixes of the pattern length are indexed
# if needed, then the MD5 of the pattern is looked up directly in the suffix
# index (keys are unique, so at most one entry can match).
# The match iterator is rewound so that #next starts with the new result.
#
# pattern:: String to search
#
# Returns the Array of matches, each one being [md5, 0, positions] where
# positions holds the suffix length followed by the positions found.
def searchExact(pattern)
  pattern = pattern.downcase
  parseSuffixes(@sequence, pattern.length, pattern.length)

  @matches = Array.new
  @curmatch = 0
  # MD5 = 128 bits, cheaper to compare than large strings
  matchmd5 = Digest::MD5.hexdigest(pattern)
  posArray = @suffixes[matchmd5]
  unless posArray.nil?
    match = [matchmd5, 0, posArray]
    $log.debug "Match: " << match.inspect
    @matches << match
  end
  @matches
end
176
+
177
# Search the approximate occurrences of a string in the indexed sequence.
#
# * edit == 0: same as searchExact
# * edit > 0: Hamming distance, at most edit substitutions; only suffixes of
#   the same length as s are considered
# * edit < 0: Levenshtein distance (insertion, deletion, substitution) with
#   at most -edit errors; suffix lengths from s.length + edit to
#   s.length - edit are considered
#
# Suffixes that cannot be read back from the suffix file are skipped, and the
# match iterator is rewound so that #next starts with the new result.
#
# s::    String to search
# edit:: signed maximum number of errors, see above
#
# Returns the Array of matches, each one being [md5, errors, positions] where
# positions holds the suffix length followed by the positions found.
def searchApproximate(s, edit)
  return searchExact(s) if edit == 0

  useHamming = edit > 0
  if useHamming
    minmatchsize = maxmatchsize = s.length
  else
    edit = -edit
    minmatchsize = s.length - edit
    maxmatchsize = s.length + edit
  end

  parseSuffixes(@sequence, minmatchsize, maxmatchsize)

  matchmd5 = Digest::MD5.hexdigest(s)
  @matches = Array.new
  @curmatch = 0

  @suffixes.each do |md5val, posArray|
    # This key stores the list of indexed lengths, not a suffix
    next if md5val == SUFFIXLEN

    if md5val == matchmd5
      errors = 0
    else
      next unless posArray[0] >= minmatchsize && posArray[0] <= maxmatchsize

      # posArray is [length, first position, ...]: read the suffix back
      seq = extractSuffix(posArray[1], posArray[0])
      # Unreadable suffix: extending nil would add the methods to nil itself
      next if seq.nil?

      seq.extend(Cassiopee)
      errors = useHamming ? seq.computeHamming(s, edit) : seq.computeLevenshtein(s, edit)
      next if errors < 0
    end

    match = [md5val, errors, posArray]
    $log.debug "Match: " << match.inspect
    @matches << match
  end

  @matches
end
236
+
237
# Extract a suffix from the suffix file (file_suffix + FILE_SUFFIX_EXT).
#
# The file is opened with the block form so that it is closed even when
# seeking or reading fails.
#
# start:: offset of the suffix in the file
# len::   number of bytes to read
#
# Returns the String read (nil when start is at or past the end of the file),
# or nil after printing the error when the file cannot be read.
def extractSuffix(start, len)
  File.open(@file_suffix + FILE_SUFFIX_EXT, "r") do |file|
    file.pos = start
    file.read(len)
  end
rescue => err
  puts "Exception: #{err}"
  nil
end
252
+
253
# Iterate over the matches of the last search.
#
# Returns the next match, or nil once every match has been returned; the
# cursor is then rewound, so the following call starts again with the first
# match. Also returns nil when no search has been run yet.
def next
  if @matches && @curmatch < @matches.length
    match = @matches[@curmatch]
    @curmatch += 1
    match
  else
    @curmatch = 0
    nil
  end
end
264
+
265
# Describe the crawler as a String holding the number of matches of the last
# search (0 when no search has been run yet).
#
# The text is returned instead of being printed so that puts, inspect-like
# helpers and string interpolation behave as expected, and the count is
# interpolated: appending an Integer with << would add the character having
# that code point rather than the number.
def to_s
  count = @matches ? @matches.length : 0
  "{ matches: \"#{count}\" }"
end
268
+
269
+ private
270
+
271
# Index the suffixes of the input string for a range of lengths.
#
# * lengths already present in memory are not indexed again
# * when use_store is set and a requested length is missing from memory, the
#   suffix position file (file_suffix + FILE_SUFFIX_POS) is loaded first
# * when use_store is set and new lengths were indexed, the suffix position
#   file is rewritten (gzipped Marshal dump of the index)
#
# s::      String whose suffixes are indexed
# minlen:: smallest suffix length, raised to 1 when not positive
# maxlen:: largest suffix length, capped to the length of the sequence
def parseSuffixes(s, minlen, maxlen)
  # Controls
  minlen = 1 if minlen <= 0
  maxlen = @sequence.length if maxlen > @sequence.length
  lengths = maxlen.downto(minlen).to_a

  $log.info('Start indexing')

  # Does the Hash in memory already contain every searched length?
  suffixlen = @suffixes.nil? ? nil : @suffixes[SUFFIXLEN]
  loaded = !suffixlen.nil? && !suffixlen.empty? &&
           lengths.all? { |len| suffixlen.include?(len) }

  if @use_store
    if loaded
      $log.debug('already in memory, skip file loading')
    else
      @suffixes = loadSuffixes(@file_suffix + FILE_SUFFIX_POS)
      suffixlen = @suffixes[SUFFIXLEN]
    end
  end

  nbSuffix = 0
  changed = false

  # Index the lengths between maxlen and minlen that are still missing
  lengths.each do |i|
    $log.debug("parse for length #{i}")
    if suffixlen && suffixlen.include?(i)
      $log.debug("length #{i} already parsed")
      next
    end
    changed = true
    # The last start position depends on the current length, not on maxlen,
    # otherwise the shorter suffixes close to the end are never indexed
    (0..(s.length - i)).each do |j|
      @suffix = s[j, i]
      @suffixmd5 = Digest::MD5.hexdigest(@suffix)
      @position = j
      nbSuffix += addSuffix(@suffixmd5, @position, i)
    end
    $log.debug("Nb suffix found: #{nbSuffix} for length #{i}")
  end

  if @use_store && changed
    $log.info("Store suffixes")
    # Block form closes the gzip stream and the file even if writing fails
    Zlib::GzipWriter.open(@file_suffix + FILE_SUFFIX_POS) do |gz|
      gz.write(Marshal.dump(@suffixes))
    end
  end
  $log.info('End of indexing')
end
346
+
347
+
348
# Add a suffix occurrence to the in-memory index.
#
# Each entry of the index maps the MD5 of a suffix to an Array whose first
# element is the suffix length, followed by every position found. The entry
# stored under SUFFIXLEN lists the lengths present in the index; a length is
# recorded there only once instead of once per distinct suffix.
#
# md5val::   MD5 hex digest of the suffix
# position:: position of the suffix in the sequence
# len::      length of the suffix
#
# Returns 1, the number of occurrences added.
def addSuffix(md5val, position, len)
  if @suffixes.has_key?(md5val)
    # Known suffix: only record the new position
    @suffixes[md5val] << position
  else
    # New suffix: first element is the length, then the position
    @suffixes[md5val] = [len, position]
    lengths = (@suffixes[SUFFIXLEN] ||= [])
    lengths << len unless lengths.include?(len)
  end
  1
end
366
+
367
# Read an input file and concatenate its content.
#
# Each line is lower-cased, stripped of its line ending and appended both to
# the returned sequence and to the suffix file
# (file_suffix + FILE_SUFFIX_EXT). Nothing is written between the lines, so
# an offset in the suffix file equals a position in the sequence, which is
# what extractSuffix relies on when it seeks by position.
# The suffix file is truncated when opened for writing, so no explicit
# deletion is needed, and both files are closed even when an error occurs.
#
# s:: path of the file to read
#
# Returns the sequence read; on error the exception is printed and whatever
# was read so far is returned (best effort).
def readSequence(s)
  $log.debug('read input sequence')
  sequence = ''
  begin
    # Open the input first: a missing input must not wipe the suffix file
    File.open(s, "r") do |file|
      File.open(@file_suffix + FILE_SUFFIX_EXT, 'w') do |data|
        file.each_line do |line|
          input = line.downcase.chomp
          sequence << input
          data.write(input)
        end
      end
    end
  rescue => err
    puts "Exception: #{err}"
  end
  $log.debug('data file created')
  sequence
end
393
+
394
# Load a suffix position file in memory.
#
# The file is expected to hold a gzipped Marshal dump of the suffix index; a
# file that is not in gzip format is read as a plain Marshal dump instead.
# Files are opened with the block form so that they are always closed, and
# errors other than the gzip format error are raised to the caller.
#
# SECURITY: Marshal.load can instantiate arbitrary objects; only load suffix
# position files produced by this library.
#
# file_name:: path of the suffix position file
#
# Returns the loaded index, or an empty Hash when the file does not exist.
def loadSuffixes(file_name)
  return Hash.new unless File.exist?(file_name)

  raw = begin
    Zlib::GzipReader.open(file_name) { |gz| gz.read }
  rescue Zlib::GzipFile::Error
    File.open(file_name, 'rb') { |plain| plain.read }
  end
  Marshal.load(raw)
end
408
+
409
+ end
410
+
411
+ end
@@ -0,0 +1,38 @@
1
+ require File.join(File.dirname(__FILE__), '../lib/cassiopee')
2
+ require 'rubygems'
3
+ require 'logger'
4
+ require 'test/unit'
5
+
6
# Functional tests of Cassiopee::Crawler: one test per search mode, all run
# against the same sample text.
class TestCrawler < Test::Unit::TestCase

  SAMPLE = 'my sample example'

  # 'ampl' occurs twice in the sample, reported as a single match entry
  def test_exactsearch
    crawler = Cassiopee::Crawler.new
    crawler.setLogLevel(Logger::ERROR)
    crawler.indexString(SAMPLE)
    found = crawler.searchExact('ampl')
    assert_equal(1, found.length)
    # Minus 1, because the first element of the positions is the match length
    positions = crawler.next[2]
    assert_equal(2, positions.length - 1)
  end

  # A positive value selects Hamming distance
  def test_hammingsearch
    crawler = Cassiopee::Crawler.new
    crawler.indexString(SAMPLE)
    found = crawler.searchApproximate('ebampl', 1)
    assert_equal(1, found.length)
  end

  # A negative value selects edit distance
  def test_levenshteinsearch
    crawler = Cassiopee::Crawler.new
    crawler.indexString(SAMPLE)
    found = crawler.searchApproximate('ebampl', -1)
    assert_equal(1, found.length)
  end

end
36
+
37
+
38
+
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cassiopee
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Olivier Sallou
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-09-04 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Cassiopee indexes a String and provides methods to search for exact or approximate matches using Hamming and/or edit distance.
23
+ email: olivier.sallou@gmail.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - README
32
+ - Changelog
33
+ - LICENSE
34
+ - demo.rb
35
+ - lib/cassiopee.rb
36
+ - tests/test-suite.rb
37
+ has_rdoc: true
38
+ homepage: https://github.com/osallou/cassiopee
39
+ licenses:
40
+ - LGPL-3
41
+ post_install_message:
42
+ rdoc_options:
43
+ - --main lib/cassiopee.rb
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ none: false
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ hash: 3
52
+ segments:
53
+ - 0
54
+ version: "0"
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.7
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Cassiopee indexes strings and provides exact or approximate search.
71
+ test_files:
72
+ - tests/test-suite.rb