cassiopee 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +7 -0
- data/lib/cassiopee.rb +319 -11
- data/tests/amb.map +2 -0
- data/tests/test-suite.rb +8 -0
- metadata +5 -4
data/Changelog
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v0.1.2 : 09/11 Olivier Sallou
|
2
|
+
add possibility to reload an "index" instead of using index method again
|
3
|
+
fix comment mngt (comments attribute)
|
4
|
+
add filter methods
|
5
|
+
add to_pos method to display results per position
|
6
|
+
add optimal methods
|
7
|
+
add ambiguity support
|
1
8
|
v0.1.1 : fix #1, add filter option
|
2
9
|
08/09/11 Olivier Sallou
|
3
10
|
v0.1.0 : First version
|
data/lib/cassiopee.rb
CHANGED
@@ -3,6 +3,7 @@ require 'logger'
|
|
3
3
|
require 'zlib'
|
4
4
|
require 'rubygems'
|
5
5
|
require 'text'
|
6
|
+
require 'text/util'
|
6
7
|
|
7
8
|
module Cassiopee
|
8
9
|
|
@@ -17,8 +18,36 @@ module Cassiopee
|
|
17
18
|
return computeLevenshtein(pattern,edit)
|
18
19
|
end
|
19
20
|
end
|
21
|
+
|
22
|
+
# Calculate the edit or hamming distance between String and pattern
|
23
|
+
# Extend a String
|
24
|
+
# Return -1 if max is reached
|
25
|
+
|
26
|
+
def computeAmbiguousDistance(pattern,hamming,edit,ambiguous)
  # A non-zero edit value selects the Levenshtein variant; otherwise the
  # Hamming variant is used with the hamming maximum.
  return computeLevenshteinAmbiguous(pattern,edit,ambiguous) unless edit==0
  computeHammingAmbiguous(pattern,hamming,ambiguous)
end
|
20
33
|
|
21
|
-
#
|
34
|
+
# Compute Hamming distance but using a mapping matrix of alphabet ambiguity
|
35
|
+
|
36
|
+
# Count the substitutions between this String and pattern, treating characters
# as equal when isAmbiguousEqual says so.
# * pattern: searched pattern, compared lowercased
# * hamming: maximum number of accepted errors (converted with to_i)
# * ambiguous: Hash of char => Array of equivalent chars, or nil
# Returns the number of errors, or -1 as soon as the maximum is exceeded.
def computeHammingAmbiguous(pattern,hamming,ambiguous)
  pattern = pattern.downcase
  max_errors = hamming.to_i
  nberr = 0
  (0...self.length).each do |idx|
    # A position beyond the end of the pattern has nothing to match against:
    # count it as an error instead of passing nil to isAmbiguousEqual.
    mismatch = idx >= pattern.length ||
               !isAmbiguousEqual(pattern[idx],self[idx],ambiguous)
    next unless mismatch
    nberr += 1
    return -1 if nberr > max_errors
  end
  return nberr
end
|
49
|
+
|
50
|
+
# Calculate number of substitution between string and pattern
|
22
51
|
# Extend a String
|
23
52
|
# Return -1 if max is reached
|
24
53
|
|
@@ -35,8 +64,9 @@ module Cassiopee
|
|
35
64
|
end
|
36
65
|
return nberr
|
37
66
|
end
|
38
|
-
|
39
|
-
|
67
|
+
|
68
|
+
|
69
|
+
|
40
70
|
# Calculate the edit distance between string and pattern
|
41
71
|
# Extend a String
|
42
72
|
# Return -1 if max is reached
|
@@ -53,12 +83,79 @@ module Cassiopee
|
|
53
83
|
return distance
|
54
84
|
|
55
85
|
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# Compute Levenshtein distance but using a mapping matrix of alphabet ambiguity
|
90
|
+
# Code comes from Text gem, Text::Levenshtein.distance, adapted for ambiguity comparison
|
91
|
+
|
92
|
+
# Levenshtein distance between this String and pattern, where two characters
# cost nothing when isAmbiguousEqual considers them equal.
# * pattern: searched pattern, compared lowercased
# * edit: maximum accepted distance (converted with to_i)
# * ambiguous: Hash of char => Array of equivalent chars, or nil
# Returns the distance, or -1 when it is greater than edit.
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
  pattern = pattern.downcase
  max_edit = edit.to_i

  # String#encoding when the runtime has it, $KCODE otherwise.
  encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
  unpack_rule = (encoding.to_s =~ /^U/i) ? 'U*' : 'C*'

  s = self.unpack(unpack_rule)
  t = pattern.unpack(unpack_rule)
  n = s.length
  m = t.length

  if n == 0 || m == 0
    # One side is empty: the distance is the length of the other side.
    x = (n == 0) ? m : n
  else
    d = (0..m).to_a
    x = nil
    (0...n).each do |i|
      e = i+1
      (0...m).each do |j|
        # Pattern char first: the ambiguity map is looked up by the first argument.
        cost = isAmbiguousEqual(t[j],s[i],ambiguous) ? 0 : 1
        x = [
          d[j+1] + 1, # insertion
          e + 1,      # deletion
          d[j] + cost # substitution
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end
  end

  return -1 if x > max_edit
  return x
end
|
133
|
+
|
134
|
+
|
135
|
+
# checks if 2 chars are equal with ambiguity rules
|
136
|
+
# * ambiguous is a Hash of char/Array of char mapping
|
137
|
+
|
138
|
+
# Tell whether char a matches char b under the ambiguity rules.
# * a: char whose equivalents are looked up in the map (anything responding to chr)
# * b: char compared against a and against a's equivalents
# * ambiguous: Hash of char => Array of equivalent chars, or nil for plain equality
# Returns true or false. A missing (nil) char never matches.
def isAmbiguousEqual(a,b,ambiguous)
  return false if a.nil? || b.nil?
  # A char always matches itself, even when its map entry does not list it.
  return true if a == b
  return false if ambiguous.nil?
  equivalents = ambiguous[a.chr]
  return false if equivalents.nil?
  equivalents.include?(b.chr)
end
|
56
153
|
|
57
154
|
# Base class to index and search through a string
|
58
155
|
|
59
156
|
class Crawler
|
60
157
|
|
61
|
-
# Use alphabet ambiguity (dna/rna) in search
|
158
|
+
# Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
|
62
159
|
attr_accessor :useAmbiguity
|
63
160
|
# Suffix files name/path
|
64
161
|
attr_accessor :file_suffix
|
@@ -66,9 +163,13 @@ module Cassiopee
|
|
66
163
|
attr_accessor :maxthread
|
67
164
|
# Use persistent suffix file ?
|
68
165
|
attr_accessor :use_store
|
166
|
+
# Array of comment characters to skip lines in input sequence file
|
167
|
+
attr_accessor :comments
|
69
168
|
|
70
169
|
@min_position = 0
|
71
170
|
@max_position = 0
|
171
|
+
|
172
|
+
@ambiguous = nil
|
72
173
|
|
73
174
|
FILE_SUFFIX_EXT = ".sfx"
|
74
175
|
FILE_SUFFIX_POS = ".sfp"
|
@@ -76,9 +177,10 @@ module Cassiopee
|
|
76
177
|
SUFFIXLEN = 'suffix_length'
|
77
178
|
|
78
179
|
$maxthread = 1
|
180
|
+
|
79
181
|
|
80
182
|
$log = Logger.new(STDOUT)
|
81
|
-
$log.level = Logger::
|
183
|
+
$log.level = Logger::INFO
|
82
184
|
|
83
185
|
def initialize
|
84
186
|
@useAmbiguity = false
|
@@ -95,7 +197,17 @@ module Cassiopee
|
|
95
197
|
@use_store = false
|
96
198
|
|
97
199
|
@sequence = nil
|
200
|
+
|
201
|
+
@comments = Array["#"]
|
98
202
|
end
|
203
|
+
|
204
|
+
# Shortcut for filterOptimal in "length" mode (type 0).
def filterLength
  return filterOptimal(0)
end
|
207
|
+
|
208
|
+
# Shortcut for filterOptimal in "cost" mode (type 1).
def filterCost
  return filterOptimal(1)
end
|
99
211
|
|
100
212
|
# Clear suffixes in memory
|
101
213
|
# If using use_store, clear the store too
|
@@ -137,6 +249,53 @@ module Cassiopee
|
|
137
249
|
@min_position = 0
|
138
250
|
@max_position = 0
|
139
251
|
end
|
252
|
+
|
253
|
+
|
254
|
+
# Load ambiguity rules from a file
|
255
|
+
# File format should be:
|
256
|
+
# * A=B,C
|
257
|
+
# D=E,F
|
258
|
+
# ...
|
259
|
+
|
260
|
+
# Read ambiguity rules from file f and enable ambiguity in searches.
# Each line is lowercased and parsed as key=value1,value2,...
# Blank lines and lines without a key or without '=' are ignored.
# * f: path of the rules file
# Sets @ambiguous (Hash of char => Array of chars) and @useAmbiguity = true.
# Logs an error and terminates the process with status 1 when f does not exist.
def loadAmbiguityFile(f)
  # File.exist? rather than File.exists?, which newer rubies no longer provide
  if(!File.exist?(f))
    $log.error("File #{f} does not exist")
    exit(1)
  end
  rules = Hash.new
  # Block form closes the file even if parsing raises
  File.open(f, "r") do |file|
    while (line = file.gets)
      definition = line.downcase.strip
      next if definition.empty?
      key, values = definition.split('=', 2)
      next if values.nil?
      key = key.strip
      next if key.empty?
      rules[key] = values.split(',').map { |v| v.strip }
    end
  end
  @ambiguous = rules
  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: #{@ambiguous.inspect}")
end
|
278
|
+
|
279
|
+
# Load sequence from a previous index command
|
280
|
+
|
281
|
+
# Reload the sequence from the file @file_suffix + FILE_SUFFIX_EXT instead of
# indexing again. Every line is lowercased, stripped of its line ending and
# appended to the sequence.
# Sets @sequence, calls clear and resets @min_position / @max_position to 0.
# On a read error, logs it and terminates the process with a failure status.
def loadIndex
  seq = ''
  begin
    # Block form closes the file even when reading raises
    File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
      while (line = file.gets)
        seq << line.downcase.chomp
      end
    end
  rescue => err
    $log.error("Exception: #{err}")
    # Non-zero status: a bare exit() would report success to the caller
    exit(1)
  end
  @sequence = seq
  clear()
  @min_position = 0
  @max_position = 0
end
|
140
299
|
|
141
300
|
# Filter matches to be between min and max start position
|
142
301
|
# If not using use_store, search speed is improved but existing indexes are cleared
|
@@ -153,6 +312,9 @@ module Cassiopee
|
|
153
312
|
# Search exact match
|
154
313
|
|
155
314
|
def searchExact(pattern)
|
315
|
+
if(@useAmbiguity)
|
316
|
+
return searchApproximate(pattern,0)
|
317
|
+
end
|
156
318
|
pattern = pattern.downcase
|
157
319
|
parseSuffixes(@sequence,pattern.length,pattern.length)
|
158
320
|
|
@@ -180,11 +342,11 @@ module Cassiopee
|
|
180
342
|
|
181
343
|
|
182
344
|
def searchApproximate(s,edit)
|
183
|
-
if(edit==0)
|
345
|
+
if(edit==0 && !@useAmbiguity)
|
184
346
|
return searchExact(s)
|
185
347
|
end
|
186
348
|
|
187
|
-
if(edit
|
349
|
+
if(edit>=0)
|
188
350
|
useHamming = true
|
189
351
|
minmatchsize = s.length
|
190
352
|
maxmatchsize = s.length
|
@@ -216,9 +378,17 @@ module Cassiopee
|
|
216
378
|
seq = extractSuffix(posArray[1],posArray[0])
|
217
379
|
seq.extend(Cassiopee)
|
218
380
|
if(useHamming)
|
219
|
-
|
381
|
+
if(@useAmbiguity && @ambiguous!=nil)
|
382
|
+
errors = seq.computeHammingAmbiguous(s,edit,@ambiguous)
|
383
|
+
else
|
384
|
+
errors = seq.computeHamming(s,edit)
|
385
|
+
end
|
220
386
|
else
|
221
|
-
|
387
|
+
if(@useAmbiguity && @ambigous!=nil)
|
388
|
+
errors = seq.computeLevenshteinAmbiguous(s,edit,@ambigous)
|
389
|
+
else
|
390
|
+
errors = seq.computeLevenshtein(s,edit)
|
391
|
+
end
|
222
392
|
end
|
223
393
|
if(errors>=0)
|
224
394
|
filteredPosArray = filter(posArray)
|
@@ -285,6 +455,34 @@ module Cassiopee
|
|
285
455
|
end
|
286
456
|
end
|
287
457
|
|
458
|
+
# Group the current matches by start position.
# Each entry of @matches is Array[md5val, errors, posArray], where posArray
# holds the match length first and then the start positions.
# Returns an Array of [position, Array of [length, errors]] sorted by position.
def to_pos
  positions = Hash.new
  @matches.each do |match|
    errors = match[2].nil? ? nil : match[1]
    len = match[2][0]
    # Everything after the first element is a start position
    starts = match[2][1..-1] || []
    starts.each do |pos|
      (positions[pos] ||= Array.new) << Array[len,errors]
    end
  end
  return positions.sort
end
|
485
|
+
|
288
486
|
def to_s
|
289
487
|
puts '{ matches: "' << @matches.length << '" }'
|
290
488
|
end
|
@@ -415,8 +613,20 @@ module Cassiopee
|
|
415
613
|
while (line = file.gets)
|
416
614
|
counter = counter + 1
|
417
615
|
input = line.downcase.chomp
|
418
|
-
|
419
|
-
|
616
|
+
skip = false
|
617
|
+
comments.each do |c|
|
618
|
+
$log.debug("skip line ?" << c << " == " << input[0])
|
619
|
+
if(input[0] == c[0])
|
620
|
+
# Line start with a comment char, skip it
|
621
|
+
$log.debug("skip line")
|
622
|
+
skip = true
|
623
|
+
break
|
624
|
+
end
|
625
|
+
end
|
626
|
+
if(!skip)
|
627
|
+
sequence << input
|
628
|
+
data.puts input
|
629
|
+
end
|
420
630
|
end
|
421
631
|
|
422
632
|
end
|
@@ -443,7 +653,105 @@ module Cassiopee
|
|
443
653
|
return obj
|
444
654
|
end
|
445
655
|
end
|
656
|
+
|
657
|
+
# Filter @matches to keep only the longest match or the match with the fewest errors for each start position
|
658
|
+
|
659
|
+
# Reduce @matches so that only one match wins per start position.
# * type: 0 keeps the longest match, any other value keeps the match with
#   the lowest error count
# Each entry of @matches is Array[md5val, errors, posArray], where posArray
# holds the match length first and then the start positions.
# On a tie the match met first is kept. A match that loses at any position is
# removed from @matches as a whole, because removal is done on its identifier.
# Returns the new @matches.
def filterOptimal(type)
  # Group candidate solutions [length, errors, id] by start position
  positions = Hash.new
  @matches.each do |match|
    len = match[2][0]
    (match[2][1..-1] || []).each do |pos|
      (positions[pos] ||= Array.new) << Array[len,match[1],match[0]]
    end
  end

  # Identifiers of the matches beaten at some position (Hash used as a set)
  matchtoremove = Hash.new
  positions.each do |pos,posmatch|
    optimal = nil
    best = nil
    posmatch.each_with_index do |solution,i|
      candidate = solution[2].to_s
      value = (type==0) ? solution[0] : solution[1]
      if(i==0)
        optimal = value
        best = candidate
        next
      end
      # length: strictly longer wins; cost: strictly fewer errors wins
      better = (type==0) ? (value.to_i>optimal.to_i) : (value<optimal)
      if(better)
        matchtoremove[best] = true
        optimal = value
        best = candidate
      else
        matchtoremove[candidate] = true
      end
    end
  end

  @matches = @matches.reject { |match| matchtoremove.has_key?(match[0]) }
end
|
446
753
|
|
447
754
|
end
|
448
755
|
|
756
|
+
|
449
757
|
end
|
data/tests/amb.map
ADDED
data/tests/test-suite.rb
CHANGED
@@ -17,6 +17,14 @@ class TestCrawler < Test::Unit::TestCase
|
|
17
17
|
assert_equal(2,match[2].length-1)
|
18
18
|
end
|
19
19
|
|
20
|
+
# With the ambiguity map from amb.map loaded, an exact search for 'aucgt'
# in the indexed string is expected to return a single match.
def test_ambiguous
  map_file = File.join(File.dirname(__FILE__), 'amb.map')
  crawler = Cassiopee::Crawler.new
  crawler.loadAmbiguityFile(map_file)
  crawler.indexString('aaaaaaaaaaacgttttttt')
  found = crawler.searchExact('aucgt')
  assert_equal(1,found.length)
end
|
27
|
+
|
20
28
|
|
21
29
|
def test_hammingsearch
|
22
30
|
crawler = Cassiopee::Crawler.new
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassiopee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Olivier Sallou
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-09-
|
18
|
+
date: 2011-09-09 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,6 +50,7 @@ files:
|
|
50
50
|
- lib/cassiopee.rb
|
51
51
|
- bin/cassie.rb
|
52
52
|
- tests/test-suite.rb
|
53
|
+
- tests/amb.map
|
53
54
|
has_rdoc: true
|
54
55
|
homepage: https://github.com/osallou/cassiopee
|
55
56
|
licenses:
|