cassiopee 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog +7 -0
- data/lib/cassiopee.rb +319 -11
- data/tests/amb.map +2 -0
- data/tests/test-suite.rb +8 -0
- metadata +5 -4
data/Changelog
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v0.1.2 : 09/11 Olivier Sallou
|
2
|
+
add possibility to reload an "index" instead of using index method again
|
3
|
+
fix comment management (comments attribute)
|
4
|
+
add filter methods
|
5
|
+
add to_pos method to display results per position
|
6
|
+
add optimal methods
|
7
|
+
add ambiguity support
|
1
8
|
v0.1.1 : fix #1, add filter option
|
2
9
|
08/09/11 Olivier Sallou
|
3
10
|
v0.1.0 : First version
|
data/lib/cassiopee.rb
CHANGED
@@ -3,6 +3,7 @@ require 'logger'
|
|
3
3
|
require 'zlib'
|
4
4
|
require 'rubygems'
|
5
5
|
require 'text'
|
6
|
+
require 'text/util'
|
6
7
|
|
7
8
|
module Cassiopee
|
8
9
|
|
@@ -17,8 +18,36 @@ module Cassiopee
|
|
17
18
|
return computeLevenshtein(pattern,edit)
|
18
19
|
end
|
19
20
|
end
|
21
|
+
|
22
|
+
# Distance between this String and pattern, honouring alphabet ambiguity.
# Extends a String.
# * edit == 0 selects the Hamming comparison, bounded by hamming
# * any other edit value selects the Levenshtein comparison, bounded by edit
# * ambiguous is forwarded untouched to the selected method
# Returns whatever the selected method returns (-1 when its maximum is exceeded).
def computeAmbiguousDistance(pattern,hamming,edit,ambiguous)
  return computeHammingAmbiguous(pattern,hamming,ambiguous) if edit==0

  computeLevenshteinAmbiguous(pattern,edit,ambiguous)
end
|
20
33
|
|
21
|
-
#
|
34
|
+
# Compute the Hamming distance between this String and pattern, using a
# mapping of alphabet ambiguity to decide whether two characters match.
# Extends a String.
# * pattern is lower-cased before the comparison; self is compared as-is
# * hamming is the maximum number of mismatches allowed (converted with to_i)
# * ambiguous is handed to isAmbiguousEqual, with the pattern character first
# Every position of self is visited; a position for which pattern has no
# character (pattern shorter than self) counts as a mismatch.
# Returns the number of mismatches, or -1 as soon as it exceeds hamming.
def computeHammingAmbiguous(pattern,hamming,ambiguous)
  pattern = pattern.downcase
  # Loop invariant: convert the threshold once instead of on every mismatch.
  max_errors = hamming.to_i
  nberr = 0
  self.length.times do |c|
    expected = pattern[c]
    # The nil check keeps a short pattern from reaching isAmbiguousEqual,
    # which calls chr on its first argument.
    next if !expected.nil? && isAmbiguousEqual(expected,self[c],ambiguous)

    nberr += 1
    return -1 if nberr > max_errors
  end
  nberr
end
|
49
|
+
|
50
|
+
# Calculate number of substitution between string and pattern
|
22
51
|
# Extend a String
|
23
52
|
# Return -1 if max is reached
|
24
53
|
|
@@ -35,8 +64,9 @@ module Cassiopee
|
|
35
64
|
end
|
36
65
|
return nberr
|
37
66
|
end
|
38
|
-
|
39
|
-
|
67
|
+
|
68
|
+
|
69
|
+
|
40
70
|
# Calculate the edit distance between string and pattern
|
41
71
|
# Extend a String
|
42
72
|
# Return -1 if max is reached
|
@@ -53,12 +83,79 @@ module Cassiopee
|
|
53
83
|
return distance
|
54
84
|
|
55
85
|
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# Compute the Levenshtein (edit) distance between this String and pattern,
# using a mapping of alphabet ambiguity to decide whether two characters match.
# Code comes from Text gem, Text::Levenshtein.distance, adapted for ambiguity comparison.
# Extends a String.
# * pattern is lower-cased before the comparison; self is compared as-is
# * edit is the maximum distance allowed
# * ambiguous is handed to isAmbiguousEqual, with the pattern character first
#   (the same argument order computeHammingAmbiguous uses)
# Returns the distance, or -1 when it is greater than edit. This also holds
# when one of the two strings is empty.
# NOTE(review): this method is defined below a bare `private`, yet
# Crawler#searchApproximate calls it with an explicit receiver and passes
# `@ambigous` (sic) -- both look unintended; confirm against the full file.
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
  pattern = pattern.downcase

  # Strings in a Unicode encoding are compared per code point, all others per byte.
  encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
  unpack_rule = (encoding =~ /^U/i) ? 'U*' : 'C*'

  s = self.unpack(unpack_rule)
  t = pattern.unpack(unpack_rule)
  n = s.length
  m = t.length

  if n == 0 || m == 0
    # One side is empty: the distance is the length of the other side.
    distance = [n, m].max
  else
    # d holds the previous row of the edit matrix; e is the cell to the left
    # in the current row.
    d = (0..m).to_a
    x = nil

    (0...n).each do |i|
      e = i+1
      (0...m).each do |j|
        cost = isAmbiguousEqual(t[j],s[i],ambiguous) ? 0 : 1
        x = [
          d[j+1] + 1, # insertion
          e + 1,      # deletion
          d[j] + cost # substitution
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end
    distance = x
  end

  distance > edit ? -1 : distance
end
|
133
|
+
|
134
|
+
|
135
|
+
# Checks if 2 chars are equal with ambiguity rules.
# * a is the char whose ambiguity entry is looked up (through a.chr)
# * b is the char searched for in that entry (through b.chr)
# * ambiguous is a Hash of char/Array of char mapping, or nil for plain equality
# Identical values always match. Otherwise a matches b only when ambiguous has
# an entry for a that lists b. A nil on either side never matches a non-nil value.
# Returns true or false.
def isAmbiguousEqual(a,b,ambiguous)
  return true if a==b
  # chr cannot be called on nil, so a missing char is simply a mismatch.
  return false if ambiguous.nil? || a.nil? || b.nil?

  candidates = ambiguous[a.chr]
  return false if candidates.nil?

  candidates.include?(b.chr)
end
|
56
153
|
|
57
154
|
# Base class to index and search through a string
|
58
155
|
|
59
156
|
class Crawler
|
60
157
|
|
61
|
-
# Use alphabet ambiguity (dna/rna) in search
|
158
|
+
# Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
|
62
159
|
attr_accessor :useAmbiguity
|
63
160
|
# Suffix files name/path
|
64
161
|
attr_accessor :file_suffix
|
@@ -66,9 +163,13 @@ module Cassiopee
|
|
66
163
|
attr_accessor :maxthread
|
67
164
|
# Use persistent suffix file ?
|
68
165
|
attr_accessor :use_store
|
166
|
+
# Array of comment characters to skip lines in input sequence file
|
167
|
+
attr_accessor :comments
|
69
168
|
|
70
169
|
@min_position = 0
|
71
170
|
@max_position = 0
|
171
|
+
|
172
|
+
@ambiguous = nil
|
72
173
|
|
73
174
|
FILE_SUFFIX_EXT = ".sfx"
|
74
175
|
FILE_SUFFIX_POS = ".sfp"
|
@@ -76,9 +177,10 @@ module Cassiopee
|
|
76
177
|
SUFFIXLEN = 'suffix_length'
|
77
178
|
|
78
179
|
$maxthread = 1
|
180
|
+
|
79
181
|
|
80
182
|
$log = Logger.new(STDOUT)
|
81
|
-
$log.level = Logger::
|
183
|
+
$log.level = Logger::INFO
|
82
184
|
|
83
185
|
def initialize
|
84
186
|
@useAmbiguity = false
|
@@ -95,7 +197,17 @@ module Cassiopee
|
|
95
197
|
@use_store = false
|
96
198
|
|
97
199
|
@sequence = nil
|
200
|
+
|
201
|
+
@comments = Array["#"]
|
98
202
|
end
|
203
|
+
|
204
|
+
# Run filterOptimal in its length mode (type 0).
# Returns what filterOptimal returns.
def filterLength
  filterOptimal(0)
end
|
207
|
+
|
208
|
+
# Run filterOptimal in its cost mode (type 1).
# Returns what filterOptimal returns.
def filterCost
  filterOptimal(1)
end
|
99
211
|
|
100
212
|
# Clear suffixes in memory
|
101
213
|
# If using use_store, clear the store too
|
@@ -137,6 +249,53 @@ module Cassiopee
|
|
137
249
|
@min_position = 0
|
138
250
|
@max_position = 0
|
139
251
|
end
|
252
|
+
|
253
|
+
|
254
|
+
# Load ambiguity rules from a file and switch ambiguity on (@useAmbiguity).
# File format should be one rule per line:
# * A=B,C
#   D=E,F
#   ...
# Lines are lower-cased; surrounding whitespace is ignored. Blank lines and
# lines that have no "symbol=values" shape are skipped.
# * f is the path of the rules file
# Logs an error and exits the process with status 1 when f does not exist.
def loadAmbiguityFile(f)
  unless File.exist?(f)
    $log.error("File #{f} does not exist")
    exit(1)
  end

  @ambiguous = Hash.new
  # File.foreach closes the file even if a line raises.
  File.foreach(f) do |line|
    symbol, equivalents = line.downcase.strip.split('=')
    next if symbol.nil? || equivalents.nil?

    @ambiguous[symbol.strip] = equivalents.split(',').map { |char| char.strip }
  end
  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: #{@ambiguous.inspect}")
end
|
278
|
+
|
279
|
+
# Load sequence from a previous index command.
# Reads the file @file_suffix + FILE_SUFFIX_EXT, lower-cases every line, strips
# its line ending and concatenates the result into @sequence. Then calls clear()
# and resets @min_position and @max_position to 0.
# On any StandardError while reading, logs it and exits the process with
# status 1 (a failure must not report success).
# NOTE(review): clear() is documented elsewhere in the file as also clearing
# the store when use_store is set -- confirm that is wanted right after a reload.
def loadIndex
  seq = ''
  begin
    # Block form closes the file even when reading raises.
    File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
      file.each_line { |line| seq << line.downcase.chomp }
    end
  rescue => err
    $log.error("Exception: #{err}")
    exit(1)
  end
  @sequence = seq
  clear()
  @min_position = 0
  @max_position = 0
end
|
140
299
|
|
141
300
|
# Filter matches to be between min and max start position
|
142
301
|
# If not using use_store, search speed is improved but existing indexes are cleared
|
@@ -153,6 +312,9 @@ module Cassiopee
|
|
153
312
|
# Search exact match
|
154
313
|
|
155
314
|
def searchExact(pattern)
|
315
|
+
if(@useAmbiguity)
|
316
|
+
return searchApproximate(pattern,0)
|
317
|
+
end
|
156
318
|
pattern = pattern.downcase
|
157
319
|
parseSuffixes(@sequence,pattern.length,pattern.length)
|
158
320
|
|
@@ -180,11 +342,11 @@ module Cassiopee
|
|
180
342
|
|
181
343
|
|
182
344
|
def searchApproximate(s,edit)
|
183
|
-
if(edit==0)
|
345
|
+
if(edit==0 && !@useAmbiguity)
|
184
346
|
return searchExact(s)
|
185
347
|
end
|
186
348
|
|
187
|
-
if(edit
|
349
|
+
if(edit>=0)
|
188
350
|
useHamming = true
|
189
351
|
minmatchsize = s.length
|
190
352
|
maxmatchsize = s.length
|
@@ -216,9 +378,17 @@ module Cassiopee
|
|
216
378
|
seq = extractSuffix(posArray[1],posArray[0])
|
217
379
|
seq.extend(Cassiopee)
|
218
380
|
if(useHamming)
|
219
|
-
|
381
|
+
if(@useAmbiguity && @ambiguous!=nil)
|
382
|
+
errors = seq.computeHammingAmbiguous(s,edit,@ambiguous)
|
383
|
+
else
|
384
|
+
errors = seq.computeHamming(s,edit)
|
385
|
+
end
|
220
386
|
else
|
221
|
-
|
387
|
+
if(@useAmbiguity && @ambigous!=nil)
|
388
|
+
errors = seq.computeLevenshteinAmbiguous(s,edit,@ambigous)
|
389
|
+
else
|
390
|
+
errors = seq.computeLevenshtein(s,edit)
|
391
|
+
end
|
222
392
|
end
|
223
393
|
if(errors>=0)
|
224
394
|
filteredPosArray = filter(posArray)
|
@@ -285,6 +455,34 @@ module Cassiopee
|
|
285
455
|
end
|
286
456
|
end
|
287
457
|
|
458
|
+
# Present @matches per start position.
# Each match is Array[md5val, errors, posArray]; the first element of posArray
# is the match length, the remaining elements are start positions.
# Returns an Array of [position, Array of [length, errors]] pairs, sorted with
# Hash#sort.
def to_pos
  positions = Hash.new
  @matches.each do |match|
    errors = match[1]
    len, *starts = match[2]
    starts.each do |pos|
      (positions[pos] ||= Array.new) << Array[len, errors]
    end
  end
  return positions.sort
end
|
485
|
+
|
288
486
|
def to_s
|
289
487
|
puts '{ matches: "' << @matches.length << '" }'
|
290
488
|
end
|
@@ -415,8 +613,20 @@ module Cassiopee
|
|
415
613
|
while (line = file.gets)
|
416
614
|
counter = counter + 1
|
417
615
|
input = line.downcase.chomp
|
418
|
-
|
419
|
-
|
616
|
+
skip = false
|
617
|
+
comments.each do |c|
|
618
|
+
$log.debug("skip line ?" << c << " == " << input[0])
|
619
|
+
if(input[0] == c[0])
|
620
|
+
# Line start with a comment char, skip it
|
621
|
+
$log.debug("skip line")
|
622
|
+
skip = true
|
623
|
+
break
|
624
|
+
end
|
625
|
+
end
|
626
|
+
if(!skip)
|
627
|
+
sequence << input
|
628
|
+
data.puts input
|
629
|
+
end
|
420
630
|
end
|
421
631
|
|
422
632
|
end
|
@@ -443,7 +653,105 @@ module Cassiopee
|
|
443
653
|
return obj
|
444
654
|
end
|
445
655
|
end
|
656
|
+
|
657
|
+
# Filter @matches to keep only the longest or the least-error match for a same
# start position.
# * type == 0 keeps the longest match (lengths compared with to_i)
# * any other type keeps the match with the fewest errors
# On a tie the match met first at that position is kept.
# Each match is Array[md5val, errors, posArray]; the first element of posArray
# is the match length, the remaining elements are start positions.
# A match is dropped as a whole as soon as it loses at one of its positions,
# even if it is the best candidate at another one.
# NOTE(review): confirm that whole-match removal is the intended behaviour.
# Replaces @matches with the filtered Array and returns it.
def filterOptimal(type)
  # Group the candidates by start position: pos => [[length, errors, md5val], ...]
  positions = Hash.new
  @matches.each do |match|
    len, *starts = match[2]
    starts.each do |pos|
      (positions[pos] ||= Array.new) << Array[len, match[1], match[0]]
    end
  end

  # Identifiers of the losing matches; a Hash gives constant-time lookup below.
  matchtoremove = Hash.new
  positions.each_value do |posmatch|
    optimal = nil
    best = nil
    posmatch.each_with_index do |solution, i|
      candidate = solution[2].to_s
      score = (type==0) ? solution[0] : solution[1]
      if i==0
        optimal = score
        best = candidate
        next
      end

      better = (type==0) ? (score.to_i > optimal.to_i) : (score < optimal)
      if better
        matchtoremove[best] = true
        optimal = score
        best = candidate
      else
        matchtoremove[candidate] = true
      end
    end
  end

  @matches = @matches.reject { |match| matchtoremove.key?(match[0]) }
end
|
446
753
|
|
447
754
|
end
|
448
755
|
|
756
|
+
|
449
757
|
end
|
data/tests/amb.map
ADDED
data/tests/test-suite.rb
CHANGED
@@ -17,6 +17,14 @@ class TestCrawler < Test::Unit::TestCase
|
|
17
17
|
assert_equal(2,match[2].length-1)
|
18
18
|
end
|
19
19
|
|
20
|
+
# With the rules of tests/amb.map loaded, an exact search for 'aucgt' in the
# indexed string is expected to give exactly one match.
def test_ambiguous
  rules = File.join(File.dirname(__FILE__), 'amb.map')
  crawler = Cassiopee::Crawler.new
  crawler.loadAmbiguityFile(rules)
  crawler.indexString('aaaaaaaaaaacgttttttt')
  found = crawler.searchExact('aucgt')
  assert_equal(1,found.length)
end
|
27
|
+
|
20
28
|
|
21
29
|
def test_hammingsearch
|
22
30
|
crawler = Cassiopee::Crawler.new
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassiopee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Olivier Sallou
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-09-
|
18
|
+
date: 2011-09-09 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,6 +50,7 @@ files:
|
|
50
50
|
- lib/cassiopee.rb
|
51
51
|
- bin/cassie.rb
|
52
52
|
- tests/test-suite.rb
|
53
|
+
- tests/amb.map
|
53
54
|
has_rdoc: true
|
54
55
|
homepage: https://github.com/osallou/cassiopee
|
55
56
|
licenses:
|