cassiopee 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/Changelog CHANGED
@@ -1,3 +1,10 @@
1
+ v0.1.2 : 09/11 Olivier Sallou
2
+ add possibility to reload an "index" instead of using index method again
3
+ fix comment management (comments attribute)
4
+ add filter methods
5
+ add to_pos method to display results per position
6
+ add optimal methods
7
+ add ambiguity support
1
8
  v0.1.1 : fix #1, add filter option
2
9
  08/09/11 Olivier Sallou
3
10
  v0.1.0 : First version
data/lib/cassiopee.rb CHANGED
@@ -3,6 +3,7 @@ require 'logger'
3
3
  require 'zlib'
4
4
  require 'rubygems'
5
5
  require 'text'
6
+ require 'text/util'
6
7
 
7
8
  module Cassiopee
8
9
 
@@ -17,8 +18,36 @@ module Cassiopee
17
18
  return computeLevenshtein(pattern,edit)
18
19
  end
19
20
  end
21
+
22
+ # Calculate the edit or hamming distance between String and pattern
23
+ # Extend a String
24
+ # Return -1 if max is reached
25
+
26
# Dispatch to the ambiguity-aware distance that matches the requested search mode.
# * pattern   - String compared against self
# * hamming   - maximum number of substitutions, only used when edit is 0
# * edit      - maximum edit distance; 0 selects the Hamming computation
# * ambiguous - ambiguity mapping handed through unchanged to the chosen method
# Returns whatever the selected method returns.
# NOTE(review): hamming is ignored as soon as edit is non-zero.
+ def computeAmbiguousDistance(pattern,hamming,edit,ambiguous)
27
+ if(edit==0)
28
+ return computeHammingAmbiguous(pattern,hamming,ambiguous)
29
+ else
30
+ return computeLevenshteinAmbiguous(pattern,edit,ambiguous)
31
+ end
32
+ end
20
33
 
21
- # Calculate number of substitution between string and pattern
34
+ # Compute Hamming distance but using a mapping matrix of alphabet ambiguity
35
+
36
# Count the substitutions between this String (self) and pattern, treating two
# characters as equal when isAmbiguousEqual says so.
#
# * pattern   - String compared position by position against self; downcased first
# * hamming   - maximum number of mismatches tolerated (converted with to_i)
# * ambiguous - Hash of char => Array of equivalent chars, or nil for strict equality
#
# Returns the number of mismatching positions, or -1 as soon as that number
# exceeds hamming.
def computeHammingAmbiguous(pattern,hamming,ambiguous)
  pattern = pattern.downcase
  max_errors = hamming.to_i
  nberr = 0
  (0...self.length).each do |c|
    expected = pattern[c]
    # A position past the end of the pattern can never match. Count it as an
    # error here instead of handing nil to isAmbiguousEqual, which calls .chr
    # on its first argument and would raise.
    next if !expected.nil? && isAmbiguousEqual(expected,self[c],ambiguous)
    nberr += 1
    return -1 if nberr > max_errors
  end
  nberr
end
49
+
50
+ # Calculate number of substitution between string and pattern
22
51
  # Extend a String
23
52
  # Return -1 if max is reached
24
53
 
@@ -35,8 +64,9 @@ module Cassiopee
35
64
  end
36
65
  return nberr
37
66
  end
38
-
39
-
67
+
68
+
69
+
40
70
  # Calculate the edit distance between string and pattern
41
71
  # Extend a String
42
72
  # Return -1 if max is reached
@@ -53,12 +83,79 @@ module Cassiopee
53
83
  return distance
54
84
 
55
85
  end
86
+
87
+ private
88
+
89
+ # Compute Levenshtein distance but using a mapping matrix of alphabet ambiguity
90
+ # Code comes from Text gem, Text::Levenshtein.distance, adapted for ambiguity comparison
91
+
92
# Levenshtein (edit) distance between this String (self) and pattern, where two
# symbols are considered equal when isAmbiguousEqual accepts them.
# Adapted from Text::Levenshtein.distance (Text gem).
#
# * pattern   - String compared against self; downcased first
# * edit      - maximum edit distance tolerated
# * ambiguous - Hash of char => Array of equivalent chars, or nil
#
# Returns the distance, or -1 when it is greater than edit.
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
  pattern = pattern.downcase

  # Unpack as code points for Unicode strings, as bytes otherwise.
  unpack_rule = (Text.encoding_of(self) =~ /^U/i) ? 'U*' : 'C*'

  s = self.unpack(unpack_rule)
  t = pattern.unpack(unpack_rule)
  n = s.length
  m = t.length

  if n == 0 || m == 0
    # One side is empty: the distance is the length of the other side. It still
    # goes through the threshold check below so -1 is reported consistently.
    distance = [n, m].max
  else
    # d holds the previous row of the dynamic-programming matrix.
    d = (0..m).to_a
    x = nil

    (0...n).each do |i|
      e = i + 1
      (0...m).each do |j|
        # The pattern symbol comes first: isAmbiguousEqual looks up the
        # ambiguity list of its first argument, and the ambiguity codes live in
        # the pattern (same argument order as computeHammingAmbiguous).
        cost = isAmbiguousEqual(t[j], s[i], ambiguous) ? 0 : 1
        x = [
          d[j+1] + 1, # insertion
          e + 1,      # deletion
          d[j] + cost # substitution
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end
    distance = x
  end

  return -1 if distance > edit
  distance
end
133
+
134
+
135
+ # checks if 2 chars are equal with ambiguity rules
136
+ # * ambiguous is a Hash of char/Array of char mapping
137
+
138
# Tell whether two symbols are equal under the ambiguity rules.
#
# * a         - symbol whose ambiguity list is looked up (anything responding to chr)
# * b         - symbol tested against a and against a's ambiguity list
# * ambiguous - Hash of char => Array of equivalent chars, or nil for strict equality
#
# Returns true when a and b are identical, or when b appears in the list
# registered for a; false otherwise. The lookup is one-directional: only the
# list of a is consulted.
def isAmbiguousEqual(a,b,ambiguous)
  # Identical symbols always match, even for an ambiguity code whose list does
  # not name the code itself.
  return true if a == b
  return false if ambiguous.nil?

  equivalents = ambiguous[a.chr]
  return false if equivalents.nil?

  equivalents.include?(b.chr)
end
56
153
 
57
154
  # Base class to index and search through a string
58
155
 
59
156
  class Crawler
60
157
 
61
- # Use alphabet ambiguity (dna/rna) in search
158
+ # Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
62
159
  attr_accessor :useAmbiguity
63
160
  # Suffix files name/path
64
161
  attr_accessor :file_suffix
@@ -66,9 +163,13 @@ module Cassiopee
66
163
  attr_accessor :maxthread
67
164
  # Use persistent suffix file ?
68
165
  attr_accessor :use_store
166
+ # Array of comment characters to skip lines in input sequence file
167
+ attr_accessor :comments
69
168
 
70
169
  @min_position = 0
71
170
  @max_position = 0
171
+
172
+ @ambiguous = nil
72
173
 
73
174
  FILE_SUFFIX_EXT = ".sfx"
74
175
  FILE_SUFFIX_POS = ".sfp"
@@ -76,9 +177,10 @@ module Cassiopee
76
177
  SUFFIXLEN = 'suffix_length'
77
178
 
78
179
  $maxthread = 1
180
+
79
181
 
80
182
  $log = Logger.new(STDOUT)
81
- $log.level = Logger::DEBUG
183
+ $log.level = Logger::INFO
82
184
 
83
185
  def initialize
84
186
  @useAmbiguity = false
@@ -95,7 +197,17 @@ module Cassiopee
95
197
  @use_store = false
96
198
 
97
199
  @sequence = nil
200
+
201
+ @comments = Array["#"]
98
202
  end
203
+
204
# Shortcut for filterOptimal(0): type 0 is the "length" criterion, under which
# filterOptimal keeps the longest match for a same start position.
+ def filterLength
205
+ filterOptimal(0)
206
+ end
207
+
208
# Shortcut for filterOptimal(1): any non-zero type is the "cost" criterion, under
# which filterOptimal keeps the match with the fewest errors for a same start position.
+ def filterCost
209
+ filterOptimal(1)
210
+ end
99
211
 
100
212
  # Clear suffixes in memory
101
213
  # If using use_store, clear the store too
@@ -137,6 +249,53 @@ module Cassiopee
137
249
  @min_position = 0
138
250
  @max_position = 0
139
251
  end
252
+
253
+
254
+ # Load ambiguity rules from a file
255
+ # File format should be:
256
+ # * A=B,C
257
+ # D=E,F
258
+ # ...
259
+
260
# Load ambiguity rules from a file and switch ambiguity search on.
#
# Each line has the form CODE=X,Y,... ; lines are downcased, and surrounding
# whitespace is ignored. Lines without a code or without '=' are skipped.
#
# * f - path of the rules file
#
# Sets @ambiguous (Hash of code => Array of equivalent chars) and
# @useAmbiguity = true. Logs an error and exits with status 1 when the file
# does not exist. Returns nil.
def loadAmbiguityFile(f)
  unless File.exist?(f)
    $log.error("File #{f} does not exist")
    exit(1)
  end

  @ambiguous = Hash.new
  # Block form closes the file even if a line fails to parse.
  File.open(f, "r") do |file|
    while (line = file.gets)
      ambdef = line.downcase.strip.split('=')
      code = ambdef[0]
      equivalents = ambdef[1]
      # Blank or malformed lines carry no rule; skip them instead of crashing.
      next if code.nil? || equivalents.nil?
      code = code.strip
      next if code.empty?
      @ambiguous[code] = equivalents.split(',').map { |symbol| symbol.strip }
    end
  end

  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: #{@ambiguous.inspect}")
  nil
end
278
+
279
+ # Load sequence from a previous index command
280
+
281
# Reload the sequence written by a previous index command instead of indexing again.
#
# Reads @file_suffix + FILE_SUFFIX_EXT line by line, downcases and chomps each
# line, and concatenates the lines into @sequence. Then calls clear() and
# resets @min_position and @max_position to 0.
#
# Any error while reading is logged and terminates the process with status 1.
def loadIndex
  seq = String.new
  begin
    # Block form guarantees the handle is closed even when reading raises.
    File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
      while (line = file.gets)
        seq << line.downcase.chomp
      end
    end
  rescue => err
    $log.error("Exception: #{err}")
    # Non-zero status: this is a failure, as in loadAmbiguityFile.
    exit(1)
  end
  @sequence = seq
  clear()
  @min_position = 0
  @max_position = 0
end
140
299
 
141
300
  # Filter matches to be between min and max start position
142
301
  # If not using use_store, search speed is improved but existing indexes are cleared
@@ -153,6 +312,9 @@ module Cassiopee
153
312
  # Search exact match
154
313
 
155
314
  def searchExact(pattern)
315
+ if(@useAmbiguity)
316
+ return searchApproximate(pattern,0)
317
+ end
156
318
  pattern = pattern.downcase
157
319
  parseSuffixes(@sequence,pattern.length,pattern.length)
158
320
 
@@ -180,11 +342,11 @@ module Cassiopee
180
342
 
181
343
 
182
344
  def searchApproximate(s,edit)
183
- if(edit==0)
345
+ if(edit==0 && !@useAmbiguity)
184
346
  return searchExact(s)
185
347
  end
186
348
 
187
- if(edit>0)
349
+ if(edit>=0)
188
350
  useHamming = true
189
351
  minmatchsize = s.length
190
352
  maxmatchsize = s.length
@@ -216,9 +378,17 @@ module Cassiopee
216
378
  seq = extractSuffix(posArray[1],posArray[0])
217
379
  seq.extend(Cassiopee)
218
380
  if(useHamming)
219
- errors = seq.computeHamming(s,edit)
381
+ if(@useAmbiguity && @ambiguous!=nil)
382
+ errors = seq.computeHammingAmbiguous(s,edit,@ambiguous)
383
+ else
384
+ errors = seq.computeHamming(s,edit)
385
+ end
220
386
  else
221
- errors = seq.computeLevenshtein(s,edit)
387
+ if(@useAmbiguity && @ambigous!=nil)
388
+ errors = seq.computeLevenshteinAmbiguous(s,edit,@ambigous)
389
+ else
390
+ errors = seq.computeLevenshtein(s,edit)
391
+ end
222
392
  end
223
393
  if(errors>=0)
224
394
  filteredPosArray = filter(posArray)
@@ -285,6 +455,34 @@ module Cassiopee
285
455
  end
286
456
  end
287
457
 
458
# Present the current matches per start position.
#
# Each entry of @matches is Array[md5val, errors, posArray], where posArray
# holds the match length first, followed by the start positions.
#
# Returns an Array sorted by position of [position, [[length, errors], ...]]
# pairs, one inner pair per match starting at that position.
def to_pos
  positions = Hash.new
  @matches.each do |match|
    errors = match[1]
    # First element is the match length, the rest are start positions.
    len, *starts = match[2]
    starts.each do |pos|
      (positions[pos] ||= Array.new) << Array[len, errors]
    end
  end
  positions.sort
end
485
+
288
486
  def to_s
289
487
  puts '{ matches: "' << @matches.length << '" }'
290
488
  end
@@ -415,8 +613,20 @@ module Cassiopee
415
613
  while (line = file.gets)
416
614
  counter = counter + 1
417
615
  input = line.downcase.chomp
418
- sequence << input
419
- data.puts input
616
+ skip = false
617
+ comments.each do |c|
618
+ $log.debug("skip line ?" << c << " == " << input[0])
619
+ if(input[0] == c[0])
620
+ # Line start with a comment char, skip it
621
+ $log.debug("skip line")
622
+ skip = true
623
+ break
624
+ end
625
+ end
626
+ if(!skip)
627
+ sequence << input
628
+ data.puts input
629
+ end
420
630
  end
421
631
 
422
632
  end
@@ -443,7 +653,105 @@ module Cassiopee
443
653
  return obj
444
654
  end
445
655
  end
656
+
657
+ # Filter @matches to keep only the longest match, or the match with the fewest errors, for a same start position
658
+
659
# Filter @matches so that, for every start position, only the optimal match survives.
#
# * type - 0 keeps the longest match (lengths compared with to_i); any other
#          value keeps the match with the fewest errors
#
# Each entry of @matches is Array[md5val, errors, posArray], where posArray
# holds the match length first, followed by the start positions. On a tie the
# match seen first is kept.
#
# NOTE(review): a match is removed from @matches entirely as soon as it loses
# at any one start position, even if it is the best candidate at another —
# confirm this is intended.
#
# Replaces @matches with the filtered Array and returns it.
def filterOptimal(type)
  by_length = (type==0)

  # Group candidates per start position: pos => [[length, errors, md5val], ...]
  positions = Hash.new
  @matches.each do |match|
    len, *starts = match[2]
    starts.each do |pos|
      (positions[pos] ||= Array.new) << Array[len, match[1], match[0]]
    end
  end

  # Identifiers of matches that lost at some position (Hash used as a set so
  # the final filtering is a constant-time lookup per match).
  matchtoremove = Hash.new
  positions.each_value do |posmatch|
    optimal = nil
    best = nil
    posmatch.each_with_index do |solution, i|
      candidate = solution[2].to_s
      score = by_length ? solution[0] : solution[1]
      if i == 0
        optimal = score
        best = candidate
        next
      end

      better = by_length ? (score.to_i > optimal.to_i) : (score < optimal)
      if better
        matchtoremove[best] = true
        optimal = score
        best = candidate
      else
        matchtoremove[candidate] = true
      end
    end
  end

  @matches = @matches.reject { |match| matchtoremove.has_key?(match[0]) }
end
446
753
 
447
754
  end
448
755
 
756
+
449
757
  end
data/tests/amb.map ADDED
@@ -0,0 +1,2 @@
1
+ u=a,c
2
+ v=c,g
data/tests/test-suite.rb CHANGED
@@ -17,6 +17,14 @@ class TestCrawler < Test::Unit::TestCase
17
17
  assert_equal(2,match[2].length-1)
18
18
  end
19
19
 
20
# Ambiguity search: load the rules from tests/amb.map, index a sequence, then
# run searchExact with a pattern containing the ambiguity code 'u'.
# Expects exactly one entry in the returned matches.
+ def test_ambiguous
21
+ crawler = Cassiopee::Crawler.new
22
+ crawler.loadAmbiguityFile(File.join(File.dirname(__FILE__), 'amb.map'))
23
+ crawler.indexString('aaaaaaaaaaacgttttttt')
24
+ matches = crawler.searchExact('aucgt')
25
+ assert_equal(1,matches.length)
26
+ end
27
+
20
28
 
21
29
  def test_hammingsearch
22
30
  crawler = Cassiopee::Crawler.new
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cassiopee
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 2
10
+ version: 0.1.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Olivier Sallou
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-09-04 00:00:00 +02:00
18
+ date: 2011-09-09 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,7 @@ files:
50
50
  - lib/cassiopee.rb
51
51
  - bin/cassie.rb
52
52
  - tests/test-suite.rb
53
+ - tests/amb.map
53
54
  has_rdoc: true
54
55
  homepage: https://github.com/osallou/cassiopee
55
56
  licenses: