cassiopee 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Changelog CHANGED
@@ -1,3 +1,10 @@
1
+ v0.1.2 : 09/11 Olivier Sallou
2
+ add possibility to reload an "index" instead of using index method again
3
+ fix comment mngt (comments attribute)
4
+ add filter methods
5
+ add to_pos method to display results per position
6
+ add optimal methods
7
+ add ambiguity support
1
8
  v0.1.1 : fix #1, add filter option
2
9
  08/09/11 Olivier Sallou
3
10
  v0.1.0 : First version
data/lib/cassiopee.rb CHANGED
@@ -3,6 +3,7 @@ require 'logger'
3
3
  require 'zlib'
4
4
  require 'rubygems'
5
5
  require 'text'
6
+ require 'text/util'
6
7
 
7
8
  module Cassiopee
8
9
 
@@ -17,8 +18,36 @@ module Cassiopee
17
18
  return computeLevenshtein(pattern,edit)
18
19
  end
19
20
  end
21
+
22
+ # Calculate the edit or hamming distance between String and pattern
23
+ # Extend a String
24
+ # Return -1 if max is reached
25
+
26
# Distance between this String and pattern, honouring alphabet ambiguity.
# * edit == 0 selects the Hamming computation, bounded by hamming
# * any other edit value selects the Levenshtein computation, bounded by edit
# The result of the selected computation is returned unchanged.
def computeAmbiguousDistance(pattern,hamming,edit,ambiguous)
  unless(edit==0)
    return computeLevenshteinAmbiguous(pattern,edit,ambiguous)
  end
  return computeHammingAmbiguous(pattern,hamming,ambiguous)
end
20
33
 
21
- # Calculate number of substitution between string and pattern
34
+ # Compute Hamming distance but using a mapping matrix of alphabet ambiguity
35
+
36
# Count the substitutions between this String and pattern, comparing
# characters with isAmbiguousEqual so that ambiguity codes in the pattern
# can match several characters.
# * pattern   is downcased before comparison
# * hamming   maximum number of errors allowed (converted with to_i)
# * ambiguous Hash of char => Array of equivalent chars, or nil
# Every position of this String is visited; a position for which the
# pattern has no character counts as one error.
# Returns the number of errors, or -1 as soon as it exceeds hamming.
def computeHammingAmbiguous(pattern,hamming,ambiguous)
  pattern = pattern.downcase
  max_errors = hamming.to_i
  nberr = 0
  (0...self.length).each do |c|
    # [c, 1] yields a one-character String, or nil/"" past the pattern end
    expected = pattern[c, 1]
    missing = expected.nil? || expected.empty?
    if(missing || !isAmbiguousEqual(expected, self[c, 1], ambiguous))
      nberr += 1
      return -1 if(nberr > max_errors)
    end
  end
  return nberr
end
49
+
50
+ # Calculate number of substitution between string and pattern
22
51
  # Extend a String
23
52
  # Return -1 if max is reached
24
53
 
@@ -35,8 +64,9 @@ module Cassiopee
35
64
  end
36
65
  return nberr
37
66
  end
38
-
39
-
67
+
68
+
69
+
40
70
  # Calculate the edit distance between string and pattern
41
71
  # Extend a String
42
72
  # Return -1 if max is reached
@@ -53,12 +83,79 @@ module Cassiopee
53
83
  return distance
54
84
 
55
85
  end
86
+
87
+ private
88
+
89
+ # Compute Levenshtein distance but using a mapping matrix of alphabet ambiguity
90
+ # Code comes from Text gem, Text::Levenshtein.distance, adapted for ambiguity comparison
91
+
92
# Compute the Levenshtein (edit) distance between this String and pattern,
# comparing characters with isAmbiguousEqual so that ambiguity codes in the
# pattern can match several characters.
# Adapted from Text::Levenshtein.distance (Text gem).
# * pattern   is downcased before comparison
# * edit      maximum distance allowed (converted with to_i)
# * ambiguous Hash of char => Array of equivalent chars, or nil
# Returns the distance, or -1 when it exceeds edit. The limit also applies
# when one of the two strings is empty.
def computeLevenshteinAmbiguous(pattern, edit, ambiguous)
  pattern = pattern.downcase
  max_errors = edit.to_i

  # Unicode strings are unpacked per code point, anything else per byte
  encoding = defined?(Encoding) ? self.encoding.to_s : $KCODE
  unpack_rule = (encoding.to_s =~ /^U/i) ? 'U*' : 'C*'

  s = self.unpack(unpack_rule)
  t = pattern.unpack(unpack_rule)
  n = s.length
  m = t.length

  if(n == 0 || m == 0)
    # Only insertions (or only deletions) are possible
    distance = n + m
  else
    # d holds the previous row of the distance matrix, e the cell on the left
    d = (0..m).to_a
    x = nil

    (0...n).each do |i|
      e = i+1
      (0...m).each do |j|
        # Pattern character first: ambiguity rules are looked up on it,
        # as in computeHammingAmbiguous
        cost = isAmbiguousEqual(t[j],s[i],ambiguous) ? 0 : 1
        x = [
          d[j+1] + 1, # insertion
          e + 1,      # deletion
          d[j] + cost # substitution
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end
    distance = x
  end

  return -1 if(distance > max_errors)
  return distance
end
133
+
134
+
135
+ # checks if 2 chars are equal with ambiguity rules
136
+ # * ambiguous is a Hash of char/Array of char mapping
137
+
138
# Tell whether characters a and b are equal under the ambiguity rules.
# * a         character whose ambiguity rule is looked up
# * b         character compared against a and its equivalents
# * ambiguous Hash of char => Array of equivalent chars, or nil
# Identical characters are always equal. Otherwise b must be listed among
# the equivalents of a; without rules, or without a rule for a, the
# characters differ.
def isAmbiguousEqual(a,b,ambiguous)
  return true if(a==b)
  # No rule can apply without a map or without both characters
  return false if(ambiguous.nil? || a.nil? || b.nil?)
  equivalents = ambiguous[a.chr]
  return false if(equivalents.nil?)
  return equivalents.include?(b.chr)
end
56
153
 
57
154
  # Base class to index and search through a string
58
155
 
59
156
  class Crawler
60
157
 
61
- # Use alphabet ambiguity (dna/rna) in search
158
+ # Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
62
159
  attr_accessor :useAmbiguity
63
160
  # Suffix files name/path
64
161
  attr_accessor :file_suffix
@@ -66,9 +163,13 @@ module Cassiopee
66
163
  attr_accessor :maxthread
67
164
  # Use persistent suffix file ?
68
165
  attr_accessor :use_store
166
+ # Array of comment characters to skip lines in input sequence file
167
+ attr_accessor :comments
69
168
 
70
169
  @min_position = 0
71
170
  @max_position = 0
171
+
172
+ @ambiguous = nil
72
173
 
73
174
  FILE_SUFFIX_EXT = ".sfx"
74
175
  FILE_SUFFIX_POS = ".sfp"
@@ -76,9 +177,10 @@ module Cassiopee
76
177
  SUFFIXLEN = 'suffix_length'
77
178
 
78
179
  $maxthread = 1
180
+
79
181
 
80
182
  $log = Logger.new(STDOUT)
81
- $log.level = Logger::DEBUG
183
+ $log.level = Logger::INFO
82
184
 
83
185
  def initialize
84
186
  @useAmbiguity = false
@@ -95,7 +197,17 @@ module Cassiopee
95
197
  @use_store = false
96
198
 
97
199
  @sequence = nil
200
+
201
+ @comments = Array["#"]
98
202
  end
203
+
204
# Run filterOptimal with type 0, the value filterOptimal treats as the
# "length" criterion.
def filterLength
  return filterOptimal(0)
end
207
+
208
# Run filterOptimal with type 1, a value filterOptimal treats as the
# "cost" criterion.
def filterCost
  return filterOptimal(1)
end
99
211
 
100
212
  # Clear suffixes in memory
101
213
  # If using use_store, clear the store too
@@ -137,6 +249,53 @@ module Cassiopee
137
249
  @min_position = 0
138
250
  @max_position = 0
139
251
  end
252
+
253
+
254
+ # Load ambiguity rules from a file
255
+ # File format should be:
256
+ # * A=B,C
257
+ # D=E,F
258
+ # ...
259
+
260
# Load ambiguity rules from file f into @ambiguous and enable @useAmbiguity.
# Each line is downcased and has the form CODE=CHAR,CHAR,... ; surrounding
# whitespace is ignored. Blank lines are skipped, and lines without a code
# or without '=' are reported as warnings and skipped.
# Logs an error and exits with status 1 when f does not exist.
def loadAmbiguityFile(f)
  unless(File.exist?(f))
    $log.error("File #{f} does not exist")
    exit(1)
  end
  @ambiguous = Hash.new
  # Block form closes the file even if a line raises
  File.open(f, "r") do |file|
    while (line = file.gets)
      definition = line.downcase.strip
      next if(definition.empty?)
      code, equivalents = definition.split('=', 2)
      code = code.to_s.strip
      if(code.empty? || equivalents.nil?)
        $log.warn("skip invalid ambiguity rule: #{definition}")
        next
      end
      @ambiguous[code] = equivalents.split(',').map { |char| char.strip }
    end
  end
  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: #{@ambiguous.inspect}")
end
278
+
279
+ # Load sequence from a previous index command
280
+
281
# Reload the sequence from the file @file_suffix + FILE_SUFFIX_EXT instead
# of indexing again. Lines are downcased, stripped of their line ending and
# concatenated into @sequence; clear() is then called and the position
# filter (@min_position / @max_position) is reset to 0.
# Logs the error and exits with status 1 if the file cannot be read.
def loadIndex
  seq = ''
  begin
    # Block form closes the file even when reading fails
    File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
      while (line = file.gets)
        seq << line.downcase.chomp
      end
    end
  rescue => err
    $log.error("Exception: #{err}")
    # Non-zero status: the process must not report success on failure
    exit(1)
  end
  @sequence = seq
  clear()
  @min_position = 0
  @max_position = 0
end
140
299
 
141
300
  # Filter matches to be between min and max start position
142
301
  # If not using use_store, search speed is improved but existing indexes are cleared
@@ -153,6 +312,9 @@ module Cassiopee
153
312
  # Search exact match
154
313
 
155
314
  def searchExact(pattern)
315
+ if(@useAmbiguity)
316
+ return searchApproximate(pattern,0)
317
+ end
156
318
  pattern = pattern.downcase
157
319
  parseSuffixes(@sequence,pattern.length,pattern.length)
158
320
 
@@ -180,11 +342,11 @@ module Cassiopee
180
342
 
181
343
 
182
344
  def searchApproximate(s,edit)
183
- if(edit==0)
345
+ if(edit==0 && !@useAmbiguity)
184
346
  return searchExact(s)
185
347
  end
186
348
 
187
- if(edit>0)
349
+ if(edit>=0)
188
350
  useHamming = true
189
351
  minmatchsize = s.length
190
352
  maxmatchsize = s.length
@@ -216,9 +378,17 @@ module Cassiopee
216
378
  seq = extractSuffix(posArray[1],posArray[0])
217
379
  seq.extend(Cassiopee)
218
380
  if(useHamming)
219
- errors = seq.computeHamming(s,edit)
381
+ if(@useAmbiguity && @ambiguous!=nil)
382
+ errors = seq.computeHammingAmbiguous(s,edit,@ambiguous)
383
+ else
384
+ errors = seq.computeHamming(s,edit)
385
+ end
220
386
  else
221
- errors = seq.computeLevenshtein(s,edit)
387
+ if(@useAmbiguity && @ambigous!=nil)
388
+ errors = seq.computeLevenshteinAmbiguous(s,edit,@ambigous)
389
+ else
390
+ errors = seq.computeLevenshtein(s,edit)
391
+ end
222
392
  end
223
393
  if(errors>=0)
224
394
  filteredPosArray = filter(posArray)
@@ -285,6 +455,34 @@ module Cassiopee
285
455
  end
286
456
  end
287
457
 
458
# Group the matches by start position.
# Each element of @matches is Array[md5val, errors, posArray], where the
# first element of posArray is the match length and the remaining elements
# are start positions.
# Returns an Array of [position, [[length, errors], ...]] pairs sorted by
# position; the Array is empty when there are no matches.
def to_pos
  positions = Hash.new { |hash, pos| hash[pos] = Array.new }
  (@matches || []).each do |match|
    errors = match[1]
    len, *starts = match[2]
    starts.each do |pos|
      positions[pos] << Array[len, errors]
    end
  end
  return positions.sort
end
485
+
288
486
  def to_s
289
487
  puts '{ matches: "' << @matches.length << '" }'
290
488
  end
@@ -415,8 +613,20 @@ module Cassiopee
415
613
  while (line = file.gets)
416
614
  counter = counter + 1
417
615
  input = line.downcase.chomp
418
- sequence << input
419
- data.puts input
616
+ skip = false
617
+ comments.each do |c|
618
+ $log.debug("skip line ?" << c << " == " << input[0])
619
+ if(input[0] == c[0])
620
+ # Line start with a comment char, skip it
621
+ $log.debug("skip line")
622
+ skip = true
623
+ break
624
+ end
625
+ end
626
+ if(!skip)
627
+ sequence << input
628
+ data.puts input
629
+ end
420
630
  end
421
631
 
422
632
  end
@@ -443,7 +653,105 @@ module Cassiopee
443
653
  return obj
444
654
  end
445
655
  end
656
+
657
+ # Filter @matches to keep only the longest or the lowest-error match for a given start position
658
+
659
# Keep, for every start position, only the optimal match.
# * type 0: the longest match wins
# * any other type: the match with the fewest errors wins
# On a tie the match met first in @matches wins.
# Each element of @matches is Array[md5val, errors, posArray], where the
# first element of posArray is the match length and the remaining elements
# are start positions.
# NOTE(review): a match that loses at one position is removed from @matches
# entirely, even if it is the only match at another position - confirm this
# is intended.
# Replaces @matches with the filtered Array and returns it.
def filterOptimal(type)
  by_length = (type==0)

  # position => Array of candidates [length, errors, md5val]
  positions = Hash.new { |hash, pos| hash[pos] = Array.new }
  @matches.each do |match|
    len, *starts = match[2]
    starts.each do |pos|
      positions[pos] << Array[len, match[1], match[0]]
    end
  end

  # Identifiers of the losing matches; a Hash gives constant time lookup
  matchtoremove = Hash.new
  positions.each_value do |candidates|
    best = candidates.first
    candidates[1..-1].each do |candidate|
      if(by_length)
        better = candidate[0].to_i > best[0].to_i
      else
        better = candidate[1] < best[1]
      end
      if(better)
        matchtoremove[best[2].to_s] = true
        best = candidate
      else
        matchtoremove[candidate[2].to_s] = true
      end
    end
  end

  @matches = @matches.reject { |match| matchtoremove.key?(match[0]) }
end
446
753
 
447
754
  end
448
755
 
756
+
449
757
  end
data/tests/amb.map ADDED
@@ -0,0 +1,2 @@
1
+ u=a,c
2
+ v=c,g
data/tests/test-suite.rb CHANGED
@@ -17,6 +17,14 @@ class TestCrawler < Test::Unit::TestCase
17
17
  assert_equal(2,match[2].length-1)
18
18
  end
19
19
 
20
# With the rules of amb.map loaded, an exact search for 'aucgt' in the
# indexed string is expected to return exactly one match.
def test_ambiguous
  rules = File.join(File.dirname(__FILE__), 'amb.map')
  crawler = Cassiopee::Crawler.new
  crawler.loadAmbiguityFile(rules)
  crawler.indexString('aaaaaaaaaaacgttttttt')
  found = crawler.searchExact('aucgt')
  assert_equal(1,found.length)
end
27
+
20
28
 
21
29
  def test_hammingsearch
22
30
  crawler = Cassiopee::Crawler.new
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cassiopee
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 2
10
+ version: 0.1.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Olivier Sallou
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-09-04 00:00:00 +02:00
18
+ date: 2011-09-09 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,7 @@ files:
50
50
  - lib/cassiopee.rb
51
51
  - bin/cassie.rb
52
52
  - tests/test-suite.rb
53
+ - tests/amb.map
53
54
  has_rdoc: true
54
55
  homepage: https://github.com/osallou/cassiopee
55
56
  licenses: