bio-cigar 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 95056409b5f06d3c60a738740f27c4835a414335
4
- data.tar.gz: 7882ac6d35a76b43ac78af2218faf7a46a7542eb
3
+ metadata.gz: 04021bbb9a499a88095d7dbb0da06a82ffbd1cfb
4
+ data.tar.gz: 2c6cd7124a76a38a6f49e602c007241fde0384f5
5
5
  SHA512:
6
- metadata.gz: 5fca372c6030c01121adddbb273bc9fdca7eb25579ea15c5957842a592c7ce337c1b2eab749f511aa73c100d815c3dd02101285daf346629c36625e893c1cac0
7
- data.tar.gz: d5e72661f94fb60bb8c1afec10d8cce76cac22e0e6edf3483a1003cacfb6cef70f006c0fd5dd185f110b67384603e9154881c845a836a4b0c299a6fb70492ee7
6
+ metadata.gz: a366678809c45fe01b25b41fcfacd96c76a7f8239c1c2d700669fdfaad2f4d7c412c0dee0fef8e2d9d605277d31f85ceafe2d0c94bc33e0a1c086b6f2d935d6b
7
+ data.tar.gz: 2fc5b1e96f0ca742cfcbbac73804ee63db968df54e75ce893a6a78a4889926f4555dd32d6ea89c7f1aec897a3fe89ca8d32ced777bac25651a0509a3a8fe6517
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.1.0
@@ -13,17 +13,11 @@ module Bio
13
13
  ref_index = 0
14
14
  query_index = 0
15
15
  each_alignment_chunk do |type, count|
16
+ # puts "ref_i=#{ref_index}, query_index=#{query_index}, num_match=#{num_match}, num_mismatch=#{num_mismatch}"
17
+ # puts "#{type} #{count}"
18
+ # puts "ref=#{reference_sequence_string[ref_index...(reference_sequence_string.length)] }"
19
+ # puts "query=#{query_sequence_string[query_index...(query_sequence_string.length)] }"
16
20
  case type
17
- when 'M'
18
- (0...count).each do |i|
19
- if reference_sequence_string[ref_index+i] == query_sequence_string[query_index+i]
20
- num_match += 1
21
- else
22
- num_mismatch += 1
23
- end
24
- end
25
- ref_index += count
26
- query_index += count
27
21
  when 'I'
28
22
  # Extra characters in the query sequence
29
23
  num_mismatch += count
@@ -36,12 +30,31 @@ module Bio
36
30
  query_index += count
37
31
  when 'H'
38
32
  query_index += count
33
+ when 'P'
34
+ # Do nothing
35
+ when 'N'
36
+ # long skip on the reference sequence
37
+ ref_index += count
39
38
  else
40
- raise "Cigar string not parsed correctly. Unrecognised alignment type #{type}"
39
+ if %w(M = X).include?(type)
40
+ # The = and X operators are not taken at face value: like M, matches and mismatches are recalculated from the sequences, which keeps this method simple.
41
+ (0...count).each do |i|
42
+ if reference_sequence_string[ref_index+i] == query_sequence_string[query_index+i]
43
+ num_match += 1
44
+ else
45
+ num_mismatch += 1
46
+ end
47
+ end
48
+ ref_index += count
49
+ query_index += count
50
+ else
51
+ raise "Cigar string not parsed correctly. Unrecognised alignment type #{type}"
52
+ end
41
53
  end
54
+ #puts "after, ref_i=#{ref_index}, query_index=#{query_index}, num_match=#{num_match}, num_mismatch=#{num_mismatch}"
42
55
  end
43
56
 
44
- percent = num_match.to_f/(num_match+num_mismatch)*100
57
+ percent = num_match.to_f / (num_match+num_mismatch)*100
45
58
  return percent, num_match, num_mismatch
46
59
  end
47
60
 
@@ -55,7 +68,7 @@ module Bio
55
68
  # end
56
69
  def each_alignment_chunk
57
70
  leftover = @cigar_string
58
- while matches = leftover.match(/^(\d+)([MSIHD])(.*)/)
71
+ while matches = leftover.match(/^(\d+)([MSIHNDP\=X])(.*)/)
59
72
  yield matches[2], matches[1].to_i
60
73
  leftover = matches[3]
61
74
  end
@@ -65,4 +65,48 @@ describe "BioCigar" do
65
65
  sam.seq = 'TCAGAGCTACAAGAGTTTGATCGTGGCTCAGAAGGAACGCTAGCTATATGCTTAACACATGCAAGTCGAACGTTGTTTTCGGGGAGCTGGGCAGAAGGAAAAGAGGCTCCTAGCGTGAAGGTAGCTTGTCTCGCCCAGGAGGTGGGAACAGTTGAAAACAAAGTGGCGAACGGGTGCGTAATGCGTGGGAATCTGCCGAACAGTTCGGGCCAAATCCTGAAGAAAGCTAAAAAGCGCTGTTTGATGAGCC'
66
66
  sam.percent_identity(ref)[0].should == 99.58333333333333
67
67
  end
68
+
69
+ it 'should work with padded reference seqs' do
70
+ ref = 'CCGG'
71
+ sam = Bio::DB::Alignment.new
72
+ sam.cigar = '2M1P1I1P3I2M'
73
+ sam.pos = 1
74
+ sam.seq = 'CCAGGTGG'
75
+ sam.percent_identity(ref).should == [
76
+ 50.0,
77
+ 4,
78
+ 4,
79
+ ]
80
+ end
81
+
82
+ it 'should work with X and =' do
83
+ # SAM:
84
+ # 790 16 2303416 1150 1 196M54S * 0 0 ACTGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGCCCAGGGCTACACACGTGCTACAATGGCCGTTACAAAGCGTCGCTAACCCGCGAGGGGGAGCCAATCGCAAAAAAGCGGCCTCAGTTCAGATTGCAGTCTGCAACTCGACTGCATGAAGTTGGAATCCCTAGTAATCGCGTGTCATTAGCGCGCGGTGAATACGTCCCTGCTCCTTGCACTCACCGCCCGT * AS:i:184
85
+ ref = 'GAGCGAACGTTAGCGGCGGGCTTAACACATGCAAGTCGAACGAGAATGAAGGAGCAATCCTTCTAGTAAAGTGGCGGACGGGTGCGTAACACGTGGATAATCTACCTTCCGGCGGGGGACAACAGTTCGAAAGGACTGCTAATACCGCGTACGTCGGCGAGAGCTCAGGCTCTTGTCGGGAAAGATGGCCAATCCTTGGAAGCTGTCACCGGAAGATGAATCCGCGGCCCATCAGGTAGTTGGTGAGGTAATGGCTCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGATCAGCCACACTGGGACTGCGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGCAATGGGCGAAAGCCTGACCCAGCCACGCCGCGTGAGTGATGAAGGCCTTCGGGTCGTAAAGCTCTGTGGGGAGGGACGAACAAGTGCGTATCGAATAAATACGTGCCCTGACGGTACCTCCTTAGCAAGCACCGGCTAACCATGTGCCAGCAGCCGCGGTAATACATGGGGTGCAAACGTTGCTCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTCGCTTAAGTCGGATGTGAAATCCCTCGGCTTAACTGAGGAAGTGCATCCGAGACTGAATGGCTAGAGTACGAAAGAGGGTCGNNNNNTTCCCGGTGTAGAGGTGAAATTCGTAGATATCGGGAGGAACACCGGCGGCGAAGGCGGCGACCTGGTTCGAGACTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGGATGCTAGATGTTTCTGGTATTGACCCCGGAGGCGTCGTAGCTAACGCGATAAGCATCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTCAATTTGACGCAACGCGAAGAACCTTACCTGGGTTGGAACCCTCCAGAAGTCCGCAGAGATGTGGATGTGCTCGCAAGAGAACTGGATGTCCAGGTGCTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCGTTAGTTGCTAACAGTTCGGCTGAGCACTCTAACGAGACTGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGCCCAGGGCTACACACGTGCTACAATGGTCGTTACAAAGCGTCGCTAACCCGCGAGGGGGAGCTAATCGCAAAAAAGCGGCCTCAGTTCAGATTGCAGTCTGCAACTCGACTGCATGAAGTTGGAATCGCTAGTAATCCCTGATCAGCAGGCAGGGGTGAATACGTTCCCGGGCC'
86
+ query = 'ACTGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGCCCAGGGCTACACACGTGCTACAATGGCCGTTACAAAGCGTCGCTAACCCGCGAGGGGGAGCCAATCGCAAAAAAGCGGCCTCAGTTCAGATTGCAGTCTGCAACTCGACTGCATGAAGTTGGAATCCCTAGTAATCGCGTGTCATTAGCGCGCGGTGAATACGTCCCTGCTCCTTGCACTCACCGCCCGT'
87
+ pos = 1150
88
+
89
+ ref_seq = ref[pos-1...ref.length]
90
+ Bio::Cigar.new('100X96=54S').percent_identity(ref_seq, query).should == [ #This example is a little fake because X and = are not true, but it is re-calculated in the code so should not matter
91
+ 98.46938775510205,
92
+ 193,
93
+ 3
94
+ ]
95
+ end
96
+
97
+ it 'should work with N' do
98
+ ref = 'GTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTA'
99
+ query = 'GTGTAACCC'+ 'TCAGAATA'
100
+ sam = Bio::DB::Alignment.new
101
+ sam.cigar = '9M32N8M'
102
+ sam.pos = 1
103
+ sam.seq = query
104
+ expected_matches = 4+3+4+3
105
+ expected_mismatches = 2+1
106
+ sam.percent_identity(ref).should == [
107
+ expected_matches.to_f/ (expected_matches+expected_mismatches)*100,
108
+ expected_matches,
109
+ expected_mismatches,
110
+ ]
111
+ end
68
112
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-cigar
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-27 00:00:00.000000000 Z
11
+ date: 2014-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-samtools