bio-samtools 2.0.5 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/test/sample.vcf ADDED
@@ -0,0 +1,24 @@
1
+ ##fileformat=VCFv4.1
2
+ ##fileDate=20090805
3
+ ##source=myImputationProgramV3.1
4
+ ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
5
+ ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
6
+ ##phasing=partial
7
+ ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
8
+ ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
9
+ ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
10
+ ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
11
+ ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
12
+ ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
13
+ ##FILTER=<ID=q10,Description="Quality below 10">
14
+ ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
16
+ ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
18
+ ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
19
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20
+ 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
21
+ 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
22
+ 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
23
+ 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
24
+ 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
@@ -0,0 +1,2 @@
1
+ chr_1 1 30
2
+ chr_1 40 45
data/test/test_pileup.rb CHANGED
@@ -14,6 +14,8 @@ class TestPileup < Test::Unit::TestCase
14
14
  @pu2 = Bio::DB::Pileup.new("seq1 279 C 23 ATTT,,.,.TTTT,,,.,TTTTT ;75&<<<<<<<<<=<<<9<<:<<")
15
15
  #an indel..
16
16
  @pu3 = Bio::DB::Pileup.new("seq2 156 * +AG/+AG 71 252 99 11 +AG * 3 8 0")
17
+ #two heterozygous alt snps
18
+ @pu4 = Bio::DB::Pileup.new("seq1 279 C 24 AAAAAAAAATTTTTTTTTAATTAA ;75&<<<<<<<<<=<<<9<<:<<<")
17
19
  end
18
20
 
19
21
  def test_non_ref_count
@@ -52,14 +54,14 @@ class TestPileup < Test::Unit::TestCase
52
54
  def test_snp_gt
53
55
  snp = @pu2.send(:snp_gt)
54
56
  assert_equal('T,', snp[0])
55
- assert_equal('1/2', snp[1])
57
+ assert_equal('1/1', snp[1])
56
58
  end
57
59
 
58
60
  def test_genotype_list
59
61
  gt2 = @pu2.genotype_list
60
62
  gt3 = @pu3.genotype_list
61
63
  assert_equal('T,', gt2[0])
62
- assert_equal('1/2', gt2[1])
64
+ assert_equal('1/1', gt2[1])
63
65
  assert_equal('IAG', gt3[0])
64
66
  assert_equal('1/1', gt3[1])
65
67
  end
@@ -81,4 +83,8 @@ class TestPileup < Test::Unit::TestCase
81
83
  assert_equal(5, @pu.to_s.count("\t"))
82
84
  end
83
85
 
86
+ def test_consensus_iuap
87
+ assert_equal('w', @pu4.consensus_iuap(0.1))
88
+ end
89
+
84
90
  end
data/test/test_sam.rb CHANGED
@@ -22,6 +22,7 @@ class TestBioDbSam < Test::Unit::TestCase
22
22
  File.delete("test/samples/small/test_sorted.bam")
23
23
  File.delete("test/samples/small/maps_merged.bam")
24
24
  File.delete("test/samples/small/maps_cated.bam")
25
+ File.delete("test/samples/small/testu.out")
25
26
  end
26
27
  end
27
28
 
@@ -31,6 +32,7 @@ class TestBioDbSam < Test::Unit::TestCase
31
32
  @testTAMFile = @test_folder + "/test.tam"
32
33
  @testBAMFile = @test_folder + "/testu.bam"
33
34
  @testReference = @test_folder + "/test_chr.fasta"
35
+ @bed_file = @test_folder + "/testu.bed"
34
36
  @sam = Bio::DB::Sam.new(
35
37
  :fasta => @testReference,
36
38
  :bam => @testBAMFile
@@ -97,14 +99,14 @@ class TestBioDbSam < Test::Unit::TestCase
97
99
  def test_fetch_with_function
98
100
  #pass the assert to method
99
101
  block = Proc.new {|a| assert_equal(a.class, Bio::DB::Alignment)}
100
- @sam.fetch_with_function("chr_1", 10,1000, &block)
102
+ @sam.fetch_with_function("chr_1", 10, 1000, &block)
101
103
  end
102
104
 
103
105
  def test_chromosome_coverage
104
106
  #the coverage should only be 1.0 or 2.0
105
- cov = @sam.chromosome_coverage("chr_1", 33, 19)
107
+ cov = @sam.chromosome_coverage("chr_1", 10, 1000)
106
108
  cov.each do |pu|
107
- assert_send([[1.0 , 2.0], :member?, pu])
109
+ assert_send([[1.0 , 2.0, 3.0], :member?, pu])
108
110
  end
109
111
  end
110
112
 
@@ -197,14 +199,23 @@ class TestBioDbSam < Test::Unit::TestCase
197
199
  assert_equal(pileup.chrom, 'chr_1')
198
200
  end
199
201
  end
202
+
203
+ def test_region_new
204
+ reg1 = Bio::DB::Fasta::Region.new(:entry=>"chr_1", :start=>1, :end=>334)
205
+ reg2 = Bio::DB::Fasta::Region.new
206
+ reg2.entry = "chr_1"
207
+ reg2.start = 1
208
+ reg2.end = 334
209
+
210
+ assert_equal(reg1.entry, reg2.entry)
211
+ assert_equal(reg1.start, reg2.start)
212
+ assert_equal(reg1.end, reg2.end)
213
+ end
200
214
 
201
215
  def test_mpileup_reg
202
216
  #create an mpileup
203
- reg = Bio::DB::Fasta::Region.new
204
- reg.entry = "chr_1"
205
- reg.start = 1
206
- reg.end = 334
207
-
217
+ reg = Bio::DB::Fasta::Region.new(:entry=>"chr_1", :start=>1, :end=>334)
218
+
208
219
  @sam.mpileup_cached(:r=>reg,:g => false, :min_cov => 1, :min_per =>0.2) do |pileup|
209
220
  #test that all the objects are Bio::DB::Pileup objects
210
221
  assert_kind_of(Bio::DB::Pileup, pileup)
@@ -227,11 +238,8 @@ class TestBioDbSam < Test::Unit::TestCase
227
238
 
228
239
  def test_mpileup_reg_05
229
240
  #create an mpileup
230
- reg = Bio::DB::Fasta::Region.new
231
- reg.entry = "chr_1"
232
- reg.start = 1
233
- reg.end = 334
234
- @sam.mpileup_cached(:r=>reg,:g => false, :min_cov => 1, :min_per =>0.4) do |pileup|
241
+ reg = Bio::DB::Fasta::Region.new(:entry=>"chr_1", :start=>1, :end=>334)
242
+ @sam.mpileup_cached(:r=>reg, :g => false, :min_cov => 1, :min_per =>0.4) do |pileup|
235
243
  #test that all the objects are Bio::DB::Pileup objects
236
244
  assert_kind_of(Bio::DB::Pileup, pileup)
237
245
  #test that the reference name is 'chr_1' for all objects
@@ -343,5 +351,17 @@ class TestBioDbSam < Test::Unit::TestCase
343
351
  #force an error (use 'samtool' instead of 'samtools')
344
352
  output = Bio::DB::Sam.docs('samtool', 'tview')
345
353
  assert_equal(output, "program must be 'samtools' or 'bcftools'")
346
- end
354
+ end
355
+
356
+ def test_bedcov
357
+ out_file = @test_folder + "/testu.out"
358
+ @sam.bedcov(:bed=>@bed_file, :out=>out_file)
359
+ f = File.open(out_file, "r")
360
+ f.each_line do |line|
361
+ f_array= line.split(/\t/)
362
+ assert_equal(f_array[3], 630)
363
+ end
364
+ f.close
365
+ end
366
+
347
367
  end
@@ -1,85 +1,246 @@
1
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
- "http://www.w3.org/TR/html4/strict.dtd">
3
-
4
- <html lang="en">
1
+ <!doctype html>
2
+ <html>
5
3
  <head>
6
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
- <title>untitled</title>
8
- <meta name="generator" content="TextMate http://macromates.com/">
9
- <meta name="author" content="macleand">
10
- <!-- Date: 2011-10-14 -->
11
- <link rel="stylesheet" type="text/css" href="basic_styles.css"></link>
12
- <link rel="stylesheet" type="text/css" href="http://drnicwilliams.com/external/CodeHighlighter/styles.css"></link>
13
- <script src="http://drnicwilliams.com/external/CodeHighlighter/clean_tumblr_pre.js"></script>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
6
+ <style>
7
+ h1,
8
+ h2,
9
+ h3,
10
+ h4,
11
+ h5,
12
+ h6,
13
+ p,
14
+ blockquote {
15
+ margin: 0;
16
+ padding: 0;
17
+ }
18
+ body {
19
+ font-family: "Helvetica Neue", Helvetica, "Hiragino Sans GB", Arial, sans-serif;
20
+ font-size: 13px;
21
+ line-height: 18px;
22
+ color: #737373;
23
+ background-color: white;
24
+ margin: 10px 13px 10px 13px;
25
+ }
26
+ table {
27
+ margin: 10px 0 15px 0;
28
+ border-collapse: collapse;
29
+ }
30
+ td,th {
31
+ border: 1px solid #ddd;
32
+ padding: 3px 10px;
33
+ }
34
+ th {
35
+ padding: 5px 10px;
36
+ }
37
+
38
+ a {
39
+ color: #0069d6;
40
+ }
41
+ a:hover {
42
+ color: #0050a3;
43
+ text-decoration: none;
44
+ }
45
+ a img {
46
+ border: none;
47
+ }
48
+ p {
49
+ margin-bottom: 9px;
50
+ }
51
+ h1,
52
+ h2,
53
+ h3,
54
+ h4,
55
+ h5,
56
+ h6 {
57
+ color: #404040;
58
+ line-height: 36px;
59
+ }
60
+ h1 {
61
+ margin-bottom: 18px;
62
+ font-size: 30px;
63
+ }
64
+ h2 {
65
+ font-size: 24px;
66
+ }
67
+ h3 {
68
+ font-size: 18px;
69
+ }
70
+ h4 {
71
+ font-size: 16px;
72
+ }
73
+ h5 {
74
+ font-size: 14px;
75
+ }
76
+ h6 {
77
+ font-size: 13px;
78
+ }
79
+ hr {
80
+ margin: 0 0 19px;
81
+ border: 0;
82
+ border-bottom: 1px solid #ccc;
83
+ }
84
+ blockquote {
85
+ padding: 13px 13px 21px 15px;
86
+ margin-bottom: 18px;
87
+ font-family:georgia,serif;
88
+ font-style: italic;
89
+ }
90
+ blockquote:before {
91
+ content:"\201C";
92
+ font-size:40px;
93
+ margin-left:-10px;
94
+ font-family:georgia,serif;
95
+ color:#eee;
96
+ }
97
+ blockquote p {
98
+ font-size: 14px;
99
+ font-weight: 300;
100
+ line-height: 18px;
101
+ margin-bottom: 0;
102
+ font-style: italic;
103
+ }
104
+ code, pre {
105
+ font-family: Monaco, Andale Mono, Courier New, monospace;
106
+ }
107
+ code {
108
+ background-color: #fee9cc;
109
+ color: rgba(0, 0, 0, 0.75);
110
+ padding: 1px 3px;
111
+ font-size: 12px;
112
+ -webkit-border-radius: 3px;
113
+ -moz-border-radius: 3px;
114
+ border-radius: 3px;
115
+ }
116
+ pre {
117
+ display: block;
118
+ padding: 14px;
119
+ margin: 0 0 18px;
120
+ line-height: 16px;
121
+ font-size: 11px;
122
+ border: 1px solid #d9d9d9;
123
+ white-space: pre-wrap;
124
+ word-wrap: break-word;
125
+ }
126
+ pre code {
127
+ background-color: #fff;
128
+ color:#737373;
129
+ font-size: 11px;
130
+ padding: 0;
131
+ }
132
+ sup {
133
+ font-size: 0.83em;
134
+ vertical-align: super;
135
+ line-height: 0;
136
+ }
137
+ * {
138
+ -webkit-print-color-adjust: exact;
139
+ }
140
+ @media screen and (min-width: 914px) {
141
+ body {
142
+ width: 854px;
143
+ margin:10px auto;
144
+ }
145
+ }
146
+ @media print {
147
+ body,code,pre code,h1,h2,h3,h4,h5,h6 {
148
+ color: black;
149
+ }
150
+ table, pre {
151
+ page-break-inside: avoid;
152
+ }
153
+ }
154
+ </style>
155
+ <title>Prerequisites</title>
14
156
 
15
157
  </head>
16
158
  <body>
17
- <div id="wrap">
18
- <div id="main">
19
- <div id="header">
20
- <h1>bio-samtools Basic Tutorial</h1>
21
- </div>
22
- <div>
23
- <h2>Introduction</h2>
24
- <p>bio-samtools is a Ruby binding to the popular <a href="http://samtools.sourceforge.net/">SAMtools</a> library, and provides access to individual read alignments as well as BAM files, reference sequence and pileup information. </p>
25
- </div>
26
- <div>
27
- <h2>Installation</h2>
28
- <p>Installation of bio-samtools is very straightforward, and is accomplished with the Ruby <emph>gems</emph> command. All you need is an internet connection.</p>
29
- <h3>Prerequisites</h3>
30
- <p>bio-samtools relies on the following other rubygems:</p>
31
- <ul>
32
- <li><a href="http://rubygems.org/gems/ffi">FFI</a></li>
33
- <li><a href="http://rubygems.org/gems/bio">bio >= 1.4.1</a> </li>
34
- </ul>
35
- Once these are installed, bio-samtools can be installed with
36
- <pre><code class="ruby">sudo gem install bio-samtools
37
- </code></pre>
38
- It should then be easy to test whether installation went well. Start interactive Ruby (IRB) in the terminal, and type <code>require 'bio-samtools'</code> if the terminal returns <code>true</code> then all is well.
39
- <pre><code class="ruby">$ irb
40
- >> require 'bio-samtools'
41
- => true</code></pre>
42
- <h2>Working with BAM files</h2><br />
43
- <h3>Creating a new SAM object</h3>
44
- <p>
45
- A SAM object represents the alignments in the BAM file, and is very straightforward to create, you will need a sorted BAM file, to access the alignments and a reference sequence in FASTA format to use the reference sequence. The object can be created and opened as follows:</p>
46
- <pre><code class="ruby">bam = Bio::DB::Sam.new(:bam=>"my_sorted.bam", :fasta=>'ref.fasta')
47
- bam.open</code></pre>
48
- Opening the file needs only to be done once for multiple operations on it, access to the alignments is random so you don't need to loop over the entries in the file.
49
- </p>
50
- <h3>Getting Reference Sequence</h3>
51
- <p>Retrieving the reference can only be done if the reference has been loaded, which isn't done automatically in order to save memory. Reference need only be loaded once, and is accessed using reference name, start, end in 1-based co-ordinates. A standard Ruby String object is returned.</p>
52
- <pre><code class="ruby">bam.load_reference
53
- sequence_fragment = bam.fetch_reference("Chr1", 1, 500)</code></pre>
54
- <h3>Getting Alignments</h3>
55
- <p>Alignments can be obtained one at a time by looping over a specified region using the <code>fetch()</code> function.</p>
56
- <pre><code class="ruby">bam.load_reference
57
- bam.fetch("1",3000,4000).each do |alignment|
58
- #do something with the alignment...
59
- end</code></pre>
60
- <p> A separate method <code>fetch_with_function()</code> allows you to pass a block (or a Proc object) to the function for efficient calculation. This example shows a naive conversion of the alignment object to a GFF3 object, which is stored in an array <code>gff_list</code> </p>
61
- <pre><code class="ruby">gff_list = []
62
- fetchAlignment = Proc.new do |a|
63
- #what strand is this alignment on...
64
- a.query_strand ? strand = '+' : strand = '-'
65
- gff_list << Bio::DB::GFF3.new(
66
- :seqid => "Chr1",
67
- :start => a.pos - 1,
68
- :end => a.calend,
69
- :strand => strand,
70
- :sequence => a.seq,
71
- :quality => a.qual,
72
- :feature => 'read',
73
- :source => 'BWA',
74
- :phase => '.',
75
- :score => '.'
76
- )
77
- 0
78
- end
79
- bam.fetch_with_function("Chr1", 3000, 4000, fetchAlignment) #now run the fetch</code></pre>
80
- <h4>Alignment Objects</h4>
81
- <p>The individual alignments represent a single read and are returned as Bio::DB::Alignment objects. These have numerous methods of their own, using <code>require 'pp'</code> will allow you to check the attributes contained in each object. Here is an example alignment object. Remember <code>@</code> represents a Ruby instance variable and can be accessed as any other method. Thus the <code>@is_mapped</code> attribute of an object <code>a</code> is accessed <code>a.is_mapped</code></p>
82
- <pre><code class="ruby">require 'pp'
159
+ <h1>bio-SAMtools Basic Tutorial</h1>
160
+
161
+ <h2>Introduction</h2>
162
+
163
+ <p>bio-SAMtools is a Ruby binding to the popular <a href="http://SAMtools.sourceforge.net/">SAMtools</a> library, and provides access to individual read alignments as well as BAM files, reference sequence and pileup information.</p>
164
+
165
+ <h2>Installation</h2>
166
+
167
+ <p>Installation of bio-SAMtools is very straightforward, and is
168
+ accomplished with the Ruby gems command. All you need is an internet
169
+ connection.</p>
170
+
171
+ <h3>Prerequisites</h3>
172
+
173
+ <p>bio-SAMtools relies on the following other rubygems:</p>
174
+
175
+ <ul>
176
+ <li><a href="http://rubygems.org/gems/bio">bio >= 1.4.2</a></li>
177
+ <li><a href="https://rubygems.org/gems/bio-svgenes">bio-svgenes >= 0.4.1</a></li>
178
+ </ul>
179
+
180
+
181
+ <p>Once these are installed, bio-SAMtools can be installed with</p>
182
+
183
+ <pre><code>sudo gem install bio-samtools
184
+ </code></pre>
185
+
186
+ <p>It should then be easy to test whether installation went well. Start
187
+ interactive Ruby (IRB) in the terminal, and type
188
+ <code>require 'bio-samtools'</code> if the terminal returns <code>true</code> then all is
189
+ well.</p>
190
+
191
+ <pre><code>$ irb
192
+ &gt;&gt; require 'bio-samtools'
193
+ =&gt; true
194
+ </code></pre>
195
+
196
+ <h2>Working with BAM files</h2>
197
+
198
+ <h3>Creating a new SAM object</h3>
199
+
200
+ <p>A SAM object represents the alignments in the BAM file. BAM files (and hence SAM objects here) are what most of SAMtools methods operate on and are very straightforward to create. You will need a sorted BAM file, to access the alignments and a reference sequence in FASTA format to use the reference sequence. The object can be created and opened as follows:</p>
201
+
202
+ <pre><code>bam = Bio::DB::Sam.new(:bam=&gt;"my_sorted.bam", :fasta=&gt;'ref.fasta')
203
+ </code></pre>
204
+
205
+ <p>Opening the file needs only to be done once for multiple operations on
206
+ it, access to the alignments is random so you don't need to loop over
207
+ the entries in the file.</p>
208
+
209
+ <h3>Getting Reference Sequence</h3>
210
+
211
+ <p>The reference is accessed using reference
212
+ name, start, end in 1-based co-ordinates. A standard Ruby String object is returned.</p>
213
+
214
+ <pre><code>sequence_fragment = bam.fetch_reference("Chr1", 1, 100)
215
+ </code></pre>
216
+
217
+ <p>The output from this would be the raw sequence as a string, e.g.</p>
218
+
219
+ <pre><code>cctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
220
+ </code></pre>
221
+
222
+ <p>A reference sequence can be returned as a Bio::Sequence::NA object by the use of :as_bio => true</p>
223
+
224
+ <pre><code>sequence_fragment = bam.fetch_reference("Chr1", 1, 100, :as_bio =&gt; true)
225
+ </code></pre>
226
+
227
+ <p>The output from this would be a Bio::Sequence::NA object, which provides a fasta-formatted string when printed</p>
228
+
229
+ <pre><code>&gt;Chr1:1-100
230
+ cctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
231
+ </code></pre>
232
+
233
+ <h3>Alignment Objects</h3>
234
+
235
+ <p>The individual alignments represent a single read and are returned as
236
+ Bio::DB::Alignment objects. These have numerous methods of their own,
237
+ using <code>require 'pp'</code> will allow you to check the attributes contained in
238
+ each object. Here is an example alignment object. Remember <code>@</code>
239
+ represents a Ruby instance variable and can be accessed as any other
240
+ method. Thus the <code>@is_mapped</code> attribute of an object <code>a</code> is accessed
241
+ <code>a.is_mapped</code></p>
242
+
243
+ <pre><code>require 'pp'
83
244
  pp an_alignment_object ##some Bio::DB::Alignment object
84
245
  #&lt;Bio::DB::Alignment:0x101113f80
85
246
  @al=#&lt;Bio::DB::SAM::Tools::Bam1T:0x101116a50&gt;,
@@ -108,83 +269,206 @@ pp an_alignment_object ##some Bio::DB::Alignment object
108
269
  @second_in_pair=true,
109
270
  @seq="ACAGTCCAGTCAAAGTACAAATCGAG...",
110
271
  @tags=
111
- {"MD"=>#&lt;Bio::DB::Tag:0x101114ed0 @tag="MD", @type="Z", @value="76"&gt;,
112
- "XO"=>#&lt;Bio::DB::Tag:0x1011155d8 @tag="XO", @type="i", @value="0"&gt;,
113
- "AM"=>#&lt;Bio::DB::Tag:0x101116280 @tag="AM", @type="i", @value="37"&gt;,
114
- "X0"=>#&lt;Bio::DB::Tag:0x101115fb0 @tag="X0", @type="i", @value="1"&gt;,
115
- "X1"=>#&lt;Bio::DB::Tag:0x101115c68 @tag="X1", @type="i", @value="0"&gt;,
116
- "XG"=>#&lt;Bio::DB::Tag:0x101115240 @tag="XG", @type="i", @value="0"&gt;,
117
- "SM"=>#&lt;Bio::DB::Tag:0x1011162f8 @tag="SM", @type="i", @value="37"&gt;,
118
- "XT"=>#&lt;Bio::DB::Tag:0x1011162a8 @tag="XT", @type="A", @value="U"&gt;,
119
- "NM"=>#&lt;Bio::DB::Tag:0x101116348 @tag="NM", @type="i", @value="0"&gt;,
120
- "XM"=>#&lt;Bio::DB::Tag:0x101115948 @tag="XM", @type="i", @value="0"&gt;}&gt;</code></pre>
121
- <h2>Getting Coverage Information</h2><br />
122
- <h3>Per Base Coverage</h3>
123
- <p>It is easy to get the total depth of reads at a given position, the <code>chromosome_coverage</code> function is used. This differs from the previous functions in that a start position and length (rather than end position) are passed to the function. An array of coverages is returned, the first position in the array gives the depth of coverage at the given start position in the genome, the last position in the array gives the depth of coverage at the given start position plus the length given</p>
124
- <pre><code class="ruby">coverages = bam.chromosome_coverage("Chr1", 3000, 1000) #=> [16,16,25,25...]</code></pre>
272
+ {"MD"=&gt;#&lt;Bio::DB::Tag:0x101114ed0 @tag="MD", @type="Z", @value="76"&gt;,
273
+ "XO"=&gt;#&lt;Bio::DB::Tag:0x1011155d8 @tag="XO", @type="i", @value="0"&gt;,
274
+ "AM"=&gt;#&lt;Bio::DB::Tag:0x101116280 @tag="AM", @type="i", @value="37"&gt;,
275
+ "X0"=&gt;#&lt;Bio::DB::Tag:0x101115fb0 @tag="X0", @type="i", @value="1"&gt;,
276
+ "X1"=&gt;#&lt;Bio::DB::Tag:0x101115c68 @tag="X1", @type="i", @value="0"&gt;,
277
+ "XG"=&gt;#&lt;Bio::DB::Tag:0x101115240 @tag="XG", @type="i", @value="0"&gt;,
278
+ "SM"=&gt;#&lt;Bio::DB::Tag:0x1011162f8 @tag="SM", @type="i", @value="37"&gt;,
279
+ "XT"=&gt;#&lt;Bio::DB::Tag:0x1011162a8 @tag="XT", @type="A", @value="U"&gt;,
280
+ "NM"=&gt;#&lt;Bio::DB::Tag:0x101116348 @tag="NM", @type="i", @value="0"&gt;,
281
+ "XM"=&gt;#&lt;Bio::DB::Tag:0x101115948 @tag="XM", @type="i", @value="0"&gt;}&gt;
282
+ </code></pre>
283
+
284
+ <h3>Getting Alignments</h3>
285
+
286
+ <p>Alignments can be obtained one at a time by looping over a specified region using the <code>fetch()</code> function.</p>
287
+
288
+ <pre><code>bam.fetch("Chr1",3000,4000).each do |alignment|
289
+ #do something with the alignment...
290
+ end
291
+ </code></pre>
292
+
293
+ <p>A separate method <code>fetch_with_function()</code> allows you to pass a block (or
294
+ a Proc object) to the function for efficient calculation. This example takes
295
+ an alignment object and returns an array of sequences which exactly match the reference.</p>
296
+
297
+ <pre><code>#an array to hold the matching sequences
298
+ exact_matches = []
299
+
300
+ matches = Proc.new do |a|
301
+ #get the length of each read
302
+ len = a.seq.length
303
+ #get the cigar string
304
+ cigar = a.cigar
305
+ #create a cigar string which represents a full-length match
306
+ cstr = len.to_s &lt;&lt; "M"
307
+ if cigar == cstr
308
+ #add the current sequence to the array if it qualifies
309
+ exact_matches &lt;&lt; a.seq
310
+ end
311
+ end
312
+
313
+ bam.fetch_with_function("Chr1", 100, 500, &amp;matches)
314
+
315
+ puts exact_matches
316
+ </code></pre>
317
+
318
+ <h3>Alignment stats</h3>
319
+
320
+ <p>The SAMtools flagstat method is implemented in bio-SAMtools to quickly examine the number of reads mapped to the reference. This includes the number of paired and singleton reads mapped and also the number of paired-reads that map to different chromosomes/contigs.</p>
321
+
322
+ <pre><code>bam.flag_stats()
323
+ </code></pre>
324
+
325
+ <p>An example output would be</p>
326
+
327
+ <pre><code>34672 + 0 in total (QC-passed reads + QC-failed reads)
328
+ 0 + 0 duplicates
329
+ 33196 + 0 mapped (95.74%:nan%)
330
+ 34672 + 0 paired in sequencing
331
+ 17335 + 0 read1
332
+ 17337 + 0 read2
333
+ 31392 + 0 properly paired (90.54%:nan%)
334
+ 31728 + 0 with itself and mate mapped
335
+ 1468 + 0 singletons (4.23%:nan%)
336
+ 0 + 0 with mate mapped to a different chr
337
+ 0 + 0 with mate mapped to a different chr (mapQ&gt;=5)
338
+ </code></pre>
339
+
340
+ <h2>Getting Coverage Information</h2>
341
+
342
+ <h3>Per Base Coverage</h3>
343
+
344
+ <p>It is easy to get the total depth of reads at a given position, the
345
+ <code>chromosome_coverage</code> function is used. This differs from the previous
346
+ functions in that a start position and length (rather than end position)
347
+ are passed to the function. An array of coverages is returned, the first
348
+ position in the array gives the depth of coverage at the given start
349
+ position in the genome, the last position in the array gives the depth
350
+ of coverage at the given start position plus the length given</p>
351
+
352
+ <pre><code>coverages = bam.chromosome_coverage("Chr1", 3000, 1000) #=&gt; [16,16,25,25...]
353
+ </code></pre>
354
+
125
355
  <h3>Average Coverage In A Region</h3>
126
- <p>Similarly, average (arithmetic mean) of coverage can be retrieved, also with start and length parameters</p>
127
- <pre><code class="ruby">coverages = bam.average_coverage("Chr1", 3000, 1000) #=> 20.287</code></pre>
128
356
 
129
- <h3>Getting Pileup Information</h3>
130
- <p>Pileup format represents the coverage of reads over a single base in the reference. Getting a Pileup over a region is very easy. Note that this is done with <code>mpileup</code> and NOT the now deprecated SAMTools <code>pileup</code> function. Calling the <code>mpileup</code> method creates an iterator that yields a Pileup object for each base.</p>
131
- <pre><code class="ruby">bam.mpileup do |pileup|
132
- puts pileup.consensus #gives the consensus base from the reads for that postion
357
+ <p>Similarly, average (arithmetic mean) of coverage can be retrieved with the <code>average_coverage</code> method.</p>
358
+
359
+ <pre><code>coverages = bam.average_coverage("Chr1", 3000, 1000) #=&gt; 20.287
360
+ </code></pre>
361
+
362
+ <h2>Getting Pileup Information</h2>
363
+
364
+ <p>Pileup format represents the coverage of reads over a single base in the
365
+ reference. Getting a Pileup over a region is very easy. Note that this
366
+ is done with <code>mpileup</code> and NOT the now deprecated SAMtools <code>pileup</code>
367
+ function. Calling the <code>mpileup</code> method creates an iterator that yields a
368
+ Pileup object for each base.</p>
369
+
370
+ <pre><code>bam.mpileup do |pileup|
371
+ puts pileup.consensus #gives the consensus base from the reads for that position
133
372
  end
134
373
  </code></pre>
374
+
375
+ <h3>Caching pileups</h3>
376
+
377
+ <p>A pileup can be cached, so if you want to execute several operations on the same set of regions, mpileup won't be executed several times. Whenever you finish using a region, call mpileup_clear_cache to free the cache. The argument 'Region' is required, as it will be the key for the underlying hash. We assume that the options (other than the region) are constant. If they are not, the cache mechanism may not be consistent.</p>
378
+
379
+ <pre><code>#create an mpileup
380
+ reg = Bio::DB::Fasta::Region.new
381
+ reg.entry = "Chr1"
382
+ reg.start = 1
383
+ reg.end = 334
384
+
385
+ bam.mpileup_cached(:r=&gt;reg,:g =&gt; false, :min_cov =&gt; 1, :min_per =&gt;0.2) do |pileup|
386
+ puts pileup.consensus
387
+ end
388
+ bam.mpileup_clear_cache(reg)
389
+ </code></pre>
390
+
135
391
  <h4>Pileup options</h4>
136
- <p>The <code>mpileup</code> function takes a range of parameters to allow SAMTools level filtering of reads and alignments. They are specified as key =&gt; value pairs eg</p>
137
- <pre><code class="ruby">bam.mpileup(:r => "Chr1:1000-2000", :Q => 50) do |pileup|
138
- ##only pileups on Chr1 between positions 1000-2000 are considered,
139
- ##bases with Quality Score < 50 are excluded
140
- ...
392
+
393
+ <p>The <code>mpileup</code> function takes a range of parameters to allow SAMtools
394
+ level filtering of reads and alignments. They are specified as key =>
395
+ value pairs eg</p>
396
+
397
+ <pre><code>bam.mpileup(:r =&gt; "Chr1:1000-2000", :Q =&gt; 50) do |pileup|
398
+ ##only pileups on Chr1 between positions 1000-2000 are considered,
399
+ ##bases with Quality Score &lt; 50 are excluded
400
+ ...
141
401
  end
142
- </code></pre>
143
- <p>Not all the options SAMTools allows you to pass to mpileup will return a Pileup object, those that cause mpileup to return BCF/VCF will be ignored. Specifically these are g,u,e,h,I,L,o,p. The table below lists the SAMTools flags supported and the symbols you can use to call them in the mpileup command.
144
- <table>
145
- <tr><th>SAMTools option</th><th>description</th><th>short symbol</th><th>long symbol</th><th>default</th><th>example</th></tr>
146
- <tr><td><code>r</code></td><td>limit retrieval to a region</td><td><code>:r</code></td><td><code>:region</code></td><td>all positions</td><td><code>:r => "Chr1:1000-2000"</code></tr>
147
- <tr><td><code>6</code></td><td>assume Illumina scaled quality scores</td><td><code>:six</code></td><td><code>:illumina_quals</code></td><td>false</td><td><code>:six => true</code></tr>
148
- <tr><td><code>A</code></td><td>count anomalous read pairs scores</td><td><code>:A</code></td><td><code>:count_anomalous</code></td><td>false</td><td><code>:A => true</code></tr>
149
- <tr><td><code>B</code></td><td>disable BAQ computation</td><td><code>:B</code></td><td><code>:no_baq</code></td><td>false</td><td><code>:no_baq => true</code></tr>
150
- <tr><td><code>C</code></td><td>parameter for adjusting mapQ</td><td><code>:C</code></td><td><code>:adjust_mapq</code></td><td>0</td><td><code>:C => 25</code></tr>
151
- <tr><td><code>d</code></td><td>max per-BAM depth to avoid excessive memory usage</td><td><code>:d</code></td><td><code>:max_per_bam_depth</code></td><td>250</td><td><code>:d => 123</code></tr>
152
- <tr><td><code>E</code></td><td>extended BAQ for higher sensitivity but lower specificity</td><td><code>:E</code></td><td><code>:extended_baq</code></td><td>false</td><td><code>:E => true</code></tr>
153
- <tr><td><code>G</code></td><td>exclude read groups listed in FILE</td><td><code>:G</code></td><td><code>:exclude_reads_file</code></td><td>false</td><td><code>:G => 'my_file.txt'</code></tr>
154
- <tr><td><code>l</code></td><td>list of positions (chr pos) or regions (BED)</td><td><code>:l</code></td><td><code>:list_of_positions</code></td><td>false</td><td><code>:l => 'my_posns.bed'</code></tr>
155
- <tr><td><code>M</code></td><td>cap mapping quality at value</td><td><code>:M</code></td><td><code>:mapping_quality_cap</code></td><td>60</td><td><code>:M => 40 </code></tr>
156
- <tr><td><code>R</code></td><td>ignore RG tags</td><td><code>:R</code></td><td><code>:ignore_rg</code></td><td>false</td><td><code>:R => true </code></tr>
157
- <tr><td><code>q</code></td><td>skip alignments with mapping quality smaller than value</td><td><code>:q</code></td><td><code>:min_mapping_quality</code></td><td>0</td><td><code>:q => 30 </code></tr>
158
- <tr><td><code>Q</code></td><td>skip bases with base quality smaller than value</td><td><code>:Q</code></td><td><code>:imin_base_quality</code></td><td>13</td><td><code>:Q => 30 </code></tr>
159
- </table>
160
- <br />
161
- <p>There is an 'experimental' function, <code>mpileup_plus</code>, that can return a Bio::DB::Vcf object when g,u,e,h,I,L,o,p options are passed. The list below shows the symbols you can use to invoke this behaviour:</p>
162
- <ul>
163
- <li><code>:genotype_calling, :g</code></li>
164
- <li><code>:uncompressed_bcf , :u</code></li>
165
- <li><code>:extension_sequencing_probability, :e</code></li>
166
- <li><code>:homopolymer_error_coefficient, :h</code></li>
167
- <li><code>:no_indels, :I</code></li>
168
- <li><code>:skip_indel_over_average_depth, :L</code></li>
169
- <li><code>:gap_open_sequencing_error_probability,:o</code></li>
170
- <li><code>:platforms, :P</code></li>
171
- </ul>
172
- <br />
173
- <h2>Tests</h2>
174
- <p>The easiest way to run the built-in unit tests is to change to the bio-samtools installation directory (discoverable by typing 'gem which bio-samtools' at the command line) and running the separate test files individually.</p>
175
- <pre><code class="ruby">gem which 'bio-samtools'
176
- /some/path/ruby-1.x.x/bio-samtools-0.x.x/lib/bio-samtools.rb
177
- cd /some/path/ruby-1.x.x/bio-samtools-0.x.x
178
- ruby test/test_basic.rb
179
- </code></pre>
180
- <p> Each test file tests different aspects of the gem.
181
- <ul>
182
- <li>test_basic.rb - tests the general functionality of the gem, such as opening and closing BAM, creating indices, fetching regions, checks the correct Pileup format is returned when requested etc.</li>
183
- <li>test_pileup.rb - tests the Pileup class, making sure attributes are set correctly when Pileup data are passed</li>
184
- <li>test_vcf.rb - tests the Vcf class, making sure attributes are set correctly when Vcf data are passed</li>
185
- </div>
186
- </div>
187
- </div>
402
+ </code></pre>
403
+
404
+ <p>Not all the options SAMtools allows you to pass to mpileup will return a
405
+ Pileup object; those that cause mpileup to return BCF/VCF will be
406
+ ignored. Specifically these are g,u,e,h,I,L,o,p. The table below lists
407
+ the SAMtools flags supported and the symbols you can use to call them in
408
+ the mpileup command.</p>
409
+
410
+ <table><tr><th>SAMtools options</th><th>description</th><th>short symbol</th><th>long symbol</th><th>default</th><th>example</th></tr>
411
+ <tr><td>r</td><td>limit retrieval to a region</td><td>:r</td><td>:region</td><td>all positions</td><td>:r => "Chr1:1000-2000"</td></tr>
412
+ <tr><td>6</td><td>assume Illumina scaled quality scores</td><td>:six</td><td>:illumina_quals</td><td>false</td><td>:six => true</td></tr>
413
+ <tr><td>A</td><td>count anomalous read pairs</td><td>:A</td><td>:count_anomalous</td><td>false</td><td>:A => true</td></tr>
414
+ <tr><td>B</td><td>disable BAQ computation</td><td>:B</td><td>:no_baq</td><td>false</td><td>:no_baq => true</td></tr>
415
+ <tr><td>C</td><td>parameter for adjusting mapQ</td><td>:C</td><td>:adjust_mapq</td><td>0</td><td>:C => 25</td></tr>
416
+ <tr><td>d</td><td>max per-BAM depth to avoid excessive memory usage</td><td>:d</td><td>:max_per_bam_depth</td><td>250</td><td>:d => 123</td></tr>
417
+ <tr><td>E</td><td>extended BAQ for higher sensitivity but lower specificity</td><td>:E</td><td>:extended_baq</td><td>false</td><td>:E => true</td></tr>
418
+ <tr><td>G</td><td>exclude read groups listed in FILE</td><td>:G</td><td>:exclude_reads_file</td><td>false</td><td>:G => "my_file.txt"</td></tr>
419
+ <tr><td>l</td><td>list of positions (chr pos) or regions (BED)</td><td>:l</td><td>:list_of_positions</td><td>false</td><td>:l => "my_posns.bed"</td></tr>
420
+ <tr><td>M</td><td>cap mapping quality at value</td><td>:M</td><td>:mapping_quality_cap</td><td>60</td><td>:M => 40 </td></tr>
421
+ <tr><td>R</td><td>ignore RG tags</td><td>:R</td><td>:ignore_rg</td><td>false</td><td>:R => true </td></tr>
422
+ <tr><td>q</td><td>skip alignments with mapping quality smaller than value</td><td>:q</td><td>:min_mapping_quality</td><td>0</td><td>:q => 30 </td></tr>
423
+ <tr><td>Q</td><td>skip bases with base quality smaller than value</td><td>:Q</td><td>:min_base_quality</td><td>13</td><td>:Q => 30</td></tr>
424
+ </table>
425
+
426
+
427
+ <h2>Coverage Plots</h2>
428
+
429
+ <p>You can create images that represent read coverage over binned regions of the reference sequence. The output format is SVG. A number of parameters can be changed to alter the style of the plot. In the examples below, the bin size and fill_color have been used to create plots with different colours and bar widths.</p>
430
+
431
+ <p>The following lines of code...</p>
432
+
433
+ <pre><code>bam.plot_coverage("Chr1", 201, 2000, :bin=&gt;20, :svg =&gt; "out2.svg", :fill_color =&gt; '#F1A1B1')
434
+ bam.plot_coverage("Chr1", 201, 2000, :bin=&gt;50, :svg =&gt; "out.svg", :fill_color =&gt; '#99CCFF')
435
+ bam.plot_coverage("Chr1", 201, 1000, :bin=&gt;250, :svg =&gt; "out3.svg", :fill_color =&gt; '#33AD5C', :stroke =&gt; '#33AD5C')
436
+ </code></pre>
437
+
438
+ <p><img src="http://ethering.github.io/bio-samtools/images/out2.svg" alt="Coverage plot 1" />
439
+ <img src="http://ethering.github.io/bio-samtools/images/out.svg" alt="Coverage plot 2" />
440
+ <img src="http://ethering.github.io/bio-samtools/images/out3.svg" alt="Coverage plot 3" /></p>
441
+
442
+ <h1>VCF methods</h1>
443
+
444
+ <p>For enhanced SNP calling, we've included a VCF class which reflects each non-metadata line of a VCF file.
445
+ The VCF class returns the eight fixed fields present in VCF files, namely chromosome, position, ID, reference base, alt bases, alt quality score, filter and info along with the genotype fields, format and samples. This information allows the comparison of variants and their genotypes across any number of samples.
446
+ The following code takes a number of VCF objects and examines them for homozygous alt (1/1) SNPs:</p>
447
+
448
+ <pre><code>vcfs = []
449
+ vcfs &lt;&lt; vcf1 = Bio::DB::Vcf.new("20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1") #from a 3.3 vcf file
450
+ vcfs &lt;&lt; vcf2 = Bio::DB::Vcf.new("19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0/0:10,10 0/1:3,3") #from a 4.0 vcf file
451
+ vcfs &lt;&lt; vcf3 = Bio::DB::Vcf.new("20 14380 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,") #from a 4.0 vcf file
452
+
453
+ vcfs.each do |vcf|
454
+ vcf.samples.each do |sample|
455
+ genotype = "#{sample[1]['GT']}"
456
+ if genotype == '1/1' or genotype == '1|1'
457
+ print vcf.chrom, " "
458
+ puts vcf.pos
459
+ end
460
+ end
461
+ end
462
+
463
+ =&gt; 20 14370
464
+ =&gt; 20 14380
465
+ </code></pre>
466
+
467
+ <h2>Tests</h2>
468
+
469
+ <p>The easiest way to run the built-in unit tests is to change to the
470
+ bio-samtools source directory and run 'rake test'.</p>
188
471
 
472
+ <p>Each test file tests different aspects of the code.</p>
189
473
  </body>
190
- </html>
474
+ </html>