bio-samtools-wrapper 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (125) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.travis.yml +27 -0
  4. data/Gemfile +20 -0
  5. data/LICENSE.txt +702 -0
  6. data/README.md +501 -0
  7. data/Rakefile +73 -0
  8. data/VERSION +1 -0
  9. data/bin/bam_consensus.rb +85 -0
  10. data/bio-samtools-wrapper.gemspec +181 -0
  11. data/doc/Bio/DB/Alignment.html +552 -0
  12. data/doc/Bio/DB/Pileup.html +711 -0
  13. data/doc/Bio/DB/SAM/Library.html +167 -0
  14. data/doc/Bio/DB/SAM/Tools.html +109 -0
  15. data/doc/Bio/DB/SAM.html +1853 -0
  16. data/doc/Bio/DB/Tag.html +208 -0
  17. data/doc/Bio/DB/Vcf.html +431 -0
  18. data/doc/Bio/DB.html +105 -0
  19. data/doc/Bio.html +175 -0
  20. data/doc/LICENSE_txt.html +846 -0
  21. data/doc/created.rid +9 -0
  22. data/doc/fonts/Lato-Light.ttf +0 -0
  23. data/doc/fonts/Lato-LightItalic.ttf +0 -0
  24. data/doc/fonts/Lato-Regular.ttf +0 -0
  25. data/doc/fonts/Lato-RegularItalic.ttf +0 -0
  26. data/doc/fonts/SourceCodePro-Bold.ttf +0 -0
  27. data/doc/fonts/SourceCodePro-Regular.ttf +0 -0
  28. data/doc/fonts.css +167 -0
  29. data/doc/images/add.png +0 -0
  30. data/doc/images/arrow_up.png +0 -0
  31. data/doc/images/brick.png +0 -0
  32. data/doc/images/brick_link.png +0 -0
  33. data/doc/images/bug.png +0 -0
  34. data/doc/images/bullet_black.png +0 -0
  35. data/doc/images/bullet_toggle_minus.png +0 -0
  36. data/doc/images/bullet_toggle_plus.png +0 -0
  37. data/doc/images/date.png +0 -0
  38. data/doc/images/delete.png +0 -0
  39. data/doc/images/find.png +0 -0
  40. data/doc/images/loadingAnimation.gif +0 -0
  41. data/doc/images/macFFBgHack.png +0 -0
  42. data/doc/images/package.png +0 -0
  43. data/doc/images/page_green.png +0 -0
  44. data/doc/images/page_white_text.png +0 -0
  45. data/doc/images/page_white_width.png +0 -0
  46. data/doc/images/plugin.png +0 -0
  47. data/doc/images/ruby.png +0 -0
  48. data/doc/images/tag_blue.png +0 -0
  49. data/doc/images/tag_green.png +0 -0
  50. data/doc/images/transparent.png +0 -0
  51. data/doc/images/wrench.png +0 -0
  52. data/doc/images/wrench_orange.png +0 -0
  53. data/doc/images/zoom.png +0 -0
  54. data/doc/index.html +106 -0
  55. data/doc/js/darkfish.js +140 -0
  56. data/doc/js/jquery.js +18 -0
  57. data/doc/js/navigation.js +142 -0
  58. data/doc/js/search.js +109 -0
  59. data/doc/js/search_index.js +1 -0
  60. data/doc/js/searcher.js +228 -0
  61. data/doc/rdoc.css +580 -0
  62. data/doc/table_of_contents.html +305 -0
  63. data/ext/Makefile-bioruby.patch +12 -0
  64. data/ext/Makefile-suse.patch +11 -0
  65. data/ext/mkrf_conf.rb +118 -0
  66. data/lib/bio/BIOExtensions.rb +89 -0
  67. data/lib/bio/db/alignment.rb +64 -0
  68. data/lib/bio/db/fastadb.rb +320 -0
  69. data/lib/bio/db/pileup.rb +273 -0
  70. data/lib/bio/db/sam/external/COPYING +21 -0
  71. data/lib/bio/db/sam/external/VERSION +1 -0
  72. data/lib/bio/db/sam/library.rb +32 -0
  73. data/lib/bio/db/sam.rb +778 -0
  74. data/lib/bio/db/vcf.rb +105 -0
  75. data/lib/bio-samtools-wrapper.rb +9 -0
  76. data/test/.gitignore +1 -0
  77. data/test/helper.rb +18 -0
  78. data/test/sample.vcf +24 -0
  79. data/test/samples/.gitignore +1 -0
  80. data/test/samples/LCI/NC_001988.ffn +2 -0
  81. data/test/samples/LCI/test.bam +0 -0
  82. data/test/samples/LCI/test.bam.bai +0 -0
  83. data/test/samples/small/dupes.bam +0 -0
  84. data/test/samples/small/dupes.sam +274 -0
  85. data/test/samples/small/ids2.txt +1 -0
  86. data/test/samples/small/map_for_reheader.sam +8 -0
  87. data/test/samples/small/map_to_merge1.bam +0 -0
  88. data/test/samples/small/map_to_merge1.bam.bai +0 -0
  89. data/test/samples/small/map_to_merge1.sam +8 -0
  90. data/test/samples/small/map_to_merge2.bam +0 -0
  91. data/test/samples/small/map_to_merge2.bam.bai +0 -0
  92. data/test/samples/small/map_to_merge2.sam +8 -0
  93. data/test/samples/small/no_md.sam +8 -0
  94. data/test/samples/small/sorted.bam +0 -0
  95. data/test/samples/small/sorted.bam.bai +0 -0
  96. data/test/samples/small/test.sai +0 -0
  97. data/test/samples/small/test.tam +10 -0
  98. data/test/samples/small/test_chr.fasta +1000 -0
  99. data/test/samples/small/test_chr.fasta.1.bt2 +0 -0
  100. data/test/samples/small/test_chr.fasta.2.bt2 +0 -0
  101. data/test/samples/small/test_chr.fasta.3.bt2 +0 -0
  102. data/test/samples/small/test_chr.fasta.4.bt2 +0 -0
  103. data/test/samples/small/test_chr.fasta.amb +2 -0
  104. data/test/samples/small/test_chr.fasta.ann +3 -0
  105. data/test/samples/small/test_chr.fasta.bwt +0 -0
  106. data/test/samples/small/test_chr.fasta.pac +0 -0
  107. data/test/samples/small/test_chr.fasta.rbwt +0 -0
  108. data/test/samples/small/test_chr.fasta.rev.1.bt2 +0 -0
  109. data/test/samples/small/test_chr.fasta.rev.2.bt2 +0 -0
  110. data/test/samples/small/test_chr.fasta.rpac +0 -0
  111. data/test/samples/small/test_chr.fasta.rsa +0 -0
  112. data/test/samples/small/test_chr.fasta.sa +0 -0
  113. data/test/samples/small/test_cov.svg +273 -0
  114. data/test/samples/small/test_fastadb.fasta +34 -0
  115. data/test/samples/small/testu.bam +0 -0
  116. data/test/samples/small/testu.bed +2 -0
  117. data/test/test_bio-samtools-wrapper.rb +1 -0
  118. data/test/test_fastadb.rb +89 -0
  119. data/test/test_pileup.rb +90 -0
  120. data/test/test_sam.rb +421 -0
  121. data/test/test_vcf.rb +79 -0
  122. data/tutorial/tutorial.html +474 -0
  123. data/tutorial/tutorial.md +424 -0
  124. data/tutorial/tutorial.pdf +0 -0
  125. metadata +254 -0
data/README.md ADDED
@@ -0,0 +1,501 @@
1
+ # bio-samtools
2
+
3
+ The original project samtools-ruby belongs to Ricardo H. Ramirez @ [https://github.com/homonecloco/samtools-ruby](https://github.com/homonecloco/samtools-ruby)
4
+
5
+ ## Introduction
6
+
7
+ Documentation and code come from that project and we'll adapt it for a better integration in BioRuby.
8
+
9
+ Binder of samtools for ruby, on the top of FFI.
10
+
11
+ This project was born from the need to add support of BAM files to
12
+ the [gee_fu genome browser](http://github.com/danmaclean/gee_fu).
13
+
14
+ ## Installation
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ gem 'bio-samtools'
19
+
20
+ And then execute:
21
+
22
+ bundle
23
+
24
+ Or install it yourself as:
25
+
26
+ $ gem install bio-samtools
27
+
28
+ ## Getting started
29
+
30
+ ### Creating a new SAM object
31
+
32
+ A SAM object represents the alignments in the BAM file and is very straightforward to create. You will need a sorted BAM file to access the alignments, and a reference sequence in FASTA format to use the reference sequence. The object can be created and opened as follows:
33
+
34
+ require 'bio-samtools'
35
+
36
+ bam = Bio::DB::Sam.new(:bam=>"my_sorted.bam", :fasta=>'ref.fasta')
37
+ bam.open
38
+
39
+ ### Getting Reference Sequence
40
+
41
+ Retrieving the reference can only be done if the reference has been loaded, which isn't done automatically in order to save memory. The reference needs to be loaded only once, and is accessed using the reference name, start and end in 1-based co-ordinates. A standard Ruby String object is returned.
42
+
43
+ bam.load_reference
44
+ sequence_fragment = bam.fetch_reference("Chr1", 1, 500)
45
+
46
+ ### Getting Alignments
47
+
48
+ Alignments can be obtained one at a time by looping over a specified region using the fetch() function.
49
+
50
+ bam.load_reference
51
+ bam.fetch("1",3000,4000).each do |alignment|
52
+ #do something with the alignment...
53
+ end
54
+
55
+ # Tutorial
56
+ ## Creating a BAM file
57
+ Often, the output from a next-generation sequence alignment tool will be a file in the [SAM format](http://samtools.github.io/hts-specs/SAMv1.pdf).
58
+
59
+ Typically, we'd create a compressed, indexed binary version of the SAM file, which would allow us to operate on it in a quicker and more efficient manner, being able to randomly access various parts of the alignment. We'd use the `view` method to do this. This step would involve taking our sam file, sorting it and indexing it.
60
+
61
+ ```ruby
62
+ #create the sam object
63
+ sam = Bio::DB::Sam.new(:bam => 'my.sam', :fasta => 'ref.fasta')
64
+
65
+ #create a bam file from the sam file
66
+ sam.view(:b=>true, :S=>true, :o=>'bam.bam')
67
+
68
+ #create a new sam object from the bam file
69
+ unsortedBam = Bio::DB::Sam.new(:bam => 'bam.bam', :fasta => 'ref.fasta')
70
+
71
+ #the bam file might not be sorted (necessary for samtools), so sort it
72
+ unsortedBam.sort(:prefix=>'sortedBam')
73
+
74
+ #create a new sam object
75
+ bam = Bio::DB::Sam.new(:bam => 'sortedBam.bam', :fasta => 'ref.fasta')
76
+ #create a new index
77
+ bam.index()
78
+
79
+ #creates index file sortedBam.bam.bai
80
+ ```
81
+
82
+
83
+ Working with BAM files
84
+ ----------------------
85
+
86
+
87
+ ### Creating a new SAM object
88
+
89
+ A SAM object represents the alignments in the BAM file. BAM files (and hence SAM objects here) are what most of SAMtools methods operate on and are very straightforward to create. You will need a sorted and indexed BAM file, to access the alignments and a reference sequence in FASTA format to use the reference sequence. Let's revisit the last few lines of code from the code above.
90
+
91
+ ```ruby
92
+ bam = Bio::DB::Sam.new(:bam => 'sortedBam.bam', :fasta => 'ref.fasta')
93
+ bam.index()
94
+ ```
95
+
96
+ Creating the new Bio::DB::Sam object (named 'bam' in this case) only needs to be done once for multiple operations on it. Access to the alignments is random, so you don't need to loop over the entries in the file.
97
+
98
+ ### Getting Reference Sequence
99
+
100
+ The reference is accessed using reference
101
+ name, start, end in 1-based co-ordinates. A standard Ruby String object is returned.
102
+ ```ruby
103
+ sequence_fragment = bam.fetch_reference("Chr1", 1, 100)
104
+ puts sequence_fragment
105
+ => cctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
106
+ ```
107
+
108
+ A reference sequence can be returned as a Bio::Sequence::NA object by the use of :as_bio => true
109
+ ```ruby
110
+ sequence_fragment = bam.fetch_reference("Chr1", 1, 100, :as_bio => true)
111
+ ```
112
+
113
+ The printed output from this would be a fasta-formatted string
114
+ ```ruby
115
+ puts sequence_fragment
116
+
117
+ => >Chr1:1-100
118
+ => cctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
119
+ ```
120
+
121
+ ### Concatenating BAM files
122
+ BAM files may be concatenated using the `cat` command. The sequence dictionary of each input BAM must be identical, although the `cat` method does not check this.
123
+
124
+ ```ruby
125
+ #create an array of BAM files to cat
126
+ bam_files = [bam1, bam2]
127
+ cat_file = "maps_cated.bam" #the outfile
128
+ #cat the files
129
+ @sam.cat(:out=>cat_file, :bams=>bam_files)
130
+ #create a new Bio::DB::Sam object from the new cat file
131
+ cat_bam = Bio::DB::Sam.new(:fasta => "ref.fasta", :bam => cat_file)
132
+
133
+ ```
134
+
135
+ ### Removing duplicate reads
136
+ The `remove_duplicates` method removes potential PCR duplicates: if multiple read pairs have identical external coordinates, it only retains the pair with the highest mapping quality. It does not work for unpaired reads (e.g. two ends mapped to different chromosomes or orphan reads).
137
+ ```ruby
138
+
139
+ unduped = "dupes_rmdup.bam" #an outfile for the removed duplicates bam
140
+ #remove single-end duplicates
141
+ bam.remove_duplicates(:s=>true, :out=>unduped)
142
+ #create new Bio::DB::Sam object
143
+ unduped_bam = Bio::DB::Sam.new(:fasta => "ref.fasta", :bam => unduped)
144
+
145
+ ```
146
+
147
+ ### Alignment Objects
148
+
149
+ The individual alignments represent a single read and are returned as
150
+ Bio::DB::Alignment objects. These have numerous methods of their own,
151
+ using `require 'pp'` will allow you to check the attributes contained in
152
+ each object. Here is an example alignment object. Remember `@`
153
+ represents a Ruby instance variable and can be accessed as any other
154
+ method. Thus the `@is_mapped` attribute of an object `a` is accessed
155
+ `a.is_mapped`
156
+
157
+ ```ruby
158
+ require 'pp'
159
+ pp an_alignment_object ##some Bio::DB::Alignment object
160
+ #<Bio::DB::Alignment:0x101113f80
161
+ @al=#<Bio::DB::SAM::Tools::Bam1T:0x101116a50>,
162
+ @calend=4067,
163
+ @cigar="76M",
164
+ @failed_quality=false,
165
+ @first_in_pair=false,
166
+ @flag=163,
167
+ @is_duplicate=false,
168
+ @is_mapped=true,
169
+ @is_paired=true,
170
+ @isize=180,
171
+ @mapq=60,
172
+ @mate_strand=false,
173
+ @mate_unmapped=false,
174
+ @mpos=4096,
175
+ @mrnm="=",
176
+ @pos=3992,
177
+ @primary=true,
178
+ @qlen=76,
179
+ @qname="HWI-EAS396_0001:7:115:17904:15958#0",
180
+ @qual="IIIIIIIIIIIIHHIHGIHIDGGGG...",
181
+ @query_strand=true,
182
+ @query_unmapped=false,
183
+ @rname="1",
184
+ @second_in_pair=true,
185
+ @seq="ACAGTCCAGTCAAAGTACAAATCGAG...",
186
+ @tags=
187
+ {"MD"=>#<Bio::DB::Tag:0x101114ed0 @tag="MD", @type="Z", @value="76">,
188
+ "XO"=>#<Bio::DB::Tag:0x1011155d8 @tag="XO", @type="i", @value="0">,
189
+ "AM"=>#<Bio::DB::Tag:0x101116280 @tag="AM", @type="i", @value="37">,
190
+ "X0"=>#<Bio::DB::Tag:0x101115fb0 @tag="X0", @type="i", @value="1">,
191
+ "X1"=>#<Bio::DB::Tag:0x101115c68 @tag="X1", @type="i", @value="0">,
192
+ "XG"=>#<Bio::DB::Tag:0x101115240 @tag="XG", @type="i", @value="0">,
193
+ "SM"=>#<Bio::DB::Tag:0x1011162f8 @tag="SM", @type="i", @value="37">,
194
+ "XT"=>#<Bio::DB::Tag:0x1011162a8 @tag="XT", @type="A", @value="U">,
195
+ "NM"=>#<Bio::DB::Tag:0x101116348 @tag="NM", @type="i", @value="0">,
196
+ "XM"=>#<Bio::DB::Tag:0x101115948 @tag="XM", @type="i", @value="0">}>
197
+ ```
198
+
199
+ ### Getting Alignments
200
+
201
+ Alignments can be obtained one at a time by looping over a specified region using the `fetch()` function.
202
+
203
+ ```ruby
204
+ bam.fetch("Chr1",3000,4000).each do |alignment|
205
+ #do something with the alignment...
206
+ end
207
+ ```
208
+
209
+ A separate method `fetch_with_function()` allows you to pass a block (or
210
+ a Proc object) to the function for efficient calculation. This example takes
211
+ an alignment object and returns an array of sequences which exactly match the reference.
212
+
213
+ ```ruby
214
+ #an array to hold the matching sequences
215
+ exact_matches = []
216
+
217
+ matches = Proc.new do |a|
218
+ #get the length of each read
219
+ len = a.seq.length
220
+ #get the cigar string
221
+ cigar = a.cigar
222
+ #create a cigar string which represents a full-length match
223
+ cstr = len.to_s << "M"
224
+ if cigar == cstr
225
+ #add the current sequence to the array if it qualifies
226
+ exact_matches << a.seq
227
+ end
228
+ end
229
+
230
+ bam.fetch_with_function("Chr1", 100, 500, &matches)
231
+
232
+ puts exact_matches
233
+ ```
234
+
235
+ ### Alignment stats
236
+
237
+ The SAMtools flagstat method is implemented in bio-samtools to quickly examine the number of reads mapped to the reference. This includes the number of paired and singleton reads mapped and also the number of paired-reads that map to different chromosomes/contigs.
238
+
239
+ ```ruby
240
+ bam.flag_stats()
241
+ ```
242
+
243
+ An example output would be
244
+ ```ruby
245
+ 34672 + 0 in total (QC-passed reads + QC-failed reads)
246
+ 0 + 0 duplicates
247
+ 33196 + 0 mapped (95.74%:nan%)
248
+ 34672 + 0 paired in sequencing
249
+ 17335 + 0 read1
250
+ 17337 + 0 read2
251
+ 31392 + 0 properly paired (90.54%:nan%)
252
+ 31728 + 0 with itself and mate mapped
253
+ 1468 + 0 singletons (4.23%:nan%)
254
+ 0 + 0 with mate mapped to a different chr
255
+ 0 + 0 with mate mapped to a different chr (mapQ>=5)
256
+ ```
257
+
258
+ Getting Coverage Information
259
+ ----------------------------
260
+
261
+
262
+ ### Per Base Coverage
263
+
264
+ It is easy to get the total depth of reads at a given position, the
265
+ `chromosome_coverage` function is used. This differs from the previous
266
+ functions in that a start position and length (rather than end position)
267
+ are passed to the function. An array of coverages is returned, the first
268
+ position in the array gives the depth of coverage at the given start
269
+ position in the genome, the last position in the array gives the depth
270
+ of coverage at the given start position plus the length given
271
+
272
+ ```ruby
273
+ coverages = bam.chromosome_coverage("Chr1", 3000, 1000) #=> [16,16,25,25...]
274
+ ```
275
+
276
+ ### Average Coverage In A Region
277
+
278
+ Similarly, average (arithmetic mean) of coverage can be retrieved with the `average_coverage` method.
279
+
280
+ ```ruby
281
+ coverages = bam.average_coverage("Chr1", 3000, 1000) #=> 20.287
282
+ ```
283
+
284
+ ### Coverage from a BED file
285
+ It is possible to count the number of nucleotides mapped to a given region of a BAM file by providing a [BED formatted](http://genome.ucsc.edu/FAQ/FAQformat.html#format1) file and using the `bedcov` method. The output is the BED file with an extra column providing the number of nucleotides mapped to that region.
286
+
287
+ ```ruby
288
+ bed_file = "test.bed"
289
+ bam.bedcov(:bed=>bed_file)
290
+
291
+ => chr_1 1 30 6
292
+ => chr_1 40 45 8
293
+
294
+ ```
295
+ Alternatively, the `depth` method can be used to get per-position depth information (any unmapped positions will be ignored).
296
+ ```ruby
297
+ bed_file = "test.bed"
298
+ @sam.depth(:b=>bed_file)
299
+
300
+ => chr_1 25 1
301
+ => chr_1 26 1
302
+ => chr_1 27 1
303
+ => chr_1 28 1
304
+ => chr_1 29 1
305
+ => chr_1 30 1
306
+ => chr_1 41 1
307
+ => chr_1 42 1
308
+ => chr_1 43 2
309
+ => chr_1 44 2
310
+ => chr_1 45 2
311
+ ```
312
+ ## Getting Pileup Information
313
+
314
+ Pileup format represents the coverage of reads over a single base in the
315
+ reference. Getting a Pileup over a region is very easy. Note that this
316
+ is done with `mpileup` and NOT the now deprecated SAMtools `pileup`
317
+ function. Calling the `mpileup` method creates an iterator that yields a
318
+ Pileup object for each base.
319
+
320
+ ```ruby
321
+ bam.mpileup do |pileup|
322
+ puts pileup.consensus #gives the consensus base from the reads for that position
323
+ end
324
+ ```
325
+
326
+ ### Caching pileups
327
+ A pileup can be cached, so if you want to execute several operations on the same set of regions, mpileup won't be executed several times. Whenever you finish using a region, call mpileup_clear_cache to free the cache. The argument 'Region' is required, as it will be the key for the underlying hash. We assume that the options (other than the region) are constant. If they are not, the cache mechanism may not be consistent.
328
+
329
+ ```ruby
330
+ #create an mpileup
331
+ reg = Bio::DB::Fasta::Region.new
332
+ reg.entry = "Chr1"
333
+ reg.start = 1
334
+ reg.end = 334
335
+
336
+ bam.mpileup_cached(:r=>reg,:g => false, :min_cov => 1, :min_per =>0.2) do |pileup|
337
+ puts pileup.consensus
338
+ end
339
+ bam.mpileup_clear_cache(reg)
340
+ ```
341
+
342
+
343
+ #### Pileup options
344
+
345
+ The `mpileup` function takes a range of parameters to allow SAMtools
346
+ level filtering of reads and alignments. They are specified as key =\>
347
+ value pairs eg
348
+
349
+ ```ruby
350
+ bam.mpileup(:r => "Chr1:1000-2000", :Q => 50) do |pileup|
351
+ ##only pileups on Chr1 between positions 1000-2000 are considered,
352
+ ##bases with Quality Score < 50 are excluded
353
+ ...
354
+ end
355
+ ```
356
+
357
+ Not all the options SAMtools allows you to pass to mpileup will return a
358
+ Pileup object. The table below lists the SAMtools flags supported and the symbols you can use to call them in
359
+ the mpileup command.
360
+
361
+ <table><tr><th>SAMtools options</th><th>description</th><th>short symbol</th><th>long symbol</th><th>default</th><th>example</th></tr>
362
+ <tr><td>r</td><td>limit retrieval to a region</td><td>:r</td><td>:region</td><td>all positions</td><td>:r => "Chr1:1000-2000"</td></tr>
363
+ <tr><td>6</td><td>assume Illumina scaled quality scores</td><td>:six</td><td>:illumina_quals</td><td>false</td><td>:six => true</td></tr>
364
+ <tr><td>A</td><td>count anomalous read pairs scores</td><td>:A</td><td>:count_anomalous</td><td>false</td><td>:A => true</td></tr>
365
+ <tr><td>B</td><td>disable BAQ computation</td><td>:B</td><td>:no_baq</td><td>false</td><td>:no_baq => true</td></tr>
366
+ <tr><td>C</td><td>parameter for adjusting mapQ</td><td>:C</td><td>:adjust_mapq</td><td>0</td><td>:C => 25</td></tr>
367
+ <tr><td>d</td><td>max per-BAM depth to avoid excessive memory usage</td><td>:d</td><td>:max_per_bam_depth</td><td>250</td><td>:d => 123</td></tr>
368
+ <tr><td>E</td><td>extended BAQ for higher sensitivity but lower specificity</td><td>:E</td><td>:extended_baq</td><td>false</td><td>:E => true</td></tr>
369
+ <tr><td>G</td><td>exclude read groups listed in FILE</td><td>:G</td><td>:exclude_reads_file</td><td>false</td><td>:G => my_file.txt</td></tr>
370
+ <tr><td>l</td><td>list of positions (chr pos) or regions (BED)</td><td>:l</td><td>:list_of_positions</td><td>false</td><td>:l => my_posns.bed</td></tr>
371
+ <tr><td>M</td><td>cap mapping quality at value</td><td>:M</td><td>:mapping_quality_cap</td><td>60</td><td>:M => 40 </td></tr>
372
+ <tr><td>R</td><td>ignore RG tags</td><td>:R</td><td>:ignore_rg</td><td>false</td><td>:R => true </td></tr>
373
+ <tr><td>q</td><td>skip alignments with mapping quality smaller than value</td><td>:q</td><td>:min_mapping_quality</td><td>0</td><td>:q => 30 </td></tr>
374
+ <tr><td>Q</td><td>skip bases with base quality smaller than value</td><td>:Q</td><td>:imin_base_quality</td><td>13</td><td>:Q => 30</td></tr>
375
+ </table>
376
+
377
+
378
+ ## Coverage Plots
379
+ You can create images that represent read coverage over binned regions of the reference sequence. The output format is svg. A number of parameters can be changed to alter the style of the plot. In the examples below the bin size and fill_color have been used to create plots with different colours and bar widths.
380
+
381
+ The following lines of code...
382
+
383
+ ```ruby
384
+ bam.plot_coverage("Chr1", 201, 2000, :bin=>20, :svg => "out2.svg", :fill_color => '#F1A1B1')
385
+ bam.plot_coverage("Chr1", 201, 2000, :bin=>50, :svg => "out.svg", :fill_color => '#99CCFF')
386
+ bam.plot_coverage("Chr1", 201, 1000, :bin=>250, :svg => "out3.svg", :fill_color => '#33AD5C', :stroke => '#33AD5C')
387
+ ```
388
+
389
+ ![Coverage plot 1](http://ethering.github.io/bio-samtools/images/out2.svg)
390
+ ![Coverage plot 2](http://ethering.github.io/bio-samtools/images/out.svg)
391
+ ![Coverage plot 3](http://ethering.github.io/bio-samtools/images/out3.svg)
392
+
393
+ The `plot_coverage` method will also return the raw svg code, for further use. Simply leave out a file name and assign the method to a variable.
394
+
395
+ ```ruby
396
+ svg = bam.plot_coverage("Chr1", 201, 2000, :bin=>50, :fill_color => '#99CCFF')
397
+
398
+ ```
399
+
400
+
401
+ # VCF methods
402
+ For enhanced snp calling, we've included a VCF class which reflects each non-metadata line of a VCF file.
403
+ The VCF class returns the eight fixed fields present in VCF files, namely chromosome, position, ID, reference base, alt bases, alt quality score, filter and info along with the genotype fields, format and samples. This information allows the comparison of variants and their genotypes across any number of samples.
404
+ The following code takes a number of VCF objects and examines them for homozygous alt (1/1) SNPs
405
+
406
+ ```ruby
407
+ vcfs = []
408
+ vcfs << vcf1 = Bio::DB::Vcf.new("20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1") #from a 3.3 vcf file
409
+ vcfs << vcf2 = Bio::DB::Vcf.new("19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0/0:10,10 0/1:3,3") #from a 4.0 vcf file
410
+ vcfs << vcf3 = Bio::DB::Vcf.new("20 14380 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,") #from a 4.0 vcf file
411
+
412
+ vcfs.each do |vcf|
413
+ vcf.samples.each do |sample|
414
+ genotype = sample[1]['GT']
415
+ if genotype == '1/1' or genotype == '1|1'
416
+ print vcf.chrom, " "
417
+ puts vcf.pos
418
+ end
419
+ end
420
+ end
421
+
422
+ => 20 14370
423
+ => 20 14380
424
+ ```
425
+
426
+ ## Other methods not covered
427
+ The SAMtools methods faidx, fixmate, tview, reheader, calmd, targetcut and phase are all included in the current bio-samtools release.
428
+
429
+ Tests
430
+ -----
431
+
432
+ The easiest way to run the built-in unit tests is to change to the
433
+ bio-samtools source directory and run `rake test`.
434
+
435
+ Each test file tests different aspects of the code.
436
+
437
+
438
+
439
+ ## Dependencies
440
+
441
+ * BioRuby >= 1.5 [https://github.com/bioruby/bioruby](https://github.com/bioruby/bioruby)
442
+ * Ruby 2.1.10 and above.
443
+
444
+
445
+ ## FAQ
446
+ * I want to use Ruby 1.x, what can I do?
447
+
448
+ We try to ensure backwards compatibility with old rubies. However, we only officially support current versions of [Ruby](https://www.ruby-lang.org/en/downloads/). The code should work; however, the testing suites used in earlier versions are not currently supported and don't work in modern rubies. This decision ensures compatibility with maintained versions of Ruby.
449
+
450
+
451
+ ## Contributing to bio-samtools
452
+
453
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
454
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
455
+ * Fork the project
456
+ * Start a feature/bugfix branch
457
+ * Commit and push until you are happy with your contribution
458
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
459
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
460
+
461
+ ### TODO
462
+ 1. Filter to the fetching algorithm (give a condition that has to be satisfied to add the alignment to the list)
463
+
464
+ ### To whom do I complain?
465
+ Try [Ricardo.Ramirez-Gonzalez@jic.ac.uk](mailto:Ricardo.Ramirez-Gonzalez@jic.ac.uk)
466
+ and [dan.maclean@tsl.ac.uk](mailto:dan.maclean@tsl.ac.uk)
467
+
468
+ ### Important Notes
469
+ * samtools is downloaded, compiled and installed inside the gem at install time on the host system
470
+
471
+ * If you use this tool for publication, please cite http://dx.doi.org/10.1186/1751-0473-7-6
472
+
473
+ ### Important Notes for developers
474
+
475
+ Remember that you must compile and install samtools for your host system. In order to do that there are two possible solutions:
476
+
477
+ * download, compile and install the library in bioruby-samtools-your_clone/lib/bio/db/sam/external/samtools and
478
+ bioruby-samtools-your_clone/lib/bio/db/sam/external/bcftools by yourself
479
+ * in your bioruby-samtools-your_clone create the Rakefile typing `cd ext; ruby mkrf_conf.rb; rake -f Rakefile`
480
+
481
+ The latter is, I think, the easiest way, because you are replicating the automatic process.
482
+
483
+ For testing just run `rake test`. Tests must be improved.
484
+
485
+ #### Travis integration
486
+ If you are integrating this library into another tool and testing it with travis, add the following to `.travis.yml`:
487
+
488
+ ```yml
489
+ addons:
490
+ apt:
491
+ packages:
492
+ - zlib1g-dev
493
+ - libncurses5-dev
494
+ - libtinfo-dev
495
+ ```
496
+
497
+ ## Copyright
498
+
499
+ Copyright (c) 2011 Raoul J.P. Bonnal. See LICENSE.txt for
500
+ further details.
501
+
data/Rakefile ADDED
@@ -0,0 +1,73 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+
4
+
5
+ begin # Activate the :default and :development Bundler groups; report the error and exit if that fails.
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code # Terminate using the status code carried by the Bundler error.
11
+ end
12
+ require 'rake'
13
+
14
+
15
+ if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0") # Ruby 2.0, 2.1 and 2.2 load jeweler; every other version loads juwelier.
16
+ require 'jeweler'
17
+ @taskClass = Jeweler # Namespace whose Tasks / RubygemsDotOrgTasks classes are instantiated further down.
18
+ else
19
+ require 'juwelier'
20
+ @taskClass = Juwelier # Same role as above, taken from the juwelier gem.
21
+ end
22
+
23
+
24
+ # Gem packaging tasks, created through whichever of Jeweler / Juwelier is held in @taskClass.
25
+
26
+ @taskClass::Tasks.new do |gem|
27
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
28
+ gem.name = "bio-samtools-wrapper"
29
+ gem.homepage = "https://github.com/cb2e6f/bio-samtools-wrapper"
30
+ gem.license = "GPL-3.0"
31
+ gem.summary = %Q{wrapper of samtools for ruby.}
32
+ gem.description = %Q{wrapper of samtools for ruby.
33
+
34
+ This project was born from the need to add support of BAM files to
35
+ the gee_fu genome browser (http://github.com/danmaclean/gee_fu).}
36
+ gem.email = "rob.ellis@jic.ac.uk"
37
+ gem.authors = ["Rob Ellis"]
38
+ # Include your dependencies below. Runtime dependencies are required when using your gem,
39
+ # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
40
+ # gem.add_runtime_dependency 'jabber4r', '> 0.1'
41
+ # gem.add_development_dependency 'rspec', '> 1.2.3'
42
+ gem.extensions = "ext/mkrf_conf.rb" # Extension script registered with the gem specification.
43
+ end
44
+ @taskClass::RubygemsDotOrgTasks.new # Instantiates the task class's RubygemsDotOrgTasks (presumably the rubygems.org release tasks -- confirm against the jeweler/juwelier docs).
45
+
46
+ require 'rake/testtask'
47
+ Rake::TestTask.new(:test) do |test| # Defines `rake test`.
48
+ test.libs << 'lib' << 'test' # Both directories are appended to the test task's load path.
49
+ test.pattern = 'test/**/test_*.rb' # Every test_*.rb file anywhere under test/ is run.
50
+ test.verbose = true
51
+ end
52
+
53
+
54
+ if RUBY_VERSION.start_with?("1.8") # The rcov task is only defined when running on Ruby 1.8.x.
55
+ require 'rcov/rcovtask'
56
+ Rcov::RcovTask.new do |test|
57
+ test.libs << 'test'
58
+ test.pattern = 'test/**/test_*.rb' # Same file pattern as the :test task.
59
+ test.verbose = true
60
+ end
61
+ end
62
+
63
+ task :default => :test # A bare `rake` runs the test task.
64
+
65
+ require 'rdoc/task'
66
+ RDoc::Task.new do |rdoc| # Documentation task; output goes to the rdoc/ directory.
67
+ version = File.exist?('VERSION') ? File.read('VERSION') : "" # File contents are used verbatim (not stripped); empty string when the file is absent.
68
+
69
+ rdoc.rdoc_dir = 'rdoc'
70
+ rdoc.title = "bio-samtools-wrapper #{version}"
71
+ rdoc.rdoc_files.include('README*') # README files plus every Ruby file under lib/ are documented.
72
+ rdoc.rdoc_files.include('lib/**/*.rb')
73
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 2.7.0
@@ -0,0 +1,85 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+
5
+ require 'optparse'
6
+ require 'set'
7
+
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-samtools-wrapper.rb')
11
+ require path
12
+
13
+
14
+ def log(msg) # Writes "<timestamp>: <msg>" to standard output via puts.
15
+ time=Time.now.strftime("%Y-%m-%d %H:%M:%S.%L") # Current local time with millisecond precision (%L).
16
+ puts "#{time}: #{msg}"
17
+ end
18
+
19
+
20
+ options = {}
21
+ options[:min_cov] = 5
22
+ options[:min_percentage] = 0.5
23
+ options[:output_file] = "-"
24
+ OptionParser.new do |opts|
25
+
26
+ opts.banner = "Usage: bam_consensus.rb [options]"
27
+
28
+ opts.on("-b", "--bam_file FILE", "BAM File to call for the consensus") do |o|
29
+ options[:bam] = o
30
+ end
31
+ opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
32
+ options[:reference] = o
33
+ end
34
+ opts.on("-p", "--min_percentage FLOAT", "Minimum percentage to call for a base. When more than one base gets the percentage, an ambiguty code is produced") do |o|
35
+ options[:min_percentage] = o.to_f / 100
36
+ end
37
+
38
+ opts.on("-m", "--min_cov INT", "Minimum percentage to call for a base. When more than one base gets the percentage, an ambiguty code is produced") do |o|
39
+ options[:min_cov] = o.to_i
40
+ end
41
+
42
+ opts.on("-f", "--filter_entries FILE", "File with a list of entries to process") do |o|
43
+ options[:filter_entries] = o
44
+ end
45
+
46
+ opts.on("-o" "--output_file FILE", "Output of the program, in fasta format") do |o|
47
+ options[:output_file] = o
48
+ end
49
+
50
+ end.parse!
51
+
52
+ bam = @parental_1_sam = Bio::DB::Sam.new({:fasta=>options[:reference], :bam=>options[:bam]}) # Sam object built from the -r and -b arguments; @parental_1_sam is not read anywhere else in this script.
53
+ region_set = nil # Stays nil when no filter file is given, in which case every entry is processed.
54
+ if options[:filter_entries]
55
+ region_set = Set.new
56
+ File.foreach(options[:filter_entries]) do |line| # One entry name per line; the trailing newline is removed.
57
+ region_set << line.chomp
58
+ end
59
+ end
60
+
61
+ fasta_db = Bio::DB::Fasta::FastaFile.new(:fasta=> options[:reference])
62
+ fasta_db.load_fai_entries # Presumably reads the .fai index that backs fasta_db.index below -- confirm against FastaFile.
63
+
64
+ output = $stdout
65
+
66
+ output = File.open(options[:output_file], "w") if options[:output_file] != "-" # Any value other than "-" is treated as a file path to (over)write.
67
+
68
+ fasta_db.index.entries.each do | r |
69
+ process = true
70
+ region=r.get_full_region
71
+
72
+ process = region_set.include? region.entry if region_set # With a filter, only entries named in it are processed.
73
+ if process
74
+ reg = bam.fetch_region({:region=>region, :min_cov=>options[:min_cov],:min_per=>options[:min_percentage], :A=>1})
75
+ cons = reg.consensus
76
+ org = fasta_db.fetch_sequence(region)
77
+ if cons.upcase != org.upcase # Only entries whose consensus differs from the reference (ignoring case) are written.
78
+ output.puts ">#{region.entry}"
79
+ tmp = cons.scan /.{1,80}/ # Split the consensus into chunks of at most 80 characters.
80
+ output.puts tmp.join("\n")
81
+ end
82
+ end
83
+ end
84
+
85
+ output.close if options[:output_file] != "-" # Close only the file opened above; $stdout is left open.