bio-sambamba 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +11 -0
- data/LICENSE.txt +20 -0
- data/README.md +68 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/features/iterate-alignments.feature +40 -0
- data/features/random-access.feature +10 -0
- data/features/sam-header.feature +23 -0
- data/features/step_definitions/iterate-alignments_steps.rb +83 -0
- data/features/step_definitions/random-access_steps.rb +22 -0
- data/features/step_definitions/sam-header_steps.rb +56 -0
- data/features/step_definitions/validation-steps.rb +34 -0
- data/features/support/env.rb +13 -0
- data/features/syntax-sugar.feature +17 -0
- data/features/validation.feature +16 -0
- data/lib/bio-sambamba.rb +8 -0
- data/lib/bio-sambamba/alignment.rb +131 -0
- data/lib/bio-sambamba/alignmentiterator.rb +45 -0
- data/lib/bio-sambamba/bamfile.rb +45 -0
- data/lib/bio-sambamba/samfile.rb +25 -0
- data/lib/bio-sambamba/samheader.rb +194 -0
- data/test/data/bins.bam +0 -0
- data/test/data/bins.bam.bai +0 -0
- data/test/data/c1215_fixmate.bam +0 -0
- data/test/data/corrupted_zlib_archive.bam +0 -0
- data/test/data/duplicated_block_size.bam +0 -0
- data/test/data/ex1_header.bam +0 -0
- data/test/data/ex1_header.bam.bai +0 -0
- data/test/data/ex1_header.sam +3273 -0
- data/test/data/ex1_header.uncompressed.bam +0 -0
- data/test/data/no_block_size.bam +0 -0
- data/test/data/tags.bam +0 -0
- data/test/data/tags.bam.bai +0 -0
- data/test/data/wrong_bc_subfield_length.bam +0 -0
- data/test/data/wrong_extra_gzip_length.bam +0 -0
- metadata +184 -0
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
- rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Artem Tarasov
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# bio-sambamba
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/lomereiter/bioruby-sambamba.png)](http://travis-ci.org/lomereiter/bioruby-sambamba)
|
4
|
+
|
5
|
+
Full description goes here
|
6
|
+
|
7
|
+
Note: this software is under active development!
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```sh
|
12
|
+
rake build
|
13
|
+
rake install
|
14
|
+
```
|
15
|
+
|
16
|
+
In order to use the gem, you also need the <code>sambamba</code> tool installed.
|
17
|
+
For that, do the following:
|
18
|
+
|
19
|
+
* install [DMD compiler](http://dlang.org/download.html)
|
20
|
+
* install [Ragel](http://www.complang.org/ragel/) finite state machine compiler
|
21
|
+
* clone the sambamba repository and compile the tool
|
22
|
+
|
23
|
+
```sh
|
24
|
+
git clone https://github.com/lomereiter/sambamba.git
|
25
|
+
cd sambamba/CLItools/
|
26
|
+
make
|
27
|
+
```
|
28
|
+
|
29
|
+
* place the executable file <code>build/sambamba</code> somewhere in your $PATH,
|
30
|
+
for example:
|
31
|
+
|
32
|
+
```sh
|
33
|
+
cp build/sambamba /usr/local/bin
|
34
|
+
```
|
35
|
+
|
36
|
+
## Usage
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'bio-sambamba'
|
40
|
+
```
|
41
|
+
|
42
|
+
The API doc is online. For more code examples see the test files in
|
43
|
+
the source tree.
|
44
|
+
|
45
|
+
## Project home page
|
46
|
+
|
47
|
+
For information on the source tree, documentation, examples, issues and
|
48
|
+
how to contribute, see
|
49
|
+
|
50
|
+
http://github.com/lomereiter/bioruby-sambamba
|
51
|
+
|
52
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
53
|
+
|
54
|
+
## Cite
|
55
|
+
|
56
|
+
If you use this software, please cite one of
|
57
|
+
|
58
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
59
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
60
|
+
|
61
|
+
## Biogems.info
|
62
|
+
|
63
|
+
This Biogem is published at [#bio-sambamba](http://biogems.info/index.html)
|
64
|
+
|
65
|
+
## Copyright
|
66
|
+
|
67
|
+
Copyright (c) 2012 Artem Tarasov. See LICENSE.txt for further details.
|
68
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "bio-sambamba"
|
18
|
+
gem.homepage = "http://github.com/lomereiter/bioruby-sambamba"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Ruby wrapper for Sambamba tool}
|
21
|
+
gem.description = %Q{New Sambamba library comes with a command-line tool for working with SAM/BAM files. This gem brings some of its functionality to Ruby.}
|
22
|
+
gem.email = "lomereiter@gmail.com"
|
23
|
+
gem.authors = ["Artem Tarasov"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
|
26
|
+
gem.files.include "lib/bio-sambamba/*.rb"
|
27
|
+
gem.files.include "lib/bio-sambamba.rb"
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
30
|
+
|
31
|
+
require 'cucumber/rake/task'
|
32
|
+
Cucumber::Rake::Task.new do |features|
|
33
|
+
end
|
34
|
+
|
35
|
+
task :test => :cucumber
|
36
|
+
|
37
|
+
task :default => :test
|
38
|
+
|
39
|
+
require 'rdoc/task'
|
40
|
+
Rake::RDocTask.new do |rdoc|
|
41
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
42
|
+
|
43
|
+
rdoc.rdoc_dir = 'rdoc'
|
44
|
+
rdoc.title = "bio-sambamba #{version}"
|
45
|
+
rdoc.rdoc_files.include('README*')
|
46
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
47
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: iterating alignment records
|
2
|
+
|
3
|
+
In order to have access to all information contained in a BAM file,
|
4
|
+
As a bioinformatician,
|
5
|
+
I want to be able to iterate alignment records from Ruby
|
6
|
+
And have access to all their fields and tags.
|
7
|
+
|
8
|
+
Scenario: accessing alignment records
|
9
|
+
Given I opened a valid BAM file
|
10
|
+
When I use its 'alignments' method
|
11
|
+
Then I should be able to iterate the returned object with 'each'
|
12
|
+
And the objects which I iterate over should represent the alignments
|
13
|
+
And I should be able to access all fields mentioned in SAM/BAM format specification
|
14
|
+
|
15
|
+
Scenario: access existing alignment tag
|
16
|
+
Given I have an alignment
|
17
|
+
And it contains some tags
|
18
|
+
When I access it like a hash
|
19
|
+
And I use 2-character string as a key
|
20
|
+
And the alignment has such tag
|
21
|
+
Then I should be able to see corresponding value
|
22
|
+
And it should be a simple Ruby object (Array, Numeric, or String)
|
23
|
+
|
24
|
+
Scenario: invalid tag key (not of length 2)
|
25
|
+
Given I have an alignment
|
26
|
+
When I access it like a hash
|
27
|
+
But I use string of length different than two, as a key,
|
28
|
+
Then exception should be thrown.
|
29
|
+
|
30
|
+
Scenario: accessing non-existing alignment tag
|
31
|
+
Given I have an alignment
|
32
|
+
And it contains some tags
|
33
|
+
When I access it like a hash
|
34
|
+
But it doesn't contain the requested tag
|
35
|
+
Then nil should be returned.
|
36
|
+
|
37
|
+
Scenario: fetching all tags as a hash
|
38
|
+
Given I have an alignment
|
39
|
+
When I use its 'tags' method
|
40
|
+
Then I should be able to work with the returned object just like with Hash
|
@@ -0,0 +1,10 @@
|
|
1
|
+
Feature: random access to BAM file
|
2
|
+
In order to retrieve information about specific regions,
|
3
|
+
I want to be able to quickly fetch alignments overlapping a region.
|
4
|
+
|
5
|
+
Scenario: fetching alignments
|
6
|
+
Given I have a BAM file
|
7
|
+
And it's sorted by coordinate
|
8
|
+
And I have its index as well
|
9
|
+
When I specify reference sequence and region (1-based beginning and end positions)
|
10
|
+
Then I should be able to immediately have access to alignments overlapping it
|
@@ -0,0 +1,23 @@
|
|
1
|
+
Feature: access to information from SAM header
|
2
|
+
|
3
|
+
In order to work with BAM file,
|
4
|
+
I want to see what its header contains.
|
5
|
+
|
6
|
+
Background:
|
7
|
+
Given I opened a valid BAM file
|
8
|
+
And it contains SAM header
|
9
|
+
|
10
|
+
Scenario: getting raw text
|
11
|
+
When I call 'header' method
|
12
|
+
Then I should see text of SAM header
|
13
|
+
|
14
|
+
Scenario: accessing version and sorting order
|
15
|
+
When SAM header contains @HD line
|
16
|
+
Then I should be able to see format version
|
17
|
+
And I should be able to see sorting order
|
18
|
+
|
19
|
+
Scenario: getting information about reference sequences
|
20
|
+
When SAM header contains @SQ lines
|
21
|
+
Then I should be able to iterate them
|
22
|
+
And I should be able to see sequence names
|
23
|
+
And I should be able to see their lengths
|
@@ -0,0 +1,83 @@
|
|
1
|
+
Before do
|
2
|
+
@bam = Bio::Bam::File.new 'test/data/ex1_header.bam'
|
3
|
+
end
|
4
|
+
|
5
|
+
When /^I use its 'alignments' method$/ do
|
6
|
+
@bam.should respond_to(:alignments)
|
7
|
+
end
|
8
|
+
|
9
|
+
Then /^I should be able to iterate the returned object with 'each'$/ do
|
10
|
+
@bam.alignments.should respond_to(:each)
|
11
|
+
end
|
12
|
+
|
13
|
+
Then /^the objects which I iterate over should represent the alignments$/ do
|
14
|
+
@bam.alignments.take(100).each do |read|
|
15
|
+
read.should be_instance_of(Bio::Bam::Alignment)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
Then /^I should be able to access all fields mentioned in SAM\/BAM format specification$/ do
|
20
|
+
@read = @bam.alignments.first
|
21
|
+
@read.read_name.should == 'EAS56_57:6:190:289:82'
|
22
|
+
@read.sequence.should == 'CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA'
|
23
|
+
@read.position.should == 100
|
24
|
+
@read.flag.should == 69
|
25
|
+
@read.mapping_quality.should == 0
|
26
|
+
@read.cigar_string.should == '*'
|
27
|
+
@read.reference.should == 'chr1'
|
28
|
+
@read.quality.should == [27, 27, 27, 22, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 23, 26, 26, 27, 22, 26, 19, 27, 26, 27, 26, 26, 26, 26, 26, 24, 19, 27, 26]
|
29
|
+
end
|
30
|
+
|
31
|
+
Given /^I have an alignment$/ do
|
32
|
+
@read = @bam.alignments.first
|
33
|
+
end
|
34
|
+
|
35
|
+
Given /^it contains some tags$/ do
|
36
|
+
end
|
37
|
+
|
38
|
+
When /^I access it like a hash$/ do
|
39
|
+
@read.should respond_to(:[])
|
40
|
+
end
|
41
|
+
|
42
|
+
When /^I use 2-character string as a key$/ do
|
43
|
+
@key = 'MF'
|
44
|
+
end
|
45
|
+
|
46
|
+
When /^the alignment has such tag$/ do
|
47
|
+
@read[@key].should_not be_nil
|
48
|
+
end
|
49
|
+
|
50
|
+
Then /^I should be able to see corresponding value$/ do
|
51
|
+
@read[@key].should be == 192
|
52
|
+
end
|
53
|
+
|
54
|
+
Then /^it should be a simple Ruby object \(Array, Numeric, or String\)$/ do
|
55
|
+
@read[@key].should be_kind_of Numeric
|
56
|
+
end
|
57
|
+
|
58
|
+
When /^I use string of length different than two, as a key,$/ do
|
59
|
+
@key = 'key'
|
60
|
+
end
|
61
|
+
|
62
|
+
Then /^exception should be thrown\.$/ do
|
63
|
+
expect{@read[@key]}.to raise_error(RuntimeError)
|
64
|
+
end
|
65
|
+
|
66
|
+
When /^it doesn't contain the requested tag$/ do
|
67
|
+
@key = 'hq'
|
68
|
+
end
|
69
|
+
|
70
|
+
Then /^nil should be returned\.$/ do
|
71
|
+
@read[@key].should be_nil
|
72
|
+
end
|
73
|
+
|
74
|
+
When /^I use its 'tags' method$/ do
|
75
|
+
@tags = @read.tags
|
76
|
+
end
|
77
|
+
|
78
|
+
Then /^I should be able to work with the returned object just like with Hash$/ do
|
79
|
+
@tags.should be_kind_of Hash
|
80
|
+
@tags['MF'].should be == 192
|
81
|
+
@tags.keys.should be == ['MF']
|
82
|
+
@tags.values.should be == [192]
|
83
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
Before do
|
2
|
+
@bam = Bio::Bam::File.new './test/data/ex1_header.bam'
|
3
|
+
end
|
4
|
+
|
5
|
+
Given /^it's sorted by coordinate$/ do
|
6
|
+
@bam.header.sorting_order.should == 'coordinate'
|
7
|
+
end
|
8
|
+
|
9
|
+
Given /^I have its index as well$/ do
|
10
|
+
@bam.should have_index
|
11
|
+
end
|
12
|
+
|
13
|
+
When /^I specify reference sequence and region \(1-based beginning and end positions\)$/ do
|
14
|
+
@region = (1400 ... 1500)
|
15
|
+
@chr = "chr2"
|
16
|
+
end
|
17
|
+
|
18
|
+
Then /^I should be able to immediately have access to alignments overlapping it$/ do
|
19
|
+
@alignments = @bam.fetch @chr, @region
|
20
|
+
@alignments.should respond_to(:each).with(0).arguments
|
21
|
+
@alignments.to_a.length.should == 77
|
22
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
Given /^I opened a valid BAM file$/ do
|
2
|
+
filename = './test/data/ex1_header.bam'
|
3
|
+
File.exists?(filename).should be_true
|
4
|
+
@bamfile = Bio::Bam::File.new filename
|
5
|
+
end
|
6
|
+
|
7
|
+
Given /^it contains SAM header$/ do
|
8
|
+
@bamfile.header.raw_contents.length.should be > 0
|
9
|
+
end
|
10
|
+
|
11
|
+
When /^I call 'header' method$/ do
|
12
|
+
@header = @bamfile.header
|
13
|
+
end
|
14
|
+
|
15
|
+
Then /^I should see text of SAM header$/ do
|
16
|
+
@header.raw_contents.should be_kind_of String
|
17
|
+
end
|
18
|
+
|
19
|
+
Given /^SAM header contains @HD line$/ do
|
20
|
+
@header = @bamfile.header
|
21
|
+
@header.raw_contents.should =~ /^@HD/
|
22
|
+
end
|
23
|
+
|
24
|
+
Then /^I should be able to see format version$/ do
|
25
|
+
@version = @header.version
|
26
|
+
@version.should be_kind_of String
|
27
|
+
@version.length.should be > 0
|
28
|
+
end
|
29
|
+
|
30
|
+
Then /^I should be able to see sorting order$/ do
|
31
|
+
@sorting_order = @header.sorting_order
|
32
|
+
@sorting_order.should be_kind_of String
|
33
|
+
@sorting_order.length.should be > 0
|
34
|
+
end
|
35
|
+
|
36
|
+
Given /^SAM header contains @SQ lines$/ do
|
37
|
+
@header = @bamfile.header
|
38
|
+
@header.sq_lines.length.should be > 0
|
39
|
+
end
|
40
|
+
|
41
|
+
Then /^I should be able to iterate them$/ do
|
42
|
+
@sq_lines = @header.sq_lines
|
43
|
+
@sq_lines.should be_kind_of Array
|
44
|
+
end
|
45
|
+
|
46
|
+
Then /^I should be able to see sequence names$/ do
|
47
|
+
@line = @sq_lines.first
|
48
|
+
@line.should respond_to(:sequence_name).with(0).arguments
|
49
|
+
@line.sequence_name.should be_kind_of String
|
50
|
+
@line.sequence_name.length.should be > 0
|
51
|
+
end
|
52
|
+
|
53
|
+
Then /^I should be able to see their lengths$/ do
|
54
|
+
@line.should respond_to(:sequence_length).with(0).arguments
|
55
|
+
@line.sequence_length.should be_kind_of Numeric
|
56
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
Before do
|
2
|
+
# this file is known to contain some invalid records
|
3
|
+
@tagsbam = Bio::Bam::File.new './test/data/tags.bam'
|
4
|
+
end
|
5
|
+
|
6
|
+
Given /^I have an alignment from a BAM file$/ do
|
7
|
+
@alignment = @tagsbam.alignments.to_a[32]
|
8
|
+
end
|
9
|
+
|
10
|
+
When /^I call 'valid\?' method$/ do
|
11
|
+
pending
|
12
|
+
# @is_valid = @alignment.valid?
|
13
|
+
end
|
14
|
+
|
15
|
+
Then /^it should return whether it is valid or not$/ do
|
16
|
+
@is_valid.should be_true
|
17
|
+
end
|
18
|
+
|
19
|
+
Given /^I have a BAM file$/ do
|
20
|
+
end
|
21
|
+
|
22
|
+
When /^I want to iterate over its records$/ do
|
23
|
+
@records = @tagsbam.alignments
|
24
|
+
end
|
25
|
+
|
26
|
+
Then /^I should have an option to skip invalid ones$/ do
|
27
|
+
@records.should respond_to(:each_valid).with(0).arguments
|
28
|
+
end
|
29
|
+
|
30
|
+
Then /^all the reads in this case should be valid$/ do
|
31
|
+
count = 0
|
32
|
+
@records.each_valid {|read| count += 1 }
|
33
|
+
count.should == 411
|
34
|
+
end
|