bio-sambamba 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +11 -0
- data/LICENSE.txt +20 -0
- data/README.md +68 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/features/iterate-alignments.feature +40 -0
- data/features/random-access.feature +10 -0
- data/features/sam-header.feature +23 -0
- data/features/step_definitions/iterate-alignments_steps.rb +83 -0
- data/features/step_definitions/random-access_steps.rb +22 -0
- data/features/step_definitions/sam-header_steps.rb +56 -0
- data/features/step_definitions/validation-steps.rb +34 -0
- data/features/support/env.rb +13 -0
- data/features/syntax-sugar.feature +17 -0
- data/features/validation.feature +16 -0
- data/lib/bio-sambamba.rb +8 -0
- data/lib/bio-sambamba/alignment.rb +131 -0
- data/lib/bio-sambamba/alignmentiterator.rb +45 -0
- data/lib/bio-sambamba/bamfile.rb +45 -0
- data/lib/bio-sambamba/samfile.rb +25 -0
- data/lib/bio-sambamba/samheader.rb +194 -0
- data/test/data/bins.bam +0 -0
- data/test/data/bins.bam.bai +0 -0
- data/test/data/c1215_fixmate.bam +0 -0
- data/test/data/corrupted_zlib_archive.bam +0 -0
- data/test/data/duplicated_block_size.bam +0 -0
- data/test/data/ex1_header.bam +0 -0
- data/test/data/ex1_header.bam.bai +0 -0
- data/test/data/ex1_header.sam +3273 -0
- data/test/data/ex1_header.uncompressed.bam +0 -0
- data/test/data/no_block_size.bam +0 -0
- data/test/data/tags.bam +0 -0
- data/test/data/tags.bam.bai +0 -0
- data/test/data/wrong_bc_subfield_length.bam +0 -0
- data/test/data/wrong_extra_gzip_length.bam +0 -0
- metadata +184 -0
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
- rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Artem Tarasov
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# bio-sambamba
|
2
|
+
|
3
|
+
[Build Status](http://travis-ci.org/lomereiter/bioruby-sambamba)
|
4
|
+
|
5
|
+
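A Ruby wrapper for the Sambamba tool. The Sambamba library comes with a command-line tool for working with SAM/BAM files; this gem brings some of its functionality to Ruby.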
|
6
|
+
|
7
|
+
Note: this software is under active development!
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```sh
|
12
|
+
rake build
|
13
|
+
rake install
|
14
|
+
```
|
15
|
+
|
16
|
+
In order to use the gem, you also need the <code>sambamba</code> tool installed.
|
17
|
+
For that, do the following:
|
18
|
+
|
19
|
+
* install [DMD compiler](http://dlang.org/download.html)
|
20
|
+
* install [Ragel](http://www.complang.org/ragel/) finite state machine compiler
|
21
|
+
* clone sambamba repository and compile the tool
|
22
|
+
|
23
|
+
```sh
|
24
|
+
git clone https://github.com/lomereiter/sambamba.git
|
25
|
+
cd sambamba/CLItools/
|
26
|
+
make
|
27
|
+
```
|
28
|
+
|
29
|
+
* place the executable file <code>build/sambamba</code> somewhere in your $PATH,
|
30
|
+
for example:
|
31
|
+
|
32
|
+
```sh
|
33
|
+
cp build/sambamba /usr/local/bin
|
34
|
+
```
|
35
|
+
|
36
|
+
## Usage
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'bio-sambamba'
|
40
|
+
```
|
41
|
+
|
42
|
+
The API doc is online. For more code examples see the test files in
|
43
|
+
the source tree.
|
44
|
+
|
45
|
+
## Project home page
|
46
|
+
|
47
|
+
For information on the source tree, documentation, examples, issues and
|
48
|
+
how to contribute, see
|
49
|
+
|
50
|
+
http://github.com/lomereiter/bioruby-sambamba
|
51
|
+
|
52
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
53
|
+
|
54
|
+
## Cite
|
55
|
+
|
56
|
+
If you use this software, please cite one of
|
57
|
+
|
58
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
59
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
60
|
+
|
61
|
+
## Biogems.info
|
62
|
+
|
63
|
+
This Biogem is published at [#bio-sambamba](http://biogems.info/index.html)
|
64
|
+
|
65
|
+
## Copyright
|
66
|
+
|
67
|
+
Copyright (c) 2012 Artem Tarasov. See LICENSE.txt for further details.
|
68
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "bio-sambamba"
|
18
|
+
gem.homepage = "http://github.com/lomereiter/bioruby-sambamba"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Ruby wrapper for Sambamba tool}
|
21
|
+
gem.description = %Q{New Sambamba library comes with a command-line tool for working with SAM/BAM files. This gem brings some of its functionality to Ruby.}
|
22
|
+
gem.email = "lomereiter@gmail.com"
|
23
|
+
gem.authors = ["Artem Tarasov"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
|
26
|
+
gem.files.include "lib/bio-sambamba/*.rb"
|
27
|
+
gem.files.include "lib/bio-sambamba.rb"
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
30
|
+
|
31
|
+
require 'cucumber/rake/task'
|
32
|
+
Cucumber::Rake::Task.new do |features|
|
33
|
+
end
|
34
|
+
|
35
|
+
task :test => :cucumber
|
36
|
+
|
37
|
+
task :default => :test
|
38
|
+
|
39
|
+
require 'rdoc/task'
|
40
|
+
Rake::RDocTask.new do |rdoc|
|
41
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
42
|
+
|
43
|
+
rdoc.rdoc_dir = 'rdoc'
|
44
|
+
rdoc.title = "bio-sambamba #{version}"
|
45
|
+
rdoc.rdoc_files.include('README*')
|
46
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
47
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Feature: iterating alignment records
|
2
|
+
|
3
|
+
In order to have access to all information contained in a BAM file,
|
4
|
+
As a bioinformatician,
|
5
|
+
I want to be able to iterate alignment records from Ruby
|
6
|
+
And have access to all their fields and tags.
|
7
|
+
|
8
|
+
Scenario: accessing alignment records
|
9
|
+
Given I opened a valid BAM file
|
10
|
+
When I use its 'alignments' method
|
11
|
+
Then I should be able to iterate the returned object with 'each'
|
12
|
+
And the objects which I iterate over should represent the alignments
|
13
|
+
And I should be able to access all fields mentioned in SAM/BAM format specification
|
14
|
+
|
15
|
+
Scenario: access existing alignment tag
|
16
|
+
Given I have an alignment
|
17
|
+
And it contains some tags
|
18
|
+
When I access it like a hash
|
19
|
+
And I use 2-character string as a key
|
20
|
+
And the alignment has such tag
|
21
|
+
Then I should be able to see corresponding value
|
22
|
+
And it should be a simple Ruby object (Array, Numeric, or String)
|
23
|
+
|
24
|
+
Scenario: invalid tag key (not of length 2)
|
25
|
+
Given I have an alignment
|
26
|
+
When I access it like a hash
|
27
|
+
But I use string of length different than two, as a key,
|
28
|
+
Then exception should be thrown.
|
29
|
+
|
30
|
+
Scenario: accessing non-existing alignment tag
|
31
|
+
Given I have an alignment
|
32
|
+
And it contains some tags
|
33
|
+
When I access it like a hash
|
34
|
+
But it doesn't contain the requested tag
|
35
|
+
Then nil should be returned.
|
36
|
+
|
37
|
+
Scenario: fetching all tags as a hash
|
38
|
+
Given I have an alignment
|
39
|
+
When I use its 'tags' method
|
40
|
+
Then I should be able to work with the returned object just like with Hash
|
@@ -0,0 +1,10 @@
|
|
1
|
+
Feature: random access to BAM file
|
2
|
+
In order to retrieve information about specific regions,
|
3
|
+
I want to be able to quickly fetch alignments overlapping a region.
|
4
|
+
|
5
|
+
Scenario: fetching alignments
|
6
|
+
Given I have a BAM file
|
7
|
+
And it's sorted by coordinate
|
8
|
+
And I have its index as well
|
9
|
+
When I specify reference sequence and region (1-based beginning and end positions)
|
10
|
+
Then I should be able to immediately have access to alignments overlapping it
|
@@ -0,0 +1,23 @@
|
|
1
|
+
Feature: access to information from SAM header
|
2
|
+
|
3
|
+
In order to work with BAM file,
|
4
|
+
I want to see what its header contains.
|
5
|
+
|
6
|
+
Background:
|
7
|
+
Given I opened a valid BAM file
|
8
|
+
And it contains SAM header
|
9
|
+
|
10
|
+
Scenario: getting raw text
|
11
|
+
When I call 'header' method
|
12
|
+
Then I should see text of SAM header
|
13
|
+
|
14
|
+
Scenario: accessing version and sorting order
|
15
|
+
When SAM header contains @HD line
|
16
|
+
Then I should be able to see format version
|
17
|
+
And I should be able to see sorting order
|
18
|
+
|
19
|
+
Scenario: getting information about reference sequences
|
20
|
+
When SAM header contains @SQ lines
|
21
|
+
Then I should be able to iterate them
|
22
|
+
And I should be able to see sequence names
|
23
|
+
And I should be able to see their lengths
|
@@ -0,0 +1,83 @@
|
|
1
|
+
Before do
|
2
|
+
@bam = Bio::Bam::File.new 'test/data/ex1_header.bam'
|
3
|
+
end
|
4
|
+
|
5
|
+
When /^I use its 'alignments' method$/ do
|
6
|
+
@bam.should respond_to(:alignments)
|
7
|
+
end
|
8
|
+
|
9
|
+
Then /^I should be able to iterate the returned object with 'each'$/ do
|
10
|
+
@bam.alignments.should respond_to(:each)
|
11
|
+
end
|
12
|
+
|
13
|
+
Then /^the objects which I iterate over should represent the alignments$/ do
|
14
|
+
@bam.alignments.take(100).each do |read|
|
15
|
+
read.should be_instance_of(Bio::Bam::Alignment)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
Then /^I should be able to access all fields mentioned in SAM\/BAM format specification$/ do
|
20
|
+
@read = @bam.alignments.first
|
21
|
+
@read.read_name.should == 'EAS56_57:6:190:289:82'
|
22
|
+
@read.sequence.should == 'CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA'
|
23
|
+
@read.position.should == 100
|
24
|
+
@read.flag.should == 69
|
25
|
+
@read.mapping_quality.should == 0
|
26
|
+
@read.cigar_string.should == '*'
|
27
|
+
@read.reference.should == 'chr1'
|
28
|
+
@read.quality.should == [27, 27, 27, 22, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 23, 26, 26, 27, 22, 26, 19, 27, 26, 27, 26, 26, 26, 26, 26, 24, 19, 27, 26]
|
29
|
+
end
|
30
|
+
|
31
|
+
Given /^I have an alignment$/ do
|
32
|
+
@read = @bam.alignments.first
|
33
|
+
end
|
34
|
+
|
35
|
+
Given /^it contains some tags$/ do
|
36
|
+
end
|
37
|
+
|
38
|
+
When /^I access it like a hash$/ do
|
39
|
+
@read.should respond_to(:[])
|
40
|
+
end
|
41
|
+
|
42
|
+
When /^I use 2-character string as a key$/ do
|
43
|
+
@key = 'MF'
|
44
|
+
end
|
45
|
+
|
46
|
+
When /^the alignment has such tag$/ do
|
47
|
+
@read[@key].should_not be_nil
|
48
|
+
end
|
49
|
+
|
50
|
+
Then /^I should be able to see corresponding value$/ do
|
51
|
+
@read[@key].should be == 192
|
52
|
+
end
|
53
|
+
|
54
|
+
Then /^it should be a simple Ruby object \(Array, Numeric, or String\)$/ do
|
55
|
+
@read[@key].should be_kind_of Numeric
|
56
|
+
end
|
57
|
+
|
58
|
+
When /^I use string of length different than two, as a key,$/ do
|
59
|
+
@key = 'key'
|
60
|
+
end
|
61
|
+
|
62
|
+
Then /^exception should be thrown\.$/ do
|
63
|
+
expect{@read[@key]}.to raise_error(RuntimeError)
|
64
|
+
end
|
65
|
+
|
66
|
+
When /^it doesn't contain the requested tag$/ do
|
67
|
+
@key = 'hq'
|
68
|
+
end
|
69
|
+
|
70
|
+
Then /^nil should be returned\.$/ do
|
71
|
+
@read[@key].should be_nil
|
72
|
+
end
|
73
|
+
|
74
|
+
When /^I use its 'tags' method$/ do
|
75
|
+
@tags = @read.tags
|
76
|
+
end
|
77
|
+
|
78
|
+
Then /^I should be able to work with the returned object just like with Hash$/ do
|
79
|
+
@tags.should be_kind_of Hash
|
80
|
+
@tags['MF'].should be == 192
|
81
|
+
@tags.keys.should be == ['MF']
|
82
|
+
@tags.values.should be == [192]
|
83
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
Before do
|
2
|
+
@bam = Bio::Bam::File.new './test/data/ex1_header.bam'
|
3
|
+
end
|
4
|
+
|
5
|
+
Given /^it's sorted by coordinate$/ do
|
6
|
+
@bam.header.sorting_order.should == 'coordinate'
|
7
|
+
end
|
8
|
+
|
9
|
+
Given /^I have its index as well$/ do
|
10
|
+
@bam.should have_index
|
11
|
+
end
|
12
|
+
|
13
|
+
When /^I specify reference sequence and region \(1-based beginning and end positions\)$/ do
|
14
|
+
@region = (1400 ... 1500)
|
15
|
+
@chr = "chr2"
|
16
|
+
end
|
17
|
+
|
18
|
+
Then /^I should be able to immediately have access to alignments overlapping it$/ do
|
19
|
+
@alignments = @bam.fetch @chr, @region
|
20
|
+
@alignments.should respond_to(:each).with(0).arguments
|
21
|
+
@alignments.to_a.length.should == 77
|
22
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
Given /^I opened a valid BAM file$/ do
|
2
|
+
filename = './test/data/ex1_header.bam'
|
3
|
+
File.exists?(filename).should be_true
|
4
|
+
@bamfile = Bio::Bam::File.new filename
|
5
|
+
end
|
6
|
+
|
7
|
+
Given /^it contains SAM header$/ do
|
8
|
+
@bamfile.header.raw_contents.length.should be > 0
|
9
|
+
end
|
10
|
+
|
11
|
+
When /^I call 'header' method$/ do
|
12
|
+
@header = @bamfile.header
|
13
|
+
end
|
14
|
+
|
15
|
+
Then /^I should see text of SAM header$/ do
|
16
|
+
@header.raw_contents.should be_kind_of String
|
17
|
+
end
|
18
|
+
|
19
|
+
Given /^SAM header contains @HD line$/ do
|
20
|
+
@header = @bamfile.header
|
21
|
+
@header.raw_contents.should =~ /^@HD/
|
22
|
+
end
|
23
|
+
|
24
|
+
Then /^I should be able to see format version$/ do
|
25
|
+
@version = @header.version
|
26
|
+
@version.should be_kind_of String
|
27
|
+
@version.length.should be > 0
|
28
|
+
end
|
29
|
+
|
30
|
+
Then /^I should be able to see sorting order$/ do
|
31
|
+
@sorting_order = @header.sorting_order
|
32
|
+
@sorting_order.should be_kind_of String
|
33
|
+
@sorting_order.length.should be > 0
|
34
|
+
end
|
35
|
+
|
36
|
+
Given /^SAM header contains @SQ lines$/ do
|
37
|
+
@header = @bamfile.header
|
38
|
+
@header.sq_lines.length.should be > 0
|
39
|
+
end
|
40
|
+
|
41
|
+
Then /^I should be able to iterate them$/ do
|
42
|
+
@sq_lines = @header.sq_lines
|
43
|
+
@sq_lines.should be_kind_of Array
|
44
|
+
end
|
45
|
+
|
46
|
+
Then /^I should be able to see sequence names$/ do
|
47
|
+
@line = @sq_lines.first
|
48
|
+
@line.should respond_to(:sequence_name).with(0).arguments
|
49
|
+
@line.sequence_name.should be_kind_of String
|
50
|
+
@line.sequence_name.length.should be > 0
|
51
|
+
end
|
52
|
+
|
53
|
+
Then /^I should be able to see their lengths$/ do
|
54
|
+
@line.should respond_to(:sequence_length).with(0).arguments
|
55
|
+
@line.sequence_length.should be_kind_of Numeric
|
56
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
Before do
|
2
|
+
# this file is known to contain some invalid records
|
3
|
+
@tagsbam = Bio::Bam::File.new './test/data/tags.bam'
|
4
|
+
end
|
5
|
+
|
6
|
+
Given /^I have an alignment from a BAM file$/ do
|
7
|
+
@alignment = @tagsbam.alignments.to_a[32]
|
8
|
+
end
|
9
|
+
|
10
|
+
When /^I call 'valid\?' method$/ do
|
11
|
+
pending
|
12
|
+
# @is_valid = @alignment.valid?
|
13
|
+
end
|
14
|
+
|
15
|
+
Then /^it should return whether it is valid or not$/ do
|
16
|
+
@is_valid.should be_true
|
17
|
+
end
|
18
|
+
|
19
|
+
Given /^I have a BAM file$/ do
|
20
|
+
end
|
21
|
+
|
22
|
+
When /^I want to iterate over its records$/ do
|
23
|
+
@records = @tagsbam.alignments
|
24
|
+
end
|
25
|
+
|
26
|
+
Then /^I should have an option to skip invalid ones$/ do
|
27
|
+
@records.should respond_to(:each_valid).with(0).arguments
|
28
|
+
end
|
29
|
+
|
30
|
+
Then /^all the reads in this case should be valid$/ do
|
31
|
+
count = 0
|
32
|
+
@records.each_valid {|read| count += 1 }
|
33
|
+
count.should == 411
|
34
|
+
end
|