bio-sambamba 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/.document +5 -0
  2. data/.travis.yml +12 -0
  3. data/Gemfile +11 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.md +68 -0
  6. data/Rakefile +47 -0
  7. data/VERSION +1 -0
  8. data/features/iterate-alignments.feature +40 -0
  9. data/features/random-access.feature +10 -0
  10. data/features/sam-header.feature +23 -0
  11. data/features/step_definitions/iterate-alignments_steps.rb +83 -0
  12. data/features/step_definitions/random-access_steps.rb +22 -0
  13. data/features/step_definitions/sam-header_steps.rb +56 -0
  14. data/features/step_definitions/validation-steps.rb +34 -0
  15. data/features/support/env.rb +13 -0
  16. data/features/syntax-sugar.feature +17 -0
  17. data/features/validation.feature +16 -0
  18. data/lib/bio-sambamba.rb +8 -0
  19. data/lib/bio-sambamba/alignment.rb +131 -0
  20. data/lib/bio-sambamba/alignmentiterator.rb +45 -0
  21. data/lib/bio-sambamba/bamfile.rb +45 -0
  22. data/lib/bio-sambamba/samfile.rb +25 -0
  23. data/lib/bio-sambamba/samheader.rb +194 -0
  24. data/test/data/bins.bam +0 -0
  25. data/test/data/bins.bam.bai +0 -0
  26. data/test/data/c1215_fixmate.bam +0 -0
  27. data/test/data/corrupted_zlib_archive.bam +0 -0
  28. data/test/data/duplicated_block_size.bam +0 -0
  29. data/test/data/ex1_header.bam +0 -0
  30. data/test/data/ex1_header.bam.bai +0 -0
  31. data/test/data/ex1_header.sam +3273 -0
  32. data/test/data/ex1_header.uncompressed.bam +0 -0
  33. data/test/data/no_block_size.bam +0 -0
  34. data/test/data/tags.bam +0 -0
  35. data/test/data/tags.bam.bai +0 -0
  36. data/test/data/wrong_bc_subfield_length.bam +0 -0
  37. data/test/data/wrong_extra_gzip_length.bam +0 -0
  38. metadata +184 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/*.feature
5
+ LICENSE.txt
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - jruby-19mode # JRuby in 1.9 mode
6
+ - rbx-19mode
7
+ # - 1.8.7
8
+ # - jruby-18mode # JRuby in 1.8 mode
9
+ # - rbx-18mode
10
+
11
+ # uncomment this line if your project needs to run something other than `rake`:
12
+ # script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "bio", "~> 1.4.2"
4
+ gem "oj", "~> 1.2.9"
5
+
6
+ group :development do
7
+ gem "bundler", "~> 1.1.4"
8
+ gem "jeweler", "~> 1.8.3"
9
+ gem "rspec", "~> 2.7.0"
10
+ gem "cucumber", "~> 1.2.0"
11
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Artem Tarasov
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # bio-sambamba
2
+
3
+ [![Build Status](https://secure.travis-ci.org/lomereiter/bioruby-sambamba.png)](http://travis-ci.org/lomereiter/bioruby-sambamba)
4
+
5
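+ Ruby wrapper for the Sambamba tool, a command-line utility for working with SAM/BAM files. This gem brings some of Sambamba's functionality to Ruby.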
6
+
7
+ Note: this software is under active development!
8
+
9
+ ## Installation
10
+
11
+ ```sh
12
+ rake build
13
+ rake install
14
+ ```
15
+
16
+ In order to use the gem, you also need the <code>sambamba</code> tool installed.
17
+ For that, do the following:
18
+
19
+ * install [DMD compiler](http://dlang.org/download.html)
20
+ * install [Ragel](http://www.complang.org/ragel/) finite state machine compiler
21
+ * clone sambamba repository and compile the tool
22
+
23
+ ```sh
24
+ git clone https://github.com/lomereiter/sambamba.git
25
+ cd sambamba/CLItools/
26
+ make
27
+ ```
28
+
29
+ * place the executable file <code>build/sambamba</code> somewhere in your $PATH,
30
+ for example:
31
+
32
+ ```sh
33
+ cp build/sambamba /usr/local/bin
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ ```ruby
39
+ require 'bio-sambamba'
40
+ ```
41
+
42
+ The API documentation is available online. For more code examples, see the test files in
43
+ the source tree.
44
+
45
+ ## Project home page
46
+
47
+ For information on the source tree, documentation, examples, issues and
48
+ how to contribute, see
49
+
50
+ http://github.com/lomereiter/bioruby-sambamba
51
+
52
+ The BioRuby community can be found on the IRC server irc.freenode.org, in the #bioruby channel.
53
+
54
+ ## Cite
55
+
56
+ If you use this software, please cite one of the following:
57
+
58
+ * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
59
+ * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
60
+
61
+ ## Biogems.info
62
+
63
+ This Biogem is published at [#bio-sambamba](http://biogems.info/index.html)
64
+
65
+ ## Copyright
66
+
67
+ Copyright (c) 2012 Artem Tarasov. See LICENSE.txt for further details.
68
+
data/Rakefile ADDED
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "bio-sambamba"
18
+ gem.homepage = "http://github.com/lomereiter/bioruby-sambamba"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{Ruby wrapper for Sambamba tool}
21
+ gem.description = %Q{New Sambamba library comes with a command-line tool for working with SAM/BAM files. This gem brings some of its functionality to Ruby.}
22
+ gem.email = "lomereiter@gmail.com"
23
+ gem.authors = ["Artem Tarasov"]
24
+ # dependencies defined in Gemfile
25
+
26
+ gem.files.include "lib/bio-sambamba/*.rb"
27
+ gem.files.include "lib/bio-sambamba.rb"
28
+ end
29
+ Jeweler::RubygemsDotOrgTasks.new
30
+
31
+ require 'cucumber/rake/task'
32
+ Cucumber::Rake::Task.new do |features|
33
+ end
34
+
35
+ task :test => :cucumber
36
+
37
+ task :default => :test
38
+
39
+ require 'rdoc/task'
40
+ Rake::RDocTask.new do |rdoc|
41
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
42
+
43
+ rdoc.rdoc_dir = 'rdoc'
44
+ rdoc.title = "bio-sambamba #{version}"
45
+ rdoc.rdoc_files.include('README*')
46
+ rdoc.rdoc_files.include('lib/**/*.rb')
47
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,40 @@
1
+ Feature: iterating alignment records
2
+
3
+ In order to have access to all information contained in a BAM file,
4
+ As a bioinformatician,
5
+ I want to be able to iterate alignment records from Ruby
6
+ And have access to all their fields and tags.
7
+
8
+ Scenario: accessing alignment records
9
+ Given I opened a valid BAM file
10
+ When I use its 'alignments' method
11
+ Then I should be able to iterate the returned object with 'each'
12
+ And the objects which I iterate over should represent the alignments
13
+ And I should be able to access all fields mentioned in SAM/BAM format specification
14
+
15
+ Scenario: access existing alignment tag
16
+ Given I have an alignment
17
+ And it contains some tags
18
+ When I access it like a hash
19
+ And I use 2-character string as a key
20
+ And the alignment has such tag
21
+ Then I should be able to see corresponding value
22
+ And it should be a simple Ruby object (Array, Numeric, or String)
23
+
24
+ Scenario: invalid tag key (not of length 2)
25
+ Given I have an alignment
26
+ When I access it like a hash
27
+ But I use string of length different than two, as a key,
28
+ Then exception should be thrown.
29
+
30
+ Scenario: accessing non-existing alignment tag
31
+ Given I have an alignment
32
+ And it contains some tags
33
+ When I access it like a hash
34
+ But it doesn't contain the requested tag
35
+ Then nil should be returned.
36
+
37
+ Scenario: fetching all tags as a hash
38
+ Given I have an alignment
39
+ When I use its 'tags' method
40
+ Then I should be able to work with the returned object just like with Hash
@@ -0,0 +1,10 @@
1
+ Feature: random access to BAM file
2
+ In order to retrieve information about specific regions,
3
+ I want to be able to quickly fetch alignments overlapping a region.
4
+
5
+ Scenario: fetching alignments
6
+ Given I have a BAM file
7
+ And it's sorted by coordinate
8
+ And I have its index as well
9
+ When I specify reference sequence and region (1-based beginning and end positions)
10
+ Then I should be able to immediately have access to alignments overlapping it
@@ -0,0 +1,23 @@
1
+ Feature: access to information from SAM header
2
+
3
+ In order to work with BAM file,
4
+ I want to see what its header contains.
5
+
6
+ Background:
7
+ Given I opened a valid BAM file
8
+ And it contains SAM header
9
+
10
+ Scenario: getting raw text
11
+ When I call 'header' method
12
+ Then I should see text of SAM header
13
+
14
+ Scenario: accessing version and sorting order
15
+ When SAM header contains @HD line
16
+ Then I should be able to see format version
17
+ And I should be able to see sorting order
18
+
19
+ Scenario: getting information about reference sequences
20
+ When SAM header contains @SQ lines
21
+ Then I should be able to iterate them
22
+ And I should be able to see sequence names
23
+ And I should be able to see their lengths
@@ -0,0 +1,83 @@
1
+ Before do
2
+ @bam = Bio::Bam::File.new 'test/data/ex1_header.bam'
3
+ end
4
+
5
+ When /^I use its 'alignments' method$/ do
6
+ @bam.should respond_to(:alignments)
7
+ end
8
+
9
+ Then /^I should be able to iterate the returned object with 'each'$/ do
10
+ @bam.alignments.should respond_to(:each)
11
+ end
12
+
13
+ Then /^the objects which I iterate over should represent the alignments$/ do
14
+ @bam.alignments.take(100).each do |read|
15
+ read.should be_instance_of(Bio::Bam::Alignment)
16
+ end
17
+ end
18
+
19
+ Then /^I should be able to access all fields mentioned in SAM\/BAM format specification$/ do
20
+ @read = @bam.alignments.first
21
+ @read.read_name.should == 'EAS56_57:6:190:289:82'
22
+ @read.sequence.should == 'CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA'
23
+ @read.position.should == 100
24
+ @read.flag.should == 69
25
+ @read.mapping_quality.should == 0
26
+ @read.cigar_string.should == '*'
27
+ @read.reference.should == 'chr1'
28
+ @read.quality.should == [27, 27, 27, 22, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 23, 26, 26, 27, 22, 26, 19, 27, 26, 27, 26, 26, 26, 26, 26, 24, 19, 27, 26]
29
+ end
30
+
31
+ Given /^I have an alignment$/ do
32
+ @read = @bam.alignments.first
33
+ end
34
+
35
+ Given /^it contains some tags$/ do
36
+ end
37
+
38
+ When /^I access it like a hash$/ do
39
+ @read.should respond_to(:[])
40
+ end
41
+
42
+ When /^I use 2-character string as a key$/ do
43
+ @key = 'MF'
44
+ end
45
+
46
+ When /^the alignment has such tag$/ do
47
+ @read[@key].should_not be_nil
48
+ end
49
+
50
+ Then /^I should be able to see corresponding value$/ do
51
+ @read[@key].should be == 192
52
+ end
53
+
54
+ Then /^it should be a simple Ruby object \(Array, Numeric, or String\)$/ do
55
+ @read[@key].should be_kind_of Numeric
56
+ end
57
+
58
+ When /^I use string of length different than two, as a key,$/ do
59
+ @key = 'key'
60
+ end
61
+
62
+ Then /^exception should be thrown\.$/ do
63
+ expect{@read[@key]}.to raise_error(RuntimeError)
64
+ end
65
+
66
+ When /^it doesn't contain the requested tag$/ do
67
+ @key = 'hq'
68
+ end
69
+
70
+ Then /^nil should be returned\.$/ do
71
+ @read[@key].should be_nil
72
+ end
73
+
74
+ When /^I use its 'tags' method$/ do
75
+ @tags = @read.tags
76
+ end
77
+
78
+ Then /^I should be able to work with the returned object just like with Hash$/ do
79
+ @tags.should be_kind_of Hash
80
+ @tags['MF'].should be == 192
81
+ @tags.keys.should be == ['MF']
82
+ @tags.values.should be == [192]
83
+ end
@@ -0,0 +1,22 @@
1
+ Before do
2
+ @bam = Bio::Bam::File.new './test/data/ex1_header.bam'
3
+ end
4
+
5
+ Given /^it's sorted by coordinate$/ do
6
+ @bam.header.sorting_order.should == 'coordinate'
7
+ end
8
+
9
+ Given /^I have its index as well$/ do
10
+ @bam.should have_index
11
+ end
12
+
13
+ When /^I specify reference sequence and region \(1-based beginning and end positions\)$/ do
14
+ @region = (1400 ... 1500)
15
+ @chr = "chr2"
16
+ end
17
+
18
+ Then /^I should be able to immediately have access to alignments overlapping it$/ do
19
+ @alignments = @bam.fetch @chr, @region
20
+ @alignments.should respond_to(:each).with(0).arguments
21
+ @alignments.to_a.length.should == 77
22
+ end
@@ -0,0 +1,56 @@
1
+ Given /^I opened a valid BAM file$/ do
2
+ filename = './test/data/ex1_header.bam'
3
+ File.exists?(filename).should be_true
4
+ @bamfile = Bio::Bam::File.new filename
5
+ end
6
+
7
+ Given /^it contains SAM header$/ do
8
+ @bamfile.header.raw_contents.length.should be > 0
9
+ end
10
+
11
+ When /^I call 'header' method$/ do
12
+ @header = @bamfile.header
13
+ end
14
+
15
+ Then /^I should see text of SAM header$/ do
16
+ @header.raw_contents.should be_kind_of String
17
+ end
18
+
19
+ Given /^SAM header contains @HD line$/ do
20
+ @header = @bamfile.header
21
+ @header.raw_contents.should =~ /^@HD/
22
+ end
23
+
24
+ Then /^I should be able to see format version$/ do
25
+ @version = @header.version
26
+ @version.should be_kind_of String
27
+ @version.length.should be > 0
28
+ end
29
+
30
+ Then /^I should be able to see sorting order$/ do
31
+ @sorting_order = @header.sorting_order
32
+ @sorting_order.should be_kind_of String
33
+ @sorting_order.length.should be > 0
34
+ end
35
+
36
+ Given /^SAM header contains @SQ lines$/ do
37
+ @header = @bamfile.header
38
+ @header.sq_lines.length.should be > 0
39
+ end
40
+
41
+ Then /^I should be able to iterate them$/ do
42
+ @sq_lines = @header.sq_lines
43
+ @sq_lines.should be_kind_of Array
44
+ end
45
+
46
+ Then /^I should be able to see sequence names$/ do
47
+ @line = @sq_lines.first
48
+ @line.should respond_to(:sequence_name).with(0).arguments
49
+ @line.sequence_name.should be_kind_of String
50
+ @line.sequence_name.length.should be > 0
51
+ end
52
+
53
+ Then /^I should be able to see their lengths$/ do
54
+ @line.should respond_to(:sequence_length).with(0).arguments
55
+ @line.sequence_length.should be_kind_of Numeric
56
+ end
@@ -0,0 +1,34 @@
1
+ Before do
2
+ # this file is known to contain some invalid records
3
+ @tagsbam = Bio::Bam::File.new './test/data/tags.bam'
4
+ end
5
+
6
+ Given /^I have an alignment from a BAM file$/ do
7
+ @alignment = @tagsbam.alignments.to_a[32]
8
+ end
9
+
10
+ When /^I call 'valid\?' method$/ do
11
+ pending
12
+ # @is_valid = @alignment.valid?
13
+ end
14
+
15
+ Then /^it should return whether it is valid or not$/ do
16
+ @is_valid.should be_true
17
+ end
18
+
19
+ Given /^I have a BAM file$/ do
20
+ end
21
+
22
+ When /^I want to iterate over its records$/ do
23
+ @records = @tagsbam.alignments
24
+ end
25
+
26
+ Then /^I should have an option to skip invalid ones$/ do
27
+ @records.should respond_to(:each_valid).with(0).arguments
28
+ end
29
+
30
+ Then /^all the reads in this case should be valid$/ do
31
+ count = 0
32
+ @records.each_valid {|read| count += 1 }
33
+ count.should == 411
34
+ end