bio-sambamba 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +11 -0
- data/LICENSE.txt +20 -0
- data/README.md +68 -0
- data/Rakefile +47 -0
- data/VERSION +1 -0
- data/features/iterate-alignments.feature +40 -0
- data/features/random-access.feature +10 -0
- data/features/sam-header.feature +23 -0
- data/features/step_definitions/iterate-alignments_steps.rb +83 -0
- data/features/step_definitions/random-access_steps.rb +22 -0
- data/features/step_definitions/sam-header_steps.rb +56 -0
- data/features/step_definitions/validation-steps.rb +34 -0
- data/features/support/env.rb +13 -0
- data/features/syntax-sugar.feature +17 -0
- data/features/validation.feature +16 -0
- data/lib/bio-sambamba.rb +8 -0
- data/lib/bio-sambamba/alignment.rb +131 -0
- data/lib/bio-sambamba/alignmentiterator.rb +45 -0
- data/lib/bio-sambamba/bamfile.rb +45 -0
- data/lib/bio-sambamba/samfile.rb +25 -0
- data/lib/bio-sambamba/samheader.rb +194 -0
- data/test/data/bins.bam +0 -0
- data/test/data/bins.bam.bai +0 -0
- data/test/data/c1215_fixmate.bam +0 -0
- data/test/data/corrupted_zlib_archive.bam +0 -0
- data/test/data/duplicated_block_size.bam +0 -0
- data/test/data/ex1_header.bam +0 -0
- data/test/data/ex1_header.bam.bai +0 -0
- data/test/data/ex1_header.sam +3273 -0
- data/test/data/ex1_header.uncompressed.bam +0 -0
- data/test/data/no_block_size.bam +0 -0
- data/test/data/tags.bam +0 -0
- data/test/data/tags.bam.bai +0 -0
- data/test/data/wrong_bc_subfield_length.bam +0 -0
- data/test/data/wrong_extra_gzip_length.bam +0 -0
- metadata +184 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'

# Activate the gems of the :default and :development groups. If Bundler
# cannot do so, print its message plus a hint and exit with the status code
# carried by the Bundler error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Put the lib/ directory (two levels above this file) at the front of the
# load path, then load the library under test.
lib_dir = File.dirname(__FILE__) + '/../../lib/'
$LOAD_PATH.unshift(lib_dir)
require 'bio-sambamba.rb'

require 'rspec/expectations'
|
@@ -0,0 +1,17 @@
|
|
1
|
+
Feature: syntax sugar
|
2
|
+
|
3
|
+
In order to enjoy writing my scripts,
|
4
|
+
As a Rubyista,
|
5
|
+
I want some syntax sugar.
|
6
|
+
|
7
|
+
Scenario: fetching alignments
|
8
|
+
Given I have a BAM file
|
9
|
+
And associated BAI file
|
10
|
+
When I say "bam.alignments.referencing(chromosome).overlapping(500.kbp .. 600.kbp)"
|
11
|
+
Then I should get these alignments
|
12
|
+
|
13
|
+
Scenario: using shortcuts
|
14
|
+
Given I have a BAM file
|
15
|
+
And associated BAI file
|
16
|
+
When I say "bam[chromosome][500.kbp .. 600.kbp]"
|
17
|
+
Then I should get these alignments
|
@@ -0,0 +1,16 @@
|
|
1
|
+
Feature: alignment validation
|
2
|
+
|
3
|
+
In order to be able to filter out invalid reads,
|
4
|
+
As a developer,
|
5
|
+
I want validation support.
|
6
|
+
|
7
|
+
Scenario: checking single read
|
8
|
+
Given I have an alignment from a BAM file
|
9
|
+
When I call 'valid?' method
|
10
|
+
Then it should return whether it is valid or not
|
11
|
+
|
12
|
+
Scenario: iterating over valid records
|
13
|
+
Given I have a BAM file
|
14
|
+
When I want to iterate over its records
|
15
|
+
Then I should have an option to skip invalid ones
|
16
|
+
And all the reads in this case should be valid
|
data/lib/bio-sambamba.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
module Bio
  module Bam

    # Class representing an alignment record.
    #
    # Wraps the Hash parsed from one JSON record and exposes its entries
    # under descriptive method names. The Hash is read lazily: every reader
    # simply looks up its key, so a missing key yields nil.
    class Alignment

      # Creates a new object from JSON output of sambamba tool
      #
      # * _json_: Hash holding the record fields ('qname', 'flag', 'tags', ...)
      def initialize(json)
        @json = json
      end

      # Access a record tag.
      #
      # Returns the value stored under _tag_ in the 'tags' entry, or nil when
      # either the tag or the whole 'tags' entry is absent.
      #
      # Raises RuntimeError unless _tag_ is exactly two characters long.
      def [](tag)
        raise 'tag length must be two' unless tag.length == 2

        tag_table = @json['tags']
        tag_table && tag_table[tag]
      end

      # Hash of record tags
      def tags
        @json['tags']
      end

      # Name of reference sequence
      def reference
        @json['rname']
      end

      # Query template name
      def read_name
        @json['qname']
      end

      # 1-based leftmost mapping position
      def position
        @json['pos']
      end

      # Mapping quality
      def mapping_quality
        @json['mapq']
      end

      # CIGAR string
      def cigar_string
        @json['cigar']
      end

      # Observed template length
      def template_length
        @json['tlen']
      end

      # Bitwise flag
      def flag
        @json['flag']
      end

      # Phred-scaled base quality, an integer array
      # of the same length as the sequence
      def quality
        @json['qual']
      end

      # Segment sequence
      def sequence
        @json['seq']
      end

      # Reference sequence name of the mate/next segment
      def mate_reference
        @json['rnext']
      end

      # 1-based leftmost position of the mate/next segment
      def mate_position
        @json['pnext']
      end

      # Template having multiple segments in sequencing
      def is_paired
        flag_bit_set?(0x1)
      end

      # Each segment properly aligned according to the aligner
      def proper_pair
        flag_bit_set?(0x2)
      end

      # Segment unmapped
      def is_unmapped
        flag_bit_set?(0x4)
      end

      # Next segment in the template unmapped
      def mate_is_unmapped
        flag_bit_set?(0x8)
      end

      # Sequence being reverse complemented
      def is_reverse_strand
        flag_bit_set?(0x10)
      end

      # Sequence of the next segment in the template being reversed
      def mate_is_reverse_strand
        flag_bit_set?(0x20)
      end

      # The first segment in the template
      def is_first_of_pair
        flag_bit_set?(0x40)
      end

      # The last segment in the template
      def is_second_of_pair
        flag_bit_set?(0x80)
      end

      # Secondary alignment
      def is_secondary_alignment
        flag_bit_set?(0x100)
      end

      # Not passing quality controls
      def failed_quality_control
        flag_bit_set?(0x200)
      end

      # PCR or optical duplicate
      def is_duplicate
        flag_bit_set?(0x400)
      end

      private

      # True when any bit of _mask_ is set in the record's bitwise flag.
      def flag_bit_set?(mask)
        (flag & mask) != 0
      end
    end

  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Bio
  module Bam

    # Class for iterating through alignments.
    #
    # Holds a command line (an Array of strings) and, on iteration, runs it
    # through Bio::Command.call_command, turning each output line into a
    # Bio::Bam::Alignment.
    class AlignmentIterator
      include Enumerable

      # Creates a new AlignmentIterator object which will
      # parse JSON outputted by a specified command.
      #
      # * _command_: Array holding the program name and its arguments
      def initialize(command)
        @command = command
      end

      # Iterate only through valid alignments.
      #
      # Runs the same command with '--valid' appended (unless it is already
      # present). Returns an Enumerator when no block is given.
      def each_valid
        return enum_for(:each_valid) unless block_given?

        # Build a fresh array rather than pushing onto @command: mutating the
        # stored command would make every later #each pass '--valid' as well.
        validating_command =
          @command.include?('--valid') ? @command : @command + ['--valid']

        AlignmentIterator.new(validating_command).each do |read|
          yield read
        end
      end

      # Iterate through all alignments skipping
      # validation checks.
      #
      # Returns an Enumerator when no block is given. Raises RuntimeError,
      # with the offending line as message, for any output line that does not
      # start with '{'.
      def each
        return enum_for(:each) unless block_given?

        Bio::Command.call_command(@command) do |io|
          io.each do |line|
            raise line unless line.start_with?('{')
            yield Bio::Bam::Alignment.new(Oj.load(line))
          end
        end
      end
    end

  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Bio

  # Module for reading BAM files
  module Bam

    # Class providing access to BAM files
    class File

      # Creates an object for access to BAM file
      #
      # * _filename_: path of the BAM file
      def initialize(filename)
        @filename = filename
      end

      # SAM header (a Bio::Bam::SamHeader, created on first use and cached)
      def header
        @header ||= Bio::Bam::SamHeader.new(@filename)
      end

      # Returns an AlignmentIterator object for iterating over all alignments in the file
      def alignments
        Bio::Bam::AlignmentIterator.new ['sambamba', '--format=json', @filename]
      end

      # True if index file was found.
      #
      # Two names are tried: the file name with '.bai' appended, and the file
      # name with its last character replaced by 'i' (e.g. "reads.bam" ->
      # "reads.bai").
      def has_index?
        # The leading '::' is required: inside this class a bare File is
        # Bio::Bam::File, not Ruby's core File class.
        ::File.exist?(@filename + '.bai') ||
          ::File.exist?(@filename[0...-1] + 'i')
      end

      # Fetches alignments overlapping a region.
      # Returns an AlignmentIterator object.
      #
      # ---
      # *Arguments*:
      # * _chr_: reference sequence
      # * _region_: a Range representing an interval. Coordinates are 1-based.
      def fetch(chr, region)
        Bio::Bam::AlignmentIterator.new ['sambamba', '--format=json',
                                         @filename,
                                         "#{chr}:#{region.min}-#{region.max}"]
      end
    end

  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Bio
  # Module for reading SAM files
  module Sam

    # Class providing access to SAM files
    class File

      # Creates an object for access to SAM file
      #
      # * _filename_: path of the SAM file
      def initialize(filename)
        @filename = filename
      end

      # SAM header: a Bio::Bam::SamHeader built with the extra '-S' option,
      # created on first use and reused afterwards.
      def header
        return @header if @header

        @header = Bio::Bam::SamHeader.new(@filename, ['-S'])
      end

      # Returns an AlignmentIterator object for iterating over all alignments in the file
      def alignments
        sambamba_command = %w[sambamba --format=json -S] << @filename
        Bio::Bam::AlignmentIterator.new(sambamba_command)
      end
    end

  end
end
|
@@ -0,0 +1,194 @@
|
|
1
|
+
module Bio
  module Bam

    # Represents SAM header.
    #
    # Nothing is read at construction time: the raw text and the JSON form
    # are each fetched with one sambamba invocation on first use and cached.
    class SamHeader

      # Creates a new SamHeader object for a specified file,
      # specifying additional options to pass to sambamba tool
      #
      # * _filename_: path of the file whose header is wanted
      # * _opts_: Array of extra command-line options, appended after the file name
      def initialize(filename, opts=[])
        @filename = filename
        @opts = opts
      end

      # Raw text of SAM header.
      #
      # Raises RuntimeError, with the tool's output as message, when that
      # output starts with "sambamba". The output is cached only on success,
      # so a failing call keeps raising on every retry.
      def raw_contents
        return @raw_contents unless @raw_contents.nil?

        text = Bio::Command.query_command(['sambamba', '-H', @filename] + @opts)
        raise text if text.start_with?('sambamba')

        @raw_contents = text
      end

      # Format version
      def version
        json['format_version']
      end

      # Sorting order
      def sorting_order
        json['sorting_order']
      end

      # An array of SQLine objects (empty when the JSON has no 'sq_lines')
      def sq_lines
        @sq_lines ||= Array(json['sq_lines']).map { |entry| SQLine.new(entry) }
      end

      # An array of RGLine objects (empty when the JSON has no 'rg_lines')
      def rg_lines
        # Each accessor needs its own cache; sharing one variable would make
        # all three return whichever list happened to be built first.
        @rg_lines ||= Array(json['rg_lines']).map { |entry| RGLine.new(entry) }
      end

      # An array of PGLine objects (empty when the JSON has no 'pg_lines')
      def pg_lines
        @pg_lines ||= Array(json['pg_lines']).map { |entry| PGLine.new(entry) }
      end

      private

      # Parsed JSON form of the header, fetched once and cached.
      def json
        @json ||= get_json
      end

      # Calls sambamba to get underlying JSON object.
      #
      # Raises RuntimeError, with the tool's output as message, when the
      # output does not start with '{'.
      def get_json
        command = ['sambamba', '-H', '--format=json', @filename] + @opts
        line = Bio::Command.query_command(command)
        raise line unless line.start_with?('{')

        @json = Oj.load(line)
      end
    end

    # Represents a @SQ line from SAM header
    class SQLine

      # Wrap JSON object from sambamba output
      def initialize(json)
        @json = json
      end

      # Reference sequence name
      def sequence_name
        @json['sequence_name']
      end

      # Reference sequence length
      def sequence_length
        @json['sequence_length']
      end

      # Genome assembly identifier
      def assembly
        @json['assembly']
      end

      # MD5 checksum of the sequence in uppercase, with gaps and spaces removed
      def md5
        @json['md5']
      end

      # Species
      def species
        @json['species']
      end

      # URI of the sequence
      def uri
        @json['uri']
      end
    end

    # Represents @RG line from SAM header, i.e. a read group
    class RGLine

      # Wrap JSON object from sambamba output
      def initialize(json)
        @json = json
      end

      # Unique read group identifier
      def identifier
        @json['identifier']
      end

      # Name of sequencing center
      def sequencing_center
        @json['sequencing_center']
      end

      # Description
      def description
        @json['description']
      end

      # Date the run was produced (ISO8601 date or date/time)
      def date
        @json['date']
      end

      # Flow order. The array of nucleotide bases that correspond to the
      # nucleotides used for each flow of each read. Multi-base flows are
      # encoded in IUPAC format, and non-nucleotide flows by various other
      # characters.
      def flow_order
        @json['flow_order']
      end

      # The array of nucleotide bases that correspond to the key sequence of each read
      def key_sequence
        @json['key_sequence']
      end

      # Library
      def library
        @json['library']
      end

      # Programs used for processing the read group
      def programs
        @json['programs']
      end

      # Predicted median insert size
      def predicted_insert_size
        @json['predicted_insert_size']
      end

      # Platform/technology used to produce the reads
      def platform
        @json['platform']
      end

      # Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
      def platform_unit
        @json['platform_unit']
      end

      # Sample
      def sample
        @json['sample']
      end
    end

    # Represents @PG line from SAM header (program record)
    class PGLine

      # Wrap JSON object from sambamba output
      def initialize(json)
        @json = json
      end

      # Unique program record identifier
      def identifier
        @json['identifier']
      end

      # Program name
      def program_name
        @json['program_name']
      end

      # Command line
      def command_line
        @json['command_line']
      end

      # Identifier of previous program in chain
      def previous_program
        @json['previous_program']
      end

      # Program version
      def program_version
        @json['program_version']
      end
    end

  end
end
|