bio-sambamba 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.document +5 -0
  2. data/.travis.yml +12 -0
  3. data/Gemfile +11 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.md +68 -0
  6. data/Rakefile +47 -0
  7. data/VERSION +1 -0
  8. data/features/iterate-alignments.feature +40 -0
  9. data/features/random-access.feature +10 -0
  10. data/features/sam-header.feature +23 -0
  11. data/features/step_definitions/iterate-alignments_steps.rb +83 -0
  12. data/features/step_definitions/random-access_steps.rb +22 -0
  13. data/features/step_definitions/sam-header_steps.rb +56 -0
  14. data/features/step_definitions/validation-steps.rb +34 -0
  15. data/features/support/env.rb +13 -0
  16. data/features/syntax-sugar.feature +17 -0
  17. data/features/validation.feature +16 -0
  18. data/lib/bio-sambamba.rb +8 -0
  19. data/lib/bio-sambamba/alignment.rb +131 -0
  20. data/lib/bio-sambamba/alignmentiterator.rb +45 -0
  21. data/lib/bio-sambamba/bamfile.rb +45 -0
  22. data/lib/bio-sambamba/samfile.rb +25 -0
  23. data/lib/bio-sambamba/samheader.rb +194 -0
  24. data/test/data/bins.bam +0 -0
  25. data/test/data/bins.bam.bai +0 -0
  26. data/test/data/c1215_fixmate.bam +0 -0
  27. data/test/data/corrupted_zlib_archive.bam +0 -0
  28. data/test/data/duplicated_block_size.bam +0 -0
  29. data/test/data/ex1_header.bam +0 -0
  30. data/test/data/ex1_header.bam.bai +0 -0
  31. data/test/data/ex1_header.sam +3273 -0
  32. data/test/data/ex1_header.uncompressed.bam +0 -0
  33. data/test/data/no_block_size.bam +0 -0
  34. data/test/data/tags.bam +0 -0
  35. data/test/data/tags.bam.bai +0 -0
  36. data/test/data/wrong_bc_subfield_length.bam +0 -0
  37. data/test/data/wrong_extra_gzip_length.bam +0 -0
  38. metadata +184 -0
@@ -0,0 +1,13 @@
1
# Test-environment bootstrap: activates the bundled gems, puts the
# project's lib directory on the load path and loads the library together
# with RSpec's expectation helpers.
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  # Report the Bundler problem on stderr and exit with Bundler's own
  # status code.
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

# lib/ is located two directories above this file.
lib_dir = File.dirname(__FILE__) + '/../../lib/'
$LOAD_PATH.unshift(lib_dir)
require 'bio-sambamba.rb'

require 'rspec/expectations'
@@ -0,0 +1,17 @@
1
+ Feature: syntax sugar
2
+
3
+ In order to enjoy writing my scripts,
4
+ As a Rubyista,
5
+ I want some syntax sugar.
6
+
7
+ Scenario: fetching alignments
8
+ Given I have a BAM file
9
+ And associated BAI file
10
+ When I say "bam.alignments.referencing(chromosome).overlapping(500.kbp .. 600.kbp)"
11
+ Then I should get these alignments
12
+
13
+ Scenario: using shortcuts
14
+ Given I have a BAM file
15
+ And associated BAI file
16
+ When I say "bam[chromosome][500.kbp .. 600.kbp]"
17
+ Then I should get these alignments
@@ -0,0 +1,16 @@
1
+ Feature: alignment validation
2
+
3
+ In order to be able to filter out invalid reads,
4
+ As a developer,
5
+ I want validation support.
6
+
7
+ Scenario: checking single read
8
+ Given I have an alignment from a BAM file
9
+ When I call 'valid?' method
10
+ Then it should return whether it is valid or not
11
+
12
+ Scenario: iterating over valid records
13
+ Given I have a BAM file
14
+ When I want to iterate over its records
15
+ Then I should have an option to skip invalid ones
16
+ And all the reads in this case should be valid
@@ -0,0 +1,8 @@
1
+ require 'bio/command'
2
+ require 'oj'
3
+
4
+ require 'bio-sambamba/samheader.rb'
5
+ require 'bio-sambamba/alignment.rb'
6
+ require 'bio-sambamba/alignmentiterator.rb'
7
+ require 'bio-sambamba/bamfile.rb'
8
+ require 'bio-sambamba/samfile.rb'
@@ -0,0 +1,131 @@
1
module Bio
  module Bam

    # Class representing an alignment record.
    #
    # Wraps a parsed JSON object (a Hash with String keys) and exposes its
    # fields through reader methods and its bitwise flag through predicates.
    class Alignment

      # Maps each reader method name to the key it reads from the wrapped
      # JSON object.
      JSON_KEYS = {
        'tags'            => 'tags',
        'reference'       => 'rname',
        'read_name'       => 'qname',
        'position'        => 'pos',
        'mapping_quality' => 'mapq',
        'cigar_string'    => 'cigar',
        'template_length' => 'tlen',
        'flag'            => 'flag',
        'quality'         => 'qual',
        'sequence'        => 'seq',
        'mate_reference'  => 'rnext',
        'mate_position'   => 'pnext'
      }.freeze

      # Creates a new object from JSON output of sambamba tool
      #
      # ---
      # *Arguments*:
      # * _json_: parsed JSON object describing one alignment
      def initialize(json)
        @json = json
      end

      # Access a record tag
      #
      # ---
      # *Arguments*:
      # * _tag_: two-character tag name, given as a String or a Symbol
      #
      # Returns the tag value, or nil when the record has no such tag.
      # Raises RuntimeError unless the tag name is exactly two characters.
      def [](tag)
        # Tag names are String keys in the JSON object, so a Symbol has to
        # be converted before the lookup.
        name = tag.to_s
        raise 'tag length must be two' unless name.length == 2
        @json['tags'][name]
      end

      ##
      # :attr_reader: tags
      # Hash of record tags

      ##
      # :attr_reader: reference
      # Name of reference sequence

      ##
      # :attr_reader: read_name
      # Query template name

      ##
      # :attr_reader: position
      # 1-based leftmost mapping position

      ##
      # :attr_reader: mapping_quality
      # Mapping quality

      ##
      # :attr_reader: cigar_string
      # CIGAR string

      ##
      # :attr_reader: template_length
      # Observed template length

      ##
      # :attr_reader: flag
      # Bitwise flag

      ##
      # :attr_reader: quality
      # Phred-scaled base quality, an integer array
      # of the same length as the sequence

      ##
      # :attr_reader: sequence
      # Segment sequence

      ##
      # :attr_reader: mate_reference
      # Reference sequence name of the mate/next segment

      ##
      # :attr_reader: mate_position
      # 1-based leftmost position of the mate/next segment

      # The readers are generated with define_method rather than by
      # evaluating source text, so no code is built from strings.
      JSON_KEYS.each do |reader, key|
        define_method(reader) { @json[key] }
      end

      # Template having multiple segments in sequencing
      def is_paired
        flag_set?(0x1)
      end

      # Each segment properly aligned according to the aligner
      def proper_pair
        flag_set?(0x2)
      end

      # Segment unmapped
      def is_unmapped
        flag_set?(0x4)
      end

      # Next segment in the template unmapped
      def mate_is_unmapped
        flag_set?(0x8)
      end

      # Sequence being reverse complemented
      def is_reverse_strand
        flag_set?(0x10)
      end

      # Sequence of the next segment in the template being reversed
      def mate_is_reverse_strand
        flag_set?(0x20)
      end

      # The first segment in the template
      def is_first_of_pair
        flag_set?(0x40)
      end

      # The last segment in the template
      def is_second_of_pair
        flag_set?(0x80)
      end

      # Secondary alignment
      def is_secondary_alignment
        flag_set?(0x100)
      end

      # Not passing quality controls
      def failed_quality_control
        flag_set?(0x200)
      end

      # PCR or optical duplicate
      def is_duplicate
        flag_set?(0x400)
      end

      private

      # True when any bit of +mask+ is set in the bitwise flag.
      def flag_set?(mask)
        (flag & mask) != 0
      end
    end

  end
end
@@ -0,0 +1,45 @@
1
module Bio
  module Bam

    # Class for iterating through alignments.
    #
    # Runs a command through Bio::Command, expects one JSON object per
    # output line and yields each as a Bio::Bam::Alignment.
    class AlignmentIterator
      include Enumerable

      # Creates a new AlignmentIterator object which will
      # parse JSON outputted by a specified command.
      #
      # ---
      # *Arguments*:
      # * _command_: the command line as an array of strings
      def initialize(command)
        @command = command
      end

      # Iterate only through valid alignments
      #
      # Runs the same command with '--valid' appended (unless it is already
      # present). Returns an Enumerator when called without a block.
      def each_valid
        return enum_for(:each_valid) unless block_given?

        # Build a new array instead of pushing onto @command: mutating it
        # would make every later #each skip invalid reads too, and would
        # also alter the array the caller passed to the constructor.
        command = @command.include?('--valid') ? @command : @command + ['--valid']

        AlignmentIterator.new(command).each do |read|
          yield read
        end
      end

      # Iterate through all alignments skipping
      # validation checks
      #
      # Returns an Enumerator when called without a block. Raises
      # RuntimeError, with the offending line as its message, for any output
      # line that does not start with '{'.
      def each
        return enum_for(:each) unless block_given?

        Bio::Command.call_command(@command) do |io|
          io.each do |line|
            # Anything that is not a JSON object is reported as an error.
            raise line unless line.start_with?('{')
            yield Bio::Bam::Alignment.new(Oj.load(line))
          end
        end
      end
    end

  end
end
@@ -0,0 +1,45 @@
1
module Bio

  # Module for reading BAM files
  module Bam

    # Class providing access to BAM files
    class File

      # Creates an object for access to BAM file
      #
      # ---
      # *Arguments*:
      # * _filename_: path to the BAM file
      def initialize(filename)
        @filename = filename
      end

      # SAM header
      #
      # Returns a Bio::Bam::SamHeader, created on first use and then reused.
      def header
        @header ||= Bio::Bam::SamHeader.new(@filename)
      end

      # Returns an AlignmentIterator object for iterating over all alignments in the file
      def alignments
        Bio::Bam::AlignmentIterator.new ['sambamba', '--format=json', @filename]
      end

      # True if index file was found
      #
      # Checks for the filename with '.bai' appended, and for the filename
      # with its last character replaced by 'i'.
      def has_index?
        # The leading :: is required: inside this class a bare File refers
        # to Bio::Bam::File, not to Ruby's core File class.
        ::File.exist?(@filename + '.bai') ||
          ::File.exist?(@filename[0...-1] + 'i')
      end

      # Fetches alignments overlapping a region.
      # Returns an AlignmentIterator object.
      #
      # ---
      # *Arguments*:
      # * _chr_: reference sequence
      # * _region_: a Range representing an interval. Coordinates are 1-based.
      def fetch(chr, region)
        Bio::Bam::AlignmentIterator.new ['sambamba', '--format=json',
                                         @filename,
                                         "#{chr}:#{region.min}-#{region.max}"]
      end
    end

  end
end
@@ -0,0 +1,25 @@
1
module Bio
  # Module for reading SAM files
  module Sam

    # Class providing access to SAM files
    class File

      # Remembers the path of the SAM file to read.
      def initialize(filename)
        @filename = filename
      end

      # SAM header of the file, built once with the '-S' option and
      # reused on later calls.
      def header
        return @header if @header

        @header = Bio::Bam::SamHeader.new(@filename, ['-S'])
      end

      # Returns an AlignmentIterator object over every alignment in the
      # file; the command it runs includes the '-S' option.
      def alignments
        command = ['sambamba', '--format=json', '-S', @filename]
        Bio::Bam::AlignmentIterator.new(command)
      end
    end

  end
end
@@ -0,0 +1,194 @@
1
module Bio
  module Bam

    # Represents SAM header
    #
    # The header is obtained lazily by running the sambamba tool through
    # Bio::Command; both the raw text and the parsed JSON are cached.
    class SamHeader

      # Creates a new SamHeader object for a specified file,
      # specifying additional options to pass to sambamba tool
      #
      # ---
      # *Arguments*:
      # * _filename_: path to the file
      # * _opts_: array of extra command-line options (default: none)
      def initialize(filename, opts=[])
        @filename = filename
        @opts = opts
      end

      # Raw text of SAM header
      #
      # Raises RuntimeError, with the tool output as its message, when the
      # output starts with "sambamba".
      def raw_contents
        return @raw_contents unless @raw_contents.nil?

        output = Bio::Command.query_command(['sambamba', '-H', @filename] + @opts)
        # Cache only successful output; caching before this check would
        # make later calls return the error text as if it were the header.
        raise output if output.start_with? "sambamba"
        @raw_contents = output
      end

      # Format version
      def version
        json['format_version']
      end

      # Sorting order
      def sorting_order
        json['sorting_order']
      end

      # An array of SQLine objects
      def sq_lines
        @sq_lines ||= json['sq_lines'].map { |line| SQLine.new(line) }
      end

      # An array of RGLine objects
      def rg_lines
        # Cached separately from sq_lines and pg_lines; a shared cache
        # variable would hand back whichever list was requested first.
        @rg_lines ||= json['rg_lines'].map { |line| RGLine.new(line) }
      end

      # An array of PGLine objects
      def pg_lines
        @pg_lines ||= json['pg_lines'].map { |line| PGLine.new(line) }
      end

      private

      # Parsed JSON header, fetched on first use.
      def json
        @json ||= get_json
      end

      # Calls sambamba to get underlying JSON object
      #
      # Raises RuntimeError, with the tool output as its message, when the
      # output does not start with '{'.
      def get_json
        command = ['sambamba', '-H', '--format=json', @filename] + @opts
        line = Bio::Command.query_command(command)
        raise line unless line.start_with?('{')
        @json = Oj.load(line)
      end
    end

    # Represents a @SQ line from SAM header
    class SQLine

      # Names of the readers; each reads the JSON key of the same name.
      FIELDS = ['sequence_name', 'sequence_length',
                'assembly', 'md5', 'species', 'uri'].freeze

      # Wrap JSON object from sambamba output
      def initialize(json)
        @json = json
      end

      ##
      # :attr_reader: sequence_name
      # Reference sequence name

      ##
      # :attr_reader: sequence_length
      # Reference sequence length

      ##
      # :attr_reader: assembly
      # Genome assembly identifier

      ##
      # :attr_reader: md5
      # MD5 checksum of the sequence in uppercase, with gaps and spaces removed

      ##
      # :attr_reader: species
      # Species

      ##
      # :attr_reader: uri
      # URI of the sequence

      # Readers are generated with define_method rather than by evaluating
      # source text.
      FIELDS.each do |field|
        define_method(field) { @json[field] }
      end
    end

    # Represents @RG line from SAM header, i.e. a read group
    class RGLine

      # Names of the readers; each reads the JSON key of the same name.
      FIELDS = ['identifier', 'sequencing_center', 'description', 'date',
                'flow_order', 'key_sequence', 'library', 'programs',
                'predicted_insert_size', 'platform',
                'platform_unit', 'sample'].freeze

      # Wrap JSON object from sambamba output
      def initialize(json)
        @json = json
      end

      ##
      # :attr_reader: identifier
      # Unique read group identifier

      ##
      # :attr_reader: sequencing_center
      # Name of sequencing center

      ##
      # :attr_reader: description
      # Description

      ##
      # :attr_reader: date
      # Date the run was produced (ISO8601 date or date/time)

      ##
      # :attr_reader: flow_order
      # Flow order. The array of nucleotide bases that correspond to the
      # nucleotides used for each flow of each read. Multi-base flows are
      # encoded in IUPAC format, and non-nucleotide flows by various other
      # characters.

      ##
      # :attr_reader: key_sequence
      # The array of nucleotide bases that correspond to the key sequence of each read

      ##
      # :attr_reader: library
      # Library

      ##
      # :attr_reader: programs
      # Programs used for processing the read group

      ##
      # :attr_reader: predicted_insert_size
      # Predicted median insert size

      ##
      # :attr_reader: platform
      # Platform/technology used to produce the reads

      ##
      # :attr_reader: platform_unit
      # Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.

      ##
      # :attr_reader: sample
      # Sample

      # Readers are generated with define_method rather than by evaluating
      # source text.
      FIELDS.each do |field|
        define_method(field) { @json[field] }
      end
    end

    # Represents @PG line from SAM header (program record)
    class PGLine

      # Names of the readers; each reads the JSON key of the same name.
      FIELDS = ['identifier', 'program_name', 'command_line',
                'previous_program', 'program_version'].freeze

      # Wrap JSON object from sambamba output
      def initialize(json)
        @json = json
      end

      ##
      # :attr_reader: identifier
      # Unique program record identifier

      ##
      # :attr_reader: program_name
      # Program name

      ##
      # :attr_reader: command_line
      # Command line

      ##
      # :attr_reader: previous_program
      # Identifier of previous program in chain

      ##
      # :attr_reader: program_version
      # Program version

      # Readers are generated with define_method rather than by evaluating
      # source text.
      FIELDS.each do |field|
        define_method(field) { @json[field] }
      end
    end

  end
end