bio-sambamba 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.document +5 -0
  2. data/.travis.yml +12 -0
  3. data/Gemfile +11 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.md +68 -0
  6. data/Rakefile +47 -0
  7. data/VERSION +1 -0
  8. data/features/iterate-alignments.feature +40 -0
  9. data/features/random-access.feature +10 -0
  10. data/features/sam-header.feature +23 -0
  11. data/features/step_definitions/iterate-alignments_steps.rb +83 -0
  12. data/features/step_definitions/random-access_steps.rb +22 -0
  13. data/features/step_definitions/sam-header_steps.rb +56 -0
  14. data/features/step_definitions/validation-steps.rb +34 -0
  15. data/features/support/env.rb +13 -0
  16. data/features/syntax-sugar.feature +17 -0
  17. data/features/validation.feature +16 -0
  18. data/lib/bio-sambamba.rb +8 -0
  19. data/lib/bio-sambamba/alignment.rb +131 -0
  20. data/lib/bio-sambamba/alignmentiterator.rb +45 -0
  21. data/lib/bio-sambamba/bamfile.rb +45 -0
  22. data/lib/bio-sambamba/samfile.rb +25 -0
  23. data/lib/bio-sambamba/samheader.rb +194 -0
  24. data/test/data/bins.bam +0 -0
  25. data/test/data/bins.bam.bai +0 -0
  26. data/test/data/c1215_fixmate.bam +0 -0
  27. data/test/data/corrupted_zlib_archive.bam +0 -0
  28. data/test/data/duplicated_block_size.bam +0 -0
  29. data/test/data/ex1_header.bam +0 -0
  30. data/test/data/ex1_header.bam.bai +0 -0
  31. data/test/data/ex1_header.sam +3273 -0
  32. data/test/data/ex1_header.uncompressed.bam +0 -0
  33. data/test/data/no_block_size.bam +0 -0
  34. data/test/data/tags.bam +0 -0
  35. data/test/data/tags.bam.bai +0 -0
  36. data/test/data/wrong_bc_subfield_length.bam +0 -0
  37. data/test/data/wrong_extra_gzip_length.bam +0 -0
  38. metadata +184 -0
@@ -0,0 +1,13 @@
1
require 'bundler'

# Activate the :default and :development gem groups. When Bundler cannot
# satisfy them, print its message plus a hint and exit with the status
# code Bundler attached to the error.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message, "Run `bundle install` to install missing gems"
  exit error.status_code
end

# Put the gem's lib/ directory (two levels up from this file) at the
# front of the load path so the local sources are the ones required.
lib_dir = "#{File.dirname(__FILE__)}/../../lib/"
$LOAD_PATH.unshift(lib_dir)
require 'bio-sambamba.rb'

require 'rspec/expectations'
@@ -0,0 +1,17 @@
1
+ Feature: syntax sugar
2
+
3
+ In order to enjoy writing my scripts,
4
+ As a Rubyista,
5
+ I want some syntax sugar.
6
+
7
+ Scenario: fetching alignments
8
+ Given I have a BAM file
9
+ And associated BAI file
10
+ When I say "bam.alignments.referencing(chromosome).overlapping(500.kbp .. 600.kbp)"
11
+ Then I should get these alignments
12
+
13
+ Scenario: using shortcuts
14
+ Given I have a BAM file
15
+ And associated BAI file
16
+ When I say "bam[chromosome][500.kbp .. 600.kbp]"
17
+ Then I should get these alignments
@@ -0,0 +1,16 @@
1
+ Feature: alignment validation
2
+
3
+ In order to be able to filter out invalid reads,
4
+ As a developer,
5
+ I want validation support.
6
+
7
+ Scenario: checking single read
8
+ Given I have an alignment from a BAM file
9
+ When I call 'valid?' method
10
+ Then it should return whether it is valid or not
11
+
12
+ Scenario: iterating over valid records
13
+ Given I have a BAM file
14
+ When I want to iterate over its records
15
+ Then I should have an option to skip invalid ones
16
+ And all the reads in this case should be valid
@@ -0,0 +1,8 @@
1
+ require 'bio/command'
2
+ require 'oj'
3
+
4
+ require 'bio-sambamba/samheader.rb'
5
+ require 'bio-sambamba/alignment.rb'
6
+ require 'bio-sambamba/alignmentiterator.rb'
7
+ require 'bio-sambamba/bamfile.rb'
8
+ require 'bio-sambamba/samfile.rb'
@@ -0,0 +1,131 @@
1
+ module Bio
2
+ module Bam
3
+
4
+ # Class representing an alignment record
5
class Alignment

  # Creates a new object from JSON output of sambamba tool.
  #
  # json:: the already-parsed JSON object of one record; it is stored
  #        as-is and every reader below is a lookup into it.
  def initialize(json)
    @json = json
  end

  # Access a record tag.
  #
  # tag:: tag name; its +length+ must be exactly two.
  #
  # Returns whatever is stored under that name in the record's 'tags'
  # object (nil when the tag is absent).
  # Raises RuntimeError ('tag length must be two') for any other length.
  def [](tag)
    raise 'tag length must be two' unless tag.length == 2
    @json['tags'][tag]
  end

  # NOTE: the <tt>attr_reader ... if false</tt> statements below never
  # execute. They are kept from the original source, presumably so that
  # documentation tools list the readers; the real methods are generated
  # by the +define_method+ loop that follows them.

  # Hash of record tags
  attr_reader :tags if false

  # Name of reference sequence
  attr_reader :reference if false

  # Query template name
  attr_reader :read_name if false

  # 1-based leftmost mapping position
  attr_reader :position if false

  # Mapping quality
  attr_reader :mapping_quality if false

  # CIGAR string
  attr_reader :cigar_string if false

  # Observed template length
  attr_reader :template_length if false

  # Bitwise flag
  attr_reader :flag if false

  # Phred-scaled base quality, an integer array
  # of the same length as the sequence
  attr_reader :quality if false

  # Segment sequence
  attr_reader :sequence if false

  # Reference sequence name of the mate/next segment
  attr_reader :mate_reference if false

  # 1-based leftmost position of the mate/next segment
  attr_reader :mate_position if false

  # Reader name => key in the underlying JSON object. The readers are
  # built with define_method instead of evaluating generated source text.
  {'tags'            => 'tags',
   'reference'       => 'rname',
   'read_name'       => 'qname',
   'position'        => 'pos',
   'mapping_quality' => 'mapq',
   'cigar_string'    => 'cigar',
   'template_length' => 'tlen',
   'flag'            => 'flag',
   'quality'         => 'qual',
   'sequence'        => 'seq',
   'mate_reference'  => 'rnext',
   'mate_position'   => 'pnext'}.each do |reader, json_key|
    define_method(reader) { @json[json_key] }
  end

  # Template having multiple segments in sequencing
  def is_paired
    flag_set?(0x1)
  end

  # Each segment properly aligned according to the aligner
  def proper_pair
    flag_set?(0x2)
  end

  # Segment unmapped
  def is_unmapped
    flag_set?(0x4)
  end

  # Next segment in the template unmapped
  def mate_is_unmapped
    flag_set?(0x8)
  end

  # Sequence being reverse complemented
  def is_reverse_strand
    flag_set?(0x10)
  end

  # Sequence of the next segment in the template being reversed
  def mate_is_reverse_strand
    flag_set?(0x20)
  end

  # The first segment in the template
  def is_first_of_pair
    flag_set?(0x40)
  end

  # The last segment in the template
  def is_second_of_pair
    flag_set?(0x80)
  end

  # Secondary alignment
  def is_secondary_alignment
    flag_set?(0x100)
  end

  # Not passing quality controls
  def failed_quality_control
    flag_set?(0x200)
  end

  # PCR or optical duplicate
  def is_duplicate
    flag_set?(0x400)
  end

  private

  # True when any bit of +mask+ is set in the record's bitwise flag.
  def flag_set?(mask)
    (flag & mask) != 0
  end
end
129
+
130
+ end
131
+ end
@@ -0,0 +1,45 @@
1
+ module Bio
2
+ module Bam
3
+
4
+ # Class for iterating through alignments
5
class AlignmentIterator
  include Enumerable

  # Creates a new AlignmentIterator object which will
  # parse JSON outputted by a specified command.
  #
  # command:: the command as an array of strings (program name followed
  #           by its arguments); it is handed to Bio::Command.call_command
  #           each time iteration starts.
  def initialize(command)
    @command = command
  end

  # Iterate only through valid alignments.
  #
  # Runs the same command with '--valid' appended (unless it is already
  # present) and yields each record as a Bio::Bam::Alignment.
  # Returns an Enumerator when no block is given.
  def each_valid(&block)
    return enum_for(:each_valid) unless block_given?

    # Build a new array instead of pushing onto @command: pushing would
    # permanently add '--valid' to this iterator (and to the array the
    # caller passed in), silently turning every later #each into a
    # valid-only iteration.
    command = @command.include?('--valid') ? @command : @command + ['--valid']

    AlignmentIterator.new(command).each(&block)
  end

  # Iterate through all alignments skipping
  # validation checks.
  #
  # Each output line of the command is expected to be one JSON object;
  # a line that does not start with '{' is raised as a RuntimeError whose
  # message is that line. Returns an Enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?

    Bio::Command.call_command(@command) do |io|
      io.each do |line|
        raise line unless line[0] == '{'
        yield Bio::Bam::Alignment.new(Oj.load(line))
      end
    end
  end
end
43
+
44
+ end
45
+ end
@@ -0,0 +1,45 @@
1
module Bio

  # Module for reading BAM files
  module Bam

    # Class providing access to BAM files.
    #
    # NOTE: inside this class the bare constant +File+ refers to this
    # class itself, so the core class must be written as <tt>::File</tt>.
    class File

      # Creates an object for access to BAM file.
      #
      # filename:: path to the BAM file; it is only stored here and is
      #            passed to the sambamba command line later.
      def initialize(filename)
        @filename = filename
      end

      # SAM header (a Bio::Bam::SamHeader, created on first use and reused).
      def header
        @header ||= Bio::Bam::SamHeader.new(@filename)
      end

      # Returns an AlignmentIterator object for iterating over all alignments in the file
      def alignments
        Bio::Bam::AlignmentIterator.new ['sambamba', '--format=json', @filename]
      end

      # True if index file was found.
      #
      # Two locations are checked: the filename with '.bai' appended, and
      # the filename with its last character replaced by 'i'
      # (so 'reads.bam' is looked up as 'reads.bai').
      def has_index?
        # ::File.exist? rather than File::File.exists?: the latter relied
        # on reaching the top-level File through a scoped constant lookup
        # and on the deprecated exists? alias, neither of which is
        # available on current Ruby versions.
        ::File.exist?(@filename + '.bai') ||
          ::File.exist?(@filename[0...-1] + 'i')
      end

      # Fetches alignments overlapping a region.
      # Returns an AlignmentIterator object.
      #
      # ---
      # *Arguments*:
      # * _chr_: reference sequence
      # * _region_: a Range representing an interval. Coordinates are 1-based.
      def fetch(chr, region)
        Bio::Bam::AlignmentIterator.new ['sambamba', '--format=json',
                                         @filename,
                                         "#{chr}:#{region.min}-#{region.max}"]
      end
    end

  end
end
@@ -0,0 +1,25 @@
1
module Bio
  # Module for reading SAM files
  module Sam

    # Gives access to a SAM file by building sambamba command lines that
    # carry the '-S' option.
    class File

      # Remembers the path of the SAM file; nothing is read at this point.
      def initialize(filename)
        @filename = filename
      end

      # The file's SAM header: a Bio::Bam::SamHeader constructed with the
      # extra option '-S', created on first call and reused afterwards.
      def header
        return @header if @header
        @header = Bio::Bam::SamHeader.new(@filename, ['-S'])
      end

      # Builds an AlignmentIterator that goes over every alignment in the file.
      def alignments
        sambamba_command = ['sambamba', '--format=json', '-S', @filename]
        Bio::Bam::AlignmentIterator.new(sambamba_command)
      end
    end

  end
end
@@ -0,0 +1,194 @@
1
+ module Bio
2
+ module Bam
3
+
4
+ # Represents SAM header
5
class SamHeader

  # Creates a new SamHeader object for a specified file,
  # specifying additional options to pass to sambamba tool.
  #
  # filename:: path handed to sambamba
  # opts::     array of extra command-line arguments, appended after the
  #            filename in every sambamba invocation made by this object
  def initialize(filename, opts=[])
    @filename = filename
    @opts = opts
  end

  # Raw text of SAM header.
  #
  # Runs <tt>sambamba -H</tt> on first use and caches the output.
  # Output beginning with "sambamba" is treated as an error report and
  # raised as a RuntimeError carrying that text. It is deliberately not
  # cached, so a repeated call raises again instead of returning the
  # error message as if it were the header.
  def raw_contents
    return @raw_contents unless @raw_contents.nil?

    text = Bio::Command.query_command(['sambamba', '-H', @filename] + @opts)
    raise text if text.start_with? "sambamba"
    @raw_contents = text
  end

  # Format version
  def version
    json['format_version']
  end

  # Sorting order
  def sorting_order
    json['sorting_order']
  end

  # An array of SQLine objects
  def sq_lines
    @sq_lines ||= json['sq_lines'].map { |line| SQLine.new(line) }
  end

  # An array of RGLine objects
  def rg_lines
    # Memoized in its own variable; sharing @sq_lines made this accessor
    # return whichever of the three line lists had been requested first.
    @rg_lines ||= json['rg_lines'].map { |line| RGLine.new(line) }
  end

  # An array of PGLine objects
  def pg_lines
    # Own memo variable, for the same reason as in #rg_lines.
    @pg_lines ||= json['pg_lines'].map { |line| PGLine.new(line) }
  end

  private

  # Parsed JSON form of the header, fetched from sambamba on first use.
  def json
    @json ||= get_json
  end

  # Calls sambamba to get underlying JSON object.
  # Output that does not start with '{' is raised as a RuntimeError
  # whose message is that output.
  def get_json
    command = ['sambamba', '-H', '--format=json', @filename] + @opts
    line = Bio::Command.query_command(command)
    raise line if line[0] != '{'
    @json = Oj.load(line)
  end
end
64
+
65
+ # Represents a @SQ line from SAM header
66
class SQLine

  # Wrap JSON object from sambamba output.
  #
  # json:: the parsed JSON object of one @SQ line; every reader below is
  #        a lookup of the same-named key in it.
  def initialize(json)
    @json = json
  end

  # NOTE: the <tt>attr_reader ... if false</tt> statements never execute.
  # They are kept from the original source, presumably so documentation
  # tools list the readers; the real methods come from the +define_method+
  # loop at the end of the class.

  # Reference sequence name
  attr_reader :sequence_name if false

  # Reference sequence length
  attr_reader :sequence_length if false

  # Genome assembly identifier
  attr_reader :assembly if false

  # MD5 checksum of the sequence in uppercase, with gaps and spaces removed
  attr_reader :md5 if false

  # Species
  attr_reader :species if false

  # URI of the sequence
  attr_reader :uri if false

  # One reader per field, defined without evaluating generated source text.
  ['sequence_name', 'sequence_length',
   'assembly', 'md5', 'species', 'uri'].each do |sq_line_field|
    define_method(sq_line_field) { @json[sq_line_field] }
  end
end
100
+
101
+ # Represents @RG line from SAM header, i.e. a read group
102
class RGLine

  # Wrap JSON object from sambamba output.
  #
  # json:: the parsed JSON object of one @RG line; every reader below is
  #        a lookup of the same-named key in it.
  def initialize(json)
    @json = json
  end

  # NOTE: the <tt>attr_reader ... if false</tt> statements never execute.
  # They are kept from the original source, presumably so documentation
  # tools list the readers; the real methods come from the +define_method+
  # loop at the end of the class.

  # Unique read group identifier
  attr_reader :identifier if false

  # Name of sequencing center
  attr_reader :sequencing_center if false

  # Description
  attr_reader :description if false

  # Date the run was produced (ISO8601 date or date/time)
  attr_reader :date if false

  # Flow order. The array of nucleotide bases that correspond to the
  # nucleotides used for each flow of each read. Multi-base flows are
  # encoded in IUPAC format, and non-nucleotide flows by various other
  # characters.
  attr_reader :flow_order if false

  # The array of nucleotide bases that correspond to the key sequence of each read
  attr_reader :key_sequence if false

  # Library
  attr_reader :library if false

  # Programs used for processing the read group
  attr_reader :programs if false

  # Predicted median insert size
  attr_reader :predicted_insert_size if false

  # Platform/technology used to produce the reads
  attr_reader :platform if false

  # Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
  attr_reader :platform_unit if false

  # Sample
  attr_reader :sample if false

  # One reader per field, defined without evaluating generated source text.
  ['identifier', 'sequencing_center', 'description', 'date',
   'flow_order', 'key_sequence', 'library', 'programs',
   'predicted_insert_size', 'platform',
   'platform_unit', 'sample'].each do |rg_line_field|
    define_method(rg_line_field) { @json[rg_line_field] }
  end
end
159
+
160
+ # Represents @PG line from SAM header (program record)
161
class PGLine

  # Wrap JSON object from sambamba output.
  #
  # json:: the parsed JSON object of one @PG line; every reader below is
  #        a lookup of the same-named key in it.
  def initialize(json)
    @json = json
  end

  # NOTE: the <tt>attr_reader ... if false</tt> statements never execute.
  # They are kept from the original source, presumably so documentation
  # tools list the readers; the real methods come from the +define_method+
  # loop at the end of the class.

  # Unique program record identifier
  attr_reader :identifier if false

  # Program name
  attr_reader :program_name if false

  # Command line
  attr_reader :command_line if false

  # Identifier of previous program in chain
  attr_reader :previous_program if false

  # Program version
  attr_reader :program_version if false

  # One reader per field, defined without evaluating generated source text.
  ['identifier', 'program_name', 'command_line',
   'previous_program', 'program_version'].each do |pg_line_field|
    define_method(pg_line_field) { @json[pg_line_field] }
  end
end
192
+
193
+ end
194
+ end