bio-sambamba 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,7 +1,7 @@
1
1
  source "http://rubygems.org"
2
2
 
3
3
  gem "bio", "~> 1.4.2"
4
- gem "oj", "~> 1.3.4"
4
+ gem "msgpack", "~> 0.4.7"
5
5
 
6
6
  group :development do
7
7
  gem "bundler", "~> 1.1.4"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
data/lib/bio-sambamba.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'bio/command'
2
- require 'oj'
2
+ require 'json'
3
+ require 'msgpack'
3
4
 
4
5
  require 'bio-sambamba/exception.rb'
5
6
  require 'bio-sambamba/samheader.rb'
@@ -4,95 +4,101 @@ module Bio
4
4
  # Class representing an alignment record
5
5
  class Alignment
6
6
 
7
- # Creates a new object from JSON output of sambamba tool
8
- def initialize(json)
9
- @json = json
7
+ # Creates a new object from MessagePack record
8
+ def initialize(obj, reference_sequence_names)
9
+ @obj = obj
10
+ @reference = reference_sequence_names[ref_id]
11
+ @mate_reference = reference_sequence_names[mate_ref_id]
10
12
  end
11
13
 
12
14
  # Access a record tag
13
15
  def [](tag)
14
16
  raise 'tag length must be two' unless tag.length == 2
15
- @json['tags'][tag]
17
+ tags[tag]
16
18
  end
17
19
 
18
20
  def ==(read)
19
- read.json == json
21
+ read.obj == obj
20
22
  end
21
23
 
22
24
  # Hash of record tags
23
- attr_reader :tags if false
25
+ def tags
26
+ obj[12]
27
+ end
24
28
 
25
- # Name of reference sequence
26
- attr_reader :reference if false
29
+ # ID of reference sequence
30
+ def ref_id
31
+ obj[2]
32
+ end
27
33
 
28
34
  # Query template name
29
- attr_reader :read_name if false
35
+ def read_name
36
+ obj[0]
37
+ end
30
38
 
31
39
  # 1-based leftmost mapping position
32
- attr_reader :position if false
40
+ def position
41
+ obj[3]
42
+ end
33
43
 
34
44
  # Mapping quality
35
- attr_reader :mapping_quality if false
45
+ def mapping_quality
46
+ obj[4]
47
+ end
48
+
49
+ # CIGAR: pairs of operations and lengths,
50
+ # or nil if information is not available
51
+ def cigar_operations
52
+ return nil if obj[5].nil?
53
+ obj[6].chars.zip obj[5]
54
+ end
36
55
 
37
56
  # CIGAR string
38
- attr_reader :cigar if false
57
+ def cigar
58
+ return '*' if cigar_operations.nil?
59
+ cigar_operations.reduce(''){|s, op_len| s + op_len[0] + op_len[1].to_s}
60
+ end
39
61
 
40
62
  # Observed template length
41
- attr_reader :template_length if false
63
+ def template_length
64
+ obj[9]
65
+ end
42
66
 
43
67
  # Bitwise flag
44
- attr_reader :flag if false
68
+ def flag
69
+ obj[1]
70
+ end
45
71
 
46
72
  # Phred-scaled base quality, an integer array
47
73
  # of the same length as the sequence
48
- attr_reader :quality if false
74
+ def quality
75
+ obj[11].bytes.to_a
76
+ end
49
77
 
50
78
  # Segment sequence
51
- attr_reader :sequence if false
79
+ def sequence
80
+ obj[10]
81
+ end
52
82
 
53
83
  # Reference sequence name of the mate/next segment
54
- attr_reader :mate_reference if false
84
+ def mate_ref_id
85
+ obj[7]
86
+ end
55
87
 
56
88
  # 1-based leftmost position of the mate/next segment
57
- attr_reader :mate_position if false
89
+ def mate_position
90
+ obj[8]
91
+ end
58
92
 
59
93
  # The number of reference bases covered
60
94
  def bases_covered
61
- return 0 if cigar == '*'
62
- cigar.split(/([MIDNSHP=X])/).each_slice(2).reduce(0) {|res, op|
63
- res += op[0].to_i unless ('M=XDN'.index op[1]).nil?
95
+ return 0 if cigar_operations.nil?
96
+ cigar_operations.reduce(0) {|res, op|
97
+ res += op[1] unless ('M=XDN'.index op[0]).nil?
64
98
  res
65
99
  }
66
100
  end
67
-
68
- {'position' => 'pos',
69
- 'mapping_quality' => 'mapq',
70
- 'template_length' => 'tlen',
71
- 'flag' => 'flag',
72
- 'mate_position' => 'pnext'
73
- }.each do |k, v|
74
- eval <<-DEFINE_READER
75
- def #{k}
76
- @json['#{v}'].to_i
77
- end
78
- DEFINE_READER
79
- end
80
-
81
- {'tags' => 'tags',
82
- 'reference' => 'rname',
83
- 'read_name' => 'qname',
84
- 'cigar' => 'cigar',
85
- 'quality' => 'qual',
86
- 'sequence' => 'seq',
87
- 'mate_reference' => 'rnext'
88
- }.each do |k, v|
89
- eval <<-DEFINE_READER
90
- def #{k}
91
- @json['#{v}']
92
- end
93
- DEFINE_READER
94
- end
95
-
101
+
96
102
  # Template having multiple segments in sequencing
97
103
  def is_paired
98
104
  (flag & 0x1) != 0
@@ -148,8 +154,14 @@ module Bio
148
154
  (flag & 0x400) != 0
149
155
  end
150
156
 
157
+ # Reference sequence name
158
+ attr_reader :reference
159
+
160
+ # Mate reference sequence name
161
+ attr_reader :mate_reference
162
+
151
163
  private
152
- attr_accessor :json
164
+ attr_accessor :obj
153
165
 
154
166
  end
155
167
 
@@ -8,8 +8,10 @@ module Bio
8
8
 
9
9
  # Creates a new AlignmentIterator object which will
10
10
  # parse JSON outputted by a specified command.
11
- def initialize(command)
11
+ # Names of reference sequences must be provided as well.
12
+ def initialize(command, references)
12
13
  @command = command
14
+ @references = references
13
15
  end
14
16
 
15
17
  # Iterate only through valid alignments
@@ -17,12 +19,14 @@ module Bio
17
19
 
18
20
  return enum_for(:each_valid) if not block_given?
19
21
 
20
- command = @command
22
+ command = get_command
21
23
  if command.index('--valid').nil?
22
24
  command.push '--valid'
23
25
  end
24
26
 
25
- AlignmentIterator.new(command).each do |read|
27
+ iter = self.clone
28
+ iter.command = command
29
+ iter.each do |read|
26
30
  yield read
27
31
  end
28
32
  end
@@ -55,10 +59,21 @@ module Bio
55
59
  command = get_command
56
60
 
57
61
  Bio::Command.call_command_open3(command) do |pin, pout, perr|
58
- pout.each do |line|
59
- json = Oj.load(line)
60
- yield Bio::Bam::Alignment.new(json)
62
+
63
+ counter = 0 # for triggering garbage collection manually
64
+ unpacker = MessagePack::Unpacker.new pout
65
+
66
+ begin
67
+ unpacker.each do |obj|
68
+ counter += 1
69
+ yield Bio::Bam::Alignment.new(obj, @references)
70
+ if (counter & 0xFFF) == 0 then
71
+ ObjectSpace.garbage_collect
72
+ end
73
+ end
74
+ rescue EOFError
61
75
  end
76
+
62
77
  raise_exception_if_stderr_is_not_empty(perr)
63
78
  end
64
79
  end
@@ -101,7 +116,7 @@ module Bio
101
116
  end
102
117
 
103
118
  def clone
104
- iter = AlignmentIterator.new @command
119
+ iter = AlignmentIterator.new @command, @references
105
120
  iter.chromosome = chromosome
106
121
  iter.region = region
107
122
  iter
@@ -29,7 +29,8 @@ module Bio
29
29
 
30
30
  # Returns an AlignmentIterator object for iterating over all alignments in the file
31
31
  def alignments
32
- Bio::Bam::AlignmentIterator.new ['sambamba', 'view', '--format', 'json', @filename]
32
+ cmdline = ['sambamba', 'view', '--format', 'msgpack', @filename]
33
+ Bio::Bam::AlignmentIterator.new(cmdline, reference_sequence_names)
33
34
  end
34
35
 
35
36
  # True if index file was found
@@ -47,16 +48,25 @@ module Bio
47
48
  # * _chr_: reference sequence
48
49
  # * _region_: a Range representing an interval. Coordinates are 1-based.
49
50
  def fetch(chr, region)
50
- iter = Bio::Bam::AlignmentIterator.new ['sambamba', 'view', '--format=json',
51
- @filename]
51
+ cmdline = ['sambamba', 'view', '--format=msgpack', @filename]
52
+ iter = Bio::Bam::AlignmentIterator.new(cmdline, reference_sequence_names)
52
53
  iter.chromosome = chr
53
54
  iter.region = region
54
55
  iter
55
56
  end
56
57
 
58
+ def reference_sequences
59
+ @reference_sequences ||= JSON.parse(Bio::Command.query_command ['sambamba', 'view', '-I', @filename])
60
+ end
61
+
57
62
  def [](chr)
58
63
  fetch(chr, nil)
59
64
  end
65
+
66
+ private
67
+ def reference_sequence_names
68
+ @reference_sequence_names ||= reference_sequences.map {|ref| ref['name']}
69
+ end
60
70
  end # class File
61
71
 
62
72
  end # module Bam
@@ -20,7 +20,17 @@ module Bio
20
20
 
21
21
  # Returns an AlignmentIterator object for iterating over all alignments in the file
22
22
  def alignments
23
- Bio::Bam::AlignmentIterator.new ['sambamba', 'view', '--format', 'json', '-S', @filename]
23
+ cmdline = ['sambamba', 'view', '--format', 'msgpack', '-S', @filename],
24
+ Bio::Bam::AlignmentIterator.new(cmdline, reference_sequences)
25
+ end
26
+
27
+ def reference_sequences
28
+ @reference_sequences ||= Oj.load(Bio::Command.query_command ['sambamba', 'view', '-I', '-S', @filename])
29
+ end
30
+
31
+ private
32
+ def reference_sequence_names
33
+ @reference_sequence_names ||= reference_sequences.map {|ref| ref['name']}
24
34
  end
25
35
  end
26
36
 
@@ -26,172 +26,185 @@ module Bio
26
26
 
27
27
  # Format version
28
28
  def version
29
- @json ||= get_json
30
- @json['format_version']
29
+ obj[0]
31
30
  end
32
31
 
33
32
  # Sorting order
34
33
  def sorting_order
35
- @json ||= get_json
36
- @json['sorting_order']
34
+ obj[1]
37
35
  end
38
36
 
39
37
  # An array of SQLine objects
40
38
  def sq_lines
41
- @json ||= get_json
42
- @sq_lines ||= @json['sq_lines'].map{|json| SQLine.new(json)}
39
+ @sq_lines ||= obj[2].map{|rec| SQLine.new(rec)}
43
40
  end
44
41
 
45
42
  # An array of RGLine objects
46
43
  def rg_lines
47
- @json ||= get_json
48
- @sq_lines ||= @json['rg_lines'].map{|json| RGLine.new(json)}
44
+ @rg_lines ||= obj[3].map{|rec| RGLine.new(rec)}
49
45
  end
50
46
 
51
- # @return [PGLine] array of @PG lines
47
+ # An array of PGLine objects
52
48
  def pg_lines
53
- @json ||= get_json
54
- @sq_lines ||= @json['pg_lines'].map{|json| PGLine.new(json)}
49
+ @pg_lines ||= obj[4].map{|rec| PGLine.new(rec)}
55
50
  end
56
51
 
57
52
  private
58
- # Calls sambamba to get underlying JSON object
59
- def get_json
60
- cmd = ['sambamba', 'view', '-H', '--format=json', @filename] + @opts
53
+ def obj
54
+ return @obj unless @obj.nil?
55
+ cmd = ['sambamba', 'view', '-H', '--format', 'msgpack', @filename] + @opts
61
56
  line = ''
62
57
  Bio::Command.call_command_open3(cmd) do |pin, pout, perr|
63
- line = pout.read
58
+ @obj = MessagePack.unpack(pout.read)
64
59
  raise_exception_if_stderr_is_not_empty(perr)
65
60
  end
66
- @json = Oj.load(line)
61
+ @obj
67
62
  end
68
63
  end
69
64
 
70
65
  # Represents a @SQ line from SAM header
71
66
  class SQLine
72
67
 
73
- # Wrap JSON object from sambamba output
74
- def initialize(json)
75
- @json = json
68
+ # Wrap MessagePack record from sambamba output
69
+ def initialize(obj)
70
+ @obj = obj
76
71
  end
77
72
 
78
73
  # Reference sequence name
79
- attr_reader :sequence_name if false
74
+ def sequence_name
75
+ @obj['SN']
76
+ end
80
77
 
81
78
  # Reference sequence length
82
- attr_reader :sequence_length if false
79
+ def sequence_length
80
+ @obj['LN']
81
+ end
83
82
 
84
83
  # Genome assembly identifier
85
- attr_reader :assembly if false
84
+ def assembly
85
+ @obj['AS']
86
+ end
86
87
 
87
88
  # MD5 checksum of the sequence in uppercase, with gaps and spaces removed
88
- attr_reader :md5 if false
89
+ def md5
90
+ @obj['M5']
91
+ end
89
92
 
90
93
  # Species
91
- attr_reader :species if false
94
+ def species
95
+ @obj['SP']
96
+ end
92
97
 
93
98
  # URI of the sequence
94
- attr_reader :uri if false
95
-
96
- ['sequence_name', 'sequence_length',
97
- 'assembly', 'md5', 'species', 'uri'].each do |sq_line_field|
98
- eval <<-DEFINE_READER
99
- def #{sq_line_field}
100
- @json['#{sq_line_field}']
101
- end
102
- DEFINE_READER
99
+ def uri
100
+ @obj['UR']
103
101
  end
102
+
104
103
  end
105
104
 
106
105
  # Represents @RG line from SAM header, i.e. a read group
107
106
  class RGLine
108
107
 
109
- # Wrap JSON object from sambamba output
110
- def initialize(json)
111
- @json = json
108
+ # Wrap MessagePack record from sambamba output
109
+ def initialize(obj)
110
+ @obj = obj
112
111
  end
113
112
 
114
113
  # Unique read group identifier
115
- attr_reader :identifier if false
114
+ def identifier
115
+ @obj['ID']
116
+ end
116
117
 
117
118
  # Name of sequencing center
118
- attr_reader :sequencing_center if false
119
+ def sequencing_center
120
+ @obj['CN']
121
+ end
119
122
 
120
123
  # Description
121
- attr_reader :description if false
124
+ def description
125
+ @obj['DS']
126
+ end
122
127
 
123
128
  # Date the run was produced (ISO8601 date or date/time)
124
- attr_reader :date if false
129
+ def date
130
+ @obj['DT']
131
+ end
125
132
 
126
133
  # Flow order. The array of nucleotide bases that correspond to the
127
134
  # nucleotides used for each flow of each read. Multi-base flows are
128
135
  # encoded in IUPAC format, and non-nucleotide flows by various other
129
136
  # characters.
130
- attr_reader :flow_order if false
137
+ def flow_order
138
+ @obj['FO']
139
+ end
131
140
 
132
141
  # The array of nucleotide bases that correspond to the key sequence of each read
133
- attr_reader :key_sequence if false
142
+ def key_sequence
143
+ @obj['KS']
144
+ end
134
145
 
135
146
  # Library
136
- attr_reader :library if false
147
+ def library
148
+ @obj['LB']
149
+ end
137
150
 
138
151
  # Programs used for processing the read group
139
- attr_reader :programs if false
152
+ def programs
153
+ @obj['PG']
154
+ end
140
155
 
141
156
  # Predicted median insert size
142
- attr_reader :predicted_insert_size if false
157
+ def predicted_insert_size
158
+ @obj['PI']
159
+ end
143
160
 
144
161
  # Platform/technology used to produce the reads
145
- attr_reader :platform if false
162
+ def platform
163
+ @obj['PL']
164
+ end
146
165
 
147
- # Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
148
- attr_reader :platform_unit if false
166
+ # Platform unit (e.g. flowcell-barcode lane for Illumina or slide for SOLiD). Unique identifier.
167
+ def platform_unit
168
+ @obj['PU']
169
+ end
149
170
 
150
171
  # Sample
151
- attr_reader :sample if false
152
-
153
- ['identifier', 'sequencing_center', 'description', 'date',
154
- 'flow_order', 'key_sequence', 'library', 'programs',
155
- 'predicted_insert_size', 'platform',
156
- 'platform_unit', 'sample'].each do |rg_line_field|
157
- eval <<-DEFINE_READER
158
- def #{rg_line_field}
159
- @json['#{rg_line_field}']
160
- end
161
- DEFINE_READER
172
+ def sample
173
+ @obj['SM']
162
174
  end
163
175
  end
164
176
 
165
177
  # Represents @PG line from SAM header (program record)
166
178
  class PGLine
167
179
 
168
- # Wrap JSON object from sambamba output
169
- def initialize(json)
170
- @json = json
180
+ # Wrap MessagePack record from sambamba output
181
+ def initialize(obj)
182
+ @obj = obj
171
183
  end
172
184
 
173
185
  # Unique program record identifier
174
- attr_reader :identifier if false
186
+ def identifier
187
+ @obj['ID']
188
+ end
175
189
 
176
190
  # Program name
177
- attr_reader :program_name if false
191
+ def program_name
192
+ @obj['PN']
193
+ end
178
194
 
179
195
  # Command line
180
- attr_reader :command_line if false
196
+ def command_line
197
+ @obj['CL']
198
+ end
181
199
 
182
200
  # Identifier of previous program in chain
183
- attr_reader :previous_program if false
201
+ def previous_program
202
+ @obj['PP']
203
+ end
184
204
 
185
205
  # Program version
186
- attr_reader :program_version if false
187
-
188
- ['identifier', 'program_name', 'command_line',
189
- 'previous_program', 'program_version'].each do |rg_line_field|
190
- eval <<-DEFINE_READER
191
- def #{rg_line_field}
192
- @json['#{rg_line_field}']
193
- end
194
- DEFINE_READER
206
+ def program_version
207
+ @obj['VN']
195
208
  end
196
209
  end
197
210
 
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-sambamba
3
3
  version: !ruby/object:Gem::Version
4
- hash: 1889055196096400351
5
4
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 3
10
- version: 0.0.3
5
+ version: 0.0.4
11
6
  platform: ruby
12
7
  authors:
13
8
  - Artem Tarasov
@@ -15,104 +10,74 @@ autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
12
 
18
- date: 2012-08-15 00:00:00 Z
13
+ date: 2012-08-20 00:00:00 Z
19
14
  dependencies:
20
15
  - !ruby/object:Gem::Dependency
21
- type: :runtime
22
- prerelease: false
23
16
  name: bio
24
- version_requirements: &id001 !ruby/object:Gem::Requirement
17
+ requirement: &id001 !ruby/object:Gem::Requirement
25
18
  none: false
26
19
  requirements:
27
20
  - - ~>
28
21
  - !ruby/object:Gem::Version
29
- hash: 3725784833518553922
30
- segments:
31
- - 1
32
- - 4
33
- - 2
34
22
  version: 1.4.2
35
- requirement: *id001
36
- - !ruby/object:Gem::Dependency
37
23
  type: :runtime
38
24
  prerelease: false
39
- name: oj
40
- version_requirements: &id002 !ruby/object:Gem::Requirement
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: msgpack
28
+ requirement: &id002 !ruby/object:Gem::Requirement
41
29
  none: false
42
30
  requirements:
43
31
  - - ~>
44
32
  - !ruby/object:Gem::Version
45
- hash: 3635686038185777981
46
- segments:
47
- - 1
48
- - 3
49
- - 4
50
- version: 1.3.4
51
- requirement: *id002
52
- - !ruby/object:Gem::Dependency
53
- type: :development
33
+ version: 0.4.7
34
+ type: :runtime
54
35
  prerelease: false
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
55
38
  name: bundler
56
- version_requirements: &id003 !ruby/object:Gem::Requirement
39
+ requirement: &id003 !ruby/object:Gem::Requirement
57
40
  none: false
58
41
  requirements:
59
42
  - - ~>
60
43
  - !ruby/object:Gem::Version
61
- hash: 1287940794363108929
62
- segments:
63
- - 1
64
- - 1
65
- - 4
66
44
  version: 1.1.4
67
- requirement: *id003
68
- - !ruby/object:Gem::Dependency
69
45
  type: :development
70
46
  prerelease: false
47
+ version_requirements: *id003
48
+ - !ruby/object:Gem::Dependency
71
49
  name: jeweler
72
- version_requirements: &id004 !ruby/object:Gem::Requirement
50
+ requirement: &id004 !ruby/object:Gem::Requirement
73
51
  none: false
74
52
  requirements:
75
53
  - - ~>
76
54
  - !ruby/object:Gem::Version
77
- hash: 4544990976642784663
78
- segments:
79
- - 1
80
- - 8
81
- - 3
82
55
  version: 1.8.3
83
- requirement: *id004
84
- - !ruby/object:Gem::Dependency
85
56
  type: :development
86
57
  prerelease: false
58
+ version_requirements: *id004
59
+ - !ruby/object:Gem::Dependency
87
60
  name: rspec
88
- version_requirements: &id005 !ruby/object:Gem::Requirement
61
+ requirement: &id005 !ruby/object:Gem::Requirement
89
62
  none: false
90
63
  requirements:
91
64
  - - ~>
92
65
  - !ruby/object:Gem::Version
93
- hash: 3576421176010082098
94
- segments:
95
- - 2
96
- - 7
97
- - 0
98
66
  version: 2.7.0
99
- requirement: *id005
100
- - !ruby/object:Gem::Dependency
101
67
  type: :development
102
68
  prerelease: false
69
+ version_requirements: *id005
70
+ - !ruby/object:Gem::Dependency
103
71
  name: cucumber
104
- version_requirements: &id006 !ruby/object:Gem::Requirement
72
+ requirement: &id006 !ruby/object:Gem::Requirement
105
73
  none: false
106
74
  requirements:
107
75
  - - ~>
108
76
  - !ruby/object:Gem::Version
109
- hash: 242951009964279202
110
- segments:
111
- - 1
112
- - 2
113
- - 0
114
77
  version: 1.2.0
115
- requirement: *id006
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: *id006
116
81
  description: New Sambamba library comes with a command-line tool for working with SAM/BAM files. This gem brings some of its functionality to Ruby.
117
82
  email: lomereiter@gmail.com
118
83
  executables: []
@@ -179,7 +144,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
179
144
  requirements:
180
145
  - - ">="
181
146
  - !ruby/object:Gem::Version
182
- hash: 2002549777813010636
147
+ hash: -3697473229949428622
183
148
  segments:
184
149
  - 0
185
150
  version: "0"
@@ -188,14 +153,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
188
153
  requirements:
189
154
  - - ">="
190
155
  - !ruby/object:Gem::Version
191
- hash: 2002549777813010636
192
- segments:
193
- - 0
194
156
  version: "0"
195
157
  requirements: []
196
158
 
197
159
  rubyforge_project:
198
- rubygems_version: 1.8.12
160
+ rubygems_version: 1.8.24
199
161
  signing_key:
200
162
  specification_version: 3
201
163
  summary: Ruby wrapper for Sambamba tool