bio-sambamba 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,7 +1,7 @@
1
1
  source "http://rubygems.org"
2
2
 
3
3
  gem "bio", "~> 1.4.2"
4
- gem "oj", "~> 1.3.4"
4
+ gem "msgpack", "~> 0.4.7"
5
5
 
6
6
  group :development do
7
7
  gem "bundler", "~> 1.1.4"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
data/lib/bio-sambamba.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'bio/command'
2
- require 'oj'
2
+ require 'json'
3
+ require 'msgpack'
3
4
 
4
5
  require 'bio-sambamba/exception.rb'
5
6
  require 'bio-sambamba/samheader.rb'
@@ -4,95 +4,101 @@ module Bio
4
4
  # Class representing an alignment record
5
5
  class Alignment
6
6
 
7
- # Creates a new object from JSON output of sambamba tool
8
- def initialize(json)
9
- @json = json
7
+ # Creates a new object from MessagePack record
8
+ def initialize(obj, reference_sequence_names)
9
+ @obj = obj
10
+ @reference = reference_sequence_names[ref_id]
11
+ @mate_reference = reference_sequence_names[mate_ref_id]
10
12
  end
11
13
 
12
14
  # Access a record tag
13
15
  def [](tag)
14
16
  raise 'tag length must be two' unless tag.length == 2
15
- @json['tags'][tag]
17
+ tags[tag]
16
18
  end
17
19
 
18
20
  def ==(read)
19
- read.json == json
21
+ read.obj == obj
20
22
  end
21
23
 
22
24
  # Hash of record tags
23
- attr_reader :tags if false
25
+ def tags
26
+ obj[12]
27
+ end
24
28
 
25
- # Name of reference sequence
26
- attr_reader :reference if false
29
+ # ID of reference sequence
30
+ def ref_id
31
+ obj[2]
32
+ end
27
33
 
28
34
  # Query template name
29
- attr_reader :read_name if false
35
+ def read_name
36
+ obj[0]
37
+ end
30
38
 
31
39
  # 1-based leftmost mapping position
32
- attr_reader :position if false
40
+ def position
41
+ obj[3]
42
+ end
33
43
 
34
44
  # Mapping quality
35
- attr_reader :mapping_quality if false
45
+ def mapping_quality
46
+ obj[4]
47
+ end
48
+
49
+ # CIGAR: pairs of operations and lengths,
50
+ # or nil if information is not available
51
+ def cigar_operations
52
+ return nil if obj[5].nil?
53
+ obj[6].chars.zip obj[5]
54
+ end
36
55
 
37
56
  # CIGAR string
38
- attr_reader :cigar if false
57
+ def cigar
58
+ return '*' if cigar_operations.nil?
59
+ cigar_operations.reduce(''){|s, op_len| s + op_len[0] + op_len[1].to_s}
60
+ end
39
61
 
40
62
  # Observed template length
41
- attr_reader :template_length if false
63
+ def template_length
64
+ obj[9]
65
+ end
42
66
 
43
67
  # Bitwise flag
44
- attr_reader :flag if false
68
+ def flag
69
+ obj[1]
70
+ end
45
71
 
46
72
  # Phred-scaled base quality, an integer array
47
73
  # of the same length as the sequence
48
- attr_reader :quality if false
74
+ def quality
75
+ obj[11].bytes.to_a
76
+ end
49
77
 
50
78
  # Segment sequence
51
- attr_reader :sequence if false
79
+ def sequence
80
+ obj[10]
81
+ end
52
82
 
53
83
  # ID of reference sequence of the mate/next segment
54
- attr_reader :mate_reference if false
84
+ def mate_ref_id
85
+ obj[7]
86
+ end
55
87
 
56
88
  # 1-based leftmost position of the mate/next segment
57
- attr_reader :mate_position if false
89
+ def mate_position
90
+ obj[8]
91
+ end
58
92
 
59
93
  # The number of reference bases covered
60
94
  def bases_covered
61
- return 0 if cigar == '*'
62
- cigar.split(/([MIDNSHP=X])/).each_slice(2).reduce(0) {|res, op|
63
- res += op[0].to_i unless ('M=XDN'.index op[1]).nil?
95
+ return 0 if cigar_operations.nil?
96
+ cigar_operations.reduce(0) {|res, op|
97
+ res += op[1] unless ('M=XDN'.index op[0]).nil?
64
98
  res
65
99
  }
66
100
  end
67
-
68
- {'position' => 'pos',
69
- 'mapping_quality' => 'mapq',
70
- 'template_length' => 'tlen',
71
- 'flag' => 'flag',
72
- 'mate_position' => 'pnext'
73
- }.each do |k, v|
74
- eval <<-DEFINE_READER
75
- def #{k}
76
- @json['#{v}'].to_i
77
- end
78
- DEFINE_READER
79
- end
80
-
81
- {'tags' => 'tags',
82
- 'reference' => 'rname',
83
- 'read_name' => 'qname',
84
- 'cigar' => 'cigar',
85
- 'quality' => 'qual',
86
- 'sequence' => 'seq',
87
- 'mate_reference' => 'rnext'
88
- }.each do |k, v|
89
- eval <<-DEFINE_READER
90
- def #{k}
91
- @json['#{v}']
92
- end
93
- DEFINE_READER
94
- end
95
-
101
+
96
102
  # Template having multiple segments in sequencing
97
103
  def is_paired
98
104
  (flag & 0x1) != 0
@@ -148,8 +154,14 @@ module Bio
148
154
  (flag & 0x400) != 0
149
155
  end
150
156
 
157
+ # Reference sequence name
158
+ attr_reader :reference
159
+
160
+ # Mate reference sequence name
161
+ attr_reader :mate_reference
162
+
151
163
  private
152
- attr_accessor :json
164
+ attr_accessor :obj
153
165
 
154
166
  end
155
167
 
@@ -8,8 +8,10 @@ module Bio
8
8
 
9
9
  # Creates a new AlignmentIterator object which will
10
10
  # parse MessagePack output produced by a specified command.
11
- def initialize(command)
11
+ # Names of reference sequences must be provided as well.
12
+ def initialize(command, references)
12
13
  @command = command
14
+ @references = references
13
15
  end
14
16
 
15
17
  # Iterate only through valid alignments
@@ -17,12 +19,14 @@ module Bio
17
19
 
18
20
  return enum_for(:each_valid) if not block_given?
19
21
 
20
- command = @command
22
+ command = get_command
21
23
  if command.index('--valid').nil?
22
24
  command.push '--valid'
23
25
  end
24
26
 
25
- AlignmentIterator.new(command).each do |read|
27
+ iter = self.clone
28
+ iter.command = command
29
+ iter.each do |read|
26
30
  yield read
27
31
  end
28
32
  end
@@ -55,10 +59,21 @@ module Bio
55
59
  command = get_command
56
60
 
57
61
  Bio::Command.call_command_open3(command) do |pin, pout, perr|
58
- pout.each do |line|
59
- json = Oj.load(line)
60
- yield Bio::Bam::Alignment.new(json)
62
+
63
+ counter = 0 # for triggering garbage collection manually
64
+ unpacker = MessagePack::Unpacker.new pout
65
+
66
+ begin
67
+ unpacker.each do |obj|
68
+ counter += 1
69
+ yield Bio::Bam::Alignment.new(obj, @references)
70
+ if (counter & 0xFFF) == 0 then
71
+ ObjectSpace.garbage_collect
72
+ end
73
+ end
74
+ rescue EOFError
61
75
  end
76
+
62
77
  raise_exception_if_stderr_is_not_empty(perr)
63
78
  end
64
79
  end
@@ -101,7 +116,7 @@ module Bio
101
116
  end
102
117
 
103
118
  def clone
104
- iter = AlignmentIterator.new @command
119
+ iter = AlignmentIterator.new @command, @references
105
120
  iter.chromosome = chromosome
106
121
  iter.region = region
107
122
  iter
@@ -29,7 +29,8 @@ module Bio
29
29
 
30
30
  # Returns an AlignmentIterator object for iterating over all alignments in the file
31
31
  def alignments
32
- Bio::Bam::AlignmentIterator.new ['sambamba', 'view', '--format', 'json', @filename]
32
+ cmdline = ['sambamba', 'view', '--format', 'msgpack', @filename]
33
+ Bio::Bam::AlignmentIterator.new(cmdline, reference_sequence_names)
33
34
  end
34
35
 
35
36
  # True if index file was found
@@ -47,16 +48,25 @@ module Bio
47
48
  # * _chr_: reference sequence
48
49
  # * _region_: a Range representing an interval. Coordinates are 1-based.
49
50
  def fetch(chr, region)
50
- iter = Bio::Bam::AlignmentIterator.new ['sambamba', 'view', '--format=json',
51
- @filename]
51
+ cmdline = ['sambamba', 'view', '--format=msgpack', @filename]
52
+ iter = Bio::Bam::AlignmentIterator.new(cmdline, reference_sequence_names)
52
53
  iter.chromosome = chr
53
54
  iter.region = region
54
55
  iter
55
56
  end
56
57
 
58
+ def reference_sequences
59
+ @reference_sequences ||= JSON.parse(Bio::Command.query_command ['sambamba', 'view', '-I', @filename])
60
+ end
61
+
57
62
  def [](chr)
58
63
  fetch(chr, nil)
59
64
  end
65
+
66
+ private
67
+ def reference_sequence_names
68
+ @reference_sequence_names ||= reference_sequences.map {|ref| ref['name']}
69
+ end
60
70
  end # class File
61
71
 
62
72
  end # module Bam
@@ -20,7 +20,17 @@ module Bio
20
20
 
21
21
  # Returns an AlignmentIterator object for iterating over all alignments in the file
22
22
  def alignments
23
- Bio::Bam::AlignmentIterator.new ['sambamba', 'view', '--format', 'json', '-S', @filename]
23
+ cmdline = ['sambamba', 'view', '--format', 'msgpack', '-S', @filename],
24
+ Bio::Bam::AlignmentIterator.new(cmdline, reference_sequences)
25
+ end
26
+
27
+ def reference_sequences
28
+ @reference_sequences ||= Oj.load(Bio::Command.query_command ['sambamba', 'view', '-I', '-S', @filename])
29
+ end
30
+
31
+ private
32
+ def reference_sequence_names
33
+ @reference_sequence_names ||= reference_sequences.map {|ref| ref['name']}
24
34
  end
25
35
  end
26
36
 
@@ -26,172 +26,185 @@ module Bio
26
26
 
27
27
  # Format version
28
28
  def version
29
- @json ||= get_json
30
- @json['format_version']
29
+ obj[0]
31
30
  end
32
31
 
33
32
  # Sorting order
34
33
  def sorting_order
35
- @json ||= get_json
36
- @json['sorting_order']
34
+ obj[1]
37
35
  end
38
36
 
39
37
  # An array of SQLine objects
40
38
  def sq_lines
41
- @json ||= get_json
42
- @sq_lines ||= @json['sq_lines'].map{|json| SQLine.new(json)}
39
+ @sq_lines ||= obj[2].map{|rec| SQLine.new(rec)}
43
40
  end
44
41
 
45
42
  # An array of RGLine objects
46
43
  def rg_lines
47
- @json ||= get_json
48
- @sq_lines ||= @json['rg_lines'].map{|json| RGLine.new(json)}
44
+ @rg_lines ||= obj[3].map{|rec| RGLine.new(rec)}
49
45
  end
50
46
 
51
- # @return [PGLine] array of @PG lines
47
+ # An array of PGLine objects
52
48
  def pg_lines
53
- @json ||= get_json
54
- @sq_lines ||= @json['pg_lines'].map{|json| PGLine.new(json)}
49
+ @pg_lines ||= obj[4].map{|rec| PGLine.new(rec)}
55
50
  end
56
51
 
57
52
  private
58
- # Calls sambamba to get underlying JSON object
59
- def get_json
60
- cmd = ['sambamba', 'view', '-H', '--format=json', @filename] + @opts
53
+ def obj
54
+ return @obj unless @obj.nil?
55
+ cmd = ['sambamba', 'view', '-H', '--format', 'msgpack', @filename] + @opts
61
56
  line = ''
62
57
  Bio::Command.call_command_open3(cmd) do |pin, pout, perr|
63
- line = pout.read
58
+ @obj = MessagePack.unpack(pout.read)
64
59
  raise_exception_if_stderr_is_not_empty(perr)
65
60
  end
66
- @json = Oj.load(line)
61
+ @obj
67
62
  end
68
63
  end
69
64
 
70
65
  # Represents a @SQ line from SAM header
71
66
  class SQLine
72
67
 
73
- # Wrap JSON object from sambamba output
74
- def initialize(json)
75
- @json = json
68
+ # Wrap MessagePack record from sambamba output
69
+ def initialize(obj)
70
+ @obj = obj
76
71
  end
77
72
 
78
73
  # Reference sequence name
79
- attr_reader :sequence_name if false
74
+ def sequence_name
75
+ @obj['SN']
76
+ end
80
77
 
81
78
  # Reference sequence length
82
- attr_reader :sequence_length if false
79
+ def sequence_length
80
+ @obj['LN']
81
+ end
83
82
 
84
83
  # Genome assembly identifier
85
- attr_reader :assembly if false
84
+ def assembly
85
+ @obj['AS']
86
+ end
86
87
 
87
88
  # MD5 checksum of the sequence in uppercase, with gaps and spaces removed
88
- attr_reader :md5 if false
89
+ def md5
90
+ @obj['M5']
91
+ end
89
92
 
90
93
  # Species
91
- attr_reader :species if false
94
+ def species
95
+ @obj['SP']
96
+ end
92
97
 
93
98
  # URI of the sequence
94
- attr_reader :uri if false
95
-
96
- ['sequence_name', 'sequence_length',
97
- 'assembly', 'md5', 'species', 'uri'].each do |sq_line_field|
98
- eval <<-DEFINE_READER
99
- def #{sq_line_field}
100
- @json['#{sq_line_field}']
101
- end
102
- DEFINE_READER
99
+ def uri
100
+ @obj['UR']
103
101
  end
102
+
104
103
  end
105
104
 
106
105
  # Represents @RG line from SAM header, i.e. a read group
107
106
  class RGLine
108
107
 
109
- # Wrap JSON object from sambamba output
110
- def initialize(json)
111
- @json = json
108
+ # Wrap MessagePack record from sambamba output
109
+ def initialize(obj)
110
+ @obj = obj
112
111
  end
113
112
 
114
113
  # Unique read group identifier
115
- attr_reader :identifier if false
114
+ def identifier
115
+ @obj['ID']
116
+ end
116
117
 
117
118
  # Name of sequencing center
118
- attr_reader :sequencing_center if false
119
+ def sequencing_center
120
+ @obj['CN']
121
+ end
119
122
 
120
123
  # Description
121
- attr_reader :description if false
124
+ def description
125
+ @obj['DS']
126
+ end
122
127
 
123
128
  # Date the run was produced (ISO8601 date or date/time)
124
- attr_reader :date if false
129
+ def date
130
+ @obj['DT']
131
+ end
125
132
 
126
133
  # Flow order. The array of nucleotide bases that correspond to the
127
134
  # nucleotides used for each flow of each read. Multi-base flows are
128
135
  # encoded in IUPAC format, and non-nucleotide flows by various other
129
136
  # characters.
130
- attr_reader :flow_order if false
137
+ def flow_order
138
+ @obj['FO']
139
+ end
131
140
 
132
141
  # The array of nucleotide bases that correspond to the key sequence of each read
133
- attr_reader :key_sequence if false
142
+ def key_sequence
143
+ @obj['KS']
144
+ end
134
145
 
135
146
  # Library
136
- attr_reader :library if false
147
+ def library
148
+ @obj['LB']
149
+ end
137
150
 
138
151
  # Programs used for processing the read group
139
- attr_reader :programs if false
152
+ def programs
153
+ @obj['PG']
154
+ end
140
155
 
141
156
  # Predicted median insert size
142
- attr_reader :predicted_insert_size if false
157
+ def predicted_insert_size
158
+ @obj['PI']
159
+ end
143
160
 
144
161
  # Platform/technology used to produce the reads
145
- attr_reader :platform if false
162
+ def platform
163
+ @obj['PL']
164
+ end
146
165
 
147
- # Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
148
- attr_reader :platform_unit if false
166
+ # Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
167
+ def platform_unit
168
+ @obj['PU']
169
+ end
149
170
 
150
171
  # Sample
151
- attr_reader :sample if false
152
-
153
- ['identifier', 'sequencing_center', 'description', 'date',
154
- 'flow_order', 'key_sequence', 'library', 'programs',
155
- 'predicted_insert_size', 'platform',
156
- 'platform_unit', 'sample'].each do |rg_line_field|
157
- eval <<-DEFINE_READER
158
- def #{rg_line_field}
159
- @json['#{rg_line_field}']
160
- end
161
- DEFINE_READER
172
+ def sample
173
+ @obj['SM']
162
174
  end
163
175
  end
164
176
 
165
177
  # Represents @PG line from SAM header (program record)
166
178
  class PGLine
167
179
 
168
- # Wrap JSON object from sambamba output
169
- def initialize(json)
170
- @json = json
180
+ # Wrap MessagePack record from sambamba output
181
+ def initialize(obj)
182
+ @obj = obj
171
183
  end
172
184
 
173
185
  # Unique program record identifier
174
- attr_reader :identifier if false
186
+ def identifier
187
+ @obj['ID']
188
+ end
175
189
 
176
190
  # Program name
177
- attr_reader :program_name if false
191
+ def program_name
192
+ @obj['PN']
193
+ end
178
194
 
179
195
  # Command line
180
- attr_reader :command_line if false
196
+ def command_line
197
+ @obj['CL']
198
+ end
181
199
 
182
200
  # Identifier of previous program in chain
183
- attr_reader :previous_program if false
201
+ def previous_program
202
+ @obj['PP']
203
+ end
184
204
 
185
205
  # Program version
186
- attr_reader :program_version if false
187
-
188
- ['identifier', 'program_name', 'command_line',
189
- 'previous_program', 'program_version'].each do |rg_line_field|
190
- eval <<-DEFINE_READER
191
- def #{rg_line_field}
192
- @json['#{rg_line_field}']
193
- end
194
- DEFINE_READER
206
+ def program_version
207
+ @obj['VN']
195
208
  end
196
209
  end
197
210
 
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-sambamba
3
3
  version: !ruby/object:Gem::Version
4
- hash: 1889055196096400351
5
4
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 3
10
- version: 0.0.3
5
+ version: 0.0.4
11
6
  platform: ruby
12
7
  authors:
13
8
  - Artem Tarasov
@@ -15,104 +10,74 @@ autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
12
 
18
- date: 2012-08-15 00:00:00 Z
13
+ date: 2012-08-20 00:00:00 Z
19
14
  dependencies:
20
15
  - !ruby/object:Gem::Dependency
21
- type: :runtime
22
- prerelease: false
23
16
  name: bio
24
- version_requirements: &id001 !ruby/object:Gem::Requirement
17
+ requirement: &id001 !ruby/object:Gem::Requirement
25
18
  none: false
26
19
  requirements:
27
20
  - - ~>
28
21
  - !ruby/object:Gem::Version
29
- hash: 3725784833518553922
30
- segments:
31
- - 1
32
- - 4
33
- - 2
34
22
  version: 1.4.2
35
- requirement: *id001
36
- - !ruby/object:Gem::Dependency
37
23
  type: :runtime
38
24
  prerelease: false
39
- name: oj
40
- version_requirements: &id002 !ruby/object:Gem::Requirement
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: msgpack
28
+ requirement: &id002 !ruby/object:Gem::Requirement
41
29
  none: false
42
30
  requirements:
43
31
  - - ~>
44
32
  - !ruby/object:Gem::Version
45
- hash: 3635686038185777981
46
- segments:
47
- - 1
48
- - 3
49
- - 4
50
- version: 1.3.4
51
- requirement: *id002
52
- - !ruby/object:Gem::Dependency
53
- type: :development
33
+ version: 0.4.7
34
+ type: :runtime
54
35
  prerelease: false
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
55
38
  name: bundler
56
- version_requirements: &id003 !ruby/object:Gem::Requirement
39
+ requirement: &id003 !ruby/object:Gem::Requirement
57
40
  none: false
58
41
  requirements:
59
42
  - - ~>
60
43
  - !ruby/object:Gem::Version
61
- hash: 1287940794363108929
62
- segments:
63
- - 1
64
- - 1
65
- - 4
66
44
  version: 1.1.4
67
- requirement: *id003
68
- - !ruby/object:Gem::Dependency
69
45
  type: :development
70
46
  prerelease: false
47
+ version_requirements: *id003
48
+ - !ruby/object:Gem::Dependency
71
49
  name: jeweler
72
- version_requirements: &id004 !ruby/object:Gem::Requirement
50
+ requirement: &id004 !ruby/object:Gem::Requirement
73
51
  none: false
74
52
  requirements:
75
53
  - - ~>
76
54
  - !ruby/object:Gem::Version
77
- hash: 4544990976642784663
78
- segments:
79
- - 1
80
- - 8
81
- - 3
82
55
  version: 1.8.3
83
- requirement: *id004
84
- - !ruby/object:Gem::Dependency
85
56
  type: :development
86
57
  prerelease: false
58
+ version_requirements: *id004
59
+ - !ruby/object:Gem::Dependency
87
60
  name: rspec
88
- version_requirements: &id005 !ruby/object:Gem::Requirement
61
+ requirement: &id005 !ruby/object:Gem::Requirement
89
62
  none: false
90
63
  requirements:
91
64
  - - ~>
92
65
  - !ruby/object:Gem::Version
93
- hash: 3576421176010082098
94
- segments:
95
- - 2
96
- - 7
97
- - 0
98
66
  version: 2.7.0
99
- requirement: *id005
100
- - !ruby/object:Gem::Dependency
101
67
  type: :development
102
68
  prerelease: false
69
+ version_requirements: *id005
70
+ - !ruby/object:Gem::Dependency
103
71
  name: cucumber
104
- version_requirements: &id006 !ruby/object:Gem::Requirement
72
+ requirement: &id006 !ruby/object:Gem::Requirement
105
73
  none: false
106
74
  requirements:
107
75
  - - ~>
108
76
  - !ruby/object:Gem::Version
109
- hash: 242951009964279202
110
- segments:
111
- - 1
112
- - 2
113
- - 0
114
77
  version: 1.2.0
115
- requirement: *id006
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: *id006
116
81
  description: New Sambamba library comes with a command-line tool for working with SAM/BAM files. This gem brings some of its functionality to Ruby.
117
82
  email: lomereiter@gmail.com
118
83
  executables: []
@@ -179,7 +144,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
179
144
  requirements:
180
145
  - - ">="
181
146
  - !ruby/object:Gem::Version
182
- hash: 2002549777813010636
147
+ hash: -3697473229949428622
183
148
  segments:
184
149
  - 0
185
150
  version: "0"
@@ -188,14 +153,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
188
153
  requirements:
189
154
  - - ">="
190
155
  - !ruby/object:Gem::Version
191
- hash: 2002549777813010636
192
- segments:
193
- - 0
194
156
  version: "0"
195
157
  requirements: []
196
158
 
197
159
  rubyforge_project:
198
- rubygems_version: 1.8.12
160
+ rubygems_version: 1.8.24
199
161
  signing_key:
200
162
  specification_version: 3
201
163
  summary: Ruby wrapper for Sambamba tool