htslib 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
# frozen_string_literal: true

module HTS
  class Bam < Hts
    # High-level mpileup iterator over multiple BAM/CRAM inputs.
    #
    # Thin wrapper around htslib's bam_mplp_* API: one read callback per
    # input feeds bam_mplp64_auto, which emits, for every covered reference
    # position, one pileup column per input.
    class Mpileup
      include Enumerable

      # RAII-style factory.
      #
      #   HTS::Bam::Mpileup.open([bam1, bam2], region: "chr1:1-100") do |mpl|
      #     mpl.each { |cols| ... }
      #   end
      #
      # With a block, #close is guaranteed at block exit; without a block
      # the caller owns the instance and must call #close.
      def self.open(*args, **kw)
        m = new(*args, **kw)
        return m unless block_given?

        begin
          yield m
        ensure
          m.close
        end
        m
      end

      # @param inputs [Array<HTS::Bam, String>] open Bam objects and/or file
      #   names. Files given as String are opened here and closed by #close.
      # @param region [String, nil] "chr:beg-end" on its own, or a chromosome
      #   name combined with +beg+/+end_+. Requires a loaded index.
      # @param beg [Integer, nil] start position (must be paired with +end_+)
      # @param end_ [Integer, nil] end position (must be paired with +beg+)
      # @param maxcnt [Integer, nil] per-position depth cap (bam_mplp_set_maxcnt)
      # @param overlaps [Boolean] enable read-pair overlap detection
      # @raise [ArgumentError] on empty inputs, unsupported input type, or
      #   +beg+/+end_+ given without each other
      # @raise [RuntimeError] when htslib initialization or a region query fails
      def initialize(inputs, region: nil, beg: nil, end_: nil, maxcnt: nil, overlaps: false)
        raise ArgumentError, "inputs must be non-empty" if inputs.nil? || inputs.empty?

        @owned_bams = [] # Bams we opened here; closed in #close
        @bams = normalize_inputs(inputs)
        @iters = build_iterators(region, beg, end_)

        n = @bams.length
        ptr_size = FFI.type_size(:pointer)
        @data_blocks = [] # per-input packed pointer blocks, kept alive for C

        # Pack per-input state so the C side can hand it back to the callback.
        # Layout per input: [0] htsFile*, [1] bam_hdr_t*, [2] hts_itr_t* (or NULL)
        data_array = FFI::MemoryPointer.new(:pointer, n)
        @bams.each_with_index do |bam, i|
          block = FFI::MemoryPointer.new(:pointer, 3)
          block.put_pointer(0, bam.instance_variable_get(:@hts_file))
          block.put_pointer(ptr_size, bam.header.struct)
          itr = @iters[i]
          block.put_pointer(2 * ptr_size, itr && !itr.null? ? itr : FFI::Pointer::NULL)
          @data_blocks << block
          data_array.put_pointer(i * ptr_size, block)
        end
        # Referenced by an ivar so the GC cannot reclaim it while C holds it.
        @data_array = data_array

        @cb = FFI::Function.new(:int, %i[pointer pointer]) do |data, b|
          # Unpack the per-input block packed above.
          hts_fp = data.get_pointer(0)
          hdr_struct = data.get_pointer(ptr_size)
          itr = data.get_pointer(2 * ptr_size)
          # HTSlib contract: return as sam_itr_next/sam_read1
          # (>= 0 on success, -1 on EOF, < -1 on error).
          if itr && !itr.null?
            HTS::LibHTS.sam_itr_next(hts_fp, itr, b)
          else
            HTS::LibHTS.sam_read1(hts_fp, hdr_struct, b)
          end
        end

        @iter = HTS::LibHTS.bam_mplp_init(n, @cb, @data_array)
        raise "bam_mplp_init failed" if @iter.null?

        HTS::LibHTS.bam_mplp_set_maxcnt(@iter, maxcnt) if maxcnt
        return unless overlaps

        rc = HTS::LibHTS.bam_mplp_init_overlaps(@iter)
        raise "bam_mplp_init_overlaps failed" if rc < 0
      end

      # Yields an Array of Pileup::PileupColumn (one per input) per position.
      #
      # FIX: a negative return from bam_mplp64_auto signals an error, not EOF.
      # The previous `while ... > 0` loop silently swallowed errors; they are
      # now raised (consistent with Pileup#each).
      def each
        return to_enum(__method__) unless block_given?

        n = @bams.length
        tid_ptr = FFI::MemoryPointer.new(:int)
        pos_ptr = FFI::MemoryPointer.new(:long_long)
        n_ptr = FFI::MemoryPointer.new(:int, n)
        plp_ptr = FFI::MemoryPointer.new(:pointer, n)
        plp1_size = HTS::LibHTS::BamPileup1.size
        headers = @bams.map(&:header)

        loop do
          ret = HTS::LibHTS.bam_mplp64_auto(@iter, tid_ptr, pos_ptr, n_ptr, plp_ptr)
          raise "HTSlib mpileup error (bam_mplp64_auto)" if ret.negative?
          break if ret.zero?

          tid = tid_ptr.read_int
          pos = pos_ptr.read_long_long
          counts = n_ptr.read_array_of_int(n)
          plp_arr = plp_ptr.read_array_of_pointer(n)

          cols = Array.new(n) do |i|
            c = counts[i]
            base_ptr = plp_arr[i]
            if c <= 0 || base_ptr.null?
              HTS::Bam::Pileup::PileupColumn.new(tid: tid, pos: pos, alignments: [])
            else
              aligns = Array.new(c) do |j|
                entry = HTS::LibHTS::BamPileup1.new(base_ptr + (j * plp1_size))
                HTS::Bam::Pileup::PileupRecord.new(entry, headers[i])
              end
              HTS::Bam::Pileup::PileupColumn.new(tid: tid, pos: pos, alignments: aligns)
            end
          end

          yield cols
        end

        self
      end

      # Release native resources. Safe to call more than once.
      def close
        if @iter && !@iter.null?
          HTS::LibHTS.bam_mplp_destroy(@iter)
          @iter = FFI::Pointer::NULL
        end
        @iters.each do |itr|
          HTS::LibHTS.hts_itr_destroy(itr) if itr && !itr.null?
        end
        @iters.clear
        # @cb, @data_array and @data_blocks stay referenced as ivars for the
        # lifetime of this object; no extra keep-alive bookkeeping is needed.
        @owned_bams.each do |b|
          b.close
        rescue StandardError
          # best-effort: a failure closing one input must not mask the others
        end
        @owned_bams.clear
      end

      private

      # Normalize inputs to HTS::Bam instances.
      # Strings are opened here and recorded in @owned_bams for later closing.
      def normalize_inputs(inputs)
        inputs.map do |x|
          case x
          when HTS::Bam
            x
          when String
            b = HTS::Bam.open(x)
            @owned_bams << b
            b
          else
            raise ArgumentError, "Unsupported input type: #{x.class}"
          end
        end
      end

      # Build one optional hts_itr_t per input (nil when iterating whole files).
      def build_iterators(region, beg, end_)
        @bams.each_with_index.map do |bam, i|
          if region && beg.nil? && end_.nil?
            raise "Index required for region mpileup" unless bam.index_loaded?

            itr = HTS::LibHTS.sam_itr_querys(bam.instance_variable_get(:@idx), bam.header.struct, region)
            raise "Failed to query region on input ##{i}: #{region}" if itr.null?

            itr
          elsif region && beg && end_
            raise "Index required for region mpileup" unless bam.index_loaded?

            tid = bam.header.get_tid(region)
            itr = HTS::LibHTS.sam_itr_queryi(bam.instance_variable_get(:@idx), tid, beg, end_)
            raise "Failed to query region on input ##{i}: #{region} #{beg} #{end_}" if itr.null?

            itr
          elsif beg || end_
            raise ArgumentError, "beg and end_ must be specified together"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,201 @@
1
# frozen_string_literal: true

module HTS
  class Bam < Hts
    # High-level pileup iterator for a single SAM/BAM/CRAM, built on
    # htslib's bam_plp_* API.
    class Pileup
      include Enumerable

      # RAII-style factory.
      #
      #   HTS::Bam::Pileup.open(bam, region: "chr1:1-100") do |pl|
      #     pl.each { |col| ... }
      #   end
      #
      # With a block, #close is guaranteed at block exit; without a block
      # the caller owns the instance and must call #close.
      def self.open(*args, **kw)
        pu = new(*args, **kw)
        return pu unless block_given?

        begin
          yield pu
        ensure
          pu.close
        end
        pu
      end

      # A column at one reference position with its pileup alignments.
      PileupColumn = Struct.new(:tid, :pos, :alignments, keyword_init: true) do
        # Number of alignments covering this position.
        def depth
          alignments.length
        end
      end

      # Wrapper around one bam_pileup1_t entry.
      class PileupRecord
        def initialize(entry, header)
          @entry = entry
          @header = header
          @record = nil # lazily built, cached Bam::Record
        end

        # Return a Bam::Record for this entry. On the first call the
        # underlying bam1_t is duplicated (bam_dup1) so the record stays
        # valid beyond the current pileup step; later calls return the
        # cached instance.
        # NOTE: without duplication, htslib may reuse the bam1_t memory on
        # the next pileup step.
        def record
          return @record if @record

          # Normalize to a raw pointer, then duplicate to obtain owned memory.
          b_ptr = @entry[:b].is_a?(FFI::Pointer) ? @entry[:b] : @entry[:b].to_ptr
          dup_ptr = HTS::LibHTS.bam_dup1(b_ptr)
          raise "bam_dup1 failed" if dup_ptr.null?

          @record = HTS::Bam::Record.new(@header, dup_ptr)
        end

        # Offset of this position within the query sequence (qpos field).
        def query_position
          @entry[:qpos]
        end

        # Indel length reported by htslib for this entry (indel field).
        def indel
          @entry[:indel]
        end

        def del?
          @entry[:is_del] == 1
        end

        def head?
          @entry[:is_head] == 1
        end

        def tail?
          @entry[:is_tail] == 1
        end

        def refskip?
          @entry[:is_refskip] == 1
        end
      end

      # @param bam [HTS::Bam]
      # @param region [String, nil] "chr:beg-end" on its own, or a chromosome
      #   name combined with +beg+/+end_+ (requires a loaded index)
      # @param beg [Integer, nil] start position (pair with +end_+)
      # @param end_ [Integer, nil] end position (pair with +beg+)
      # @param maxcnt [Integer, nil] per-position depth cap
      def initialize(bam, region: nil, beg: nil, end_: nil, maxcnt: nil)
        @bam = bam
        @header = bam.header
        @itr = nil
        @cb = nil
        @plp = nil

        # Optional region iterator
        if region && beg.nil? && end_.nil?
          raise "Index file is required to use region pileup." unless bam.index_loaded?

          @itr = HTS::LibHTS.sam_itr_querys(bam.instance_variable_get(:@idx), @header.struct, region)
          raise "Failed to query region: #{region}" if @itr.null?
        elsif region && beg && end_
          raise "Index file is required to use region pileup." unless bam.index_loaded?

          tid = @header.get_tid(region)
          @itr = HTS::LibHTS.sam_itr_queryi(bam.instance_variable_get(:@idx), tid, beg, end_)
          raise "Failed to query region: #{region} #{beg} #{end_}" if @itr.null?
        elsif beg || end_
          raise ArgumentError, "beg and end_ must be specified together"
        end

        # Hoist lookups out of the hot callback and specialize on the
        # presence of a region iterator so the per-read path has no branch.
        hts_fp = @bam.instance_variable_get(:@hts_file)
        hdr_struct = @header.struct
        itr_local = @itr

        @cb = if itr_local && !itr_local.null?
                FFI::Function.new(:int, %i[pointer pointer]) do |_data, b|
                  # HTSlib contract: >= 0 success, -1 EOF, < -1 error
                  HTS::LibHTS.sam_itr_next(hts_fp, itr_local, b)
                end
              else
                FFI::Function.new(:int, %i[pointer pointer]) do |_data, b|
                  # HTSlib contract: >= 0 success, -1 EOF, < -1 error
                  HTS::LibHTS.sam_read1(hts_fp, hdr_struct, b)
                end
              end

        @plp = HTS::LibHTS.bam_plp_init(@cb, nil)
        raise "bam_plp_init failed" if @plp.null?

        HTS::LibHTS.bam_plp_set_maxcnt(@plp, maxcnt) if maxcnt
      end

      # Yields one PileupColumn per covered reference position.
      def each
        return to_enum(__method__) unless block_given?

        tid_ptr = FFI::MemoryPointer.new(:int)
        pos_ptr = FFI::MemoryPointer.new(:long_long) # hts_pos_t
        n_ptr = FFI::MemoryPointer.new(:int)
        plp1_size = HTS::LibHTS::BamPileup1.size
        header_local = @header

        loop do
          base_ptr = HTS::LibHTS.bam_plp64_auto(@plp, tid_ptr, pos_ptr, n_ptr)

          # NULL base_ptr: n == 0 means EOF, n < 0 means error.
          if base_ptr.null?
            raise "HTSlib pileup error (bam_plp64_auto)" if n_ptr.read_int.negative?

            break
          end

          tid = tid_ptr.read_int
          pos = pos_ptr.read_long_long
          n = n_ptr.read_int

          # Array.new(0) yields [] — no separate empty-column branch needed.
          alignments = Array.new(n) do |i|
            entry = HTS::LibHTS::BamPileup1.new(base_ptr + (i * plp1_size))
            PileupRecord.new(entry, header_local)
          end

          yield PileupColumn.new(tid: tid, pos: pos, alignments: alignments)
        end

        self
      end

      # Rewind the pileup engine (bam_plp_reset).
      def reset
        HTS::LibHTS.bam_plp_reset(@plp) if @plp && !@plp.null?
      end

      # Release native resources. Safe to call more than once.
      #
      # FIX: the previous version ended with a bare `@cb` expression whose
      # comment claimed it kept the callback alive; evaluating an ivar has no
      # GC effect (the instance already holds the reference), so that dead
      # statement was removed.
      def close
        if @plp && !@plp.null?
          HTS::LibHTS.bam_plp_destroy(@plp)
          @plp = FFI::Pointer::NULL
        end
        return unless @itr && !@itr.null?

        HTS::LibHTS.hts_itr_destroy(@itr)
        @itr = FFI::Pointer::NULL
      end
    end
  end
end
@@ -326,6 +326,13 @@ module HTS
326
326
  end
327
327
  end
328
328
 
329
+ # Get base modification information from MM/ML tags
330
+ # @param auto_parse [Boolean] If true (default), parse lazily on first access
331
+ # @return [BaseMod] Base modification object
332
+ def base_mod(auto_parse: true)
333
+ BaseMod.new(self, auto_parse: auto_parse)
334
+ end
335
+
329
336
  # TODO: add a method to get the auxiliary fields as a hash.
330
337
 
331
338
  # TODO: add a method to set the auxiliary fields.
@@ -352,8 +359,13 @@ module HTS
352
359
  private
353
360
 
354
361
  def initialize_copy(orig)
362
+ super
355
363
  @header = orig.header
356
- @bam = LibHTS.bam_dup1(orig.struct)
364
+ # Deep-copy underlying bam1_t to detach from original buffer
365
+ dup_bam1 = LibHTS.bam_dup1(orig.struct)
366
+ raise "bam_dup1 failed" if dup_bam1.null?
367
+
368
+ @bam1 = dup_bam1
357
369
  end
358
370
  end
359
371
  end
data/lib/hts/bam.rb CHANGED
@@ -7,7 +7,9 @@ require_relative "bam/header"
7
7
  require_relative "bam/cigar"
8
8
  require_relative "bam/flag"
9
9
  require_relative "bam/record"
10
- # require_relative "bam/pileup"
10
+ require_relative "bam/base_mod"
11
+ require_relative "bam/pileup"
12
+ require_relative "bam/mpileup"
11
13
  # require_relative "bam/pileup_entry"
12
14
 
13
15
  module HTS
@@ -160,7 +162,7 @@ module HTS
160
162
 
161
163
  position = tell
162
164
  ary = map { |r| r.aux(tag) }
163
- seek(position)
165
+ seek(position) if position
164
166
  ary
165
167
  end
166
168
 
@@ -194,6 +196,13 @@ module HTS
194
196
  self
195
197
  end
196
198
 
199
+ # Iterate alignment records in this file.
200
+ #
201
+ # Performance and memory semantics:
202
+ # - copy: false (default) reuses a single Record instance and its underlying bam1_t buffer.
203
+ # The yielded Record MUST NOT be stored beyond the block; its content will be overwritten
204
+ # by the next iteration. If you need to retain it, call `rec = rec.dup`.
205
+ # - copy: true yields a fresh Record per iteration (deep-copied via bam_dup1). Slower, safe to keep.
197
206
  def each(copy: false, &block)
198
207
  if copy
199
208
  each_record_copy(&block)
@@ -202,23 +211,65 @@ module HTS
202
211
  end
203
212
  end
204
213
 
214
+ # Iterate records in a genomic region or multiple regions.
215
+ # See {#each} for copy semantics. When copy: false, the yielded Record is reused and should not be stored.
216
+ #
217
+ # @param region [String, Array<String>] Region specification(s)
218
+ # - Single region: "chr1:100-200" or "chr1" with beg/end parameters
219
+ # - Multiple regions: ["chr1:100-200", "chr2:500-600", ...]
220
+ # @param beg [Integer, nil] Start position (used with single string region)
221
+ # @param end_ [Integer, nil] End position (used with single string region)
222
+ # @param copy [Boolean] Whether to deep-copy records (see {#each})
223
+ #
224
+ # @example Single region query
225
+ # bam.query("chr1:100-200") { |r| puts r.qname }
226
+ # bam.query("chr1", 100, 200) { |r| puts r.qname }
227
+ #
228
+ # @example Multi-region query
229
+ # bam.query(["chr1:100-200", "chr2:500-600"]) { |r| puts r.qname }
205
230
  def query(region, beg = nil, end_ = nil, copy: false, &block)
206
231
  check_closed
207
232
  raise "Index file is required to call the query method." unless index_loaded?
208
233
 
209
- if beg && end_
210
- tid = header.get_tid(region)
211
- queryi(tid, beg, end_, copy:, &block)
212
- elsif beg.nil? && end_.nil?
213
- querys(region, copy:, &block)
234
+ case region
235
+ when Array
236
+ raise ArgumentError, "beg and end_ cannot be used with array of regions" if beg || end_
237
+
238
+ query_regions(region, copy:, &block)
239
+ when String
240
+ if beg && end_
241
+ tid = header.get_tid(region)
242
+ queryi(tid, beg, end_, copy:, &block)
243
+ elsif beg.nil? && end_.nil?
244
+ querys(region, copy:, &block)
245
+ else
246
+ raise ArgumentError, "beg and end_ must be specified together"
247
+ end
214
248
  else
215
- raise ArgumentError, "beg and end_ must be specified together"
249
+ raise ArgumentError, "region must be String or Array"
216
250
  end
217
251
  end
218
252
 
219
- # def pileup
220
- # Pileup.new(self)
221
- # end
253
+ # Pileup iterator over this file. Optional region can be specified.
254
+ # When a block is given, uses RAII-style and ensures the iterator is closed at block end.
255
+ # Without a block, returns an Enumerator over a live Pileup instance; caller should close when done.
256
+ #
257
+ # @param region [String, nil] region string like "chr1:100-200"
258
+ # @param beg [Integer, nil]
259
+ # @param end_ [Integer, nil]
260
+ # @param maxcnt [Integer, nil] cap on depth per position
261
+ def pileup(region = nil, beg = nil, end_: nil, maxcnt: nil, &block)
262
+ check_closed
263
+ if block_given?
264
+ Pileup.open(self, region:, beg:, end_: end_, maxcnt: maxcnt) do |piter|
265
+ piter.each(&block)
266
+ end
267
+ self
268
+ else
269
+ piter = Pileup.new(self, region:, beg:, end_: end_, maxcnt: maxcnt)
270
+ piter.to_enum(:each)
271
+ end
272
+ end
222
273
 
223
274
  private
224
275
 
@@ -238,6 +289,17 @@ module HTS
238
289
  end
239
290
  end
240
291
 
292
+ # Multi-region query implementation
293
+ def query_regions(regions, copy: false, &block)
294
+ if copy
295
+ query_regions_copy(regions, &block)
296
+ else
297
+ query_regions_reuse(regions, &block)
298
+ end
299
+ end
300
+
301
+ # Internal: yield a single reused Record over the entire file.
302
+ # The underlying bam1_t is mutated on each iteration for speed.
241
303
  def each_record_reuse
242
304
  check_closed
243
305
  # Each does not always start at the beginning of the file.
@@ -250,6 +312,7 @@ module HTS
250
312
  self
251
313
  end
252
314
 
315
+ # Internal: yield deep-copied Records so callers may retain them safely.
253
316
  def each_record_copy
254
317
  check_closed
255
318
  return to_enum(__method__) unless block_given?
@@ -301,6 +364,7 @@ module HTS
301
364
  self
302
365
  end
303
366
 
367
+ # Internal: reused-Record iterator over a query iterator.
304
368
  def query_reuse_yield(qiter)
305
369
  bam1 = LibHTS.bam_init1
306
370
  record = Record.new(header, bam1)
@@ -323,5 +387,27 @@ module HTS
323
387
  ensure
324
388
  LibHTS.hts_itr_destroy(qiter)
325
389
  end
390
+
391
+ # Multi-region query using sequential single-region queries
392
+ # Note: This is a fallback implementation. Ideally we would use sam_itr_regarray
393
+ # but there seem to be issues with the multi-region iterator in the current setup.
394
+ def query_regions_reuse(regions, &block)
395
+ return to_enum(__method__, regions) unless block_given?
396
+
397
+ regions.each do |region|
398
+ querys_reuse(region, &block)
399
+ end
400
+ self
401
+ end
402
+
403
+ # Multi-region query with copied Records using sequential queries
404
+ def query_regions_copy(regions, &block)
405
+ return to_enum(__method__, regions) unless block_given?
406
+
407
+ regions.each do |region|
408
+ querys_copy(region, &block)
409
+ end
410
+ self
411
+ end
326
412
  end
327
413
  end