htslib 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c98f3937d65f091e9e834060f3a94578034d1bf55e7bc372e5ed388e618a6da4
4
- data.tar.gz: 8e180044fef210935695bc60c0352b747b570b705f3ce9318b7538d19ddaf645
3
+ metadata.gz: e1bf158506931c62ffae1a524158de9fbb451796a68a7586cf788203f67a8cc4
4
+ data.tar.gz: d3289551dac8783cfa23f1f8d44e1b4be44b6dab3d6369f816491ceea653188f
5
5
  SHA512:
6
- metadata.gz: 5324913bdbe97580fee1e54b6268f22aed2a0b84d449c8a4f62529981c82a57d6ae5cea7cc77ff1be7332425eea259bfe5daa1107b6144e36ef40bb4997dd28f
7
- data.tar.gz: 7b230700951223aeda09d64fac7cca44d734d54ec03a7836c5a0fb036e552854db80b9ecaf57aa478e317c684e4712f2f03244e02474fd2ea53a4b93346d1371
6
+ metadata.gz: d1c316a599c2dc08e6589f980e9dd4ae6d8d6bababbfef424f35e5c25453b667bdc48570578e88a9eba142553fffb25b49ae4de54a061f99f7cbf286988ec618
7
+ data.tar.gz: 8529c6a02c354419dd722abc0bd6f27ec514ffe980f4b0d7014914de11551a45c4e96b803bc77255c3b423129a8743a3c39556d9d3d44ec6c6692556485379cf
data/README.md CHANGED
@@ -6,6 +6,9 @@
6
6
  [![DOI](https://zenodo.org/badge/247078205.svg)](https://zenodo.org/badge/latestdoi/247078205)
7
7
  [![Docs Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://rubydoc.info/gems/htslib)
8
8
 
9
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/kojix2/ruby-htslib)
10
+ [![Lines of Code](https://img.shields.io/endpoint?url=https%3A%2F%2Ftokei.kojix2.net%2Fbadge%2Fgithub%2Fkojix2%2Fruby-htslib%2Flines)](https://tokei.kojix2.net/github/kojix2/ruby-htslib)
11
+
9
12
  Ruby-htslib provides [Ruby](https://www.ruby-lang.org) bindings for [HTSlib](https://github.com/samtools/htslib), a C library for high-throughput sequencing data formats. It allows you to read and write file formats commonly used in genomics, such as [SAM, BAM, VCF, and BCF](http://samtools.github.io/hts-specs/), in the Ruby language.
10
13
 
11
14
  :apple: Feel free to fork it!
@@ -0,0 +1,343 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HTS
4
+ class Bam < Hts
5
+ # Base modification information from MM/ML tags
6
+ #
7
+ # This class provides access to DNA/RNA base modifications such as methylation.
8
+ # It wraps the htslib base modification API and provides a Ruby-friendly interface.
9
+ #
10
+ # @note BaseMod is a view object that references data in a Record.
11
+ # The state is maintained in hts_base_mod_state structure.
12
+ class BaseMod
13
+ include Enumerable
14
+
15
+ class NotParsedError < StandardError; end
16
+
17
+ attr_reader :record
18
+
19
# One base modification call, holding the raw integer fields it was built
# with plus character / probability conveniences derived from them.
class Modification
  attr_reader :modified_base, :canonical_base, :strand, :qual

  # @param modified_base [Integer] Modification code as char or -ChEBI
  # @param canonical_base [Integer] Canonical base (A, C, G, T, N)
  # @param strand [Integer] 0 or 1 for +/- strand
  # @param qual [Integer] Quality (256*probability) or -1 if unknown
  def initialize(modified_base:, canonical_base:, strand:, qual:)
    @modified_base = modified_base
    @canonical_base = canonical_base
    @strand = strand
    @qual = qual
  end

  # Modification code in string form.
  # @return [String] the single character for positive codes, otherwise the
  #   integer rendered as a string (e.g. "-76792")
  def code
    @modified_base.positive? ? @modified_base.chr : @modified_base.to_s
  end

  # Canonical base as a one-character string.
  # @return [String] Single character (A, C, G, T, N)
  def canonical
    @canonical_base.chr
  end

  # Likelihood expressed as qual / 256.0.
  #
  # Any negative qual is reported as unknown, which is the same rule #to_s
  # applies, so the two methods can never disagree.
  # @return [Float, nil] Probability, or nil when qual is negative
  def probability
    return nil if @qual.negative?

    @qual / 256.0
  end

  # @return [Hash] raw fields together with the derived code, canonical and
  #   probability values
  def to_h
    {
      modified_base: @modified_base,
      code: code,
      canonical_base: @canonical_base,
      canonical: canonical,
      strand: @strand,
      qual: @qual,
      probability: probability
    }
  end

  # @return [String] "C->m(0.5)" style text; the parenthesised probability
  #   is omitted when it is unknown
  def to_s
    prob = probability
    return "#{canonical}->#{code}" if prob.nil?

    "#{canonical}->#{code}(#{prob.round(3)})"
  end

  # @return [String] Inspect string
  def inspect
    "#<HTS::Bam::BaseMod::Modification #{self}>"
  end
end
84
+
85
# All modifications found at one position of the query sequence.
class Position
  attr_reader :position, :modifications

  # @param position [Integer] Position in query sequence
  # @param modifications [Array<Modification>] Modifications at this position
  def initialize(position, modifications)
    @position = position
    @modifications = modifications
  end

  # @return [Boolean] true when at least one modification has code "m"
  def methylated?
    code?("m")
  end

  # @return [Boolean] true when at least one modification has code "h"
  def hydroxymethylated?
    code?("h")
  end

  # @return [Hash] the position plus each modification converted with #to_h
  def to_h
    { position: @position, modifications: @modifications.map { |mod| mod.to_h } }
  end

  # @return [String] "pos=N [mod, mod, ...]"
  def to_s
    listing = @modifications.map { |mod| mod.to_s }.join(", ")
    "pos=#{@position} [#{listing}]"
  end

  # @return [String] Inspect string
  def inspect
    "#<HTS::Bam::BaseMod::Position #{self}>"
  end

  private

  # True when any modification reports the given code string.
  def code?(wanted)
    @modifications.any? { |mod| mod.code == wanted }
  end
end
130
+
131
# Create a BaseMod view over a record and allocate the htslib state it uses.
# @param record [Record] The BAM record to extract modifications from
# @param auto_parse [Boolean] When truthy, MM/ML are parsed lazily on first access
# @raise [Error] if hts_base_mod_state_alloc hands back a NULL pointer
def initialize(record, auto_parse: true)
  @record = record
  @state = LibHTS.hts_base_mod_state_alloc
  @parsed = false
  @closed = false
  # Normalise whatever was passed to a strict true/false.
  @auto_parse = auto_parse ? true : false
  return unless @state.null?

  raise Error, "Failed to allocate hts_base_mod_state"
end
142
+
143
# Release this object's reference to the htslib state.
#
# Nothing is freed by hand here: per the original note the state is held by
# an AutoPointer, so dropping the reference is enough and avoids a double
# free. The parsed flag is cleared too, so #to_s / #inspect report
# "not parsed" instead of iterating with a released state.
# Calling it again is a no-op.
# @return [void]
def close
  return if @closed

  @state = nil
  @parsed = false
  @closed = true
end
153
+
154
# Tell whether the MM/ML tags of the record have been parsed yet.
# @return [Boolean] value of the internal parsed flag
def parsed?
  @parsed
end
159
+
160
# Make sure MM/ML have been parsed, parsing lazily when auto_parse is on.
#
# A closed object is rejected up front: its state reference has been
# dropped, so the callers of this method would otherwise hand a nil state
# to the C functions.
# @param flags [Integer] forwarded to #parse when a lazy parse is needed
# @return [void]
# @raise [Error] if the object has been closed
# @raise [NotParsedError] if nothing was parsed yet and auto_parse is disabled
def ensure_parsed!(flags = 0)
  raise Error, "BaseMod is closed" if @closed
  return if @parsed

  raise NotParsedError, "BaseMod is not parsed. Call #parse first (auto_parse is disabled)." unless @auto_parse

  parse(flags)
end
170
+
171
# Parse the MM and ML tags of the record into the htslib state.
#
# at_pos and each_position call this again on an already parsed object to
# reset the state before they read from it.
# @param flags [Integer] Parsing flags (default: 0)
# @return [Integer] the non-negative value returned by bam_parse_basemod2
#   (documented upstream in this file as the number of modification types)
# @raise [Error] if the object has been closed, or if bam_parse_basemod2
#   returns a negative value
def parse(flags = 0)
  # After #close the state reference is nil; never pass that to C.
  raise Error, "BaseMod is closed" if @closed

  ret = LibHTS.bam_parse_basemod2(@record.struct, @state, flags)
  raise Error, "Failed to parse base modifications" if ret.negative?

  @parsed = true
  ret
end
182
+
183
# Look up the modifications reported for one query position.
# @param position [Integer] Query position (0-based)
# @param max_mods [Integer] Maximum number of modifications to retrieve
# @return [Position, nil] Position object, or nil when bam_mods_at_qpos
#   reports nothing (a return value of zero or less)
def at_pos(position, max_mods: 10)
  # Parse again when already parsed so the lookup starts from a reset state
  # even after an earlier iteration; otherwise fall back to the lazy parse.
  if parsed?
    parse
  else
    ensure_parsed!
  end

  buffer = FFI::MemoryPointer.new(LibHTS::HtsBaseMod, max_mods)
  found = LibHTS.bam_mods_at_qpos(@record.struct, position, @state,
                                  buffer, max_mods)
  return nil unless found.positive?

  # Never read more entries than the buffer was sized for.
  build_position(position, buffer, [found, max_mods].min)
end
199
+
200
# Index-style shorthand for #at_pos using its default max_mods.
# @param position [Integer] Query position (0-based)
# @return [Position, nil] Position object with modifications, or nil if none
def [](position)
  at_pos(position)
end
206
+
207
# Walk every position for which bam_next_basemod reports modifications.
# @param max_mods [Integer] Maximum number of modifications per position
# @yield [Position] Position object for each modified position
# @return [Enumerator] If no block given
def each_position(max_mods: 10)
  return enum_for(__method__, max_mods: max_mods) unless block_given?

  # Parse again when already parsed so a repeated enumeration starts over.
  if parsed?
    parse
  else
    ensure_parsed!
  end

  pos_buf = FFI::MemoryPointer.new(:int)
  mods_buf = FFI::MemoryPointer.new(LibHTS::HtsBaseMod, max_mods)

  loop do
    count = LibHTS.bam_next_basemod(@record.struct, @state,
                                    mods_buf, max_mods, pos_buf)
    # Zero or a negative value ends the walk.
    break unless count.positive?

    yield build_position(pos_buf.read_int, mods_buf, [count, max_mods].min)
  end
end

alias each each_position
231
+
232
# List the modification codes bam_mods_recorded reports for this record.
# @return [Array<Integer>] Modification codes (char code or -ChEBI); empty
#   when the reported count is not positive or the returned pointer is NULL
def modification_types
  ensure_parsed!

  count_buf = FFI::MemoryPointer.new(:int)
  codes = LibHTS.bam_mods_recorded(@state, count_buf)
  count = count_buf.read_int

  if count.positive? && !codes.null?
    codes.read_array_of_int(count)
  else
    []
  end
end

alias recorded_types modification_types
247
+
248
# Query information about a specific modification type by code.
#
# String codes are converted before the lookup:
# * a purely numeric string ("76792" or "-76792", i.e. what
#   Modification#code returns for ChEBI-coded modifications) becomes the
#   negative integer -ChEBI;
# * any other string is reduced to the ordinal of its first character.
# @param code [Integer, String] Modification code (char code or -ChEBI,
#   a single char string, or a numeric ChEBI string)
# @return [Hash, nil] Hash with canonical, strand, implicit info, or nil
#   when bam_mods_query_type returns a negative value
def query_type(code)
  ensure_parsed!

  if code.is_a?(String)
    code = code.match?(/\A-?\d+\z/) ? -code.to_i.abs : code.ord
  end

  strand_ptr = FFI::MemoryPointer.new(:int)
  implicit_ptr = FFI::MemoryPointer.new(:int)
  canonical_ptr = FFI::MemoryPointer.new(:char, 1)

  ret = LibHTS.bam_mods_query_type(@state, code, strand_ptr,
                                   implicit_ptr, canonical_ptr)
  return nil if ret.negative?

  {
    canonical: canonical_ptr.read_char.chr,
    strand: strand_ptr.read_int,
    implicit: implicit_ptr.read_int != 0
  }
end
270
+
271
# Describe the modification type stored at a given index.
# @param index [Integer] Modification type index (0-based)
# @return [Hash, nil] Hash with code, canonical, strand, implicit info, or
#   nil when bam_mods_queryi returns a negative value
def query_type_at(index)
  ensure_parsed!

  strand_buf = FFI::MemoryPointer.new(:int)
  implicit_buf = FFI::MemoryPointer.new(:int)
  canonical_buf = FFI::MemoryPointer.new(:char, 1)

  status = LibHTS.bam_mods_queryi(@state, index, strand_buf,
                                  implicit_buf, canonical_buf)
  return nil if status.negative?

  # The code itself is taken from the list of recorded types.
  recorded = modification_types
  canonical_char = canonical_buf.read_char.chr
  strand = strand_buf.read_int
  implicit = !implicit_buf.read_int.zero?

  { code: recorded[index], canonical: canonical_char, strand: strand, implicit: implicit }
end
293
+
294
# Collect every modified position into an array.
# @return [Array<Position>] All positions yielded by #each_position
def to_a
  collected = []
  each_position { |pos| collected << pos }
  collected
end
299
+
300
# Debug-oriented text form.
# @return [String] a "(not parsed)" placeholder before parsing, otherwise
#   the space-separated text of every position
def to_s
  return "#<HTS::Bam::BaseMod (not parsed)>" unless @parsed

  rendered = each_position.map(&:to_s)
  "#<HTS::Bam::BaseMod #{rendered.join(' ')}>"
end
311
+
312
# Same text as #to_s.
# @return [String] Inspect string
def inspect
  to_s
end
317
+
318
+ private
319
+
320
# Turn the first n_mods entries of an HtsBaseMod array into a Position.
# @param position [Integer] Query position
# @param mods_ptr [FFI::Pointer] Pointer to array of HtsBaseMod structures
# @param n_mods [Integer] Number of entries to read
# @return [Position] Position object
def build_position(position, mods_ptr, n_mods)
  entries = (0...n_mods).map do |idx|
    # Entries are laid out back to back, one struct size apart.
    raw = LibHTS::HtsBaseMod.new(mods_ptr + idx * LibHTS::HtsBaseMod.size)

    Modification.new(
      modified_base: raw[:modified_base],
      canonical_base: raw[:canonical_base],
      strand: raw[:strand],
      qual: raw[:qual]
    )
  end

  Position.new(position, entries)
end
341
+ end
342
+ end
343
+ end
@@ -111,6 +111,23 @@ module HTS
111
111
  name2tid(name)
112
112
  end
113
113
 
114
# Append a @PG (program) line to the header via sam_hdr_add_pg.
#
# Every option becomes a key/value pair of string varargs (both sides are
# converted with #to_s), and the argument list is terminated with a NULL
# pointer.
#
# According to the notes shipped with this method, sam_hdr_add_pg:
# - generates a unique ID if the specified one clashes
# - manages PP (previous program) chains automatically
#
# @param program_name [String] Name of the program
# @param options [Hash] Key-value pairs for @PG tags (ID, PN, VN, CL, PP, etc.)
# @return [Integer] 0 on success, -1 on failure
#
# @example
#   header.add_pg("bwa", VN: "0.7.17", CL: "bwa mem ref.fa read.fq")
#   header.add_pg("samtools", VN: "1.15", PP: "bwa")
def add_pg(program_name, **options)
  tag_args = options.each_with_object([]) do |(tag, value), acc|
    acc.push(:string, tag.to_s, :string, value.to_s)
  end

  LibHTS.sam_hdr_add_pg(@sam_hdr, program_name, *tag_args, :pointer, FFI::Pointer::NULL)
end
130
+
114
131
  private
115
132
 
116
133
  def name2tid(name)
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HTS
4
+ class Bam < Hts
5
+ # High-level mpileup iterator over multiple BAM/CRAM inputs
6
+ class Mpileup
7
+ include Enumerable
8
+
9
# Create an Mpileup; with a block, yield it and close it afterwards.
#
# Usage:
#   HTS::Bam::Mpileup.open([bam1, bam2], region: "chr1:1-100") do |mpl|
#     mpl.each { |cols| ... }
#   end
#
# All arguments are forwarded to .new. The new object is returned in both
# forms; in the block form it has already been closed, even when the block
# raised.
def self.open(*args, **kw)
  mpileup = new(*args, **kw)

  if block_given?
    begin
      yield mpileup
    ensure
      mpileup.close
    end
  end

  mpileup
end
24
+
25
# Set up a pileup over several inputs.
#
# Inputs are normalized to HTS::Bam instances: an HTS::Bam is used as is, a
# String is opened with HTS::Bam.open and closed again by #close.
#
# If anything fails after resources were acquired (files opened here,
# region iterators, the mpileup iterator), #close is called before the
# error is re-raised, so a failed construction does not leak them.
#
# @param inputs [Array<HTS::Bam, String>] non-empty list of inputs
# @param region [String, nil] region string handed to sam_itr_querys, or the
#   name resolved with header.get_tid when beg and end_ are both given
# @param beg [Integer, nil] start handed to sam_itr_queryi (needs end_ and region)
# @param end_ [Integer, nil] end handed to sam_itr_queryi (needs beg and region)
# @param maxcnt [Integer, nil] forwarded to bam_mplp_set_maxcnt when given
# @param overlaps [Boolean] when truthy, bam_mplp_init_overlaps is called
# @raise [ArgumentError] for empty inputs, an unsupported input type, or an
#   incomplete region/beg/end_ combination
# @raise [RuntimeError] when an index is missing for a region query or an
#   htslib call reports failure
def initialize(inputs, region: nil, beg: nil, end_: nil, maxcnt: nil, overlaps: false)
  raise ArgumentError, "inputs must be non-empty" if inputs.nil? || inputs.empty?

  # Everything #close touches exists before the first step that can fail.
  @owned_bams = [] # Bams we opened here; will be closed on close
  @bams = []
  @iters = []
  @data_blocks = [] # per-input packed pointers kept alive
  @iter = nil

  begin
    @bams = inputs.map { |input| resolve_input(input) }
    @bams.each_with_index do |bam, i|
      @iters << region_iterator_for(bam, i, region, beg, end_)
    end

    # Build per-input packed pointer blocks so C passes them back to the callback.
    # Layout per input: [0] hts_fp (htsFile*), [1] hdr_struct (bam_hdr_t*), [2] itr (hts_itr_t* or NULL)
    n = @bams.length
    ptr_size = FFI.type_size(:pointer)
    # Held in an ivar so the array of blocks stays alive while C uses it.
    @data_array = FFI::MemoryPointer.new(:pointer, n)
    @bams.each_with_index do |bam, i|
      itr = @iters[i]
      block = FFI::MemoryPointer.new(:pointer, 3)
      block.put_pointer(0 * ptr_size, bam.instance_variable_get(:@hts_file))
      block.put_pointer(1 * ptr_size, bam.header.struct)
      block.put_pointer(2 * ptr_size, itr && !itr.null? ? itr : FFI::Pointer::NULL)
      @data_blocks << block
      @data_array.put_pointer(i * ptr_size, block)
    end

    @cb = FFI::Function.new(:int, %i[pointer pointer]) do |data, b|
      # Unpack pointers from the per-input block
      hts_fp = data.get_pointer(0 * ptr_size)
      hdr_struct = data.get_pointer(1 * ptr_size)
      itr = data.get_pointer(2 * ptr_size)
      # HTSlib contract: return same as sam_itr_next/sam_read1 (>= 0 on success, -1 on EOF, < -1 on error)
      if itr && !itr.null?
        HTS::LibHTS.sam_itr_next(hts_fp, itr, b)
      else
        HTS::LibHTS.sam_read1(hts_fp, hdr_struct, b)
      end
    end

    @iter = HTS::LibHTS.bam_mplp_init(n, @cb, @data_array)
    raise "bam_mplp_init failed" if @iter.null?

    HTS::LibHTS.bam_mplp_set_maxcnt(@iter, maxcnt) if maxcnt
    if overlaps
      rc = HTS::LibHTS.bam_mplp_init_overlaps(@iter)
      raise "bam_mplp_init_overlaps failed" if rc.negative?
    end
  rescue StandardError
    # Release whatever was acquired so far, then let the error through.
    close
    raise
  end
end

# Map one input to an HTS::Bam, recording files opened here as owned.
private def resolve_input(input)
  case input
  when HTS::Bam
    input
  when String
    bam = HTS::Bam.open(input)
    @owned_bams << bam
    bam
  else
    raise ArgumentError, "Unsupported input type: #{input.class}"
  end
end

# Create the region iterator for one input, or return nil when no region
# was requested.
private def region_iterator_for(bam, input_no, region, beg, end_)
  if region && beg.nil? && end_.nil?
    raise "Index required for region mpileup" unless bam.index_loaded?

    itr = HTS::LibHTS.sam_itr_querys(bam.instance_variable_get(:@idx), bam.header.struct, region)
    raise "Failed to query region on input ##{input_no}: #{region}" if itr.null?

    itr
  elsif region && beg && end_
    raise "Index required for region mpileup" unless bam.index_loaded?

    tid = bam.header.get_tid(region)
    itr = HTS::LibHTS.sam_itr_queryi(bam.instance_variable_get(:@idx), tid, beg, end_)
    raise "Failed to query region on input ##{input_no}: #{region} #{beg} #{end_}" if itr.null?

    itr
  elsif beg || end_
    # Reached with only one of beg/end_, or with beg/end_ but no region.
    raise ArgumentError, "beg and end_ must be specified together" if region

    raise ArgumentError, "region (reference name) is required when beg/end_ are given"
  end
end
108
+
109
# Yields an array of Pileup::PileupColumn (one per input) for each position.
#
# An input with no alignments at a position (count not positive, or a NULL
# pileup pointer) still gets a column, with an empty alignments array.
#
# @yield [Array<HTS::Bam::Pileup::PileupColumn>] one column per input
# @return [Enumerator] when no block is given
# @return [self] after the iteration finished
# @raise [RuntimeError] if the iterator was already closed, or if
#   bam_mplp64_auto returns a negative value (previously this was silently
#   treated as the end of the data)
def each
  return to_enum(__method__) unless block_given?
  # #close replaces the iterator with NULL; never hand that to C.
  raise "Mpileup is closed" if @iter.nil? || @iter.null?

  n = @bams.length
  tid_ptr = FFI::MemoryPointer.new(:int)
  pos_ptr = FFI::MemoryPointer.new(:long_long)
  n_ptr = FFI::MemoryPointer.new(:int, n)
  plp_ptr = FFI::MemoryPointer.new(:pointer, n)
  plp1_size = HTS::LibHTS::BamPileup1.size
  headers = @bams.map(&:header)

  while (rc = HTS::LibHTS.bam_mplp64_auto(@iter, tid_ptr, pos_ptr, n_ptr, plp_ptr)).positive?
    tid = tid_ptr.read_int
    pos = pos_ptr.read_long_long
    counts = n_ptr.read_array_of_int(n)
    plp_arr = plp_ptr.read_array_of_pointer(n)

    cols = Array.new(n) do |i|
      count = counts[i]
      base_ptr = plp_arr[i]

      aligns =
        if count <= 0 || base_ptr.null?
          []
        else
          # Pileup entries sit back to back, one BamPileup1 apart.
          Array.new(count) do |j|
            entry = HTS::LibHTS::BamPileup1.new(base_ptr + (j * plp1_size))
            HTS::Bam::Pileup::PileupRecord.new(entry, headers[i])
          end
        end

      HTS::Bam::Pileup::PileupColumn.new(tid: tid, pos: pos, alignments: aligns)
    end

    yield cols
  end

  raise "bam_mplp64_auto failed (returned #{rc})" if rc.negative?

  self
end
154
+
155
# Tear down the mpileup iterator, the per-input region iterators and the
# Bam objects this instance opened itself. Inputs passed in as HTS::Bam are
# left open. Safe to call more than once.
def close
  mplp = @iter
  if mplp && !mplp.null?
    HTS::LibHTS.bam_mplp_destroy(mplp)
    @iter = FFI::Pointer::NULL
  end

  @iters.each do |region_itr|
    next unless region_itr && !region_itr.null?

    HTS::LibHTS.hts_itr_destroy(region_itr)
  end
  @iters.clear

  # Keep references to callback and data blocks to prevent GC
  @_keepalive = [@cb, @data_array, *@data_blocks]

  # Close owned bams opened by this object; a failing close is ignored so
  # the remaining ones are still closed.
  @owned_bams.each do |owned|
    begin
      owned.close
    rescue StandardError
      nil
    end
  end
  @owned_bams.clear
end
173
+ end
174
+ end
175
+ end