htslib 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,19 @@ module HTS
8
8
  # NOTE: This class has a lot of methods that are not stable.
9
9
  # The method names and the number of arguments may change in the future.
10
10
  class Header
11
# Symbol aliases accepted for a header "Type" attribute, mapped to the
# spelling written into the header line. Grouped one canonical type per row.
BCF_TYPE_MAP = {
  int: "Integer", integer: "Integer", int32: "Integer",
  float: "Float", real: "Float",
  string: "String", str: "String",
  character: "Character", char: "Character",
  flag: "Flag"
}.freeze
23
+
11
24
  def initialize(arg = nil)
12
25
  case arg
13
26
  when LibHTS::HtsFile
@@ -20,6 +33,12 @@ module HTS
20
33
  raise TypeError, "Invalid argument"
21
34
  end
22
35
 
36
+ @sync_depth = 0
37
+ @sync_needed = false
38
+ @subset_samples = nil
39
+ @subset_imap = nil
40
+ @subset_imap_pointer = nil
41
+
23
42
  yield self if block_given?
24
43
  end
25
44
 
@@ -36,13 +55,34 @@ module HTS
36
55
  end
37
56
 
38
57
  def set_version(version)
39
- LibHTS.bcf_hdr_set_version(@bcf_hdr, version)
58
+ rc = LibHTS.bcf_hdr_set_version(@bcf_hdr, version)
59
+ raise "Failed to set VCF header version" if rc.negative?
60
+
61
+ mark_sync_needed!
62
+ sync_if_needed!
63
+ self
40
64
  end
41
65
 
42
66
  def nsamples
43
67
  LibHTS.bcf_hdr_nsamples(@bcf_hdr)
44
68
  end
45
69
 
70
# Number of entries returned by #target_names.
def target_count = target_names.length
73
+
74
# Alias-style wrapper: forwards +name+ to #name2id and returns its result.
def get_tid(name) = name2id(name)
77
+
78
# Alias-style wrapper: forwards +rid+ to #id2name and returns its result.
def target_name(rid) = id2name(rid)
81
+
82
# Alias-style wrapper around #seqnames.
def target_names = seqnames
85
+
46
86
  def samples
47
87
  # bcf_hdr_id2name is macro function
48
88
  @bcf_hdr[:samples]
@@ -50,17 +90,65 @@ module HTS
50
90
  .map(&:read_string)
51
91
  end
52
92
 
93
# Public readers for the sample-subset state. Both values are initialised to
# nil in the constructor and later assigned by set_subset_state.
+ attr_reader :subset_samples, :subset_imap_pointer
94
+
95
# Tells whether subset state has been recorded on this header.
#
# @return [Boolean] false while @subset_imap is nil, true otherwise
def subset?
  return false if @subset_imap.nil?

  true
end
98
+
99
# Number of subset sample names held, or 0 when #subset? is false.
def subset_sample_count
  return 0 unless subset?

  @subset_samples.length
end
102
+
103
# Builds a new header restricted to the given samples.
#
# The names are normalised and validated first; bcf_hdr_subset then produces
# the reduced header and fills an index map, which is composed with any
# subset map already held by this header before being stored on the result.
#
# @param samples a sample name or a collection of names (see
#   normalize_subset_samples for the accepted shapes)
# @return [Header] a new instance of this class with subset state recorded
# @raise [SubsetError] when bcf_hdr_subset returns a null header, plus
#   whatever normalize_subset_samples / validate_subset_samples! raise
def subset(samples)
  subset_samples = normalize_subset_samples(samples)
  validate_subset_samples!(subset_samples)

  imap_pointer = nil
  if subset_samples.empty?
    # No names requested: ask htslib for a header without samples.
    subset_hdr = LibHTS.bcf_hdr_subset(@bcf_hdr, 0, ::FFI::Pointer::NULL, ::FFI::Pointer::NULL)
  else
    # encoded_samples stays referenced here so the string buffers outlive the call.
    encoded_samples = subset_samples.map { |name| FFI::MemoryPointer.from_string(name) }
    sample_pointers = FFI::MemoryPointer.new(:pointer, subset_samples.length)
    sample_pointers.write_array_of_pointer(encoded_samples)
    imap_pointer = FFI::MemoryPointer.new(:int, subset_samples.length)
    subset_hdr = LibHTS.bcf_hdr_subset(@bcf_hdr, subset_samples.length, sample_pointers, imap_pointer)
  end

  raise SubsetError, "Failed to subset BCF header samples #{subset_samples.inspect}" if subset_hdr.to_ptr.null?

  composed_imap = compose_subset_imap(read_subset_imap(imap_pointer, subset_samples.length))
  self.class.new(subset_hdr).tap do |header|
    header.send(:set_subset_state, subset_samples, composed_imap)
  end
end
126
+
53
127
  def add_sample(sample, sync: true)
54
- LibHTS.bcf_hdr_add_sample(@bcf_hdr, sample)
55
- self.sync if sync
128
+ rc = LibHTS.bcf_hdr_add_sample(@bcf_hdr, sample)
129
+ raise "Failed to add sample #{sample}" if rc.negative?
130
+
131
+ mark_sync_needed!
132
+ sync_if_needed! if sync
133
+ self
56
134
  end
57
135
 
58
136
  def merge(hdr)
59
- LibHTS.bcf_hdr_merge(@bcf_hdr, hdr.struct)
137
+ merged = LibHTS.bcf_hdr_merge(@bcf_hdr, hdr.struct)
138
+ raise "Failed to merge BCF headers" if merged.to_ptr.null?
139
+
140
+ @bcf_hdr = merged
141
+ mark_sync_needed!
142
+ sync_if_needed!
143
+ self
60
144
  end
61
145
 
62
146
  def sync
63
- LibHTS.bcf_hdr_sync(@bcf_hdr)
147
+ rc = LibHTS.bcf_hdr_sync(@bcf_hdr)
148
+ raise "Failed to sync BCF header" if rc.negative?
149
+
150
+ @sync_needed = false
151
+ self
64
152
  end
65
153
 
66
154
  def read_bcf(fname)
@@ -68,32 +156,121 @@ module HTS
68
156
  end
69
157
 
70
158
  def append(line)
71
- LibHTS.bcf_hdr_append(@bcf_hdr, line)
159
+ rc = LibHTS.bcf_hdr_append(@bcf_hdr, line)
160
+ raise "Failed to append VCF header line" if rc.negative?
161
+
162
+ mark_sync_needed!
163
+ self
72
164
  end
73
165
 
74
- def delete(bcf_hl_type, key) # FIXME
166
# Removes header records of the given type/key with bcf_hdr_remove.
#
# The header is flagged as needing a sync only when a matching record was
# found beforehand; no sync is attempted here.
#
# NOTE(review): with key = nil the existence probe forwards nil to
# bcf_hdr_get_hrec -- confirm the library tolerates that.
#
# @param bcf_hl_type header-line type, converted by bcf_hl_type_to_int
# @param key record key, forwarded to bcf_hdr_remove
# @return [Boolean] whether a matching record existed before removal
def delete(bcf_hl_type, key = nil) # FIXME
  type = bcf_hl_type_to_int(bcf_hl_type)
  hrec_exists?(bcf_hl_type, key).tap do |existed|
    LibHTS.bcf_hdr_remove(@bcf_hdr, type, key)
    mark_sync_needed! if existed
  end
end
78
173
 
79
174
  def get_hrec(bcf_hl_type, key, value, str_class = nil)
80
175
  type = bcf_hl_type_to_int(bcf_hl_type)
81
- hrec = LibHTS.bcf_hdr_get_hrec(@bcf_hdr, type, key, value, str_class)
82
- HeaderRecord.new(hrec)
176
+ hrec = borrowed_hrec(type, key, value, str_class)
177
+ return nil if hrec.to_ptr.null?
178
+
179
+ HeaderRecord.new(owned_hrec(hrec))
180
+ end
181
+
182
# Yields self with syncing deferred: @sync_depth is raised for the duration
# of the block, and sync_if_needed! runs once it has been lowered again,
# even when the block raises.
#
# @yieldparam header [self]
# @return [self]
def edit
  @sync_depth += 1
  begin
    yield self
  ensure
    @sync_depth -= 1
    sync_if_needed!
  end
  self
end
190
+
191
# Appends a structured "contig" header line.
#
# @param id written as the ID field (via #to_s)
# @param length written as the length field when not nil
# @param attributes extra fields, normalised by normalize_meta_attributes
# @return [self]
def add_contig(id, length: nil, **attributes)
  length_field = length.nil? ? [] : [["length", length.to_s]]
  fields = [["ID", id.to_s], *length_field, *normalize_meta_attributes(attributes)]
  append_structured_meta("contig", fields)
end
197
+
198
# Deletes the "CONTIG" record with the given id, then attempts a sync.
#
# @return the value returned by #delete
def remove_contig(id)
  removed = delete("CONTIG", id.to_s)
  sync_if_needed!
  removed
end
201
+
202
# Appends a structured "FILTER" header line with ID and Description first,
# followed by any extra attributes.
#
# @return [self]
def add_filter(id, description:, **attributes)
  required = [["ID", id.to_s], ["Description", description.to_s]]
  append_structured_meta("FILTER", required + normalize_meta_attributes(attributes))
end
207
+
208
# Deletes the "FILTER" record with the given id, then attempts a sync.
#
# @return the value returned by #delete
def remove_filter(id)
  removed = delete("FILTER", id.to_s)
  sync_if_needed!
  removed
end
211
+
212
# Appends a structured "INFO" header line. Number and Type go through
# normalize_bcf_number / normalize_bcf_type; extra attributes follow the
# four required fields.
#
# @return [self]
def add_info(id, number:, type:, description:, **attributes)
  required = [
    ["ID", id.to_s],
    ["Number", normalize_bcf_number(number)],
    ["Type", normalize_bcf_type(type)],
    ["Description", description.to_s]
  ]
  append_structured_meta("INFO", required + normalize_meta_attributes(attributes))
end
218
+
219
# Replaces an INFO definition: deletes any record with this id, then adds
# the new definition through #add_info.
#
# @return [self]
def update_info(id, number:, type:, description:, **attributes)
  delete("INFO", id.to_s)
  add_info(id, number: number, type: type, description: description, **attributes)
end
223
+
224
# Deletes the "INFO" record with the given id, then attempts a sync.
#
# @return the value returned by #delete
def remove_info(id)
  removed = delete("INFO", id.to_s)
  sync_if_needed!
  removed
end
227
+
228
# Appends a structured "FORMAT" header line. Number and Type go through
# normalize_bcf_number / normalize_bcf_type; extra attributes follow the
# four required fields.
#
# @return [self]
def add_format(id, number:, type:, description:, **attributes)
  required = [
    ["ID", id.to_s],
    ["Number", normalize_bcf_number(number)],
    ["Type", normalize_bcf_type(type)],
    ["Description", description.to_s]
  ]
  append_structured_meta("FORMAT", required + normalize_meta_attributes(attributes))
end
234
+
235
# Replaces a FORMAT definition: deletes any record with this id, then adds
# the new definition through #add_format.
#
# @return [self]
def update_format(id, number:, type:, description:, **attributes)
  delete("FORMAT", id.to_s)
  add_format(id, number: number, type: type, description: description, **attributes)
end
239
+
240
# Deletes the "FORMAT" record with the given id, then attempts a sync.
#
# @return the value returned by #delete
def remove_format(id)
  removed = delete("FORMAT", id.to_s)
  sync_if_needed!
  removed
end
243
+
244
# Appends a generic meta line. With keyword attributes a structured
# "##key=<...>" line is written; without them a plain "##key=value" line.
#
# @return [self]
def add_meta(key, value = nil, **attributes)
  return append_structured_meta(key.to_s, normalize_meta_attributes(attributes)) unless attributes.empty?

  append("###{key}=#{value}")
  sync_if_needed!
  self
end
84
253
 
85
254
  def seqnames
86
255
  n = FFI::MemoryPointer.new(:int)
87
256
  names = LibHTS.bcf_hdr_seqnames(@bcf_hdr, n)
88
- names.read_array_of_pointer(n.read_int)
89
- .map(&:read_string)
257
+ begin
258
+ names.read_array_of_pointer(n.read_int)
259
+ .map(&:read_string)
260
+ ensure
261
+ LibHTS.hts_free(names) unless names.null?
262
+ end
90
263
  end
91
264
 
92
265
  def to_s
93
266
  kstr = LibHTS::KString.new
94
- raise "Failed to get header string" unless LibHTS.bcf_hdr_format(@bcf_hdr, 0, kstr)
267
+ begin
268
+ raise "Failed to get header string" if LibHTS.bcf_hdr_format(@bcf_hdr, 0, kstr).negative?
95
269
 
96
- kstr[:s]
270
+ kstr.read_string_copy
271
+ ensure
272
+ kstr.free_buffer
273
+ end
97
274
  end
98
275
 
99
276
  def name2id(name)
@@ -106,6 +283,82 @@ module HTS
106
283
 
107
284
  private
108
285
 
286
# Maps a type alias to the spelling listed in BCF_TYPE_MAP; anything not
# listed is returned as its own string form.
def normalize_bcf_type(type)
  BCF_TYPE_MAP.fetch(type.to_sym) { type.to_s }
end
289
+
290
# Converts a "Number" argument to header text. The symbols :a/:A, :r/:R and
# :g/:G become "A", "R" and "G"; :variable, :var and :dot become ".";
# every other value is returned via #to_s.
def normalize_bcf_number(number)
  return "A" if %i[a A].include?(number)
  return "R" if %i[r R].include?(number)
  return "G" if %i[g G].include?(number)
  return "." if %i[variable var dot].include?(number)

  number.to_s
end
299
+
300
# Turns keyword attributes into [key, value] string pairs. Keys are split on
# "_" and every part after the first is capitalized before re-joining;
# Array values are comma-joined, everything else goes through #to_s.
def normalize_meta_attributes(attributes)
  attributes.map do |key, value|
    head, *tail = key.to_s.split("_")
    text = value.is_a?(Array) ? value.join(",") : value.to_s
    [[head, *tail.map(&:capitalize)].join, text]
  end
end
307
+
308
# Renders +fields+ as "key=value" pairs inside "##label=<...>", appends the
# line and attempts a sync.
#
# @param fields [Array<Array>] ordered [key, value] pairs
# @return [self]
def append_structured_meta(label, fields)
  pairs = fields.map { |key, value| "#{key}=#{format_meta_value(key, value)}" }
  append("###{label}=<#{pairs.join(',')}>")
  sync_if_needed!
  self
end
314
+
315
# Decides whether a structured-meta value is written bare or quoted.
# "Description" values are always quoted; other values stay bare only when
# they consist solely of alphanumerics and the characters _ . : + -
def format_meta_value(key, value)
  bare = key != "Description" && value.match?(/\A[[:alnum:]_.:+-]+\z/)
  bare ? value : quote_meta_value(value)
end
321
+
322
# Wraps +value+ in double quotes, prefixing every embedded backslash and
# double quote with a backslash.
#
# The block form of gsub is used on purpose: in a replacement *string*,
# backslash sequences are interpreted by gsub itself, which makes it easy to
# emit a literal "1" instead of the matched character.
#
# @param value [String]
# @return [String] the quoted, escaped text
def quote_meta_value(value)
  escaped = value.gsub(/[\\"]/) { |char| "\\#{char}" }
  %("#{escaped}")
end
325
+
326
# Records that the header has changes awaiting a sync.
#
# @return [true]
def mark_sync_needed!
  @sync_needed = true
end
329
+
330
# Calls #sync only when a sync is pending and no #edit block is open
# (@sync_depth is zero). Returns nil when nothing was done.
def sync_if_needed!
  return unless @sync_needed
  return unless @sync_depth.zero?

  sync
end
333
+
334
# Checks whether a header record matching the type/key can be looked up.
#
# @return [Boolean] true when the lookup returns a non-null record
def hrec_exists?(bcf_hl_type, key)
  type = bcf_hl_type_to_int(bcf_hl_type)
  found = borrowed_hrec(type, *hrec_lookup_args(type, key))
  !found.to_ptr.null?
end
340
+
341
# Thin wrapper over LibHTS.bcf_hdr_get_hrec for this header; the result is
# returned as-is (callers duplicate it via owned_hrec before wrapping).
def borrowed_hrec(type, key, value, str_class) = LibHTS.bcf_hdr_get_hrec(@bcf_hdr, type, key, value, str_class)
344
+
345
# Duplicates a header record with bcf_hrec_dup.
#
# @return the duplicated record
# @raise [RuntimeError] when the duplicate wraps a null pointer
def owned_hrec(hrec)
  copy = LibHTS.bcf_hrec_dup(hrec)
  raise "Failed to duplicate BCF header record" if copy.to_ptr.null?

  copy
end
350
+
351
# Builds the [key, value, str_class] triple passed to borrowed_hrec.
# BCF_HL_GEN records are looked up by their own key; every other type is
# looked up as ID=<key>.
def hrec_lookup_args(type, key)
  return [key, nil, nil] if type == LibHTS::BCF_HL_GEN

  ["ID", key, nil]
end
361
+
109
362
  def bcf_hl_type_to_int(bcf_hl_type)
110
363
  return bcf_hl_type if bcf_hl_type.is_a?(Integer)
111
364
 
@@ -129,6 +382,60 @@ module HTS
129
382
 
130
383
  def initialize_copy(orig)
131
384
  @bcf_hdr = LibHTS.bcf_hdr_dup(orig.struct)
385
+ @sync_depth = 0
386
+ @sync_needed = false
387
+ set_subset_state(orig.subset_samples, orig.send(:subset_imap))
388
+ end
389
+
390
+ protected
391
+
392
# Reader for the composed subset index map; declared under `protected`, and
# read from another instance in initialize_copy.
+ attr_reader :subset_imap
393
+
394
# Stores copies of the subset sample names and index map (nil stays nil) and
# rebuilds the native int-array pointer from the stored map.
def set_subset_state(samples, imap)
  @subset_samples = samples.nil? ? nil : samples.dup
  @subset_imap = imap.nil? ? nil : imap.dup
  @subset_imap_pointer = build_subset_imap_pointer(@subset_imap)
end
399
+
400
+ private
401
+
402
# Coerces the argument of #subset into an array of name strings. A String is
# wrapped as a single name; anything else goes through Array() and #to_s.
#
# @raise [SubsetError] when the coercion raises TypeError
def normalize_subset_samples(samples)
  return [samples] if samples.is_a?(String)

  Array(samples).map(&:to_s)
rescue TypeError
  raise SubsetError, "Sample subset must be a String or an Array of sample names"
end
412
+
413
# Rejects subset requests that repeat a name or name a sample this header
# does not have.
#
# The header's sample list is read once (a single #samples call) rather than
# once per requested name.
#
# @param subset_samples [Array<String>] normalised sample names
# @return [nil]
# @raise [SubsetError] when a name occurs more than once
# @raise [UnknownSampleError] when a name is not among #samples
def validate_subset_samples!(subset_samples)
  duplicates = subset_samples.tally.select { |_name, count| count > 1 }.keys
  raise SubsetError, "Duplicate sample names in subset: #{duplicates.join(', ')}" unless duplicates.empty?

  # Duplicates are excluded above, so the set difference is exactly the
  # unknown names in request order.
  missing = subset_samples - samples
  raise UnknownSampleError, "Unknown sample names: #{missing.join(', ')}" unless missing.empty?
end
420
+
421
# Reads +length+ ints from +pointer+; a zero length yields an empty array
# without touching the pointer.
def read_subset_imap(pointer, length)
  length.zero? ? [] : pointer.read_array_of_int(length)
end
426
+
427
# Translates each index in +imap+ through this header's existing subset map,
# or through an identity map sized to #samples when no subset map is held.
#
# @raise [IndexError] when an index falls outside the base map
def compose_subset_imap(imap)
  base = @subset_imap || (0...samples.length).to_a
  imap.map { |position| base.fetch(position) }
end
431
+
432
# Copies +imap+ into a freshly allocated native int array.
#
# @return [FFI::MemoryPointer, nil] nil when +imap+ is nil/false or empty
def build_subset_imap_pointer(imap)
  return nil if !imap || imap.empty?

  native = FFI::MemoryPointer.new(:int, imap.length)
  native.write_array_of_int(imap)
  native
end
133
440
  end
134
441
  end
@@ -31,8 +31,12 @@ module HTS
31
31
 
32
32
  def to_s
33
33
  kstr = LibHTS::KString.new
34
- LibHTS.bcf_hrec_format(@bcf_hrec, kstr)
35
- kstr[:s]
34
+ begin
35
+ LibHTS.bcf_hrec_format(@bcf_hrec, kstr)
36
+ kstr.read_string_copy
37
+ ensure
38
+ kstr.free_buffer
39
+ end
36
40
  end
37
41
 
38
42
  private