htslib 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,6 @@ module HTS
5
5
  class Format
6
6
  def initialize(record)
7
7
  @record = record
8
- @p1 = FFI::MemoryPointer.new(:pointer) # FIXME: naming
9
8
  end
10
9
 
11
10
  # @note: Why is this method named "get" instead of "fetch"?
@@ -13,70 +12,137 @@ module HTS
13
12
  # which provides methods like `get_int`, `get_float`, etc.
14
13
  # I think they are better than `fetch_int` and `fetch_float`.
15
14
  def get(key, type = nil)
16
- n = FFI::MemoryPointer.new(:int)
17
- p1 = @p1
18
- h = @record.header.struct
19
- r = @record.struct
15
+ return get_typed(key, type) unless type.nil?
20
16
 
21
- format_values = proc do |typ|
22
- ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, typ)
23
- return nil if ret < 0 # return from method.
17
+ return decode_genotypes if key == "GT"
24
18
 
25
- p1.read_pointer
19
+ case header_format_type(key)
20
+ when :int
21
+ decode_integer_values(key)
22
+ when :float
23
+ decode_float_values(key)
24
+ when :flag
25
+ raise_unsupported_format_flag(key)
26
+ when :string
27
+ get_string_values(key)
26
28
  end
29
+ end
27
30
 
31
# Reads a FORMAT field without per-sample decoding.
#
# @param key [String] FORMAT tag name
# @param type [Symbol, String, nil] explicit type; when nil it is taken
#   from the header, except for "GT"
# @return [Object, nil] whatever the selected reader returns; nil when the
#   resolved type matches none of the branches below
def get_raw(key, type = nil)
  # The GT FORMAT field is special in that it is marked as a string in the header,
  # but it is actually encoded as an integer.
  resolved =
    if !type.nil?
      type.to_sym
    elsif key == "GT"
      :int
    else
      header_format_type(key)
    end

  case resolved
  when :string, :str
    return decode_genotypes if key == "GT"

    raise_unsupported_format_flag(key)
    get_string_values(key)
  when :flag
    raise_unsupported_format_flag(key)
  when :float, :real
    raise_unsupported_format_flag(key)
    get_float_words(key)
  when :int, :int32
    raise_unsupported_format_flag(key)
    get_numeric_values(key, LibHTS::BCF_HT_INT, "integer") do |buffer, count|
      buffer.read_array_of_int32(count)
    end
  end
end
55
56
 
56
57
  # For compatibility with HTS.cr.
57
58
  def get_int(key)
58
- get(key, :int)
59
+ get_raw(key, :int)
59
60
  end
60
61
 
61
62
  # For compatibility with HTS.cr.
62
63
  def get_float(key)
63
- get(key, :float)
64
+ get_typed(key, :float)
64
65
  end
65
66
 
66
67
  # For compatibility with HTS.cr.
67
68
  def get_flag(key)
68
- get(key, :flag)
69
+ get_raw(key, :flag)
69
70
  end
70
71
 
71
72
  # For compatibility with HTS.cr.
72
73
  def get_string(key)
73
- get(key, :string)
74
+ get_raw(key, :string)
75
+ end
76
+
77
+ # For compatibility with HTS.cr.
78
# Reads the "GT" field as int32 values via +get_numeric_values+.
#
# @return [Array<Integer>, nil] the values read from the buffer, or nil
#   when +get_numeric_values+ returns nil
def get_genotypes
  get_numeric_values("GT", LibHTS::BCF_HT_INT, "genotype") do |buffer, count|
    buffer.read_array_of_int32(count)
  end
end
75
81
 
76
82
  def [](key)
77
83
  get(key)
78
84
  end
79
85
 
86
# Writes integer values into FORMAT field +key+.
#
# @param key [String] FORMAT tag name; "GT" is rejected
# @param values [Array<Integer>, Integer] passed through +normalize_int_values+
# @return [Integer] the return code accepted by +check_update_rc!+
# @raise [UnsupportedFormatOperationError] when +key+ is "GT"
def update_int(key, values)
  raise UnsupportedFormatOperationError, "Use update_genotypes for GT" if key == "GT"

  ensure_expected_format_type!(key, :int, "integer")
  ints = normalize_int_values(values)
  count = ints.size
  validate_numeric_sample_count!(key, count)

  # Copy the Ruby integers into an int32 buffer for the library call.
  buffer = FFI::MemoryPointer.new(:int32, count)
  buffer.write_array_of_int32(ints)
  status = LibHTS.bcf_update_format_int32(@record.header.struct, @record.struct, key, buffer, count)
  check_update_rc!(status, key)
end
98
+
99
# Writes float values into FORMAT field +key+.
#
# @param key [String] FORMAT tag name
# @param values [Array<Numeric>, Numeric] passed through +normalize_float_values+
# @return [Integer] the return code accepted by +check_update_rc!+
def update_float(key, values)
  ensure_expected_format_type!(key, :float, "float")
  floats = normalize_float_values(values)
  count = floats.size
  validate_numeric_sample_count!(key, count)

  # Copy the Ruby floats into a float buffer for the library call.
  buffer = FFI::MemoryPointer.new(:float, count)
  buffer.write_array_of_float(floats)
  status = LibHTS.bcf_update_format_float(@record.header.struct, @record.struct, key, buffer, count)
  check_update_rc!(status, key)
end
109
+
110
# Writes string values into FORMAT field +key+.
#
# @param key [String] FORMAT tag name; "GT" is rejected
# @param values [Array<String>, String] passed through +normalize_string_values+
# @return [Integer] the return code accepted by +check_update_rc!+
# @raise [UnsupportedFormatOperationError] when +key+ is "GT"
def update_string(key, values)
  raise UnsupportedFormatOperationError, "Use update_genotypes for GT" if key == "GT"

  ensure_expected_format_type!(key, :string, "string")
  texts = normalize_string_values(values)
  validate_string_sample_count!(key, texts.size)

  # One native string per value; the local keeps them referenced while the
  # pointer table built from them is in use.
  c_strings = texts.map { |text| FFI::MemoryPointer.from_string(text) }
  table = FFI::MemoryPointer.new(:pointer, c_strings.size)
  table.write_array_of_pointer(c_strings)
  status = LibHTS.bcf_update_format_string(@record.header.struct, @record.struct, key, table, texts.size)
  check_update_rc!(status, key)
end
123
+
124
# Writes already-encoded genotype integers into the "GT" field.
#
# @param values [Array<Integer>, Integer] passed through +normalize_int_values+
# @return [Integer] the return code accepted by +check_update_rc!+
def update_genotypes(values)
  ensure_gt_defined!

  encoded = normalize_int_values(values)
  count = encoded.size
  validate_numeric_sample_count!("GT", count)

  buffer = FFI::MemoryPointer.new(:int32, count)
  buffer.write_array_of_int32(encoded)
  status = LibHTS.bcf_update_genotypes(@record.header.struct, @record.struct, buffer, count)
  check_update_rc!(status, "GT")
end
134
+
135
# Removes FORMAT field +key+ from the record.
#
# @param key [String] FORMAT tag name
# @return [Boolean] false when the header has no type for +key+ or
#   +format_present?+ is false; true after a successful update call
# @raise [FormatUpdateError] when the update call returns a negative code
def delete(key)
  return false if header_format_type(key).nil? || !format_present?(key)

  hts_type =
    if key == "GT"
      LibHTS::BCF_HT_INT
    else
      header_format_type_code(key)
    end

  # A NULL buffer with a count of zero is what this call passes to the library.
  status = LibHTS.bcf_update_format(@record.header.struct, @record.struct, key, FFI::Pointer::NULL, 0, hts_type)
  raise FormatUpdateError, "Failed to delete FORMAT field '#{key}': #{status}" if status.negative?

  true
end
145
+
80
146
  def fields
81
147
  ids.map do |id|
82
148
  name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
@@ -112,6 +178,266 @@ module HTS
112
178
 
113
179
  private
114
180
 
181
# Fetches the values of FORMAT field +key+ and hands the native buffer to
# the caller's block.
#
# @param key [String] FORMAT tag name
# @param hts_type [Integer] type code passed to the library call
# @param expected_type [String] label used in error messages
# @yieldparam buffer [FFI::Pointer] the buffer written by the library
# @yieldparam count [Integer] the normalized return code of the call
# @return [Object, nil] the block's value, or nil when
#   +normalize_format_rc+ returns nil
def get_numeric_values(key, hts_type, expected_type)
  capacity = FFI::MemoryPointer.new(:int)
  capacity.write_int(0)
  out = FFI::MemoryPointer.new(:pointer)
  out.write_pointer(FFI::Pointer::NULL)

  raw_rc = LibHTS.bcf_get_format_values(@record.header.struct, @record.struct, key, out, capacity, hts_type)
  count = normalize_format_rc(raw_rc, key, expected_type)
  return nil unless count

  buffer = out.read_pointer
  begin
    yield(buffer, count)
  ensure
    # The buffer is released here, so the block must copy what it needs.
    LibHTS.hts_free(buffer) unless buffer.null?
    out.write_pointer(FFI::Pointer::NULL)
  end
end
199
+
200
# Reads string FORMAT field +key+ as one Ruby String per sample.
#
# @param key [String] FORMAT tag name
# @return [Array<String>, nil] one entry per header sample, or nil when
#   +normalize_format_rc+ returns nil
def get_string_values(key)
  capacity = FFI::MemoryPointer.new(:int)
  capacity.write_int(0)
  out = FFI::MemoryPointer.new(:pointer)
  out.write_pointer(FFI::Pointer::NULL)

  raw_rc = LibHTS.bcf_get_format_string(@record.header.struct, @record.struct, key, out, capacity)
  return nil unless normalize_format_rc(raw_rc, key, "string")

  table = out.read_pointer
  samples = @record.header.nsamples
  begin
    table.read_array_of_pointer(samples).map(&:read_string)
  ensure
    unless table.null?
      # Release what the first entry points to, then the pointer table itself.
      first_entry = samples.positive? ? table.get_pointer(0) : FFI::Pointer::NULL
      LibHTS.hts_free(first_entry) unless first_entry.null?
      LibHTS.hts_free(table)
    end
    out.write_pointer(FFI::Pointer::NULL)
  end
end
223
+
224
# Decodes an integer FORMAT field into per-sample values.
#
# @param key [String] FORMAT tag name
# @return [Array, nil] one element per sample: a single value when
#   +scalar_format?+ is true, otherwise an array; nil when the raw read
#   returns nothing
def decode_integer_values(key)
  raw = get_raw(key, :int)
  return nil unless raw

  per_sample = split_sample_values(raw)
  scalar = scalar_format?(key)
  per_sample.map do |chunk|
    trimmed = trim_integer_vector_end(chunk)
    scalar ? map_integer_missing_value(trimmed.first) : map_integer_missing(trimmed)
  end
end
239
+
240
# Decodes a float FORMAT field into per-sample values.
#
# @param key [String] FORMAT tag name
# @return [Array, nil] one element per sample: a single value when
#   +scalar_format?+ is true, otherwise an array; nil when
#   +get_float_words+ returns nothing
def decode_float_values(key)
  words = get_float_words(key)
  return nil unless words

  per_sample = split_sample_values(words)
  scalar = scalar_format?(key)
  per_sample.map do |chunk|
    trimmed = trim_float_vector_end(chunk)
    scalar ? decode_float_word(trimmed.first) : map_float_words(trimmed)
  end
end
255
+
256
# Decodes the "GT" field into one genotype string per sample.
#
# @return [Array<String>, nil] nil when +get_genotypes+ returns nothing
def decode_genotypes
  encoded = get_genotypes
  return nil unless encoded

  split_sample_values(encoded).map do |alleles|
    trimmed = trim_genotype_vector_end(alleles)
    decode_genotype_sample(trimmed)
  end
end
264
+
265
# Renders one sample's encoded alleles as a genotype string.
#
# Each allele is "." when the library reports it missing, otherwise its
# allele number. Every allele after the first is prefixed with "|" when the
# library reports it phased and "/" otherwise.
#
# @param values [Array<Integer>] encoded alleles for a single sample
# @return [String]
def decode_genotype_sample(values)
  tokens = values.each_with_index.map do |encoded, position|
    allele = LibHTS.bcf_gt_is_missing(encoded) != 0 ? "." : LibHTS.bcf_gt_allele(encoded).to_s

    if position.zero?
      allele
    else
      joiner = LibHTS.bcf_gt_is_phased(encoded) != 0 ? "|" : "/"
      "#{joiner}#{allele}"
    end
  end

  tokens.join
end
279
+
280
# Cuts a flat value array into equally sized per-sample slices.
#
# @param values [Array] flat values for all samples
# @return [Array<Array>] one slice per header sample; empty when the header
#   reports no samples
# @raise [FormatReadError] when the value count is not a multiple of the
#   sample count
def split_sample_values(values)
  samples = @record.header.nsamples
  return [] unless samples.positive?

  width, leftover = values.size.divmod(samples)
  raise FormatReadError, "Failed to split FORMAT values by sample" unless leftover.zero?

  (0...samples).map { |index| values[index * width, width] }
end
292
+
293
# Keeps the leading values that come before the first one the library
# reports as a genotype vector-end marker.
#
# @param values [Array<Integer>] encoded alleles for one sample
# @return [Array<Integer>] a new array; all values when no marker is found
def trim_genotype_vector_end(values)
  values.take_while { |encoded| LibHTS.bcf_gt_is_vector_end(encoded) == 0 }
end
297
+
298
# Keeps the leading values that come before the first int32 vector-end
# sentinel.
#
# @param values [Array<Integer>] raw integers for one sample
# @return [Array<Integer>] a new array; all values when no sentinel is found
def trim_integer_vector_end(values)
  values.take_while { |number| number != LibHTS.bcf_int32_vector_end }
end
302
+
303
# Keeps the leading words that come before the first float vector-end
# sentinel.
#
# @param values [Array<Integer>] raw 32-bit words for one sample
# @return [Array<Integer>] a new array; all words when no sentinel is found
def trim_float_vector_end(values)
  sentinel = LibHTS.bcf_float_vector_end
  values.take_while { |word| word != sentinel }
end
307
+
308
# Applies +map_integer_missing_value+ to every element.
#
# @param values [Array<Integer>]
# @return [Array<Integer, nil>]
def map_integer_missing(values)
  values.map(&method(:map_integer_missing_value))
end
311
+
312
# Turns the int32 "missing" sentinel into nil and passes anything else
# through unchanged.
#
# @param value [Integer, nil]
# @return [Integer, nil]
def map_integer_missing_value(value)
  return nil if value == LibHTS.bcf_int32_missing

  value
end
315
+
316
# Applies +decode_float_word+ to every element.
#
# @param values [Array<Integer>] raw 32-bit words
# @return [Array<Float, nil>]
def map_float_words(values)
  values.map(&method(:decode_float_word))
end
319
+
320
# Converts one raw 32-bit word into a Ruby Float.
#
# The word is compared with the "missing" and "vector end" bit patterns
# before conversion, so those sentinels come back as nil rather than NaN.
#
# @param value [Integer, nil] raw 32-bit word. nil is accepted because
#   +decode_float_values+ passes `.first` of a trimmed per-sample vector,
#   which is nil when that vector is empty.
# @return [Float, nil] the decoded value; nil for nil input or a sentinel
def decode_float_word(value)
  # Without this guard `[nil].pack("V")` raises TypeError.
  return nil if value.nil?
  return nil if value == LibHTS.bcf_float_missing
  return nil if value == LibHTS.bcf_float_vector_end

  # Reinterpret the 32-bit pattern as a single-precision float.
  [value].pack("V").unpack1("e")
end
326
+
327
# Reads a float FORMAT field as unsigned 32-bit words instead of Floats.
#
# @param key [String] FORMAT tag name
# @return [Array<Integer>, nil] nil when +get_numeric_values+ returns nil
def get_float_words(key)
  get_numeric_values(key, LibHTS::BCF_HT_REAL, "float") do |buffer, count|
    buffer.get_array_of_uint32(0, count)
  end
end
330
+
331
# Reads +key+ using an explicit type.
#
# :float and :real are read as raw words and decoded one by one; every
# other type is delegated to +get_raw+.
#
# @param key [String] FORMAT tag name
# @param type [Symbol, String] requested type
# @return [Object, nil]
def get_typed(key, type)
  return get_raw(key, type) unless %i[float real].include?(type.to_sym)

  raise_unsupported_format_flag(key)
  words = get_float_words(key)
  return nil if words.nil?

  words.map { |word| decode_float_word(word) }
end
341
+
342
# Interprets the return code of a FORMAT read call.
#
# @param rc [Integer] return code from the library
# @param key [String] FORMAT tag name, used in error messages
# @param expected_type [String] type label, used in error messages
# @return [Integer, nil] nil for -1 and -3, otherwise +rc+ unchanged
# @raise [FormatTypeError] when +rc+ is -2
# @raise [FormatReadError] when +rc+ is -4
def normalize_format_rc(rc, key, expected_type)
  return nil if [-1, -3].include?(rc)
  raise FormatTypeError, "Tag #{key} is not #{expected_type} FORMAT field" if rc == -2
  raise FormatReadError, "Failed to read FORMAT/#{key}" if rc == -4

  rc
end
354
+
355
# Rejects +key+ when the header declares it as a flag; otherwise does
# nothing.
#
# @param key [String] FORMAT tag name
# @return [nil]
# @raise [UnsupportedFormatOperationError] when the header type is :flag
def raise_unsupported_format_flag(key)
  if header_format_type(key) == :flag
    raise UnsupportedFormatOperationError, "FORMAT flag fields are not supported: #{key}"
  end

  nil
end
361
+
362
# Verifies that +key+ is defined in the header with the expected type.
#
# @param key [String] FORMAT tag name
# @param expected_type [Symbol] type the caller requires
# @param label [String] type name used in the error message
# @return [nil]
# @raise [FormatDefinitionError] when the header has no type for +key+
# @raise [FormatTypeError] when the header type differs from +expected_type+
def ensure_expected_format_type!(key, expected_type, label)
  declared = header_format_type(key)
  raise FormatDefinitionError, "FORMAT tag #{key} not defined in header" if declared.nil?

  raise_unsupported_format_flag(key)
  return nil if declared == expected_type

  raise FormatTypeError, "Tag #{key} is not #{label} FORMAT field"
end
369
+
370
# Verifies that the header has a type for the "GT" tag.
#
# @return [nil]
# @raise [FormatDefinitionError] when +header_format_type+ returns nil
def ensure_gt_defined!
  return nil unless header_format_type("GT").nil?

  raise FormatDefinitionError, "FORMAT tag GT not defined in header"
end
373
+
374
# Interprets the return code of a FORMAT update call.
#
# @param rc [Integer] return code from the library
# @param key [String] FORMAT tag name, used in error messages
# @return [Integer] +rc+ when it is zero or positive
# @raise [FormatDefinitionError] when +rc+ is -1
# @raise [FormatUpdateError] for any other negative +rc+
def check_update_rc!(rc, key)
  raise FormatDefinitionError, "FORMAT tag #{key} not defined in header" if rc == -1
  raise FormatUpdateError, "Failed to update FORMAT field '#{key}': #{rc}" if rc.negative?

  rc
end
386
+
387
# Checks that a numeric value count is a whole multiple of the number of
# samples in the header.
#
# @param key [String] FORMAT tag name, used in the error message
# @param value_count [Integer] number of values about to be written
# @return [nil]
# @raise [ArgumentError] when there are no samples or the count does not
#   divide evenly
def validate_numeric_sample_count!(key, value_count)
  sample_count = @record.header.nsamples
  raise ArgumentError, "FORMAT fields require at least one sample" unless sample_count.positive?

  unless (value_count % sample_count).zero?
    raise ArgumentError, "FORMAT values for #{key} must be divisible by sample count (#{sample_count})"
  end

  nil
end
394
+
395
# Checks that exactly one string is supplied per sample in the header.
#
# @param key [String] FORMAT tag name, used in the error message
# @param value_count [Integer] number of strings about to be written
# @return [nil]
# @raise [ArgumentError] when there are no samples or the counts differ
def validate_string_sample_count!(key, value_count)
  sample_count = @record.header.nsamples
  raise ArgumentError, "FORMAT fields require at least one sample" unless sample_count.positive?

  if value_count != sample_count
    raise ArgumentError, "FORMAT string values for #{key} must provide one entry per sample (#{sample_count})"
  end

  nil
end
402
+
403
# Coerces +values+ to an array and validates it as int32 data.
#
# @param values [Array<Integer>, Integer] value or values to write
# @return [Array<Integer>] the coerced array
# @raise [ArgumentError] when the array is empty or holds a non-Integer
# @raise [RangeError] when a value fails +int32_range?+
def normalize_int_values(values)
  list = Array(values)
  raise ArgumentError, "Cannot update FORMAT field with empty array. Use delete instead." if list.empty?
  raise ArgumentError, "FORMAT integer values must all be Integer" unless list.all? { |item| item.is_a?(Integer) }
  raise RangeError, "FORMAT integer values must fit int32" unless list.all?(&method(:int32_range?))

  list
end
411
+
412
# Coerces +values+ to an array of Floats.
#
# @param values [Array<Numeric>, Numeric] value or values to write
# @return [Array<Float>] every element converted with +to_f+
# @raise [ArgumentError] when the array is empty or holds a non-Numeric
def normalize_float_values(values)
  list = Array(values)
  raise ArgumentError, "Cannot update FORMAT field with empty array. Use delete instead." if list.empty?
  raise ArgumentError, "FORMAT float values must all be Numeric" unless list.all? { |item| item.is_a?(Numeric) }

  list.map { |item| item.to_f }
end
419
+
420
# Coerces +values+ to an array and validates that every element is a String.
#
# @param values [Array<String>, String] value or values to write
# @return [Array<String>] the coerced array
# @raise [ArgumentError] when the array is empty or holds a non-String
def normalize_string_values(values)
  list = Array(values)
  raise ArgumentError, "Cannot update FORMAT field with empty array. Use delete instead." if list.empty?
  raise ArgumentError, "FORMAT string values must all be String" unless list.all? { |item| item.is_a?(String) }

  list
end
427
+
428
# Tells whether the record yields a value for FORMAT field +key+.
#
# The check reads the field with the getter that matches its header type
# and tests the result for nil.
#
# @param key [String] FORMAT tag name
# @return [Boolean] false when the header type is not :int, :float or
#   :string (and the key is not "GT")
def format_present?(key)
  return !get_genotypes.nil? if key == "GT"

  found =
    case header_format_type(key)
    when :int    then get_int(key)
    when :float  then get_float(key)
    when :string then get_string(key)
    else return false
    end

  !found.nil?
end
440
+
115
441
  def fmt_ptr
116
442
  @record.struct[:d][:fmt].to_ptr
117
443
  end
@@ -135,15 +461,43 @@ module HTS
135
461
  nil
136
462
  end
137
463
 
464
# Tells whether +header_format_number+ reports exactly 1 for +key+.
#
# @param key [String] FORMAT tag name
# @return [Boolean]
def scalar_format?(key)
  declared_number = header_format_number(key)
  declared_number == 1
end
467
+
468
# Looks up the header's number entry for FORMAT tag +key+.
#
# @param key [String] FORMAT tag name
# @return [Integer, nil] nil when the tag id is negative or the header has
#   no FORMAT entry for it
def header_format_number(key)
  header = @record.header.struct
  tag_id = LibHTS.bcf_hdr_id2int(header, LibHTS::BCF_DT_ID, key)
  return nil if tag_id.negative? || !LibHTS.bcf_hdr_idinfo_exists(header, LibHTS::BCF_HL_FMT, tag_id)

  LibHTS.bcf_hdr_id2number(header, LibHTS::BCF_HL_FMT, tag_id)
end
475
+
476
# Looks up the header's type code for FORMAT tag +key+.
#
# @param key [String] FORMAT tag name
# @return [Integer, nil] nil when the tag id is negative or the header has
#   no FORMAT entry for it
def header_format_type_code(key)
  header = @record.header.struct
  tag_id = LibHTS.bcf_hdr_id2int(header, LibHTS::BCF_DT_ID, key)
  return nil if tag_id.negative? || !LibHTS.bcf_hdr_idinfo_exists(header, LibHTS::BCF_HL_FMT, tag_id)

  LibHTS.bcf_hdr_id2type(header, LibHTS::BCF_HL_FMT, tag_id)
end
483
+
484
# Returns the header type of FORMAT tag +key+ as a symbol.
#
# @param key [String] FORMAT tag name
# @return [Symbol, nil] the result of +ht_type_to_sym+ for the tag's type code
def header_format_type(key)
  type_code = header_format_type_code(key)
  ht_type_to_sym(type_code)
end
487
+
138
488
  def ht_type_to_sym(t)
139
489
  case t
140
490
  when LibHTS::BCF_HT_FLAG then :flag
141
491
  when LibHTS::BCF_HT_INT then :int
142
492
  when LibHTS::BCF_HT_REAL then :float
143
493
  when LibHTS::BCF_HT_STR then :string
144
- when LibHTS::BCF_HT_LONG then :float
494
+ when LibHTS::BCF_HT_LONG then :int64
145
495
  end
146
496
  end
497
+
498
# Tells whether +value+ lies within the signed 32-bit integer range.
#
# @param value [Integer]
# @return [Boolean]
def int32_range?(value)
  value.between?(-2_147_483_648, 2_147_483_647)
end
147
501
  end
148
502
  end
149
503
  end