htslib 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/TUTORIAL.md +67 -0
- data/lib/hts/bam/auxi.rb +329 -2
- data/lib/hts/bam/cigar.rb +10 -2
- data/lib/hts/bam/header.rb +293 -6
- data/lib/hts/bam/mpileup.rb +7 -7
- data/lib/hts/bam/record.rb +23 -15
- data/lib/hts/bam.rb +32 -22
- data/lib/hts/bcf/errors.rb +27 -0
- data/lib/hts/bcf/format.rb +386 -32
- data/lib/hts/bcf/header.rb +320 -13
- data/lib/hts/bcf/header_record.rb +6 -2
- data/lib/hts/bcf/info.rb +269 -28
- data/lib/hts/bcf/record.rb +9 -5
- data/lib/hts/bcf.rb +163 -34
- data/lib/hts/faidx.rb +110 -73
- data/lib/hts/hts.rb +4 -1
- data/lib/hts/libhts/constants.rb +41 -3
- data/lib/hts/libhts/cram.rb +0 -5
- data/lib/hts/libhts/fai.rb +13 -8
- data/lib/hts/libhts/hfile.rb +4 -4
- data/lib/hts/libhts/hts.rb +6 -0
- data/lib/hts/libhts/sam.rb +20 -4
- data/lib/hts/libhts/vcf.rb +10 -7
- data/lib/hts/libhts/vcf_funcs.rb +31 -2
- data/lib/hts/tabix.rb +29 -2
- data/lib/hts/version.rb +1 -1
- metadata +3 -3
- data/lib/hts/faidx/sequence.rb +0 -62
data/lib/hts/bcf/header.rb
CHANGED
|
@@ -8,6 +8,19 @@ module HTS
|
|
|
8
8
|
# NOTE: This class has a lot of methods that are not stable.
|
|
9
9
|
# The method names and the number of arguments may change in the future.
|
|
10
10
|
class Header
|
|
11
|
+
BCF_TYPE_MAP = {
|
|
12
|
+
int: "Integer",
|
|
13
|
+
integer: "Integer",
|
|
14
|
+
int32: "Integer",
|
|
15
|
+
float: "Float",
|
|
16
|
+
real: "Float",
|
|
17
|
+
string: "String",
|
|
18
|
+
str: "String",
|
|
19
|
+
character: "Character",
|
|
20
|
+
char: "Character",
|
|
21
|
+
flag: "Flag"
|
|
22
|
+
}.freeze
|
|
23
|
+
|
|
11
24
|
def initialize(arg = nil)
|
|
12
25
|
case arg
|
|
13
26
|
when LibHTS::HtsFile
|
|
@@ -20,6 +33,12 @@ module HTS
|
|
|
20
33
|
raise TypeError, "Invalid argument"
|
|
21
34
|
end
|
|
22
35
|
|
|
36
|
+
@sync_depth = 0
|
|
37
|
+
@sync_needed = false
|
|
38
|
+
@subset_samples = nil
|
|
39
|
+
@subset_imap = nil
|
|
40
|
+
@subset_imap_pointer = nil
|
|
41
|
+
|
|
23
42
|
yield self if block_given?
|
|
24
43
|
end
|
|
25
44
|
|
|
@@ -36,13 +55,34 @@ module HTS
|
|
|
36
55
|
end
|
|
37
56
|
|
|
38
57
|
def set_version(version)
|
|
39
|
-
LibHTS.bcf_hdr_set_version(@bcf_hdr, version)
|
|
58
|
+
rc = LibHTS.bcf_hdr_set_version(@bcf_hdr, version)
|
|
59
|
+
raise "Failed to set VCF header version" if rc.negative?
|
|
60
|
+
|
|
61
|
+
mark_sync_needed!
|
|
62
|
+
sync_if_needed!
|
|
63
|
+
self
|
|
40
64
|
end
|
|
41
65
|
|
|
42
66
|
def nsamples
|
|
43
67
|
LibHTS.bcf_hdr_nsamples(@bcf_hdr)
|
|
44
68
|
end
|
|
45
69
|
|
|
70
|
+
def target_count
|
|
71
|
+
target_names.size
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def get_tid(name)
|
|
75
|
+
name2id(name)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def target_name(rid)
|
|
79
|
+
id2name(rid)
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def target_names
|
|
83
|
+
seqnames
|
|
84
|
+
end
|
|
85
|
+
|
|
46
86
|
def samples
|
|
47
87
|
# bcf_hdr_id2name is macro function
|
|
48
88
|
@bcf_hdr[:samples]
|
|
@@ -50,17 +90,65 @@ module HTS
|
|
|
50
90
|
.map(&:read_string)
|
|
51
91
|
end
|
|
52
92
|
|
|
93
|
+
attr_reader :subset_samples, :subset_imap_pointer
|
|
94
|
+
|
|
95
|
+
def subset?
|
|
96
|
+
!@subset_imap.nil?
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def subset_sample_count
|
|
100
|
+
subset? ? @subset_samples.length : 0
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
# Builds a new Header restricted to the given samples.
#
# @param samples [String, Array<#to_s>] sample name(s) to keep; an empty list
#   is passed to bcf_hdr_subset as "0 samples"
# @return [Header] a new header wrapping the result of bcf_hdr_subset, with the
#   subset sample names and the index map composed with this header's own map
# @raise [SubsetError] if names are duplicated, the argument cannot be
#   converted to an Array, or bcf_hdr_subset returns NULL
# @raise [UnknownSampleError] if a name is not among this header's samples
def subset(samples)
  subset_samples = normalize_subset_samples(samples)
  validate_subset_samples!(subset_samples)

  imap_pointer = nil
  if subset_samples.empty?
    subset_hdr = LibHTS.bcf_hdr_subset(@bcf_hdr, 0, ::FFI::Pointer::NULL, ::FFI::Pointer::NULL)
  else
    # encoded_samples holds the string buffers that sample_pointers points at.
    encoded_samples = subset_samples.map { |name| FFI::MemoryPointer.from_string(name) }
    sample_pointers = FFI::MemoryPointer.new(:pointer, subset_samples.length)
    sample_pointers.write_array_of_pointer(encoded_samples)
    imap_pointer = FFI::MemoryPointer.new(:int, subset_samples.length)
    subset_hdr = LibHTS.bcf_hdr_subset(@bcf_hdr, subset_samples.length, sample_pointers, imap_pointer)
  end

  raise SubsetError, "Failed to subset BCF header samples #{subset_samples.inspect}" if subset_hdr.to_ptr.null?

  # Translate the indices written by bcf_hdr_subset through this header's own
  # map, so a subset of a subset still carries indices relative to the base map.
  composed_imap = compose_subset_imap(read_subset_imap(imap_pointer, subset_samples.length))
  self.class.new(subset_hdr).tap do |header|
    header.send(:set_subset_state, subset_samples, composed_imap)
  end
end
|
|
126
|
+
|
|
53
127
|
def add_sample(sample, sync: true)
|
|
54
|
-
LibHTS.bcf_hdr_add_sample(@bcf_hdr, sample)
|
|
55
|
-
|
|
128
|
+
rc = LibHTS.bcf_hdr_add_sample(@bcf_hdr, sample)
|
|
129
|
+
raise "Failed to add sample #{sample}" if rc.negative?
|
|
130
|
+
|
|
131
|
+
mark_sync_needed!
|
|
132
|
+
sync_if_needed! if sync
|
|
133
|
+
self
|
|
56
134
|
end
|
|
57
135
|
|
|
58
136
|
def merge(hdr)
|
|
59
|
-
LibHTS.bcf_hdr_merge(@bcf_hdr, hdr.struct)
|
|
137
|
+
merged = LibHTS.bcf_hdr_merge(@bcf_hdr, hdr.struct)
|
|
138
|
+
raise "Failed to merge BCF headers" if merged.to_ptr.null?
|
|
139
|
+
|
|
140
|
+
@bcf_hdr = merged
|
|
141
|
+
mark_sync_needed!
|
|
142
|
+
sync_if_needed!
|
|
143
|
+
self
|
|
60
144
|
end
|
|
61
145
|
|
|
62
146
|
def sync
|
|
63
|
-
LibHTS.bcf_hdr_sync(@bcf_hdr)
|
|
147
|
+
rc = LibHTS.bcf_hdr_sync(@bcf_hdr)
|
|
148
|
+
raise "Failed to sync BCF header" if rc.negative?
|
|
149
|
+
|
|
150
|
+
@sync_needed = false
|
|
151
|
+
self
|
|
64
152
|
end
|
|
65
153
|
|
|
66
154
|
def read_bcf(fname)
|
|
@@ -68,32 +156,121 @@ module HTS
|
|
|
68
156
|
end
|
|
69
157
|
|
|
70
158
|
def append(line)
|
|
71
|
-
LibHTS.bcf_hdr_append(@bcf_hdr, line)
|
|
159
|
+
rc = LibHTS.bcf_hdr_append(@bcf_hdr, line)
|
|
160
|
+
raise "Failed to append VCF header line" if rc.negative?
|
|
161
|
+
|
|
162
|
+
mark_sync_needed!
|
|
163
|
+
self
|
|
72
164
|
end
|
|
73
165
|
|
|
74
|
-
def delete(bcf_hl_type, key) # FIXME
|
|
166
|
+
# Removes header records of the given type via bcf_hdr_remove.
#
# @param bcf_hl_type [Integer, Object] header line type, converted by bcf_hl_type_to_int
# @param key [String, nil] record key, forwarded unchanged to bcf_hdr_remove
# @return [Boolean] whether a matching record was found before removal;
#   always true when key is nil, because no lookup is possible
def delete(bcf_hl_type, key = nil) # FIXME
  # With a nil key there is nothing to look up, and probing would hand a NULL
  # value to bcf_hdr_get_hrec. Skip the probe and assume the header changed.
  # NOTE(review): htslib documents a NULL key for bcf_hdr_remove as "remove all
  # records of this type" — confirm against the linked htslib version.
  existed = key.nil? || hrec_exists?(bcf_hl_type, key)
  type = bcf_hl_type_to_int(bcf_hl_type)
  LibHTS.bcf_hdr_remove(@bcf_hdr, type, key)
  mark_sync_needed! if existed
  existed
end
|
|
78
173
|
|
|
79
174
|
def get_hrec(bcf_hl_type, key, value, str_class = nil)
|
|
80
175
|
type = bcf_hl_type_to_int(bcf_hl_type)
|
|
81
|
-
hrec =
|
|
82
|
-
|
|
176
|
+
hrec = borrowed_hrec(type, key, value, str_class)
|
|
177
|
+
return nil if hrec.to_ptr.null?
|
|
178
|
+
|
|
179
|
+
HeaderRecord.new(owned_hrec(hrec))
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def edit
|
|
183
|
+
@sync_depth += 1
|
|
184
|
+
yield self
|
|
185
|
+
self
|
|
186
|
+
ensure
|
|
187
|
+
@sync_depth -= 1
|
|
188
|
+
sync_if_needed!
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
def add_contig(id, length: nil, **attributes)
|
|
192
|
+
fields = [["ID", id.to_s]]
|
|
193
|
+
fields << ["length", length.to_s] unless length.nil?
|
|
194
|
+
fields.concat normalize_meta_attributes(attributes)
|
|
195
|
+
append_structured_meta("contig", fields)
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
def remove_contig(id)
|
|
199
|
+
delete("CONTIG", id.to_s).tap { sync_if_needed! }
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def add_filter(id, description:, **attributes)
|
|
203
|
+
fields = [["ID", id.to_s], ["Description", description.to_s]]
|
|
204
|
+
fields.concat normalize_meta_attributes(attributes)
|
|
205
|
+
append_structured_meta("FILTER", fields)
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
def remove_filter(id)
|
|
209
|
+
delete("FILTER", id.to_s).tap { sync_if_needed! }
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
def add_info(id, number:, type:, description:, **attributes)
|
|
213
|
+
fields = [["ID", id.to_s], ["Number", normalize_bcf_number(number)], ["Type", normalize_bcf_type(type)],
|
|
214
|
+
["Description", description.to_s]]
|
|
215
|
+
fields.concat normalize_meta_attributes(attributes)
|
|
216
|
+
append_structured_meta("INFO", fields)
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
def update_info(id, number:, type:, description:, **attributes)
|
|
220
|
+
delete("INFO", id.to_s)
|
|
221
|
+
add_info(id, number:, type:, description:, **attributes)
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
def remove_info(id)
|
|
225
|
+
delete("INFO", id.to_s).tap { sync_if_needed! }
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
def add_format(id, number:, type:, description:, **attributes)
|
|
229
|
+
fields = [["ID", id.to_s], ["Number", normalize_bcf_number(number)], ["Type", normalize_bcf_type(type)],
|
|
230
|
+
["Description", description.to_s]]
|
|
231
|
+
fields.concat normalize_meta_attributes(attributes)
|
|
232
|
+
append_structured_meta("FORMAT", fields)
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
def update_format(id, number:, type:, description:, **attributes)
|
|
236
|
+
delete("FORMAT", id.to_s)
|
|
237
|
+
add_format(id, number:, type:, description:, **attributes)
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def remove_format(id)
|
|
241
|
+
delete("FORMAT", id.to_s).tap { sync_if_needed! }
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
# Appends a generic meta line to the header.
#
# With keyword attributes the line is written in structured form
# (##key=<...>); otherwise it is written as a plain ##key=value line.
#
# @param key [#to_s] meta key
# @param value [#to_s, nil] value used for the plain form
# @param attributes [Hash] fields used for the structured form
# @return [Header] self
def add_meta(key, value = nil, **attributes)
  return append_structured_meta(key.to_s, normalize_meta_attributes(attributes)) unless attributes.empty?

  append("###{key}=#{value}")
  sync_if_needed!
  self
end
|
|
84
253
|
|
|
85
254
|
# Returns the sequence names reported by bcf_hdr_seqnames.
#
# @return [Array<String>] sequence names; empty when the library returns NULL
def seqnames
  n = FFI::MemoryPointer.new(:int)
  names = LibHTS.bcf_hdr_seqnames(@bcf_hdr, n)
  # Reading through a NULL pointer raises, so bail out before touching it.
  return [] if names.null?

  begin
    names.read_array_of_pointer(n.read_int)
         .map(&:read_string)
  ensure
    # The array returned by bcf_hdr_seqnames is released here after copying the names.
    LibHTS.hts_free(names)
  end
end
|
|
91
264
|
|
|
92
265
|
def to_s
|
|
93
266
|
kstr = LibHTS::KString.new
|
|
94
|
-
|
|
267
|
+
begin
|
|
268
|
+
raise "Failed to get header string" if LibHTS.bcf_hdr_format(@bcf_hdr, 0, kstr).negative?
|
|
95
269
|
|
|
96
|
-
|
|
270
|
+
kstr.read_string_copy
|
|
271
|
+
ensure
|
|
272
|
+
kstr.free_buffer
|
|
273
|
+
end
|
|
97
274
|
end
|
|
98
275
|
|
|
99
276
|
def name2id(name)
|
|
@@ -106,6 +283,82 @@ module HTS
|
|
|
106
283
|
|
|
107
284
|
private
|
|
108
285
|
|
|
286
|
+
# Maps a type alias (e.g. :int, "str") to the spelling stored in BCF_TYPE_MAP.
# Values without an entry are returned as their string form.
#
# @param type [#to_sym, #to_s]
# @return [String]
def normalize_bcf_type(type)
  BCF_TYPE_MAP.fetch(type.to_sym) { type.to_s }
end
|
|
289
|
+
|
|
290
|
+
# Converts a Number specification into its header spelling.
#
# Accepts Symbols and Strings in any letter case, matching how
# normalize_bcf_type accepts both: a/r/g map to "A"/"R"/"G", and
# variable/var/dot map to ".". Anything else (for example an Integer count
# or ".") is returned as its string form.
#
# @param number [#to_s]
# @return [String]
def normalize_bcf_number(number)
  case number.to_s.downcase
  when "a" then "A"
  when "r" then "R"
  when "g" then "G"
  when "variable", "var", "dot" then "."
  else number.to_s
  end
end
|
|
299
|
+
|
|
300
|
+
# Turns keyword attributes into [key, value] string pairs for a meta line.
#
# Keys are converted from snake_case to lowerCamelCase; Array values are
# joined with commas and every other value is stringified.
#
# @param attributes [Hash]
# @return [Array<Array(String, String)>]
def normalize_meta_attributes(attributes)
  attributes.map do |name, raw|
    head, *tail = name.to_s.split("_")
    camel = [head, *tail.map(&:capitalize)].join
    text = raw.is_a?(Array) ? raw.join(",") : raw.to_s
    [camel, text]
  end
end
|
|
307
|
+
|
|
308
|
+
# Appends a structured meta line of the form ##label=<k1=v1,k2=v2,...>.
#
# @param label [String] meta line name (e.g. "INFO")
# @param fields [Array<Array(String, String)>] ordered key/value pairs
# @return [Header] self
def append_structured_meta(label, fields)
  rendered = fields.map do |name, raw|
    "#{name}=#{format_meta_value(name, raw)}"
  end
  body = rendered.join(",")
  append("###{label}=<#{body}>")
  sync_if_needed!
  self
end
|
|
314
|
+
|
|
315
|
+
# Decides whether a meta field value is written bare or quoted.
#
# Description values are always quoted. Other values are written bare only
# when they consist solely of alphanumerics and the characters _ . : + -
#
# @param key [String]
# @param value [String]
# @return [String]
def format_meta_value(key, value)
  bare = key != "Description" && value.match?(/\A[[:alnum:]_.:+-]+\z/)
  bare ? value : quote_meta_value(value)
end
|
|
321
|
+
|
|
322
|
+
# Wraps a value in double quotes, backslash-escaping embedded backslashes
# and double quotes.
#
# The block form of gsub is used so the replacement is built from the matched
# character itself rather than from a backreference string, whose backslash
# escaping is easy to get wrong.
#
# @param value [String]
# @return [String]
def quote_meta_value(value)
  escaped = value.gsub(/[\\"]/) { |char| "\\#{char}" }
  %("#{escaped}")
end
|
|
325
|
+
|
|
326
|
+
def mark_sync_needed!
|
|
327
|
+
@sync_needed = true
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
def sync_if_needed!
|
|
331
|
+
sync if @sync_needed && @sync_depth.zero?
|
|
332
|
+
end
|
|
333
|
+
|
|
334
|
+
def hrec_exists?(bcf_hl_type, key)
|
|
335
|
+
type = bcf_hl_type_to_int(bcf_hl_type)
|
|
336
|
+
lookup_key, lookup_value, str_class = hrec_lookup_args(type, key)
|
|
337
|
+
hrec = borrowed_hrec(type, lookup_key, lookup_value, str_class)
|
|
338
|
+
!hrec.to_ptr.null?
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
def borrowed_hrec(type, key, value, str_class)
|
|
342
|
+
LibHTS.bcf_hdr_get_hrec(@bcf_hdr, type, key, value, str_class)
|
|
343
|
+
end
|
|
344
|
+
|
|
345
|
+
def owned_hrec(hrec)
|
|
346
|
+
LibHTS.bcf_hrec_dup(hrec).tap do |owned|
|
|
347
|
+
raise "Failed to duplicate BCF header record" if owned.to_ptr.null?
|
|
348
|
+
end
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
# Builds the (key, value, str_class) arguments used to look up a header record.
#
# Generic (BCF_HL_GEN) lines are looked up by their own key; every other
# type is looked up by its ID field.
#
# @param type [Integer] header line type
# @param key [String, nil]
# @return [Array] [lookup_key, lookup_value, str_class]
def hrec_lookup_args(type, key)
  return [key, nil, nil] if type == LibHTS::BCF_HL_GEN

  ["ID", key, nil]
end
|
|
361
|
+
|
|
109
362
|
def bcf_hl_type_to_int(bcf_hl_type)
|
|
110
363
|
return bcf_hl_type if bcf_hl_type.is_a?(Integer)
|
|
111
364
|
|
|
@@ -129,6 +382,60 @@ module HTS
|
|
|
129
382
|
|
|
130
383
|
# Copy constructor used by dup/clone: duplicates the underlying header via
# bcf_hdr_dup, resets the sync bookkeeping and copies the subset state.
#
# @param orig [Header] the header being copied
# @raise [RuntimeError] if bcf_hdr_dup returns NULL
def initialize_copy(orig)
  duplicated = LibHTS.bcf_hdr_dup(orig.struct)
  # Same NULL check the other wrappers in this class apply to returned headers.
  raise "Failed to duplicate BCF header" if duplicated.to_ptr.null?

  @bcf_hdr = duplicated
  @sync_depth = 0
  @sync_needed = false
  set_subset_state(orig.subset_samples, orig.send(:subset_imap))
end
|
|
389
|
+
|
|
390
|
+
protected
|
|
391
|
+
|
|
392
|
+
attr_reader :subset_imap
|
|
393
|
+
|
|
394
|
+
def set_subset_state(samples, imap)
|
|
395
|
+
@subset_samples = samples&.dup
|
|
396
|
+
@subset_imap = imap&.dup
|
|
397
|
+
@subset_imap_pointer = build_subset_imap_pointer(@subset_imap)
|
|
398
|
+
end
|
|
399
|
+
|
|
400
|
+
private
|
|
401
|
+
|
|
402
|
+
# Coerces the subset argument into an Array of sample-name Strings.
#
# @param samples [String, Array, nil, Object] a single name or a collection
# @return [Array<String>]
# @raise [SubsetError] if the argument cannot be converted to an Array
def normalize_subset_samples(samples)
  return [samples] if samples.is_a?(String)

  Array(samples).map(&:to_s)
rescue TypeError
  raise SubsetError, "Sample subset must be a String or an Array of sample names"
end
|
|
412
|
+
|
|
413
|
+
# Checks that the requested subset has no repeated names and only contains
# samples present in this header.
#
# @param subset_samples [Array<String>]
# @return [nil]
# @raise [SubsetError] if a name appears more than once
# @raise [UnknownSampleError] if a name is not among this header's samples
def validate_subset_samples!(subset_samples)
  duplicates = subset_samples.tally.select { |_name, count| count > 1 }.keys
  raise SubsetError, "Duplicate sample names in subset: #{duplicates.join(', ')}" unless duplicates.empty?

  # Read the header's sample list once rather than once per requested name.
  missing = subset_samples - samples
  raise UnknownSampleError, "Unknown sample names: #{missing.join(', ')}" unless missing.empty?
end
|
|
420
|
+
|
|
421
|
+
# Reads `length` ints from the index-map buffer.
#
# @param pointer [#read_array_of_int, nil] buffer; not touched when length is zero
# @param length [Integer]
# @return [Array<Integer>]
def read_subset_imap(pointer, length)
  length.zero? ? [] : pointer.read_array_of_int(length)
end
|
|
426
|
+
|
|
427
|
+
# Translates indices relative to this header into indices relative to this
# header's own base map (the identity map when this header is not a subset).
#
# @param imap [Array<Integer>] indices into this header's sample list
# @return [Array<Integer>]
# @raise [IndexError] if an index is negative or beyond the base map
def compose_subset_imap(imap)
  base_imap = @subset_imap || Array.new(samples.length, &:itself)
  imap.map do |index|
    # Array#fetch accepts negative indices and would silently return an entry
    # counted from the end; a negative index here is never a valid position.
    raise IndexError, "subset sample index #{index} is negative" if index.negative?

    base_imap.fetch(index)
  end
end
|
|
431
|
+
|
|
432
|
+
# Copies the index map into a freshly allocated int buffer.
#
# @param imap [Array<Integer>, nil]
# @return [FFI::MemoryPointer, nil] nil when there is no map or it is empty
def build_subset_imap_pointer(imap)
  return nil if !imap || imap.empty?

  buffer = FFI::MemoryPointer.new(:int, imap.length)
  buffer.write_array_of_int(imap)
  buffer
end
|
|
133
440
|
end
|
|
134
441
|
end
|