htslib 0.0.10 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hts/bam.rb CHANGED
@@ -1,8 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Based on hts-python
4
- # https://github.com/quinlan-lab/hts-python
5
-
6
3
  require_relative "../htslib"
7
4
 
8
5
  require_relative "hts"
@@ -12,10 +9,11 @@ require_relative "bam/flag"
12
9
  require_relative "bam/record"
13
10
 
14
11
  module HTS
12
+ # A class for working with SAM, BAM, CRAM files.
15
13
  class Bam
16
14
  include Enumerable
17
15
 
18
- attr_reader :file_name, :index_path, :mode, :header
16
+ attr_reader :file_name, :index_name, :mode, :header
19
17
 
20
18
  def self.open(*args, **kw)
21
19
  file = new(*args, **kw) # do not yield
@@ -38,9 +36,10 @@ module HTS
38
36
 
39
37
  # NOTE: Do not check for the existence of local files, since file_names may be remote URIs.
40
38
 
41
- @file_name = file_name
42
- @mode = mode
43
- @hts_file = LibHTS.hts_open(@file_name, mode)
39
+ @file_name = file_name
40
+ @index_name = index
41
+ @mode = mode
42
+ @hts_file = LibHTS.hts_open(@file_name, mode)
44
43
 
45
44
  raise Errno::ENOENT, "Failed to open #{@file_name}" if @hts_file.null?
46
45
 
@@ -49,33 +48,31 @@ module HTS
49
48
  raise "Failed to load fasta index: #{fai}" if r < 0
50
49
  end
51
50
 
52
- if threads&.> 0
53
- r = LibHTS.hts_set_threads(@hts_file, threads)
54
- raise "Failed to set number of threads: #{threads}" if r < 0
55
- end
51
+ set_threads(threads) if threads
56
52
 
57
53
  return if @mode[0] == "w"
58
54
 
59
55
  @header = Bam::Header.new(@hts_file)
60
-
61
56
  create_index(index) if create_index
62
-
63
57
  @idx = load_index(index)
64
-
65
58
  @start_position = tell
59
+ super # do nothing
66
60
  end
67
61
 
68
62
  def create_index(index_name = nil)
63
+ check_closed
64
+
65
+ warn "Create index for #{@file_name} to #{index_name}"
69
66
  if index
70
- warn "Create index for #{@file_name} to #{index_name}"
71
67
  LibHTS.sam_index_build2(@file_name, index_name, -1)
72
68
  else
73
- warn "Create index for #{@file_name} to #{index_name}"
74
69
  LibHTS.sam_index_build(@file_name, -1)
75
70
  end
76
71
  end
77
72
 
78
73
  def load_index(index_name = nil)
74
+ check_closed
75
+
79
76
  if index_name
80
77
  LibHTS.sam_index_load2(@hts_file, @file_name, index_name)
81
78
  else
@@ -84,6 +81,8 @@ module HTS
84
81
  end
85
82
 
86
83
  def index_loaded?
84
+ check_closed
85
+
87
86
  !@idx.null?
88
87
  end
89
88
 
@@ -91,29 +90,34 @@ module HTS
91
90
  def close
92
91
  LibHTS.hts_idx_destroy(@idx) if @idx&.null?
93
92
  @idx = nil
94
- LibHTS.hts_close(@hts_file)
95
- @hts_file = nil
96
- end
97
-
98
- def closed?
99
- @hts_file.nil? || @hts_file.null?
93
+ super
100
94
  end
101
95
 
102
96
  def write_header(header)
97
+ check_closed
98
+
103
99
  @header = header.dup
104
100
  LibHTS.hts_set_fai_filename(@hts_file, @file_name)
105
101
  LibHTS.sam_hdr_write(@hts_file, header)
106
102
  end
107
103
 
108
104
  def write(aln)
105
+ check_closed
106
+
109
107
  aln_dup = aln.dup
110
108
  LibHTS.sam_write1(@hts_file, header, aln_dup) > 0 || raise
111
109
  end
112
110
 
113
- # Iterate over each record.
114
- # Generate a new Record object each time.
115
- # Slower than each.
116
- def each_copy
111
+ def each(copy: false, &block)
112
+ if copy
113
+ each_record_copy(&block)
114
+ else
115
+ each_record_reuse(&block)
116
+ end
117
+ end
118
+
119
+ private def each_record_copy
120
+ check_closed
117
121
  return to_enum(__method__) unless block_given?
118
122
 
119
123
  while LibHTS.sam_read1(@hts_file, header, bam1 = LibHTS.bam_init1) != -1
@@ -123,13 +127,10 @@ module HTS
123
127
  self
124
128
  end
125
129
 
126
- # Iterate over each record.
127
- # Record object is reused.
128
- # Faster than each_copy.
129
- def each
130
+ private def each_record_reuse
131
+ check_closed
130
132
  # Each does not always start at the beginning of the file.
131
133
  # This is the common behavior of IO objects in Ruby.
132
- # This may change in the future.
133
134
  return to_enum(__method__) unless block_given?
134
135
 
135
136
  bam1 = LibHTS.bam_init1
@@ -138,22 +139,109 @@ module HTS
138
139
  self
139
140
  end
140
141
 
141
- # query [WIP]
142
- def query(region)
142
+ def query(region, copy: false, &block)
143
+ if copy
144
+ query_copy(region, &block)
145
+ else
146
+ query_reuse(region, &block)
147
+ end
148
+ end
149
+
150
+ private def query_copy(region)
151
+ check_closed
143
152
  raise "Index file is required to call the query method." unless index_loaded?
153
+ return to_enum(__method__, region) unless block_given?
144
154
 
145
155
  qiter = LibHTS.sam_itr_querys(@idx, header, region)
156
+
146
157
  begin
147
- bam1 = LibHTS.bam_init1
148
- slen = LibHTS.sam_itr_next(@hts_file, qiter, bam1)
149
- while slen > 0
150
- yield Record.new(bam1, header)
158
+ loop do
151
159
  bam1 = LibHTS.bam_init1
152
160
  slen = LibHTS.sam_itr_next(@hts_file, qiter, bam1)
161
+ break if slen == -1
162
+ raise if slen < -1
163
+
164
+ yield Record.new(bam1, header)
153
165
  end
154
166
  ensure
155
167
  LibHTS.hts_itr_destroy(qiter)
156
168
  end
169
+ self
170
+ end
171
+
172
+ private def query_reuse(region)
173
+ check_closed
174
+ raise "Index file is required to call the query method." unless index_loaded?
175
+ return to_enum(__method__, region) unless block_given?
176
+
177
+ qiter = LibHTS.sam_itr_querys(@idx, header, region)
178
+
179
+ bam1 = LibHTS.bam_init1
180
+ record = Record.new(bam1, header)
181
+ begin
182
+ yield record while LibHTS.sam_itr_next(@hts_file, qiter, bam1) > 0
183
+ ensure
184
+ LibHTS.hts_itr_destroy(qiter)
185
+ end
186
+ self
187
+ end
188
+
189
+ # @!macro [attach] define_getter
190
+ # @method $1
191
+ # Get $1 array
192
+ # @return [Array] the $1 array
193
+ define_getter :qname
194
+ define_getter :flag
195
+ define_getter :chrom
196
+ define_getter :pos
197
+ define_getter :mapq
198
+ define_getter :cigar
199
+ define_getter :mate_chrom
200
+ define_getter :mate_pos
201
+ define_getter :insert_size
202
+ define_getter :seq
203
+ define_getter :qual
204
+
205
+ alias isize insert_size
206
+ alias mpos mate_pos
207
+
208
+ def aux(tag)
209
+ warn "experimental"
210
+ check_closed
211
+ position = tell
212
+ ary = map { |r| r.aux(tag) }
213
+ seek(position)
214
+ ary
215
+ end
216
+
217
+ # @!macro [attach] define_iterator
218
+ # @method each_$1
219
+ # Get $1 iterator
220
+ define_iterator :qname
221
+ define_iterator :flag
222
+ define_iterator :chrom
223
+ define_iterator :pos
224
+ define_iterator :mapq
225
+ define_iterator :cigar
226
+ define_iterator :mate_chrom
227
+ define_iterator :mate_pos
228
+ define_iterator :insert_size
229
+ define_iterator :seq
230
+ define_iterator :qual
231
+
232
+ alias each_isize each_insert_size
233
+ alias each_mpos each_mate_pos
234
+
235
+ def each_aux(tag)
236
+ warn "experimental"
237
+ check_closed
238
+ return to_enum(__method__, tag) unless block_given?
239
+
240
+ each do |record|
241
+ yield record.aux(tag)
242
+ end
243
+
244
+ self
157
245
  end
158
246
  end
159
247
  end
@@ -1,9 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # https://github.com/brentp/hts-nim/blob/master/src/hts/vcf.nim
4
- # This is a port from Nim.
5
- # TODO: Make it more like Ruby.
6
-
7
3
  module HTS
8
4
  class Bcf < Hts
9
5
  class Format
@@ -32,20 +28,32 @@ module HTS
32
28
  get(key, :string)
33
29
  end
34
30
 
31
+ def [](key)
32
+ get(key)
33
+ end
34
+
35
35
  def get(key, type = nil)
36
36
  n = FFI::MemoryPointer.new(:int)
37
37
  p1 = @p1
38
38
  h = @record.header.struct
39
39
  r = @record.struct
40
40
 
41
- format_values = proc do |type|
42
- ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, type)
41
+ format_values = proc do |typ|
42
+ ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, typ)
43
43
  return nil if ret < 0 # return from method.
44
44
 
45
45
  p1.read_pointer
46
46
  end
47
47
 
48
- case type.to_sym
48
+ # The GT FORMAT field is special in that it is marked as a string in the header,
49
+ # but it is actually encoded as an integer.
50
+ if key == "GT"
51
+ type = :int
52
+ elsif type.nil?
53
+ type = ht_type_to_sym(get_fmt_type(key))
54
+ end
55
+
56
+ case type&.to_sym
49
57
  when :int, :int32
50
58
  format_values.call(LibHTS::BCF_HT_INT)
51
59
  .read_array_of_int32(n.read_int)
@@ -53,22 +61,85 @@ module HTS
53
61
  format_values.call(LibHTS::BCF_HT_REAL)
54
62
  .read_array_of_float(n.read_int)
55
63
  when :flag
56
- raise NotImplementedError, "Flag type not implemented yet."
64
+ raise NotImplementedError, "Flag type not implemented yet. " \
65
+ "Please file an issue on GitHub."
57
66
  # format_values.call(LibHTS::BCF_HT_FLAG)
58
67
  # .read_int == 1
59
68
  when :string, :str
60
- raise NotImplementedError, "String type not implemented yet."
69
+ raise NotImplementedError, "String type not implemented yet. " \
70
+ "Please file an issue on GitHub."
61
71
  # format_values.call(LibHTS::BCF_HT_STR)
62
72
  # .read_string
63
73
  end
64
74
  end
65
75
 
66
- def set; end
76
+ def fields
77
+ ids.map do |id|
78
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
79
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_FMT, id)
80
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
81
+ {
82
+ name:,
83
+ n: num,
84
+ type: ht_type_to_sym(type),
85
+ id:
86
+ }
87
+ end
88
+ end
67
89
 
68
- # def fields # iterator
69
- # end
90
+ def length
91
+ @record.struct[:n_fmt]
92
+ end
93
+
94
+ def size
95
+ length
96
+ end
97
+
98
+ def to_h
99
+ ret = {}
100
+ ids.each do |id|
101
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
102
+ ret[name] = get(name)
103
+ end
104
+ ret
105
+ end
70
106
 
71
- def genotypes; end
107
+ # def genotypes; end
108
+
109
+ private
110
+
111
+ def fmt_ptr
112
+ @record.struct[:d][:fmt].to_ptr
113
+ end
114
+
115
+ def ids
116
+ fmt_ptr.read_array_of_struct(LibHTS::BcfFmt, length).map do |fmt|
117
+ fmt[:id]
118
+ end
119
+ end
120
+
121
+ def get_fmt_type(qname)
122
+ @record.struct[:n_fmt].times do |i|
123
+ fmt = LibHTS::BcfFmt.new(@record.struct[:d][:fmt] + i * LibHTS::BcfFmt.size)
124
+ id = fmt[:id]
125
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
126
+ if name == qname
127
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
128
+ return type
129
+ end
130
+ end
131
+ nil
132
+ end
133
+
134
+ def ht_type_to_sym(t)
135
+ case t
136
+ when LibHTS::BCF_HT_FLAG then :flag
137
+ when LibHTS::BCF_HT_INT then :int
138
+ when LibHTS::BCF_HT_REAL then :float
139
+ when LibHTS::BCF_HT_STR then :string
140
+ when LibHTS::BCF_HT_LONG then :float
141
+ end
142
+ end
72
143
  end
73
144
  end
74
145
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  module HTS
4
4
  class Bcf < Hts
5
+ # A class for working with VCF records.
5
6
  class Header
6
7
  def initialize(hts_file)
7
8
  @bcf_hdr = LibHTS.bcf_hdr_read(hts_file)
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module HTS
4
+ class Bcf < Hts
5
+ class HeaderRecord
6
+ def initialize
7
+ @bcf_hrec
8
+ end
9
+ end
10
+ end
11
+ end
data/lib/hts/bcf/info.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  module HTS
4
4
  class Bcf < Hts
5
+ # Info field
5
6
  class Info
6
7
  def initialize(record)
7
8
  @record = record
@@ -28,6 +29,10 @@ module HTS
28
29
  get(key, :flag)
29
30
  end
30
31
 
32
+ def [](key)
33
+ get(key)
34
+ end
35
+
31
36
  # @note Specify the type. If you don't specify a type, it will still work, but it will be slower.
32
37
  def get(key, type = nil)
33
38
  n = FFI::MemoryPointer.new(:int)
@@ -35,14 +40,14 @@ module HTS
35
40
  h = @record.header.struct
36
41
  r = @record.struct
37
42
 
38
- info_values = proc do |type|
39
- ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, type)
43
+ info_values = proc do |typ|
44
+ ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, typ)
40
45
  return nil if ret < 0 # return from method.
41
46
 
42
47
  p1.read_pointer
43
48
  end
44
49
 
45
- type ||= info_type_to_string(get_info_type(key))
50
+ type ||= ht_type_to_sym(get_info_type(key))
46
51
 
47
52
  case type&.to_sym
48
53
  when :int, :int32
@@ -67,47 +72,68 @@ module HTS
67
72
 
68
73
  # FIXME: naming? room for improvement.
69
74
  def fields
70
- n_info = @record.struct[:n_info]
71
- Array.new(n_info) do |i|
72
- fld = LibHTS::BcfInfo.new(
73
- @record.struct[:d][:info] +
74
- i * LibHTS::BcfInfo.size
75
- )
75
+ keys.map do |key|
76
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
77
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_INFO, key)
78
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, key)
76
79
  {
77
- name: LibHTS.bcf_hdr_int2id(
78
- @record.header.struct, LibHTS::BCF_DT_ID, fld[:key]
79
- ),
80
- n: LibHTS.bcf_hdr_id2number(
81
- @record.header.struct, LibHTS::BCF_HL_INFO, fld[:key]
82
- ),
83
- vtype: fld[:type], i: fld[:key]
80
+ name:,
81
+ n: num,
82
+ type: ht_type_to_sym(type),
83
+ key:
84
84
  }
85
85
  end
86
86
  end
87
87
 
88
+ def length
89
+ @record.struct[:n_info]
90
+ end
91
+
92
+ def size
93
+ length
94
+ end
95
+
96
+ def to_h
97
+ ret = {}
98
+ keys.each do |key|
99
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
100
+ ret[name] = get(name)
101
+ end
102
+ ret
103
+ end
104
+
88
105
  private
89
106
 
107
+ def info_ptr
108
+ @record.struct[:d][:info].to_ptr
109
+ end
110
+
111
+ def keys
112
+ info_ptr.read_array_of_struct(LibHTS::BcfInfo, length).map do |info|
113
+ info[:key]
114
+ end
115
+ end
116
+
90
117
  def get_info_type(key)
91
118
  @record.struct[:n_info].times do |i|
92
- fld = LibHTS::BcfInfo.new(
93
- @record.struct[:d][:info] +
94
- i * LibHTS::BcfInfo.size
95
- )
96
- id = LibHTS.bcf_hdr_int2id(
97
- @record.header.struct, LibHTS::BCF_DT_ID, fld[:key]
98
- )
99
- return fld[:type] if id == key
119
+ info = LibHTS::BcfInfo.new(@record.struct[:d][:info] + i * LibHTS::BcfInfo.size)
120
+ k = info[:key]
121
+ id = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, k)
122
+ if id == key
123
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, k)
124
+ return type
125
+ end
100
126
  end
127
+ nil
101
128
  end
102
129
 
103
- def info_type_to_string(t)
130
+ def ht_type_to_sym(t)
104
131
  case t
105
- when 0 then :flag
106
- when 1, 2, 3, 4 then :int
107
- when 5 then :float
108
- when 7 then :string
109
- else
110
- raise "Unknown info type: #{t}"
132
+ when LibHTS::BCF_HT_FLAG then :flag
133
+ when LibHTS::BCF_HT_INT then :int
134
+ when LibHTS::BCF_HT_REAL then :float
135
+ when LibHTS::BCF_HT_STR then :string
136
+ when LibHTS::BCF_HT_LONG then :float
111
137
  end
112
138
  end
113
139
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  module HTS
4
4
  class Bcf < Hts
5
+ # A class for working with VCF records.
5
6
  class Record
6
7
  def initialize(bcf_t, header)
7
8
  @bcf1 = bcf_t
@@ -18,57 +19,46 @@ module HTS
18
19
  @bcf1.to_ptr
19
20
  end
20
21
 
21
- # def inspect; end
22
-
23
- def formats; end
22
+ # Get the reference id of the record.
23
+ def rid
24
+ @bcf1[:rid]
25
+ end
24
26
 
25
- def genotypes; end
27
+ def rid=(rid)
28
+ @bcf1[:rid] = rid
29
+ end
26
30
 
31
+ # Get the chromosome of variant.
27
32
  def chrom
28
- rid = @bcf1[:rid]
29
-
30
33
  LibHTS.bcf_hdr_id2name(@header.struct, rid)
31
34
  end
32
35
 
36
+ # Return 0-based position.
33
37
  def pos
34
- @bcf1[:pos] + 1 # FIXME
38
+ @bcf1[:pos]
35
39
  end
36
40
 
37
- def start
38
- @bcf1[:pos]
41
+ def pos=(pos)
42
+ @bcf1[:pos] = pos
39
43
  end
40
44
 
41
- def stop
42
- @bcf1[:pos] + @bcf1[:rlen]
45
+ # Return the 0-based, exclusive end position
46
+ def endpos
47
+ pos + @bcf1[:rlen]
43
48
  end
44
49
 
50
+ # Return the value of the ID column.
45
51
  def id
46
52
  LibHTS.bcf_unpack(@bcf1, LibHTS::BCF_UN_INFO)
47
53
  @bcf1[:d][:id]
48
54
  end
49
55
 
50
- def filter
51
- LibHTS.bcf_unpack(@bcf1, LibHTS::BCF_UN_FLT)
52
- d = @bcf1[:d]
53
- n_flt = d[:n_flt]
54
-
55
- case n_flt
56
- when 0
57
- "PASS"
58
- when 1
59
- i = d[:flt].read_int
60
- LibHTS.bcf_hdr_int2id(@header.struct, LibHTS::BCF_DT_ID, i)
61
- when 2
62
- d[:flt].get_array_of_int(0, n_flt).map do |i|
63
- LibHTS.bcf_hdr_int2id(@header.struct, LibHTS::BCF_DT_ID, i)
64
- end
65
- else
66
- raise "Unexpected number of filters. n_flt: #{n_flt}"
67
- end
56
+ def id=(id)
57
+ LibHTS.bcf_update_id(@header, @bcf1, id)
68
58
  end
69
59
 
70
- def qual
71
- @bcf1[:qual]
60
+ def clear_id
61
+ LibHTS.bcf_update_id(@header, @bcf1, ".")
72
62
  end
73
63
 
74
64
  def ref
@@ -90,14 +80,52 @@ module HTS
90
80
  ).map(&:read_string)
91
81
  end
92
82
 
93
- def info
83
+ # Get variant quality.
84
+ def qual
85
+ @bcf1[:qual]
86
+ end
87
+
88
+ def qual=(qual)
89
+ @bcf1[:qual] = qual
90
+ end
91
+
92
+ def filter
93
+ LibHTS.bcf_unpack(@bcf1, LibHTS::BCF_UN_FLT)
94
+ d = @bcf1[:d]
95
+ n_flt = d[:n_flt]
96
+
97
+ case n_flt
98
+ when 0
99
+ "PASS"
100
+ when 1
101
+ id = d[:flt].read_int
102
+ LibHTS.bcf_hdr_int2id(@header.struct, LibHTS::BCF_DT_ID, id)
103
+ when 2..nil
104
+ d[:flt].get_array_of_int(0, n_flt).map do |i|
105
+ LibHTS.bcf_hdr_int2id(@header.struct, LibHTS::BCF_DT_ID, i)
106
+ end
107
+ else
108
+ raise "Unexpected number of filters. n_flt: #{n_flt}"
109
+ end
110
+ end
111
+
112
+ def info(key = nil)
94
113
  LibHTS.bcf_unpack(@bcf1, LibHTS::BCF_UN_SHR)
95
- Info.new(self)
114
+ info = Info.new(self)
115
+ if key
116
+ info.get(key)
117
+ else
118
+ info
119
+ end
96
120
  end
97
121
 
98
- def format
122
+ def format(key = nil)
99
123
  LibHTS.bcf_unpack(@bcf1, LibHTS::BCF_UN_FMT)
100
- Format.new(self)
124
+ if key
125
+ Format.new(self).get(key)
126
+ else
127
+ Format.new(self)
128
+ end
101
129
  end
102
130
 
103
131
  def to_s