htslib 0.0.8 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hts/bam.rb CHANGED
@@ -1,21 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Based on hts-python
4
- # https://github.com/quinlan-lab/hts-python
3
+ require_relative "../htslib"
5
4
 
5
+ require_relative "hts"
6
6
  require_relative "bam/header"
7
7
  require_relative "bam/cigar"
8
8
  require_relative "bam/flag"
9
9
  require_relative "bam/record"
10
10
 
11
11
  module HTS
12
+ # A class for working with SAM, BAM, CRAM files.
12
13
  class Bam
13
14
  include Enumerable
14
15
 
15
- attr_reader :file_path, :mode, :header
16
+ attr_reader :file_name, :index_name, :mode, :header
16
17
 
17
- def self.open(...)
18
- file = new(...)
18
+ def self.open(*args, **kw)
19
+ file = new(*args, **kw) # do not yield
19
20
  return file unless block_given?
20
21
 
21
22
  begin
@@ -26,112 +27,123 @@ module HTS
26
27
  file
27
28
  end
28
29
 
29
- def initialize(filename, mode = "r", fai: nil, threads: nil, index: nil)
30
- raise "HTS::Bam.new() dose not take block; Please use HTS::Bam.open() instead" if block_given?
31
-
32
- @file_path = filename == "-" ? "-" : File.expand_path(filename)
33
-
34
- if mode[0] == "r" && !File.exist?(file_path)
35
- message = "No such SAM/BAM file - #{file_path}"
30
+ def initialize(file_name, mode = "r", index: nil, fai: nil, threads: nil,
31
+ create_index: false)
32
+ if block_given?
33
+ message = "HTS::Bam.new() dose not take block; Please use HTS::Bam.open() instead"
36
34
  raise message
37
35
  end
38
36
 
39
- @mode = mode
40
- @hts_file = LibHTS.hts_open(file_path, mode)
37
+ # NOTE: Do not check for the existence of local files, since file_names may be remote URIs.
38
+
39
+ @file_name = file_name
40
+ @index_name = index
41
+ @mode = mode
42
+ @hts_file = LibHTS.hts_open(@file_name, mode)
43
+
44
+ raise Errno::ENOENT, "Failed to open #{@file_name}" if @hts_file.null?
41
45
 
42
46
  if fai
43
- fai_path = File.expand_path(fai)
44
- r = LibHTS.hts_set_fai_filename(@hts_file, fai_path)
47
+ r = LibHTS.hts_set_fai_filename(@hts_file, fai)
45
48
  raise "Failed to load fasta index: #{fai}" if r < 0
46
49
  end
47
50
 
48
- if threads&.> 0
49
- r = LibHTS.hts_set_threads(@hts_file, threads)
50
- raise "Failed to set number of threads: #{threads}" if r < 0
51
- end
51
+ set_threads(threads) if threads
52
52
 
53
- return if mode[0] == "w"
53
+ return if @mode[0] == "w"
54
54
 
55
55
  @header = Bam::Header.new(@hts_file)
56
+ create_index(index) if create_index
57
+ @idx = load_index(index)
58
+ @start_position = tell
59
+ super # do nothing
60
+ end
56
61
 
57
- create_index if index
62
# Build an index for the file this object was opened on.
#
# @param index_name [String, nil] name of the index file to create. When nil,
#   the index name is left to htslib (sam_index_build is called without one).
# @return the value returned by the underlying LibHTS build call
#   (presumably an Integer status, negative on failure — TODO confirm
#   against the FFI binding).
def create_index(index_name = nil)
  check_closed

  warn "Create index for #{@file_name} to #{index_name}"
  # Branch on the parameter itself. The previous condition referenced `index`,
  # which is not defined in this method and raised NameError.
  if index_name
    LibHTS.sam_index_build2(@file_name, index_name, -1)
  else
    LibHTS.sam_index_build(@file_name, -1)
  end
end
62
72
 
63
- def create_index
64
- warn "Create index for #{file_path}"
65
- LibHTS.sam_index_build(file_path, -1)
66
- idx = LibHTS.sam_index_load(@hts_file, file_path)
67
- raise "Failed to load index: #{file_path}" if idx.null?
68
- end
73
+ def load_index(index_name = nil)
74
+ check_closed
69
75
 
70
- def struct
71
- @hts_file
76
+ if index_name
77
+ LibHTS.sam_index_load2(@hts_file, @file_name, index_name)
78
+ else
79
+ LibHTS.sam_index_load3(@hts_file, @file_name, nil, 2) # should be 3 ? (copy remote file to local?)
80
+ end
72
81
  end
73
82
 
74
- def to_ptr
75
- @hts_file.to_ptr
83
+ def index_loaded?
84
+ check_closed
85
+
86
+ !@idx.null?
76
87
  end
77
88
 
78
89
  # Close the current file.
79
90
  def close
80
- LibHTS.hts_idx_destroy(@idx) if @idx
91
+ LibHTS.hts_idx_destroy(@idx) if @idx&.null?
81
92
  @idx = nil
82
- LibHTS.hts_close(@hts_file)
83
- @hts_file = nil
84
- end
85
-
86
- def closed?
87
- @hts_file.nil?
93
+ super
88
94
  end
89
95
 
90
96
  def write_header(header)
97
+ check_closed
98
+
91
99
  @header = header.dup
92
- LibHTS.hts_set_fai_filename(@hts_file, @file_path)
100
+ LibHTS.hts_set_fai_filename(@hts_file, @file_name)
93
101
  LibHTS.sam_hdr_write(@hts_file, header)
94
102
  end
95
103
 
96
104
  def write(aln)
105
+ check_closed
106
+
97
107
  aln_dup = aln.dup
98
108
  LibHTS.sam_write1(@hts_file, header, aln_dup) > 0 || raise
99
109
  end
100
110
 
101
- # Flush the current file.
102
- def flush
103
- # LibHTS.bgzf_flush(@@hts_file.fp.bgzf)
111
+ def each(copy: false, &block)
112
+ if copy
113
+ each_record_copy(&block)
114
+ else
115
+ each_record_reuse(&block)
116
+ end
104
117
  end
105
118
 
106
- # Iterate over each record.
107
- # Record object is reused.
108
- # Faster than each_copy.
109
- def each
110
- # Each does not always start at the beginning of the file.
111
- # This is the common behavior of IO objects in Ruby.
112
- # This may change in the future.
119
# Iterate over the records, yielding a new Record backed by a freshly
# initialised bam1 for every read, so yielded records can be kept.
#
# @yield [Record] one record per successful read
# @return [self] when a block is given, otherwise an Enumerator
# @raise [RuntimeError] when sam_read1 reports an error (status below -1)
private def each_record_copy
  check_closed
  return to_enum(__method__) unless block_given?

  # sam_read1: >= 0 is a record, -1 is end of stream, < -1 is an error.
  # The previous `!= -1` test treated error codes as successful reads.
  while (status = LibHTS.sam_read1(@hts_file, header, bam1 = LibHTS.bam_init1)) >= 0
    yield Record.new(bam1, header)
  end
  raise "Failed to read record from #{@file_name} (sam_read1 returned #{status})" if status < -1

  self
end
119
129
 
120
- # Iterate over each record.
121
- # Generate a new Record object each time.
122
- # Slower than each.
123
- def each_copy
130
# Iterate over the records, yielding the same Record object every time;
# its backing bam1 is overwritten by each read.
#
# Each does not always start at the beginning of the file.
# This is the common behavior of IO objects in Ruby.
#
# @yield [Record] the single reused record, once per successful read
# @return [self] when a block is given, otherwise an Enumerator
# @raise [RuntimeError] when sam_read1 reports an error (status below -1)
private def each_record_reuse
  check_closed
  return to_enum(__method__) unless block_given?

  bam1 = LibHTS.bam_init1
  record = Record.new(bam1, header)
  # sam_read1: >= 0 is a record, -1 is end of stream, < -1 is an error.
  # The previous `!= -1` test treated error codes as successful reads.
  while (status = LibHTS.sam_read1(@hts_file, header, bam1)) >= 0
    yield record
  end
  raise "Failed to read record from #{@file_name} (sam_read1 returned #{status})" if status < -1

  self
end
131
141
 
132
142
  # query [WIP]
133
143
  def query(region)
134
- # FIXME: when @idx is nil
144
+ check_closed
145
+ raise "Index file is required to call the query method." unless index_loaded?
146
+
135
147
  qiter = LibHTS.sam_itr_querys(@idx, header, region)
136
148
  begin
137
149
  bam1 = LibHTS.bam_init1
@@ -145,5 +157,63 @@ module HTS
145
157
  LibHTS.hts_itr_destroy(qiter)
146
158
  end
147
159
  end
160
+
161
+ # @!macro [attach] define_getter
162
+ # @method $1
163
+ # Get $1 array
164
+ # @return [Array] the $1 array
165
+ define_getter :qname
166
+ define_getter :flag
167
+ define_getter :chrom
168
+ define_getter :pos
169
+ define_getter :mapq
170
+ define_getter :cigar
171
+ define_getter :mate_chrom
172
+ define_getter :mate_pos
173
+ define_getter :insert_size
174
+ define_getter :seq
175
+ define_getter :qual
176
+
177
+ alias isize insert_size
178
+ alias mpos mate_pos
179
+
180
+ def aux(tag)
181
+ warn "experimental"
182
+ check_closed
183
+ position = tell
184
+ ary = map { |r| r.aux(tag) }
185
+ seek(position)
186
+ ary
187
+ end
188
+
189
+ # @!macro [attach] define_iterator
190
+ # @method each_$1
191
+ # Get $1 iterator
192
+ define_iterator :qname
193
+ define_iterator :flag
194
+ define_iterator :chrom
195
+ define_iterator :pos
196
+ define_iterator :mapq
197
+ define_iterator :cigar
198
+ define_iterator :mate_chrom
199
+ define_iterator :mate_pos
200
+ define_iterator :insert_size
201
+ define_iterator :seq
202
+ define_iterator :qual
203
+
204
+ alias each_isize each_insert_size
205
+ alias each_mpos each_mate_pos
206
+
207
+ def each_aux(tag)
208
+ warn "experimental"
209
+ check_closed
210
+ return to_enum(__method__, tag) unless block_given?
211
+
212
+ each do |record|
213
+ yield record.aux(tag)
214
+ end
215
+
216
+ self
217
+ end
148
218
  end
149
219
  end
@@ -1,11 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # https://github.com/brentp/hts-nim/blob/master/src/hts/vcf.nim
4
- # This is a port from Nim.
5
- # TODO: Make it more like Ruby.
6
-
7
3
  module HTS
8
- class Bcf
4
+ class Bcf < Hts
9
5
  class Format
10
6
  def initialize(record)
11
7
  @record = record
@@ -32,20 +28,32 @@ module HTS
32
28
  get(key, :string)
33
29
  end
34
30
 
31
+ def [](key)
32
+ get(key)
33
+ end
34
+
35
35
  def get(key, type = nil)
36
36
  n = FFI::MemoryPointer.new(:int)
37
37
  p1 = @p1
38
38
  h = @record.header.struct
39
39
  r = @record.struct
40
40
 
41
- format_values = proc do |type|
42
- ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, type)
41
+ format_values = proc do |typ|
42
+ ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, typ)
43
43
  return nil if ret < 0 # return from method.
44
44
 
45
45
  p1.read_pointer
46
46
  end
47
47
 
48
- case type.to_sym
48
+ # The GT FORMAT field is special in that it is marked as a string in the header,
49
+ # but it is actually encoded as an integer.
50
+ if key == "GT"
51
+ type = :int
52
+ elsif type.nil?
53
+ type = ht_type_to_sym(get_fmt_type(key))
54
+ end
55
+
56
+ case type&.to_sym
49
57
  when :int, :int32
50
58
  format_values.call(LibHTS::BCF_HT_INT)
51
59
  .read_array_of_int32(n.read_int)
@@ -53,22 +61,85 @@ module HTS
53
61
  format_values.call(LibHTS::BCF_HT_REAL)
54
62
  .read_array_of_float(n.read_int)
55
63
  when :flag
56
- raise NotImplementedError, "Flag type not implemented yet."
64
+ raise NotImplementedError, "Flag type not implemented yet. " \
65
+ "Please file an issue on GitHub."
57
66
  # format_values.call(LibHTS::BCF_HT_FLAG)
58
67
  # .read_int == 1
59
68
  when :string, :str
60
- raise NotImplementedError, "String type not implemented yet."
69
+ raise NotImplementedError, "String type not implemented yet. " \
70
+ "Please file an issue on GitHub."
61
71
  # format_values.call(LibHTS::BCF_HT_STR)
62
72
  # .read_string
63
73
  end
64
74
  end
65
75
 
66
- def set; end
76
+ def fields
77
+ ids.map do |id|
78
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
79
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_FMT, id)
80
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
81
+ {
82
+ name:,
83
+ n: num,
84
+ type: ht_type_to_sym(type),
85
+ id:
86
+ }
87
+ end
88
+ end
67
89
 
68
- # def fields # iterator
69
- # end
90
+ def length
91
+ @record.struct[:n_fmt]
92
+ end
93
+
94
+ def size
95
+ length
96
+ end
97
+
98
+ def to_h
99
+ ret = {}
100
+ ids.each do |id|
101
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
102
+ ret[name] = get(name)
103
+ end
104
+ ret
105
+ end
70
106
 
71
- def genotypes; end
107
+ # def genotypes; end
108
+
109
+ private
110
+
111
+ def fmt_ptr
112
+ @record.struct[:d][:fmt].to_ptr
113
+ end
114
+
115
+ def ids
116
+ fmt_ptr.read_array_of_struct(LibHTS::BcfFmt, length).map do |fmt|
117
+ fmt[:id]
118
+ end
119
+ end
120
+
121
+ def get_fmt_type(qname)
122
+ @record.struct[:n_fmt].times do |i|
123
+ fmt = LibHTS::BcfFmt.new(@record.struct[:d][:fmt] + i * LibHTS::BcfFmt.size)
124
+ id = fmt[:id]
125
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
126
+ if name == qname
127
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
128
+ return type
129
+ end
130
+ end
131
+ nil
132
+ end
133
+
134
+ def ht_type_to_sym(t)
135
+ case t
136
+ when LibHTS::BCF_HT_FLAG then :flag
137
+ when LibHTS::BCF_HT_INT then :int
138
+ when LibHTS::BCF_HT_REAL then :float
139
+ when LibHTS::BCF_HT_STR then :string
140
+ when LibHTS::BCF_HT_LONG then :float
141
+ end
142
+ end
72
143
  end
73
144
  end
74
145
  end
@@ -1,7 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HTS
4
- class Bcf
4
+ class Bcf < Hts
5
+ # A class for working with VCF records.
5
6
  class Header
6
7
  def initialize(hts_file)
7
8
  @bcf_hdr = LibHTS.bcf_hdr_read(hts_file)
@@ -19,14 +20,14 @@ module HTS
19
20
  LibHTS.bcf_hdr_get_version(@bcf_hdr)
20
21
  end
21
22
 
22
- def sample_count
23
+ def nsamples
23
24
  LibHTS.bcf_hdr_nsamples(@bcf_hdr)
24
25
  end
25
26
 
26
- def sample_names
27
+ def samples
27
28
  # bcf_hdr_id2name is macro function
28
29
  @bcf_hdr[:samples]
29
- .read_array_of_pointer(sample_count)
30
+ .read_array_of_pointer(nsamples)
30
31
  .map(&:read_string)
31
32
  end
32
33
 
data/lib/hts/bcf/info.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HTS
4
- class Bcf
4
+ class Bcf < Hts
5
+ # Info field
5
6
  class Info
6
7
  def initialize(record)
7
8
  @record = record
@@ -28,6 +29,10 @@ module HTS
28
29
  get(key, :flag)
29
30
  end
30
31
 
32
+ def [](key)
33
+ get(key)
34
+ end
35
+
31
36
  # @note Specify the type. If you don't specify a type, it will still work, but it will be slower.
32
37
  def get(key, type = nil)
33
38
  n = FFI::MemoryPointer.new(:int)
@@ -35,14 +40,14 @@ module HTS
35
40
  h = @record.header.struct
36
41
  r = @record.struct
37
42
 
38
- info_values = proc do |type|
39
- ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, type)
43
+ info_values = proc do |typ|
44
+ ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, typ)
40
45
  return nil if ret < 0 # return from method.
41
46
 
42
47
  p1.read_pointer
43
48
  end
44
49
 
45
- type ||= info_type_to_string(get_info_type(key))
50
+ type ||= ht_type_to_sym(get_info_type(key))
46
51
 
47
52
  case type&.to_sym
48
53
  when :int, :int32
@@ -67,47 +72,68 @@ module HTS
67
72
 
68
73
  # FIXME: naming? room for improvement.
69
74
  def fields
70
- n_info = @record.struct[:n_info]
71
- Array.new(n_info) do |i|
72
- fld = LibHTS::BcfInfo.new(
73
- @record.struct[:d][:info] +
74
- i * LibHTS::BcfInfo.size
75
- )
75
+ keys.map do |key|
76
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
77
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_INFO, key)
78
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, key)
76
79
  {
77
- name: LibHTS.bcf_hdr_int2id(
78
- @record.header.struct, LibHTS::BCF_DT_ID, fld[:key]
79
- ),
80
- n: LibHTS.bcf_hdr_id2number(
81
- @record.header.struct, LibHTS::BCF_HL_INFO, fld[:key]
82
- ),
83
- vtype: fld[:type], i: fld[:key]
80
+ name:,
81
+ n: num,
82
+ type: ht_type_to_sym(type),
83
+ key:
84
84
  }
85
85
  end
86
86
  end
87
87
 
88
+ def length
89
+ @record.struct[:n_info]
90
+ end
91
+
92
+ def size
93
+ length
94
+ end
95
+
96
+ def to_h
97
+ ret = {}
98
+ keys.each do |key|
99
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
100
+ ret[name] = get(name)
101
+ end
102
+ ret
103
+ end
104
+
88
105
  private
89
106
 
107
+ def info_ptr
108
+ @record.struct[:d][:info].to_ptr
109
+ end
110
+
111
+ def keys
112
+ info_ptr.read_array_of_struct(LibHTS::BcfInfo, length).map do |info|
113
+ info[:key]
114
+ end
115
+ end
116
+
90
117
  def get_info_type(key)
91
118
  @record.struct[:n_info].times do |i|
92
- fld = LibHTS::BcfInfo.new(
93
- @record.struct[:d][:info] +
94
- i * LibHTS::BcfInfo.size
95
- )
96
- id = LibHTS.bcf_hdr_int2id(
97
- @record.header.struct, LibHTS::BCF_DT_ID, fld[:key]
98
- )
99
- return fld[:type] if id == key
119
+ info = LibHTS::BcfInfo.new(@record.struct[:d][:info] + i * LibHTS::BcfInfo.size)
120
+ k = info[:key]
121
+ id = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, k)
122
+ if id == key
123
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, k)
124
+ return type
125
+ end
100
126
  end
127
+ nil
101
128
  end
102
129
 
103
- def info_type_to_string(t)
130
+ def ht_type_to_sym(t)
104
131
  case t
105
- when 0 then :flag
106
- when 1, 2, 3, 4 then :int
107
- when 5 then :float
108
- when 7 then :string
109
- else
110
- raise "Unknown info type: #{t}"
132
+ when LibHTS::BCF_HT_FLAG then :flag
133
+ when LibHTS::BCF_HT_INT then :int
134
+ when LibHTS::BCF_HT_REAL then :float
135
+ when LibHTS::BCF_HT_STR then :string
136
+ when LibHTS::BCF_HT_LONG then :float
111
137
  end
112
138
  end
113
139
  end