htslib 0.0.8 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/hts/bam.rb CHANGED
@@ -1,21 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Based on hts-python
4
- # https://github.com/quinlan-lab/hts-python
3
+ require_relative "../htslib"
5
4
 
5
+ require_relative "hts"
6
6
  require_relative "bam/header"
7
7
  require_relative "bam/cigar"
8
8
  require_relative "bam/flag"
9
9
  require_relative "bam/record"
10
10
 
11
11
  module HTS
12
+ # A class for working with SAM, BAM, CRAM files.
12
13
  class Bam
13
14
  include Enumerable
14
15
 
15
- attr_reader :file_path, :mode, :header
16
+ attr_reader :file_name, :index_name, :mode, :header
16
17
 
17
- def self.open(...)
18
- file = new(...)
18
+ def self.open(*args, **kw)
19
+ file = new(*args, **kw) # do not yield
19
20
  return file unless block_given?
20
21
 
21
22
  begin
@@ -26,112 +27,123 @@ module HTS
26
27
  file
27
28
  end
28
29
 
29
- def initialize(filename, mode = "r", fai: nil, threads: nil, index: nil)
30
- raise "HTS::Bam.new() dose not take block; Please use HTS::Bam.open() instead" if block_given?
31
-
32
- @file_path = filename == "-" ? "-" : File.expand_path(filename)
33
-
34
- if mode[0] == "r" && !File.exist?(file_path)
35
- message = "No such SAM/BAM file - #{file_path}"
30
+ def initialize(file_name, mode = "r", index: nil, fai: nil, threads: nil,
31
+ create_index: false)
32
+ if block_given?
33
+ message = "HTS::Bam.new() dose not take block; Please use HTS::Bam.open() instead"
36
34
  raise message
37
35
  end
38
36
 
39
- @mode = mode
40
- @hts_file = LibHTS.hts_open(file_path, mode)
37
+ # NOTE: Do not check for the existence of local files, since file_names may be remote URIs.
38
+
39
+ @file_name = file_name
40
+ @index_name = index
41
+ @mode = mode
42
+ @hts_file = LibHTS.hts_open(@file_name, mode)
43
+
44
+ raise Errno::ENOENT, "Failed to open #{@file_name}" if @hts_file.null?
41
45
 
42
46
  if fai
43
- fai_path = File.expand_path(fai)
44
- r = LibHTS.hts_set_fai_filename(@hts_file, fai_path)
47
+ r = LibHTS.hts_set_fai_filename(@hts_file, fai)
45
48
  raise "Failed to load fasta index: #{fai}" if r < 0
46
49
  end
47
50
 
48
- if threads&.> 0
49
- r = LibHTS.hts_set_threads(@hts_file, threads)
50
- raise "Failed to set number of threads: #{threads}" if r < 0
51
- end
51
+ set_threads(threads) if threads
52
52
 
53
- return if mode[0] == "w"
53
+ return if @mode[0] == "w"
54
54
 
55
55
  @header = Bam::Header.new(@hts_file)
56
+ create_index(index) if create_index
57
+ @idx = load_index(index)
58
+ @start_position = tell
59
+ super # do nothing
60
+ end
56
61
 
57
- create_index if index
62
+ def create_index(index_name = nil)
63
+ check_closed
58
64
 
59
- # load index
60
- @idx = LibHTS.sam_index_load(@hts_file, file_path)
65
+ warn "Create index for #{@file_name} to #{index_name}"
66
+ if index
67
+ LibHTS.sam_index_build2(@file_name, index_name, -1)
68
+ else
69
+ LibHTS.sam_index_build(@file_name, -1)
70
+ end
61
71
  end
62
72
 
63
- def create_index
64
- warn "Create index for #{file_path}"
65
- LibHTS.sam_index_build(file_path, -1)
66
- idx = LibHTS.sam_index_load(@hts_file, file_path)
67
- raise "Failed to load index: #{file_path}" if idx.null?
68
- end
73
+ def load_index(index_name = nil)
74
+ check_closed
69
75
 
70
- def struct
71
- @hts_file
76
+ if index_name
77
+ LibHTS.sam_index_load2(@hts_file, @file_name, index_name)
78
+ else
79
+ LibHTS.sam_index_load3(@hts_file, @file_name, nil, 2) # TODO: should flags be 3? (2 = HTS_IDX_SILENT_FAIL; 3 would also set HTS_IDX_SAVE_REMOTE, i.e. save a remote index to a local file)
80
+ end
72
81
  end
73
82
 
74
- def to_ptr
75
- @hts_file.to_ptr
83
+ def index_loaded?
84
+ check_closed
85
+
86
+ !@idx.null?
76
87
  end
77
88
 
78
89
  # Close the current file.
79
90
  def close
80
- LibHTS.hts_idx_destroy(@idx) if @idx
91
+ LibHTS.hts_idx_destroy(@idx) if @idx&.null?
81
92
  @idx = nil
82
- LibHTS.hts_close(@hts_file)
83
- @hts_file = nil
84
- end
85
-
86
- def closed?
87
- @hts_file.nil?
93
+ super
88
94
  end
89
95
 
90
96
  def write_header(header)
97
+ check_closed
98
+
91
99
  @header = header.dup
92
- LibHTS.hts_set_fai_filename(@hts_file, @file_path)
100
+ LibHTS.hts_set_fai_filename(@hts_file, @file_name)
93
101
  LibHTS.sam_hdr_write(@hts_file, header)
94
102
  end
95
103
 
96
104
  def write(aln)
105
+ check_closed
106
+
97
107
  aln_dup = aln.dup
98
108
  LibHTS.sam_write1(@hts_file, header, aln_dup) > 0 || raise
99
109
  end
100
110
 
101
- # Flush the current file.
102
- def flush
103
- # LibHTS.bgzf_flush(@@hts_file.fp.bgzf)
111
+ def each(copy: false, &block)
112
+ if copy
113
+ each_record_copy(&block)
114
+ else
115
+ each_record_reuse(&block)
116
+ end
104
117
  end
105
118
 
106
- # Iterate over each record.
107
- # Record object is reused.
108
- # Faster than each_copy.
109
- def each
110
- # Each does not always start at the beginning of the file.
111
- # This is the common behavior of IO objects in Ruby.
112
- # This may change in the future.
119
+ private def each_record_copy
120
+ check_closed
113
121
  return to_enum(__method__) unless block_given?
114
122
 
115
- bam1 = LibHTS.bam_init1
116
- record = Record.new(bam1, header)
117
- yield record while LibHTS.sam_read1(@hts_file, header, bam1) > 0
123
+ while LibHTS.sam_read1(@hts_file, header, bam1 = LibHTS.bam_init1) != -1
124
+ record = Record.new(bam1, header)
125
+ yield record
126
+ end
127
+ self
118
128
  end
119
129
 
120
- # Iterate over each record.
121
- # Generate a new Record object each time.
122
- # Slower than each.
123
- def each_copy
130
+ private def each_record_reuse
131
+ check_closed
132
+ # Each does not always start at the beginning of the file.
133
+ # This is the common behavior of IO objects in Ruby.
124
134
  return to_enum(__method__) unless block_given?
125
135
 
126
- while LibHTS.sam_read1(@hts_file, header, bam1 = LibHTS.bam_init1) > 0
127
- record = Record.new(bam1, header)
128
- yield record
129
- end
136
+ bam1 = LibHTS.bam_init1
137
+ record = Record.new(bam1, header)
138
+ yield record while LibHTS.sam_read1(@hts_file, header, bam1) != -1
139
+ self
130
140
  end
131
141
 
132
142
  # query [WIP]
133
143
  def query(region)
134
- # FIXME: when @idx is nil
144
+ check_closed
145
+ raise "Index file is required to call the query method." unless index_loaded?
146
+
135
147
  qiter = LibHTS.sam_itr_querys(@idx, header, region)
136
148
  begin
137
149
  bam1 = LibHTS.bam_init1
@@ -145,5 +157,63 @@ module HTS
145
157
  LibHTS.hts_itr_destroy(qiter)
146
158
  end
147
159
  end
160
+
161
+ # @!macro [attach] define_getter
162
+ # @method $1
163
+ # Get $1 array
164
+ # @return [Array] the $1 array
165
+ define_getter :qname
166
+ define_getter :flag
167
+ define_getter :chrom
168
+ define_getter :pos
169
+ define_getter :mapq
170
+ define_getter :cigar
171
+ define_getter :mate_chrom
172
+ define_getter :mate_pos
173
+ define_getter :insert_size
174
+ define_getter :seq
175
+ define_getter :qual
176
+
177
+ alias isize insert_size
178
+ alias mpos mate_pos
179
+
180
+ def aux(tag)
181
+ warn "experimental"
182
+ check_closed
183
+ position = tell
184
+ ary = map { |r| r.aux(tag) }
185
+ seek(position)
186
+ ary
187
+ end
188
+
189
+ # @!macro [attach] define_iterator
190
+ # @method each_$1
191
+ # Get $1 iterator
192
+ define_iterator :qname
193
+ define_iterator :flag
194
+ define_iterator :chrom
195
+ define_iterator :pos
196
+ define_iterator :mapq
197
+ define_iterator :cigar
198
+ define_iterator :mate_chrom
199
+ define_iterator :mate_pos
200
+ define_iterator :insert_size
201
+ define_iterator :seq
202
+ define_iterator :qual
203
+
204
+ alias each_isize each_insert_size
205
+ alias each_mpos each_mate_pos
206
+
207
+ def each_aux(tag)
208
+ warn "experimental"
209
+ check_closed
210
+ return to_enum(__method__, tag) unless block_given?
211
+
212
+ each do |record|
213
+ yield record.aux(tag)
214
+ end
215
+
216
+ self
217
+ end
148
218
  end
149
219
  end
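data/lib/hts/bcf/format.rb CHANGED  [file header missing from this listing; path inferred from the HTS::Bcf::Format class in this hunk, confirm against the gem]
@@ -1,11 +1,7 @@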
1
1
  # frozen_string_literal: true
2
2
 
3
- # https://github.com/brentp/hts-nim/blob/master/src/hts/vcf.nim
4
- # This is a port from Nim.
5
- # TODO: Make it more like Ruby.
6
-
7
3
  module HTS
8
- class Bcf
4
+ class Bcf < Hts
9
5
  class Format
10
6
  def initialize(record)
11
7
  @record = record
@@ -32,20 +28,32 @@ module HTS
32
28
  get(key, :string)
33
29
  end
34
30
 
31
+ def [](key)
32
+ get(key)
33
+ end
34
+
35
35
  def get(key, type = nil)
36
36
  n = FFI::MemoryPointer.new(:int)
37
37
  p1 = @p1
38
38
  h = @record.header.struct
39
39
  r = @record.struct
40
40
 
41
- format_values = proc do |type|
42
- ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, type)
41
+ format_values = proc do |typ|
42
+ ret = LibHTS.bcf_get_format_values(h, r, key, p1, n, typ)
43
43
  return nil if ret < 0 # return from method.
44
44
 
45
45
  p1.read_pointer
46
46
  end
47
47
 
48
- case type.to_sym
48
+ # The GT FORMAT field is special in that it is marked as a string in the header,
49
+ # but it is actually encoded as an integer.
50
+ if key == "GT"
51
+ type = :int
52
+ elsif type.nil?
53
+ type = ht_type_to_sym(get_fmt_type(key))
54
+ end
55
+
56
+ case type&.to_sym
49
57
  when :int, :int32
50
58
  format_values.call(LibHTS::BCF_HT_INT)
51
59
  .read_array_of_int32(n.read_int)
@@ -53,22 +61,85 @@ module HTS
53
61
  format_values.call(LibHTS::BCF_HT_REAL)
54
62
  .read_array_of_float(n.read_int)
55
63
  when :flag
56
- raise NotImplementedError, "Flag type not implemented yet."
64
+ raise NotImplementedError, "Flag type not implemented yet. " \
65
+ "Please file an issue on GitHub."
57
66
  # format_values.call(LibHTS::BCF_HT_FLAG)
58
67
  # .read_int == 1
59
68
  when :string, :str
60
- raise NotImplementedError, "String type not implemented yet."
69
+ raise NotImplementedError, "String type not implemented yet. " \
70
+ "Please file an issue on GitHub."
61
71
  # format_values.call(LibHTS::BCF_HT_STR)
62
72
  # .read_string
63
73
  end
64
74
  end
65
75
 
66
- def set; end
76
+ def fields
77
+ ids.map do |id|
78
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
79
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_FMT, id)
80
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
81
+ {
82
+ name:,
83
+ n: num,
84
+ type: ht_type_to_sym(type),
85
+ id:
86
+ }
87
+ end
88
+ end
67
89
 
68
- # def fields # iterator
69
- # end
90
+ def length
91
+ @record.struct[:n_fmt]
92
+ end
93
+
94
+ def size
95
+ length
96
+ end
97
+
98
+ def to_h
99
+ ret = {}
100
+ ids.each do |id|
101
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
102
+ ret[name] = get(name)
103
+ end
104
+ ret
105
+ end
70
106
 
71
- def genotypes; end
107
+ # def genotypes; end
108
+
109
+ private
110
+
111
+ def fmt_ptr
112
+ @record.struct[:d][:fmt].to_ptr
113
+ end
114
+
115
+ def ids
116
+ fmt_ptr.read_array_of_struct(LibHTS::BcfFmt, length).map do |fmt|
117
+ fmt[:id]
118
+ end
119
+ end
120
+
121
+ def get_fmt_type(qname)
122
+ @record.struct[:n_fmt].times do |i|
123
+ fmt = LibHTS::BcfFmt.new(@record.struct[:d][:fmt] + i * LibHTS::BcfFmt.size)
124
+ id = fmt[:id]
125
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
126
+ if name == qname
127
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
128
+ return type
129
+ end
130
+ end
131
+ nil
132
+ end
133
+
134
+ def ht_type_to_sym(t)
135
+ case t
136
+ when LibHTS::BCF_HT_FLAG then :flag
137
+ when LibHTS::BCF_HT_INT then :int
138
+ when LibHTS::BCF_HT_REAL then :float
139
+ when LibHTS::BCF_HT_STR then :string
140
+ when LibHTS::BCF_HT_LONG then :float
141
+ end
142
+ end
72
143
  end
73
144
  end
74
145
  end
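data/lib/hts/bcf/header.rb CHANGED  [file header missing from this listing; path inferred from the HTS::Bcf::Header class in this hunk, confirm against the gem]
@@ -1,7 +1,8 @@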
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HTS
4
- class Bcf
4
+ class Bcf < Hts
5
+ # A class for working with VCF records.
5
6
  class Header
6
7
  def initialize(hts_file)
7
8
  @bcf_hdr = LibHTS.bcf_hdr_read(hts_file)
@@ -19,14 +20,14 @@ module HTS
19
20
  LibHTS.bcf_hdr_get_version(@bcf_hdr)
20
21
  end
21
22
 
22
- def sample_count
23
+ def nsamples
23
24
  LibHTS.bcf_hdr_nsamples(@bcf_hdr)
24
25
  end
25
26
 
26
- def sample_names
27
+ def samples
27
28
  # bcf_hdr_id2name is macro function
28
29
  @bcf_hdr[:samples]
29
- .read_array_of_pointer(sample_count)
30
+ .read_array_of_pointer(nsamples)
30
31
  .map(&:read_string)
31
32
  end
32
33
 
data/lib/hts/bcf/info.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HTS
4
- class Bcf
4
+ class Bcf < Hts
5
+ # Info field
5
6
  class Info
6
7
  def initialize(record)
7
8
  @record = record
@@ -28,6 +29,10 @@ module HTS
28
29
  get(key, :flag)
29
30
  end
30
31
 
32
+ def [](key)
33
+ get(key)
34
+ end
35
+
31
36
  # @note Specify the type. If you don't specify a type, it will still work, but it will be slower.
32
37
  def get(key, type = nil)
33
38
  n = FFI::MemoryPointer.new(:int)
@@ -35,14 +40,14 @@ module HTS
35
40
  h = @record.header.struct
36
41
  r = @record.struct
37
42
 
38
- info_values = proc do |type|
39
- ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, type)
43
+ info_values = proc do |typ|
44
+ ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, typ)
40
45
  return nil if ret < 0 # return from method.
41
46
 
42
47
  p1.read_pointer
43
48
  end
44
49
 
45
- type ||= info_type_to_string(get_info_type(key))
50
+ type ||= ht_type_to_sym(get_info_type(key))
46
51
 
47
52
  case type&.to_sym
48
53
  when :int, :int32
@@ -67,47 +72,68 @@ module HTS
67
72
 
68
73
  # FIXME: naming? room for improvement.
69
74
  def fields
70
- n_info = @record.struct[:n_info]
71
- Array.new(n_info) do |i|
72
- fld = LibHTS::BcfInfo.new(
73
- @record.struct[:d][:info] +
74
- i * LibHTS::BcfInfo.size
75
- )
75
+ keys.map do |key|
76
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
77
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_INFO, key)
78
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, key)
76
79
  {
77
- name: LibHTS.bcf_hdr_int2id(
78
- @record.header.struct, LibHTS::BCF_DT_ID, fld[:key]
79
- ),
80
- n: LibHTS.bcf_hdr_id2number(
81
- @record.header.struct, LibHTS::BCF_HL_INFO, fld[:key]
82
- ),
83
- vtype: fld[:type], i: fld[:key]
80
+ name:,
81
+ n: num,
82
+ type: ht_type_to_sym(type),
83
+ key:
84
84
  }
85
85
  end
86
86
  end
87
87
 
88
+ def length
89
+ @record.struct[:n_info]
90
+ end
91
+
92
+ def size
93
+ length
94
+ end
95
+
96
+ def to_h
97
+ ret = {}
98
+ keys.each do |key|
99
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
100
+ ret[name] = get(name)
101
+ end
102
+ ret
103
+ end
104
+
88
105
  private
89
106
 
107
+ def info_ptr
108
+ @record.struct[:d][:info].to_ptr
109
+ end
110
+
111
+ def keys
112
+ info_ptr.read_array_of_struct(LibHTS::BcfInfo, length).map do |info|
113
+ info[:key]
114
+ end
115
+ end
116
+
90
117
  def get_info_type(key)
91
118
  @record.struct[:n_info].times do |i|
92
- fld = LibHTS::BcfInfo.new(
93
- @record.struct[:d][:info] +
94
- i * LibHTS::BcfInfo.size
95
- )
96
- id = LibHTS.bcf_hdr_int2id(
97
- @record.header.struct, LibHTS::BCF_DT_ID, fld[:key]
98
- )
99
- return fld[:type] if id == key
119
+ info = LibHTS::BcfInfo.new(@record.struct[:d][:info] + i * LibHTS::BcfInfo.size)
120
+ k = info[:key]
121
+ id = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, k)
122
+ if id == key
123
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, k)
124
+ return type
125
+ end
100
126
  end
127
+ nil
101
128
  end
102
129
 
103
- def info_type_to_string(t)
130
+ def ht_type_to_sym(t)
104
131
  case t
105
- when 0 then :flag
106
- when 1, 2, 3, 4 then :int
107
- when 5 then :float
108
- when 7 then :string
109
- else
110
- raise "Unknown info type: #{t}"
132
+ when LibHTS::BCF_HT_FLAG then :flag
133
+ when LibHTS::BCF_HT_INT then :int
134
+ when LibHTS::BCF_HT_REAL then :float
135
+ when LibHTS::BCF_HT_STR then :string
136
+ when LibHTS::BCF_HT_LONG then :float
111
137
  end
112
138
  end
113
139
  end