htslib 0.0.6 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/hts/bam.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Based on hts-python
4
- # https://github.com/quinlan-lab/hts-python
3
+ require_relative "../htslib"
5
4
 
5
+ require_relative "hts"
6
6
  require_relative "bam/header"
7
7
  require_relative "bam/cigar"
8
8
  require_relative "bam/flag"
@@ -12,99 +12,143 @@ module HTS
12
12
  class Bam
13
13
  include Enumerable
14
14
 
15
- attr_reader :file_path, :mode, :header
16
- # HtfFile is FFI::BitStruct
17
- attr_reader :htf_file
15
+ attr_reader :file_name, :index_name, :mode, :header
18
16
 
19
- class << self
20
- alias open new
21
- end
17
+ def self.open(*args, **kw)
18
+ file = new(*args, **kw) # do not yield
19
+ return file unless block_given?
22
20
 
23
- def initialize(file_path, mode = "r", create_index: nil)
24
- file_path = File.expand_path(file_path)
21
+ begin
22
+ yield file
23
+ ensure
24
+ file.close
25
+ end
26
+ file
27
+ end
25
28
 
26
- unless File.exist?(file_path)
27
- message = "No such SAM/BAM file - #{file_path}"
29
+ def initialize(file_name, mode = "r", index: nil, fai: nil, threads: nil,
30
+ create_index: false)
31
+ if block_given?
32
+ message = "HTS::Bam.new() dose not take block; Please use HTS::Bam.open() instead"
28
33
  raise message
29
34
  end
30
35
 
31
- @file_path = file_path
32
- @mode = mode
33
- @htf_file = LibHTS.hts_open(file_path, mode)
34
- @header = Bam::Header.new(LibHTS.sam_hdr_read(htf_file))
35
-
36
- # read
37
- if mode[0] == "r"
38
- # load index
39
- @idx = LibHTS.sam_index_load(htf_file, file_path)
40
- # create index
41
- if create_index || (@idx.null? && create_index.nil?)
42
- warn "Create index for #{file_path}"
43
- LibHTS.sam_index_build(file_path, -1)
44
- @idx = LibHTS.sam_index_load(htf_file, file_path)
45
- end
46
- else
47
- # FIXME: implement
48
- raise "not implemented yet."
36
+ # NOTE: Do not check for the existence of local files, since file_names may be remote URIs.
37
+
38
+ @file_name = file_name
39
+ @index_name = index
40
+ @mode = mode
41
+ @hts_file = LibHTS.hts_open(@file_name, mode)
42
+
43
+ raise Errno::ENOENT, "Failed to open #{@file_name}" if @hts_file.null?
44
+
45
+ if fai
46
+ r = LibHTS.hts_set_fai_filename(@hts_file, fai)
47
+ raise "Failed to load fasta index: #{fai}" if r < 0
49
48
  end
50
49
 
51
- # IO like API
52
- if block_given?
53
- begin
54
- yield self
55
- ensure
56
- close
57
- end
50
+ if threads&.> 0
51
+ r = LibHTS.hts_set_threads(@hts_file, threads)
52
+ raise "Failed to set number of threads: #{threads}" if r < 0
58
53
  end
59
- end
60
54
 
61
- def struct
62
- htf_file
55
+ return if @mode[0] == "w"
56
+
57
+ @header = Bam::Header.new(@hts_file)
58
+
59
+ create_index(index) if create_index
60
+
61
+ @idx = load_index(index)
62
+
63
+ @start_position = tell
63
64
  end
64
65
 
65
- def to_ptr
66
- htf_file.to_ptr
66
+ def create_index(index_name = nil)
67
+ warn "Create index for #{@file_name} to #{index_name}"
68
+ if index
69
+ LibHTS.sam_index_build2(@file_name, index_name, -1)
70
+ else
71
+ LibHTS.sam_index_build(@file_name, -1)
72
+ end
67
73
  end
68
74
 
69
- def write(alns)
70
- alns.each do
71
- LibHTS.sam_write1(htf_file, header, alns.b) > 0 || raise
75
+ def load_index(index_name = nil)
76
+ if index_name
77
+ LibHTS.sam_index_load2(@hts_file, @file_name, index_name)
78
+ else
79
+ LibHTS.sam_index_load3(@hts_file, @file_name, nil, 2) # should be 3 ? (copy remote file to local?)
72
80
  end
73
81
  end
74
82
 
83
+ def index_loaded?
84
+ !@idx.null?
85
+ end
86
+
75
87
  # Close the current file.
76
88
  def close
77
- LibHTS.hts_close(htf_file)
89
+ LibHTS.hts_idx_destroy(@idx) if @idx&.null?
90
+ @idx = nil
91
+ super
92
+ end
93
+
94
+ def write_header(header)
95
+ raise IOError, "closed stream" if closed?
96
+
97
+ @header = header.dup
98
+ LibHTS.hts_set_fai_filename(@hts_file, @file_name)
99
+ LibHTS.sam_hdr_write(@hts_file, header)
78
100
  end
79
101
 
80
- # Flush the current file.
81
- def flush
82
- # LibHTS.bgzf_flush(@htf_file.fp.bgzf)
102
+ def write(aln)
103
+ raise IOError, "closed stream" if closed?
104
+
105
+ aln_dup = aln.dup
106
+ LibHTS.sam_write1(@hts_file, header, aln_dup) > 0 || raise
83
107
  end
84
108
 
109
+ # Iterate over each record.
110
+ # Generate a new Record object each time.
111
+ # Slower than each.
112
+ def each_copy
113
+ raise IOError, "closed stream" if closed?
114
+ return to_enum(__method__) unless block_given?
115
+
116
+ while LibHTS.sam_read1(@hts_file, header, bam1 = LibHTS.bam_init1) != -1
117
+ record = Record.new(bam1, header)
118
+ yield record
119
+ end
120
+ self
121
+ end
122
+
123
+ # Iterate over each record.
124
+ # Record object is reused.
125
+ # Faster than each_copy.
85
126
  def each
127
+ raise IOError, "closed stream" if closed?
86
128
  # Each does not always start at the beginning of the file.
87
129
  # This is the common behavior of IO objects in Ruby.
88
130
  # This may change in the future.
89
131
  return to_enum(__method__) unless block_given?
90
132
 
91
- while LibHTS.sam_read1(htf_file, header, bam1 = LibHTS.bam_init1) > 0
92
- record = Record.new(bam1, header)
93
- yield record
94
- end
133
+ bam1 = LibHTS.bam_init1
134
+ record = Record.new(bam1, header)
135
+ yield record while LibHTS.sam_read1(@hts_file, header, bam1) != -1
95
136
  self
96
137
  end
97
138
 
98
139
  # query [WIP]
99
140
  def query(region)
141
+ raise IOError, "closed stream" if closed?
142
+ raise "Index file is required to call the query method." unless index_loaded?
143
+
100
144
  qiter = LibHTS.sam_itr_querys(@idx, header, region)
101
145
  begin
102
146
  bam1 = LibHTS.bam_init1
103
- slen = LibHTS.sam_itr_next(htf_file, qiter, bam1)
147
+ slen = LibHTS.sam_itr_next(@hts_file, qiter, bam1)
104
148
  while slen > 0
105
149
  yield Record.new(bam1, header)
106
150
  bam1 = LibHTS.bam_init1
107
- slen = LibHTS.sam_itr_next(htf_file, qiter, bam1)
151
+ slen = LibHTS.sam_itr_next(@hts_file, qiter, bam1)
108
152
  end
109
153
  ensure
110
154
  LibHTS.hts_itr_destroy(qiter)
data/lib/hts/bcf/format.rb CHANGED
@@ -1,21 +1,41 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # https://github.com/brentp/hts-nim/blob/master/src/hts/vcf.nim
4
- # This is a port from Nim.
5
- # TODO: Make it more like Ruby.
6
-
7
3
  module HTS
8
- class Bcf
4
+ class Bcf < Hts
9
5
  class Format
10
6
  def initialize(record)
11
7
  @record = record
12
8
  @p1 = FFI::MemoryPointer.new(:pointer) # FIXME: naming
13
9
  end
14
10
 
11
+ # For compatibility with htslib.cr.
12
+ def get_int(key)
13
+ get(key, :int)
14
+ end
15
+
16
+ # For compatibility with htslib.cr.
17
+ def get_float(key)
18
+ get(key, :float)
19
+ end
20
+
21
+ # For compatibility with htslib.cr.
22
+ def get_flag(key)
23
+ get(key, :flag)
24
+ end
25
+
26
+ # For compatibility with htslib.cr.
27
+ def get_string(key)
28
+ get(key, :string)
29
+ end
30
+
31
+ def [](key)
32
+ get(key)
33
+ end
34
+
15
35
  def get(key, type = nil)
16
36
  n = FFI::MemoryPointer.new(:int)
17
37
  p1 = @p1
18
- h = @record.bcf.header.struct
38
+ h = @record.header.struct
19
39
  r = @record.struct
20
40
 
21
41
  format_values = proc do |type|
@@ -25,7 +45,15 @@ module HTS
25
45
  p1.read_pointer
26
46
  end
27
47
 
28
- case type.to_sym
48
+ # The GT FORMAT field is special in that it is marked as a string in the header,
49
+ # but it is actually encoded as an integer.
50
+ if key == "GT"
51
+ type = :int
52
+ elsif type.nil?
53
+ type = ht_type_to_sym(get_fmt_type(key))
54
+ end
55
+
56
+ case type&.to_sym
29
57
  when :int, :int32
30
58
  format_values.call(LibHTS::BCF_HT_INT)
31
59
  .read_array_of_int32(n.read_int)
@@ -33,21 +61,85 @@ module HTS
33
61
  format_values.call(LibHTS::BCF_HT_REAL)
34
62
  .read_array_of_float(n.read_int)
35
63
  when :flag
36
- format_values.call(LibHTS::BCF_HT_FLAG)
37
- .read_int == 1
64
+ raise NotImplementedError, "Flag type not implemented yet. " \
65
+ "Please file an issue on GitHub."
66
+ # format_values.call(LibHTS::BCF_HT_FLAG)
67
+ # .read_int == 1
38
68
  when :string, :str
39
- raise NotImplementedError, "String type not implemented yet."
40
- format_values.call(LibHTS::BCF_HT_STR)
41
- .read_string
69
+ raise NotImplementedError, "String type not implemented yet. " \
70
+ "Please file an issue on GitHub."
71
+ # format_values.call(LibHTS::BCF_HT_STR)
72
+ # .read_string
42
73
  end
43
74
  end
44
75
 
45
- def set; end
76
+ def fields
77
+ ids.map do |id|
78
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
79
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_FMT, id)
80
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
81
+ {
82
+ name: name,
83
+ n: num,
84
+ type: ht_type_to_sym(type),
85
+ id: id
86
+ }
87
+ end
88
+ end
46
89
 
47
- # def fields # iterator
48
- # end
90
+ def length
91
+ @record.struct[:n_fmt]
92
+ end
49
93
 
50
- def genotypes; end
94
+ def size
95
+ length
96
+ end
97
+
98
+ def to_h
99
+ ret = {}
100
+ ids.each do |id|
101
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
102
+ ret[name] = get(name)
103
+ end
104
+ ret
105
+ end
106
+
107
+ # def genotypes; end
108
+
109
+ private
110
+
111
+ def fmt_ptr
112
+ @record.struct[:d][:fmt].to_ptr
113
+ end
114
+
115
+ def ids
116
+ fmt_ptr.read_array_of_struct(LibHTS::BcfFmt, length).map do |fmt|
117
+ fmt[:id]
118
+ end
119
+ end
120
+
121
+ def get_fmt_type(qname)
122
+ @record.struct[:n_fmt].times do |i|
123
+ fmt = LibHTS::BcfFmt.new(@record.struct[:d][:fmt] + i * LibHTS::BcfFmt.size)
124
+ id = fmt[:id]
125
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, id)
126
+ if name == qname
127
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_FMT, id)
128
+ return type
129
+ end
130
+ end
131
+ nil
132
+ end
133
+
134
+ def ht_type_to_sym(t)
135
+ case t
136
+ when LibHTS::BCF_HT_FLAG then :flag
137
+ when LibHTS::BCF_HT_INT then :int
138
+ when LibHTS::BCF_HT_REAL then :float
139
+ when LibHTS::BCF_HT_STR then :string
140
+ when LibHTS::BCF_HT_LONG then :float
141
+ end
142
+ end
51
143
  end
52
144
  end
53
145
  end
data/lib/hts/bcf/header.rb CHANGED
@@ -1,10 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HTS
4
- class Bcf
4
+ class Bcf < Hts
5
5
  class Header
6
- def initialize(bcf_hdr)
7
- @bcf_hdr = bcf_hdr
6
+ def initialize(hts_file)
7
+ @bcf_hdr = LibHTS.bcf_hdr_read(hts_file)
8
8
  end
9
9
 
10
10
  def struct
@@ -15,12 +15,33 @@ module HTS
15
15
  @bcf_hdr.to_ptr
16
16
  end
17
17
 
18
+ def get_version
19
+ LibHTS.bcf_hdr_get_version(@bcf_hdr)
20
+ end
21
+
22
+ def nsamples
23
+ LibHTS.bcf_hdr_nsamples(@bcf_hdr)
24
+ end
25
+
26
+ def samples
27
+ # bcf_hdr_id2name is macro function
28
+ @bcf_hdr[:samples]
29
+ .read_array_of_pointer(nsamples)
30
+ .map(&:read_string)
31
+ end
32
+
18
33
  def to_s
19
34
  kstr = LibHTS::KString.new
20
35
  raise "Failed to get header string" unless LibHTS.bcf_hdr_format(@bcf_hdr, 0, kstr)
21
36
 
22
37
  kstr[:s]
23
38
  end
39
+
40
+ private
41
+
42
+ def initialize_copy(orig)
43
+ @bcf_hdr = LibHTS.bcf_hdr_dup(orig.struct)
44
+ end
24
45
  end
25
46
  end
26
47
  end
data/lib/hts/bcf/info.rb CHANGED
@@ -1,27 +1,52 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HTS
4
- class Bcf
4
+ class Bcf < Hts
5
5
  class Info
6
6
  def initialize(record)
7
7
  @record = record
8
+ @p1 = FFI::MemoryPointer.new(:pointer) # FIXME: naming
9
+ end
10
+
11
+ # For compatibility with htslib.cr.
12
+ def get_int(key)
13
+ get(key, :int)
14
+ end
15
+
16
+ # For compatibility with htslib.cr.
17
+ def get_float(key)
18
+ get(key, :float)
19
+ end
20
+
21
+ # For compatibility with htslib.cr.
22
+ def get_string(key)
23
+ get(key, :string)
24
+ end
25
+
26
+ # For compatibility with htslib.cr.
27
+ def get_flag(key)
28
+ get(key, :flag)
29
+ end
30
+
31
+ def [](key)
32
+ get(key)
8
33
  end
9
34
 
10
35
  # @note Specify the type. If you don't specify a type, it will still work, but it will be slower.
11
36
  def get(key, type = nil)
12
37
  n = FFI::MemoryPointer.new(:int)
13
- p1 = @record.p1
14
- h = @record.bcf.header.struct
38
+ p1 = @p1
39
+ h = @record.header.struct
15
40
  r = @record.struct
16
41
 
17
- info_values = proc do |type|
18
- ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, type)
42
+ info_values = proc do |typ|
43
+ ret = LibHTS.bcf_get_info_values(h, r, key, p1, n, typ)
19
44
  return nil if ret < 0 # return from method.
20
45
 
21
46
  p1.read_pointer
22
47
  end
23
48
 
24
- type ||= info_type_to_string(get_info_type(key))
49
+ type ||= ht_type_to_sym(get_info_type(key))
25
50
 
26
51
  case type&.to_sym
27
52
  when :int, :int32
@@ -46,47 +71,68 @@ module HTS
46
71
 
47
72
  # FIXME: naming? room for improvement.
48
73
  def fields
49
- n_info = @record.struct[:n_info]
50
- Array.new(n_info) do |i|
51
- fld = LibHTS::BcfInfo.new(
52
- @record.struct[:d][:info] +
53
- i * LibHTS::BcfInfo.size
54
- )
74
+ keys.map do |key|
75
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
76
+ num = LibHTS.bcf_hdr_id2number(@record.header.struct, LibHTS::BCF_HL_INFO, key)
77
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, key)
55
78
  {
56
- name: LibHTS.bcf_hdr_int2id(
57
- @record.bcf.header.struct, LibHTS::BCF_DT_ID, fld[:key]
58
- ),
59
- n: LibHTS.bcf_hdr_id2number(
60
- @record.bcf.header.struct, LibHTS::BCF_HL_INFO, fld[:key]
61
- ),
62
- vtype: fld[:type], i: fld[:key]
79
+ name: name,
80
+ n: num,
81
+ type: ht_type_to_sym(type),
82
+ key: key
63
83
  }
64
84
  end
65
85
  end
66
86
 
87
+ def length
88
+ @record.struct[:n_info]
89
+ end
90
+
91
+ def size
92
+ length
93
+ end
94
+
95
+ def to_h
96
+ ret = {}
97
+ keys.each do |key|
98
+ name = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, key)
99
+ ret[name] = get(name)
100
+ end
101
+ ret
102
+ end
103
+
67
104
  private
68
105
 
106
+ def info_ptr
107
+ @record.struct[:d][:info].to_ptr
108
+ end
109
+
110
+ def keys
111
+ info_ptr.read_array_of_struct(LibHTS::BcfInfo, length).map do |info|
112
+ info[:key]
113
+ end
114
+ end
115
+
69
116
  def get_info_type(key)
70
117
  @record.struct[:n_info].times do |i|
71
- fld = LibHTS::BcfInfo.new(
72
- @record.struct[:d][:info] +
73
- i * LibHTS::BcfInfo.size
74
- )
75
- id = LibHTS.bcf_hdr_int2id(
76
- @record.bcf.header.struct, LibHTS::BCF_DT_ID, fld[:key]
77
- )
78
- return fld[:type] if id == key
118
+ info = LibHTS::BcfInfo.new(@record.struct[:d][:info] + i * LibHTS::BcfInfo.size)
119
+ k = info[:key]
120
+ id = LibHTS.bcf_hdr_int2id(@record.header.struct, LibHTS::BCF_DT_ID, k)
121
+ if id == key
122
+ type = LibHTS.bcf_hdr_id2type(@record.header.struct, LibHTS::BCF_HL_INFO, k)
123
+ return type
124
+ end
79
125
  end
126
+ nil
80
127
  end
81
128
 
82
- def info_type_to_string(t)
129
+ def ht_type_to_sym(t)
83
130
  case t
84
- when 0 then :flag
85
- when 1, 2, 3, 4 then :int
86
- when 5 then :float
87
- when 7 then :string
88
- else
89
- raise "Unknown info type: #{t}"
131
+ when LibHTS::BCF_HT_FLAG then :flag
132
+ when LibHTS::BCF_HT_INT then :int
133
+ when LibHTS::BCF_HT_REAL then :float
134
+ when LibHTS::BCF_HT_STR then :string
135
+ when LibHTS::BCF_HT_LONG then :float
90
136
  end
91
137
  end
92
138
  end