bio-bgzf 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +2 -2
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/bio-bgzf.rb +2 -1
- data/lib/bio-bgzf/block.rb +13 -7
- data/lib/bio-bgzf/constants.rb +3 -1
- data/lib/bio-bgzf/reader.rb +24 -2
- data/lib/bio-bgzf/writer.rb +65 -0
- data/spec/bio-bgzf_spec.rb +3 -1
- metadata +9 -10
- data/README.rdoc +0 -48
data/README.md
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
[](http://travis-ci.org/csw/bioruby-bgzf)
|
4
4
|
|
5
|
-
This library provides support for
|
6
|
-
|
5
|
+
This library provides support for [BGZF][] (Blocked GZip Format) in
|
6
|
+
Ruby. BGZF, originally defined as part of the [SAM/BAM][]
|
7
7
|
specification, is used to compress record-oriented bioinformatics data
|
8
8
|
in a way that facilitates random access, unlike plain gzip. A BGZF
|
9
9
|
file consists of concatenated 64 KB blocks, each an independent gzip
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ Jeweler::Tasks.new do |gem|
|
|
18
18
|
gem.homepage = "http://github.com/csw/bioruby-bgzf"
|
19
19
|
gem.license = "MIT"
|
20
20
|
gem.summary = %Q{Reading/writing BGZF blocks}
|
21
|
-
gem.description = %Q{
|
21
|
+
gem.description = %Q{This library provides support for BGZF (Blocked GZip Format) in Ruby. BGZF, originally defined as part of the SAM/BAM specification, is used to compress record-oriented data in a way that facilitates random access, unlike plain gzip. BGZF is principally used for bioinformatics data but would be useful in other contexts as well.}
|
22
22
|
gem.email = "cswh@umich.edu"
|
23
23
|
gem.authors = ["Artem Tarasov", "Clayton Wheeler"]
|
24
24
|
# dependencies defined in Gemfile
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/lib/bio-bgzf.rb
CHANGED
data/lib/bio-bgzf/block.rb
CHANGED
@@ -1,18 +1,24 @@
|
|
1
1
|
module Bio::BGZF
|
2
2
|
|
3
|
+
class FormatError < StandardError
|
4
|
+
end
|
5
|
+
|
6
|
+
class NotBGZFError < FormatError
|
7
|
+
end
|
8
|
+
|
3
9
|
def read_bgzf_block(f)
|
4
10
|
hstart = f.read(12)
|
5
11
|
return nil if hstart == nil # EOF?
|
6
12
|
magic, gzip_extra_length = hstart.unpack('Vxxxxxxv')
|
7
|
-
raise
|
13
|
+
raise NotBGZFError, "wrong BGZF magic: #{sprintf('%08x', magic)}" unless magic == 0x04088B1F
|
8
14
|
|
9
15
|
len = 0
|
10
16
|
bsize = nil
|
11
17
|
while len < gzip_extra_length do
|
12
18
|
si1, si2, slen = f.read(4).unpack('CCv')
|
13
19
|
if si1 == 66 and si2 == 67 then
|
14
|
-
raise "BC subfield length is #{slen} but must be 2" if slen != 2
|
15
|
-
raise 'duplicate field with block size' unless bsize.nil?
|
20
|
+
raise FormatError, "BC subfield length is #{slen} but must be 2" if slen != 2
|
21
|
+
raise FormatError, 'duplicate field with block size' unless bsize.nil?
|
16
22
|
bsize = f.read(2).unpack('v')[0]
|
17
23
|
f.seek(slen - 2, IO::SEEK_CUR)
|
18
24
|
else
|
@@ -22,9 +28,9 @@ module Bio::BGZF
|
|
22
28
|
end
|
23
29
|
|
24
30
|
if len != gzip_extra_length then
|
25
|
-
raise "total length of subfields is #{len} bytes but must be #{gzip_extra_length}"
|
31
|
+
raise FormatError, "total length of subfields is #{len} bytes but must be #{gzip_extra_length}"
|
26
32
|
end
|
27
|
-
raise 'block size was not found in any subfield' if bsize.nil?
|
33
|
+
raise NotBGZFError, 'block size was not found in any subfield' if bsize.nil?
|
28
34
|
|
29
35
|
compressed_data = f.read(bsize - gzip_extra_length - 19)
|
30
36
|
crc32, input_size = f.read(8).unpack('VV')
|
@@ -38,11 +44,11 @@ module Bio::BGZF
|
|
38
44
|
return nil if cdata == nil
|
39
45
|
data = unpack(cdata)
|
40
46
|
if data.bytesize != in_size
|
41
|
-
raise "Expected #{in_size} bytes from BGZF block at #{pos}, but got #{data.bytesize} bytes!"
|
47
|
+
raise FormatError, "Expected #{in_size} bytes from BGZF block at #{pos}, but got #{data.bytesize} bytes!"
|
42
48
|
end
|
43
49
|
crc = Zlib.crc32(data, 0)
|
44
50
|
if crc != expected_crc
|
45
|
-
raise "CRC error: expected #{expected_crc.to_s(16)}, got #{crc.to_s(16)}"
|
51
|
+
raise FormatError, "CRC error: expected #{expected_crc.to_s(16)}, got #{crc.to_s(16)}"
|
46
52
|
end
|
47
53
|
return data
|
48
54
|
end
|
data/lib/bio-bgzf/constants.rb
CHANGED
data/lib/bio-bgzf/reader.rb
CHANGED
@@ -10,14 +10,32 @@ module Bio::BGZF
|
|
10
10
|
@cur_block = nil
|
11
11
|
end
|
12
12
|
|
13
|
+
# Returns the reader's current virtual offset. Between
|
14
|
+
# {#read_block} calls, the file position will always be at the
|
15
|
+
# start of a block or at EOF, so the low 16 bits of the virtual
|
16
|
+
# offset will always be zero.
|
17
|
+
#
|
18
|
+
# @return [Integer] virtual offset for current position
|
13
19
|
def tell
|
14
20
|
f.tell << 16
|
15
21
|
end
|
16
22
|
|
23
|
+
# Reads the BGZF block at the current position. Returns its
|
24
|
+
# decompressed data.
|
25
|
+
#
|
26
|
+
# @return [String] decompressed block data
|
17
27
|
def read_block
|
18
28
|
decompress_block(f)
|
19
29
|
end
|
20
30
|
|
31
|
+
# Reads a portion of a BGZF block, starting from the given virtual
|
32
|
+
# offset. If the offset is the start of a block (low 16 bits are
|
33
|
+
# zero) the entire block's data will be returned. Otherwise, the
|
34
|
+
# subset of the data starting at the given offset will be
|
35
|
+
# returned.
|
36
|
+
#
|
37
|
+
# @param [Integer] vo virtual offset to start from
|
38
|
+
# @return [String] decompressed block data
|
21
39
|
def read_block_at(vo)
|
22
40
|
block_offset = vo_block_offset(vo)
|
23
41
|
data_offset = vo_data_offset(vo)
|
@@ -30,10 +48,14 @@ module Bio::BGZF
|
|
30
48
|
end
|
31
49
|
end
|
32
50
|
|
51
|
+
# Iterates over the blocks in a BGZF file, yielding [block, vo] pairs where vo is the virtual offset at which the block starts.
|
33
52
|
def each_block
|
34
53
|
if block_given?
|
35
|
-
while
|
36
|
-
|
54
|
+
while true
|
55
|
+
pos = tell
|
56
|
+
b = read_block
|
57
|
+
break unless b
|
58
|
+
yield b, pos
|
37
59
|
end
|
38
60
|
else
|
39
61
|
enum_for(:each_block)
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module Bio::BGZF
|
2
|
+
|
3
|
+
class Writer
|
4
|
+
include Bio::BGZF
|
5
|
+
|
6
|
+
attr_reader :f, :buf
|
7
|
+
|
8
|
+
def initialize(f)
|
9
|
+
@f = f
|
10
|
+
@buf = ''
|
11
|
+
if block_given?
|
12
|
+
begin
|
13
|
+
yield self
|
14
|
+
ensure
|
15
|
+
self.close
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def tell
|
21
|
+
f.tell << 16
|
22
|
+
end
|
23
|
+
|
24
|
+
def write_buf
|
25
|
+
if buf.size > 0
|
26
|
+
raise "Buffer too large: #{buf.bytesize}" if buf.bytesize > MAX_BYTES
|
27
|
+
block = pack(buf)
|
28
|
+
f.write(block)
|
29
|
+
@buf = ''
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def write(s)
|
34
|
+
if s.bytesize > MAX_BYTES
|
35
|
+
write_buf
|
36
|
+
_each_slice(s) do |slice|
|
37
|
+
write(slice)
|
38
|
+
end
|
39
|
+
else
|
40
|
+
if (s.bytesize + buf.bytesize) > MAX_BYTES
|
41
|
+
write_buf
|
42
|
+
end
|
43
|
+
buf << s
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def _each_slice(s)
|
48
|
+
n = 0
|
49
|
+
size = s.bytesize
|
50
|
+
while true
|
51
|
+
offset = n * MAX_BYTES
|
52
|
+
break if offset >= size
|
53
|
+
yield s.slice(offset, MAX_BYTES)
|
54
|
+
n += 1
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def close
|
59
|
+
write_buf
|
60
|
+
f.close
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
data/spec/bio-bgzf_spec.rb
CHANGED
@@ -11,8 +11,10 @@ describe Bio::BGZF do
|
|
11
11
|
it "should be able to read BGZF blocks from a samtools file" do
|
12
12
|
File.open("test/data/mm8.chrM.maf.gz") do |f|
|
13
13
|
r = Bio::BGZF::Reader.new(f)
|
14
|
-
r.each_block do |block|
|
14
|
+
r.each_block do |block, pos|
|
15
15
|
block.size.should <= 65536
|
16
|
+
pos.should.is_a? Integer
|
17
|
+
Bio::BGZF::vo_data_offset(pos).should == 0
|
16
18
|
end
|
17
19
|
end
|
18
20
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-bgzf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-08-02 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -60,25 +60,23 @@ dependencies:
|
|
60
60
|
- - ~>
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: 2.10.0
|
63
|
-
description:
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
63
|
+
description: This library provides support for BGZF (Blocked GZip Format) in Ruby.
|
64
|
+
BGZF, originally defined as part of the SAM/BAM specification, is used to compress
|
65
|
+
record-oriented data in a way that facilitates random access, unlike plain gzip.
|
66
|
+
BGZF is principally used for bioinformatics data but would be useful in other contexts
|
67
|
+
as well.
|
68
68
|
email: cswh@umich.edu
|
69
69
|
executables: []
|
70
70
|
extensions: []
|
71
71
|
extra_rdoc_files:
|
72
72
|
- LICENSE.txt
|
73
73
|
- README.md
|
74
|
-
- README.rdoc
|
75
74
|
files:
|
76
75
|
- .document
|
77
76
|
- .travis.yml
|
78
77
|
- Gemfile
|
79
78
|
- LICENSE.txt
|
80
79
|
- README.md
|
81
|
-
- README.rdoc
|
82
80
|
- Rakefile
|
83
81
|
- VERSION
|
84
82
|
- lib/bio-bgzf.rb
|
@@ -88,6 +86,7 @@ files:
|
|
88
86
|
- lib/bio-bgzf/reader.rb
|
89
87
|
- lib/bio-bgzf/unpack.rb
|
90
88
|
- lib/bio-bgzf/vo.rb
|
89
|
+
- lib/bio-bgzf/writer.rb
|
91
90
|
- spec/bio-bgzf_spec.rb
|
92
91
|
- test/data/mm8.chrM.maf.gz
|
93
92
|
homepage: http://github.com/csw/bioruby-bgzf
|
@@ -105,7 +104,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
105
104
|
version: '0'
|
106
105
|
segments:
|
107
106
|
- 0
|
108
|
-
hash:
|
107
|
+
hash: 4066832272659457160
|
109
108
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
110
109
|
none: false
|
111
110
|
requirements:
|
data/README.rdoc
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
= bio-bgzf
|
2
|
-
|
3
|
-
{<img
|
4
|
-
src="https://secure.travis-ci.org/lomereiter/bioruby-bgzf.png"
|
5
|
-
/>}[http://travis-ci.org/#!/lomereiter/bioruby-bgzf]
|
6
|
-
|
7
|
-
Full description goes here
|
8
|
-
|
9
|
-
Note: this software is under active development!
|
10
|
-
|
11
|
-
== Installation
|
12
|
-
|
13
|
-
gem install bio-bgzf
|
14
|
-
|
15
|
-
== Usage
|
16
|
-
|
17
|
-
== Developers
|
18
|
-
|
19
|
-
To use the library
|
20
|
-
|
21
|
-
require 'bio-bgzf'
|
22
|
-
|
23
|
-
The API doc is online. For more code examples see also the test files in
|
24
|
-
the source tree.
|
25
|
-
|
26
|
-
== Project home page
|
27
|
-
|
28
|
-
Information on the source tree, documentation, issues and how to contribute, see
|
29
|
-
|
30
|
-
http://github.com/lomereiter/bioruby-bgzf
|
31
|
-
|
32
|
-
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
-
|
34
|
-
== Cite
|
35
|
-
|
36
|
-
If you use this software, please cite one of
|
37
|
-
|
38
|
-
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
39
|
-
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
40
|
-
|
41
|
-
== Biogems.info
|
42
|
-
|
43
|
-
This Biogem is published at http://biogems.info/index.html#bio-bgzf
|
44
|
-
|
45
|
-
== Copyright
|
46
|
-
|
47
|
-
Copyright (c) 2012 Artem Tarasov. See LICENSE.txt for further details.
|
48
|
-
|