bio-bgzf 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -2
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/bio-bgzf.rb +2 -1
- data/lib/bio-bgzf/block.rb +13 -7
- data/lib/bio-bgzf/constants.rb +3 -1
- data/lib/bio-bgzf/reader.rb +24 -2
- data/lib/bio-bgzf/writer.rb +65 -0
- data/spec/bio-bgzf_spec.rb +3 -1
- metadata +9 -10
- data/README.rdoc +0 -48
data/README.md
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://secure.travis-ci.org/csw/bioruby-bgzf.png)](http://travis-ci.org/csw/bioruby-bgzf)
|
4
4
|
|
5
|
-
This library provides support for
|
6
|
-
|
5
|
+
This library provides support for [BGZF][] (Blocked GZip Format) in
|
6
|
+
Ruby. BGZF, originally defined as part of the [SAM/BAM][]
|
7
7
|
specification, is used to compress record-oriented bioinformatics data
|
8
8
|
in a way that facilitates random access, unlike plain gzip. A BGZF
|
9
9
|
file consists of concatenated 64 KB blocks, each an independent gzip
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ Jeweler::Tasks.new do |gem|
|
|
18
18
|
gem.homepage = "http://github.com/csw/bioruby-bgzf"
|
19
19
|
gem.license = "MIT"
|
20
20
|
gem.summary = %Q{Reading/writing BGZF blocks}
|
21
|
-
gem.description = %Q{
|
21
|
+
gem.description = %Q{This library provides support for BGZF (Blocked GZip Format) in Ruby. BGZF, originally defined as part of the SAM/BAM specification, is used to compress record-oriented data in a way that facilitates random access, unlike plain gzip. BGZF is principally used for bioinformatics data but would be useful in other contexts as well.}
|
22
22
|
gem.email = "cswh@umich.edu"
|
23
23
|
gem.authors = ["Artem Tarasov", "Clayton Wheeler"]
|
24
24
|
# dependencies defined in Gemfile
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/lib/bio-bgzf.rb
CHANGED
data/lib/bio-bgzf/block.rb
CHANGED
@@ -1,18 +1,24 @@
|
|
1
1
|
module Bio::BGZF
|
2
2
|
|
3
|
+
class FormatError < StandardError
|
4
|
+
end
|
5
|
+
|
6
|
+
class NotBGZFError < FormatError
|
7
|
+
end
|
8
|
+
|
3
9
|
def read_bgzf_block(f)
|
4
10
|
hstart = f.read(12)
|
5
11
|
return nil if hstart == nil # EOF?
|
6
12
|
magic, gzip_extra_length = hstart.unpack('Vxxxxxxv')
|
7
|
-
raise
|
13
|
+
raise NotBGZFError, "wrong BGZF magic: #{sprintf('%08x', magic)}" unless magic == 0x04088B1F
|
8
14
|
|
9
15
|
len = 0
|
10
16
|
bsize = nil
|
11
17
|
while len < gzip_extra_length do
|
12
18
|
si1, si2, slen = f.read(4).unpack('CCv')
|
13
19
|
if si1 == 66 and si2 == 67 then
|
14
|
-
raise "BC subfield length is #{slen} but must be 2" if slen != 2
|
15
|
-
raise 'duplicate field with block size' unless bsize.nil?
|
20
|
+
raise FormatError, "BC subfield length is #{slen} but must be 2" if slen != 2
|
21
|
+
raise FormatError, 'duplicate field with block size' unless bsize.nil?
|
16
22
|
bsize = f.read(2).unpack('v')[0]
|
17
23
|
f.seek(slen - 2, IO::SEEK_CUR)
|
18
24
|
else
|
@@ -22,9 +28,9 @@ module Bio::BGZF
|
|
22
28
|
end
|
23
29
|
|
24
30
|
if len != gzip_extra_length then
|
25
|
-
raise "total length of subfields is #{len} bytes but must be #{gzip_extra_length}"
|
31
|
+
raise FormatError, "total length of subfields is #{len} bytes but must be #{gzip_extra_length}"
|
26
32
|
end
|
27
|
-
raise 'block size was not found in any subfield' if bsize.nil?
|
33
|
+
raise NotBGZFError, 'block size was not found in any subfield' if bsize.nil?
|
28
34
|
|
29
35
|
compressed_data = f.read(bsize - gzip_extra_length - 19)
|
30
36
|
crc32, input_size = f.read(8).unpack('VV')
|
@@ -38,11 +44,11 @@ module Bio::BGZF
|
|
38
44
|
return nil if cdata == nil
|
39
45
|
data = unpack(cdata)
|
40
46
|
if data.bytesize != in_size
|
41
|
-
raise "Expected #{in_size} bytes from BGZF block at #{pos}, but got #{data.bytesize} bytes!"
|
47
|
+
raise FormatError, "Expected #{in_size} bytes from BGZF block at #{pos}, but got #{data.bytesize} bytes!"
|
42
48
|
end
|
43
49
|
crc = Zlib.crc32(data, 0)
|
44
50
|
if crc != expected_crc
|
45
|
-
raise "CRC error: expected #{expected_crc.to_s(16)}, got #{crc.to_s(16)}"
|
51
|
+
raise FormatError, "CRC error: expected #{expected_crc.to_s(16)}, got #{crc.to_s(16)}"
|
46
52
|
end
|
47
53
|
return data
|
48
54
|
end
|
data/lib/bio-bgzf/constants.rb
CHANGED
data/lib/bio-bgzf/reader.rb
CHANGED
@@ -10,14 +10,32 @@ module Bio::BGZF
|
|
10
10
|
@cur_block = nil
|
11
11
|
end
|
12
12
|
|
13
|
+
# Returns the reader's current virtual offset. Between
|
14
|
+
# {#read_block} calls, the file position will always be at the
|
15
|
+
# start of a block or at EOF, so the low 16 bits of the virtual
|
16
|
+
# offset will always be zero.
|
17
|
+
#
|
18
|
+
# @return [Integer] virtual offset for current position
|
13
19
|
def tell
|
14
20
|
f.tell << 16
|
15
21
|
end
|
16
22
|
|
23
|
+
# Reads the BGZF block at the current position. Returns its
|
24
|
+
# decompressed data.
|
25
|
+
#
|
26
|
+
# @return [String] decompressed block data
|
17
27
|
def read_block
|
18
28
|
decompress_block(f)
|
19
29
|
end
|
20
30
|
|
31
|
+
# Reads a portion of a BGZF block, starting from the given virtual
|
32
|
+
# offset. If the offset is the start of a block (low 16 bits are
|
33
|
+
# zero) the entire block's data will be returned. Otherwise, the
|
34
|
+
# subset of the data starting at the given offset will be
|
35
|
+
# returned.
|
36
|
+
#
|
37
|
+
# @param [Integer] vo virtual offset to start from
|
38
|
+
# @return [String] decompressed block data
|
21
39
|
def read_block_at(vo)
|
22
40
|
block_offset = vo_block_offset(vo)
|
23
41
|
data_offset = vo_data_offset(vo)
|
@@ -30,10 +48,14 @@ module Bio::BGZF
|
|
30
48
|
end
|
31
49
|
end
|
32
50
|
|
51
|
+
# Iterates over the blocks in a BGZF file, yielding [block, vo] pairs, where vo is the virtual offset of the start of the block.
|
33
52
|
def each_block
|
34
53
|
if block_given?
|
35
|
-
while
|
36
|
-
|
54
|
+
while true
|
55
|
+
pos = tell
|
56
|
+
b = read_block
|
57
|
+
break unless b
|
58
|
+
yield b, pos
|
37
59
|
end
|
38
60
|
else
|
39
61
|
enum_for(:each_block)
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module Bio::BGZF
|
2
|
+
|
3
|
+
class Writer
|
4
|
+
include Bio::BGZF
|
5
|
+
|
6
|
+
attr_reader :f, :buf
|
7
|
+
|
8
|
+
def initialize(f)
|
9
|
+
@f = f
|
10
|
+
@buf = ''
|
11
|
+
if block_given?
|
12
|
+
begin
|
13
|
+
yield self
|
14
|
+
ensure
|
15
|
+
self.close
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def tell
|
21
|
+
f.tell << 16
|
22
|
+
end
|
23
|
+
|
24
|
+
def write_buf
|
25
|
+
if buf.size > 0
|
26
|
+
raise "Buffer too large: #{buf.bytesize}" if buf.bytesize > MAX_BYTES
|
27
|
+
block = pack(buf)
|
28
|
+
f.write(block)
|
29
|
+
@buf = ''
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def write(s)
|
34
|
+
if s.bytesize > MAX_BYTES
|
35
|
+
write_buf
|
36
|
+
_each_slice(s) do |slice|
|
37
|
+
write(slice)
|
38
|
+
end
|
39
|
+
else
|
40
|
+
if (s.bytesize + buf.bytesize) > MAX_BYTES
|
41
|
+
write_buf
|
42
|
+
end
|
43
|
+
buf << s
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def _each_slice(s)
|
48
|
+
n = 0
|
49
|
+
size = s.bytesize
|
50
|
+
while true
|
51
|
+
offset = n * MAX_BYTES
|
52
|
+
break if offset >= size
|
53
|
+
yield s.slice(offset, MAX_BYTES)
|
54
|
+
n += 1
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def close
|
59
|
+
write_buf
|
60
|
+
f.close
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
data/spec/bio-bgzf_spec.rb
CHANGED
@@ -11,8 +11,10 @@ describe Bio::BGZF do
|
|
11
11
|
it "should be able to read BGZF blocks from a samtools file" do
|
12
12
|
File.open("test/data/mm8.chrM.maf.gz") do |f|
|
13
13
|
r = Bio::BGZF::Reader.new(f)
|
14
|
-
r.each_block do |block|
|
14
|
+
r.each_block do |block, pos|
|
15
15
|
block.size.should <= 65536
|
16
|
+
pos.should.is_a? Integer
|
17
|
+
Bio::BGZF::vo_data_offset(pos).should == 0
|
16
18
|
end
|
17
19
|
end
|
18
20
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-bgzf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-08-02 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -60,25 +60,23 @@ dependencies:
|
|
60
60
|
- - ~>
|
61
61
|
- !ruby/object:Gem::Version
|
62
62
|
version: 2.10.0
|
63
|
-
description:
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
63
|
+
description: This library provides support for BGZF (Blocked GZip Format) in Ruby.
|
64
|
+
BGZF, originally defined as part of the SAM/BAM specification, is used to compress
|
65
|
+
record-oriented data in a way that facilitates random access, unlike plain gzip.
|
66
|
+
BGZF is principally used for bioinformatics data but would be useful in other contexts
|
67
|
+
as well.
|
68
68
|
email: cswh@umich.edu
|
69
69
|
executables: []
|
70
70
|
extensions: []
|
71
71
|
extra_rdoc_files:
|
72
72
|
- LICENSE.txt
|
73
73
|
- README.md
|
74
|
-
- README.rdoc
|
75
74
|
files:
|
76
75
|
- .document
|
77
76
|
- .travis.yml
|
78
77
|
- Gemfile
|
79
78
|
- LICENSE.txt
|
80
79
|
- README.md
|
81
|
-
- README.rdoc
|
82
80
|
- Rakefile
|
83
81
|
- VERSION
|
84
82
|
- lib/bio-bgzf.rb
|
@@ -88,6 +86,7 @@ files:
|
|
88
86
|
- lib/bio-bgzf/reader.rb
|
89
87
|
- lib/bio-bgzf/unpack.rb
|
90
88
|
- lib/bio-bgzf/vo.rb
|
89
|
+
- lib/bio-bgzf/writer.rb
|
91
90
|
- spec/bio-bgzf_spec.rb
|
92
91
|
- test/data/mm8.chrM.maf.gz
|
93
92
|
homepage: http://github.com/csw/bioruby-bgzf
|
@@ -105,7 +104,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
105
104
|
version: '0'
|
106
105
|
segments:
|
107
106
|
- 0
|
108
|
-
hash:
|
107
|
+
hash: 4066832272659457160
|
109
108
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
110
109
|
none: false
|
111
110
|
requirements:
|
data/README.rdoc
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
= bio-bgzf
|
2
|
-
|
3
|
-
{<img
|
4
|
-
src="https://secure.travis-ci.org/lomereiter/bioruby-bgzf.png"
|
5
|
-
/>}[http://travis-ci.org/#!/lomereiter/bioruby-bgzf]
|
6
|
-
|
7
|
-
Full description goes here
|
8
|
-
|
9
|
-
Note: this software is under active development!
|
10
|
-
|
11
|
-
== Installation
|
12
|
-
|
13
|
-
gem install bio-bgzf
|
14
|
-
|
15
|
-
== Usage
|
16
|
-
|
17
|
-
== Developers
|
18
|
-
|
19
|
-
To use the library
|
20
|
-
|
21
|
-
require 'bio-bgzf'
|
22
|
-
|
23
|
-
The API doc is online. For more code examples see also the test files in
|
24
|
-
the source tree.
|
25
|
-
|
26
|
-
== Project home page
|
27
|
-
|
28
|
-
Information on the source tree, documentation, issues and how to contribute, see
|
29
|
-
|
30
|
-
http://github.com/lomereiter/bioruby-bgzf
|
31
|
-
|
32
|
-
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
-
|
34
|
-
== Cite
|
35
|
-
|
36
|
-
If you use this software, please cite one of
|
37
|
-
|
38
|
-
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
39
|
-
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
40
|
-
|
41
|
-
== Biogems.info
|
42
|
-
|
43
|
-
This Biogem is published at http://biogems.info/index.html#bio-bgzf
|
44
|
-
|
45
|
-
== Copyright
|
46
|
-
|
47
|
-
Copyright (c) 2012 Artem Tarasov. See LICENSE.txt for further details.
|
48
|
-
|