bio-bgzf 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +71 -0
- data/README.rdoc +48 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/lib/bio-bgzf.rb +6 -0
- data/lib/bio-bgzf/block.rb +51 -0
- data/lib/bio-bgzf/constants.rb +19 -0
- data/lib/bio-bgzf/pack.rb +37 -0
- data/lib/bio-bgzf/reader.rb +45 -0
- data/lib/bio-bgzf/unpack.rb +10 -0
- data/lib/bio-bgzf/vo.rb +11 -0
- data/spec/bio-bgzf_spec.rb +28 -0
- metadata +120 -0
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
# - rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2012 Artem Tarasov
|
2
|
+
Copyright (c) 2012 Clayton Wheeler
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# bio-bgzf
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/csw/bioruby-bgzf.png)](http://travis-ci.org/csw/bioruby-bgzf)
|
4
|
+
|
5
|
+
This library provides support for the [BGZF][] (Blocked GZip Format)
|
6
|
+
in Ruby. BGZF, originally defined as part of the [SAM/BAM][]
|
7
|
+
specification, is used to compress record-oriented bioinformatics data
|
8
|
+
in a way that facilitates random access, unlike plain gzip. A BGZF
|
9
|
+
file consists of concatenated blocks of up to 64 KB, each an independent gzip
|
10
|
+
stream. It can be decompressed in its entirety with gzip, but this
|
11
|
+
library enables random access using 'virtual offsets' as defined in
|
12
|
+
SAM/BAM.
|
13
|
+
|
14
|
+
A virtual offset is a 64-bit quantity, with a 48-bit block offset
|
15
|
+
giving the position in the file of the start of the block followed by
|
16
|
+
a 16-bit data offset giving a position within that block's uncompressed data.
|
17
|
+
|
18
|
+
## Installation
|
19
|
+
|
20
|
+
```sh
|
21
|
+
gem install bio-bgzf
|
22
|
+
```
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'bio-bgzf'
|
28
|
+
|
29
|
+
File.open('example.gz') do |f|
|
30
|
+
r = Bio::BGZF::Reader.new(f)
|
31
|
+
while true do
|
32
|
+
block_vo = r.tell
|
33
|
+
block = r.read_block
|
34
|
+
break unless block
|
35
|
+
end
|
36
|
+
block = r.read_block_at(block_vo)
|
37
|
+
end
|
38
|
+
```
|
39
|
+
|
40
|
+
The API doc is online. For more code examples see the test files in
|
41
|
+
the source tree.
|
42
|
+
|
43
|
+
## Project home page
|
44
|
+
|
45
|
+
For information on the source tree, documentation, examples, issues and
|
46
|
+
how to contribute, see
|
47
|
+
|
48
|
+
http://github.com/csw/bioruby-bgzf
|
49
|
+
|
50
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
51
|
+
|
52
|
+
## Cite
|
53
|
+
|
54
|
+
If you use this software, please cite one of
|
55
|
+
|
56
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
57
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
58
|
+
|
59
|
+
## Biogems.info
|
60
|
+
|
61
|
+
This Biogem is published at [#bio-bgzf](http://biogems.info/index.html)
|
62
|
+
|
63
|
+
## Copyright
|
64
|
+
|
65
|
+
Copyright (c) 2012 Artem Tarasov and Clayton Wheeler. See LICENSE.txt
|
66
|
+
for further details.
|
67
|
+
|
68
|
+
[BGZF]: http://blastedbio.blogspot.com/2011/11/bgzf-blocked-bigger-better-gzip.html
|
69
|
+
[SAM/BAM]: http://samtools.sourceforge.net/SAM1.pdf
|
70
|
+
|
71
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= bio-bgzf
|
2
|
+
|
3
|
+
{<img
|
4
|
+
src="https://secure.travis-ci.org/lomereiter/bioruby-bgzf.png"
|
5
|
+
/>}[http://travis-ci.org/#!/lomereiter/bioruby-bgzf]
|
6
|
+
|
7
|
+
Full description goes here
|
8
|
+
|
9
|
+
Note: this software is under active development!
|
10
|
+
|
11
|
+
== Installation
|
12
|
+
|
13
|
+
gem install bio-bgzf
|
14
|
+
|
15
|
+
== Usage
|
16
|
+
|
17
|
+
== Developers
|
18
|
+
|
19
|
+
To use the library
|
20
|
+
|
21
|
+
require 'bio-bgzf'
|
22
|
+
|
23
|
+
The API doc is online. For more code examples see also the test files in
|
24
|
+
the source tree.
|
25
|
+
|
26
|
+
== Project home page
|
27
|
+
|
28
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
29
|
+
|
30
|
+
http://github.com/lomereiter/bioruby-bgzf
|
31
|
+
|
32
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
+
|
34
|
+
== Cite
|
35
|
+
|
36
|
+
If you use this software, please cite one of
|
37
|
+
|
38
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
39
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
40
|
+
|
41
|
+
== Biogems.info
|
42
|
+
|
43
|
+
This Biogem is published at http://biogems.info/index.html#bio-bgzf
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Copyright (c) 2012 Artem Tarasov. See LICENSE.txt for further details.
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "bio-bgzf"
|
18
|
+
gem.homepage = "http://github.com/csw/bioruby-bgzf"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Reading/writing BGZF blocks}
|
21
|
+
gem.description = %Q{BGZF compression is used nowadays only for providing random access to BAM format. However, it is completely independent from the format, and can be used for arbitrary data format. The gem allows to read BGZF blocks from streams and pack strings into blocks, aiming to facilitate introducing BGZF compression for Ruby users.}
|
22
|
+
gem.email = "cswh@umich.edu"
|
23
|
+
gem.authors = ["Artem Tarasov", "Clayton Wheeler"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
require 'rspec/core'
|
29
|
+
require 'rspec/core/rake_task'
|
30
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
+
end
|
33
|
+
|
34
|
+
task :test => :spec
|
35
|
+
|
36
|
+
task :default => :test
|
37
|
+
|
38
|
+
require 'rdoc/task'
|
39
|
+
Rake::RDocTask.new do |rdoc|
|
40
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
41
|
+
|
42
|
+
rdoc.rdoc_dir = 'rdoc'
|
43
|
+
rdoc.title = "bio-bgzf #{version}"
|
44
|
+
rdoc.rdoc_files.include('README*')
|
45
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
46
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/bio-bgzf.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module Bio::BGZF
|
2
|
+
|
3
|
+
# Reads one raw (still compressed) BGZF block from the IO-like object +f+.
#
# +f+ only needs to respond to <tt>read(length)</tt>; no seeking is done, so
# non-seekable streams work as well.
#
# Returns +nil+ when +f+ is already at end of input. Otherwise returns the
# three-element array <tt>[compressed_data, input_size, crc32]</tt>, where
# +compressed_data+ is the deflate payload and +input_size+ / +crc32+ are
# the two 32-bit little-endian values stored in the block trailer.
#
# Raises RuntimeError if the magic bytes are wrong, the extra subfields are
# malformed (missing, duplicated or wrongly sized BC subfield, lengths that
# do not add up), or the input ends in the middle of a block.
def read_bgzf_block(f)
  # Reads exactly +count+ bytes from +f+ or raises, so that a truncated
  # file is reported as such instead of failing later on a nil value.
  read_exact = lambda do |count, what|
    raise "invalid BGZF block: negative length for #{what}" if count < 0
    chunk = f.read(count)
    if chunk.nil? || chunk.bytesize != count
      raise "truncated BGZF block: could not read #{count} bytes of #{what}"
    end
    chunk
  end

  hstart = f.read(12)
  return nil if hstart.nil? || hstart.empty? # clean end of input
  raise 'truncated BGZF block: incomplete header' if hstart.bytesize < 12

  # 'V' takes the first four bytes (1F 8B 08 04 read little-endian), the
  # six 'x' skip the next six header bytes, 'v' is the 16-bit length of
  # the extra field.
  magic, gzip_extra_length = hstart.unpack('Vxxxxxxv')
  raise 'wrong BGZF magic' unless magic == 0x04088B1F

  len = 0
  bsize = nil
  while len < gzip_extra_length
    si1, si2, slen = read_exact.call(4, 'subfield header').unpack('CCv')
    if si1 == 66 && si2 == 67 # ASCII 'B', 'C': the block size subfield
      raise "BC subfield length is #{slen} but must be 2" if slen != 2
      raise 'duplicate field with block size' unless bsize.nil?
      bsize = read_exact.call(2, 'block size').unpack('v')[0]
    else
      # Unknown subfield: consume and ignore its payload.
      read_exact.call(slen, 'subfield data')
    end
    len += 4 + slen
  end

  if len != gzip_extra_length
    raise "total length of subfields is #{len} bytes but must be #{gzip_extra_length}"
  end
  raise 'block size was not found in any subfield' if bsize.nil?

  # bsize is the total block size minus one; subtracting the extra field
  # and the 19 remaining bytes of fixed header and trailer leaves the
  # payload length.
  compressed_data = read_exact.call(bsize - gzip_extra_length - 19, 'compressed data')
  crc32, input_size = read_exact.call(8, 'block trailer').unpack('VV')

  [compressed_data, input_size, crc32]
end
|
34
|
+
module_function :read_bgzf_block
|
35
|
+
|
36
|
+
# Reads the next BGZF block from +f+ and returns its decompressed payload.
#
# Returns +nil+ when read_bgzf_block reports end of input.
#
# The checksum stored in the block is accepted if it matches the CRC32 of
# the decompressed data, which is what RFC 1952 specifies for gzip members.
# For backward compatibility it is also accepted if it matches the CRC32 of
# the compressed bytes, which is how this gem's +pack+ computes it.
#
# Raises RuntimeError on a checksum mismatch or when the decompressed size
# differs from the size recorded in the block.
def decompress_block(f)
  cdata, in_size, expected_crc = read_bgzf_block(f)
  return nil if cdata.nil?

  # NOTE(review): unpack is defined elsewhere in this module; it is assumed
  # to inflate the raw deflate payload and return a String.
  data = unpack(cdata)

  crc = Zlib.crc32(data, 0)
  if crc != expected_crc && Zlib.crc32(cdata, 0) != expected_crc
    raise "CRC error: expected #{expected_crc.to_s(16)}, got #{crc.to_s(16)}"
  end

  if data.bytesize != in_size
    raise "Expected #{in_size} bytes from BGZF block, but got #{data.bytesize} bytes!"
  end

  data
end
|
49
|
+
module_function :decompress_block
|
50
|
+
|
51
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module Bio::BGZF
|
4
|
+
|
5
|
+
# Packs +str+ into a single BGZF block using the given zlib compression
# +level+ and returns the block as a binary String.
#
# The payload is compressed as a raw deflate stream (window bits -15, i.e.
# without a zlib header). The header fields come from the module constants
# (ID1 .. SLEN); the trailer holds a CRC32 and the byte size of +str+.
#
# Raises ArgumentError when the resulting block would not fit the 16-bit
# block size field; split the input into smaller chunks in that case.
def pack(str, level=Zlib::BEST_COMPRESSION)
  zs = Zlib::Deflate.new(level, -15)
  begin
    cdata = zs.deflate(str, Zlib::FINISH)
  ensure
    zs.close # release the zlib stream even if deflate raises
  end

  # NOTE(review): RFC 1952 defines the gzip CRC32 over the *uncompressed*
  # data, whereas this is computed over the compressed bytes. It is kept
  # as is because this gem's decompress_block accepts exactly this value;
  # confirm reader compatibility before changing it.
  crc32 = Zlib.crc32(cdata, 0)

  # Sizes are byte counts, not character counts, so multibyte strings get
  # the correct input size.
  isize = str.bytesize
  bsize = cdata.bytesize + 19 + XLEN

  if bsize > 0xFFFF
    raise ArgumentError, "input too large for a single BGZF block (block size field would be #{bsize}, maximum is 65535)"
  end

  fields = [ID1, ID2, CM, FLG, MTIME, XFL, OS, XLEN,
            SI1, SI2, SLEN, bsize,
            cdata,
            crc32, isize]

  fields.pack('CCCCVCCvCCvva*VV')
end
|
36
|
+
module_function :pack
|
37
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Bio::BGZF
|
2
|
+
|
3
|
+
# Reads BGZF blocks from an IO-like object, either sequentially or at a
# given virtual offset.
class Reader
  include Bio::BGZF

  # The underlying IO-like object the blocks are read from.
  attr_reader :f

  # +f+ is the IO-like object to read from; it must support +read+, and
  # additionally +tell+ / +seek+ for #tell and #read_block_at.
  def initialize(f)
    @f = f
    @cur_block = nil
  end

  # Returns the current position of the underlying stream shifted left by
  # 16 bits, i.e. a virtual offset whose low 16 bits (data offset) are zero.
  def tell
    f.tell << 16
  end

  # Decompresses and returns the next block, or +nil+ at end of input.
  def read_block
    decompress_block(f)
  end

  # Seeks to the block addressed by the virtual offset +vo+ and returns the
  # decompressed data of that block starting at the data offset encoded in
  # +vo+. Returns +nil+ if there is no block at that position (end of
  # input).
  #
  # NOTE(review): vo_block_offset and vo_data_offset are defined elsewhere
  # in the module; presumably they split +vo+ into its block offset and
  # data offset parts - confirm against their definitions.
  def read_block_at(vo)
    block_offset = vo_block_offset(vo)
    data_offset = vo_data_offset(vo)
    f.seek(block_offset)
    block_data = decompress_block(f)
    # Nothing to slice at end of input or when the whole block is wanted.
    return block_data if block_data.nil? || data_offset == 0

    block_data.slice(data_offset...block_data.size)
  end

  # Yields the decompressed data of each remaining block in turn. Without a
  # block, returns an Enumerator over the same sequence.
  def each_block
    return enum_for(:each_block) unless block_given?

    while (b = read_block)
      yield b
    end
  end
end
|
44
|
+
|
45
|
+
end
|
data/lib/bio-bgzf/vo.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'rspec/expectations'
|
2
|
+
require 'bio-bgzf'
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
describe Bio::BGZF do
|
6
|
+
it "should be able to pack strings to BGZF blocks" do
|
7
|
+
Bio::BGZF.should respond_to(:pack).with(1).argument
|
8
|
+
Bio::BGZF.pack("asdfghjkl").should be_instance_of String
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should be able to iteratively read BGZF blocks from stream" do
|
12
|
+
str = ''
|
13
|
+
1000.times { str += (Random.rand(26) + 65).chr }
|
14
|
+
|
15
|
+
file = Tempfile.new 'bgzfstring'
|
16
|
+
str.chars.each_slice(42).map(&:join).each do |s|
|
17
|
+
file.write(Bio::BGZF.pack s)
|
18
|
+
end
|
19
|
+
file.flush
|
20
|
+
file.rewind
|
21
|
+
|
22
|
+
str2 = ''
|
23
|
+
r = Bio::BGZF::Reader.new(file)
|
24
|
+
r.each_block {|block| str2 += block }
|
25
|
+
|
26
|
+
str2.should == str
|
27
|
+
end
|
28
|
+
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-bgzf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Artem Tarasov
|
9
|
+
- Clayton Wheeler
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-07-29 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: bundler
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.1.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ~>
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: 1.1.0
|
31
|
+
- !ruby/object:Gem::Dependency
|
32
|
+
name: jeweler
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ~>
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 1.8.3
|
39
|
+
type: :development
|
40
|
+
prerelease: false
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ~>
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.8.3
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.10.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ~>
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 2.10.0
|
63
|
+
description: BGZF compression is used nowadays only for providing random access to
|
64
|
+
BAM format. However, it is completely independent from the format, and can be used
|
65
|
+
for arbitrary data format. The gem allows to read BGZF blocks from streams and pack
|
66
|
+
strings into blocks, aiming to facilitate introducing BGZF compression for Ruby
|
67
|
+
users.
|
68
|
+
email: cswh@umich.edu
|
69
|
+
executables: []
|
70
|
+
extensions: []
|
71
|
+
extra_rdoc_files:
|
72
|
+
- LICENSE.txt
|
73
|
+
- README.md
|
74
|
+
- README.rdoc
|
75
|
+
files:
|
76
|
+
- .document
|
77
|
+
- .travis.yml
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- README.rdoc
|
82
|
+
- Rakefile
|
83
|
+
- VERSION
|
84
|
+
- lib/bio-bgzf.rb
|
85
|
+
- lib/bio-bgzf/block.rb
|
86
|
+
- lib/bio-bgzf/constants.rb
|
87
|
+
- lib/bio-bgzf/pack.rb
|
88
|
+
- lib/bio-bgzf/reader.rb
|
89
|
+
- lib/bio-bgzf/unpack.rb
|
90
|
+
- lib/bio-bgzf/vo.rb
|
91
|
+
- spec/bio-bgzf_spec.rb
|
92
|
+
homepage: http://github.com/csw/bioruby-bgzf
|
93
|
+
licenses:
|
94
|
+
- MIT
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options: []
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ! '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
segments:
|
106
|
+
- 0
|
107
|
+
hash: -2167366817840907754
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ! '>='
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: '0'
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.24
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: Reading/writing BGZF blocks
|
120
|
+
test_files: []
|