bio-maf 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/README.md +147 -113
- data/bin/maf_count +0 -1
- data/bin/maf_dump_blocks +0 -1
- data/bin/maf_extract +177 -0
- data/bin/maf_index +15 -8
- data/bin/maf_tile +2 -0
- data/bin/maf_to_fasta +4 -7
- data/bio-maf.gemspec +3 -4
- data/features/maf-indexing.feature +21 -1
- data/features/step_definitions/convert_steps.rb +2 -7
- data/features/step_definitions/index_steps.rb +4 -0
- data/lib/bio-maf.rb +5 -0
- data/lib/bio/maf/index.rb +33 -23
- data/lib/bio/maf/maf.rb +10 -7
- data/lib/bio/maf/parser.rb +37 -15
- data/lib/bio/maf/tiler.rb +60 -8
- data/lib/bio/maf/writer.rb +26 -0
- data/man/maf_extract.1 +159 -0
- data/man/maf_extract.1.ronn +175 -0
- data/man/maf_index.1 +21 -10
- data/man/maf_index.1.ronn +14 -7
- data/man/maf_tile.1 +12 -0
- data/man/maf_tile.1.ronn +9 -0
- data/spec/bio/maf/index_spec.rb +23 -0
- metadata +15 -11
data/bin/maf_index
CHANGED
@@ -13,29 +13,35 @@ PRINTERS = {
|
|
13
13
|
|
14
14
|
$options = OpenStruct.new
|
15
15
|
$options.mode = :build
|
16
|
-
$options.
|
16
|
+
$options.ref_only = true
|
17
|
+
$options.reader = if RUBY_PLATFORM == 'java'
|
18
|
+
Bio::MAF::ThreadedChunkReader
|
19
|
+
else
|
20
|
+
Bio::MAF::ChunkReader
|
21
|
+
end
|
17
22
|
|
18
23
|
def build_index(maf, index)
|
19
24
|
parser = Bio::MAF::Parser.new(maf,
|
20
25
|
:chunk_reader => $options.reader,
|
21
26
|
:parse_extended => false)
|
22
|
-
idx = Bio::MAF::KyotoIndex.build(parser, index)
|
27
|
+
idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
|
23
28
|
idx.close
|
24
29
|
end
|
25
30
|
|
26
31
|
op = OptionParser.new do |opts|
|
27
32
|
opts.banner = "Usage: maf_index [options] <maf> <index>"
|
28
|
-
|
29
|
-
|
33
|
+
opts.separator ""
|
34
|
+
opts.separator "Options:"
|
35
|
+
opts.on("-a", "--all", "Index all sequences, not just reference seq") do
|
36
|
+
$options.ref_only = false
|
37
|
+
end
|
38
|
+
Bio::MAF::handle_logging_options(opts)
|
30
39
|
opts.on("--time", "print elapsed time") do
|
31
40
|
$options.bench = true
|
32
41
|
end
|
33
|
-
opts.on("-d", "--dump") do
|
42
|
+
opts.on("-d", "--dump", "Dump contents of given INDEX") do
|
34
43
|
$options.mode = :dump
|
35
44
|
end
|
36
|
-
opts.on("-t", "--threaded") do
|
37
|
-
$options.reader = Bio::MAF::ThreadedChunkReader
|
38
|
-
end
|
39
45
|
opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
|
40
46
|
require 'ruby-prof'
|
41
47
|
if pspec =~ /(\w+):(.+)/
|
@@ -49,6 +55,7 @@ op = OptionParser.new do |opts|
|
|
49
55
|
end
|
50
56
|
|
51
57
|
op.parse!(ARGV)
|
58
|
+
Bio::Log::CLI.configure('bio-maf')
|
52
59
|
|
53
60
|
maf_p = ARGV.shift if $options.mode == :build
|
54
61
|
index_p = ARGV.shift
|
data/bin/maf_tile
CHANGED
@@ -65,9 +65,11 @@ o_parser = OptionParser.new do |opts|
|
|
65
65
|
"(requires --output-base)") do |bed|
|
66
66
|
options.bed = bed
|
67
67
|
end
|
68
|
+
Bio::MAF::handle_logging_options(opts)
|
68
69
|
end
|
69
70
|
|
70
71
|
o_parser.parse!(ARGV)
|
72
|
+
Bio::Log::CLI.configure('bio-maf')
|
71
73
|
|
72
74
|
maf_p = ARGV.shift
|
73
75
|
index_p = ARGV.shift
|
data/bin/maf_to_fasta
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'bio-maf'
|
4
|
-
require 'bigbio'
|
5
4
|
require 'optparse'
|
6
5
|
require 'ostruct'
|
7
6
|
|
@@ -42,16 +41,14 @@ if options.profile_gc
|
|
42
41
|
end
|
43
42
|
|
44
43
|
parser = options.parser.new(src_path)
|
45
|
-
|
44
|
+
File.open(dst_path, 'w') do |outf|
|
45
|
+
writer = Bio::MAF::FASTAWriter.new(outf)
|
46
46
|
|
47
|
-
parser.each_block do |block|
|
48
|
-
|
49
|
-
seq.write_fasta(writer)
|
47
|
+
parser.each_block do |block|
|
48
|
+
writer.write_block(block)
|
50
49
|
end
|
51
50
|
end
|
52
51
|
|
53
|
-
writer.close
|
54
|
-
|
55
52
|
if options.profile_gc
|
56
53
|
$stderr.puts GC::Profiler.result
|
57
54
|
GC::Profiler.disable
|
data/bio-maf.gemspec
CHANGED
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "bio-maf"
|
5
|
-
s.version = "0.3.0"
|
5
|
+
s.version = "0.3.1"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Clayton Wheeler"]
|
9
|
-
s.date = "2012-07-
|
9
|
+
s.date = "2012-07-26"
|
10
10
|
s.description = "Multiple Alignment Format parser for BioRuby."
|
11
11
|
s.email = "cswh@umich.edu"
|
12
|
-
s.executables = ["maf_count", "maf_dump_blocks", "maf_extract_ranges_count", "maf_index", "maf_parse_bench", "maf_to_fasta", "maf_write", "random_ranges"]
|
13
12
|
s.extra_rdoc_files = [
|
14
13
|
"LICENSE.txt",
|
15
14
|
"README.md"
|
@@ -33,8 +32,8 @@ Gem::Specification.new do |s|
|
|
33
32
|
end
|
34
33
|
|
35
34
|
s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
|
36
|
-
s.add_runtime_dependency('bio-bigbio', [">= 0"])
|
37
35
|
s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
|
36
|
+
s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
|
38
37
|
if RUBY_PLATFORM == 'java'
|
39
38
|
s.add_runtime_dependency('kyotocabinet-java', ["~> 0.3.0"])
|
40
39
|
else
|
data/features/maf-indexing.feature
CHANGED
@@ -30,6 +30,15 @@ Feature: Indexed access to MAF files
|
|
30
30
|
And sequence mm8.chr7 of block 0 has start 80082592
|
31
31
|
And sequence mm8.chr7 of block 1 has start 80082713
|
32
32
|
|
33
|
+
Scenario: Extract alignment blocks by chromosomal range on non-ref sequence
|
34
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
35
|
+
When I open it with a MAF reader
|
36
|
+
And build an index on all sequences
|
37
|
+
And search for blocks between positions 136011819 and 136012026 of rn4.chr1
|
38
|
+
Then 2 blocks are obtained
|
39
|
+
And sequence mm8.chr7 of block 0 has start 80082368
|
40
|
+
And sequence mm8.chr7 of block 1 has start 80082471
|
41
|
+
|
33
42
|
@no_jruby
|
34
43
|
Scenario: Build MAF index with CLI tool
|
35
44
|
Given test files:
|
@@ -40,6 +49,17 @@ Feature: Indexed access to MAF files
|
|
40
49
|
"""
|
41
50
|
And a file named "mm8_chr7_tiny.kct" should exist
|
42
51
|
|
52
|
+
@no_jruby
|
53
|
+
Scenario: Build MAF index on all sequences with CLI tool
|
54
|
+
Given test files:
|
55
|
+
| mm8_chr7_tiny.maf |
|
56
|
+
When I run `maf_index --all mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
|
57
|
+
And I run `maf_index -d mm8_chr7_tiny.kct`
|
58
|
+
Then it should pass with regex:
|
59
|
+
"""
|
60
|
+
9 \[bin 585\] 594:631
|
61
|
+
"""
|
62
|
+
|
43
63
|
@no_jruby
|
44
64
|
Scenario: Dump MAF index with CLI tool
|
45
65
|
Given test files:
|
@@ -51,4 +71,4 @@ Feature: Indexed access to MAF files
|
|
51
71
|
0 \[bin 1195\] 80082334:80082368
|
52
72
|
"""
|
53
73
|
|
54
|
-
|
74
|
+
|
data/features/step_definitions/convert_steps.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'bigbio' # FASTA support
|
2
|
-
|
3
1
|
Given /^a MAF source file "(.*?)"$/ do |src|
|
4
2
|
@src_f = $test_data + src
|
5
3
|
@src_f.exist?.should be_true
|
@@ -13,15 +11,12 @@ end
|
|
13
11
|
|
14
12
|
When /^I select FASTA output$/ do
|
15
13
|
@dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
|
16
|
-
@dst
|
17
|
-
@writer = FastaWriter.new(@dst.path)
|
14
|
+
@writer = Bio::MAF::FASTAWriter.new(@dst)
|
18
15
|
end
|
19
16
|
|
20
17
|
When /^process the file$/ do
|
21
18
|
@parser.each_block do |block|
|
22
|
-
block
|
23
|
-
seq.write_fasta(@writer)
|
24
|
-
end
|
19
|
+
@writer.write_block(block)
|
25
20
|
end
|
26
21
|
@writer.close
|
27
22
|
end
|
data/features/step_definitions/index_steps.rb
CHANGED
@@ -2,6 +2,10 @@ When /^build an index on the reference sequence$/ do
|
|
2
2
|
@idx = Bio::MAF::KyotoIndex.build(@parser, '%')
|
3
3
|
end
|
4
4
|
|
5
|
+
When /^build an index on all sequences$/ do
|
6
|
+
@idx = Bio::MAF::KyotoIndex.build(@parser, '%', false)
|
7
|
+
end
|
8
|
+
|
5
9
|
Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
|
6
10
|
@idx = Bio::MAF::KyotoIndex.open($test_data + name)
|
7
11
|
end
|
data/lib/bio-maf.rb
CHANGED
@@ -8,5 +8,10 @@
|
|
8
8
|
#
|
9
9
|
# In this file only require other files. Avoid other source code.
|
10
10
|
|
11
|
+
require 'bio-logger'
|
12
|
+
log = Bio::Log::LoggerPlus.new('bio-maf')
|
13
|
+
log.outputters = Bio::Log::Outputter.stderr
|
14
|
+
log.level = Bio::Log::WARN
|
15
|
+
|
11
16
|
require 'bio/ucsc'
|
12
17
|
require 'bio/maf'
|
data/lib/bio/maf/index.rb
CHANGED
@@ -207,7 +207,7 @@ module Bio
|
|
207
207
|
# (could build a real one, too...)
|
208
208
|
maf = options[:maf]
|
209
209
|
parser = Parser.new(maf, @parse_options)
|
210
|
-
|
210
|
+
LOG.warn { "WARNING: building temporary index on #{maf}." }
|
211
211
|
index = KyotoIndex.build(parser, '%')
|
212
212
|
register_index(index, maf)
|
213
213
|
end
|
@@ -247,7 +247,7 @@ module Bio
|
|
247
247
|
|
248
248
|
# @api private
|
249
249
|
def with_parser(chrom)
|
250
|
-
|
250
|
+
LOG.debug { "Creating parser with options #{@parse_options.inspect}" }
|
251
251
|
parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
|
252
252
|
parser.sequence_filter = self.sequence_filter
|
253
253
|
begin
|
@@ -262,7 +262,7 @@ module Bio
|
|
262
262
|
class KyotoIndex
|
263
263
|
include KVHelpers
|
264
264
|
|
265
|
-
attr_reader :db, :species, :species_max_id
|
265
|
+
attr_reader :db, :species, :species_max_id, :ref_only
|
266
266
|
attr_accessor :index_sequences, :ref_seq
|
267
267
|
|
268
268
|
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
@@ -353,9 +353,9 @@ module Bio
|
|
353
353
|
# @param [Parser] parser MAF parser for file to index
|
354
354
|
# @param [String] path path to index file to create
|
355
355
|
# @return [KyotoIndex]
|
356
|
-
def self.build(parser, path)
|
356
|
+
def self.build(parser, path, ref_only=true)
|
357
357
|
idx = self.new(path)
|
358
|
-
idx.
|
358
|
+
idx.build(parser, ref_only)
|
359
359
|
return idx
|
360
360
|
end
|
361
361
|
|
@@ -391,11 +391,11 @@ module Bio
|
|
391
391
|
# @return [Enumerable<Block>] each matching {Block}, if no block given
|
392
392
|
# @api public
|
393
393
|
def find(intervals, parser, filter={}, &blk)
|
394
|
-
|
394
|
+
start = Time.now
|
395
395
|
fl = fetch_list(intervals, filter)
|
396
|
-
|
397
|
-
|
398
|
-
|
396
|
+
LOG.debug { sprintf("Built fetch list of %d items in %.3fs.\n",
|
397
|
+
fl.size,
|
398
|
+
Time.now - start) }
|
399
399
|
if ! fl.empty?
|
400
400
|
parser.fetch_blocks(fl, &blk)
|
401
401
|
else
|
@@ -426,6 +426,7 @@ module Bio
|
|
426
426
|
def initialize(path, db_arg=nil)
|
427
427
|
@species = {}
|
428
428
|
@species_max_id = -1
|
429
|
+
@max_sid = -1
|
429
430
|
if db_arg || ((path.size > 1) and File.exist?(path))
|
430
431
|
mode = KyotoCabinet::DB::OREADER
|
431
432
|
else
|
@@ -585,8 +586,8 @@ module Bio
|
|
585
586
|
n_completed += 1
|
586
587
|
end
|
587
588
|
threads.each { |t| t.join }
|
588
|
-
|
589
|
-
|
589
|
+
LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.\n",
|
590
|
+
to_fetch.size, n_threads, Time.now - start) }
|
590
591
|
to_fetch
|
591
592
|
end
|
592
593
|
|
@@ -602,8 +603,8 @@ module Bio
|
|
602
603
|
completed.put(result)
|
603
604
|
rescue Exception => e
|
604
605
|
completed.put(e)
|
605
|
-
|
606
|
-
|
606
|
+
LOG.error "Worker failing: #{e.class}: #{e}"
|
607
|
+
LOG.error e
|
607
608
|
raise e
|
608
609
|
end
|
609
610
|
end
|
@@ -658,17 +659,20 @@ module Bio
|
|
658
659
|
|| gi.include?(i_start)
|
659
660
|
end
|
660
661
|
|
661
|
-
def
|
662
|
+
def build(parser, ref_only=true)
|
662
663
|
first_block = parser.parse_block
|
663
664
|
self.ref_seq = first_block.sequences.first.source
|
665
|
+
@ref_only = ref_only
|
664
666
|
db[REF_SEQ_KEY] = ref_seq
|
665
667
|
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
666
|
-
@index_sequences = {
|
667
|
-
store_index_sequences!
|
668
|
+
@index_sequences = {}
|
668
669
|
index_blocks([first_block])
|
669
|
-
|
670
|
+
n = 0
|
671
|
+
parser.each_block.each_slice(1000).each do |blocks|
|
670
672
|
index_blocks(blocks)
|
673
|
+
n += blocks.size
|
671
674
|
end
|
675
|
+
LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
|
672
676
|
db.synchronize(true)
|
673
677
|
end
|
674
678
|
|
@@ -685,12 +689,18 @@ module Bio
|
|
685
689
|
h[name] = id
|
686
690
|
end
|
687
691
|
@index_sequences = h
|
692
|
+
@max_sid = @index_sequences.values.max
|
688
693
|
end
|
689
694
|
|
690
|
-
def
|
691
|
-
|
692
|
-
|
695
|
+
def seq_id_for(name)
|
696
|
+
sid = index_sequences[name]
|
697
|
+
if ! sid
|
698
|
+
@max_sid += 1
|
699
|
+
sid = @max_sid
|
700
|
+
db.set("sequence:#{name}", sid.to_s)
|
701
|
+
index_sequences[name] = sid
|
693
702
|
end
|
703
|
+
return sid
|
694
704
|
end
|
695
705
|
|
696
706
|
def load_species
|
@@ -742,9 +752,9 @@ module Bio
|
|
742
752
|
end
|
743
753
|
h = {}
|
744
754
|
val = build_block_value(block)
|
745
|
-
block.sequences.
|
746
|
-
|
747
|
-
|
755
|
+
to_index = ref_only ? [block.sequences.first] : block.sequences
|
756
|
+
to_index.each do |seq|
|
757
|
+
seq_id = seq_id_for(seq.source)
|
748
758
|
seq_end = seq.start + seq.size
|
749
759
|
bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
|
750
760
|
key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
|
data/lib/bio/maf/maf.rb
CHANGED
@@ -64,7 +64,6 @@ module Bio
|
|
64
64
|
attr_reader :size
|
65
65
|
|
66
66
|
def initialize(vars, sequences, offset, size, filtered)
|
67
|
-
#raise ArgumentError, "no sequences given for block at offset #{offset}!" unless sequences && sequences.first
|
68
67
|
@vars = vars
|
69
68
|
@sequences = sequences
|
70
69
|
@offset = offset
|
@@ -258,8 +257,13 @@ module Bio
|
|
258
257
|
attr_accessor :quality
|
259
258
|
alias_method :source_size, :src_size
|
260
259
|
|
261
|
-
def initialize(
|
262
|
-
@source
|
260
|
+
def initialize(source, start, size, strand, src_size, text)
|
261
|
+
@source = source
|
262
|
+
@start = start
|
263
|
+
@size = size
|
264
|
+
@strand = strand
|
265
|
+
@src_size = src_size
|
266
|
+
@text = text
|
263
267
|
end
|
264
268
|
|
265
269
|
def end
|
@@ -354,9 +358,8 @@ module Bio
|
|
354
358
|
Bio::BioAlignment::Sequence.new(source, text)
|
355
359
|
end
|
356
360
|
|
357
|
-
def
|
358
|
-
|
359
|
-
text)
|
361
|
+
def fasta_desc
|
362
|
+
"#{source}:#{start}-#{start + size}"
|
360
363
|
end
|
361
364
|
|
362
365
|
def joinable_with?(o)
|
@@ -441,7 +444,7 @@ module Bio
|
|
441
444
|
attr_reader :status
|
442
445
|
|
443
446
|
def initialize(*args)
|
444
|
-
super(*args[0..4])
|
447
|
+
super(*(args[0..4] << nil))
|
445
448
|
@status = args[5]
|
446
449
|
end
|
447
450
|
|
data/lib/bio/maf/parser.rb
CHANGED
@@ -5,7 +5,8 @@ require 'java' if RUBY_PLATFORM == 'java'
|
|
5
5
|
module Bio
|
6
6
|
# @api public
|
7
7
|
module MAF
|
8
|
-
|
8
|
+
LOG = Bio::Log::LoggerPlus['bio-maf']
|
9
|
+
|
9
10
|
# @api public
|
10
11
|
class ParseError < Exception; end
|
11
12
|
|
@@ -601,10 +602,9 @@ module Bio
|
|
601
602
|
end
|
602
603
|
end
|
603
604
|
elapsed = Time.now - start
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
# elapsed, rate)
|
605
|
+
rate = (total_size / 1048576.0) / elapsed
|
606
|
+
LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
|
607
|
+
elapsed, rate) }
|
608
608
|
end
|
609
609
|
|
610
610
|
# Fetch and parse the blocks given by the merged fetch list, in
|
@@ -639,13 +639,13 @@ module Bio
|
|
639
639
|
end
|
640
640
|
threads.each { |t| t.join }
|
641
641
|
elapsed = Time.now - start
|
642
|
-
|
643
|
-
|
644
|
-
|
642
|
+
LOG.debug { sprintf("Fetched blocks from %d threads in %.1fs.",
|
643
|
+
n_threads,
|
644
|
+
elapsed) }
|
645
645
|
mb = total_size / 1048576.0
|
646
|
-
|
647
|
-
|
648
|
-
|
646
|
+
LOG.debug { sprintf("%.3f MB processed (%.1f MB/s).",
|
647
|
+
mb,
|
648
|
+
mb / elapsed) }
|
649
649
|
end
|
650
650
|
|
651
651
|
# Create a worker thread for parallel parsing.
|
@@ -667,8 +667,8 @@ module Bio
|
|
667
667
|
end
|
668
668
|
end
|
669
669
|
rescue Exception => e
|
670
|
-
|
671
|
-
|
670
|
+
LOG.error "Worker failing: #{e.class}: #{e}"
|
671
|
+
LOG.error e
|
672
672
|
raise e
|
673
673
|
end
|
674
674
|
end
|
@@ -835,8 +835,8 @@ module Bio
|
|
835
835
|
end
|
836
836
|
queue.put(:eof)
|
837
837
|
rescue
|
838
|
-
|
839
|
-
|
838
|
+
LOG.error "worker exiting: #{$!.class}: #{$!}"
|
839
|
+
LOG.error $!
|
840
840
|
end
|
841
841
|
end
|
842
842
|
saw_eof = false
|
@@ -902,6 +902,28 @@ module Bio
|
|
902
902
|
include MAFParsing
|
903
903
|
end
|
904
904
|
|
905
|
+
def handle_logging_options(opts)
|
906
|
+
opts.on("--logger filename", String,
|
907
|
+
"Log to file (default STDOUT)") do |name|
|
908
|
+
Bio::Log::CLI.logger(name)
|
909
|
+
end
|
910
|
+
opts.on("--trace options", String,
|
911
|
+
"Set log level",
|
912
|
+
"(default INFO, see bio-logger)") do |s|
|
913
|
+
Bio::Log::CLI.trace(s)
|
914
|
+
end
|
915
|
+
opts.on("-q", "--quiet", "Run quietly") do
|
916
|
+
Bio::Log::CLI.trace('error')
|
917
|
+
end
|
918
|
+
opts.on("-v", "--verbose", "Run verbosely") do
|
919
|
+
Bio::Log::CLI.trace('info')
|
920
|
+
end
|
921
|
+
opts.on("--debug", "Run with extra debugging output") do
|
922
|
+
Bio::Log::CLI.trace('debug')
|
923
|
+
end
|
924
|
+
end
|
925
|
+
module_function :handle_logging_options
|
926
|
+
|
905
927
|
end
|
906
928
|
|
907
929
|
end
|