bio-maf 0.3.0-java → 0.3.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/DEVELOPMENT.md +4 -0
- data/README.md +172 -114
- data/bin/maf_count +0 -1
- data/bin/maf_dump_blocks +0 -1
- data/bin/maf_extract +180 -0
- data/bin/maf_index +15 -8
- data/bin/maf_tile +2 -0
- data/bin/maf_to_fasta +4 -7
- data/bio-maf.gemspec +3 -4
- data/features/maf-indexing.feature +21 -1
- data/features/step_definitions/convert_steps.rb +2 -7
- data/features/step_definitions/index_steps.rb +4 -0
- data/lib/bio-maf.rb +5 -0
- data/lib/bio/maf/index.rb +33 -23
- data/lib/bio/maf/maf.rb +10 -7
- data/lib/bio/maf/parser.rb +37 -15
- data/lib/bio/maf/tiler.rb +60 -8
- data/lib/bio/maf/writer.rb +26 -0
- data/man/maf_extract.1 +268 -0
- data/man/maf_extract.1.ronn +213 -0
- data/man/maf_index.1 +21 -10
- data/man/maf_index.1.ronn +14 -7
- data/man/maf_tile.1 +12 -0
- data/man/maf_tile.1.ronn +9 -0
- data/spec/bio/maf/index_spec.rb +23 -0
- metadata +14 -10
data/bin/maf_index
CHANGED
@@ -13,29 +13,35 @@ PRINTERS = {
|
|
13
13
|
|
14
14
|
$options = OpenStruct.new
|
15
15
|
$options.mode = :build
|
16
|
-
$options.
|
16
|
+
$options.ref_only = true
|
17
|
+
$options.reader = if RUBY_PLATFORM == 'java'
|
18
|
+
Bio::MAF::ThreadedChunkReader
|
19
|
+
else
|
20
|
+
Bio::MAF::ChunkReader
|
21
|
+
end
|
17
22
|
|
18
23
|
def build_index(maf, index)
|
19
24
|
parser = Bio::MAF::Parser.new(maf,
|
20
25
|
:chunk_reader => $options.reader,
|
21
26
|
:parse_extended => false)
|
22
|
-
idx = Bio::MAF::KyotoIndex.build(parser, index)
|
27
|
+
idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
|
23
28
|
idx.close
|
24
29
|
end
|
25
30
|
|
26
31
|
op = OptionParser.new do |opts|
|
27
32
|
opts.banner = "Usage: maf_index [options] <maf> <index>"
|
28
|
-
|
29
|
-
|
33
|
+
opts.separator ""
|
34
|
+
opts.separator "Options:"
|
35
|
+
opts.on("-a", "--all", "Index all sequences, not just reference seq") do
|
36
|
+
$options.ref_only = false
|
37
|
+
end
|
38
|
+
Bio::MAF::handle_logging_options(opts)
|
30
39
|
opts.on("--time", "print elapsed time") do
|
31
40
|
$options.bench = true
|
32
41
|
end
|
33
|
-
opts.on("-d", "--dump") do
|
42
|
+
opts.on("-d", "--dump", "Dump contents of given INDEX") do
|
34
43
|
$options.mode = :dump
|
35
44
|
end
|
36
|
-
opts.on("-t", "--threaded") do
|
37
|
-
$options.reader = Bio::MAF::ThreadedChunkReader
|
38
|
-
end
|
39
45
|
opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
|
40
46
|
require 'ruby-prof'
|
41
47
|
if pspec =~ /(\w+):(.+)/
|
@@ -49,6 +55,7 @@ op = OptionParser.new do |opts|
|
|
49
55
|
end
|
50
56
|
|
51
57
|
op.parse!(ARGV)
|
58
|
+
Bio::Log::CLI.configure('bio-maf')
|
52
59
|
|
53
60
|
maf_p = ARGV.shift if $options.mode == :build
|
54
61
|
index_p = ARGV.shift
|
data/bin/maf_tile
CHANGED
@@ -65,9 +65,11 @@ o_parser = OptionParser.new do |opts|
|
|
65
65
|
"(requires --output-base)") do |bed|
|
66
66
|
options.bed = bed
|
67
67
|
end
|
68
|
+
Bio::MAF::handle_logging_options(opts)
|
68
69
|
end
|
69
70
|
|
70
71
|
o_parser.parse!(ARGV)
|
72
|
+
Bio::Log::CLI.configure('bio-maf')
|
71
73
|
|
72
74
|
maf_p = ARGV.shift
|
73
75
|
index_p = ARGV.shift
|
data/bin/maf_to_fasta
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'bio-maf'
|
4
|
-
require 'bigbio'
|
5
4
|
require 'optparse'
|
6
5
|
require 'ostruct'
|
7
6
|
|
@@ -42,16 +41,14 @@ if options.profile_gc
|
|
42
41
|
end
|
43
42
|
|
44
43
|
parser = options.parser.new(src_path)
|
45
|
-
|
44
|
+
File.open(dst_path, 'w') do |outf|
|
45
|
+
writer = Bio::MAF::FASTAWriter.new(outf)
|
46
46
|
|
47
|
-
parser.each_block do |block|
|
48
|
-
|
49
|
-
seq.write_fasta(writer)
|
47
|
+
parser.each_block do |block|
|
48
|
+
writer.write_block(block)
|
50
49
|
end
|
51
50
|
end
|
52
51
|
|
53
|
-
writer.close
|
54
|
-
|
55
52
|
if options.profile_gc
|
56
53
|
$stderr.puts GC::Profiler.result
|
57
54
|
GC::Profiler.disable
|
data/bio-maf.gemspec
CHANGED
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "bio-maf"
|
5
|
-
s.version = "0.3.0"
|
5
|
+
s.version = "0.3.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Clayton Wheeler"]
|
9
|
-
s.date = "2012-07-
|
9
|
+
s.date = "2012-07-26"
|
10
10
|
s.description = "Multiple Alignment Format parser for BioRuby."
|
11
11
|
s.email = "cswh@umich.edu"
|
12
|
-
s.executables = ["maf_count", "maf_dump_blocks", "maf_extract_ranges_count", "maf_index", "maf_parse_bench", "maf_to_fasta", "maf_write", "random_ranges"]
|
13
12
|
s.extra_rdoc_files = [
|
14
13
|
"LICENSE.txt",
|
15
14
|
"README.md"
|
@@ -33,8 +32,8 @@ Gem::Specification.new do |s|
|
|
33
32
|
end
|
34
33
|
|
35
34
|
s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
|
36
|
-
s.add_runtime_dependency('bio-bigbio', [">= 0"])
|
37
35
|
s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
|
36
|
+
s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
|
38
37
|
if RUBY_PLATFORM == 'java'
|
39
38
|
s.add_runtime_dependency('kyotocabinet-java', ["~> 0.3.0"])
|
40
39
|
else
|
@@ -30,6 +30,15 @@ Feature: Indexed access to MAF files
|
|
30
30
|
And sequence mm8.chr7 of block 0 has start 80082592
|
31
31
|
And sequence mm8.chr7 of block 1 has start 80082713
|
32
32
|
|
33
|
+
Scenario: Extract alignment blocks by chromosomal range on non-ref sequence
|
34
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
35
|
+
When I open it with a MAF reader
|
36
|
+
And build an index on all sequences
|
37
|
+
And search for blocks between positions 136011819 and 136012026 of rn4.chr1
|
38
|
+
Then 2 blocks are obtained
|
39
|
+
And sequence mm8.chr7 of block 0 has start 80082368
|
40
|
+
And sequence mm8.chr7 of block 1 has start 80082471
|
41
|
+
|
33
42
|
@no_jruby
|
34
43
|
Scenario: Build MAF index with CLI tool
|
35
44
|
Given test files:
|
@@ -40,6 +49,17 @@ Feature: Indexed access to MAF files
|
|
40
49
|
"""
|
41
50
|
And a file named "mm8_chr7_tiny.kct" should exist
|
42
51
|
|
52
|
+
@no_jruby
|
53
|
+
Scenario: Build MAF index on all sequences with CLI tool
|
54
|
+
Given test files:
|
55
|
+
| mm8_chr7_tiny.maf |
|
56
|
+
When I run `maf_index --all mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
|
57
|
+
And I run `maf_index -d mm8_chr7_tiny.kct`
|
58
|
+
Then it should pass with regex:
|
59
|
+
"""
|
60
|
+
9 \[bin 585\] 594:631
|
61
|
+
"""
|
62
|
+
|
43
63
|
@no_jruby
|
44
64
|
Scenario: Dump MAF index with CLI tool
|
45
65
|
Given test files:
|
@@ -51,4 +71,4 @@ Feature: Indexed access to MAF files
|
|
51
71
|
0 \[bin 1195\] 80082334:80082368
|
52
72
|
"""
|
53
73
|
|
54
|
-
|
74
|
+
|
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'bigbio' # FASTA support
|
2
|
-
|
3
1
|
Given /^a MAF source file "(.*?)"$/ do |src|
|
4
2
|
@src_f = $test_data + src
|
5
3
|
@src_f.exist?.should be_true
|
@@ -13,15 +11,12 @@ end
|
|
13
11
|
|
14
12
|
When /^I select FASTA output$/ do
|
15
13
|
@dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
|
16
|
-
@dst
|
17
|
-
@writer = FastaWriter.new(@dst.path)
|
14
|
+
@writer = Bio::MAF::FASTAWriter.new(@dst)
|
18
15
|
end
|
19
16
|
|
20
17
|
When /^process the file$/ do
|
21
18
|
@parser.each_block do |block|
|
22
|
-
block
|
23
|
-
seq.write_fasta(@writer)
|
24
|
-
end
|
19
|
+
@writer.write_block(block)
|
25
20
|
end
|
26
21
|
@writer.close
|
27
22
|
end
|
@@ -2,6 +2,10 @@ When /^build an index on the reference sequence$/ do
|
|
2
2
|
@idx = Bio::MAF::KyotoIndex.build(@parser, '%')
|
3
3
|
end
|
4
4
|
|
5
|
+
When /^build an index on all sequences$/ do
|
6
|
+
@idx = Bio::MAF::KyotoIndex.build(@parser, '%', false)
|
7
|
+
end
|
8
|
+
|
5
9
|
Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
|
6
10
|
@idx = Bio::MAF::KyotoIndex.open($test_data + name)
|
7
11
|
end
|
data/lib/bio-maf.rb
CHANGED
@@ -8,5 +8,10 @@
|
|
8
8
|
#
|
9
9
|
# In this file only require other files. Avoid other source code.
|
10
10
|
|
11
|
+
require 'bio-logger'
|
12
|
+
log = Bio::Log::LoggerPlus.new('bio-maf')
|
13
|
+
log.outputters = Bio::Log::Outputter.stderr
|
14
|
+
log.level = Bio::Log::WARN
|
15
|
+
|
11
16
|
require 'bio/ucsc'
|
12
17
|
require 'bio/maf'
|
data/lib/bio/maf/index.rb
CHANGED
@@ -207,7 +207,7 @@ module Bio
|
|
207
207
|
# (could build a real one, too...)
|
208
208
|
maf = options[:maf]
|
209
209
|
parser = Parser.new(maf, @parse_options)
|
210
|
-
|
210
|
+
LOG.warn { "WARNING: building temporary index on #{maf}." }
|
211
211
|
index = KyotoIndex.build(parser, '%')
|
212
212
|
register_index(index, maf)
|
213
213
|
end
|
@@ -247,7 +247,7 @@ module Bio
|
|
247
247
|
|
248
248
|
# @api private
|
249
249
|
def with_parser(chrom)
|
250
|
-
|
250
|
+
LOG.debug { "Creating parser with options #{@parse_options.inspect}" }
|
251
251
|
parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
|
252
252
|
parser.sequence_filter = self.sequence_filter
|
253
253
|
begin
|
@@ -262,7 +262,7 @@ module Bio
|
|
262
262
|
class KyotoIndex
|
263
263
|
include KVHelpers
|
264
264
|
|
265
|
-
attr_reader :db, :species, :species_max_id
|
265
|
+
attr_reader :db, :species, :species_max_id, :ref_only
|
266
266
|
attr_accessor :index_sequences, :ref_seq
|
267
267
|
|
268
268
|
FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
|
@@ -353,9 +353,9 @@ module Bio
|
|
353
353
|
# @param [Parser] parser MAF parser for file to index
|
354
354
|
# @param [String] path path to index file to create
|
355
355
|
# @return [KyotoIndex]
|
356
|
-
def self.build(parser, path)
|
356
|
+
def self.build(parser, path, ref_only=true)
|
357
357
|
idx = self.new(path)
|
358
|
-
idx.
|
358
|
+
idx.build(parser, ref_only)
|
359
359
|
return idx
|
360
360
|
end
|
361
361
|
|
@@ -391,11 +391,11 @@ module Bio
|
|
391
391
|
# @return [Enumerable<Block>] each matching {Block}, if no block given
|
392
392
|
# @api public
|
393
393
|
def find(intervals, parser, filter={}, &blk)
|
394
|
-
|
394
|
+
start = Time.now
|
395
395
|
fl = fetch_list(intervals, filter)
|
396
|
-
|
397
|
-
|
398
|
-
|
396
|
+
LOG.debug { sprintf("Built fetch list of %d items in %.3fs.\n",
|
397
|
+
fl.size,
|
398
|
+
Time.now - start) }
|
399
399
|
if ! fl.empty?
|
400
400
|
parser.fetch_blocks(fl, &blk)
|
401
401
|
else
|
@@ -426,6 +426,7 @@ module Bio
|
|
426
426
|
def initialize(path, db_arg=nil)
|
427
427
|
@species = {}
|
428
428
|
@species_max_id = -1
|
429
|
+
@max_sid = -1
|
429
430
|
if db_arg || ((path.size > 1) and File.exist?(path))
|
430
431
|
mode = KyotoCabinet::DB::OREADER
|
431
432
|
else
|
@@ -585,8 +586,8 @@ module Bio
|
|
585
586
|
n_completed += 1
|
586
587
|
end
|
587
588
|
threads.each { |t| t.join }
|
588
|
-
|
589
|
-
|
589
|
+
LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.\n",
|
590
|
+
to_fetch.size, n_threads, Time.now - start) }
|
590
591
|
to_fetch
|
591
592
|
end
|
592
593
|
|
@@ -602,8 +603,8 @@ module Bio
|
|
602
603
|
completed.put(result)
|
603
604
|
rescue Exception => e
|
604
605
|
completed.put(e)
|
605
|
-
|
606
|
-
|
606
|
+
LOG.error "Worker failing: #{e.class}: #{e}"
|
607
|
+
LOG.error e
|
607
608
|
raise e
|
608
609
|
end
|
609
610
|
end
|
@@ -658,17 +659,20 @@ module Bio
|
|
658
659
|
|| gi.include?(i_start)
|
659
660
|
end
|
660
661
|
|
661
|
-
def
|
662
|
+
def build(parser, ref_only=true)
|
662
663
|
first_block = parser.parse_block
|
663
664
|
self.ref_seq = first_block.sequences.first.source
|
665
|
+
@ref_only = ref_only
|
664
666
|
db[REF_SEQ_KEY] = ref_seq
|
665
667
|
db[FORMAT_VERSION_KEY] = FORMAT_VERSION
|
666
|
-
@index_sequences = {
|
667
|
-
store_index_sequences!
|
668
|
+
@index_sequences = {}
|
668
669
|
index_blocks([first_block])
|
669
|
-
|
670
|
+
n = 0
|
671
|
+
parser.each_block.each_slice(1000).each do |blocks|
|
670
672
|
index_blocks(blocks)
|
673
|
+
n += blocks.size
|
671
674
|
end
|
675
|
+
LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
|
672
676
|
db.synchronize(true)
|
673
677
|
end
|
674
678
|
|
@@ -685,12 +689,18 @@ module Bio
|
|
685
689
|
h[name] = id
|
686
690
|
end
|
687
691
|
@index_sequences = h
|
692
|
+
@max_sid = @index_sequences.values.max
|
688
693
|
end
|
689
694
|
|
690
|
-
def
|
691
|
-
|
692
|
-
|
695
|
+
def seq_id_for(name)
|
696
|
+
sid = index_sequences[name]
|
697
|
+
if ! sid
|
698
|
+
@max_sid += 1
|
699
|
+
sid = @max_sid
|
700
|
+
db.set("sequence:#{name}", sid.to_s)
|
701
|
+
index_sequences[name] = sid
|
693
702
|
end
|
703
|
+
return sid
|
694
704
|
end
|
695
705
|
|
696
706
|
def load_species
|
@@ -742,9 +752,9 @@ module Bio
|
|
742
752
|
end
|
743
753
|
h = {}
|
744
754
|
val = build_block_value(block)
|
745
|
-
block.sequences.
|
746
|
-
|
747
|
-
|
755
|
+
to_index = ref_only ? [block.sequences.first] : block.sequences
|
756
|
+
to_index.each do |seq|
|
757
|
+
seq_id = seq_id_for(seq.source)
|
748
758
|
seq_end = seq.start + seq.size
|
749
759
|
bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
|
750
760
|
key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
|
data/lib/bio/maf/maf.rb
CHANGED
@@ -64,7 +64,6 @@ module Bio
|
|
64
64
|
attr_reader :size
|
65
65
|
|
66
66
|
def initialize(vars, sequences, offset, size, filtered)
|
67
|
-
#raise ArgumentError, "no sequences given for block at offset #{offset}!" unless sequences && sequences.first
|
68
67
|
@vars = vars
|
69
68
|
@sequences = sequences
|
70
69
|
@offset = offset
|
@@ -258,8 +257,13 @@ module Bio
|
|
258
257
|
attr_accessor :quality
|
259
258
|
alias_method :source_size, :src_size
|
260
259
|
|
261
|
-
def initialize(
|
262
|
-
@source
|
260
|
+
def initialize(source, start, size, strand, src_size, text)
|
261
|
+
@source = source
|
262
|
+
@start = start
|
263
|
+
@size = size
|
264
|
+
@strand = strand
|
265
|
+
@src_size = src_size
|
266
|
+
@text = text
|
263
267
|
end
|
264
268
|
|
265
269
|
def end
|
@@ -354,9 +358,8 @@ module Bio
|
|
354
358
|
Bio::BioAlignment::Sequence.new(source, text)
|
355
359
|
end
|
356
360
|
|
357
|
-
def
|
358
|
-
|
359
|
-
text)
|
361
|
+
def fasta_desc
|
362
|
+
"#{source}:#{start}-#{start + size}"
|
360
363
|
end
|
361
364
|
|
362
365
|
def joinable_with?(o)
|
@@ -441,7 +444,7 @@ module Bio
|
|
441
444
|
attr_reader :status
|
442
445
|
|
443
446
|
def initialize(*args)
|
444
|
-
super(*args[0..4])
|
447
|
+
super(*(args[0..4] << nil))
|
445
448
|
@status = args[5]
|
446
449
|
end
|
447
450
|
|
data/lib/bio/maf/parser.rb
CHANGED
@@ -5,7 +5,8 @@ require 'java' if RUBY_PLATFORM == 'java'
|
|
5
5
|
module Bio
|
6
6
|
# @api public
|
7
7
|
module MAF
|
8
|
-
|
8
|
+
LOG = Bio::Log::LoggerPlus['bio-maf']
|
9
|
+
|
9
10
|
# @api public
|
10
11
|
class ParseError < Exception; end
|
11
12
|
|
@@ -601,10 +602,9 @@ module Bio
|
|
601
602
|
end
|
602
603
|
end
|
603
604
|
elapsed = Time.now - start
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
# elapsed, rate)
|
605
|
+
rate = (total_size / 1048576.0) / elapsed
|
606
|
+
LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
|
607
|
+
elapsed, rate) }
|
608
608
|
end
|
609
609
|
|
610
610
|
# Fetch and parse the blocks given by the merged fetch list, in
|
@@ -639,13 +639,13 @@ module Bio
|
|
639
639
|
end
|
640
640
|
threads.each { |t| t.join }
|
641
641
|
elapsed = Time.now - start
|
642
|
-
|
643
|
-
|
644
|
-
|
642
|
+
LOG.debug { sprintf("Fetched blocks from %d threads in %.1fs.",
|
643
|
+
n_threads,
|
644
|
+
elapsed) }
|
645
645
|
mb = total_size / 1048576.0
|
646
|
-
|
647
|
-
|
648
|
-
|
646
|
+
LOG.debug { sprintf("%.3f MB processed (%.1f MB/s).",
|
647
|
+
mb,
|
648
|
+
mb / elapsed) }
|
649
649
|
end
|
650
650
|
|
651
651
|
# Create a worker thread for parallel parsing.
|
@@ -667,8 +667,8 @@ module Bio
|
|
667
667
|
end
|
668
668
|
end
|
669
669
|
rescue Exception => e
|
670
|
-
|
671
|
-
|
670
|
+
LOG.error "Worker failing: #{e.class}: #{e}"
|
671
|
+
LOG.error e
|
672
672
|
raise e
|
673
673
|
end
|
674
674
|
end
|
@@ -835,8 +835,8 @@ module Bio
|
|
835
835
|
end
|
836
836
|
queue.put(:eof)
|
837
837
|
rescue
|
838
|
-
|
839
|
-
|
838
|
+
LOG.error "worker exiting: #{$!.class}: #{$!}"
|
839
|
+
LOG.error $!
|
840
840
|
end
|
841
841
|
end
|
842
842
|
saw_eof = false
|
@@ -902,6 +902,28 @@ module Bio
|
|
902
902
|
include MAFParsing
|
903
903
|
end
|
904
904
|
|
905
|
+
def handle_logging_options(opts)
|
906
|
+
opts.on("--logger filename", String,
|
907
|
+
"Log to file (default STDOUT)") do |name|
|
908
|
+
Bio::Log::CLI.logger(name)
|
909
|
+
end
|
910
|
+
opts.on("--trace options", String,
|
911
|
+
"Set log level",
|
912
|
+
"(default INFO, see bio-logger)") do |s|
|
913
|
+
Bio::Log::CLI.trace(s)
|
914
|
+
end
|
915
|
+
opts.on("-q", "--quiet", "Run quietly") do
|
916
|
+
Bio::Log::CLI.trace('error')
|
917
|
+
end
|
918
|
+
opts.on("-v", "--verbose", "Run verbosely") do
|
919
|
+
Bio::Log::CLI.trace('info')
|
920
|
+
end
|
921
|
+
opts.on("--debug", "Run with extra debugging output") do
|
922
|
+
Bio::Log::CLI.trace('debug')
|
923
|
+
end
|
924
|
+
end
|
925
|
+
module_function :handle_logging_options
|
926
|
+
|
905
927
|
end
|
906
928
|
|
907
929
|
end
|