bio-maf 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/bin/maf_index CHANGED
@@ -13,29 +13,35 @@ PRINTERS = {
13
13
 
14
14
  $options = OpenStruct.new
15
15
  $options.mode = :build
16
- $options.reader = Bio::MAF::ChunkReader
16
+ $options.ref_only = true
17
+ $options.reader = if RUBY_PLATFORM == 'java'
18
+ Bio::MAF::ThreadedChunkReader
19
+ else
20
+ Bio::MAF::ChunkReader
21
+ end
17
22
 
18
23
  def build_index(maf, index)
19
24
  parser = Bio::MAF::Parser.new(maf,
20
25
  :chunk_reader => $options.reader,
21
26
  :parse_extended => false)
22
- idx = Bio::MAF::KyotoIndex.build(parser, index)
27
+ idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
23
28
  idx.close
24
29
  end
25
30
 
26
31
  op = OptionParser.new do |opts|
27
32
  opts.banner = "Usage: maf_index [options] <maf> <index>"
28
- #opts.separator ""
29
- #opts.separator "Options:"
33
+ opts.separator ""
34
+ opts.separator "Options:"
35
+ opts.on("-a", "--all", "Index all sequences, not just reference seq") do
36
+ $options.ref_only = false
37
+ end
38
+ Bio::MAF::handle_logging_options(opts)
30
39
  opts.on("--time", "print elapsed time") do
31
40
  $options.bench = true
32
41
  end
33
- opts.on("-d", "--dump") do
42
+ opts.on("-d", "--dump", "Dump contents of given INDEX") do
34
43
  $options.mode = :dump
35
44
  end
36
- opts.on("-t", "--threaded") do
37
- $options.reader = Bio::MAF::ThreadedChunkReader
38
- end
39
45
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
40
46
  require 'ruby-prof'
41
47
  if pspec =~ /(\w+):(.+)/
@@ -49,6 +55,7 @@ op = OptionParser.new do |opts|
49
55
  end
50
56
 
51
57
  op.parse!(ARGV)
58
+ Bio::Log::CLI.configure('bio-maf')
52
59
 
53
60
  maf_p = ARGV.shift if $options.mode == :build
54
61
  index_p = ARGV.shift
data/bin/maf_tile CHANGED
@@ -65,9 +65,11 @@ o_parser = OptionParser.new do |opts|
65
65
  "(requires --output-base)") do |bed|
66
66
  options.bed = bed
67
67
  end
68
+ Bio::MAF::handle_logging_options(opts)
68
69
  end
69
70
 
70
71
  o_parser.parse!(ARGV)
72
+ Bio::Log::CLI.configure('bio-maf')
71
73
 
72
74
  maf_p = ARGV.shift
73
75
  index_p = ARGV.shift
data/bin/maf_to_fasta CHANGED
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'bio-maf'
4
- require 'bigbio'
5
4
  require 'optparse'
6
5
  require 'ostruct'
7
6
 
@@ -42,16 +41,14 @@ if options.profile_gc
42
41
  end
43
42
 
44
43
  parser = options.parser.new(src_path)
45
- writer = FastaWriter.new(dst_path)
44
+ File.open(dst_path, 'w') do |outf|
45
+ writer = Bio::MAF::FASTAWriter.new(outf)
46
46
 
47
- parser.each_block do |block|
48
- block.each_raw_seq do |seq|
49
- seq.write_fasta(writer)
47
+ parser.each_block do |block|
48
+ writer.write_block(block)
50
49
  end
51
50
  end
52
51
 
53
- writer.close
54
-
55
52
  if options.profile_gc
56
53
  $stderr.puts GC::Profiler.result
57
54
  GC::Profiler.disable
data/bio-maf.gemspec CHANGED
@@ -2,14 +2,13 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "bio-maf"
5
- s.version = "0.3.0"
5
+ s.version = "0.3.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Clayton Wheeler"]
9
- s.date = "2012-07-18"
9
+ s.date = "2012-07-26"
10
10
  s.description = "Multiple Alignment Format parser for BioRuby."
11
11
  s.email = "cswh@umich.edu"
12
- s.executables = ["maf_count", "maf_dump_blocks", "maf_extract_ranges_count", "maf_index", "maf_parse_bench", "maf_to_fasta", "maf_write", "random_ranges"]
13
12
  s.extra_rdoc_files = [
14
13
  "LICENSE.txt",
15
14
  "README.md"
@@ -33,8 +32,8 @@ Gem::Specification.new do |s|
33
32
  end
34
33
 
35
34
  s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
36
- s.add_runtime_dependency('bio-bigbio', [">= 0"])
37
35
  s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
36
+ s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
38
37
  if RUBY_PLATFORM == 'java'
39
38
  s.add_runtime_dependency('kyotocabinet-java', ["~> 0.3.0"])
40
39
  else
@@ -30,6 +30,15 @@ Feature: Indexed access to MAF files
30
30
  And sequence mm8.chr7 of block 0 has start 80082592
31
31
  And sequence mm8.chr7 of block 1 has start 80082713
32
32
 
33
+ Scenario: Extract alignment blocks by chromosomal range on non-ref sequence
34
+ Given a MAF source file "mm8_chr7_tiny.maf"
35
+ When I open it with a MAF reader
36
+ And build an index on all sequences
37
+ And search for blocks between positions 136011819 and 136012026 of rn4.chr1
38
+ Then 2 blocks are obtained
39
+ And sequence mm8.chr7 of block 0 has start 80082368
40
+ And sequence mm8.chr7 of block 1 has start 80082471
41
+
33
42
  @no_jruby
34
43
  Scenario: Build MAF index with CLI tool
35
44
  Given test files:
@@ -40,6 +49,17 @@ Feature: Indexed access to MAF files
40
49
  """
41
50
  And a file named "mm8_chr7_tiny.kct" should exist
42
51
 
52
+ @no_jruby
53
+ Scenario: Build MAF index on all sequences with CLI tool
54
+ Given test files:
55
+ | mm8_chr7_tiny.maf |
56
+ When I run `maf_index --all mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
57
+ And I run `maf_index -d mm8_chr7_tiny.kct`
58
+ Then it should pass with regex:
59
+ """
60
+ 9 \[bin 585\] 594:631
61
+ """
62
+
43
63
  @no_jruby
44
64
  Scenario: Dump MAF index with CLI tool
45
65
  Given test files:
@@ -51,4 +71,4 @@ Feature: Indexed access to MAF files
51
71
  0 \[bin 1195\] 80082334:80082368
52
72
  """
53
73
 
54
-
74
+
@@ -1,5 +1,3 @@
1
- require 'bigbio' # FASTA support
2
-
3
1
  Given /^a MAF source file "(.*?)"$/ do |src|
4
2
  @src_f = $test_data + src
5
3
  @src_f.exist?.should be_true
@@ -13,15 +11,12 @@ end
13
11
 
14
12
  When /^I select FASTA output$/ do
15
13
  @dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
16
- @dst.close
17
- @writer = FastaWriter.new(@dst.path)
14
+ @writer = Bio::MAF::FASTAWriter.new(@dst)
18
15
  end
19
16
 
20
17
  When /^process the file$/ do
21
18
  @parser.each_block do |block|
22
- block.each_raw_seq do |seq|
23
- seq.write_fasta(@writer)
24
- end
19
+ @writer.write_block(block)
25
20
  end
26
21
  @writer.close
27
22
  end
@@ -2,6 +2,10 @@ When /^build an index on the reference sequence$/ do
2
2
  @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
3
3
  end
4
4
 
5
+ When /^build an index on all sequences$/ do
6
+ @idx = Bio::MAF::KyotoIndex.build(@parser, '%', false)
7
+ end
8
+
5
9
  Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
6
10
  @idx = Bio::MAF::KyotoIndex.open($test_data + name)
7
11
  end
data/lib/bio-maf.rb CHANGED
@@ -8,5 +8,10 @@
8
8
  #
9
9
  # In this file only require other files. Avoid other source code.
10
10
 
11
+ require 'bio-logger'
12
+ log = Bio::Log::LoggerPlus.new('bio-maf')
13
+ log.outputters = Bio::Log::Outputter.stderr
14
+ log.level = Bio::Log::WARN
15
+
11
16
  require 'bio/ucsc'
12
17
  require 'bio/maf'
data/lib/bio/maf/index.rb CHANGED
@@ -207,7 +207,7 @@ module Bio
207
207
  # (could build a real one, too...)
208
208
  maf = options[:maf]
209
209
  parser = Parser.new(maf, @parse_options)
210
- # $stderr.puts "WARNING: building temporary index on #{maf}."
210
+ LOG.warn { "WARNING: building temporary index on #{maf}." }
211
211
  index = KyotoIndex.build(parser, '%')
212
212
  register_index(index, maf)
213
213
  end
@@ -247,7 +247,7 @@ module Bio
247
247
 
248
248
  # @api private
249
249
  def with_parser(chrom)
250
- # $stderr.puts "Creating parser with options #{@parse_options.inspect}"
250
+ LOG.debug { "Creating parser with options #{@parse_options.inspect}" }
251
251
  parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
252
252
  parser.sequence_filter = self.sequence_filter
253
253
  begin
@@ -262,7 +262,7 @@ module Bio
262
262
  class KyotoIndex
263
263
  include KVHelpers
264
264
 
265
- attr_reader :db, :species, :species_max_id
265
+ attr_reader :db, :species, :species_max_id, :ref_only
266
266
  attr_accessor :index_sequences, :ref_seq
267
267
 
268
268
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
@@ -353,9 +353,9 @@ module Bio
353
353
  # @param [Parser] parser MAF parser for file to index
354
354
  # @param [String] path path to index file to create
355
355
  # @return [KyotoIndex]
356
- def self.build(parser, path)
356
+ def self.build(parser, path, ref_only=true)
357
357
  idx = self.new(path)
358
- idx.build_default(parser)
358
+ idx.build(parser, ref_only)
359
359
  return idx
360
360
  end
361
361
 
@@ -391,11 +391,11 @@ module Bio
391
391
  # @return [Enumerable<Block>] each matching {Block}, if no block given
392
392
  # @api public
393
393
  def find(intervals, parser, filter={}, &blk)
394
- # start = Time.now
394
+ start = Time.now
395
395
  fl = fetch_list(intervals, filter)
396
- # $stderr.printf("Built fetch list of %d items in %.3fs.\n",
397
- # fl.size,
398
- # Time.now - start)
396
+ LOG.debug { sprintf("Built fetch list of %d items in %.3fs.\n",
397
+ fl.size,
398
+ Time.now - start) }
399
399
  if ! fl.empty?
400
400
  parser.fetch_blocks(fl, &blk)
401
401
  else
@@ -426,6 +426,7 @@ module Bio
426
426
  def initialize(path, db_arg=nil)
427
427
  @species = {}
428
428
  @species_max_id = -1
429
+ @max_sid = -1
429
430
  if db_arg || ((path.size > 1) and File.exist?(path))
430
431
  mode = KyotoCabinet::DB::OREADER
431
432
  else
@@ -585,8 +586,8 @@ module Bio
585
586
  n_completed += 1
586
587
  end
587
588
  threads.each { |t| t.join }
588
- $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
589
- to_fetch.size, n_threads, Time.now - start)
589
+ LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.\n",
590
+ to_fetch.size, n_threads, Time.now - start) }
590
591
  to_fetch
591
592
  end
592
593
 
@@ -602,8 +603,8 @@ module Bio
602
603
  completed.put(result)
603
604
  rescue Exception => e
604
605
  completed.put(e)
605
- $stderr.puts "Worker failing: #{e.class}: #{e}"
606
- $stderr.puts e.backtrace.join("\n")
606
+ LOG.error "Worker failing: #{e.class}: #{e}"
607
+ LOG.error e
607
608
  raise e
608
609
  end
609
610
  end
@@ -658,17 +659,20 @@ module Bio
658
659
  || gi.include?(i_start)
659
660
  end
660
661
 
661
- def build_default(parser)
662
+ def build(parser, ref_only=true)
662
663
  first_block = parser.parse_block
663
664
  self.ref_seq = first_block.sequences.first.source
665
+ @ref_only = ref_only
664
666
  db[REF_SEQ_KEY] = ref_seq
665
667
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
666
- @index_sequences = { ref_seq => 0 }
667
- store_index_sequences!
668
+ @index_sequences = {}
668
669
  index_blocks([first_block])
669
- parser.enum_for(:each_block).each_slice(1000).each do |blocks|
670
+ n = 0
671
+ parser.each_block.each_slice(1000).each do |blocks|
670
672
  index_blocks(blocks)
673
+ n += blocks.size
671
674
  end
675
+ LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
672
676
  db.synchronize(true)
673
677
  end
674
678
 
@@ -685,12 +689,18 @@ module Bio
685
689
  h[name] = id
686
690
  end
687
691
  @index_sequences = h
692
+ @max_sid = @index_sequences.values.max
688
693
  end
689
694
 
690
- def store_index_sequences!
691
- index_sequences.each do |name, id|
692
- db.set("sequence:#{name}", id.to_s)
695
+ def seq_id_for(name)
696
+ sid = index_sequences[name]
697
+ if ! sid
698
+ @max_sid += 1
699
+ sid = @max_sid
700
+ db.set("sequence:#{name}", sid.to_s)
701
+ index_sequences[name] = sid
693
702
  end
703
+ return sid
694
704
  end
695
705
 
696
706
  def load_species
@@ -742,9 +752,9 @@ module Bio
742
752
  end
743
753
  h = {}
744
754
  val = build_block_value(block)
745
- block.sequences.each do |seq|
746
- seq_id = index_sequences[seq.source]
747
- next unless seq_id
755
+ to_index = ref_only ? [block.sequences.first] : block.sequences
756
+ to_index.each do |seq|
757
+ seq_id = seq_id_for(seq.source)
748
758
  seq_end = seq.start + seq.size
749
759
  bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
750
760
  key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
data/lib/bio/maf/maf.rb CHANGED
@@ -64,7 +64,6 @@ module Bio
64
64
  attr_reader :size
65
65
 
66
66
  def initialize(vars, sequences, offset, size, filtered)
67
- #raise ArgumentError, "no sequences given for block at offset #{offset}!" unless sequences && sequences.first
68
67
  @vars = vars
69
68
  @sequences = sequences
70
69
  @offset = offset
@@ -258,8 +257,13 @@ module Bio
258
257
  attr_accessor :quality
259
258
  alias_method :source_size, :src_size
260
259
 
261
- def initialize(*args)
262
- @source, @start, @size, @strand, @src_size, @text = args
260
+ def initialize(source, start, size, strand, src_size, text)
261
+ @source = source
262
+ @start = start
263
+ @size = size
264
+ @strand = strand
265
+ @src_size = src_size
266
+ @text = text
263
267
  end
264
268
 
265
269
  def end
@@ -354,9 +358,8 @@ module Bio
354
358
  Bio::BioAlignment::Sequence.new(source, text)
355
359
  end
356
360
 
357
- def write_fasta(writer)
358
- writer.write("#{source}:#{start}-#{start + size}",
359
- text)
361
+ def fasta_desc
362
+ "#{source}:#{start}-#{start + size}"
360
363
  end
361
364
 
362
365
  def joinable_with?(o)
@@ -441,7 +444,7 @@ module Bio
441
444
  attr_reader :status
442
445
 
443
446
  def initialize(*args)
444
- super(*args[0..4])
447
+ super(*(args[0..4] << nil))
445
448
  @status = args[5]
446
449
  end
447
450
 
@@ -5,7 +5,8 @@ require 'java' if RUBY_PLATFORM == 'java'
5
5
  module Bio
6
6
  # @api public
7
7
  module MAF
8
-
8
+ LOG = Bio::Log::LoggerPlus['bio-maf']
9
+
9
10
  # @api public
10
11
  class ParseError < Exception; end
11
12
 
@@ -601,10 +602,9 @@ module Bio
601
602
  end
602
603
  end
603
604
  elapsed = Time.now - start
604
- # TODO: debug log
605
- # rate = (total_size / 1048576.0) / elapsed
606
- # $stderr.printf("Fetched blocks in %.3fs, %.1f MB/s.\n",
607
- # elapsed, rate)
605
+ rate = (total_size / 1048576.0) / elapsed
606
+ LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
607
+ elapsed, rate) }
608
608
  end
609
609
 
610
610
  # Fetch and parse the blocks given by the merged fetch list, in
@@ -639,13 +639,13 @@ module Bio
639
639
  end
640
640
  threads.each { |t| t.join }
641
641
  elapsed = Time.now - start
642
- $stderr.printf("Fetched blocks from %d threads in %.1fs.\n",
643
- n_threads,
644
- elapsed)
642
+ LOG.debug { sprintf("Fetched blocks from %d threads in %.1fs.",
643
+ n_threads,
644
+ elapsed) }
645
645
  mb = total_size / 1048576.0
646
- $stderr.printf("%.3f MB processed (%.1f MB/s).\n",
647
- mb,
648
- mb / elapsed)
646
+ LOG.debug { sprintf("%.3f MB processed (%.1f MB/s).",
647
+ mb,
648
+ mb / elapsed) }
649
649
  end
650
650
 
651
651
  # Create a worker thread for parallel parsing.
@@ -667,8 +667,8 @@ module Bio
667
667
  end
668
668
  end
669
669
  rescue Exception => e
670
- $stderr.puts "Worker failing: #{e.class}: #{e}"
671
- $stderr.puts e.backtrace.join("\n")
670
+ LOG.error "Worker failing: #{e.class}: #{e}"
671
+ LOG.error e
672
672
  raise e
673
673
  end
674
674
  end
@@ -835,8 +835,8 @@ module Bio
835
835
  end
836
836
  queue.put(:eof)
837
837
  rescue
838
- $stderr.puts "worker exiting: #{$!.class}: #{$!}"
839
- $stderr.puts $!.backtrace.join("\n")
838
+ LOG.error "worker exiting: #{$!.class}: #{$!}"
839
+ LOG.error $!
840
840
  end
841
841
  end
842
842
  saw_eof = false
@@ -902,6 +902,28 @@ module Bio
902
902
  include MAFParsing
903
903
  end
904
904
 
905
+ def handle_logging_options(opts)
906
+ opts.on("--logger filename", String,
907
+ "Log to file (default STDOUT)") do |name|
908
+ Bio::Log::CLI.logger(name)
909
+ end
910
+ opts.on("--trace options", String,
911
+ "Set log level",
912
+ "(default INFO, see bio-logger)") do |s|
913
+ Bio::Log::CLI.trace(s)
914
+ end
915
+ opts.on("-q", "--quiet", "Run quietly") do
916
+ Bio::Log::CLI.trace('error')
917
+ end
918
+ opts.on("-v", "--verbose", "Run verbosely") do
919
+ Bio::Log::CLI.trace('info')
920
+ end
921
+ opts.on("--debug", "Run with extra debugging output") do
922
+ Bio::Log::CLI.trace('debug')
923
+ end
924
+ end
925
+ module_function :handle_logging_options
926
+
905
927
  end
906
928
 
907
929
  end