bio-maf 0.3.0-java → 0.3.2-java

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -13,29 +13,35 @@ PRINTERS = {
13
13
 
14
14
  $options = OpenStruct.new
15
15
  $options.mode = :build
16
- $options.reader = Bio::MAF::ChunkReader
16
+ $options.ref_only = true
17
+ $options.reader = if RUBY_PLATFORM == 'java'
18
+ Bio::MAF::ThreadedChunkReader
19
+ else
20
+ Bio::MAF::ChunkReader
21
+ end
17
22
 
18
23
  def build_index(maf, index)
19
24
  parser = Bio::MAF::Parser.new(maf,
20
25
  :chunk_reader => $options.reader,
21
26
  :parse_extended => false)
22
- idx = Bio::MAF::KyotoIndex.build(parser, index)
27
+ idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
23
28
  idx.close
24
29
  end
25
30
 
26
31
  op = OptionParser.new do |opts|
27
32
  opts.banner = "Usage: maf_index [options] <maf> <index>"
28
- #opts.separator ""
29
- #opts.separator "Options:"
33
+ opts.separator ""
34
+ opts.separator "Options:"
35
+ opts.on("-a", "--all", "Index all sequences, not just reference seq") do
36
+ $options.ref_only = false
37
+ end
38
+ Bio::MAF::handle_logging_options(opts)
30
39
  opts.on("--time", "print elapsed time") do
31
40
  $options.bench = true
32
41
  end
33
- opts.on("-d", "--dump") do
42
+ opts.on("-d", "--dump", "Dump contents of given INDEX") do
34
43
  $options.mode = :dump
35
44
  end
36
- opts.on("-t", "--threaded") do
37
- $options.reader = Bio::MAF::ThreadedChunkReader
38
- end
39
45
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
40
46
  require 'ruby-prof'
41
47
  if pspec =~ /(\w+):(.+)/
@@ -49,6 +55,7 @@ op = OptionParser.new do |opts|
49
55
  end
50
56
 
51
57
  op.parse!(ARGV)
58
+ Bio::Log::CLI.configure('bio-maf')
52
59
 
53
60
  maf_p = ARGV.shift if $options.mode == :build
54
61
  index_p = ARGV.shift
@@ -65,9 +65,11 @@ o_parser = OptionParser.new do |opts|
65
65
  "(requires --output-base)") do |bed|
66
66
  options.bed = bed
67
67
  end
68
+ Bio::MAF::handle_logging_options(opts)
68
69
  end
69
70
 
70
71
  o_parser.parse!(ARGV)
72
+ Bio::Log::CLI.configure('bio-maf')
71
73
 
72
74
  maf_p = ARGV.shift
73
75
  index_p = ARGV.shift
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'bio-maf'
4
- require 'bigbio'
5
4
  require 'optparse'
6
5
  require 'ostruct'
7
6
 
@@ -42,16 +41,14 @@ if options.profile_gc
42
41
  end
43
42
 
44
43
  parser = options.parser.new(src_path)
45
- writer = FastaWriter.new(dst_path)
44
+ File.open(dst_path, 'w') do |outf|
45
+ writer = Bio::MAF::FASTAWriter.new(outf)
46
46
 
47
- parser.each_block do |block|
48
- block.each_raw_seq do |seq|
49
- seq.write_fasta(writer)
47
+ parser.each_block do |block|
48
+ writer.write_block(block)
50
49
  end
51
50
  end
52
51
 
53
- writer.close
54
-
55
52
  if options.profile_gc
56
53
  $stderr.puts GC::Profiler.result
57
54
  GC::Profiler.disable
@@ -2,14 +2,13 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "bio-maf"
5
- s.version = "0.3.0"
5
+ s.version = "0.3.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Clayton Wheeler"]
9
- s.date = "2012-07-18"
9
+ s.date = "2012-07-26"
10
10
  s.description = "Multiple Alignment Format parser for BioRuby."
11
11
  s.email = "cswh@umich.edu"
12
- s.executables = ["maf_count", "maf_dump_blocks", "maf_extract_ranges_count", "maf_index", "maf_parse_bench", "maf_to_fasta", "maf_write", "random_ranges"]
13
12
  s.extra_rdoc_files = [
14
13
  "LICENSE.txt",
15
14
  "README.md"
@@ -33,8 +32,8 @@ Gem::Specification.new do |s|
33
32
  end
34
33
 
35
34
  s.add_runtime_dependency('bio-alignment', ["~> 0.0.7"])
36
- s.add_runtime_dependency('bio-bigbio', [">= 0"])
37
35
  s.add_runtime_dependency('bio-genomic-interval', ["~> 0.1.2"])
36
+ s.add_runtime_dependency('bio-logger', ["~> 1.0.1"])
38
37
  if RUBY_PLATFORM == 'java'
39
38
  s.add_runtime_dependency('kyotocabinet-java', ["~> 0.3.0"])
40
39
  else
@@ -30,6 +30,15 @@ Feature: Indexed access to MAF files
30
30
  And sequence mm8.chr7 of block 0 has start 80082592
31
31
  And sequence mm8.chr7 of block 1 has start 80082713
32
32
 
33
+ Scenario: Extract alignment blocks by chromosomal range on non-ref sequence
34
+ Given a MAF source file "mm8_chr7_tiny.maf"
35
+ When I open it with a MAF reader
36
+ And build an index on all sequences
37
+ And search for blocks between positions 136011819 and 136012026 of rn4.chr1
38
+ Then 2 blocks are obtained
39
+ And sequence mm8.chr7 of block 0 has start 80082368
40
+ And sequence mm8.chr7 of block 1 has start 80082471
41
+
33
42
  @no_jruby
34
43
  Scenario: Build MAF index with CLI tool
35
44
  Given test files:
@@ -40,6 +49,17 @@ Feature: Indexed access to MAF files
40
49
  """
41
50
  And a file named "mm8_chr7_tiny.kct" should exist
42
51
 
52
+ @no_jruby
53
+ Scenario: Build MAF index on all sequences with CLI tool
54
+ Given test files:
55
+ | mm8_chr7_tiny.maf |
56
+ When I run `maf_index --all mm8_chr7_tiny.maf mm8_chr7_tiny.kct`
57
+ And I run `maf_index -d mm8_chr7_tiny.kct`
58
+ Then it should pass with regex:
59
+ """
60
+ 9 \[bin 585\] 594:631
61
+ """
62
+
43
63
  @no_jruby
44
64
  Scenario: Dump MAF index with CLI tool
45
65
  Given test files:
@@ -51,4 +71,4 @@ Feature: Indexed access to MAF files
51
71
  0 \[bin 1195\] 80082334:80082368
52
72
  """
53
73
 
54
-
74
+
@@ -1,5 +1,3 @@
1
- require 'bigbio' # FASTA support
2
-
3
1
  Given /^a MAF source file "(.*?)"$/ do |src|
4
2
  @src_f = $test_data + src
5
3
  @src_f.exist?.should be_true
@@ -13,15 +11,12 @@ end
13
11
 
14
12
  When /^I select FASTA output$/ do
15
13
  @dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
16
- @dst.close
17
- @writer = FastaWriter.new(@dst.path)
14
+ @writer = Bio::MAF::FASTAWriter.new(@dst)
18
15
  end
19
16
 
20
17
  When /^process the file$/ do
21
18
  @parser.each_block do |block|
22
- block.each_raw_seq do |seq|
23
- seq.write_fasta(@writer)
24
- end
19
+ @writer.write_block(block)
25
20
  end
26
21
  @writer.close
27
22
  end
@@ -2,6 +2,10 @@ When /^build an index on the reference sequence$/ do
2
2
  @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
3
3
  end
4
4
 
5
+ When /^build an index on all sequences$/ do
6
+ @idx = Bio::MAF::KyotoIndex.build(@parser, '%', false)
7
+ end
8
+
5
9
  Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
6
10
  @idx = Bio::MAF::KyotoIndex.open($test_data + name)
7
11
  end
@@ -8,5 +8,10 @@
8
8
  #
9
9
  # In this file only require other files. Avoid other source code.
10
10
 
11
+ require 'bio-logger'
12
+ log = Bio::Log::LoggerPlus.new('bio-maf')
13
+ log.outputters = Bio::Log::Outputter.stderr
14
+ log.level = Bio::Log::WARN
15
+
11
16
  require 'bio/ucsc'
12
17
  require 'bio/maf'
@@ -207,7 +207,7 @@ module Bio
207
207
  # (could build a real one, too...)
208
208
  maf = options[:maf]
209
209
  parser = Parser.new(maf, @parse_options)
210
- # $stderr.puts "WARNING: building temporary index on #{maf}."
210
+ LOG.warn { "WARNING: building temporary index on #{maf}." }
211
211
  index = KyotoIndex.build(parser, '%')
212
212
  register_index(index, maf)
213
213
  end
@@ -247,7 +247,7 @@ module Bio
247
247
 
248
248
  # @api private
249
249
  def with_parser(chrom)
250
- # $stderr.puts "Creating parser with options #{@parse_options.inspect}"
250
+ LOG.debug { "Creating parser with options #{@parse_options.inspect}" }
251
251
  parser = Parser.new(@maf_by_chrom[chrom], @parse_options)
252
252
  parser.sequence_filter = self.sequence_filter
253
253
  begin
@@ -262,7 +262,7 @@ module Bio
262
262
  class KyotoIndex
263
263
  include KVHelpers
264
264
 
265
- attr_reader :db, :species, :species_max_id
265
+ attr_reader :db, :species, :species_max_id, :ref_only
266
266
  attr_accessor :index_sequences, :ref_seq
267
267
 
268
268
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
@@ -353,9 +353,9 @@ module Bio
353
353
  # @param [Parser] parser MAF parser for file to index
354
354
  # @param [String] path path to index file to create
355
355
  # @return [KyotoIndex]
356
- def self.build(parser, path)
356
+ def self.build(parser, path, ref_only=true)
357
357
  idx = self.new(path)
358
- idx.build_default(parser)
358
+ idx.build(parser, ref_only)
359
359
  return idx
360
360
  end
361
361
 
@@ -391,11 +391,11 @@ module Bio
391
391
  # @return [Enumerable<Block>] each matching {Block}, if no block given
392
392
  # @api public
393
393
  def find(intervals, parser, filter={}, &blk)
394
- # start = Time.now
394
+ start = Time.now
395
395
  fl = fetch_list(intervals, filter)
396
- # $stderr.printf("Built fetch list of %d items in %.3fs.\n",
397
- # fl.size,
398
- # Time.now - start)
396
+ LOG.debug { sprintf("Built fetch list of %d items in %.3fs.\n",
397
+ fl.size,
398
+ Time.now - start) }
399
399
  if ! fl.empty?
400
400
  parser.fetch_blocks(fl, &blk)
401
401
  else
@@ -426,6 +426,7 @@ module Bio
426
426
  def initialize(path, db_arg=nil)
427
427
  @species = {}
428
428
  @species_max_id = -1
429
+ @max_sid = -1
429
430
  if db_arg || ((path.size > 1) and File.exist?(path))
430
431
  mode = KyotoCabinet::DB::OREADER
431
432
  else
@@ -585,8 +586,8 @@ module Bio
585
586
  n_completed += 1
586
587
  end
587
588
  threads.each { |t| t.join }
588
- $stderr.printf("Matched %d index records with %d threads in %.3f seconds.\n",
589
- to_fetch.size, n_threads, Time.now - start)
589
+ LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.\n",
590
+ to_fetch.size, n_threads, Time.now - start) }
590
591
  to_fetch
591
592
  end
592
593
 
@@ -602,8 +603,8 @@ module Bio
602
603
  completed.put(result)
603
604
  rescue Exception => e
604
605
  completed.put(e)
605
- $stderr.puts "Worker failing: #{e.class}: #{e}"
606
- $stderr.puts e.backtrace.join("\n")
606
+ LOG.error "Worker failing: #{e.class}: #{e}"
607
+ LOG.error e
607
608
  raise e
608
609
  end
609
610
  end
@@ -658,17 +659,20 @@ module Bio
658
659
  || gi.include?(i_start)
659
660
  end
660
661
 
661
- def build_default(parser)
662
+ def build(parser, ref_only=true)
662
663
  first_block = parser.parse_block
663
664
  self.ref_seq = first_block.sequences.first.source
665
+ @ref_only = ref_only
664
666
  db[REF_SEQ_KEY] = ref_seq
665
667
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
666
- @index_sequences = { ref_seq => 0 }
667
- store_index_sequences!
668
+ @index_sequences = {}
668
669
  index_blocks([first_block])
669
- parser.enum_for(:each_block).each_slice(1000).each do |blocks|
670
+ n = 0
671
+ parser.each_block.each_slice(1000).each do |blocks|
670
672
  index_blocks(blocks)
673
+ n += blocks.size
671
674
  end
675
+ LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
672
676
  db.synchronize(true)
673
677
  end
674
678
 
@@ -685,12 +689,18 @@ module Bio
685
689
  h[name] = id
686
690
  end
687
691
  @index_sequences = h
692
+ @max_sid = @index_sequences.values.max
688
693
  end
689
694
 
690
- def store_index_sequences!
691
- index_sequences.each do |name, id|
692
- db.set("sequence:#{name}", id.to_s)
695
+ def seq_id_for(name)
696
+ sid = index_sequences[name]
697
+ if ! sid
698
+ @max_sid += 1
699
+ sid = @max_sid
700
+ db.set("sequence:#{name}", sid.to_s)
701
+ index_sequences[name] = sid
693
702
  end
703
+ return sid
694
704
  end
695
705
 
696
706
  def load_species
@@ -742,9 +752,9 @@ module Bio
742
752
  end
743
753
  h = {}
744
754
  val = build_block_value(block)
745
- block.sequences.each do |seq|
746
- seq_id = index_sequences[seq.source]
747
- next unless seq_id
755
+ to_index = ref_only ? [block.sequences.first] : block.sequences
756
+ to_index.each do |seq|
757
+ seq_id = seq_id_for(seq.source)
748
758
  seq_end = seq.start + seq.size
749
759
  bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
750
760
  key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
@@ -64,7 +64,6 @@ module Bio
64
64
  attr_reader :size
65
65
 
66
66
  def initialize(vars, sequences, offset, size, filtered)
67
- #raise ArgumentError, "no sequences given for block at offset #{offset}!" unless sequences && sequences.first
68
67
  @vars = vars
69
68
  @sequences = sequences
70
69
  @offset = offset
@@ -258,8 +257,13 @@ module Bio
258
257
  attr_accessor :quality
259
258
  alias_method :source_size, :src_size
260
259
 
261
- def initialize(*args)
262
- @source, @start, @size, @strand, @src_size, @text = args
260
+ def initialize(source, start, size, strand, src_size, text)
261
+ @source = source
262
+ @start = start
263
+ @size = size
264
+ @strand = strand
265
+ @src_size = src_size
266
+ @text = text
263
267
  end
264
268
 
265
269
  def end
@@ -354,9 +358,8 @@ module Bio
354
358
  Bio::BioAlignment::Sequence.new(source, text)
355
359
  end
356
360
 
357
- def write_fasta(writer)
358
- writer.write("#{source}:#{start}-#{start + size}",
359
- text)
361
+ def fasta_desc
362
+ "#{source}:#{start}-#{start + size}"
360
363
  end
361
364
 
362
365
  def joinable_with?(o)
@@ -441,7 +444,7 @@ module Bio
441
444
  attr_reader :status
442
445
 
443
446
  def initialize(*args)
444
- super(*args[0..4])
447
+ super(*(args[0..4] << nil))
445
448
  @status = args[5]
446
449
  end
447
450
 
@@ -5,7 +5,8 @@ require 'java' if RUBY_PLATFORM == 'java'
5
5
  module Bio
6
6
  # @api public
7
7
  module MAF
8
-
8
+ LOG = Bio::Log::LoggerPlus['bio-maf']
9
+
9
10
  # @api public
10
11
  class ParseError < Exception; end
11
12
 
@@ -601,10 +602,9 @@ module Bio
601
602
  end
602
603
  end
603
604
  elapsed = Time.now - start
604
- # TODO: debug log
605
- # rate = (total_size / 1048576.0) / elapsed
606
- # $stderr.printf("Fetched blocks in %.3fs, %.1f MB/s.\n",
607
- # elapsed, rate)
605
+ rate = (total_size / 1048576.0) / elapsed
606
+ LOG.debug { sprintf("Fetched blocks in %.3fs, %.1f MB/s.",
607
+ elapsed, rate) }
608
608
  end
609
609
 
610
610
  # Fetch and parse the blocks given by the merged fetch list, in
@@ -639,13 +639,13 @@ module Bio
639
639
  end
640
640
  threads.each { |t| t.join }
641
641
  elapsed = Time.now - start
642
- $stderr.printf("Fetched blocks from %d threads in %.1fs.\n",
643
- n_threads,
644
- elapsed)
642
+ LOG.debug { sprintf("Fetched blocks from %d threads in %.1fs.",
643
+ n_threads,
644
+ elapsed) }
645
645
  mb = total_size / 1048576.0
646
- $stderr.printf("%.3f MB processed (%.1f MB/s).\n",
647
- mb,
648
- mb / elapsed)
646
+ LOG.debug { sprintf("%.3f MB processed (%.1f MB/s).",
647
+ mb,
648
+ mb / elapsed) }
649
649
  end
650
650
 
651
651
  # Create a worker thread for parallel parsing.
@@ -667,8 +667,8 @@ module Bio
667
667
  end
668
668
  end
669
669
  rescue Exception => e
670
- $stderr.puts "Worker failing: #{e.class}: #{e}"
671
- $stderr.puts e.backtrace.join("\n")
670
+ LOG.error "Worker failing: #{e.class}: #{e}"
671
+ LOG.error e
672
672
  raise e
673
673
  end
674
674
  end
@@ -835,8 +835,8 @@ module Bio
835
835
  end
836
836
  queue.put(:eof)
837
837
  rescue
838
- $stderr.puts "worker exiting: #{$!.class}: #{$!}"
839
- $stderr.puts $!.backtrace.join("\n")
838
+ LOG.error "worker exiting: #{$!.class}: #{$!}"
839
+ LOG.error $!
840
840
  end
841
841
  end
842
842
  saw_eof = false
@@ -902,6 +902,28 @@ module Bio
902
902
  include MAFParsing
903
903
  end
904
904
 
905
+ def handle_logging_options(opts)
906
+ opts.on("--logger filename", String,
907
+ "Log to file (default STDOUT)") do |name|
908
+ Bio::Log::CLI.logger(name)
909
+ end
910
+ opts.on("--trace options", String,
911
+ "Set log level",
912
+ "(default INFO, see bio-logger)") do |s|
913
+ Bio::Log::CLI.trace(s)
914
+ end
915
+ opts.on("-q", "--quiet", "Run quietly") do
916
+ Bio::Log::CLI.trace('error')
917
+ end
918
+ opts.on("-v", "--verbose", "Run verbosely") do
919
+ Bio::Log::CLI.trace('info')
920
+ end
921
+ opts.on("--debug", "Run with extra debugging output") do
922
+ Bio::Log::CLI.trace('debug')
923
+ end
924
+ end
925
+ module_function :handle_logging_options
926
+
905
927
  end
906
928
 
907
929
  end