bio-maf 1.0.0-java → 1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
    
        data/bin/maf_bgzip
    CHANGED
    
    | @@ -1,5 +1,6 @@ | |
| 1 1 | 
             
            #!/usr/bin/env ruby
         | 
| 2 2 |  | 
| 3 | 
            +
            require 'optparse'
         | 
| 3 4 | 
             
            require 'ostruct'
         | 
| 4 5 |  | 
| 5 6 | 
             
            require 'bio-maf'
         | 
| @@ -8,6 +9,9 @@ require 'bio-bgzf' | |
| 8 9 | 
             
            $options = OpenStruct.new
         | 
| 9 10 | 
             
            $options.dir = '.'
         | 
| 10 11 | 
             
            $options.ref_only = true
         | 
| 12 | 
            +
            $options.n_jobs = 1
         | 
| 13 | 
            +
            $options.force = false
         | 
| 14 | 
            +
            $options.level = 2
         | 
| 11 15 |  | 
| 12 16 | 
             
            op = OptionParser.new do |opts|
         | 
| 13 17 | 
             
              opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
         | 
| @@ -26,31 +30,155 @@ op = OptionParser.new do |opts| | |
| 26 30 | 
             
                      "(has no effect without --index)") do
         | 
| 27 31 | 
             
                $options.ref_only = false
         | 
| 28 32 | 
             
              end
         | 
| 33 | 
            +
              opts.on("-l", "--level LEVEL", Integer,
         | 
| 34 | 
            +
                      "gzip compression level for BGZF (1-9)") do |level|
         | 
| 35 | 
            +
                unless 1 <= level && level <= 9
         | 
| 36 | 
            +
                  $stderr.puts "Invalid compression level: #{level}"
         | 
| 37 | 
            +
                  $stderr.puts opts
         | 
| 38 | 
            +
                  exit 2
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
                $options.level = level
         | 
| 41 | 
            +
              end
         | 
| 42 | 
            +
              opts.on("-f", "--force",
         | 
| 43 | 
            +
                      "Replace output files if they already exist") do
         | 
| 44 | 
            +
                $options.force = true
         | 
| 45 | 
            +
              end
         | 
| 46 | 
            +
              opts.on("-j", "--jobs N", Integer,
         | 
| 47 | 
            +
                      "Run N concurrent jobs (default 1)") do |n|
         | 
| 48 | 
            +
                $options.n_jobs = n
         | 
| 49 | 
            +
              end
         | 
| 50 | 
            +
              Bio::MAF::handle_logging_options(opts)
         | 
| 29 51 | 
             
            end
         | 
| 30 52 |  | 
| 31 53 | 
             
            op.parse!(ARGV)
         | 
| 54 | 
            +
            Bio::Log::CLI.configure('bio-maf')
         | 
| 32 55 |  | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 56 | 
            +
            INTERVAL = 10
         | 
| 57 | 
            +
            LOG = Bio::MAF::LOG
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            def make_processing_task(maf)
         | 
| 60 | 
            +
              maf_base = File.basename(maf)
         | 
| 36 61 | 
             
              base = maf_base.gsub(/\.maf.*/, '')
         | 
| 37 62 | 
             
              bgz_path = "#{$options.dir}/#{base}.maf.bgz"
         | 
| 63 | 
            +
              if File.exist?(bgz_path) && ! $options.force
         | 
| 64 | 
            +
                LOG.error "#{bgz_path} already exists, refusing to overwrite " \
         | 
| 65 | 
            +
                "without --force!"
         | 
| 66 | 
            +
                exit 1
         | 
| 67 | 
            +
              end
         | 
| 68 | 
            +
              idx_path = nil
         | 
| 69 | 
            +
              if $options.index
         | 
| 70 | 
            +
                idx_path = "#{$options.dir}/#{base}.kct"
         | 
| 71 | 
            +
                if File.exist?(idx_path) && ! $options.force
         | 
| 72 | 
            +
                  LOG.error "#{idx_path} already exists, refusing to overwrite " \
         | 
| 73 | 
            +
                  "without --force!"
         | 
| 74 | 
            +
                  exit 1
         | 
| 75 | 
            +
                end
         | 
| 76 | 
            +
              end
         | 
| 77 | 
            +
              lambda { process_maf(maf, bgz_path, idx_path) }
         | 
| 78 | 
            +
            end
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            def process_maf(maf_path, bgz_path, idx_path)
         | 
| 81 | 
            +
              maf_base = File.basename(maf_path)
         | 
| 82 | 
            +
              LOG.debug { "Processing #{maf_base}." }
         | 
| 38 83 | 
             
              p = Bio::MAF::Parser.new(maf_path,
         | 
| 39 | 
            -
                                       : | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 84 | 
            +
                                       :retain_text => true)
         | 
| 85 | 
            +
              if idx_path
         | 
| 86 | 
            +
                if File.exists?(idx_path)
         | 
| 87 | 
            +
                  File.unlink(idx_path)
         | 
| 88 | 
            +
                end
         | 
| 89 | 
            +
                idx = Bio::MAF::KyotoIndex.new(idx_path)
         | 
| 90 | 
            +
                idx.prep(bgz_path, :bgzf, $options.ref_only)
         | 
| 91 | 
            +
                exec = Bio::MAF::Executor.create
         | 
| 92 | 
            +
              end
         | 
| 93 | 
            +
              start_t = Time.now
         | 
| 94 | 
            +
              last_t = start_t
         | 
| 95 | 
            +
              last_pos = 0
         | 
| 96 | 
            +
              n_blocks = 0
         | 
| 97 | 
            +
              maf_size = File.size(maf_path)
         | 
| 98 | 
            +
              File.open(bgz_path, 'wb') do |out_f|
         | 
| 99 | 
            +
                Bio::BGZF::Writer.new(out_f, $options.level) do |bgz_w|
         | 
| 43 100 | 
             
                  maf_w = Bio::MAF::Writer.new(bgz_w)
         | 
| 44 101 | 
             
                  maf_w.write_header(p.header)
         | 
| 45 102 | 
             
                  p.each_block do |block|
         | 
| 46 | 
            -
                     | 
| 103 | 
            +
                    bgz_w.write(block.orig_text)
         | 
| 104 | 
            +
                    if idx
         | 
| 105 | 
            +
                      block.offset = bgz_w.last_write_pos
         | 
| 106 | 
            +
                      exec.submit do
         | 
| 107 | 
            +
                        idx.index_blocks([block])
         | 
| 108 | 
            +
                      end
         | 
| 109 | 
            +
                    end
         | 
| 110 | 
            +
                    n_blocks += 1
         | 
| 111 | 
            +
                    if n_blocks % 100 == 0
         | 
| 112 | 
            +
                      cur_t = Time.now
         | 
| 113 | 
            +
                      delta_t = cur_t - last_t
         | 
| 114 | 
            +
                      if delta_t > INTERVAL
         | 
| 115 | 
            +
                        cur_pos = p.phys_f.tell
         | 
| 116 | 
            +
                        LOG.debug {
         | 
| 117 | 
            +
                          pos_mb = cur_pos.to_f / 1048576
         | 
| 118 | 
            +
                          delta_bytes = cur_pos - last_pos
         | 
| 119 | 
            +
                          rate = delta_bytes.to_f / delta_t
         | 
| 120 | 
            +
                          mb_rate = rate / 1048576
         | 
| 121 | 
            +
                          pct = cur_pos.to_f / maf_size * 100
         | 
| 122 | 
            +
                          elapsed = cur_t - start_t
         | 
| 123 | 
            +
                          sprintf("%s: processed %.1f MB (%.1f%%) in %ds, %.2f MB/s.",
         | 
| 124 | 
            +
                                            maf_base,
         | 
| 125 | 
            +
                                            pos_mb,
         | 
| 126 | 
            +
                                            pct,
         | 
| 127 | 
            +
                                            elapsed,
         | 
| 128 | 
            +
                                  mb_rate)
         | 
| 129 | 
            +
                        }
         | 
| 130 | 
            +
                        last_t = cur_t
         | 
| 131 | 
            +
                        last_pos = cur_pos
         | 
| 132 | 
            +
                      end
         | 
| 133 | 
            +
                    end
         | 
| 47 134 | 
             
                  end
         | 
| 48 135 | 
             
                end
         | 
| 49 136 | 
             
              end
         | 
| 137 | 
            +
              unc = p.f.tell if p.f != p.phys_f
         | 
| 50 138 | 
             
              p.close
         | 
| 51 | 
            -
              if  | 
| 52 | 
            -
                 | 
| 53 | 
            -
                 | 
| 54 | 
            -
             | 
| 139 | 
            +
              if idx
         | 
| 140 | 
            +
                exec.shutdown
         | 
| 141 | 
            +
                idx.db.synchronize(true)
         | 
| 142 | 
            +
              end
         | 
| 143 | 
            +
              elapsed = Time.now - start_t
         | 
| 144 | 
            +
              mb = maf_size.to_f / 1048576
         | 
| 145 | 
            +
              mb_rate = mb / elapsed
         | 
| 146 | 
            +
              LOG.info { sprintf("Processed %s (%.1f MB) in %ds, %.2f MB/s",
         | 
| 147 | 
            +
                                 maf_base,
         | 
| 148 | 
            +
                                 mb,
         | 
| 149 | 
            +
                                 elapsed,
         | 
| 150 | 
            +
                                 mb_rate) }
         | 
| 151 | 
            +
              if unc
         | 
| 152 | 
            +
                LOG.info {
         | 
| 153 | 
            +
                  unc_mb = unc / 1048576
         | 
| 154 | 
            +
                  unc_rate = unc_mb / elapsed
         | 
| 155 | 
            +
                  sprintf("  Uncompressed: %.1f MB, %.2f MB/s",
         | 
| 156 | 
            +
                          unc_mb, unc_rate)
         | 
| 157 | 
            +
                }
         | 
| 55 158 | 
             
              end
         | 
| 159 | 
            +
              LOG.info {
         | 
| 160 | 
            +
                raw_size = unc || maf_size
         | 
| 161 | 
            +
                avg_block_kb = raw_size.to_f / n_blocks / 1024
         | 
| 162 | 
            +
                sprintf("  %d alignment blocks, average size %.2f KB",
         | 
| 163 | 
            +
                        n_blocks, avg_block_kb)
         | 
| 164 | 
            +
              }
         | 
| 165 | 
            +
              LOG.info {
         | 
| 166 | 
            +
                orig_size = unc ? unc : maf_size
         | 
| 167 | 
            +
                bgzf_size = File.size(bgz_path).to_f
         | 
| 168 | 
            +
                ratio = bgzf_size / orig_size
         | 
| 169 | 
            +
                sprintf("  Compressed with BGZF (level=%d) to %.1f MB (%.1fx)",
         | 
| 170 | 
            +
                        $options.level,
         | 
| 171 | 
            +
                        bgzf_size / 1048576,
         | 
| 172 | 
            +
                        ratio)
         | 
| 173 | 
            +
              }
         | 
| 174 | 
            +
            end
         | 
| 175 | 
            +
             | 
| 176 | 
            +
            runner = Bio::MAF::JobRunner.create($options.n_jobs)
         | 
| 177 | 
            +
            LOG.debug "Created #{runner.class} set for #{$options.n_jobs} concurrent jobs."
         | 
| 178 | 
            +
            ARGV.each do |maf|
         | 
| 179 | 
            +
              task = make_processing_task(maf)
         | 
| 180 | 
            +
              runner.add(&task)
         | 
| 56 181 | 
             
            end
         | 
| 182 | 
            +
            LOG.debug "Running jobs."
         | 
| 183 | 
            +
            runner.run
         | 
| 184 | 
            +
            LOG.debug "Finished processing."
         | 
    
        data/bin/maf_extract
    CHANGED
    
    | @@ -6,12 +6,13 @@ require 'ostruct' | |
| 6 6 |  | 
| 7 7 | 
             
            include Bio::MAF
         | 
| 8 8 |  | 
| 9 | 
            -
            options = OpenStruct.new
         | 
| 10 | 
            -
            options.mode = :intersect
         | 
| 11 | 
            -
            options.format = :maf
         | 
| 12 | 
            -
            options. | 
| 13 | 
            -
            options. | 
| 14 | 
            -
            options. | 
| 9 | 
            +
            $options = OpenStruct.new
         | 
| 10 | 
            +
            $options.mode = :intersect
         | 
| 11 | 
            +
            $options.format = :maf
         | 
| 12 | 
            +
            $options.one_based = false
         | 
| 13 | 
            +
            $options.seq_filter = {}
         | 
| 14 | 
            +
            $options.block_filter = {}
         | 
| 15 | 
            +
            $options.parse_options = {}
         | 
| 15 16 |  | 
| 16 17 | 
             
            def handle_list_spec(spec)
         | 
| 17 18 | 
             
              if spec =~ /^@(.+)/
         | 
| @@ -23,7 +24,11 @@ end | |
| 23 24 |  | 
| 24 25 | 
             
            def handle_interval_spec(int)
         | 
| 25 26 | 
             
              if int =~ /(.+):(\d+)-(\d+)/
         | 
| 26 | 
            -
                 | 
| 27 | 
            +
                if $options.one_based
         | 
| 28 | 
            +
                  Bio::GenomicInterval.new($1, $2.to_i, $3.to_i)
         | 
| 29 | 
            +
                else
         | 
| 30 | 
            +
                  Bio::GenomicInterval.zero_based($1, $2.to_i, $3.to_i)
         | 
| 31 | 
            +
                end
         | 
| 27 32 | 
             
              else
         | 
| 28 33 | 
             
                raise "Invalid interval specification: #{int}"
         | 
| 29 34 | 
             
              end
         | 
| @@ -34,13 +39,13 @@ $op = OptionParser.new do |opts| | |
| 34 39 | 
             
              opts.separator ""
         | 
| 35 40 | 
             
              opts.separator "MAF source options (either --maf or --maf-dir must be given):"
         | 
| 36 41 | 
             
              opts.on("-m", "--maf MAF", "MAF file") do |maf|
         | 
| 37 | 
            -
                options.maf = maf
         | 
| 42 | 
            +
                $options.maf = maf
         | 
| 38 43 | 
             
              end
         | 
| 39 44 | 
             
              opts.on("-i", "--index INDEX", "MAF index") do |idx|
         | 
| 40 | 
            -
                options.idx = idx
         | 
| 45 | 
            +
                $options.idx = idx
         | 
| 41 46 | 
             
              end
         | 
| 42 47 | 
             
              opts.on("-d", "--maf-dir DIR", "MAF directory") do |dir|
         | 
| 43 | 
            -
                options.maf_dir = dir
         | 
| 48 | 
            +
                $options.maf_dir = dir
         | 
| 44 49 | 
             
              end
         | 
| 45 50 | 
             
              opts.separator ""
         | 
| 46 51 | 
             
              opts.separator "Extraction options:"
         | 
| @@ -49,21 +54,26 @@ $op = OptionParser.new do |opts| | |
| 49 54 | 
             
                      "blocks intersecting the given region,",
         | 
| 50 55 | 
             
                      "or 'slice' to extract subsets covering ",
         | 
| 51 56 | 
             
                      "given regions") do |mode|
         | 
| 52 | 
            -
                options.mode = mode
         | 
| 57 | 
            +
                $options.mode = mode
         | 
| 53 58 | 
             
              end
         | 
| 54 59 | 
             
              opts.on("--bed BED", "Use intervals from the given BED file") do |bed|
         | 
| 55 | 
            -
                options.bed = bed
         | 
| 60 | 
            +
                $options.bed = bed
         | 
| 56 61 | 
             
              end
         | 
| 57 62 | 
             
              opts.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |int|
         | 
| 58 | 
            -
                options.interval = handle_interval_spec(int)
         | 
| 63 | 
            +
                $options.interval = handle_interval_spec(int)
         | 
| 64 | 
            +
              end
         | 
| 65 | 
            +
              opts.on("--one-based",
         | 
| 66 | 
            +
                      "Treat all intervals as one-based",
         | 
| 67 | 
            +
                      "(even from BED files, contrary to the standard)") do
         | 
| 68 | 
            +
                $options.one_based = true
         | 
| 59 69 | 
             
              end
         | 
| 60 70 | 
             
              opts.separator ""
         | 
| 61 71 | 
             
              opts.separator "Output options:"
         | 
| 62 72 | 
             
              opts.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |fmt|
         | 
| 63 | 
            -
                options.format = fmt
         | 
| 73 | 
            +
                $options.format = fmt
         | 
| 64 74 | 
             
              end
         | 
| 65 75 | 
             
              opts.on("-o", "--output OUT", "Write output to file OUT") do |out|
         | 
| 66 | 
            -
                options.out_path = out
         | 
| 76 | 
            +
                $options.out_path = out
         | 
| 67 77 | 
             
              end
         | 
| 68 78 | 
             
              opts.separator ""
         | 
| 69 79 | 
             
              opts.separator "Filtering options:"
         | 
| @@ -71,41 +81,41 @@ $op = OptionParser.new do |opts| | |
| 71 81 | 
             
                      "Filter out all but the species in the",
         | 
| 72 82 | 
             
                      "given comma-separated list",
         | 
| 73 83 | 
             
                      "(or @FILE to read from a file)") do |spec|
         | 
| 74 | 
            -
                options.seq_filter[:only_species] = handle_list_spec(spec)
         | 
| 84 | 
            +
                $options.seq_filter[:only_species] = handle_list_spec(spec)
         | 
| 75 85 | 
             
              end
         | 
| 76 86 | 
             
              opts.on("--with-all-species SPECIES",
         | 
| 77 87 | 
             
                      "Only match blocks with all the given",
         | 
| 78 88 | 
             
                      "species, comma-separated",
         | 
| 79 89 | 
             
                      "(or @FILE to read from a file)") do |spec|
         | 
| 80 | 
            -
                options.block_filter[:with_all_species] = handle_list_spec(spec)
         | 
| 90 | 
            +
                $options.block_filter[:with_all_species] = handle_list_spec(spec)
         | 
| 81 91 | 
             
              end
         | 
| 82 92 | 
             
              opts.on("--min-sequences N", Integer,
         | 
| 83 93 | 
             
                      "Match only blocks with at least N sequences") do |n|
         | 
| 84 | 
            -
                options.block_filter[:at_least_n_sequences] = n
         | 
| 94 | 
            +
                $options.block_filter[:at_least_n_sequences] = n
         | 
| 85 95 | 
             
              end
         | 
| 86 96 | 
             
              opts.on("--min-text-size N", Integer,
         | 
| 87 97 | 
             
                      "Match only blocks with minimum text size N") do |n|
         | 
| 88 | 
            -
                options.block_filter[:min_size] = n
         | 
| 98 | 
            +
                $options.block_filter[:min_size] = n
         | 
| 89 99 | 
             
              end
         | 
| 90 100 | 
             
              opts.on("--max-text-size N", Integer,
         | 
| 91 101 | 
             
                      "Match only blocks with maximum text size N") do |n|
         | 
| 92 | 
            -
                options.block_filter[:max_size] = n
         | 
| 102 | 
            +
                $options.block_filter[:max_size] = n
         | 
| 93 103 | 
             
              end
         | 
| 94 104 | 
             
              opts.separator ""
         | 
| 95 105 | 
             
              opts.separator "Block processing options:"
         | 
| 96 106 | 
             
              opts.on("--join-blocks",
         | 
| 97 107 | 
             
                      "Join blocks if appropriate after filtering",
         | 
| 98 108 | 
             
                      "out sequences") do
         | 
| 99 | 
            -
                options.parse_options[:join_blocks] = true
         | 
| 109 | 
            +
                $options.parse_options[:join_blocks] = true
         | 
| 100 110 | 
             
              end
         | 
| 101 111 | 
             
              opts.on("--remove-gaps", "Remove gaps after filtering out sequences") do
         | 
| 102 | 
            -
                options.parse_options[:remove_gaps] = true
         | 
| 112 | 
            +
                $options.parse_options[:remove_gaps] = true
         | 
| 103 113 | 
             
              end
         | 
| 104 114 | 
             
              opts.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
         | 
| 105 | 
            -
                options.parse_options[:parse_extended] = true
         | 
| 115 | 
            +
                $options.parse_options[:parse_extended] = true
         | 
| 106 116 | 
             
              end
         | 
| 107 117 | 
             
              opts.on("--parse-empty", "Parse empty (e) lines of MAF data") do
         | 
| 108 | 
            -
                options.parse_options[:parse_empty] = true
         | 
| 118 | 
            +
                $options.parse_options[:parse_empty] = true
         | 
| 109 119 | 
             
              end
         | 
| 110 120 | 
             
              opts.separator ""
         | 
| 111 121 | 
             
              opts.separator "Logging options:"
         | 
| @@ -120,24 +130,24 @@ def usage(msg) | |
| 120 130 | 
             
              exit 2
         | 
| 121 131 | 
             
            end
         | 
| 122 132 |  | 
| 123 | 
            -
            if options.maf
         | 
| 124 | 
            -
              access = Access.file(options.maf, options.idx, options.parse_options)
         | 
| 125 | 
            -
            elsif options.maf_dir
         | 
| 126 | 
            -
              access = Access.maf_dir(options.maf_dir, options.parse_options)
         | 
| 133 | 
            +
            if $options.maf
         | 
| 134 | 
            +
              access = Access.file($options.maf, $options.idx, $options.parse_options)
         | 
| 135 | 
            +
            elsif $options.maf_dir
         | 
| 136 | 
            +
              access = Access.maf_dir($options.maf_dir, $options.parse_options)
         | 
| 127 137 | 
             
            else
         | 
| 128 138 | 
             
              usage "Must supply --maf or --maf-dir!"
         | 
| 129 139 | 
             
            end
         | 
| 130 140 |  | 
| 131 141 | 
             
            begin
         | 
| 132 | 
            -
              access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
         | 
| 133 | 
            -
              access.block_filter = options.block_filter unless options.block_filter.empty?
         | 
| 134 | 
            -
              if options.out_path
         | 
| 135 | 
            -
                outf = File.open(options.out_path, 'w')
         | 
| 142 | 
            +
              access.sequence_filter = $options.seq_filter unless $options.seq_filter.empty?
         | 
| 143 | 
            +
              access.block_filter = $options.block_filter unless $options.block_filter.empty?
         | 
| 144 | 
            +
              if $options.out_path
         | 
| 145 | 
            +
                outf = File.open($options.out_path, 'w')
         | 
| 136 146 | 
             
              else
         | 
| 137 147 | 
             
                outf = $stdout
         | 
| 138 148 | 
             
              end
         | 
| 139 149 |  | 
| 140 | 
            -
              case options.format
         | 
| 150 | 
            +
              case $options.format
         | 
| 141 151 | 
             
              when :maf
         | 
| 142 152 | 
             
                writer = Writer.new(outf)
         | 
| 143 153 | 
             
              when :fasta
         | 
| @@ -146,20 +156,20 @@ begin | |
| 146 156 | 
             
                raise "unsupported output format #{format}!"
         | 
| 147 157 | 
             
              end
         | 
| 148 158 |  | 
| 149 | 
            -
              if options.bed
         | 
| 150 | 
            -
                intervals = read_bed_intervals(options.bed)
         | 
| 151 | 
            -
              elsif options.interval
         | 
| 152 | 
            -
                intervals = [options.interval]
         | 
| 159 | 
            +
              if $options.bed
         | 
| 160 | 
            +
                intervals = read_bed_intervals($options.bed)
         | 
| 161 | 
            +
              elsif $options.interval
         | 
| 162 | 
            +
                intervals = [$options.interval]
         | 
| 153 163 | 
             
              else
         | 
| 154 164 | 
             
                usage "Must supply --interval or --bed!"
         | 
| 155 165 | 
             
              end
         | 
| 156 166 |  | 
| 157 167 | 
             
              # TODO: provide access to original MAF header?
         | 
| 158 | 
            -
              if options.format == :maf
         | 
| 168 | 
            +
              if $options.format == :maf
         | 
| 159 169 | 
             
                writer.write_header(Header.default)
         | 
| 160 170 | 
             
              end
         | 
| 161 171 |  | 
| 162 | 
            -
              case options.mode
         | 
| 172 | 
            +
              case $options.mode
         | 
| 163 173 | 
             
              when :intersect
         | 
| 164 174 | 
             
                access.find(intervals) do |block|
         | 
| 165 175 | 
             
                  writer.write_block(block)
         | 
| @@ -172,7 +182,7 @@ begin | |
| 172 182 | 
             
                  end
         | 
| 173 183 | 
             
                end
         | 
| 174 184 | 
             
              else
         | 
| 175 | 
            -
                raise "Unsupported mode #{options.mode}!"
         | 
| 185 | 
            +
                raise "Unsupported mode #{$options.mode}!"
         | 
| 176 186 | 
             
              end
         | 
| 177 187 |  | 
| 178 188 | 
             
            ensure
         | 
    
        data/bin/maf_index
    CHANGED
    
    | @@ -14,10 +14,10 @@ PRINTERS = { | |
| 14 14 | 
             
            $options = OpenStruct.new
         | 
| 15 15 | 
             
            $options.mode = :build
         | 
| 16 16 | 
             
            $options.ref_only = true
         | 
| 17 | 
            +
            $options.parser_opts = { :parse_extended => false }
         | 
| 17 18 |  | 
| 18 19 | 
             
            def build_index(maf, index)
         | 
| 19 | 
            -
              parser = Bio::MAF::Parser.new(maf,
         | 
| 20 | 
            -
                                            :parse_extended => false)
         | 
| 20 | 
            +
              parser = Bio::MAF::Parser.new(maf, $options.parser_opts)
         | 
| 21 21 | 
             
              idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
         | 
| 22 22 | 
             
              idx.close
         | 
| 23 23 | 
             
            end
         | 
| @@ -36,6 +36,15 @@ op = OptionParser.new do |opts| | |
| 36 36 | 
             
              opts.on("-d", "--dump", "Dump contents of given INDEX") do
         | 
| 37 37 | 
             
                $options.mode = :dump
         | 
| 38 38 | 
             
              end
         | 
| 39 | 
            +
              opts.on("-O", "--parser-option OPT") do |opt|
         | 
| 40 | 
            +
                if opt =~ /(-?)(.+)/
         | 
| 41 | 
            +
                  val = ! ($1 == "-")
         | 
| 42 | 
            +
                  option = $2.to_sym
         | 
| 43 | 
            +
                  $options.parser_opts[option] = val
         | 
| 44 | 
            +
                else
         | 
| 45 | 
            +
                  raise "malformed parser option #{opt}!"
         | 
| 46 | 
            +
                end
         | 
| 47 | 
            +
              end
         | 
| 39 48 | 
             
              opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
         | 
| 40 49 | 
             
                require 'ruby-prof'
         | 
| 41 50 | 
             
                if pspec =~ /(\w+):(.+)/
         | 
    
        data/bin/maf_tile
    CHANGED
    
    | @@ -10,7 +10,16 @@ def parse_interval(line) | |
| 10 10 | 
             
              src, r_start_s, r_end_s, _ = line.split(nil, 4)
         | 
| 11 11 | 
             
              r_start = r_start_s.to_i
         | 
| 12 12 | 
             
              r_end = r_end_s.to_i
         | 
| 13 | 
            -
               | 
| 13 | 
            +
              i_src = if $options.bed_species
         | 
| 14 | 
            +
                        "#{$options.bed_species}.#{src}"
         | 
| 15 | 
            +
                      else
         | 
| 16 | 
            +
                        src
         | 
| 17 | 
            +
                      end
         | 
| 18 | 
            +
              if $options.one_based
         | 
| 19 | 
            +
                Bio::GenomicInterval.new(i_src, r_start, r_end)
         | 
| 20 | 
            +
              else
         | 
| 21 | 
            +
                Bio::GenomicInterval.zero_based(i_src, r_start, r_end)
         | 
| 22 | 
            +
              end
         | 
| 14 23 | 
             
            end
         | 
| 15 24 |  | 
| 16 25 | 
             
            def target_for(base, interval, &blk)
         | 
| @@ -18,52 +27,96 @@ def target_for(base, interval, &blk) | |
| 18 27 | 
             
              File.open(path, 'w', &blk)
         | 
| 19 28 | 
             
            end
         | 
| 20 29 |  | 
| 21 | 
            -
            def apply_options( | 
| 22 | 
            -
              tiler.reference = options.ref if options.ref
         | 
| 23 | 
            -
              tiler.species = options.species
         | 
| 24 | 
            -
              tiler.species_map = options.species_map
         | 
| 30 | 
            +
            def apply_options(tiler)
         | 
| 31 | 
            +
              tiler.reference = $options.ref if $options.ref
         | 
| 32 | 
            +
              tiler.species = $options.species
         | 
| 33 | 
            +
              tiler.species_map = $options.species_map
         | 
| 34 | 
            +
              tiler.fill_char = $options.fill_char if $options.fill_char
         | 
| 25 35 | 
             
            end
         | 
| 26 36 |  | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 37 | 
            +
            def each_tiler(access, intervals)
         | 
| 38 | 
            +
              intervals.each do |int|
         | 
| 39 | 
            +
                access.tile(int) do |tiler|
         | 
| 40 | 
            +
                  apply_options(tiler)
         | 
| 41 | 
            +
                  yield tiler
         | 
| 42 | 
            +
                end
         | 
| 43 | 
            +
              end
         | 
| 44 | 
            +
            end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            $options = OpenStruct.new
         | 
| 47 | 
            +
            $options.p = { :threads => 1 }
         | 
| 48 | 
            +
            $options.species = []
         | 
| 49 | 
            +
            $options.species_map = {}
         | 
| 50 | 
            +
            $options.usage = false
         | 
| 32 51 |  | 
| 33 52 | 
             
            o_parser = OptionParser.new do |opts|
         | 
| 34 53 | 
             
              opts.banner = "Usage: maf_tile [options] <maf> [index]"
         | 
| 35 54 | 
             
              opts.separator ""
         | 
| 36 55 | 
             
              opts.separator "Options:"
         | 
| 37 56 | 
             
              opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref|
         | 
| 38 | 
            -
                options.ref = ref
         | 
| 39 | 
            -
              end
         | 
| 40 | 
            -
              opts.on("-i", "--interval [CHR:]BEGIN | 
| 41 | 
            -
                if int =~ /(.+):(\d+) | 
| 42 | 
            -
                   | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
                  options.interval = ($1.to_i)...($2.to_i)
         | 
| 57 | 
            +
                $options.ref = ref
         | 
| 58 | 
            +
              end
         | 
| 59 | 
            +
              opts.on("-i", "--interval [CHR:]BEGIN-END", "Genomic interval, zero-based") do |int|
         | 
| 60 | 
            +
                if int =~ /(.+):(\d+)-(\d+)/
         | 
| 61 | 
            +
                  $options.genomic_interval_spec = [$1, $2.to_i, $3.to_i]
         | 
| 62 | 
            +
                elsif int =~ /(\d+)-(\d+)/
         | 
| 63 | 
            +
                  $options.interval = ($1.to_i)...($2.to_i)
         | 
| 46 64 | 
             
                else
         | 
| 47 65 | 
             
                  $stderr.puts "Invalid interval specification #{int}!"
         | 
| 48 | 
            -
                  options.usage = true
         | 
| 66 | 
            +
                  $options.usage = true
         | 
| 49 67 | 
             
                end
         | 
| 50 68 | 
             
              end
         | 
| 51 | 
            -
              opts.on("- | 
| 69 | 
            +
              opts.on("--one-based",
         | 
| 70 | 
            +
                      "Treat all intervals as one-based",
         | 
| 71 | 
            +
                      "(even from BED files, contrary to the standard)") do
         | 
| 72 | 
            +
                $options.one_based = true
         | 
| 73 | 
            +
              end
         | 
| 74 | 
            +
              opts.on("-s", "--species SPECIES[:NAME]",
         | 
| 75 | 
            +
                      "Species to use (mapped name optional)",
         | 
| 76 | 
            +
                      "(can be a comma-separated list)") do |sp|
         | 
| 52 77 | 
             
                if sp =~ /:/
         | 
| 53 78 | 
             
                  species, mapped = sp.split(/:/)
         | 
| 54 | 
            -
                  options.species << species
         | 
| 55 | 
            -
                  options.species_map[species] = mapped
         | 
| 79 | 
            +
                  $options.species << species
         | 
| 80 | 
            +
                  $options.species_map[species] = mapped
         | 
| 81 | 
            +
                elsif sp =~ /,/
         | 
| 82 | 
            +
                  $options.species.concat(sp.split(/,/))
         | 
| 56 83 | 
             
                else
         | 
| 57 | 
            -
                  options.species << sp
         | 
| 84 | 
            +
                  $options.species << sp
         | 
| 85 | 
            +
                end
         | 
| 86 | 
            +
              end
         | 
| 87 | 
            +
              opts.on("--species-file FILE", "File specifying species and optionally mapped names") do |file|
         | 
| 88 | 
            +
                File.open(file) do |f|
         | 
| 89 | 
            +
                  f.each_line do |line|
         | 
| 90 | 
            +
                    next if line =~ /^#/
         | 
| 91 | 
            +
                    parts = line.split
         | 
| 92 | 
            +
                    next unless parts.size > 0
         | 
| 93 | 
            +
                    $options.species << parts[0]
         | 
| 94 | 
            +
                    $options.species_map[parts[0]] = parts[1] if parts[1]
         | 
| 95 | 
            +
                  end
         | 
| 58 96 | 
             
                end
         | 
| 59 97 | 
             
              end
         | 
| 60 98 | 
             
              opts.on("-o", "--output-base BASE", "Base name for output files",
         | 
| 61 99 | 
             
                      "Use stdout for a single interval if not given") do |base|
         | 
| 62 | 
            -
                options.output_base = base
         | 
| 100 | 
            +
                $options.output_base = base
         | 
| 63 101 | 
             
              end
         | 
| 64 102 | 
             
              opts.on("--bed BED", "BED file specifying intervals",
         | 
| 65 103 | 
             
                      "(requires --output-base)") do |bed|
         | 
| 66 | 
            -
                options.bed = bed
         | 
| 104 | 
            +
                $options.bed = bed
         | 
| 105 | 
            +
              end
         | 
| 106 | 
            +
              opts.on("--bed-species SPECIES",
         | 
| 107 | 
            +
                      "Species to prepend to BED chromosome specs") do |species|
         | 
| 108 | 
            +
                $options.bed_species = species
         | 
| 109 | 
            +
              end
         | 
| 110 | 
            +
              opts.on("--fill-char C",
         | 
| 111 | 
            +
                      "Fill gaps with character C",
         | 
| 112 | 
            +
                      "(default is *)") do |char|
         | 
| 113 | 
            +
                $options.fill_char = char
         | 
| 114 | 
            +
              end
         | 
| 115 | 
            +
              opts.on("--upcase", "Fold all sequence data to upper case") do
         | 
| 116 | 
            +
                $options.p[:upcase] = true
         | 
| 117 | 
            +
              end
         | 
| 118 | 
            +
              opts.on("--concat", "Concatenate result blocks") do
         | 
| 119 | 
            +
                $options.concat = true
         | 
| 67 120 | 
             
              end
         | 
| 68 121 | 
             
              Bio::MAF::handle_logging_options(opts)
         | 
| 69 122 | 
             
            end
         | 
| @@ -74,52 +127,96 @@ Bio::Log::CLI.configure('bio-maf') | |
| 74 127 | 
             
            maf_p = ARGV.shift
         | 
| 75 128 | 
             
            index_p = ARGV.shift
         | 
| 76 129 |  | 
| 77 | 
            -
            unless  | 
| 78 | 
            -
               | 
| 79 | 
            -
               | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 130 | 
            +
            unless maf_p
         | 
| 131 | 
            +
              $stderr.puts "Must specify MAF file to process!"
         | 
| 132 | 
            +
              $options.usage = true
         | 
| 133 | 
            +
            end
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            if $options.species.empty?
         | 
| 136 | 
            +
              $stderr.puts "Must specify species to tile with --species!"
         | 
| 137 | 
            +
              $options.usage = true
         | 
| 138 | 
            +
            end
         | 
| 139 | 
            +
             | 
| 140 | 
            +
            unless $options.bed || $options.interval || $options.genomic_interval_spec
         | 
| 141 | 
            +
              $stderr.puts "Must specify --bed or --interval!"
         | 
| 142 | 
            +
              $options.usage = true
         | 
| 143 | 
            +
            end
         | 
| 144 | 
            +
             | 
| 145 | 
            +
            if $options.bed && ! ($options.output_base || $options.concat)
         | 
| 146 | 
            +
              $stderr.puts "Must specify --output-base or --concat when specifying --bed!"
         | 
| 147 | 
            +
              $options.usage = true
         | 
| 148 | 
            +
            end
         | 
| 149 | 
            +
             | 
| 150 | 
            +
            if (! $options.output_base) && ! ($options.interval || $options.genomic_interval_spec || ($options.bed && $options.concat))
         | 
| 151 | 
            +
              $stderr.puts "Must specify --interval or --bed with --concat if --output-base is not given!"
         | 
| 152 | 
            +
              $options.usage = true
         | 
| 153 | 
            +
            end
         | 
| 154 | 
            +
             | 
| 155 | 
            +
            if $options.usage
         | 
| 82 156 | 
             
              $stderr.puts o_parser
         | 
| 83 157 | 
             
              exit 2
         | 
| 84 158 | 
             
            end
         | 
| 85 159 |  | 
| 86 160 | 
             
            access = if File.directory? maf_p
         | 
| 87 | 
            -
                       Bio::MAF::Access.maf_dir(maf_p, options.p)
         | 
| 161 | 
            +
                       Bio::MAF::Access.maf_dir(maf_p, $options.p)
         | 
| 88 162 | 
             
                     else
         | 
| 89 | 
            -
                       Bio::MAF::Access.file(maf_p, index_p, options.p)
         | 
| 163 | 
            +
                       Bio::MAF::Access.file(maf_p, index_p, $options.p)
         | 
| 90 164 | 
             
                     end
         | 
| 91 165 |  | 
| 92 | 
            -
            if options.bed
         | 
| 166 | 
            +
            if $options.bed
         | 
| 93 167 | 
             
              intervals = []
         | 
| 94 | 
            -
              File.open(options.bed) do |bed_f|
         | 
| 168 | 
            +
              File.open($options.bed) do |bed_f|
         | 
| 95 169 | 
             
                bed_f.each_line { |line| intervals << parse_interval(line) }
         | 
| 96 170 | 
             
              end
         | 
| 97 | 
            -
              intervals.sort_by! { |int| int.zero_start }
         | 
| 98 | 
            -
               | 
| 99 | 
            -
                 | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 171 | 
            +
              #intervals.sort_by! { |int| int.zero_start }
         | 
| 172 | 
            +
              if $options.concat
         | 
| 173 | 
            +
                # concatenate, as with exons
         | 
| 174 | 
            +
                concat = Array.new($options.species.size)
         | 
| 175 | 
            +
                concat.fill { '' }
         | 
| 176 | 
            +
                non_fill = nil
         | 
| 177 | 
            +
                each_tiler(access, intervals) do |tiler|
         | 
| 178 | 
            +
                  non_fill = tiler.non_fill_re if ! non_fill
         | 
| 179 | 
            +
                  concat.zip(tiler.build_bio_alignment) do |buf, seq|
         | 
| 180 | 
            +
                    buf << seq.to_s
         | 
| 181 | 
            +
                  end
         | 
| 182 | 
            +
                end
         | 
| 183 | 
            +
                fh = $options.output_base ? File.open($options.output_base, 'wb') : $stdout
         | 
| 184 | 
            +
                $options.species.zip(concat) do |species, seq|
         | 
| 185 | 
            +
                  if non_fill.match(seq)
         | 
| 186 | 
            +
                    sp_out_name = $options.species_map[species] || species
         | 
| 187 | 
            +
                    fh.puts ">#{sp_out_name}", seq.scan(/.{1,70}/)
         | 
| 188 | 
            +
                  end
         | 
| 189 | 
            +
                end
         | 
| 190 | 
            +
              else
         | 
| 191 | 
            +
                # output each interval separately
         | 
| 192 | 
            +
                each_tiler(access, intervals) do |tiler|
         | 
| 193 | 
            +
                  target_for($options.output_base, tiler.interval) do |target|
         | 
| 102 194 | 
             
                    tiler.write_fasta(target)
         | 
| 103 195 | 
             
                  end
         | 
| 104 196 | 
             
                end
         | 
| 105 197 | 
             
              end
         | 
| 106 198 | 
             
            else
         | 
| 107 199 | 
             
              # single interval
         | 
| 108 | 
            -
              if options. | 
| 109 | 
            -
                 | 
| 200 | 
            +
              if $options.genomic_interval_spec
         | 
| 201 | 
            +
                spec = $options.genomic_interval_spec
         | 
| 202 | 
            +
                if $options.one_based
         | 
| 203 | 
            +
                  interval = Bio::GenomicInterval.new(*spec)
         | 
| 204 | 
            +
                else
         | 
| 205 | 
            +
                  interval = Bio::GenomicInterval.zero_based(*spec)
         | 
| 206 | 
            +
                end
         | 
| 110 207 | 
             
              else
         | 
| 111 208 | 
             
                if access.indices.size != 1
         | 
| 112 209 | 
             
                  raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
         | 
| 113 210 | 
             
                end
         | 
| 114 211 | 
             
                ref_seq = access.indices.keys.first
         | 
| 115 212 | 
             
                interval = Bio::GenomicInterval.zero_based(ref_seq,
         | 
| 116 | 
            -
                                                           options.interval.begin,
         | 
| 117 | 
            -
                                                           options.interval.end)
         | 
| 213 | 
            +
                                                           $options.interval.begin,
         | 
| 214 | 
            +
                                                           $options.interval.end)
         | 
| 118 215 | 
             
              end
         | 
| 119 216 | 
             
              access.tile(interval) do |tiler|
         | 
| 120 | 
            -
                apply_options( | 
| 121 | 
            -
                if options.output_base
         | 
| 122 | 
            -
                  target = target_for(options.output_base, tiler.interval)
         | 
| 217 | 
            +
                apply_options(tiler)
         | 
| 218 | 
            +
                if $options.output_base
         | 
| 219 | 
            +
                  target = target_for($options.output_base, tiler.interval)
         | 
| 123 220 | 
             
                else
         | 
| 124 221 | 
             
                  target = $stdout
         | 
| 125 222 | 
             
                end
         |