RubyGems - finishm - Versions diffs - 0.0.1 → 0.0.2 - Mend

finishm 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/Gemfile +19 -19
data/VERSION +1 -1
data/finishm.gemspec +631 -0
data/lib/assembly/graph_generator.rb +0 -1
data/lib/assembly/probed_graph.rb +0 -1
metadata +99 -96
data/bin/assembly_visualiser +0 -106
data/bin/check_primer_combinations.rb +0 -73
data/bin/contig_joiner.rb +0 -244
data/bin/contigs_against_assembly.rb +0 -153
data/bin/finishm_assembler +0 -55
data/bin/finishm_gap_closer.rb +0 -241
data/bin/kmer_abundance_file_tool.rb +0 -49
data/bin/kmer_pattern_to_assembly.rb +0 -377
data/bin/kmer_profile_finder.rb +0 -92
data/bin/kmers_count_parse.d +0 -52
data/bin/kmers_count_tabulate.d +0 -123
data/bin/kmers_count_tabulate.rb +0 -84
data/bin/pcr_result_parser.rb +0 -108
data/bin/primer_finder.rb +0 -119
data/bin/read_selection_by_kmer.d +0 -174
data/bin/scaffold_by_pattern.rb +0 -119
data/bin/scaffold_connection_possibilities_to_knowns.rb +0 -193
data/bin/scaffold_end_coverages.rb +0 -69
data/bin/trail_validator.rb +0 -84

data/bin/primer_finder.rb DELETED

@@ -1,119 +0,0 @@
-#!/usr/bin/env ruby
-require 'optparse'
-require 'bio-logger'
-require 'bio'
-require 'progressbar'
-$:.unshift File.join(File.dirname(__FILE__),'..','lib')
-require 'priner'
-if __FILE__ == $0 #needs to be removed if this script is distributed as part of a rubygem
-  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-  # Parse command line options into the options hash
-  options = {
-    :logger => 'stderr',
-    :reverse_complement => false,
-    :gc_clamp_length => 2,
-    :min_melting_temperature => 56,
-    :max_melting_temperature => 62,
-    :min_primer_length => 15,
-  }
-  o = OptionParser.new do |opts|
-    opts.banner = "
-      Usage: #{SCRIPT_NAME} <arguments> <sequence>
-      Take a sequence, and find an oligos that fit certain parameters. Sort of like primer3_core but only look for single oligos, not pairs.\n\n"
-    opts.on("-s", "--sequence-file FILE", "Fasta file of sequences [required]") do |arg|
-      options[:input_file] = arg
-    end
-    opts.separator "\nOptional arguments:\n\n"
-    opts.on("-r", "--reverse-complement", "Design primers pointing backwards off the start of the sequence in reverse, not off the end forwards [default: #{options[:reverse_complement]}]") do
-      options[:reverse_complement] = true
-    end
-    # logger options
-    opts.separator "\nVerbosity:\n\n"
-    opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {Bio::Log::CLI.trace('error')}
-    opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
-    opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| Bio::Log::CLI.trace(s)}
-  end; o.parse!
-  if ARGV.length != 0
-    $stderr.puts o
-    exit 1
-  end
-  # Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree
-  Bio::Log::CLI.logger(options[:logger]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-  # Read the contigs in
-  contigs = []
-  Bio::FlatFile.foreach(options[:input_file]) do |entry|
-    contigs.push entry.seq.to_s
-  end
-  log.info "Read in #{contigs.length} contigs from #{options[:contigs_file]}"
-  raise unless contigs.length == 1
-  seq = contigs[0]
-  # Reverse complement if required
-  if options[:reverse_complement]
-    seq = Bio::Sequence::NA.new(seq).reverse_complement.to_s
-  end
-  seq.upcase!
-  raise "Whackiness in the supplied sequence! #{seq}" unless seq.match(/^[ATGC]+$/)
-  raise unless options[:min_primer_length] > options[:gc_clamp_length]
-  # Find out all those positions that have a GC clamp of enough
-  gc_clamped_positions = []
-  (0...seq.length).each do |pos|
-    next unless pos-options[:min_primer_length] >= -1
-    if seq[pos-options[:gc_clamp_length]+1..pos].match(/^[GC]+$/)
-      gc_clamped_positions << pos
-    end
-  end
-  log.info "Found #{gc_clamped_positions.length} positions with a suitable GC-clamp"
-  # Find those with suitable melting temperatures
-  designer = OligoDesigner.new
-  progress = ProgressBar.new('primer_finding', gc_clamped_positions.length)
-  gc_clamped_positions.each do |pos|
-    # Iteratively make primers longer. Start with min primer length, end when the Tm exceeds the maximum allowable
-    current_length = options[:min_primer_length]
-    over_tm_max = false
-    while !over_tm_max and pos-current_length >= 0
-      oligo = seq[pos-current_length+1..pos]
-      tm = designer.melting_temperature oligo
-      puts oligo
-      puts tm
-      exit
-      if tm > options[:min_melting_temperature]
-        if tm < options[:max_melting_temperature]
-          # This is a hit
-          cur_seq = oligo
-          if options[:reverse_complement]
-            cur_seq = Bio::Sequence::NA.new(cur_seq).reverse_complement.to_s
-          end
-          puts [
-            cur_seq, tm
-          ].join("\t")
-        else
-          over_tm_max =  true
-          break #making the primer longer can only result in higher melting temperatures, and we are already over the max
-        end
-      end
-      current_length += 1
-    end
-    progress.inc
-  end
-  progress.finish
-end #end if running as a script

data/bin/read_selection_by_kmer.d DELETED

@@ -1,174 +0,0 @@
-#!/usr/bin/env rdmd
-import std.stdio;
-import std.string;
-import std.conv;
-import std.getopt;
-import std.file;
-import std.array;
-import bio.core.fasta;
-import bio.core.sequence;
-import std.algorithm;
-import std.container;
-import std.c.stdlib;
-void main(string[] args){
-  string whitelistFile, blacklistFile, fastaFile = null;
-  bool help, verbose, quiet, debugging = false;
-  int targetPerKmer = 1;
-  int minLeftoverLength;
-  getopt(args,
-    "whitelist",  &whitelistFile,
-    "blacklist",  &blacklistFile,
-    "reads",  &fastaFile,
-    "kmer-coverage-target", &targetPerKmer,
-    "min-leftover-length", &minLeftoverLength,
-    "verbose", &verbose,
-    "debug", &debugging,
-    "quiet", &quiet,
-    "help|h",         &help,
-  );
-  if(help){
-    writeln("my helpa");
-  }
-  else if(whitelistFile is null){stderr.writeln("Error: Need to specify a newline-separeted list of whitelisted kmers as --whitelist <file>");}
-  else if(fastaFile is null){stderr.writeln("Error: Need to specify a fasta file of reads to work with as --reads <fasta_file>");}
-  else {
-    if(debugging) verbose = true;
-    if(verbose) quiet=false;
-    //read in a text file of kmers that we wish to find, the whitelist
-    auto whites = split(cast(string) read(whitelistFile));
-    if (verbose)
-      stderr.writeln("Read in ",whites.length," kmers as a whitelist, e.g. ",whites.front);
-    // Find the minimum length of kmer being searched for
-    auto whitelistMinLength = map!"a.length"(whites).reduce!"a<b ? a : b";
-    auto whitelistMaxLength = map!"a.length"(whites).reduce!"a<b ? b : a";
-    if (whitelistMinLength != whitelistMaxLength){
-      stderr.writeln("Kmers must be of uniform length, but these ones weren't..");
-      exit(1);
-    }
-    if (verbose)
-      stderr.writeln("Minimum length of kmer in whitelist is ",whitelistMinLength);
-    //if blacklistFile is specified, read in a list of kmers that are blacklisted, otherwise make an empty array
-    bool[string] blacks;
-    if(blacklistFile != null){
-      foreach(kmer; split(cast(string) read(blacklistFile))){
-        if (kmer.length != whitelistMinLength){
-          stderr.writeln("Kmers (currently) must be of uniform length, but some blacklisted ones weren't..");
-          exit(1);
-        }
-        blacks[kmer] = true;
-        if(verbose)
-          stderr.writeln("Read in ",blacks.length," blacklisted kmers e.g. ",blacks.keys.front);
-      }
-    } else {
-      if(verbose)
-        stderr.writeln("No blacklisted kmers specified");
-    }
-    int[string] whitelistCounts;
-    foreach(white; whites){
-      whitelistCounts[white] = 0;
-    }
-    int num_reads_whitelisted = 0;
-    int num_reads_blacklisted = 0;
-    //Iterate through the fastq reads given.
-    auto fastas = fastaRecords(fastaFile);
-    bool all_accounted_for = false;
-    ptrdiff_t range_end;
-    string[] kmers;
-    if (minLeftoverLength)
-      kmers = new string[4];
-    else
-      kmers = new string[2];
-    foreach(seq; fastas){
-      if (verbose)
-        stderr.writeln("Inspecting ", seq);
-      //If they contain one of the blacklist kmers, then skip
-      string fwd = seq.sequence;
-      string rev = to!string(nucleotideSequence(fwd, true));
-      range_end = fwd.length - whitelistMinLength + 1;
-      if (minLeftoverLength)
-        range_end -= minLeftoverLength;
-      if (range_end < 0) continue; //If the read is too short, then don't even bother comparing it
-      if (debugging) stderr.writeln("Range end was ",range_end);
-      //How many of each whitelist kmers are found (including in the reverse complement)?
-      bool whitelisted = false;
-      foreach(i; 0 .. range_end){
-        kmers[0] = fwd[i .. (i+whitelistMinLength)];
-        kmers[1] = rev[i .. (i+whitelistMinLength)];
-        // if min leftover length is specified then search the reverse complement of the fwd as well
-        if (minLeftoverLength){
-          kmers[2] = to!string(nucleotideSequence(kmers[0], true));
-          kmers[3] = to!string(nucleotideSequence(kmers[1], true));
-        }
-        foreach(kmer; kmers){
-          if (debugging)
-            stderr.writeln("Whitelist inspecting kmer ",kmer," at position ",i);
-          if (kmer in whitelistCounts && whitelistCounts[kmer] < targetPerKmer){
-            whitelisted = true;
-            whitelistCounts[kmer] += 1;
-            if (whitelistCounts[kmer] >= targetPerKmer){
-              if(verbose)
-                stderr.writeln("kmer index ",i," now accounted for");
-              if (count!((x){return x<targetPerKmer;})(whitelistCounts.values) == 0){
-                if(verbose)
-                  stderr.writeln("All whitelisted kmers now accounted for");
-                all_accounted_for = true; //all done, no more fasta entries required
-              }
-            }
-          }
-        }
-      }
-      if(!whitelisted) continue;
-      else if (verbose) stderr.writeln("Read contains a valid whitelisted kmer");
-      //I'm sure there is a faster way to search for an array of strings within a particular string, but eh for now.
-      bool blacklisted = false;
-      if (blacklistFile != null){
-        foreach(i; 0 .. fwd.length - whitelistMinLength+1){
-          auto kmer = fwd[i .. (i+whitelistMinLength)];
-          if (kmer in blacks){
-            //blacklisted kmer found
-            blacklisted = true;
-            break;
-          }
-        }
-      }
-      if(blacklisted){
-        num_reads_blacklisted += 1;
-        if(verbose)
-          stderr.writeln(fwd," contains a blacklisted kmer, not including this one");
-        continue;
-      } else {
-        if(verbose)
-          stderr.writeln(fwd," not blacklisted");
-      }
-      //print this sequence, as it is whitelisted and not blacklisted
-      num_reads_whitelisted += 1;
-      writeln(">", seq.header);
-      writeln(fwd);
-      if(all_accounted_for) break;
-    }
-    //output the number of kmers that were sufficiently covered
-    ulong num_counted = count!("a >= b")(whitelistCounts.values, targetPerKmer);
-    ulong num_not_counted = whitelistCounts.length - num_counted;
-    if(!quiet){
-      stderr.writeln("Found ",num_counted," from the whitelist as expected and ",num_not_counted," not enough times");
-      stderr.writeln("There were ",num_reads_whitelisted," reads output, and ",num_reads_blacklisted," reads blacklisted");
-    }
-  }
-}

data/bin/scaffold_by_pattern.rb DELETED

@@ -1,119 +0,0 @@
-#!/usr/bin/env ruby
-require 'optparse'
-require 'bio-logger'
-require 'csv'
-require 'bio'
-require 'tempfile'
-require 'systemu'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-$:.unshift File.join(File.dirname(__FILE__),'..','lib')
-require 'kmer_abundance_pattern'
-# Parse command line options into the options hash
-options = {
-  :logger => 'stderr',
-  :log_level => 'info',
-  :number_of_kmers => 100,
-}
-o = OptionParser.new do |opts|
-  opts.banner = "
-    Usage: #{SCRIPT_NAME} -f <scaffolds_fasta> -k <kmer_abundance_file>
-    Take a fasta file of contigs, and a multiple kmer count file. Output the patterns each contig end shows up.n\n"
-  opts.on("-f FASTA_FILE", "Fasta file containing multiple sequences that we are attempting to scaffold together [required]") do |arg|
-    options[:fasta_file] = arg
-  end
-  opts.on("-k KMER_FILE", "kmer frequencies [required]") do |arg|
-    options[:kmer_file] = arg
-  end
-  opts.on("--kmer KMER_SIZE", "kmer length [required]") do |arg|
-    options[:kmer_size] = arg.to_i
-  end
-  opts.on("--upper-threshold ARG", "kmer frequency cutoff to saying 'present' [required]") do |arg|
-    options[:upper_threshold] = arg.to_i
-  end
-  opts.on("--lower-threshold ARG", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
-    options[:lower_threshold] = arg.to_i
-  end
-  # logger options
-  opts.separator "\nVerbosity:\n\n"
-  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
-  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
-  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
-end; o.parse!
-if ARGV.length != 0 or options[:fasta_file].nil? or options[:kmer_file].nil? or options[:upper_threshold].nil? or options[:lower_threshold].nil? or options[:kmer_size].nil?
-  $stderr.puts o
-  exit 1
-end
-# Setup logging
-Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-get_kmer_abundances_from_kmers = lambda do |kmers|
-  # fgrep the kmer abundance file for these particular ones
-  patterns = {}
-  Tempfile.open('kemrs') do |tempfile|
-    tempfile.puts kmers.join "\n"
-    tempfile.close
-    # for each of the kmers that come back, output their pattern in the kmer abundance file
-    grep_cmd = "fgrep -f #{tempfile.path} #{options[:kmer_file].inspect}"
-    log.debug "Running cmd with #{kmers.length} kmers: #{grep_cmd}"
-    status, stdout, stderr = systemu grep_cmd
-    raise stderr if stderr != ''
-    raise unless status.exitstatus == 0
-    num_kmers = stdout.split("\n").length
-    log.debug "Finished grepping for kmers, found #{num_kmers} kmers"
-    stdout.each_line do |line|
-      CSV.parse(line, :col_sep => ' ') do |row|
-        pattern = KmerAbundancePattern.new
-        pattern.parse_from_kmer_abundance row[1...row.length].collect{|a| a.to_f}, options[:lower_threshold], options[:upper_threshold]
-        rep = pattern.binary_string
-        patterns[rep] ||= 0
-        patterns[rep] += 1
-      end
-    end
-  end
-  patterns.sort{|a,b| b[1]<=>a[1]}.collect{|a| a.join(',')}.join("\t")
-end
-# For each of the sequences in the fasta file
-Bio::FlatFile.foreach(Bio::FastaFormat, options[:fasta_file]) do |seq|
-  # Extract the first 100 kmers
-  kmers = []
-  i = 0
-  seq.to_biosequence.window_search(options[:kmer_size]) do |s|
-    kmers.push s.seq
-    i += 1
-    break if i>options[:number_of_kmers]
-  end
-  # output to a tempfile
-  patterns = get_kmer_abundances_from_kmers.call kmers
-  puts [
-    seq.definition,
-    'start',
-    patterns
-  ].join("\t")
-  # repeat for the end of the contig
-  kmers = []
-  i = 0
-  seq.naseq.reverse_complement.window_search(options[:kmer_size]) do |s|
-    kmers.push s.seq.upcase
-    i += 1
-    break if i>options[:number_of_kmers]
-  end
-  patterns = get_kmer_abundances_from_kmers.call kmers
-  puts [
-    seq.definition,
-    'end',
-    patterns
-  ].join("\t")
-end

data/bin/scaffold_connection_possibilities_to_knowns.rb DELETED

@@ -1,193 +0,0 @@
-#!/usr/bin/env ruby
-require 'optparse'
-require 'bio-logger'
-require 'csv'
-require 'pp'
-require 'bio'
-require 'pry'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-$:.unshift File.join(ENV['HOME'],'git/yargraph/lib')
-require 'yargraph'
-# Parse command line options into the options hash
-options = {
-  :logger => 'stderr',
-  :log_level => 'info',
-}
-o = OptionParser.new do |opts|
-  opts.banner = "
-    Usage: #{SCRIPT_NAME} <arguments>
-    Description of what this program does...\n\n"
-  opts.on("-c", "--connnections FILE", "connections file [required]") do |arg|
-    options[:connections_file] = arg
-  end
-  opts.on("-f", "--fasta FILE", "Fasta file of all contigs [required]") do |arg|
-    options[:fasta_file] = arg
-  end
-  # logger options
-  opts.separator "\nVerbosity:\n\n"
-  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
-  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
-  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
-end; o.parse!
-if ARGV.length != 0 or options[:connections_file].nil?
-  $stderr.puts o
-  exit 1
-end
-# Setup logging
-Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-class Probe
-  attr_accessor :contig_name, :side
-  def to_setable
-    [@contig_name, @side]
-  end
-end
-class ProbeSet
-end
-graph = Yargraph::UndirectedGraph.new
-num_probes_circular = 0
-CSV.foreach(options[:connections_file], :col_sep => "\t") do |row|
-  if row.length == 3
-    # e.g. 110811_E_1_D_nesoni_single_contig_1011407.start	110811_E_1_D_nesoni_single_contig_1030181.start	225
-    splits1 = nil
-    splits2 = nil
-    splits1 = row[0].match(/^(.+)\.(.+)$/)
-    splits2 = row[1].match(/^(.+)\.(.+)$/)
-    raise if splits1.nil? or splits2.nil?
-    distance = row[2]
-    row = [
-      splits1[1], splits1[2], splits2[1], splits2[2], distance
-      ].flatten
-  end
-  raise unless row.length == 5
-  # e.g. seq1    end     seq23   start   6878
-  probe1 = Probe.new
-  probe1.contig_name = row[0]
-  probe1.side = row[1]
-  probe2 = Probe.new
-  probe2.contig_name = row[2]
-  probe2.side = row[3]
-  if probe1.contig_name == probe2.contig_name and probe1.side == probe2.side
-    num_probes_circular += 1
-  else
-    graph.add_edge probe1.to_setable, probe2.to_setable
-  end
-end
-probes = graph.vertices.to_a
-# Connect all starts to the ends
-probes.each do |array|
-  contig_name = array[0]
-  start_probe = Probe.new
-  start_probe.contig_name = contig_name
-  start_probe.side = 'start'
-  end_probe = Probe.new
-  end_probe.contig_name = contig_name
-  end_probe.side = 'end'
-  graph.add_edge start_probe.to_setable, end_probe.to_setable
-end
-log.info "Removed #{num_probes_circular} connections that join a contig end to itself"
-# First try the not computationally intensive way - can we find any?
-edge_result = graph.some_edges_in_all_hamiltonian_cycles
-cross_contig_connections = []
-edge_result.edges_in_all.each do |v1, v2|
-  if v1[0] != v2[0]
-    cross_contig_connections.push [v1,v2]
-  end
-end
-if !cross_contig_connections.empty?
-  length = cross_contig_connections.length
-  log.info "Good news. Found #{length} connections that are in all Hamiltonian paths (and thus can probably be scaffolded together):"
-  cross_contig_connections.each do |connection|
-    log.info connection[0].to_s + "\t" + connection[1].to_s
-  end
-  if length == probes.length and edge_result.contains_hamcycle != false
-    log.info "Extra good news. You just scaffolded your genome into a single scaffold"
-  end
-end
-if edge_result.contains_hamcycle == false
-  log.warn "Bad news. The connectivity graph contains no Hamiltonian cycles, and so the contigs cannot be scaffolded into one circular genome"
-end
-# Determine if there are any ends that don't connect to anything
-contig_names = []
-Bio::FlatFile.foreach(options[:fasta_file]) do |seq|
-  contig_name = seq.definition.split(/\s+/)[0]
-  contig_names.push contig_name
-  %w(start end).each do |side|
-    probe = Probe.new
-    probe.contig_name = contig_name
-    probe.side = side
-    if graph.edges[probe.to_setable].empty?
-      log.info "Unable to find any connections from #{probe.to_setable}"
-    end
-  end
-end
-# Determine if there is any possible plasmids
-num_plasmids_found = 0
-contig_names.each do |contig_name|
-  probe = Probe.new
-  probe.contig_name = contig_name
-  probe.side = 'start'
-  rev_probe = Probe.new
-  rev_probe.contig_name = contig_name
-  rev_probe.side = 'end'
-  # Both the start and the end must only connect to each other
-  if graph.edges[probe.to_setable].length == 1 and
-    graph.edges[rev_probe.to_setable].length == 1 and
-    graph.edges[probe.to_setable].include?(rev_probe.to_setable)
-    num_plasmids_found += 1
-    log.info "Contig #{contig_name} appears to be circular and not connect to other contigs, suggesting it may be a plasmid"
-  end
-end
-log.info "Found #{num_plasmids_found} contigs that appear to be plasmids based on connectivity"
-log.info "Attempting a better but more computationally intensive method of determining edges that are in all hamiltonian paths.."
-# First try to see if there is any hamiltonian paths?
-paths = []
-max_path_count = 4
-operation_limit = 50000
-graph.hamiltonian_cycles(operation_limit) do |path|
-  if paths.length <= max_path_count
-    paths.push path
-  else
-    break
-  end
-end
-if paths.length < max_path_count
-  log.info "Found exactly #{paths.length} Hamiltonian cycles"
-else
-  log.info "Gave up searching for Hamiltonian cycles as there are at least #{max_path_count} cycles"
-end
-# OK so
-#edges_in_all = graph.edges_in_all_hamiltonian_cycles