RubyGems - finishm - Versions diffs - 0.0.1 → 0.0.2 - Mend

finishm 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/Gemfile +19 -19
data/VERSION +1 -1
data/finishm.gemspec +631 -0
data/lib/assembly/graph_generator.rb +0 -1
data/lib/assembly/probed_graph.rb +0 -1
metadata +99 -96
data/bin/assembly_visualiser +0 -106
data/bin/check_primer_combinations.rb +0 -73
data/bin/contig_joiner.rb +0 -244
data/bin/contigs_against_assembly.rb +0 -153
data/bin/finishm_assembler +0 -55
data/bin/finishm_gap_closer.rb +0 -241
data/bin/kmer_abundance_file_tool.rb +0 -49
data/bin/kmer_pattern_to_assembly.rb +0 -377
data/bin/kmer_profile_finder.rb +0 -92
data/bin/kmers_count_parse.d +0 -52
data/bin/kmers_count_tabulate.d +0 -123
data/bin/kmers_count_tabulate.rb +0 -84
data/bin/pcr_result_parser.rb +0 -108
data/bin/primer_finder.rb +0 -119
data/bin/read_selection_by_kmer.d +0 -174
data/bin/scaffold_by_pattern.rb +0 -119
data/bin/scaffold_connection_possibilities_to_knowns.rb +0 -193
data/bin/scaffold_end_coverages.rb +0 -69
data/bin/trail_validator.rb +0 -84

data/bin/kmer_profile_finder.rb DELETED

@@ -1,92 +0,0 @@
-#!/usr/bin/env ruby
-require 'optparse'
-require 'bio-logger'
-require 'csv'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-# Parse command line options into the options hash
-options = {
-  :logger => 'stderr',
-  :log_level => 'info',
-}
-o = OptionParser.new do |opts|
-  opts.banner = "
-    Usage: #{SCRIPT_NAME} <kmer_multiple_abundance_file>
-    Given an input kmer then abundances space separated file, and a threshold, print out how many kmers are unique to different subsets of columns\n\n"
-  opts.on("--upper-threshold ARG", "kmer frequency cutoff to saying 'present' [required]") do |arg|
-    options[:upper_threshold] = arg.to_i
-  end
-  opts.on("--lower-threshold ARG", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
-    options[:lower_threshold] = arg.to_i
-  end
-  # logger options
-  opts.separator "\nVerbosity:\n\n"
-  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
-  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
-  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
-end; o.parse!
-if ARGV.length != 1 or options[:upper_threshold].nil? or options[:lower_threshold].nil?
-  $stderr.puts o
-  exit 1
-end
-# Setup logging
-Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-encoded_counts = {}
-max_i =  0
-input_file = nil
-if ARGV[0] == '-'
-  input_file = $stdin
-else
-  input_file = File.open ARGV[0]
-end
-csv = CSV.new(input_file, :col_sep => ' ')
-csv.each do |row|
-  kmer = row[0]
-  counts = row[1...row.length].collect{|s| s.to_i}
-  index = 0
-  counts.each_with_index do |count, i|
-    max_i = i if i > max_i
-    if count > options[:upper_threshold]
-      increment = (1<<i)
-      index += increment
-      log.debug "Found a passable for #{options[:threshold]} in index #{i} for #{counts}, count is now #{index}" if log.debug?
-    elsif count < options[:lower_threshold]
-      # do nothing
-    else
-      # coverage was in no man's land between thresholds.
-      # Ignore this kmer as noise.
-      break
-    end
-  end
-  if index != 0
-    encoded_counts[index] ||= 0
-    encoded_counts[index] += 1
-  end
-end
-(0..encoded_counts.keys.max).each do |i|
-  total = encoded_counts[i]
-  unless total.nil?
-    unencoded = i.to_s(2)
-    while unencoded.length <= max_i
-      unencoded = '0'+unencoded
-    end
-    puts [
-      i,
-      total,
-      unencoded,
-    ].join "\t"
-  end
-end

data/bin/kmers_count_parse.d DELETED

@@ -1,52 +0,0 @@
-#!/usr/bin/env rdmd
-import std.stdio;
-import std.csv;
-import std.typecons;
-import std.getopt;
-import std.algorithm;
-void main(string[] args)
-{
-  bool usePercentages = false;
-  int minCount = 1;
-  string trace = "info";
-  getopt(args,
-    "percentage", &usePercentages,
-    "min-count",  &minCount,
-    "trace",      &trace
-  );
-  auto kmersFile = File(args[1]);
-  int lineCount = 0;
-  foreach (line; kmersFile.byLine()) {
-    lineCount += 1;
-    if (lineCount % (1024*1024) == 0){
-      stderr.writeln("Parsed ",lineCount, " lines");
-    }
-    auto reader = csvReader!(Tuple!(string,
-      int, int, int, int, int,
-      int, int, int, int, int,
-      int, int, int, int, int,
-      int, int, int, int, int,
-      int, int, int, int, int,
-      int, int, int, int, int,
-      int, int))(line, ' ');
-    foreach (record; reader) {
-      /*writeln(record[0]);
-      writeln(record[1]);
-      writeln(record[1..32]);*/
-      int[] range =  [1,2,3,4];
-      int sum = 0;
-      foreach(T; record[1..32]){
-        sum += T;
-      }
-      if (sum >= minCount){
-        writeln(line);
-      }
-    }
-  }
-}

data/bin/kmers_count_tabulate.d DELETED

@@ -1,123 +0,0 @@
-#!/usr/bin/env rdmd
-import std.stdio;
-import std.conv;
-import std.string;
-import std.regex;
-import std.getopt;
-void main(string[] args){
-  bool usePercentages = false;
-  int minCount = 1;
-  string trace = "info";
-  getopt(args,
-    "percentage", &usePercentages,
-    "min-count",  &minCount,
-    "trace",      &trace
-  );
-  //Create an array of open file handles, one for each argument given
-  auto filenames = args[1 .. $];
-  int[] totalCounts = new int[filenames.length];
-  int kmerLength = 0;
-  foreach(i, file; filenames){
-    int count = 0;
-    auto f = File(file);
-    char[] line;
-    while (f.readln(line)){
-    //foreach(line; f.byLine()){
-      if (kmerLength==0){
-        kmerLength = 101;//to!int(indexOf(line, " "));
-        stderr.writeln("Detected kmer length of ",kmerLength);
-      }
-      int thisCount = to!int(line[kmerLength+1 .. $-1]);
-      //int thisCount = to!int(line[kmerLength+1 .. $]);
-      count += thisCount;
-      //if (count > 4000002){break;}
-    }
-    totalCounts[i] = count;
-  }
-  stderr.writeln("Finished ccounting total kmers, totals were: ",totalCounts);
-  if(false){
-  bool allFinished = false;
-  bool[] finished = new bool[filenames.length];
-  foreach (f; finished){f=false;}
-  File[] files = new File[filenames.length];
-  foreach(i; 0 .. files.length){
-    files[i] = File(filenames[i]);
-  }
-  struct KmerCount {
-    char[101] kmer;
-    int count;
-  }
-  KmerCount[] currentRows = new KmerCount[files.length];
-  char[] lineBuffer;
-  foreach (i; 0..currentRows.length){
-    //read in the line to teh buffer, so memory is not reallocated
-    files[i].readln(lineBuffer);
-    currentRows[i].kmer = chomp(files[i].readln(currentRows[i].kmer))[0..kmerLength];
-    currentRows[i].count = to!int(line[kmerLength+1..$]);
-  }
-  //write headers
-  enum ctr = ctRegex!(".*/(.+)");
-  foreach(f; filenames){
-    write("\t",match(f, ctr).captures[1]);
-  }
-  writeln();
-  string[] toPrint = new string[filenames.length+1];
-  int kmersCounted = 0;
-  while (!allFinished){
-    kmersCounted += 1;
-    if (kmersCounted % (1024*1024) == 0){stderr.writeln("Processed ",kmersCounted," kmers.");}
-    //Find the lowest kmer
-    string lowestKmer = null;
-    foreach (kc; currentRows){
-      if (lowestKmer == null || kc.kmer < lowestKmer){
-        lowestKmer = kc.kmer;
-      }
-    }
-    //Go through each file, printing the number of this kmer found
-    int totalObservations = 0;
-    toPrint[0] = lowestKmer;
-    foreach (i, kc; currentRows){
-      if (kc.kmer == lowestKmer){
-        totalObservations += kc.count;
-        if (usePercentages){
-          toPrint[i+1] = to!string(to!float(kc.count)/totalCounts[i]);
-        } else {
-          toPrint[i+1] = to!string(kc.count);
-        }
-        // Read a new line in, check if this file is finished
-        auto line = files[i].readln;
-        if (line == null){
-          finished[i] = true;
-          allFinished = true; //guilty until proven innocent
-          foreach(f; finished){
-            if (!f){
-              allFinished = false;
-            }
-          }
-          currentRows[i].kmer = null;
-          currentRows[i].count = -1;
-        } else {
-          //Regular line to be read in
-          currentRows[i].kmer = line[0..kmerLength];
-          currentRows[i].count = to!int(line[kmerLength+1..$-1]);
-        }
-      } else {
-        toPrint[i+1] = "0";
-      }
-    }
-    if (totalObservations >= minCount){
-      writeln(join(toPrint, "\t"));
-    }
-  }
-}}

data/bin/kmers_count_tabulate.rb DELETED

@@ -1,84 +0,0 @@
-#!/usr/bin/env ruby
-require 'optparse'
-require 'bio-logger'
-require 'csv'
-require 'progressbar'
-require 'tempfile'
-require 'systemu'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-# Parse command line options into the options hash
-options = {
-  :logger => 'stderr',
-  :log_level => 'info',
-  :min_count => 1,
-}
-o = OptionParser.new do |opts|
-  opts.banner = "
-    Usage: #{SCRIPT_NAME} <kmers_count_output1> [<kmers_count_output2> ..]
-    Take a list of files output from libngs' kmers_count tool, after being run through gnu sort.
-    Create a table, where the columns are each file, the rows are each kmer, and
-    the cells are the percent of that file's kmer actually is that kmer.\n\n"
-  opts.on("--output-file FILENAME", "Output file path [required]") do |arg|
-    options[:output_file] = arg
-  end
-  opts.on("--percentage", "description [default: #{options[:eg]}]") do
-  raise "not yet implemented"
-    options[:percentage_outputs] = true
-  end
-  opts.on("--min-count COUNT", "require at least this many kmers to be output into the output file [default: #{options[:min_count]}]") do |arg|
-  raise "not yet implemented"
-    options[:min_count] = arg.to_i
-  end
-  # logger options
-  opts.separator "\nVerbosity:\n\n"
-  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
-  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
-  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
-end; o.parse!
-if ARGV.length == 0 or options[:output_file].nil?
-  $stderr.puts o
-  exit 1
-end
-# Setup logging
-Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-filenames = ARGV
-raise "I need more than 1 file" unless filenames.length > 1
-log.info "Joining these files: #{filenames.inspect}"
-# run gnu join on each file
-current_build_file = filenames[0] #Build off the current build file first, then a tempfile subsequently
-Tempfile.open('kmers_join1') do |tempfile1|
-  Tempfile.open('kmers_join2') do |tempfile2|
-    filenames.each_with_index do |file, i|
-      next if i==0
-      first_file_output_fields = (2..(i+1)).to_a.collect{|n| "1.#{n.to_s}"}.join(',')
-      cmd = "join -a1 -a2 -e 0 -o0,#{first_file_output_fields},2.2 #{current_build_file.inspect} #{file} >#{tempfile2.path}"
-      log.info "At #{Time.now}, running #{cmd}.."
-      status, stdout, stderr = systemu cmd
-      raise stderr unless stderr == ''
-      raise 'exitstatus bad1!' unless status.exitstatus == 0
-      status, stdout, stderr = systemu "mv #{tempfile2.path} #{tempfile1.path}"
-      raise stderr unless stderr == ''
-      raise 'exitstatus bad2!' unless status.exitstatus == 0
-      current_build_file = tempfile1.path
-    end
-    status, stdout, stderr = systemu "mv #{current_build_file} #{options[:output_file]}"
-    raise stderr unless stderr == ''
-    raise 'exitstatus bad3!' unless status.exitstatus == 0
-  end
-end

data/bin/pcr_result_parser.rb DELETED

@@ -1,108 +0,0 @@
-#!/usr/bin/env ruby
-require 'optparse'
-require 'bio-logger'
-require 'csv'
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-# Parse command line options into the options hash
-options = {
-  :logger => 'stderr',
-  :log_level => 'info',
-}
-o = OptionParser.new do |opts|
-  opts.banner = "
-    Usage: #{SCRIPT_NAME} <arguments>
-    Takes a list of PCR primers that were put in several lanes (not all primers in all lanes), and a list of bands that were found, and decipher which bands are the result of which primer pairs, as best as possible\n\n"
-  opts.on("--bands-file FILE", "tsv file, with the band names as the first column, and the lane numbers that they appear in as the second column (comma separated) [required]") do |arg|
-    options[:bands_file] = arg
-  end
-  opts.on("--primers-file FILE", "tsv file, with the lane names as the first column, and the set of primers numbers that are in each lane as the second column (comma separated) [required]") do |arg|
-    options[:primers_file] = arg
-  end
-  # logger options
-  opts.separator "\nVerbosity:\n\n"
-  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
-  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
-  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
-end; o.parse!
-if ARGV.length != 0 or options[:bands_file].nil? or options[:primers_file].nil?
-  $stderr.puts o
-  exit 1
-end
-# Setup logging
-Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-# Read in the bands
-bands_to_lanes = {}
-CSV.foreach(options[:bands_file], :col_sep => "\t") do |row|
-  raise "Malformed bands file in this line: #{row.inspect}" unless row.length == 2
-  band_name = row[0]
-  raise "Two bands were labeled the same way, as #{band_name.inspect}" if bands_to_lanes.key?(band_name)
-  lanes_of_this_band = row[1].split(/[,\s]/).collect{|c| c.strip}
-  bands_to_lanes[band_name] = lanes_of_this_band
-end
-log.info "Parsed in #{bands_to_lanes.length} bands, found #{bands_to_lanes.collect{|k,v| v.length}.join(',')} lanes each, respectively"
-# Read in the primer sets
-lanes_to_primers = {}
-CSV.foreach(options[:primers_file], :col_sep => "\t") do |row|
-  raise "Malformed primers file in this line: #{row.inspect}" unless row.length == 2
-  lane_name = row[0]
-  raise "Two lanes were labeled the same way, as #{lane_name.inspect}" if lanes_to_primers.key?(lane_name)
-  primers_of_this_band = row[1].split(/[,\s]/).collect{|c| c.strip}
-  lanes_to_primers[lane_name] = primers_of_this_band
-end
-log.info "Parsed in #{lanes_to_primers.length} lanes, with #{lanes_to_primers.collect{|k,v| v.length}.join(',')} primers each, respectively"
-# Go through each pairing of primers. Which primer sets explain each band?
-all_primers = lanes_to_primers.values.flatten.sort.uniq
-lanes = lanes_to_primers.keys
-bands = bands_to_lanes.keys
-bands_to_explaining_primer_pairs = {}
-bands.each do |band|
-  log.debug "Going after band #{band}"
-  all_primers.combination(2) do |array|
-    primer1 = array.sort[0]
-    primer2 = array.sort[1]
-    log.debug "Testing primers #{primer1}/#{primer2} agreement with band #{band}"
-    band_agrees_with_this_primer_pair = true
-    lanes.each do |lane|
-      band_is_in_this_lane = bands_to_lanes[band].include?(lane)
-      primers_here = lanes_to_primers[lane]
-      if band_is_in_this_lane and (!primers_here.include?(primer1) or !primers_here.include?(primer2))
-        log.debug "primer pair #{primer1}/#{primer2} fails for band #{band} because band was present but at least one primer wasn't, in lane #{lane}"
-        band_agrees_with_this_primer_pair = false
-      end
-      if !band_is_in_this_lane and (primers_here.include?(primer1) and primers_here.include?(primer2))
-        log.debug "primer pair #{primer1}/#{primer2} fails for band #{band} because band was not present but both primers were, in lane #{lane}"
-        band_agrees_with_this_primer_pair = false
-      end
-    end
-    if band_agrees_with_this_primer_pair
-      log.debug "Found a suitable pair of primers for band #{band}: #{primer1}/#{primer2}"
-      bands_to_explaining_primer_pairs[band] ||= []
-      bands_to_explaining_primer_pairs[band].push array
-    end
-  end
-  puts [
-    band,
-    bands_to_explaining_primer_pairs[band].nil? ? 'none' : bands_to_explaining_primer_pairs[band].collect{|a| "(#{a.join(',')})"}.join(', ')
-  ].join("\t")
-end