finishm 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,92 +0,0 @@
1
#!/usr/bin/env ruby

require 'optparse'
require 'bio-logger'
require 'csv'

SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Encode which columns a kmer is confidently present in as an integer bitmask
# (bit i is set when counts[i] is above the upper threshold).
#
# counts          - Array of Integer abundances, one per column.
# upper_threshold - a count strictly greater than this means 'present'.
# lower_threshold - a count strictly less than this means 'not present'.
# log             - optional logger; when given and in debug mode, each set
#                   bit is reported.
#
# Returns the Integer bitmask, or nil when any count falls between the two
# thresholds (inclusive). Such a kmer is noise and must be ignored entirely,
# so no partially built mask is ever returned for it.
def presence_bitmask(counts, upper_threshold, lower_threshold, log = nil)
  mask = 0
  counts.each_with_index do |count, i|
    if count > upper_threshold
      mask |= (1 << i)
      if log && log.debug?
        log.debug "Found a passable for #{upper_threshold} in index #{i} for #{counts}, count is now #{mask}"
      end
    elsif count < lower_threshold
      # confidently absent from this column: leave the bit unset
    else
      # coverage was in no man's land between thresholds.
      return nil
    end
  end
  mask
end

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  opts.banner = "
Usage: #{SCRIPT_NAME} <kmer_multiple_abundance_file>

Given an input kmer then abundances space separated file, and a threshold, print out how many kmers are unique to different subsets of columns\n\n"

  opts.on("--upper-threshold ARG", "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg.to_i
  end
  opts.on("--lower-threshold ARG", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg.to_i
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
if ARGV.length != 1 || options[:upper_threshold].nil? || options[:lower_threshold].nil?
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)

# bitmask => number of kmers showing exactly that presence pattern
encoded_counts = Hash.new(0)
# widest row seen; used to zero-pad the binary representation on output
num_columns = 0

# '-' means read the table from standard input
input_file = (ARGV[0] == '-') ? $stdin : File.open(ARGV[0])
begin
  CSV.new(input_file, :col_sep => ' ').each do |row|
    next if row.empty? # blank line

    # row[0] is the kmer itself; everything after it is a per-column count
    counts = row[1..-1].map { |s| s.to_i }
    num_columns = counts.length if counts.length > num_columns

    mask = presence_bitmask(counts, options[:upper_threshold], options[:lower_threshold], log)
    # nil => noisy kmer, 0 => present nowhere; neither is reported
    next if mask.nil? || mask.zero?

    encoded_counts[mask] += 1
  end
ensure
  input_file.close unless input_file.equal?($stdin)
end

# Report only the patterns actually observed, in ascending mask order.
# (Iterating the observed keys also copes with nothing having passed.)
encoded_counts.keys.sort.each do |mask|
  puts [
    mask,
    encoded_counts[mask],
    mask.to_s(2).rjust(num_columns, '0'),
  ].join("\t")
end
@@ -1,52 +0,0 @@
1
#!/usr/bin/env rdmd

import std.stdio;
import std.csv;
import std.typecons;
import std.getopt;
import std.algorithm;
import std.array : split;
import std.conv : to;

/**
 * Filter a space separated "<kmer> <count> <count> ..." table, echoing to
 * stdout only those lines whose counts sum to at least --min-count.
 *
 * Any number of count columns is accepted, and every count column
 * (including the last one) contributes to the sum.
 *
 * Returns: 0 on success, 1 when no input file was given.
 * Throws: std.conv.ConvException when a count field is not an integer.
 */
int main(string[] args)
{
    // --percentage and --trace are parsed so existing command lines keep
    // working, but this filter does not use them.
    bool usePercentages = false;
    int minCount = 1;
    string trace = "info";
    getopt(args,
        "percentage", &usePercentages,
        "min-count", &minCount,
        "trace", &trace
    );

    if (args.length < 2) {
        stderr.writeln("Usage: ", args[0], " [--min-count N] <kmer_multiple_abundance_file>");
        return 1;
    }

    auto kmersFile = File(args[1]);

    long lineCount = 0;
    foreach (line; kmersFile.byLine()) {
        lineCount += 1;
        if (lineCount % (1024*1024) == 0){
            stderr.writeln("Parsed ",lineCount, " lines");
        }
        if (line.length == 0) {
            continue; // blank line: nothing to sum, nothing to print
        }

        // fields[0] is the kmer, the remainder are the per-sample counts
        auto fields = split(line, " ");
        long sum = 0;
        foreach (field; fields[1 .. $]) {
            sum += to!int(field);
        }
        if (sum >= minCount){
            writeln(line);
        }
    }
    return 0;
}
@@ -1,123 +0,0 @@
1
#!/usr/bin/env rdmd

import std.stdio;
import std.conv;
import std.string;
import std.regex;
import std.getopt;
import std.array : join;

/// The record currently at the head of one sorted "<kmer> <count>" file.
struct KmerCount {
    string kmer;      /// kmer text; only meaningful while !exhausted
    int count;        /// abundance of that kmer in this file
    bool exhausted;   /// true once the file has no more records
}

/**
 * Read the next "<kmer> <count>" record from f into kc.
 *
 * Returns: true when a record was read, false at end of file (kc is then
 *          marked exhausted).
 * Throws: Exception when the line has no space separated count.
 */
bool readKmerCount(ref File f, ref KmerCount kc)
{
    auto line = f.readln();
    if (line.length == 0) {
        kc.kmer = null;
        kc.count = -1;
        kc.exhausted = true;
        return false;
    }
    // chomp rather than slicing off one char: the last line of a file may
    // have no trailing newline
    auto stripped = chomp(line);
    auto sep = indexOf(stripped, ' ');
    if (sep <= 0) {
        throw new Exception("Malformed kmer count line: " ~ stripped);
    }
    kc.kmer = stripped[0 .. cast(size_t) sep];
    kc.count = to!int(stripped[cast(size_t) sep + 1 .. $]);
    kc.exhausted = false;
    return true;
}

/**
 * Sum the kmer counts of each input file and report the totals on stderr.
 *
 * With --table, additionally merge the (sorted) files into a tab separated
 * table on stdout: one row per kmer, one column per file, cells being the
 * count (or, with --percentage, the fraction of that file's total). Rows
 * whose summed count is below --min-count are omitted. Without --table only
 * the totals are reported, as before.
 */
void main(string[] args){
    bool usePercentages = false;
    int minCount = 1;
    string trace = "info"; // parsed for CLI compatibility; not used
    bool printTable = false;
    getopt(args,
        "percentage", &usePercentages,
        "min-count", &minCount,
        "trace", &trace,
        "table", &printTable
    );

    auto filenames = args[1 .. $];
    // long: totals over whole sequencing runs can exceed int range
    long[] totalCounts = new long[filenames.length];
    int kmerLength = 0;

    // First pass: total number of kmers observed in each file
    foreach(i, file; filenames){
        long count = 0;
        auto f = File(file);
        foreach (line; f.byLine()){
            if (line.length == 0) {
                continue;
            }
            auto sep = indexOf(line, ' ');
            if (sep <= 0) {
                throw new Exception("Malformed kmer count line in " ~ file);
            }
            if (kmerLength == 0){
                // detected from the data instead of being assumed
                kmerLength = cast(int) sep;
                stderr.writeln("Detected kmer length of ",kmerLength);
            }
            count += to!int(line[cast(size_t) sep + 1 .. $]);
        }
        totalCounts[i] = count;
    }
    stderr.writeln("Finished ccounting total kmers, totals were: ",totalCounts);

    if (!printTable) {
        return;
    }

    // Second pass: k-way merge of the sorted files
    File[] files = new File[filenames.length];
    foreach(i; 0 .. files.length){
        files[i] = File(filenames[i]);
    }

    KmerCount[] currentRows = new KmerCount[files.length];
    size_t remaining = 0; // number of files that still have records
    foreach (i; 0 .. currentRows.length){
        if (readKmerCount(files[i], currentRows[i])) {
            remaining += 1;
        }
    }

    //write headers: file name without its directory part
    auto ctr = regex(".*/(.+)");
    foreach(f; filenames){
        auto m = match(f, ctr);
        // a bare file name has no '/' and so no match; print it as is
        write("\t", m.empty ? f : m.captures[1]);
    }
    writeln();

    string[] toPrint = new string[filenames.length+1];
    long kmersCounted = 0;
    while (remaining > 0){
        kmersCounted += 1;
        if (kmersCounted % (1024*1024) == 0){stderr.writeln("Processed ",kmersCounted," kmers.");}

        //Find the lowest kmer among the files that still have records
        string lowestKmer = null;
        foreach (kc; currentRows){
            if (kc.exhausted) {
                continue;
            }
            if (lowestKmer is null || kc.kmer < lowestKmer){
                lowestKmer = kc.kmer;
            }
        }

        //Go through each file, printing the number of this kmer found
        long totalObservations = 0;
        toPrint[0] = lowestKmer;
        foreach (i, ref kc; currentRows){
            if (!kc.exhausted && kc.kmer == lowestKmer){
                totalObservations += kc.count;
                if (usePercentages){
                    toPrint[i+1] = to!string(to!float(kc.count)/totalCounts[i]);
                } else {
                    toPrint[i+1] = to!string(kc.count);
                }

                // Advance this file; note when it runs dry
                if (!readKmerCount(files[i], kc)){
                    remaining -= 1;
                }
            } else {
                toPrint[i+1] = "0";
            }
        }
        if (totalObservations >= minCount){
            writeln(join(toPrint, "\t"));
        }
    }
}
@@ -1,84 +0,0 @@
1
#!/usr/bin/env ruby

require 'optparse'
require 'bio-logger'
require 'csv'
require 'progressbar'
require 'tempfile'
require 'systemu'
require 'shellwords'

SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Build the shell command that joins the table built so far with one more
# kmers_count file, writing the widened table to output_path.
#
# built_path   - path of the table built so far (kmer + num_built_columns counts).
# next_path    - path of the next sorted kmers_count file to add.
# output_path  - path the joined table is redirected to.
# num_built_columns - how many count columns built_path already holds.
#
# Every path is shell-escaped, so names containing spaces or shell
# metacharacters are passed to join intact.
def join_command(built_path, next_path, output_path, num_built_columns)
  # -o list: the join key, each existing count column, then the new count
  built_fields = (2..(num_built_columns + 1)).map { |n| "1.#{n}" }.join(',')
  "join -a1 -a2 -e 0 -o0,#{built_fields},2.2 " \
    "#{Shellwords.escape(built_path)} #{Shellwords.escape(next_path)} " \
    ">#{Shellwords.escape(output_path)}"
end

# Run cmd through systemu, raising when it writes to stderr or exits non-zero.
#
# cmd             - the shell command String.
# failure_message - message raised when the exit status is not 0.
def run_or_raise(cmd, failure_message)
  status, _stdout, stderr = systemu cmd
  raise stderr unless stderr == ''
  raise failure_message unless status.exitstatus == 0
end

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
  :min_count => 1,
}
o = OptionParser.new do |opts|
  opts.banner = "
Usage: #{SCRIPT_NAME} <kmers_count_output1> [<kmers_count_output2> ..]

Take a list of files output from libngs' kmers_count tool, after being run through gnu sort.

Create a table, where the columns are each file, the rows are each kmer, and
the cells are the percent of that file's kmer actually is that kmer.\n\n"


  opts.on("--output-file FILENAME", "Output file path [required]") do |arg|
    options[:output_file] = arg
  end

  opts.on("--percentage", "output percentages rather than raw counts [not yet implemented]") do
    raise "not yet implemented"
  end
  opts.on("--min-count COUNT", "require at least this many kmers to be output into the output file [default: #{options[:min_count]}]") do |arg|
    raise "not yet implemented"
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
if ARGV.length == 0 || options[:output_file].nil?
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)

filenames = ARGV
raise "I need more than 1 file" unless filenames.length > 1
log.info "Joining these files: #{filenames.inspect}"

# run gnu join on each file
# The first join reads the first input file directly; every later join reads
# the table accumulated in tempfile1.
current_build_file = filenames[0]

Tempfile.open('kmers_join1') do |tempfile1|
  Tempfile.open('kmers_join2') do |tempfile2|
    filenames.each_with_index do |file, i|
      next if i == 0 # the first file is the starting table, not something to add

      # After i-1 joins the built table holds i count columns
      cmd = join_command(current_build_file, file, tempfile2.path, i)
      log.info "At #{Time.now}, running #{cmd}.."
      run_or_raise cmd, 'exitstatus bad1!'

      # The freshly joined table becomes the input of the next round
      run_or_raise "mv #{Shellwords.escape(tempfile2.path)} #{Shellwords.escape(tempfile1.path)}", 'exitstatus bad2!'
      current_build_file = tempfile1.path
    end

    run_or_raise "mv #{Shellwords.escape(current_build_file)} #{Shellwords.escape(options[:output_file])}", 'exitstatus bad3!'
  end
end
@@ -1,108 +0,0 @@
1
#!/usr/bin/env ruby

require 'optparse'
require 'bio-logger'
require 'csv'

SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Read a two column tsv whose first column is a unique name and whose second
# column is a comma and/or whitespace separated list.
#
# path        - path of the tsv file.
# file_label  - word used in the 'Malformed ... file' error ('bands', 'primers').
# entry_label - word used in the duplicate-name error ('bands', 'lanes').
#
# Returns a Hash of name => Array of list items. Runs of separators such as
# ", " count as one separator, so no empty items are produced; an empty
# second column gives an empty list.
# Raises RuntimeError on a row without exactly 2 columns or a repeated name.
def read_name_to_list_tsv(path, file_label, entry_label)
  mapping = {}
  CSV.foreach(path, :col_sep => "\t") do |row|
    raise "Malformed #{file_label} file in this line: #{row.inspect}" unless row.length == 2

    name = row[0]
    raise "Two #{entry_label} were labeled the same way, as #{name.inspect}" if mapping.key?(name)

    # to_s: CSV gives nil for an empty cell
    mapping[name] = row[1].to_s.split(/[,\s]+/).reject { |item| item.empty? }
  end
  mapping
end

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  opts.banner = "
Usage: #{SCRIPT_NAME} <arguments>

Takes a list of PCR primers that were put in several lanes (not all primers in all lanes), and a list of bands that were found, and decipher which bands are the result of which primer pairs, as best as possible\n\n"

  opts.on("--bands-file FILE", "tsv file, with the band names as the first column, and the lane numbers that they appear in as the second column (comma separated) [required]") do |arg|
    options[:bands_file] = arg
  end
  opts.on("--primers-file FILE", "tsv file, with the lane names as the first column, and the set of primers numbers that are in each lane as the second column (comma separated) [required]") do |arg|
    options[:primers_file] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
if ARGV.length != 0 || options[:bands_file].nil? || options[:primers_file].nil?
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)


# Read in the bands
bands_to_lanes = read_name_to_list_tsv(options[:bands_file], 'bands', 'bands')
log.info "Parsed in #{bands_to_lanes.length} bands, found #{bands_to_lanes.collect{|k,v| v.length}.join(',')} lanes each, respectively"

# Read in the primer sets
lanes_to_primers = read_name_to_list_tsv(options[:primers_file], 'primers', 'lanes')
log.info "Parsed in #{lanes_to_primers.length} lanes, with #{lanes_to_primers.collect{|k,v| v.length}.join(',')} primers each, respectively"


# Go through each pairing of primers. Which primer sets explain each band?
# A pair explains a band when, in every lane, the band is present exactly
# when both primers of the pair are present.
all_primers = lanes_to_primers.values.flatten.sort.uniq

bands_to_lanes.each do |band, lanes_with_band|
  log.debug "Going after band #{band}"
  explaining_pairs = []

  # all_primers is sorted, so each yielded pair is already in sorted order
  all_primers.combination(2) do |pair|
    primer1, primer2 = pair
    log.debug "Testing primers #{primer1}/#{primer2} agreement with band #{band}"

    pair_explains_band = true
    # Every lane is checked (no early exit) so that debug logging lists all
    # the lanes that contradict this pair
    lanes_to_primers.each do |lane, primers_here|
      band_present = lanes_with_band.include?(lane)
      both_primers_present = primers_here.include?(primer1) && primers_here.include?(primer2)

      if band_present && !both_primers_present
        log.debug "primer pair #{primer1}/#{primer2} fails for band #{band} because band was present but at least one primer wasn't, in lane #{lane}"
        pair_explains_band = false
      elsif !band_present && both_primers_present
        log.debug "primer pair #{primer1}/#{primer2} fails for band #{band} because band was not present but both primers were, in lane #{lane}"
        pair_explains_band = false
      end
    end

    if pair_explains_band
      log.debug "Found a suitable pair of primers for band #{band}: #{primer1}/#{primer2}"
      explaining_pairs.push pair
    end
  end

  puts [
    band,
    explaining_pairs.empty? ? 'none' : explaining_pairs.collect{|a| "(#{a.join(',')})"}.join(', ')
  ].join("\t")
end