finishm 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +19 -19
- data/VERSION +1 -1
- data/finishm.gemspec +631 -0
- data/lib/assembly/graph_generator.rb +0 -1
- data/lib/assembly/probed_graph.rb +0 -1
- metadata +99 -96
- data/bin/assembly_visualiser +0 -106
- data/bin/check_primer_combinations.rb +0 -73
- data/bin/contig_joiner.rb +0 -244
- data/bin/contigs_against_assembly.rb +0 -153
- data/bin/finishm_assembler +0 -55
- data/bin/finishm_gap_closer.rb +0 -241
- data/bin/kmer_abundance_file_tool.rb +0 -49
- data/bin/kmer_pattern_to_assembly.rb +0 -377
- data/bin/kmer_profile_finder.rb +0 -92
- data/bin/kmers_count_parse.d +0 -52
- data/bin/kmers_count_tabulate.d +0 -123
- data/bin/kmers_count_tabulate.rb +0 -84
- data/bin/pcr_result_parser.rb +0 -108
- data/bin/primer_finder.rb +0 -119
- data/bin/read_selection_by_kmer.d +0 -174
- data/bin/scaffold_by_pattern.rb +0 -119
- data/bin/scaffold_connection_possibilities_to_knowns.rb +0 -193
- data/bin/scaffold_end_coverages.rb +0 -69
- data/bin/trail_validator.rb +0 -84
data/bin/primer_finder.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'bio'
|
6
|
-
require 'progressbar'
|
7
|
-
|
8
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
9
|
-
require 'priner'
|
10
|
-
|
11
|
-
# Command-line entry point: read exactly one sequence from a fasta file and
# print every single oligo that ends in a GC clamp and whose melting
# temperature falls strictly between the configured minimum and maximum.
# Output is one "<oligo>\t<tm>" line per hit on STDOUT.
if __FILE__ == $0 #needs to be removed if this script is distributed as part of a rubygem
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

  # Parse command line options into the options hash. Only :logger,
  # :reverse_complement and :input_file can be changed from the command line;
  # the remaining entries are fixed design parameters.
  options = {
    :logger => 'stderr',
    :reverse_complement => false,
    :gc_clamp_length => 2,
    :min_melting_temperature => 56,
    :max_melting_temperature => 62,
    :min_primer_length => 15,
  }
  o = OptionParser.new do |opts|
    opts.banner = "
      Usage: #{SCRIPT_NAME} <arguments> <sequence>

      Take a sequence, and find an oligos that fit certain parameters. Sort of like primer3_core but only look for single oligos, not pairs.\n\n"

    opts.on("-s", "--sequence-file FILE", "Fasta file of sequences [required]") do |arg|
      options[:input_file] = arg
    end

    opts.separator "\nOptional arguments:\n\n"
    opts.on("-r", "--reverse-complement", "Design primers pointing backwards off the start of the sequence in reverse, not off the end forwards [default: #{options[:reverse_complement]}]") do
      options[:reverse_complement] = true
    end

    # logger options
    opts.separator "\nVerbosity:\n\n"
    opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {Bio::Log::CLI.trace('error')}
    opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
    opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| Bio::Log::CLI.trace(s)}
  end; o.parse!
  # Positional arguments are not accepted, and the sequence file is mandatory:
  # without it Bio::FlatFile.foreach below would be handed nil.
  if ARGV.length != 0 || options[:input_file].nil?
    $stderr.puts o
    exit 1
  end
  # Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree
  Bio::Log::CLI.logger(options[:logger]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)

  # Read the contigs in
  contigs = []
  Bio::FlatFile.foreach(options[:input_file]) do |entry|
    contigs.push entry.seq.to_s
  end
  log.info "Read in #{contigs.length} contigs from #{options[:input_file]}"
  unless contigs.length == 1
    raise "Expected exactly one sequence in #{options[:input_file]}, found #{contigs.length}"
  end

  seq = contigs[0]

  # Reverse complement if required
  if options[:reverse_complement]
    seq = Bio::Sequence::NA.new(seq).reverse_complement.to_s
  end
  seq.upcase!

  # \A and \z anchor the whole string; ^ and $ would only anchor a single line.
  raise "Whackiness in the supplied sequence! #{seq}" unless seq =~ /\A[ATGC]+\z/
  unless options[:min_primer_length] > options[:gc_clamp_length]
    raise "Minimum primer length must be greater than the GC clamp length"
  end

  # Find out all those positions that have a GC clamp of enough.
  # A position is the index of the LAST base of a candidate oligo, so the
  # first usable position is the one where a minimum-length oligo starts at 0.
  clamp_length = options[:gc_clamp_length]
  first_usable_position = options[:min_primer_length] - 1
  gc_clamped_positions = (first_usable_position...seq.length).select do |pos|
    seq[pos-clamp_length+1..pos] =~ /\A[GC]+\z/
  end
  log.info "Found #{gc_clamped_positions.length} positions with a suitable GC-clamp"

  # Find those with suitable melting temperatures
  designer = OligoDesigner.new
  progress = ProgressBar.new('primer_finding', gc_clamped_positions.length)
  gc_clamped_positions.each do |pos|
    # Iteratively make primers longer. Start with min primer length, end when the Tm exceeds the maximum allowable
    current_length = options[:min_primer_length]

    # The oligo spans pos-current_length+1 .. pos, so that start index must
    # not drop below zero (a negative index would silently wrap around).
    while pos-current_length+1 >= 0
      oligo = seq[pos-current_length+1..pos]
      tm = designer.melting_temperature oligo

      if tm > options[:min_melting_temperature]
        # making the primer longer can only result in higher melting temperatures, and we are already over the max
        break unless tm < options[:max_melting_temperature]

        # This is a hit. Report it in the orientation of the input sequence.
        cur_seq = oligo
        if options[:reverse_complement]
          cur_seq = Bio::Sequence::NA.new(cur_seq).reverse_complement.to_s
        end

        puts [
          cur_seq, tm
        ].join("\t")
      end
      current_length += 1
    end
    progress.inc
  end
  progress.finish

end #end if running as a script
|
@@ -1,174 +0,0 @@
|
|
1
|
-
#!/usr/bin/env rdmd
|
2
|
-
|
3
|
-
import std.stdio;
|
4
|
-
import std.string;
|
5
|
-
import std.conv;
|
6
|
-
import std.getopt;
|
7
|
-
import std.file;
|
8
|
-
import std.array;
|
9
|
-
import bio.core.fasta;
|
10
|
-
import bio.core.sequence;
|
11
|
-
import std.algorithm;
|
12
|
-
import std.container;
|
13
|
-
import std.c.stdlib;
|
14
|
-
|
15
|
-
/**
 * Select reads from a fasta file by kmer content.
 *
 * A read is printed to stdout (as ">header" then sequence) when it contains at
 * least one whitelisted kmer that has not yet been seen targetPerKmer times,
 * and, if a blacklist was given, its forward strand contains no blacklisted
 * kmer. Reading stops early once every whitelisted kmer has reached the
 * target count. Summary statistics go to stderr unless --quiet is given.
 *
 * Exits with status 1 when a required argument is missing, when the whitelist
 * is empty, or when kmers are not of uniform length.
 */
void main(string[] args){
  string whitelistFile, blacklistFile, fastaFile = null;
  bool help, verbose, quiet, debugging = false;
  int targetPerKmer = 1;
  int minLeftoverLength;
  getopt(args,
    "whitelist",  &whitelistFile,
    "blacklist",  &blacklistFile,
    "reads",  &fastaFile,
    "kmer-coverage-target",  &targetPerKmer,
    "min-leftover-length",  &minLeftoverLength,
    "verbose",  &verbose,
    "debug",  &debugging,
    "quiet",  &quiet,
    "help|h",  &help,
  );
  if(help){
    writeln("my helpa");
  }
  else if(whitelistFile is null){
    stderr.writeln("Error: Need to specify a newline-separeted list of whitelisted kmers as --whitelist <file>");
    exit(1); // a usage error must not look like success to the caller
  }
  else if(fastaFile is null){
    stderr.writeln("Error: Need to specify a fasta file of reads to work with as --reads <fasta_file>");
    exit(1);
  }
  else {
    // --debug implies --verbose, and --verbose overrides --quiet
    if(debugging) verbose = true;
    if(verbose) quiet=false;


    //read in a text file of kmers that we wish to find, the whitelist
    auto whites = split(cast(string) read(whitelistFile));
    // front and the seedless reduce calls below both fail on an empty list
    if (whites.length == 0){
      stderr.writeln("Error: No kmers found in the whitelist file ",whitelistFile);
      exit(1);
    }
    if (verbose)
      stderr.writeln("Read in ",whites.length," kmers as a whitelist, e.g. ",whites.front);


    // Find the minimum length of kmer being searched for
    auto whitelistMinLength = map!"a.length"(whites).reduce!"a<b ? a : b";
    auto whitelistMaxLength = map!"a.length"(whites).reduce!"a<b ? b : a";
    if (whitelistMinLength != whitelistMaxLength){
      stderr.writeln("Kmers must be of uniform length, but these ones weren't..");
      exit(1);
    }
    if (verbose)
      stderr.writeln("Minimum length of kmer in whitelist is ",whitelistMinLength);

    //if blacklistFile is specified, read in a list of kmers that are blacklisted, otherwise make an empty array
    bool[string] blacks;
    if(blacklistFile != null){
      foreach(kmer; split(cast(string) read(blacklistFile))){
        if (kmer.length != whitelistMinLength){
          stderr.writeln("Kmers (currently) must be of uniform length, but some blacklisted ones weren't..");
          exit(1);
        }
        blacks[kmer] = true;
      }
      // Report once after loading, not once per kmer
      if(verbose){
        if (blacks.length > 0)
          stderr.writeln("Read in ",blacks.length," blacklisted kmers e.g. ",blacks.keys.front);
        else
          stderr.writeln("Read in 0 blacklisted kmers");
      }
    } else {
      if(verbose)
        stderr.writeln("No blacklisted kmers specified");
    }

    // Number of times each whitelisted kmer has been counted so far
    int[string] whitelistCounts;
    foreach(white; whites){
      whitelistCounts[white] = 0;
    }
    int num_reads_whitelisted = 0;
    int num_reads_blacklisted = 0;

    //Iterate through the fasta reads given.
    auto fastas = fastaRecords(fastaFile);
    bool all_accounted_for = false;
    ptrdiff_t range_end;
    // Two candidate kmers per window (forward and reverse strand), plus the
    // reverse complement of each when a minimum leftover length is requested
    string[] kmers;
    if (minLeftoverLength)
      kmers = new string[4];
    else
      kmers = new string[2];
    foreach(seq; fastas){
      if (verbose)
        stderr.writeln("Inspecting ", seq);
      string fwd = seq.sequence;
      string rev = to!string(nucleotideSequence(fwd, true));

      // Number of window start positions to inspect in this read
      range_end = fwd.length - whitelistMinLength + 1;
      if (minLeftoverLength)
        range_end -= minLeftoverLength;
      if (range_end < 0) continue; //If the read is too short, then don't even bother comparing it
      if (debugging) stderr.writeln("Range end was ",range_end);

      //How many of each whitelist kmers are found (including in the reverse complement)?
      bool whitelisted = false;
      foreach(i; 0 .. range_end){
        kmers[0] = fwd[i .. (i+whitelistMinLength)];
        kmers[1] = rev[i .. (i+whitelistMinLength)];
        // if min leftover length is specified then search the reverse complement of the fwd as well
        if (minLeftoverLength){
          kmers[2] = to!string(nucleotideSequence(kmers[0], true));
          kmers[3] = to!string(nucleotideSequence(kmers[1], true));
        }
        foreach(kmer; kmers){
          if (debugging)
            stderr.writeln("Whitelist inspecting kmer ",kmer," at position ",i);
          // Only kmers still below the target make a read worth keeping
          if (kmer in whitelistCounts && whitelistCounts[kmer] < targetPerKmer){
            whitelisted = true;
            whitelistCounts[kmer] += 1;
            if (whitelistCounts[kmer] >= targetPerKmer){
              if(verbose)
                stderr.writeln("kmer index ",i," now accounted for");
              if (count!((x){return x<targetPerKmer;})(whitelistCounts.values) == 0){
                if(verbose)
                  stderr.writeln("All whitelisted kmers now accounted for");
                all_accounted_for = true; //all done, no more fasta entries required
              }
            }
          }
        }
      }
      if(!whitelisted) continue;
      else if (verbose) stderr.writeln("Read contains a valid whitelisted kmer");

      //If the read contains one of the blacklist kmers, then skip it.
      //I'm sure there is a faster way to search for an array of strings within a particular string, but eh for now.
      bool blacklisted = false;
      if (blacklistFile != null){
        // Only the forward strand is checked against the blacklist
        foreach(i; 0 .. fwd.length - whitelistMinLength+1){
          auto kmer = fwd[i .. (i+whitelistMinLength)];
          if (kmer in blacks){
            //blacklisted kmer found
            blacklisted = true;
            break;
          }
        }
      }

      if(blacklisted){
        num_reads_blacklisted += 1;
        if(verbose)
          stderr.writeln(fwd," contains a blacklisted kmer, not including this one");
        continue;
      } else {
        if(verbose)
          stderr.writeln(fwd," not blacklisted");
      }

      //print this sequence, as it is whitelisted and not blacklisted
      num_reads_whitelisted += 1;
      writeln(">", seq.header);
      writeln(fwd);

      if(all_accounted_for) break;
    }

    //output the number of kmers that were sufficiently covered

    ulong num_counted = count!("a >= b")(whitelistCounts.values, targetPerKmer);
    ulong num_not_counted = whitelistCounts.length - num_counted;
    if(!quiet){
      stderr.writeln("Found ",num_counted," from the whitelist as expected and ",num_not_counted," not enough times");
      stderr.writeln("There were ",num_reads_whitelisted," reads output, and ",num_reads_blacklisted," reads blacklisted");
    }
  }
}
|
data/bin/scaffold_by_pattern.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'csv'
|
6
|
-
require 'bio'
|
7
|
-
require 'tempfile'
|
8
|
-
require 'systemu'
|
9
|
-
|
10
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
11
|
-
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
12
|
-
require 'kmer_abundance_pattern'
|
13
|
-
|
14
|
-
# Parse command line options into the options hash.
# :number_of_kmers has no command line flag; every other key is either a
# default below or filled in by one of the switches.
options = {
  :logger => 'stderr',
  :log_level => 'info',
  :number_of_kmers => 100,
}
o = OptionParser.new do |opts|
  opts.banner = "
    Usage: #{SCRIPT_NAME} -f <scaffolds_fasta> -k <kmer_abundance_file>

    Take a fasta file of contigs, and a multiple kmer count file. Output the patterns each contig end shows up.\n\n"


  opts.on("-f FASTA_FILE", "Fasta file containing multiple sequences that we are attempting to scaffold together [required]") do |arg|
    options[:fasta_file] = arg
  end
  opts.on("-k KMER_FILE", "kmer frequencies [required]") do |arg|
    options[:kmer_file] = arg
  end
  opts.on("--kmer KMER_SIZE", "kmer length [required]") do |arg|
    options[:kmer_size] = arg.to_i
  end
  opts.on("--upper-threshold ARG", "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg.to_i
  end
  opts.on("--lower-threshold ARG", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg.to_i
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
# Print usage and fail if there are stray positional arguments or any of the
# five required settings is missing.
required_keys = [:fasta_file, :kmer_file, :upper_threshold, :lower_threshold, :kmer_size]
if ARGV.length != 0 || required_keys.any? { |key| options[key].nil? }
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
55
|
-
|
56
|
-
require 'shellwords'

# Look up the given kmers in the kmer abundance file and summarise the
# presence/absence patterns they show.
#
# kmers - Array of kmer strings to search for.
#
# Returns a String of tab-separated "<binary pattern>,<count>" entries, most
# frequent pattern first, or an empty String when no kmer was found.
# Raises RuntimeError if fgrep writes to stderr or exits with a status other
# than 0 (matches found) or 1 (no matches).
get_kmer_abundances_from_kmers = lambda do |kmers|
  # An empty pattern file would contain a single empty pattern, which fgrep
  # treats as "match every line" - so bail out before running it.
  return '' if kmers.empty?

  # fgrep the kmer abundance file for these particular ones
  patterns = Hash.new(0)
  Tempfile.open('kemrs') do |tempfile|
    tempfile.puts kmers.join "\n"
    tempfile.close

    # for each of the kmers that come back, output their pattern in the kmer abundance file
    grep_cmd = "fgrep -f #{Shellwords.escape tempfile.path} #{Shellwords.escape options[:kmer_file]}"
    log.debug "Running cmd with #{kmers.length} kmers: #{grep_cmd}"
    status, stdout, stderr = systemu grep_cmd
    raise stderr if stderr != ''
    # grep reports "no lines selected" with exit status 1; only higher
    # statuses indicate a real failure.
    unless [0, 1].include?(status.exitstatus)
      raise "fgrep failed with exit status #{status.exitstatus}"
    end
    num_kmers = stdout.split("\n").length
    log.debug "Finished grepping for kmers, found #{num_kmers} kmers"

    stdout.each_line do |line|
      CSV.parse(line, :col_sep => ' ') do |row|
        # Column 0 is skipped; the remaining columns are parsed as abundances
        pattern = KmerAbundancePattern.new
        pattern.parse_from_kmer_abundance row[1...row.length].collect{|a| a.to_f}, options[:lower_threshold], options[:upper_threshold]
        patterns[pattern.binary_string] += 1
      end
    end
  end
  patterns.sort{|a,b| b[1]<=>a[1]}.collect{|a| a.join(',')}.join("\t")
end
|
86
|
-
|
87
|
-
# For each of the sequences in the fasta file, print two tab-separated lines:
# "<definition>\tstart\t<patterns>" for the kmers at the start of the contig
# and "<definition>\tend\t<patterns>" for the kmers read off the reverse
# complement, i.e. from the end of the contig.
Bio::FlatFile.foreach(Bio::FastaFormat, options[:fasta_file]) do |seq|
  # Extract the first options[:number_of_kmers] kmers from the start
  kmers = []
  seq.to_biosequence.window_search(options[:kmer_size]) do |s|
    # Stop before pushing so that exactly number_of_kmers are collected
    break if kmers.length >= options[:number_of_kmers]
    kmers.push s.seq
  end
  patterns = get_kmer_abundances_from_kmers.call kmers
  puts [
    seq.definition,
    'start',
    patterns
  ].join("\t")

  # repeat for the end of the contig; these kmers are upcased before lookup
  kmers = []
  seq.naseq.reverse_complement.window_search(options[:kmer_size]) do |s|
    break if kmers.length >= options[:number_of_kmers]
    kmers.push s.seq.upcase
  end
  patterns = get_kmer_abundances_from_kmers.call kmers
  puts [
    seq.definition,
    'end',
    patterns
  ].join("\t")
end
|
@@ -1,193 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
require 'bio-logger'
|
5
|
-
require 'csv'
|
6
|
-
require 'pp'
|
7
|
-
require 'bio'
|
8
|
-
require 'pry'
|
9
|
-
|
10
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
11
|
-
|
12
|
-
$:.unshift File.join(ENV['HOME'],'git/yargraph/lib')
|
13
|
-
require 'yargraph'
|
14
|
-
|
15
|
-
# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  opts.banner = "
    Usage: #{SCRIPT_NAME} <arguments>

    Description of what this program does...\n\n"

  # "--connnections" (three n's) is the historical spelling and is kept as an
  # alias so that existing invocations continue to work.
  opts.on("-c", "--connections FILE", "--connnections FILE", "connections file [required]") do |arg|
    options[:connections_file] = arg
  end
  opts.on("-f", "--fasta FILE", "Fasta file of all contigs [required]") do |arg|
    options[:fasta_file] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
# Both input files are required. Checking the fasta file here avoids failing
# only after the graph analysis has already run.
if ARGV.length != 0 || options[:connections_file].nil? || options[:fasta_file].nil?
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
45
|
-
|
46
|
-
# One end of a contig: the contig's name plus which side ('start' or 'end'
# in this script) the probe sits on.
class Probe
  attr_accessor :contig_name, :side

  # Both arguments are optional so that the existing
  # "Probe.new, then assign the attributes" usage keeps working.
  #
  # contig_name - name of the contig this probe belongs to (default nil).
  # side        - side of the contig (default nil).
  def initialize(contig_name = nil, side = nil)
    @contig_name = contig_name
    @side = side
  end

  # Returns a two-element Array [contig_name, side]. Arrays with equal
  # contents compare and hash equal, which makes this form usable as a graph
  # vertex or set member where two distinct Probe objects would not be.
  def to_setable
    [@contig_name, @side]
  end
end
|
53
|
-
|
54
|
-
# Empty placeholder class; it defines no methods or state and nothing in this
# script instantiates it.
class ProbeSet; end
|
57
|
-
|
58
|
-
# Build an undirected graph whose vertices are [contig_name, side] pairs and
# whose edges are the connections listed in the tab-separated connections
# file. Connections from a contig end to itself are counted, not added.
graph = Yargraph::UndirectedGraph.new

num_probes_circular = 0
CSV.foreach(options[:connections_file], :col_sep => "\t") do |fields|
  # Three-column form, e.g.
  #   110811_E_1_D_nesoni_single_contig_1011407.start 110811_E_1_D_nesoni_single_contig_1030181.start 225
  # is expanded to the five-column form by splitting each name on its last dot.
  if fields.length == 3
    first = fields[0].match(/^(.+)\.(.+)$/)
    second = fields[1].match(/^(.+)\.(.+)$/)
    raise if first.nil? || second.nil?
    fields = [first[1], first[2], second[1], second[2], fields[2]]
  end

  # Five-column form, e.g. seq1 end seq23 start 6878
  raise unless fields.length == 5

  probe1 = Probe.new
  probe1.contig_name, probe1.side = fields[0], fields[1]

  probe2 = Probe.new
  probe2.contig_name, probe2.side = fields[2], fields[3]

  same_end = probe1.contig_name == probe2.contig_name && probe1.side == probe2.side
  if same_end
    num_probes_circular += 1
  else
    graph.add_edge probe1.to_setable, probe2.to_setable
  end
end
|
92
|
-
|
93
|
-
# Snapshot the vertices first, because edges are added to the graph below.
probes = graph.vertices.to_a

# Connect all starts to the ends: every contig seen in the connections gets
# an edge between its own 'start' and 'end' vertices.
probes.each do |vertex|
  name = vertex[0]

  contig_start = Probe.new
  contig_start.contig_name, contig_start.side = name, 'start'

  contig_end = Probe.new
  contig_end.contig_name, contig_end.side = name, 'end'

  graph.add_edge contig_start.to_setable, contig_end.to_setable
end
log.info "Removed #{num_probes_circular} connections that join a contig end to itself"
|
109
|
-
|
110
|
-
# First try the not computationally intensive way - can we find any?
edge_result = graph.some_edges_in_all_hamiltonian_cycles

# Keep only the edges that join two different contigs; the start-end edge
# inside a single contig carries no scaffolding information.
cross_contig_connections = []
edge_result.edges_in_all.each do |v1, v2|
  next if v1[0] == v2[0]
  cross_contig_connections << [v1, v2]
end

unless cross_contig_connections.empty?
  num_connections = cross_contig_connections.length

  log.info "Good news. Found #{num_connections} connections that are in all Hamiltonian paths (and thus can probably be scaffolded together):"
  cross_contig_connections.each do |from, to|
    log.info "#{from}\t#{to}"
  end
  # NOTE(review): this compares the connection count with the number of
  # vertices (contig ends), not the number of contigs - confirm that is intended.
  if num_connections == probes.length && edge_result.contains_hamcycle != false
    log.info "Extra good news. You just scaffolded your genome into a single scaffold"
  end
end
# Only an explicit false triggers the warning; any other value does not.
if edge_result.contains_hamcycle == false
  log.warn "Bad news. The connectivity graph contains no Hamiltonian cycles, and so the contigs cannot be scaffolded into one circular genome"
end
|
134
|
-
|
135
|
-
# Determine if there are any ends that don't connect to anything.
# Also records every contig name (first whitespace-delimited token of the
# fasta definition line) in contig_names.
contig_names = []
Bio::FlatFile.foreach(options[:fasta_file]) do |seq|
  name = seq.definition.split(/\s+/).first
  contig_names << name
  ['start', 'end'].each do |side|
    probe = Probe.new
    probe.contig_name, probe.side = name, side
    next unless graph.edges[probe.to_setable].empty?
    log.info "Unable to find any connections from #{probe.to_setable}"
  end
end
|
149
|
-
|
150
|
-
# Determine if there is any possible plasmids: contigs whose start and end
# each have exactly one neighbour in the graph, and that neighbour is the
# contig's own other end.
num_plasmids_found = 0
contig_names.each do |contig_name|
  forward = Probe.new
  forward.contig_name, forward.side = contig_name, 'start'
  reverse = Probe.new
  reverse.contig_name, reverse.side = contig_name, 'end'

  start_neighbours = graph.edges[forward.to_setable]
  # Both the start and the end must only connect to each other
  next unless start_neighbours.length == 1
  next unless graph.edges[reverse.to_setable].length == 1
  next unless start_neighbours.include?(reverse.to_setable)

  num_plasmids_found += 1
  log.info "Contig #{contig_name} appears to be circular and not connect to other contigs, suggesting it may be a plasmid"
end
log.info "Found #{num_plasmids_found} contigs that appear to be plasmids based on connectivity"
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
log.info "Attempting a better but more computationally intensive method of determining edges that are in all hamiltonian paths.."
# First try to see if there is any hamiltonian paths?
# Collect cycles until max_path_count have been found, then stop searching.
paths = []
max_path_count = 4
operation_limit = 50000
graph.hamiltonian_cycles(operation_limit) do |path|
  paths.push path
  # Stop as soon as the cap is reached, so that paths never holds more than
  # max_path_count cycles and no further searching is done.
  break if paths.length >= max_path_count
end

# NOTE(review): "exactly" assumes the search ran to completion; how
# hamiltonian_cycles behaves when operation_limit is hit is not visible here.
if paths.length < max_path_count
  log.info "Found exactly #{paths.length} Hamiltonian cycles"
else
  log.info "Gave up searching for Hamiltonian cycles as there are at least #{max_path_count} cycles"
end

# OK so
#edges_in_all = graph.edges_in_all_hamiltonian_cycles
|