finishm 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,73 +0,0 @@
1
#!/usr/bin/env ruby

# Tests one primer against a list of primers using primer3's "check primers"
# task, printing one tab-separated row per pair with primer3's explanations.

require 'optparse'
require 'bio-logger'

$:.unshift File.join(ENV['HOME'],'git','bioruby-primer3','lib')
require 'bio-primer3'

if __FILE__ == $0 #needs to be removed if this script is distributed as part of a rubygem
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

  # Parse command line options into the options hash
  options = {
    :logger => 'stderr',
  }
  o = OptionParser.new do |opts|
    # FIX: the usage line previously advertised "-p1"/"-f2", which are not
    # option names this parser accepts (--primer1 / --primers2 are).
    opts.banner = "
    Usage: #{SCRIPT_NAME} --primer1 <primer1> --primers2 <primer_list_file>

    Uses primer3's \"check primers\" to find whether primers match against each other\n\n"

    opts.on("--primer1 PRIMER", "Primer on one side [required]") do |arg|
      options[:primer1] = arg
    end
    opts.on("--primers2 PRIMER_FILE", "A list of primers in a file, newline separated [required]") do |arg|
      options[:primers2_file] = arg
    end

    # logger options
    opts.separator "\nVerbosity:\n\n"
    opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {Bio::Log::CLI.trace('error')}
    opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
    opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| Bio::Log::CLI.trace(s)}
  end; o.parse!
  # FIX: also require the two mandatory options, instead of crashing later
  # with a NoMethodError on nil when either is omitted.
  if ARGV.length != 0 or options[:primer1].nil? or options[:primers2_file].nil?
    $stderr.puts o
    exit 1
  end
  # Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree
  Bio::Log::CLI.logger(options[:logger]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)

  # Read in input data.
  # FIX: File.read does not leak an open file handle the way
  # File.open(path).read did.
  primers1 = [options[:primer1]]
  primers2 = File.read(options[:primers2_file]).split("\n").collect{|c| c.strip}
  log.info "Read in #{primers1.length} left primers and #{primers2.length} right primers e.g. #{primers1[0]} and #{primers2[0]}"

  goods = 0
  bads = 0
  failed_to_run = 0
  primers1.each do |primer1|
    primers2.each do |primer2|
      begin
        result, obj = Bio::Primer3.test_primer_compatibility primer1, primer2, 'PRIMER_EXPLAIN_FLAG'=>1

        # One row per tested pair: the two primers, whether they are
        # compatible, and primer3's left/right explanation strings.
        puts [
          primer1, primer2, result, obj['PRIMER_LEFT_EXPLAIN'], obj['PRIMER_RIGHT_EXPLAIN']
        ].join "\t"

        if result
          goods += 1
        else
          bads += 1
        end

      # FIX: rescue StandardError, not Exception - rescuing Exception also
      # swallows SignalException/SystemExit (e.g. Ctrl-C would be counted
      # as a primer3 failure instead of stopping the script).
      rescue StandardError => e
        failed_to_run += 1
      end
    end
  end
  log.info "Found #{goods} OK primer pairs and #{bads} not OK primer pairs"
  log.warn "#{failed_to_run} weren't checked by Primer3 because it failed to run" if failed_to_run > 0
end #end if running as a script
@@ -1,244 +0,0 @@
1
#!/usr/bin/env ruby

# Takes reads plus contigs, assembles contig-end "anchor" sequences together
# with the reads using velvet, then searches the assembly graph for trails
# joining pairs of contig ends. Prints a connection table to STDOUT and
# optionally the joining sequences to a fasta file.

require 'optparse'
require 'bio-logger'
require 'bio-velvet'
require 'tempfile'
require 'pp'

SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
require 'priner'

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
  :velvet_kmer_size => 73, #TODO: these options should be exposed to the user, and perhaps not guessed at
  :velvetg_arguments => '-read_trkg yes', # -exp_cov 41 -cov_cutoff 12.0973243610491', #hack
  :contig_end_length => 300,
  :output_assembly_path => 'velvetAssembly',
  :graph_search_leash_length => 3000,
}
o = OptionParser.new do |opts|
  opts.banner = "
Usage: #{SCRIPT_NAME} --reads <read_file> --contigs <contigs_file>

Takes a set of reads and a set of contigs. Then it runs an assembly based on those reads,
and tries to fill in possible gaps between the contigs. There may be multiple ways
to join two contig ends together - in that case multiple joins are reported. \n\n"

  opts.on("--reads FILE", "gzipped fastq file of reads to perform the re-assembly with [required]") do |arg|
    options[:reads_file] = arg
  end
  opts.on("--contigs FILE", "fasta file of contigs to be joined together [required]") do |arg|
    options[:contigs_file] = arg
  end

  opts.separator "\nOptional arguments:\n\n"
  opts.on("--output-trails-fasta PATH", "Output found paths to this file in fasta format [default: off]") do |arg|
    options[:overall_trail_output_fasta_file] = arg
  end
  opts.on("--already-assembled-velvet-directory PATH", "Skip until after assembly in this process, and start from this assembly directory created during a previous run of this script [default: off]") do |arg|
    options[:previous_assembly] = arg
  end
  opts.on("--serialize-velvet-graph FILE", "So that the velvet graph does not have to be reparsed, serialise the parsed object for later use in this file [default: off]") do |arg|
    options[:serialize_parsed_graph_file] = arg
  end
  opts.on("--already-serialized-velvet-graph FILE", "Restore the parsed velvet graph from this file [default: off]") do |arg|
    options[:previously_serialized_parsed_graph_file] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
if ARGV.length != 0 or options[:reads_file].nil? or options[:contigs_file].nil?
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
Bio::Log::LoggerPlus.new 'bio-velvet'; Bio::Log::CLI.configure 'bio-velvet'

# Extract contig ends from each of the input contigs, so that the contig ends
# can be found in the assembly graph structure.
contig_ends = []
velvet_sequence_id_to_contig_end = {}
contig_lengths = {}
# One end (:start or :end) of an input contig, plus the velvet read id its
# anchor sequence is given in the assembly (anchors are read in first, so
# their velvet sequence ids are 1, 2, ...).
class ContigEnd
  attr_accessor :sequence, :start_or_end, :contig_name, :velvet_sequence_id
end
velvet_read_index = 1
Bio::FlatFile.foreach(options[:contigs_file]) do |seq|
  contig_lengths[seq.definition] = seq.seq.length
  if seq.seq.length < options[:contig_end_length]
    log.warn "Contig #{seq.definition} is shorter than the end length used to anchor the contig in the assembly. This is not ideal but may be ok."
    #TODO: fix this - should be counting from the middle. Should I just ignore those ones?
  end
  # Add the start of the contig, reverse complemented so that the anchor
  # points outwards from the contig
  contig_end = ContigEnd.new
  contig_end.start_or_end = :start
  contig_end.sequence = Bio::Sequence::NA.new(seq.seq[0...options[:contig_end_length]]).reverse_complement.to_s
  contig_end.contig_name = seq.definition
  velvet_sequence_id_to_contig_end[velvet_read_index] = contig_end
  contig_end.velvet_sequence_id = velvet_read_index; velvet_read_index += 1
  contig_ends.push contig_end

  # Add the back of the contig
  contig_end = ContigEnd.new
  contig_end.start_or_end = :end
  s = seq.seq
  contig_end.sequence = s[s.length-options[:contig_end_length]...s.length]
  contig_end.contig_name = seq.definition
  velvet_sequence_id_to_contig_end[velvet_read_index] = contig_end
  contig_end.velvet_sequence_id = velvet_read_index; velvet_read_index += 1
  contig_ends.push contig_end
end
log.info "Parsed in #{contig_ends.length} contig ends from the two sides of each input contig"


graph = nil
if options[:previously_serialized_parsed_graph_file].nil?
  velvet_result = nil
  if options[:previous_assembly].nil? #If assembly has not already been carried out
    Tempfile.open('anchors.fa') do |tempfile|
      contig_ends.each do |contig_end|
        tempfile.puts ">anchor#{contig_end.velvet_sequence_id}"
        tempfile.puts contig_end.sequence
      end
      # FIX: flush the tempfile before velveth reads it by path - otherwise
      # buffered anchor sequences may not yet be on disk.
      tempfile.flush

      log.info "Assembling sampled reads with velvet"
      # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
      velvet_result = Bio::Velvet::Runner.new.velvet(
        options[:velvet_kmer_size],
        "-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
        options[:velvetg_arguments],
        :output_assembly_path => options[:output_assembly_path]
      )
      if log.debug?
        log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
        log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
        log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
        log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
      end
      log.info "Finished running assembly"
    end
  else
    log.info "Using previous assembly stored at #{options[:previous_assembly]}"
    velvet_result = Bio::Velvet::Result.new
    velvet_result.result_directory = options[:previous_assembly]
  end

  # FIX: removed leftover ruby-prof profiling code (require 'ruby-prof' /
  # RubyProf.start/stop + FlatPrinter). It depended on a gem this project
  # does not declare, and printed the profile to STDOUT, corrupting the
  # tab-separated connection table emitted below.
  log.info "Parsing the graph output from velvet"
  graph = Bio::Velvet::Graph.parse_from_file(File.join(velvet_result.result_directory, 'Graph2'))
  log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

  if options[:serialize_parsed_graph_file]
    log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
    File.open(options[:serialize_parsed_graph_file],'wb') do |f|
      f.print Marshal.dump(graph)
    end
    log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
  end
else
  log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
  # FIX: open with a block (and in binary mode, matching the 'wb' used when
  # serialising) so the file handle is closed rather than leaked.
  # NOTE(review): Marshal.load on untrusted files is unsafe - acceptable here
  # only because the file is one this script itself previously wrote.
  graph = File.open(options[:previously_serialized_parsed_graph_file], 'rb') do |f|
    Marshal.load(f)
  end
  log.info "Restoration complete"
end



# Find the anchoring nodes for each of the contig ends
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the each contig"
anchor_sequence_ids = contig_ends.collect{|c| c.velvet_sequence_id}
anchoring_nodes_and_directions = finder.find_unique_nodes_with_sequence_ids(graph, anchor_sequence_ids)
num_anchors_found = anchoring_nodes_and_directions.reject{|s,e| e[0].nil?}.length
anchoring_node_id_to_contig_end = {}
anchoring_nodes_and_directions.each do |seq_id, node_and_direction|
  next if node_and_direction[0].nil? #skip when there is no node found in the graph for this contig end
  anchoring_node_id_to_contig_end[node_and_direction[0].node_id] = velvet_sequence_id_to_contig_end[seq_id]
end
log.info "Found anchoring nodes for #{num_anchors_found} out of #{contig_ends.length} contig ends"

log.info "Searching for trails between the nodes within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
trail_sets = cartographer.find_trails_between_node_set(graph, anchoring_nodes_and_directions.values.reject{|v| v[0].nil?}, options[:graph_search_leash_length])
log.info "Found #{trail_sets.reduce(0){|s,set|s+=set.length}} trail(s) in total"

node_id_to_contig_description = {}
anchoring_nodes_and_directions.each do |seq_id, pair|
  next if pair.empty? #When no nodes were found
  node_id = pair[0].node_id
  node_id_to_contig_description[node_id] = velvet_sequence_id_to_contig_end[seq_id]
end
contig_end_id_to_partners = {}
# Tabulate all the partners each way (complete the previously triangular matrix)
trail_sets.each do |trail_set|
  trail_set.each do |trail|
    start_id = trail.first.node.node_id
    end_id = trail.last.node.node_id
    contig_end_id_to_partners[start_id] ||= []
    contig_end_id_to_partners[start_id].push node_id_to_contig_description[end_id]
    contig_end_id_to_partners[end_id] ||= []
    contig_end_id_to_partners[end_id].push node_id_to_contig_description[start_id]
  end
end

puts %w(contig_end_id contig_name contig_length connections).join "\t"
trail_sets.each_with_index do |trail_set, i|
  partner_contig_ends = contig_end_id_to_partners[contig_ends[i].velvet_sequence_id]
  partner_contig_ends ||= []
  # Each contig has 2 trail sets associated with it - one for the start and one for the end
  puts [
    contig_ends[i].velvet_sequence_id,
    contig_ends[i].contig_name,
    contig_lengths[contig_ends[i].contig_name],
    partner_contig_ends.collect{|c| c.velvet_sequence_id}.sort.join(',')
  ].join("\t")
end

if options[:overall_trail_output_fasta_file]
  File.open(options[:overall_trail_output_fasta_file],'w') do |outfile|
    trail_sets.each do |trail_set|
      trail_set.each do |trail|
        begin
          trail_sequence = trail.sequence #Get the trail sequence first as this may not be possible.

          start_id = trail.first.node.node_id
          end_id = trail.last.node.node_id
          start_contig_end = anchoring_node_id_to_contig_end[start_id]
          end_contig_end = anchoring_node_id_to_contig_end[end_id]
          # Header names the two joined contig ends, e.g. ">ctgA_start:ctgB_end"
          outfile.puts ">#{start_contig_end.contig_name}_#{start_contig_end.start_or_end}:#{end_contig_end.contig_name}_#{end_contig_end.start_or_end}"
          outfile.puts trail_sequence
        rescue Bio::Velvet::NotImplementedException => e
          log.warn "Problem getting sequence of found trail #{trail.to_s}, skipping this trail: #{e.to_s}"
        end
      end
    end
  end
end
243
-
244
-
@@ -1,153 +0,0 @@
1
#!/usr/bin/env ruby

# Runs blastn of new contigs against an existing assembly and reports query
# contigs whose hits "latch" onto the ends of two assembly contigs.

require 'optparse'
require 'bio-logger'
require 'systemu'

SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  # FIX: the usage line previously advertised "-b <blast csv>", but this
  # script runs blastn itself and actually requires --query and --blastdb.
  opts.banner = "
Usage: #{SCRIPT_NAME} --query <new_contigs.fasta> --blastdb <assembly_blastdb>

Takes a set of contigs, and an assembly. Works out if there are any contigs where there is a blast hit spanning of the contigs using two of the assembly's contig ends.\n\n"

  opts.on("--query FASTA_FILE", "new contigs fasta file [Required]") do |arg|
    options[:query_file] = arg
  end
  opts.on("--blastdb FASTA_FILE_FORMATTED", "basename of makeblastdb output [Required]") do |arg|
    options[:blastdb] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end; o.parse!
if ARGV.length != 0 or options[:query_file].nil? or options[:blastdb].nil?
  $stderr.puts o
  exit 1
end
# Setup logging
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)


# Accumulates one BlastResult per row of blastn tabular output
blast_results = []
43
# One row of blastn tabular (-outfmt 6) output. Attribute names follow
# blast's own column names, except the subject end coordinate, which is
# stored as subject_end because an attr_accessor named 'send' would clash
# with Ruby's Object#send.
class BlastResult
  # FIX: the minimum alignment length was a magic number (100) repeated in
  # all four predicates below; name it once so it can be changed in one place.
  MIN_ALIGNMENT_LENGTH = 100

  attr_accessor :qseqid, :sseqid, :pident, :length, :mismatch, :gapopen, :qstart, :qend, :sstart, :subject_end, :evalue, :bitscore, :query_length, :subject_length

  # Maximum distance from a sequence edge at which a hit still counts as
  # hitting that edge.
  attr_accessor :cutoff_inwards

  def initialize
    @cutoff_inwards = 500
  end

  # True when the alignment reaches (near) the subject's 3' end.
  def hits_end_of_subject?
    @subject_end >= @subject_length-@cutoff_inwards and @length >= MIN_ALIGNMENT_LENGTH
  end

  # True when the alignment reaches (near) the subject's 5' end.
  def hits_start_of_subject?
    @sstart <= @cutoff_inwards and @length >= MIN_ALIGNMENT_LENGTH
  end

  # True when the alignment reaches (near) the query's 3' end.
  def hits_end_of_query?
    @qend >= @query_length-@cutoff_inwards and @length >= MIN_ALIGNMENT_LENGTH
  end

  # True when the alignment reaches (near) the query's 5' end.
  def hits_start_of_query?
    @qstart <= @cutoff_inwards and @length >= MIN_ALIGNMENT_LENGTH
  end
end
68
-
69
status, blast_output, stderr = systemu "blastn -query #{options[:query_file].inspect} -db #{options[:blastdb].inspect} -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' -evalue 1e-5"
# FIX: check the exit status before the stderr text - a non-zero exit is
# always fatal, whereas stderr alone may only contain warnings; include the
# stderr text in the status error so failures are diagnosable.
raise "bad status running blast: #{stderr}" unless status.exitstatus == 0
raise stderr unless stderr==""
log.debug "Finished running blast, presumably successfully"

# Columns in the order given to -outfmt above, with the coercion each needs.
# FIX: coerce each field once during assignment, instead of assigning raw
# strings and then re-reading and re-assigning them in two extra passes.
columns = [:qseqid, :sseqid, :pident, :length, :mismatch, :gapopen, :qstart,
  :qend, :sstart, :subject_end, :evalue, :bitscore,
  :query_length, :subject_length]
int_fields = [:length, :mismatch, :gapopen, :qstart,
  :qend, :sstart, :subject_end, :query_length, :subject_length]
float_fields = [:pident, :evalue, :bitscore]

blast_output.each_line do |line|
  res = BlastResult.new
  row = line.chomp.split "\t"
  columns.each_with_index do |attr, i|
    value = row[i]
    value = value.to_i if int_fields.include?(attr)
    value = value.to_f if float_fields.include?(attr)
    res.send "#{attr}=".to_sym, value
  end

  blast_results.push res
end
log.info "Parsed #{blast_results.length} blast results e.g. #{blast_results[0].inspect}"


# Index the hits both by query sequence and by subject sequence
query_to_blast_results = {}
hit_to_blast_results = {}
blast_results.each do |result|
  query_to_blast_results[result.qseqid] ||= []
  query_to_blast_results[result.qseqid].push result

  hit_to_blast_results[result.sseqid] ||= []
  hit_to_blast_results[result.sseqid].push result
end

# For each query sequence, does it map to the ends of both contigs
# (header is printed lazily, only once the first latching pair is found)
header = %w(query subject1 subject2 qstart1? qend1? sstart1? send1? qstart2? qend2? sstart2? send2?).join("\t")
query_to_blast_results.each do |query_id, hits|
  keepers = []

  hits.each do |hit|
    # perfect if it hits the start or the end (but not both) of both the query and the subject, unless it is circular
    if hit.hits_start_of_query? ^ hit.hits_end_of_query? and
      hit.hits_start_of_subject? ^ hit.hits_end_of_subject?
      keepers.push hit
    elsif hit.hits_start_of_query? or hit.hits_end_of_query? or
      hit.hits_start_of_subject? or hit.hits_end_of_subject?
      log.info "There's a half-correct hit for #{query_id}: qstart? #{hit.hits_start_of_query?} qend #{hit.hits_end_of_query?} "+
        "sstart #{hit.hits_start_of_subject?} send #{hit.hits_end_of_subject?}, to subject sequence #{hit.sseqid}"
    end
  end

  if keepers.empty?
    log.debug "no latchings found for #{query_id}"
  elsif keepers.length == 1
    log.info "Query #{query_id} only latches on to a single end, maybe manually inspect"
  elsif keepers.length == 2
    log.debug "Query #{query_id} has 2 keepers!"
    # The two hits must use opposite ends of the query and opposite ends of
    # their subjects for the query to span a gap between two contig ends
    q = keepers.collect{|hit| hit.hits_start_of_query?}.join
    s = keepers.collect{|hit| hit.hits_start_of_subject?}.join
    if (q == 'truefalse' or q == 'falsetrue') and
      (s == 'truefalse' or s == 'falsetrue')
      outs = (0..1).collect{|i|
        [
          keepers[i].hits_start_of_query?,
          keepers[i].hits_end_of_query?,
          keepers[i].hits_start_of_subject?,
          keepers[i].hits_end_of_subject?,
        ]
      }.flatten
      unless header.nil?
        puts header
        header = nil
      end
      puts [query_id, keepers[0].sseqid, keepers[1].sseqid, outs].flatten.join("\t")
    else
      log.info "Query #{query_id} has 2 keepers, but they are fighting it seems"
    end
  else
    log.info "More than 2 keepers found for #{query_id}, manual inspection likely required"
  end
end