finishm 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,73 +0,0 @@
#!/usr/bin/env ruby

# Checks one primer against every primer listed in a file using primer3's
# "check primers" mode (through Bio::Primer3.test_primer_compatibility).
# One tab-separated line per primer pair is written to STDOUT; a summary of
# compatible / incompatible / unchecked pairs goes through the logger.

require 'optparse'
require 'bio-logger'

# bio-primer3 is loaded from a checkout below the user's home directory.
$:.unshift File.join(ENV['HOME'],'git','bioruby-primer3','lib')
require 'bio-primer3'

if __FILE__ == $0 #needs to be removed if this script is distributed as part of a rubygem
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

  # Parse command line options into the options hash
  options = {
    :logger => 'stderr',
  }
  o = OptionParser.new do |opts|
    # The usage line names the long options that are actually defined below
    # (it previously advertised -p1/-f2, which OptionParser does not accept).
    opts.banner = [
      '',
      "Usage: #{SCRIPT_NAME} --primer1 <primer1> --primers2 <primer_list_file>",
      '',
      "Uses primer3's \"check primers\" to find whether primers match against each other",
      '',
      ''
    ].join("\n")

    opts.on("--primer1 PRIMER", "Primer on one side [required]") do |arg|
      options[:primer1] = arg
    end
    opts.on("--primers2 PRIMER_FILE", "A list of primers in a file, newline separated [required]") do |arg|
      options[:primers2_file] = arg
    end

    # logger options
    opts.separator "\nVerbosity:\n\n"
    opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {Bio::Log::CLI.trace('error')}
    opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
    opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| Bio::Log::CLI.trace(s)}
  end
  o.parse!

  # Both inputs are mandatory; without this check a missing --primers2 only
  # surfaced later as a TypeError when opening a nil path.
  if !ARGV.empty? || options[:primer1].nil? || options[:primers2_file].nil?
    $stderr.puts o
    exit 1
  end

  # Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree
  Bio::Log::CLI.logger(options[:logger])
  log = Bio::Log::LoggerPlus.new(LOG_NAME)
  Bio::Log::CLI.configure(LOG_NAME)

  # Read in input data. File.read closes the file handle itself; blank lines
  # are dropped because an empty string is not a primer.
  primers1 = [options[:primer1]]
  primers2 = File.read(options[:primers2_file]).split("\n").map { |line| line.strip }.reject { |primer| primer.empty? }
  log.info "Read in #{primers1.length} left primers and #{primers2.length} right primers e.g. #{primers1[0]} and #{primers2[0]}"

  goods = 0
  bads = 0
  failed_to_run = 0
  primers1.each do |primer1|
    primers2.each do |primer2|
      begin
        result, obj = Bio::Primer3.test_primer_compatibility primer1, primer2, 'PRIMER_EXPLAIN_FLAG'=>1

        puts [
          primer1, primer2, result, obj['PRIMER_LEFT_EXPLAIN'], obj['PRIMER_RIGHT_EXPLAIN']
        ].join "\t"

        if result
          goods += 1
        else
          bads += 1
        end
      rescue StandardError => e
        # Best effort: a pair that cannot be checked is counted, not fatal.
        # StandardError (rather than Exception) lets Ctrl-C and exit through.
        failed_to_run += 1
        log.debug "Failed to check #{primer1} against #{primer2}: #{e.class}: #{e.message}"
      end
    end
  end
  log.info "Found #{goods} OK primer pairs and #{bads} not OK primer pairs"
  log.warn "#{failed_to_run} weren't checked by Primer3 because it failed to run" if failed_to_run > 0
end #end if running as a script
@@ -1,244 +0,0 @@
#!/usr/bin/env ruby

require 'optparse'
require 'bio-logger'
require 'bio-velvet'
require 'tempfile'
require 'pp'

SCRIPT_NAME = File.basename(__FILE__)
LOG_NAME = 'finishm'
# Make the lib directory that sits next to this script's directory loadable.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
require 'priner'

# Default settings; some of them are overridden from the command line below.
options = {
  logger: 'stderr',
  log_level: 'info',
  velvet_kmer_size: 73, # TODO: these options should be exposed to the user, and perhaps not guessed at
  velvetg_arguments: '-read_trkg yes', # hack; formerly also: -exp_cov 41 -cov_cutoff 12.0973243610491
  contig_end_length: 300,
  output_assembly_path: 'velvetAssembly',
  graph_search_leash_length: 3000,
}

# Each entry is [switch, help text, key written into the options hash].
required_switches = [
  ["--reads FILE", "gzipped fastq file of reads to perform the re-assembly with [required]", :reads_file],
  ["--contigs FILE", "fasta file of contigs to be joined together [required]", :contigs_file],
]
optional_switches = [
  ["--output-trails-fasta PATH", "Output found paths to this file in fasta format [default: off]", :overall_trail_output_fasta_file],
  ["--already-assembled-velvet-directory PATH", "Skip until after assembly in this process, and start from this assembly directory created during a previous run of this script [default: off]", :previous_assembly],
  ["--serialize-velvet-graph FILE", "So that the velvet graph does not have to be reparsed, serialise the parsed object for later use in this file [default: off]", :serialize_parsed_graph_file],
  ["--already-serialized-velvet-graph FILE", "Restore the parsed velvet graph from this file [default: off]", :previously_serialized_parsed_graph_file],
]

o = OptionParser.new do |opts|
  opts.banner = [
    '',
    "Usage: #{SCRIPT_NAME} --reads <read_file> --contigs <contigs_file>",
    '',
    'Takes a set of reads and a set of contigs. Then it runs an assembly based on those reads,',
    'and tries to fill in possible gaps between the contigs. There may be multiple ways',
    'to join two contig ends together - in this that multiple cases are reported. ',
    '',
    ''
  ].join("\n")

  required_switches.each do |switch, description, key|
    opts.on(switch, description) { |value| options[key] = value }
  end

  opts.separator "\nOptional arguments:\n\n"
  optional_switches.each do |switch, description, key|
    opts.on(switch, description) { |value| options[key] = value }
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do
    options[:log_level] = 'error'
  end
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") do |name|
    options[:logger] = name
  end
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do |level|
    options[:log_level] = level
  end
end
o.parse!

# Show the usage and give up when stray arguments remain or a required file is missing.
missing_required_input = [:reads_file, :contigs_file].any? { |key| options[key].nil? }
if !ARGV.empty? || missing_required_input
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
Bio::Log::LoggerPlus.new('bio-velvet')
Bio::Log::CLI.configure('bio-velvet')

# Extract contig ends from each of the input contigs, so that the contig ends can be found in the
# assembly graph structure.
contig_ends = []
velvet_sequence_id_to_contig_end = {}
contig_lengths = {}
# Plain record describing one end of an input contig. All four attributes
# start out nil and are filled in by the code that creates the object.
class ContigEnd
  # Sequence of this contig end
  attr_accessor :sequence
  # Which side of the contig this is (the script assigns :start or :end)
  attr_accessor :start_or_end
  # Name of the contig this end was taken from
  attr_accessor :contig_name
  # Identifier this end's sequence is known by in the velvet assembly
  attr_accessor :velvet_sequence_id
end
# Main body: record the two ends of every input contig as anchors, assemble
# them together with the reads using velvet (or reuse an earlier assembly or
# serialised graph), find the graph node anchoring each contig end, search for
# trails between those nodes, then print a connection table to STDOUT and
# optionally write the trail sequences to a fasta file.

velvet_read_index = 1
anchor_length = options[:contig_end_length]

# Records one contig end and gives it the next velvet sequence ID (1, 2, ...).
add_contig_end = lambda do |contig_name, side, sequence|
  contig_end = ContigEnd.new
  contig_end.start_or_end = side
  contig_end.sequence = sequence
  contig_end.contig_name = contig_name
  contig_end.velvet_sequence_id = velvet_read_index
  velvet_sequence_id_to_contig_end[velvet_read_index] = contig_end
  contig_ends.push contig_end
  velvet_read_index += 1
end

Bio::FlatFile.foreach(options[:contigs_file]) do |seq|
  s = seq.seq
  contig_lengths[seq.definition] = s.length
  if s.length < anchor_length
    log.warn "Contig #{seq.definition} is shorter than the end length used to anchor the contig in the assembly. This is not ideal but may be ok."
    #TODO: fix this - should be counting from the middle. Should I just ignore those ones?
  end

  # Add the start of the contig (stored reverse complemented)
  add_contig_end.call(
    seq.definition, :start,
    Bio::Sequence::NA.new(s[0...anchor_length]).reverse_complement.to_s
  )

  # Add the back of the contig. The start index is clamped at 0: for a contig
  # shorter than the anchor length a negative out-of-range index makes the
  # slice nil, which left this anchor without any sequence.
  end_anchor_start = [s.length - anchor_length, 0].max
  add_contig_end.call(seq.definition, :end, s[end_anchor_start...s.length])
end
log.info "Parsed in #{contig_ends.length} contig ends from the two sides of each input contig"


graph = nil
if options[:previously_serialized_parsed_graph_file].nil?
  velvet_result = nil
  if options[:previous_assembly].nil? #If assembly has not already been carried out
    Tempfile.open('anchors.fa') do |tempfile|
      contig_ends.each do |contig_end|
        tempfile.puts ">anchor#{contig_end.velvet_sequence_id}"
        tempfile.puts contig_end.sequence
      end
      # velvet reads the anchors by path, so buffered output has to reach the
      # file before velvet is started.
      tempfile.flush

      log.info "Assembling sampled reads with velvet"
      # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
      velvet_result = Bio::Velvet::Runner.new.velvet(
        options[:velvet_kmer_size],
        "-short #{tempfile.path} -short2 -fastq.gz #{options[:reads_file]}",
        options[:velvetg_arguments],
        :output_assembly_path => options[:output_assembly_path]
      )
      if log.debug?
        log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
        log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
        log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
        log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
      end
      log.info "Finished running assembly"
    end
  else
    log.info "Using previous assembly stored at #{options[:previous_assembly]}"
    velvet_result = Bio::Velvet::Result.new
    velvet_result.result_directory = options[:previous_assembly]
  end

  # Profiling of the graph parse is a debugging aid: it only runs at debug log
  # level and reports to STDERR, so STDOUT carries nothing but the result table.
  profile_graph_parse = log.debug?
  if profile_graph_parse
    require 'ruby-prof'
    RubyProf.start
  end

  log.info "Parsing the graph output from velvet"
  graph = Bio::Velvet::Graph.parse_from_file(File.join(velvet_result.result_directory, 'Graph2'))
  log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

  if profile_graph_parse
    RubyProf::FlatPrinter.new(RubyProf.stop).print($stderr)
  end

  if options[:serialize_parsed_graph_file]
    log.info "Storing a binary version of the graph file for later use at #{options[:serialize_parsed_graph_file]}"
    File.open(options[:serialize_parsed_graph_file],'wb') do |f|
      f.print Marshal.dump(graph)
    end
    log.info "Stored a binary representation of the velvet graph at #{options[:serialize_parsed_graph_file]}"
  end
else
  log.info "Restoring graph file from #{options[:previously_serialized_parsed_graph_file]}.."
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only restore
  # graph files that this script wrote itself.
  # Binary mode mirrors the 'wb' used when writing; the block form closes the file.
  graph = File.open(options[:previously_serialized_parsed_graph_file], 'rb') do |f|
    Marshal.load(f)
  end
  log.info "Restoration complete"
end



# Find the anchoring nodes for each of the contig ends
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the each contig"
anchor_sequence_ids = contig_ends.collect{|c| c.velvet_sequence_id}
anchoring_nodes_and_directions = finder.find_unique_nodes_with_sequence_ids(graph, anchor_sequence_ids)
num_anchors_found = anchoring_nodes_and_directions.reject{|s,e| e[0].nil?}.length
# Anchoring node ID => ContigEnd. A contig end counts as not found when the
# first element of its entry is nil, which is the test used throughout here.
anchoring_node_id_to_contig_end = {}
anchoring_nodes_and_directions.each do |seq_id, node_and_direction|
  next if node_and_direction[0].nil? #skip when there is no node found in the graph for this contig end
  anchoring_node_id_to_contig_end[node_and_direction[0].node_id] = velvet_sequence_id_to_contig_end[seq_id]
end
log.info "Found anchoring nodes for #{num_anchors_found} out of #{contig_ends.length} contig ends"

log.info "Searching for trails between the nodes within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
trail_sets = cartographer.find_trails_between_node_set(graph, anchoring_nodes_and_directions.values.reject{|v| v[0].nil?}, options[:graph_search_leash_length])
log.info "Found #{trail_sets.reduce(0){|total,set| total + set.length}} trail(s) in total"

contig_end_id_to_partners = {}
# Tabulate all the partners each way (complete the previously triangular matrix)
trail_sets.each do |trail_set|
  trail_set.each do |trail|
    start_id = trail.first.node.node_id
    end_id = trail.last.node.node_id
    (contig_end_id_to_partners[start_id] ||= []).push anchoring_node_id_to_contig_end[end_id]
    (contig_end_id_to_partners[end_id] ||= []).push anchoring_node_id_to_contig_end[start_id]
  end
end

puts %w(contig_end_id contig_name contig_length connections).join "\t"
# NOTE(review): the partner table is keyed by graph node ID but is looked up by
# velvet sequence ID, and row i pairs trail_sets[i] with contig_ends[i]. Both
# presumably rely on anchors having node IDs 1,2,... with every anchor found -
# confirm against bio-velvet / the connection finder before relying on it.
trail_sets.each_with_index do |_trail_set, i|
  contig_end = contig_ends[i]
  partner_contig_ends = contig_end_id_to_partners[contig_end.velvet_sequence_id] || []
  # Each contig has 2 trail sets associated with it - one for the start and one for the end
  puts [
    contig_end.velvet_sequence_id,
    contig_end.contig_name,
    contig_lengths[contig_end.contig_name],
    partner_contig_ends.collect{|c| c.velvet_sequence_id}.sort.join(',')
  ].join("\t")
end

if options[:overall_trail_output_fasta_file]
  File.open(options[:overall_trail_output_fasta_file],'w') do |outfile|
    trail_sets.each do |trail_set|
      trail_set.each do |trail|
        begin
          trail_sequence = trail.sequence #Get the trail sequence first as this may not be possible.

          start_contig_end = anchoring_node_id_to_contig_end[trail.first.node.node_id]
          end_contig_end = anchoring_node_id_to_contig_end[trail.last.node.node_id]
          # Header format: >startContig_side:endContig_side
          outfile.puts ">#{start_contig_end.contig_name}_#{start_contig_end.start_or_end}:" \
                       "#{end_contig_end.contig_name}_#{end_contig_end.start_or_end}"
          outfile.puts trail_sequence
        rescue Bio::Velvet::NotImplementedException => e
          log.warn "Problem getting sequence of found trail #{trail.to_s}, skipping this trail: #{e.to_s}"
        end
      end
    end
  end
end
@@ -1,153 +0,0 @@
#!/usr/bin/env ruby

require 'optparse'
require 'bio-logger'
require 'systemu'

SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')

# Parse command line options into the options hash
options = {
  :logger => 'stderr',
  :log_level => 'info',
}
o = OptionParser.new do |opts|
  # The usage line lists the two options that are actually required below (it
  # previously advertised a -b <csv> option that this script does not define).
  opts.banner = [
    '',
    "Usage: #{SCRIPT_NAME} --query <contigs.fasta> --blastdb <blast_db_basename>",
    '',
    "Takes a set of contigs, and an assembly. Works out if there are any contigs where there is a blast hit spanning of the contigs using two of the assembly's contig ends.",
    '',
    ''
  ].join("\n")

  opts.on("--query FASTA_FILE", "new contigs fasta file [Required]") do |arg|
    options[:query_file] = arg
  end
  opts.on("--blastdb FASTA_FILE_FORMATTED", "basename of makeblastdb output [Required]") do |arg|
    options[:blastdb] = arg
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end
o.parse!

# Print the usage and exit when stray arguments remain or a required option is missing.
if !ARGV.empty? || options[:query_file].nil? || options[:blastdb].nil?
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)


# Read in the blast file
blast_results = []
# One alignment between a query and a subject sequence, with predicates that
# tell whether the alignment reaches to within +cutoff_inwards+ positions of
# either end of the query or of the subject.
#
# The coordinate and length attributes must be set to integers before any
# predicate is called. Alignments shorter than MINIMUM_ALIGNMENT_LENGTH never
# count as reaching an end.
class BlastResult
  # Default for how far inwards from a sequence end still counts as "the end"
  DEFAULT_CUTOFF_INWARDS = 500
  # Alignments shorter than this are ignored by every predicate
  MINIMUM_ALIGNMENT_LENGTH = 100

  attr_accessor :qseqid, :sseqid, :pident, :length, :mismatch, :gapopen, :qstart, :qend, :sstart, :subject_end, :evalue, :bitscore, :query_length, :subject_length

  attr_accessor :cutoff_inwards

  def initialize
    @cutoff_inwards = DEFAULT_CUTOFF_INWARDS
  end

  # True when the alignment extends to within cutoff_inwards of the subject's
  # last position. BLAST reports sstart > send for hits on the subject's minus
  # strand, so the larger of the two coordinates is the one compared.
  def hits_end_of_subject?
    long_enough? && [@sstart, @subject_end].max >= @subject_length - @cutoff_inwards
  end

  # True when the alignment extends to within cutoff_inwards of the subject's
  # first position, using the smaller subject coordinate for the same
  # strand-independence reason as hits_end_of_subject?.
  def hits_start_of_subject?
    long_enough? && [@sstart, @subject_end].min <= @cutoff_inwards
  end

  # True when the alignment extends to within cutoff_inwards of the query's
  # last position.
  def hits_end_of_query?
    long_enough? && @qend >= @query_length - @cutoff_inwards
  end

  # True when the alignment starts within cutoff_inwards of the query's first
  # position.
  def hits_start_of_query?
    long_enough? && @qstart <= @cutoff_inwards
  end

  private

  # Whether the alignment is long enough to be considered at all.
  def long_enough?
    @length >= MINIMUM_ALIGNMENT_LENGTH
  end
end
# Main body: run blastn for the query contigs against the assembly database,
# parse the tabular output into BlastResult objects, and print one
# tab-separated line for each query that latches on to exactly two subject
# ends in a consistent way.

require 'shellwords'

# Shell-escape every argument: String#inspect produces Ruby escapes, not shell
# quoting, so paths containing e.g. `$` or backticks were altered by the shell.
blast_command = Shellwords.join([
  'blastn',
  '-query', options[:query_file],
  '-db', options[:blastdb],
  '-outfmt', '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen',
  '-evalue', '1e-5'
])
status, blast_output, stderr = systemu blast_command
raise stderr unless stderr==""
raise "bad status running blast" unless status.exitstatus == 0
log.debug "Finished running blast, presumably successfully"

# Column order matches the -outfmt specification given to blastn above.
blast_columns = [:qseqid, :sseqid, :pident, :length, :mismatch, :gapopen, :qstart,
  :qend, :sstart, :subject_end, :evalue, :bitscore,
  :query_length, :subject_length]
integer_columns = [:length, :mismatch, :gapopen, :qstart,
  :qend, :sstart, :subject_end, :query_length, :subject_length]
float_columns = [:pident, :evalue, :bitscore]

blast_output.each_line do |line|
  res = BlastResult.new
  row = line.chomp.split "\t"
  blast_columns.each_with_index do |attr, i|
    value = row[i]
    # A missing column is nil, which to_i / to_f turn into 0 / 0.0.
    value = value.to_i if integer_columns.include?(attr)
    value = value.to_f if float_columns.include?(attr)
    res.public_send "#{attr}=", value
  end

  blast_results.push res
end
log.info "Parsed #{blast_results.length} blast results e.g. #{blast_results[0].inspect}"


# Group the hits by query sequence, keeping the order in which blast reported them.
query_to_blast_results = {}
blast_results.each do |result|
  (query_to_blast_results[result.qseqid] ||= []).push result
end

# For each query sequence, does it map to the ends of both contigs
# The header is printed lazily, just before the first result line, and set to nil afterwards.
header = %w(query subject1 subject2 qstart1? qend1? sstart1? send1? qstart2? qend2? sstart2? send2?).join("\t")
query_to_blast_results.each do |query_id, hits|
  keepers = []

  hits.each do |hit|
    q_start = hit.hits_start_of_query?
    q_end = hit.hits_end_of_query?
    s_start = hit.hits_start_of_subject?
    s_end = hit.hits_end_of_subject?

    # perfect if it hits the start or the end (but not both) of both the query and the subject, unless it is circular
    if (q_start ^ q_end) && (s_start ^ s_end)
      keepers.push hit
    elsif q_start || q_end || s_start || s_end
      log.info "There's a half-correct hit for #{query_id}: qstart? #{q_start} qend #{q_end} "+
        "sstart #{s_start} send #{s_end}, to subject sequence #{hit.sseqid}"
    end
  end

  case keepers.length
  when 0
    log.debug "no latchings found for #{query_id}"
  when 1
    log.info "Query #{query_id} only latches on to a single end, maybe manually inspect"
  when 2
    log.debug "Query #{query_id} has 2 keepers!"
    first, second = keepers
    # Consistent only when one keeper sits at the query start and the other
    # does not, and likewise for the subject start.
    if first.hits_start_of_query? != second.hits_start_of_query? &&
        first.hits_start_of_subject? != second.hits_start_of_subject?
      outs = keepers.collect{|hit|
        [
          hit.hits_start_of_query?,
          hit.hits_end_of_query?,
          hit.hits_start_of_subject?,
          hit.hits_end_of_subject?,
        ]
      }.flatten
      unless header.nil?
        puts header
        header = nil
      end
      puts [query_id, first.sseqid, second.sseqid, outs].flatten.join("\t")
    else
      log.info "Query #{query_id} has 2 keepers, but they are fighting it seems"
    end
  else
    log.info "More than 2 keepers found for #{query_id}, manual inspection likely required"
  end
end