finishm 0.0.2 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/finishm +6 -2
- data/finishm.gemspec +3 -3
- data/lib/assembly/all_orfs.rb +61 -7
- data/lib/assembly/contig_printer.rb +18 -1
- data/lib/assembly/scaffold_breaker.rb +2 -1
- data/lib/finishm/orfs_finder.rb +50 -8
- data/lib/finishm/roundup.rb +7 -1
- data/lib/finishm/visualise.rb +4 -3
- data/spec/all_orfs_spec.rb +194 -5
- data/spec/contig_printer_spec.rb +21 -3
- data/spec/scaffold_breaker_spec.rb +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce5a708f7e7dbd95e357c0e58f94ef29c49d0cbf
|
4
|
+
data.tar.gz: ddf1f623a8045f626455e6c25bf78176c792407d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d73d9bbbc21761606fc5503d8818792d5a083f766e6a8c62336e17b50505c4c9c1897b79307f16f1e30b9f97b484695baf7ee46e009de5a375ba3660e26ede4
|
7
|
+
data.tar.gz: 718ad349ab3c87b1d370ed22eb4d7c68574f67ba6c0ab7a477db94211d12b9920e259c2f0aa05194df0e6e5ca79b5167f22f82708c65d50f2adc613b2a180ee1
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.4
|
data/bin/finishm
CHANGED
@@ -18,12 +18,16 @@ global = OptionParser.new do |opts|
|
|
18
18
|
opts.banner = "
|
19
19
|
Usage: #{SCRIPT_NAME} <command> [<arguments>]
|
20
20
|
|
21
|
-
FinishM is a collection of tasks related to assembly and metagenome assembly.
|
21
|
+
FinishM is a collection of tasks related to assembly and metagenome assembly. Common commands:
|
22
|
+
|
23
|
+
roundup\tImprove a genome by connecting scaffolds and gapfilling
|
24
|
+
visualise\tVisualise the DeBruijn graph
|
25
|
+
|
26
|
+
Other available commands:
|
22
27
|
|
23
28
|
wander\tTry to connect contigs (experimental)
|
24
29
|
gapfill\tFill assembly gaps (N characters) (experimental)
|
25
30
|
explore\tWhat happens in the graph beyond the end of my contig(s)? (experimental)
|
26
|
-
visualise\tVisualise the DeBruijn graph (experimental)
|
27
31
|
|
28
32
|
Commands for PCR finishing:
|
29
33
|
|
data/finishm.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: finishm 0.0.2 ruby lib
|
5
|
+
# stub: finishm 0.0.4 ruby lib
|
6
6
|
# stub: ext/mkrf_conf.rb
|
7
7
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = "finishm"
|
10
|
-
s.version = "0.0.2"
|
10
|
+
s.version = "0.0.4"
|
11
11
|
|
12
12
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.authors = ["Ben J. Woodcroft"]
|
15
|
-
s.date = "2015-
|
15
|
+
s.date = "2015-08-30"
|
16
16
|
s.description = "De-novo assemblies generally only provide draft genomes. FinishM is aimed at improving these draft assemblies."
|
17
17
|
s.email = "donttrustben near gmail.com"
|
18
18
|
s.executables = ["finishm"]
|
data/lib/assembly/all_orfs.rb
CHANGED
@@ -9,6 +9,32 @@ module Bio
|
|
9
9
|
CODON_LENGTH = 3
|
10
10
|
START_CODONS = ['ATG']
|
11
11
|
STOP_CODONS = ['TAG', 'TAA', 'TGA']
|
12
|
+
CODONS = {
|
13
|
+
'A' => ['GCT', 'GCC', 'GCA', 'GCG'],
|
14
|
+
'R' => ['CGT', 'CGC', 'CGA','CGG', 'AGA', 'AGG'],
|
15
|
+
'N' => ['AAT', 'AAC'],
|
16
|
+
'D' => ['GAT', 'GAC'],
|
17
|
+
'C' => ['TGT', 'TGC'],
|
18
|
+
'Q' => ['CAA', 'CAG'],
|
19
|
+
'E' => ['GAA', 'GAG'],
|
20
|
+
'G' => ['GGT', 'GGC', 'GGA', 'GGG'],
|
21
|
+
'H' => ['CAT', 'CAC'],
|
22
|
+
'I' => ['ATT', 'ATC', 'ATA'],
|
23
|
+
'L' => ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
|
24
|
+
'K' => ['AAA', 'AAG'],
|
25
|
+
'M' => ['ATG'],
|
26
|
+
'F' => ['TTT', 'TTC'],
|
27
|
+
'P' => ['CCT', 'CCC', 'CCA', 'CCG'],
|
28
|
+
'S' => ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
|
29
|
+
'T' => ['ACT', 'ACC', 'ACA', 'ACG'],
|
30
|
+
'W' => ['TGG'],
|
31
|
+
'Y' => ['TAT', 'TAC'],
|
32
|
+
'V' => ['GTT', 'GTC', 'GTA', 'GTG']
|
33
|
+
}
|
34
|
+
TRANSLATOR = CODONS.reduce({}) do |memo, pair|
|
35
|
+
pair[1].each{|key| memo[key] = pair[0]}
|
36
|
+
memo
|
37
|
+
end
|
12
38
|
|
13
39
|
# Search for open reading frames in a graph, in all the paths begining at a set of
|
14
40
|
# nodes through a graph (or a subset defined by range)
|
@@ -78,7 +104,7 @@ module Bio
|
|
78
104
|
max_num_paths = options[:max_gapfill_paths]
|
79
105
|
max_num_paths ||= 2196
|
80
106
|
max_cycles = options[:max_cycles] || 1
|
81
|
-
min_orf_length = options[:
|
107
|
+
min_orf_length = options[:min_orf_length] || 0
|
82
108
|
|
83
109
|
counter = SingleCoherentPathsBetweenNodesFinder::CycleCounter.new(max_cycles)
|
84
110
|
decide_stack = lambda do |to_push|
|
@@ -474,10 +500,11 @@ module Bio
|
|
474
500
|
return to_return
|
475
501
|
end
|
476
502
|
|
477
|
-
def orf_sequences_from_trails(trails)
|
478
|
-
to_return =
|
503
|
+
def orf_sequences_from_trails(trails, min_orf_length=nil)
|
504
|
+
to_return = []
|
479
505
|
trails.each do |trail|
|
480
506
|
fwd_sequence, twin_sequence = trail.otrail.sequences_within_path
|
507
|
+
trail_length = fwd_sequence.length
|
481
508
|
# forward / twin directions
|
482
509
|
[
|
483
510
|
[fwd_sequence, trail.fwd_orfs_result],
|
@@ -500,10 +527,12 @@ module Bio
|
|
500
527
|
end
|
501
528
|
name = "(#{onodes[0].to_shorthand}:#{pair[0].position_in_node}),#{onodes[1...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{pair[1].position_in_node})"
|
502
529
|
|
503
|
-
to_return[name
|
530
|
+
to_return.push [name, sequence[start_position...end_position]]
|
504
531
|
end
|
505
532
|
result.initial_stop_markers.each do |marker|
|
506
533
|
end_position = marker.position_in_trail
|
534
|
+
start_position = end_position % 3 #trim sequence to multiple of 3
|
535
|
+
next if min_orf_length and end_position - start_position < min_orf_length
|
507
536
|
|
508
537
|
# orf_name
|
509
538
|
last_node = nil
|
@@ -514,20 +543,29 @@ module Bio
|
|
514
543
|
end
|
515
544
|
name = "#{onodes[0...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{marker.position_in_node})"
|
516
545
|
|
517
|
-
to_return[name
|
546
|
+
to_return.push [name, sequence[start_position...end_position]]
|
518
547
|
end
|
519
548
|
result.final_start_markers.each do |marker|
|
520
549
|
start_position = marker.position_in_trail - 3
|
550
|
+
end_position = (trail_length - start_position) % 3
|
551
|
+
next if min_orf_length and trail_length - end_position - start_position < min_orf_length
|
521
552
|
|
522
553
|
# orf_name
|
523
554
|
onodes = trail.otrail.trail.drop_while{|onode| onode.node != marker.node}
|
524
555
|
name = "(#{onodes[0].to_shorthand}:#{marker.position_in_node}),#{onodes[1..-1].collect{|onode| onode.to_shorthand}.join(',')}"
|
556
|
+
to_return.push [name, sequence[start_position..-1-end_position]]
|
525
557
|
end
|
526
558
|
end
|
527
559
|
if result.nil? or (result.start_stop_pairs.empty? and result.final_start_markers.empty? and result.initial_stop_markers.empty?)
|
528
|
-
|
560
|
+
(0..2).each do |frame|
|
561
|
+
start_position = frame
|
562
|
+
end_position = (trail_length - start_position) % 3
|
563
|
+
next if min_orf_length and trail_length - end_position - start_position < min_orf_length
|
529
564
|
|
530
|
-
|
565
|
+
# orf_name
|
566
|
+
name = "#{trail.otrail.to_shorthand}"
|
567
|
+
to_return.push [name, sequence[start_position..-1-end_position]]
|
568
|
+
end
|
531
569
|
end
|
532
570
|
end
|
533
571
|
end
|
@@ -535,6 +573,22 @@ module Bio
|
|
535
573
|
return to_return
|
536
574
|
end
|
537
575
|
|
576
|
+
def sequence2AA(sequence)
|
577
|
+
remaining = sequence
|
578
|
+
aa = ""
|
579
|
+
while remaining.length > 0
|
580
|
+
codon = remaining[0...3]
|
581
|
+
log.debug "Found next codon #{codon}" if log.debug?
|
582
|
+
if not TRANSLATOR.has_key?(codon)
|
583
|
+
raise "Cannot translate invalid codon #{codon} in sequence #{sequence}."
|
584
|
+
end
|
585
|
+
log.debug "Codon translated to #{TRANSLATOR[codon]}" if log.debug?
|
586
|
+
aa += TRANSLATOR[codon]
|
587
|
+
remaining = remaining[3..-1]
|
588
|
+
end
|
589
|
+
return aa
|
590
|
+
end
|
591
|
+
|
538
592
|
# positions of last base of codons
|
539
593
|
class Marker
|
540
594
|
attr_accessor :position_in_trail, :position_in_node, :node
|
@@ -34,6 +34,23 @@ module Bio
|
|
34
34
|
# Enumerable of Enumerables of OrientedNode objects, each list of OrientedNode objects
|
35
35
|
# corresponds to a path that forms the connection
|
36
36
|
attr_accessor :paths
|
37
|
+
|
38
|
+
# Remove all except the path with maximal coverage from @paths
|
39
|
+
def collapse_paths_to_maximal_coverage_path!
|
40
|
+
return if @paths.nil? or @paths.empty?
|
41
|
+
get_coverage = lambda do |path|
|
42
|
+
numerator = 0
|
43
|
+
denominator = 0
|
44
|
+
path.each do |onode|
|
45
|
+
numerator += onode.node.coverage * onode.node.length_alone
|
46
|
+
denominator += onode.node.length_alone
|
47
|
+
end
|
48
|
+
numerator.to_f / denominator
|
49
|
+
end
|
50
|
+
@paths = [@paths.max do |path1, path2|
|
51
|
+
get_coverage.call(path1) <=> get_coverage.call(path2)
|
52
|
+
end]
|
53
|
+
end
|
37
54
|
end
|
38
55
|
|
39
56
|
# Given two contigs, return a consensus path and variants of the path.
|
@@ -293,7 +310,7 @@ module Bio
|
|
293
310
|
def clustalo(sequences)
|
294
311
|
i = 0
|
295
312
|
stdin = sequences.collect{|s| i+=1; ">#{i}\n#{s}\n"}.join('')
|
296
|
-
log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}"
|
313
|
+
log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" if log.debug?
|
297
314
|
stdout = Bio::Commandeer.run "clustalo -t DNA -i - --output-order=input-order", {:stdin => stdin, :log => log}
|
298
315
|
to_return = []
|
299
316
|
header = true
|
@@ -101,7 +101,8 @@ class Bio::FinishM::ScaffoldBreaker
|
|
101
101
|
|
102
102
|
unless seq.seq.match(/^[ATGCN]+$/i)
|
103
103
|
example = seq.seq.match(/([^ATGCN])/i)[1]
|
104
|
-
log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}
|
104
|
+
log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}. Replacing them with Ns"
|
105
|
+
seq.seq.gsub! /[^ATGCN]/i, 'N'
|
105
106
|
end
|
106
107
|
|
107
108
|
if seq.seq.match(/^N+$/i)
|
data/lib/finishm/orfs_finder.rb
CHANGED
@@ -2,21 +2,39 @@ class Bio::FinishM::ORFsFinder
|
|
2
2
|
include Bio::FinishM::Logging
|
3
3
|
|
4
4
|
DEFAULT_OPTIONS = {
|
5
|
-
:min_orf_length =>
|
5
|
+
:min_orf_length => 96
|
6
6
|
}
|
7
7
|
|
8
8
|
def add_options(optparse_object, options)
|
9
9
|
options.merge! Bio::FinishM::Visualise::DEFAULT_OPTIONS
|
10
10
|
options.merge! DEFAULT_OPTIONS
|
11
|
-
optparse_object.banner = "\nUsage: finishm find_orfs --
|
11
|
+
optparse_object.banner = "\nUsage: finishm find_orfs [--orf-amino-acids OUTPUT_FAA --orf-nucleotides OUTPUT_FNA]
|
12
12
|
|
13
13
|
Find possible open reading frames in assembly graph
|
14
14
|
\n\n"
|
15
15
|
|
16
|
-
optparse_object.separator "
|
16
|
+
optparse_object.separator "\nOutput sequence files\n\n"
|
17
|
+
optparse_object.on("--orf-amino-acids OUTPUT_FAA", "Output ORF amino acid sequences [default: orf.faa unless --orf-nucleotides is specified]") do |arg|
|
18
|
+
options[:output_faa] = arg
|
19
|
+
end
|
20
|
+
optparse_object.on("--orf-nucleotides OUTPUT_FNA", "Output ORF nucleotide sequences [default: orf.fna unless --orf-amino-acids is specified]") do |arg|
|
21
|
+
options[:output_fna] = arg
|
22
|
+
end
|
23
|
+
|
24
|
+
optparse_object.separator "\nInput genome information"
|
17
25
|
optparse_object.separator "\nIf an assembly is to be done, there must be some definition of reads:\n\n" #TODO improve this help
|
18
26
|
Bio::FinishM::ReadInput.new.add_options(optparse_object, options)
|
19
27
|
|
28
|
+
optparse_object.separator "\nOptional arguments:\n\n"
|
29
|
+
optparse_object.on("--min-orf-length", "Minimum ORF length [default: 96]") do |arg|
|
30
|
+
length = arg.to_i
|
31
|
+
if length.to_s != arg or length.nil? or length < 1
|
32
|
+
raise "Unable to parse minimum orf length parameter #{arg}, cannot continue"
|
33
|
+
end
|
34
|
+
options[:min_orf_length] = length
|
35
|
+
end
|
36
|
+
|
37
|
+
|
20
38
|
optparse_object.separator "\nOptional graph-exploration arguments:\n\n"
|
21
39
|
Bio::FinishM::Visualise.new.add_probe_options(optparse_object, options)
|
22
40
|
|
@@ -59,7 +77,18 @@ class Bio::FinishM::ORFsFinder
|
|
59
77
|
end
|
60
78
|
|
61
79
|
initial_onodes = Bio::FinishM::PathCounter.new.get_leash_start_nodes(finishm_graph, options[:range])
|
62
|
-
find_orfs_in_graph(finishm_graph, initial_onodes, options)
|
80
|
+
orfs = find_orfs_in_graph(finishm_graph, initial_onodes, options)
|
81
|
+
log.info "Found #{orfs.length} open reading frames longer than #{options[:min_orf_length]}."
|
82
|
+
if not options[:output_fna] and not options[:output_faa]
|
83
|
+
options[:output_fna] = 'orfs.fna'
|
84
|
+
end
|
85
|
+
|
86
|
+
if options[:output_fna]
|
87
|
+
write_orfs_to_file(orfs, options[:output_fna])
|
88
|
+
end
|
89
|
+
if options[:output_faa]
|
90
|
+
write_orfs_to_file(orfs, options[:output_faa], translate=true)
|
91
|
+
end
|
63
92
|
end
|
64
93
|
|
65
94
|
def find_orfs_in_graph(finishm_graph, initial_onodes, options={})
|
@@ -73,11 +102,24 @@ class Bio::FinishM::ORFsFinder
|
|
73
102
|
orf_trails = orfer.find_orfs_in_graph(finishm_graph.graph, initial_paths,
|
74
103
|
options[:min_orf_length], options[:range])
|
75
104
|
|
76
|
-
|
105
|
+
orfer.orf_sequences_from_trails(orf_trails, options[:min_orf_length])
|
106
|
+
end
|
77
107
|
|
78
|
-
|
79
|
-
|
80
|
-
|
108
|
+
def write_orfs_to_file(found_orfs, orfs_file, translate=false)
|
109
|
+
if translate
|
110
|
+
translator = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
111
|
+
end
|
112
|
+
File.open(orfs_file,'w') do |f|
|
113
|
+
counter = 0
|
114
|
+
found_orfs.each do |name_and_sequence|
|
115
|
+
counter += 1
|
116
|
+
f.puts ">finishm_orf_#{counter} #{name_and_sequence[0]}"
|
117
|
+
if translate
|
118
|
+
f.puts translator.sequence2AA(name_and_sequence[1][0...-3])
|
119
|
+
else
|
120
|
+
f.puts name_and_sequence[1]
|
121
|
+
end
|
122
|
+
end
|
81
123
|
end
|
82
124
|
end
|
83
125
|
|
data/lib/finishm/roundup.rb
CHANGED
@@ -10,6 +10,7 @@ class Bio::FinishM::RoundUp
|
|
10
10
|
:gapfill_only => false,
|
11
11
|
:max_explore_nodes => 10000,
|
12
12
|
:max_gapfill_paths => 10,
|
13
|
+
:gapfill_with_max_coverage => false,
|
13
14
|
}
|
14
15
|
|
15
16
|
def add_options(optparse_object, options)
|
@@ -64,6 +65,9 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
64
65
|
optparse_object.on("--max-explore-nodes NUM", Integer, "Only explore this many nodes. If max is reached, do not make connections. [default: #{options[:max_explore_nodes] }]") do |arg|
|
65
66
|
options[:max_explore_nodes] = arg
|
66
67
|
end
|
68
|
+
optparse_object.on("--gapfill-with-max-coverage", "When gapfilling, take the path with maximal coverage and do not print variants [default: #{options[:gapfill_with_max_coverage] }]") do
|
69
|
+
options[:gapfill_with_max_coverage] = true
|
70
|
+
end
|
67
71
|
optparse_object.on("--debug", "Build the graph, then drop to a pry console. [default: #{options[:debug] }]") do
|
68
72
|
options[:debug] = true
|
69
73
|
end
|
@@ -193,6 +197,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
193
197
|
# Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
|
194
198
|
scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
|
195
199
|
else
|
200
|
+
acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
196
201
|
scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
|
197
202
|
master_graph.graph,
|
198
203
|
scaffold_sequence,
|
@@ -321,7 +326,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
321
326
|
return gapfilled_sequence, num_gapfills, all_variants
|
322
327
|
end
|
323
328
|
|
324
|
-
def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths)
|
329
|
+
def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths, options)
|
325
330
|
scaffold_sequence = nil
|
326
331
|
gapfilled = -1
|
327
332
|
if aconn.paths.length == 0 or aconn.paths.length > max_gapfill_paths
|
@@ -329,6 +334,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
329
334
|
scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
|
330
335
|
gapfilled = false
|
331
336
|
else
|
337
|
+
acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
332
338
|
scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
|
333
339
|
master_graph.graph, first_sequence, aconn, second_sequence, master_graph.velvet_sequences
|
334
340
|
)
|
data/lib/finishm/visualise.rb
CHANGED
@@ -38,7 +38,8 @@ class Bio::FinishM::Visualise
|
|
38
38
|
return validate_argv_length(argv) ||
|
39
39
|
validate_visualisation_options(options) ||
|
40
40
|
validate_probe_options(options) ||
|
41
|
-
validate_assembly_options(options)
|
41
|
+
validate_assembly_options(options) ||
|
42
|
+
validate_scaffold_options(options)
|
42
43
|
end
|
43
44
|
|
44
45
|
def add_visualisation_options(optparse_object, options)
|
@@ -79,8 +80,8 @@ class Bio::FinishM::Visualise
|
|
79
80
|
|
80
81
|
def validate_scaffold_options(options)
|
81
82
|
# If scaffolds are defined, then probe genomes must also be defined
|
82
|
-
if options[:
|
83
|
-
return "If --scaffolds is
|
83
|
+
if options[:scaffold_sides] and !options[:assembly_files]
|
84
|
+
return "If --scaffolds is given, then --genomes must also be given"
|
84
85
|
end
|
85
86
|
end
|
86
87
|
|
data/spec/all_orfs_spec.rb
CHANGED
@@ -195,6 +195,127 @@ describe "AllOrfs" do
|
|
195
195
|
res.collect{|result| result.initial_start_markers}.should == [[],[]]
|
196
196
|
end
|
197
197
|
|
198
|
+
it 'should find two same-phase orfs along a trail' do
|
199
|
+
graph, = GraphTesting.emit_otrails([
|
200
|
+
[1,2,3]
|
201
|
+
])
|
202
|
+
graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
|
203
|
+
graph.nodes[2].ends_of_kmers_of_node = 'AATAAATGGA' #stop codon 'TAA', start codon 'ATG'
|
204
|
+
graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
|
205
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
206
|
+
|
207
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
208
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
209
|
+
#pp problems
|
210
|
+
|
211
|
+
paths = orfer.find_orfs_from_problems(problems)
|
212
|
+
#pp paths
|
213
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
214
|
+
[1,2,3]
|
215
|
+
]
|
216
|
+
res = paths.trails[0].fwd_orfs_result
|
217
|
+
GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
|
218
|
+
[6,15],
|
219
|
+
[18,30]
|
220
|
+
]
|
221
|
+
GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
|
222
|
+
[[1,6],[2,5]],
|
223
|
+
[[2,8], [3,10]]
|
224
|
+
]
|
225
|
+
res.initial_start_markers.should == []
|
226
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
|
227
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
|
228
|
+
res.final_start_markers.should == []
|
229
|
+
end
|
230
|
+
|
231
|
+
it 'should end orfs at first stop codon in forward direction' do
|
232
|
+
graph, = GraphTesting.emit_otrails([
|
233
|
+
[1,2,3]
|
234
|
+
])
|
235
|
+
graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
|
236
|
+
graph.nodes[2].ends_of_kmers_of_node = 'AATAAAAAGA' #stop codon 'TAA'
|
237
|
+
graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
|
238
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
239
|
+
|
240
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
241
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
242
|
+
#pp problems
|
243
|
+
|
244
|
+
paths = orfer.find_orfs_from_problems(problems)
|
245
|
+
#pp paths
|
246
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
247
|
+
[1,2,3]
|
248
|
+
]
|
249
|
+
res = paths.trails[0].fwd_orfs_result
|
250
|
+
GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
|
251
|
+
[6,15]
|
252
|
+
]
|
253
|
+
GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
|
254
|
+
[[1,6],[2,5]]
|
255
|
+
]
|
256
|
+
res.initial_start_markers.should == []
|
257
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
|
258
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
|
259
|
+
res.final_start_markers.should == []
|
260
|
+
end
|
261
|
+
|
262
|
+
it 'should end orfs at first stop codon in twin direction' do
|
263
|
+
graph = GraphTesting.emit([
|
264
|
+
[1,2],
|
265
|
+
[2,3]
|
266
|
+
])
|
267
|
+
graph.nodes[1].ends_of_kmers_of_twin_node = 'TTAGTTTTTT' # stop codon 'TAG'
|
268
|
+
graph.nodes[2].ends_of_kmers_of_twin_node = 'TTTAGTTTTT' # stop codon 'TAG'
|
269
|
+
graph.nodes[3].ends_of_kmers_of_twin_node = 'TAAATGTTTT' # stop codon 'TAA', start codon 'ATG'
|
270
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
271
|
+
|
272
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
273
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
274
|
+
#pp problems
|
275
|
+
|
276
|
+
paths = orfer.find_orfs_from_problems(problems)
|
277
|
+
#pp paths
|
278
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
279
|
+
[1,2,3]
|
280
|
+
]
|
281
|
+
res = paths.trails[0].twin_orfs_result
|
282
|
+
GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
|
283
|
+
[6,15]
|
284
|
+
]
|
285
|
+
GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
|
286
|
+
[[3,6],[2,5]]
|
287
|
+
]
|
288
|
+
res.initial_start_markers.should == []
|
289
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
|
290
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[3,3]]
|
291
|
+
res.final_start_markers.should == []
|
292
|
+
end
|
293
|
+
|
294
|
+
it 'should return the first initial stop codon in forward direction' do
|
295
|
+
graph = GraphTesting.emit([
|
296
|
+
[1,2],
|
297
|
+
[2,3]
|
298
|
+
])
|
299
|
+
graph.nodes[1].ends_of_kmers_of_node = 'AAATAGAAAA' # stop codon 'TAG'
|
300
|
+
graph.nodes[2].ends_of_kmers_of_node = 'AATAGAAAAA' # stop codon 'TAG'
|
301
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
302
|
+
|
303
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
304
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
305
|
+
#pp problems
|
306
|
+
|
307
|
+
paths = orfer.find_orfs_from_problems(problems)
|
308
|
+
#pp paths
|
309
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
310
|
+
[1,2,3]
|
311
|
+
]
|
312
|
+
res = paths.trails[0].fwd_orfs_result
|
313
|
+
res.start_stop_pairs.should == []
|
314
|
+
res.initial_start_markers.should == []
|
315
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [6]
|
316
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,6]]
|
317
|
+
res.final_start_markers.should == []
|
318
|
+
end
|
198
319
|
|
199
320
|
it 'should respect terminal nodes' do
|
200
321
|
fail '#todo'
|
@@ -433,11 +554,79 @@ describe "AllOrfs" do
|
|
433
554
|
|
434
555
|
paths = orfer.find_orfs_from_problems(problems)
|
435
556
|
#pp paths
|
436
|
-
orfer.orf_sequences_from_trails(paths.trails).should ==
|
437
|
-
'(1s:6),2s,(3s:10)'
|
438
|
-
'1s,
|
439
|
-
',
|
440
|
-
|
557
|
+
orfer.orf_sequences_from_trails(paths.trails).should == [
|
558
|
+
['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
|
559
|
+
[',(1s:3)', 'TAA'],
|
560
|
+
['1s,2s,3s', 'T'*30],
|
561
|
+
['1s,2s,3s', 'T'*27],
|
562
|
+
['1s,2s,3s', 'T'*27]
|
563
|
+
]
|
564
|
+
end
|
565
|
+
|
566
|
+
it 'should respect minimum orf length' do
|
567
|
+
graph = GraphTesting.emit([
|
568
|
+
[1,2],
|
569
|
+
[2,3]
|
570
|
+
])
|
571
|
+
graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
|
572
|
+
graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
|
573
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
574
|
+
|
575
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
576
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
577
|
+
|
578
|
+
paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 30)
|
579
|
+
orfer.orf_sequences_from_trails(paths.trails, 30).should == [
|
580
|
+
['1s,2s,3s', 'T'*30]
|
581
|
+
]
|
582
|
+
|
583
|
+
paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 20)
|
584
|
+
orfer.orf_sequences_from_trails(paths.trails, 20).should == [
|
585
|
+
['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
|
586
|
+
['1s,2s,3s', 'T'*30],
|
587
|
+
['1s,2s,3s', 'T'*27],
|
588
|
+
['1s,2s,3s', 'T'*27]
|
589
|
+
]
|
590
|
+
|
591
|
+
paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 0)
|
592
|
+
orfer.orf_sequences_from_trails(paths.trails, 0).should == [
|
593
|
+
['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
|
594
|
+
[',(1s:3)', 'TAA'],
|
595
|
+
['1s,2s,3s', 'T'*30],
|
596
|
+
['1s,2s,3s', 'T'*27],
|
597
|
+
['1s,2s,3s', 'T'*27]
|
598
|
+
]
|
599
|
+
|
600
|
+
end
|
601
|
+
end
|
602
|
+
|
603
|
+
describe 'sequence2AA' do
|
604
|
+
it 'should return corresponding amino acids for an orf sequence' do
|
605
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
606
|
+
orfer.sequence2AA('GCTGCCGCAGCG').should == 'AAAA'
|
607
|
+
orfer.sequence2AA('CGTCGCCGACGGAGAAGG').should == 'RRRRRR'
|
608
|
+
orfer.sequence2AA('AATAAC').should == 'NN'
|
609
|
+
orfer.sequence2AA('GATGAC').should == 'DD'
|
610
|
+
orfer.sequence2AA('TGTTGC').should == 'CC'
|
611
|
+
orfer.sequence2AA('CAACAG').should == 'QQ'
|
612
|
+
orfer.sequence2AA('GAAGAG').should == 'EE'
|
613
|
+
orfer.sequence2AA('GGTGGCGGAGGG').should == 'GGGG'
|
614
|
+
orfer.sequence2AA('CATCAC').should == 'HH'
|
615
|
+
orfer.sequence2AA('ATTATCATA').should == 'III'
|
616
|
+
orfer.sequence2AA('TTATTGCTTCTCCTACTG').should == 'LLLLLL'
|
617
|
+
orfer.sequence2AA('AAAAAG').should == 'KK'
|
618
|
+
orfer.sequence2AA('ATG').should == 'M'
|
619
|
+
orfer.sequence2AA('TTTTTC').should == 'FF'
|
620
|
+
orfer.sequence2AA('CCTCCCCCACCG').should == 'PPPP'
|
621
|
+
orfer.sequence2AA('TCTTCCTCATCGAGTAGC').should == 'SSSSSS'
|
622
|
+
orfer.sequence2AA('ACTACCACAACG').should == 'TTTT'
|
623
|
+
orfer.sequence2AA('TGG').should == 'W'
|
624
|
+
orfer.sequence2AA('TATTAC').should == 'YY'
|
625
|
+
orfer.sequence2AA('GTTGTCGTAGTG').should == 'VVVV'
|
626
|
+
lambda { orfer.sequence2AA('TAA') }.should raise_error
|
627
|
+
lambda { orfer.sequence2AA('TGA') }.should raise_error
|
628
|
+
lambda { orfer.sequence2AA('TAG') }.should raise_error
|
629
|
+
lambda { orfer.sequence2AA('ABCXYZ') }.should raise_error
|
441
630
|
end
|
442
631
|
end
|
443
632
|
end
|
data/spec/contig_printer_spec.rb
CHANGED
@@ -42,7 +42,7 @@ describe "ContigPrinter" do
|
|
42
42
|
'14S:GTT',
|
43
43
|
].sort
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
it 'should handle not variants' do
|
47
47
|
seqs = [
|
48
48
|
'ATGAATATGTGCATAGGATT',
|
@@ -235,7 +235,7 @@ describe "ContigPrinter" do
|
|
235
235
|
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
|
236
236
|
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
|
237
237
|
]
|
238
|
-
expected =
|
238
|
+
expected =
|
239
239
|
'ATGAACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGAGACCTTCGGGTCTAGTGGCGCACGGGTGCGTAACGCGTGGGAATCTGCCCTTGGGTACGG'+
|
240
240
|
'AATAACAGTTAGAAATGACTGCTAATACCGTATAATGACTTCGGTCCAAAGATTTATCGCCCAGGGATGAGCCCGCGTAGGATTAGCTTGTTGGTGAGGTAAANN'+
|
241
241
|
'NTNNCNNANNNNNNNNNNNNTNNNNNGNNNNNNNNNNNGNTNAGNNNCNNNGNNNNNGNGANNTGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGC'+
|
@@ -283,9 +283,27 @@ describe "ContigPrinter" do
|
|
283
283
|
it 'should handle when start_coord is not == 0 and both reads are outwards facing' do
|
284
284
|
raise
|
285
285
|
end
|
286
|
-
|
286
|
+
|
287
287
|
it 'should handle when the example path is not the same length as the reference path' do
|
288
288
|
fail
|
289
289
|
end
|
290
290
|
end
|
291
|
+
|
292
|
+
describe 'AnchoredConnection' do
|
293
|
+
it 'should collapse_paths_to_maximal_coverage_path!' do
|
294
|
+
graph = Bio::Velvet::Graph.parse_from_file(File.join TEST_DATA_DIR, 'contig_printer','1','seq.fa.velvet','LastGraph')
|
295
|
+
graph.nodes.length.should == 13
|
296
|
+
acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
|
297
|
+
acon.start_probe_noded_read = graph.nodes[9].short_reads.select{|nr| nr.read_id == 161}[0] #Found these by using bwa and inspecting the Sequence velvet file
|
298
|
+
acon.end_probe_noded_read = graph.nodes[4].short_reads.select{|nr| nr.read_id == 1045}[0]
|
299
|
+
acon.start_probe_contig_offset = 2
|
300
|
+
acon.end_probe_contig_offset = 3
|
301
|
+
acon.paths = [
|
302
|
+
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
|
303
|
+
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
|
304
|
+
]
|
305
|
+
acon.collapse_paths_to_maximal_coverage_path!
|
306
|
+
acon.paths.collect{|path| path.to_shorthand}.should == [%w(9s 12s 7e 13s 5e 11e 2s 10s 4e).join(',')]
|
307
|
+
end
|
308
|
+
end
|
291
309
|
end
|
@@ -141,4 +141,18 @@ describe "ScaffoldBreaker" do
|
|
141
141
|
brokes[0].sequence.should == seq
|
142
142
|
end
|
143
143
|
end
|
144
|
+
|
145
|
+
it 'should replace non-ATGC characters with N' do
|
146
|
+
breaker = Bio::FinishM::ScaffoldBreaker.new
|
147
|
+
Tempfile.open('a') do |tmp|
|
148
|
+
tmp.puts '>ab'
|
149
|
+
seq = 'AAAAANNNGGGYYYTTNNAA'
|
150
|
+
tmp.puts seq
|
151
|
+
# 1234567890123456789
|
152
|
+
tmp.close
|
153
|
+
|
154
|
+
brokes = breaker.break_scaffolds(tmp.path)
|
155
|
+
brokes[0].sequence.should == 'AAAAANNNGGGNNNTTNNAA'
|
156
|
+
end
|
157
|
+
end
|
144
158
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: finishm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J. Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-ipcress
|