finishm 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/finishm +6 -2
- data/finishm.gemspec +3 -3
- data/lib/assembly/all_orfs.rb +61 -7
- data/lib/assembly/contig_printer.rb +18 -1
- data/lib/assembly/scaffold_breaker.rb +2 -1
- data/lib/finishm/orfs_finder.rb +50 -8
- data/lib/finishm/roundup.rb +7 -1
- data/lib/finishm/visualise.rb +4 -3
- data/spec/all_orfs_spec.rb +194 -5
- data/spec/contig_printer_spec.rb +21 -3
- data/spec/scaffold_breaker_spec.rb +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce5a708f7e7dbd95e357c0e58f94ef29c49d0cbf
|
4
|
+
data.tar.gz: ddf1f623a8045f626455e6c25bf78176c792407d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d73d9bbbc21761606fc5503d8818792d5a083f766e6a8c62336e17b50505c4c9c1897b79307f16f1e30b9f97b484695baf7ee46e009de5a375ba3660e26ede4
|
7
|
+
data.tar.gz: 718ad349ab3c87b1d370ed22eb4d7c68574f67ba6c0ab7a477db94211d12b9920e259c2f0aa05194df0e6e5ca79b5167f22f82708c65d50f2adc613b2a180ee1
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.4
|
data/bin/finishm
CHANGED
@@ -18,12 +18,16 @@ global = OptionParser.new do |opts|
|
|
18
18
|
opts.banner = "
|
19
19
|
Usage: #{SCRIPT_NAME} <command> [<arguments>]
|
20
20
|
|
21
|
-
FinishM is a collection of tasks related to assembly and metagenome assembly.
|
21
|
+
FinishM is a collection of tasks related to assembly and metagenome assembly. Common commands:
|
22
|
+
|
23
|
+
roundup\tImprove a genome by connecting scaffolds and gapfilling
|
24
|
+
visualise\tVisualise the DeBruijn graph
|
25
|
+
|
26
|
+
Other available commands:
|
22
27
|
|
23
28
|
wander\tTry to connect contigs (experimental)
|
24
29
|
gapfill\tFill assembly gaps (N characters) (experimental)
|
25
30
|
explore\tWhat happens in the graph beyond the end of my contig(s)? (experimental)
|
26
|
-
visualise\tVisualise the DeBruijn graph (experimental)
|
27
31
|
|
28
32
|
Commands for PCR finishing:
|
29
33
|
|
data/finishm.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: finishm 0.0.2 ruby lib
|
5
|
+
# stub: finishm 0.0.4 ruby lib
|
6
6
|
# stub: ext/mkrf_conf.rb
|
7
7
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = "finishm"
|
10
|
-
s.version = "0.0.2"
|
10
|
+
s.version = "0.0.4"
|
11
11
|
|
12
12
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.authors = ["Ben J. Woodcroft"]
|
15
|
-
s.date = "2015-
|
15
|
+
s.date = "2015-08-30"
|
16
16
|
s.description = "De-novo assemblies generally only provide draft genomes. FinishM is aimed at improving these draft assemblies."
|
17
17
|
s.email = "donttrustben near gmail.com"
|
18
18
|
s.executables = ["finishm"]
|
data/lib/assembly/all_orfs.rb
CHANGED
@@ -9,6 +9,32 @@ module Bio
|
|
9
9
|
CODON_LENGTH = 3
|
10
10
|
START_CODONS = ['ATG']
|
11
11
|
STOP_CODONS = ['TAG', 'TAA', 'TGA']
|
12
|
+
CODONS = {
|
13
|
+
'A' => ['GCT', 'GCC', 'GCA', 'GCG'],
|
14
|
+
'R' => ['CGT', 'CGC', 'CGA','CGG', 'AGA', 'AGG'],
|
15
|
+
'N' => ['AAT', 'AAC'],
|
16
|
+
'D' => ['GAT', 'GAC'],
|
17
|
+
'C' => ['TGT', 'TGC'],
|
18
|
+
'Q' => ['CAA', 'CAG'],
|
19
|
+
'E' => ['GAA', 'GAG'],
|
20
|
+
'G' => ['GGT', 'GGC', 'GGA', 'GGG'],
|
21
|
+
'H' => ['CAT', 'CAC'],
|
22
|
+
'I' => ['ATT', 'ATC', 'ATA'],
|
23
|
+
'L' => ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
|
24
|
+
'K' => ['AAA', 'AAG'],
|
25
|
+
'M' => ['ATG'],
|
26
|
+
'F' => ['TTT', 'TTC'],
|
27
|
+
'P' => ['CCT', 'CCC', 'CCA', 'CCG'],
|
28
|
+
'S' => ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
|
29
|
+
'T' => ['ACT', 'ACC', 'ACA', 'ACG'],
|
30
|
+
'W' => ['TGG'],
|
31
|
+
'Y' => ['TAT', 'TAC'],
|
32
|
+
'V' => ['GTT', 'GTC', 'GTA', 'GTG']
|
33
|
+
}
|
34
|
+
TRANSLATOR = CODONS.reduce({}) do |memo, pair|
|
35
|
+
pair[1].each{|key| memo[key] = pair[0]}
|
36
|
+
memo
|
37
|
+
end
|
12
38
|
|
13
39
|
# Search for open reading frames in a graph, in all the paths begining at a set of
|
14
40
|
# nodes through a graph (or a subset defined by range)
|
@@ -78,7 +104,7 @@ module Bio
|
|
78
104
|
max_num_paths = options[:max_gapfill_paths]
|
79
105
|
max_num_paths ||= 2196
|
80
106
|
max_cycles = options[:max_cycles] || 1
|
81
|
-
min_orf_length = options[:
|
107
|
+
min_orf_length = options[:min_orf_length] || 0
|
82
108
|
|
83
109
|
counter = SingleCoherentPathsBetweenNodesFinder::CycleCounter.new(max_cycles)
|
84
110
|
decide_stack = lambda do |to_push|
|
@@ -474,10 +500,11 @@ module Bio
|
|
474
500
|
return to_return
|
475
501
|
end
|
476
502
|
|
477
|
-
def orf_sequences_from_trails(trails)
|
478
|
-
to_return =
|
503
|
+
def orf_sequences_from_trails(trails, min_orf_length=nil)
|
504
|
+
to_return = []
|
479
505
|
trails.each do |trail|
|
480
506
|
fwd_sequence, twin_sequence = trail.otrail.sequences_within_path
|
507
|
+
trail_length = fwd_sequence.length
|
481
508
|
# forward / twin directions
|
482
509
|
[
|
483
510
|
[fwd_sequence, trail.fwd_orfs_result],
|
@@ -500,10 +527,12 @@ module Bio
|
|
500
527
|
end
|
501
528
|
name = "(#{onodes[0].to_shorthand}:#{pair[0].position_in_node}),#{onodes[1...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{pair[1].position_in_node})"
|
502
529
|
|
503
|
-
to_return[name
|
530
|
+
to_return.push [name, sequence[start_position...end_position]]
|
504
531
|
end
|
505
532
|
result.initial_stop_markers.each do |marker|
|
506
533
|
end_position = marker.position_in_trail
|
534
|
+
start_position = end_position % 3 #trim sequence to multiple of 3
|
535
|
+
next if min_orf_length and end_position - start_position < min_orf_length
|
507
536
|
|
508
537
|
# orf_name
|
509
538
|
last_node = nil
|
@@ -514,20 +543,29 @@ module Bio
|
|
514
543
|
end
|
515
544
|
name = "#{onodes[0...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{marker.position_in_node})"
|
516
545
|
|
517
|
-
to_return[name
|
546
|
+
to_return.push [name, sequence[start_position...end_position]]
|
518
547
|
end
|
519
548
|
result.final_start_markers.each do |marker|
|
520
549
|
start_position = marker.position_in_trail - 3
|
550
|
+
end_position = (trail_length - start_position) % 3
|
551
|
+
next if min_orf_length and trail_length - end_position - start_position < min_orf_length
|
521
552
|
|
522
553
|
# orf_name
|
523
554
|
onodes = trail.otrail.trail.drop_while{|onode| onode.node != marker.node}
|
524
555
|
name = "(#{onodes[0].to_shorthand}:#{marker.position_in_node}),#{onodes[1..-1].collect{|onode| onode.to_shorthand}.join(',')}"
|
556
|
+
to_return.push [name, sequence[start_position..-1-end_position]]
|
525
557
|
end
|
526
558
|
end
|
527
559
|
if result.nil? or (result.start_stop_pairs.empty? and result.final_start_markers.empty? and result.initial_stop_markers.empty?)
|
528
|
-
|
560
|
+
(0..2).each do |frame|
|
561
|
+
start_position = frame
|
562
|
+
end_position = (trail_length - start_position) % 3
|
563
|
+
next if min_orf_length and trail_length - end_position - start_position < min_orf_length
|
529
564
|
|
530
|
-
|
565
|
+
# orf_name
|
566
|
+
name = "#{trail.otrail.to_shorthand}"
|
567
|
+
to_return.push [name, sequence[start_position..-1-end_position]]
|
568
|
+
end
|
531
569
|
end
|
532
570
|
end
|
533
571
|
end
|
@@ -535,6 +573,22 @@ module Bio
|
|
535
573
|
return to_return
|
536
574
|
end
|
537
575
|
|
576
|
+
def sequence2AA(sequence)
|
577
|
+
remaining = sequence
|
578
|
+
aa = ""
|
579
|
+
while remaining.length > 0
|
580
|
+
codon = remaining[0...3]
|
581
|
+
log.debug "Found next codon #{codon}" if log.debug?
|
582
|
+
if not TRANSLATOR.has_key?(codon)
|
583
|
+
raise "Cannot translate invalid codon #{codon} in sequence #{sequence}."
|
584
|
+
end
|
585
|
+
log.debug "Codon translated to #{TRANSLATOR[codon]}" if log.debug?
|
586
|
+
aa += TRANSLATOR[codon]
|
587
|
+
remaining = remaining[3..-1]
|
588
|
+
end
|
589
|
+
return aa
|
590
|
+
end
|
591
|
+
|
538
592
|
# positions of last base of codons
|
539
593
|
class Marker
|
540
594
|
attr_accessor :position_in_trail, :position_in_node, :node
|
@@ -34,6 +34,23 @@ module Bio
|
|
34
34
|
# Enumerable of Enumerables of OrientedNode objects, each list of OrientedNode objects
|
35
35
|
# corresponds to a path that forms the connection
|
36
36
|
attr_accessor :paths
|
37
|
+
|
38
|
+
# Remove all except the path with maximal coverage from @paths
|
39
|
+
def collapse_paths_to_maximal_coverage_path!
|
40
|
+
return if @paths.nil? or @paths.empty?
|
41
|
+
get_coverage = lambda do |path|
|
42
|
+
numerator = 0
|
43
|
+
denominator = 0
|
44
|
+
path.each do |onode|
|
45
|
+
numerator += onode.node.coverage * onode.node.length_alone
|
46
|
+
denominator += onode.node.length_alone
|
47
|
+
end
|
48
|
+
numerator.to_f / denominator
|
49
|
+
end
|
50
|
+
@paths = [@paths.max do |path1, path2|
|
51
|
+
get_coverage.call(path1) <=> get_coverage.call(path2)
|
52
|
+
end]
|
53
|
+
end
|
37
54
|
end
|
38
55
|
|
39
56
|
# Given two contigs, return a consensus path and variants of the path.
|
@@ -293,7 +310,7 @@ module Bio
|
|
293
310
|
def clustalo(sequences)
|
294
311
|
i = 0
|
295
312
|
stdin = sequences.collect{|s| i+=1; ">#{i}\n#{s}\n"}.join('')
|
296
|
-
log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}"
|
313
|
+
log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" if log.debug?
|
297
314
|
stdout = Bio::Commandeer.run "clustalo -t DNA -i - --output-order=input-order", {:stdin => stdin, :log => log}
|
298
315
|
to_return = []
|
299
316
|
header = true
|
@@ -101,7 +101,8 @@ class Bio::FinishM::ScaffoldBreaker
|
|
101
101
|
|
102
102
|
unless seq.seq.match(/^[ATGCN]+$/i)
|
103
103
|
example = seq.seq.match(/([^ATGCN])/i)[1]
|
104
|
-
log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}
|
104
|
+
log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}. Replacing them with Ns"
|
105
|
+
seq.seq.gsub! /[^ATGCN]/i, 'N'
|
105
106
|
end
|
106
107
|
|
107
108
|
if seq.seq.match(/^N+$/i)
|
data/lib/finishm/orfs_finder.rb
CHANGED
@@ -2,21 +2,39 @@ class Bio::FinishM::ORFsFinder
|
|
2
2
|
include Bio::FinishM::Logging
|
3
3
|
|
4
4
|
DEFAULT_OPTIONS = {
|
5
|
-
:min_orf_length =>
|
5
|
+
:min_orf_length => 96
|
6
6
|
}
|
7
7
|
|
8
8
|
def add_options(optparse_object, options)
|
9
9
|
options.merge! Bio::FinishM::Visualise::DEFAULT_OPTIONS
|
10
10
|
options.merge! DEFAULT_OPTIONS
|
11
|
-
optparse_object.banner = "\nUsage: finishm find_orfs --
|
11
|
+
optparse_object.banner = "\nUsage: finishm find_orfs [--orf-amino-acids OUTPUT_FAA --orf-nucleotides OUTPUT_FNA]
|
12
12
|
|
13
13
|
Find possible open reading frames in assembly graph
|
14
14
|
\n\n"
|
15
15
|
|
16
|
-
optparse_object.separator "
|
16
|
+
optparse_object.separator "\nOutput sequence files\n\n"
|
17
|
+
optparse_object.on("--orf-amino-acids OUTPUT_FAA", "Output ORF amino acid sequences [default: orf.faa unless --orf-nucleotides is specified]") do |arg|
|
18
|
+
options[:output_faa] = arg
|
19
|
+
end
|
20
|
+
optparse_object.on("--orf-nucleotides OUTPUT_FNA", "Output ORF nucleotide sequences [default: orf.fna unless --orf-amino-acids is specified]") do |arg|
|
21
|
+
options[:output_fna] = arg
|
22
|
+
end
|
23
|
+
|
24
|
+
optparse_object.separator "\nInput genome information"
|
17
25
|
optparse_object.separator "\nIf an assembly is to be done, there must be some definition of reads:\n\n" #TODO improve this help
|
18
26
|
Bio::FinishM::ReadInput.new.add_options(optparse_object, options)
|
19
27
|
|
28
|
+
optparse_object.separator "\nOptional arguments:\n\n"
|
29
|
+
optparse_object.on("--min-orf-length", "Minimum ORF length [default: 96]") do |arg|
|
30
|
+
length = arg.to_i
|
31
|
+
if length.to_s != arg or length.nil? or length < 1
|
32
|
+
raise "Unable to parse minimum orf length parameter #{arg}, cannot continue"
|
33
|
+
end
|
34
|
+
options[:min_orf_length] = length
|
35
|
+
end
|
36
|
+
|
37
|
+
|
20
38
|
optparse_object.separator "\nOptional graph-exploration arguments:\n\n"
|
21
39
|
Bio::FinishM::Visualise.new.add_probe_options(optparse_object, options)
|
22
40
|
|
@@ -59,7 +77,18 @@ class Bio::FinishM::ORFsFinder
|
|
59
77
|
end
|
60
78
|
|
61
79
|
initial_onodes = Bio::FinishM::PathCounter.new.get_leash_start_nodes(finishm_graph, options[:range])
|
62
|
-
find_orfs_in_graph(finishm_graph, initial_onodes, options)
|
80
|
+
orfs = find_orfs_in_graph(finishm_graph, initial_onodes, options)
|
81
|
+
log.info "Found #{orfs.length} open reading frames longer than #{options[:min_orf_length]}."
|
82
|
+
if not options[:output_fna] and not options[:output_faa]
|
83
|
+
options[:output_fna] = 'orfs.fna'
|
84
|
+
end
|
85
|
+
|
86
|
+
if options[:output_fna]
|
87
|
+
write_orfs_to_file(orfs, options[:output_fna])
|
88
|
+
end
|
89
|
+
if options[:output_faa]
|
90
|
+
write_orfs_to_file(orfs, options[:output_faa], translate=true)
|
91
|
+
end
|
63
92
|
end
|
64
93
|
|
65
94
|
def find_orfs_in_graph(finishm_graph, initial_onodes, options={})
|
@@ -73,11 +102,24 @@ class Bio::FinishM::ORFsFinder
|
|
73
102
|
orf_trails = orfer.find_orfs_in_graph(finishm_graph.graph, initial_paths,
|
74
103
|
options[:min_orf_length], options[:range])
|
75
104
|
|
76
|
-
|
105
|
+
orfer.orf_sequences_from_trails(orf_trails, options[:min_orf_length])
|
106
|
+
end
|
77
107
|
|
78
|
-
|
79
|
-
|
80
|
-
|
108
|
+
def write_orfs_to_file(found_orfs, orfs_file, translate=false)
|
109
|
+
if translate
|
110
|
+
translator = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
111
|
+
end
|
112
|
+
File.open(orfs_file,'w') do |f|
|
113
|
+
counter = 0
|
114
|
+
found_orfs.each do |name_and_sequence|
|
115
|
+
counter += 1
|
116
|
+
f.puts ">finishm_orf_#{counter} #{name_and_sequence[0]}"
|
117
|
+
if translate
|
118
|
+
f.puts translator.sequence2AA(name_and_sequence[1][0...-3])
|
119
|
+
else
|
120
|
+
f.puts name_and_sequence[1]
|
121
|
+
end
|
122
|
+
end
|
81
123
|
end
|
82
124
|
end
|
83
125
|
|
data/lib/finishm/roundup.rb
CHANGED
@@ -10,6 +10,7 @@ class Bio::FinishM::RoundUp
|
|
10
10
|
:gapfill_only => false,
|
11
11
|
:max_explore_nodes => 10000,
|
12
12
|
:max_gapfill_paths => 10,
|
13
|
+
:gapfill_with_max_coverage => false,
|
13
14
|
}
|
14
15
|
|
15
16
|
def add_options(optparse_object, options)
|
@@ -64,6 +65,9 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
64
65
|
optparse_object.on("--max-explore-nodes NUM", Integer, "Only explore this many nodes. If max is reached, do not make connections. [default: #{options[:max_explore_nodes] }]") do |arg|
|
65
66
|
options[:max_explore_nodes] = arg
|
66
67
|
end
|
68
|
+
optparse_object.on("--gapfill-with-max-coverage", "When gapfilling, take the path with maximal coverage and do not print variants [default: #{options[:gapfill_with_max_coverage] }]") do
|
69
|
+
options[:gapfill_with_max_coverage] = true
|
70
|
+
end
|
67
71
|
optparse_object.on("--debug", "Build the graph, then drop to a pry console. [default: #{options[:debug] }]") do
|
68
72
|
options[:debug] = true
|
69
73
|
end
|
@@ -193,6 +197,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
193
197
|
# Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
|
194
198
|
scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
|
195
199
|
else
|
200
|
+
acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
196
201
|
scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
|
197
202
|
master_graph.graph,
|
198
203
|
scaffold_sequence,
|
@@ -321,7 +326,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
321
326
|
return gapfilled_sequence, num_gapfills, all_variants
|
322
327
|
end
|
323
328
|
|
324
|
-
def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths)
|
329
|
+
def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths, options)
|
325
330
|
scaffold_sequence = nil
|
326
331
|
gapfilled = -1
|
327
332
|
if aconn.paths.length == 0 or aconn.paths.length > max_gapfill_paths
|
@@ -329,6 +334,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
329
334
|
scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
|
330
335
|
gapfilled = false
|
331
336
|
else
|
337
|
+
acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
332
338
|
scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
|
333
339
|
master_graph.graph, first_sequence, aconn, second_sequence, master_graph.velvet_sequences
|
334
340
|
)
|
data/lib/finishm/visualise.rb
CHANGED
@@ -38,7 +38,8 @@ class Bio::FinishM::Visualise
|
|
38
38
|
return validate_argv_length(argv) ||
|
39
39
|
validate_visualisation_options(options) ||
|
40
40
|
validate_probe_options(options) ||
|
41
|
-
validate_assembly_options(options)
|
41
|
+
validate_assembly_options(options) ||
|
42
|
+
validate_scaffold_options(options)
|
42
43
|
end
|
43
44
|
|
44
45
|
def add_visualisation_options(optparse_object, options)
|
@@ -79,8 +80,8 @@ class Bio::FinishM::Visualise
|
|
79
80
|
|
80
81
|
def validate_scaffold_options(options)
|
81
82
|
# If scaffolds are defined, then probe genomes must also be defined
|
82
|
-
if options[:
|
83
|
-
return "If --scaffolds is
|
83
|
+
if options[:scaffold_sides] and !options[:assembly_files]
|
84
|
+
return "If --scaffolds is given, then --genomes must also be given"
|
84
85
|
end
|
85
86
|
end
|
86
87
|
|
data/spec/all_orfs_spec.rb
CHANGED
@@ -195,6 +195,127 @@ describe "AllOrfs" do
|
|
195
195
|
res.collect{|result| result.initial_start_markers}.should == [[],[]]
|
196
196
|
end
|
197
197
|
|
198
|
+
it 'should find two same-phase orfs along a trail' do
|
199
|
+
graph, = GraphTesting.emit_otrails([
|
200
|
+
[1,2,3]
|
201
|
+
])
|
202
|
+
graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
|
203
|
+
graph.nodes[2].ends_of_kmers_of_node = 'AATAAATGGA' #stop codon 'TAA', start codon 'ATG'
|
204
|
+
graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
|
205
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
206
|
+
|
207
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
208
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
209
|
+
#pp problems
|
210
|
+
|
211
|
+
paths = orfer.find_orfs_from_problems(problems)
|
212
|
+
#pp paths
|
213
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
214
|
+
[1,2,3]
|
215
|
+
]
|
216
|
+
res = paths.trails[0].fwd_orfs_result
|
217
|
+
GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
|
218
|
+
[6,15],
|
219
|
+
[18,30]
|
220
|
+
]
|
221
|
+
GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
|
222
|
+
[[1,6],[2,5]],
|
223
|
+
[[2,8], [3,10]]
|
224
|
+
]
|
225
|
+
res.initial_start_markers.should == []
|
226
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
|
227
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
|
228
|
+
res.final_start_markers.should == []
|
229
|
+
end
|
230
|
+
|
231
|
+
it 'should end orfs at first stop codon in forward direction' do
|
232
|
+
graph, = GraphTesting.emit_otrails([
|
233
|
+
[1,2,3]
|
234
|
+
])
|
235
|
+
graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
|
236
|
+
graph.nodes[2].ends_of_kmers_of_node = 'AATAAAAAGA' #stop codon 'TAA'
|
237
|
+
graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
|
238
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
239
|
+
|
240
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
241
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
242
|
+
#pp problems
|
243
|
+
|
244
|
+
paths = orfer.find_orfs_from_problems(problems)
|
245
|
+
#pp paths
|
246
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
247
|
+
[1,2,3]
|
248
|
+
]
|
249
|
+
res = paths.trails[0].fwd_orfs_result
|
250
|
+
GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
|
251
|
+
[6,15]
|
252
|
+
]
|
253
|
+
GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
|
254
|
+
[[1,6],[2,5]]
|
255
|
+
]
|
256
|
+
res.initial_start_markers.should == []
|
257
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
|
258
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
|
259
|
+
res.final_start_markers.should == []
|
260
|
+
end
|
261
|
+
|
262
|
+
it 'should end orfs at first stop codon in twin direction' do
|
263
|
+
graph = GraphTesting.emit([
|
264
|
+
[1,2],
|
265
|
+
[2,3]
|
266
|
+
])
|
267
|
+
graph.nodes[1].ends_of_kmers_of_twin_node = 'TTAGTTTTTT' # stop codon 'TAG'
|
268
|
+
graph.nodes[2].ends_of_kmers_of_twin_node = 'TTTAGTTTTT' # stop codon 'TAG'
|
269
|
+
graph.nodes[3].ends_of_kmers_of_twin_node = 'TAAATGTTTT' # stop codon 'TAA', start codon 'ATG'
|
270
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
271
|
+
|
272
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
273
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
274
|
+
#pp problems
|
275
|
+
|
276
|
+
paths = orfer.find_orfs_from_problems(problems)
|
277
|
+
#pp paths
|
278
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
279
|
+
[1,2,3]
|
280
|
+
]
|
281
|
+
res = paths.trails[0].twin_orfs_result
|
282
|
+
GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
|
283
|
+
[6,15]
|
284
|
+
]
|
285
|
+
GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
|
286
|
+
[[3,6],[2,5]]
|
287
|
+
]
|
288
|
+
res.initial_start_markers.should == []
|
289
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
|
290
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[3,3]]
|
291
|
+
res.final_start_markers.should == []
|
292
|
+
end
|
293
|
+
|
294
|
+
it 'should return the first initial stop codon in forward direction' do
|
295
|
+
graph = GraphTesting.emit([
|
296
|
+
[1,2],
|
297
|
+
[2,3]
|
298
|
+
])
|
299
|
+
graph.nodes[1].ends_of_kmers_of_node = 'AAATAGAAAA' # stop codon 'TAG'
|
300
|
+
graph.nodes[2].ends_of_kmers_of_node = 'AATAGAAAAA' # stop codon 'TAG'
|
301
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
302
|
+
|
303
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
304
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
305
|
+
#pp problems
|
306
|
+
|
307
|
+
paths = orfer.find_orfs_from_problems(problems)
|
308
|
+
#pp paths
|
309
|
+
GraphTesting.sorted_paths(paths.trails).should == [
|
310
|
+
[1,2,3]
|
311
|
+
]
|
312
|
+
res = paths.trails[0].fwd_orfs_result
|
313
|
+
res.start_stop_pairs.should == []
|
314
|
+
res.initial_start_markers.should == []
|
315
|
+
GraphTesting.marker_positions(res.initial_stop_markers).should == [6]
|
316
|
+
GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,6]]
|
317
|
+
res.final_start_markers.should == []
|
318
|
+
end
|
198
319
|
|
199
320
|
it 'should respect terminal nodes' do
|
200
321
|
fail '#todo'
|
@@ -433,11 +554,79 @@ describe "AllOrfs" do
|
|
433
554
|
|
434
555
|
paths = orfer.find_orfs_from_problems(problems)
|
435
556
|
#pp paths
|
436
|
-
orfer.orf_sequences_from_trails(paths.trails).should ==
|
437
|
-
'(1s:6),2s,(3s:10)'
|
438
|
-
'1s,
|
439
|
-
',
|
440
|
-
|
557
|
+
orfer.orf_sequences_from_trails(paths.trails).should == [
|
558
|
+
['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
|
559
|
+
[',(1s:3)', 'TAA'],
|
560
|
+
['1s,2s,3s', 'T'*30],
|
561
|
+
['1s,2s,3s', 'T'*27],
|
562
|
+
['1s,2s,3s', 'T'*27]
|
563
|
+
]
|
564
|
+
end
|
565
|
+
|
566
|
+
it 'should respect minimum orf length' do
|
567
|
+
graph = GraphTesting.emit([
|
568
|
+
[1,2],
|
569
|
+
[2,3]
|
570
|
+
])
|
571
|
+
graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
|
572
|
+
graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
|
573
|
+
initial_path = GraphTesting.make_onodes(graph, %w(1s))
|
574
|
+
|
575
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
576
|
+
problems = orfer.find_all_problems(graph, [initial_path])
|
577
|
+
|
578
|
+
paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 30)
|
579
|
+
orfer.orf_sequences_from_trails(paths.trails, 30).should == [
|
580
|
+
['1s,2s,3s', 'T'*30]
|
581
|
+
]
|
582
|
+
|
583
|
+
paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 20)
|
584
|
+
orfer.orf_sequences_from_trails(paths.trails, 20).should == [
|
585
|
+
['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
|
586
|
+
['1s,2s,3s', 'T'*30],
|
587
|
+
['1s,2s,3s', 'T'*27],
|
588
|
+
['1s,2s,3s', 'T'*27]
|
589
|
+
]
|
590
|
+
|
591
|
+
paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 0)
|
592
|
+
orfer.orf_sequences_from_trails(paths.trails, 0).should == [
|
593
|
+
['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
|
594
|
+
[',(1s:3)', 'TAA'],
|
595
|
+
['1s,2s,3s', 'T'*30],
|
596
|
+
['1s,2s,3s', 'T'*27],
|
597
|
+
['1s,2s,3s', 'T'*27]
|
598
|
+
]
|
599
|
+
|
600
|
+
end
|
601
|
+
end
|
602
|
+
|
603
|
+
describe 'sequence2AA' do
|
604
|
+
it 'should return corresponding amino acids for an orf sequence' do
|
605
|
+
orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
|
606
|
+
orfer.sequence2AA('GCTGCCGCAGCG').should == 'AAAA'
|
607
|
+
orfer.sequence2AA('CGTCGCCGACGGAGAAGG').should == 'RRRRRR'
|
608
|
+
orfer.sequence2AA('AATAAC').should == 'NN'
|
609
|
+
orfer.sequence2AA('GATGAC').should == 'DD'
|
610
|
+
orfer.sequence2AA('TGTTGC').should == 'CC'
|
611
|
+
orfer.sequence2AA('CAACAG').should == 'QQ'
|
612
|
+
orfer.sequence2AA('GAAGAG').should == 'EE'
|
613
|
+
orfer.sequence2AA('GGTGGCGGAGGG').should == 'GGGG'
|
614
|
+
orfer.sequence2AA('CATCAC').should == 'HH'
|
615
|
+
orfer.sequence2AA('ATTATCATA').should == 'III'
|
616
|
+
orfer.sequence2AA('TTATTGCTTCTCCTACTG').should == 'LLLLLL'
|
617
|
+
orfer.sequence2AA('AAAAAG').should == 'KK'
|
618
|
+
orfer.sequence2AA('ATG').should == 'M'
|
619
|
+
orfer.sequence2AA('TTTTTC').should == 'FF'
|
620
|
+
orfer.sequence2AA('CCTCCCCCACCG').should == 'PPPP'
|
621
|
+
orfer.sequence2AA('TCTTCCTCATCGAGTAGC').should == 'SSSSSS'
|
622
|
+
orfer.sequence2AA('ACTACCACAACG').should == 'TTTT'
|
623
|
+
orfer.sequence2AA('TGG').should == 'W'
|
624
|
+
orfer.sequence2AA('TATTAC').should == 'YY'
|
625
|
+
orfer.sequence2AA('GTTGTCGTAGTG').should == 'VVVV'
|
626
|
+
lambda { orfer.sequence2AA('TAA') }.should raise_error
|
627
|
+
lambda { orfer.sequence2AA('TGA') }.should raise_error
|
628
|
+
lambda { orfer.sequence2AA('TAG') }.should raise_error
|
629
|
+
lambda { orfer.sequence2AA('ABCXYZ') }.should raise_error
|
441
630
|
end
|
442
631
|
end
|
443
632
|
end
|
data/spec/contig_printer_spec.rb
CHANGED
@@ -42,7 +42,7 @@ describe "ContigPrinter" do
|
|
42
42
|
'14S:GTT',
|
43
43
|
].sort
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
it 'should handle not variants' do
|
47
47
|
seqs = [
|
48
48
|
'ATGAATATGTGCATAGGATT',
|
@@ -235,7 +235,7 @@ describe "ContigPrinter" do
|
|
235
235
|
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
|
236
236
|
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
|
237
237
|
]
|
238
|
-
expected =
|
238
|
+
expected =
|
239
239
|
'ATGAACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGAGACCTTCGGGTCTAGTGGCGCACGGGTGCGTAACGCGTGGGAATCTGCCCTTGGGTACGG'+
|
240
240
|
'AATAACAGTTAGAAATGACTGCTAATACCGTATAATGACTTCGGTCCAAAGATTTATCGCCCAGGGATGAGCCCGCGTAGGATTAGCTTGTTGGTGAGGTAAANN'+
|
241
241
|
'NTNNCNNANNNNNNNNNNNNTNNNNNGNNNNNNNNNNNGNTNAGNNNCNNNGNNNNNGNGANNTGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGC'+
|
@@ -283,9 +283,27 @@ describe "ContigPrinter" do
|
|
283
283
|
it 'should handle when start_coord is not == 0 and both reads are outwards facing' do
|
284
284
|
raise
|
285
285
|
end
|
286
|
-
|
286
|
+
|
287
287
|
it 'should handle when the example path is not the same length as the reference path' do
|
288
288
|
fail
|
289
289
|
end
|
290
290
|
end
|
291
|
+
|
292
|
+
describe 'AnchoredConnection' do
|
293
|
+
it 'should collapse_paths_to_maximal_coverage_path!' do
|
294
|
+
graph = Bio::Velvet::Graph.parse_from_file(File.join TEST_DATA_DIR, 'contig_printer','1','seq.fa.velvet','LastGraph')
|
295
|
+
graph.nodes.length.should == 13
|
296
|
+
acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
|
297
|
+
acon.start_probe_noded_read = graph.nodes[9].short_reads.select{|nr| nr.read_id == 161}[0] #Found these by using bwa and inspecting the Sequence velvet file
|
298
|
+
acon.end_probe_noded_read = graph.nodes[4].short_reads.select{|nr| nr.read_id == 1045}[0]
|
299
|
+
acon.start_probe_contig_offset = 2
|
300
|
+
acon.end_probe_contig_offset = 3
|
301
|
+
acon.paths = [
|
302
|
+
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
|
303
|
+
GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
|
304
|
+
]
|
305
|
+
acon.collapse_paths_to_maximal_coverage_path!
|
306
|
+
acon.paths.collect{|path| path.to_shorthand}.should == [%w(9s 12s 7e 13s 5e 11e 2s 10s 4e).join(',')]
|
307
|
+
end
|
308
|
+
end
|
291
309
|
end
|
@@ -141,4 +141,18 @@ describe "ScaffoldBreaker" do
|
|
141
141
|
brokes[0].sequence.should == seq
|
142
142
|
end
|
143
143
|
end
|
144
|
+
|
145
|
+
it 'should replace non-ATGC characters with N' do
|
146
|
+
breaker = Bio::FinishM::ScaffoldBreaker.new
|
147
|
+
Tempfile.open('a') do |tmp|
|
148
|
+
tmp.puts '>ab'
|
149
|
+
seq = 'AAAAANNNGGGYYYTTNNAA'
|
150
|
+
tmp.puts seq
|
151
|
+
# 1234567890123456789
|
152
|
+
tmp.close
|
153
|
+
|
154
|
+
brokes = breaker.break_scaffolds(tmp.path)
|
155
|
+
brokes[0].sequence.should == 'AAAAANNNGGGNNNTTNNAA'
|
156
|
+
end
|
157
|
+
end
|
144
158
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: finishm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J. Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-ipcress
|