finishm 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d4185d82df8d5de264dee94a05867f051d0332b1
4
- data.tar.gz: 313d7fd065da5b8fb74e3acd1ed0ed6a20f2b827
3
+ metadata.gz: ce5a708f7e7dbd95e357c0e58f94ef29c49d0cbf
4
+ data.tar.gz: ddf1f623a8045f626455e6c25bf78176c792407d
5
5
  SHA512:
6
- metadata.gz: 0e74180aa4d37ee0307d86ab8352d63531612a76be161366d83e4dbc09a7bfc71d42f0152bafc3e503a5ae5bba6bd00040299d2d0a38717266c8a36a5f873a9c
7
- data.tar.gz: 42237938390eb63124b5e9cc33fd01fc69d6ca354117c4745b13980d6f149fbeb0f9759eb36b91701d2f97bfb02a47a9e5cd800837e87d578eb712e06a889581
6
+ metadata.gz: 6d73d9bbbc21761606fc5503d8818792d5a083f766e6a8c62336e17b50505c4c9c1897b79307f16f1e30b9f97b484695baf7ee46e009de5a375ba3660e26ede4
7
+ data.tar.gz: 718ad349ab3c87b1d370ed22eb4d7c68574f67ba6c0ab7a477db94211d12b9920e259c2f0aa05194df0e6e5ca79b5167f22f82708c65d50f2adc613b2a180ee1
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.4
@@ -18,12 +18,16 @@ global = OptionParser.new do |opts|
18
18
  opts.banner = "
19
19
  Usage: #{SCRIPT_NAME} <command> [<arguments>]
20
20
 
21
- FinishM is a collection of tasks related to assembly and metagenome assembly. Available commands:
21
+ FinishM is a collection of tasks related to assembly and metagenome assembly. Common commands:
22
+
23
+ roundup\tImprove a genome by connecting scaffolds and gapfilling
24
+ visualise\tVisualise the DeBruijn graph
25
+
26
+ Other available commands:
22
27
 
23
28
  wander\tTry to connect contigs (experimental)
24
29
  gapfill\tFill assembly gaps (N characters) (experimental)
25
30
  explore\tWhat happens in the graph beyond the end of my contig(s)? (experimental)
26
- visualise\tVisualise the DeBruijn graph (experimental)
27
31
 
28
32
  Commands for PCR finishing:
29
33
 
@@ -2,17 +2,17 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: finishm 0.0.2 ruby lib
5
+ # stub: finishm 0.0.4 ruby lib
6
6
  # stub: ext/mkrf_conf.rb
7
7
 
8
8
  Gem::Specification.new do |s|
9
9
  s.name = "finishm"
10
- s.version = "0.0.2"
10
+ s.version = "0.0.4"
11
11
 
12
12
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
13
13
  s.require_paths = ["lib"]
14
14
  s.authors = ["Ben J. Woodcroft"]
15
- s.date = "2015-07-30"
15
+ s.date = "2015-08-30"
16
16
  s.description = "De-novo assemblies generally only provide draft genomes. FinishM is aimed at improving these draft assemblies."
17
17
  s.email = "donttrustben near gmail.com"
18
18
  s.executables = ["finishm"]
@@ -9,6 +9,32 @@ module Bio
9
9
  CODON_LENGTH = 3
10
10
  START_CODONS = ['ATG']
11
11
  STOP_CODONS = ['TAG', 'TAA', 'TGA']
12
+ CODONS = {
13
+ 'A' => ['GCT', 'GCC', 'GCA', 'GCG'],
14
+ 'R' => ['CGT', 'CGC', 'CGA','CGG', 'AGA', 'AGG'],
15
+ 'N' => ['AAT', 'AAC'],
16
+ 'D' => ['GAT', 'GAC'],
17
+ 'C' => ['TGT', 'TGC'],
18
+ 'Q' => ['CAA', 'CAG'],
19
+ 'E' => ['GAA', 'GAG'],
20
+ 'G' => ['GGT', 'GGC', 'GGA', 'GGG'],
21
+ 'H' => ['CAT', 'CAC'],
22
+ 'I' => ['ATT', 'ATC', 'ATA'],
23
+ 'L' => ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
24
+ 'K' => ['AAA', 'AAG'],
25
+ 'M' => ['ATG'],
26
+ 'F' => ['TTT', 'TTC'],
27
+ 'P' => ['CCT', 'CCC', 'CCA', 'CCG'],
28
+ 'S' => ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
29
+ 'T' => ['ACT', 'ACC', 'ACA', 'ACG'],
30
+ 'W' => ['TGG'],
31
+ 'Y' => ['TAT', 'TAC'],
32
+ 'V' => ['GTT', 'GTC', 'GTA', 'GTG']
33
+ }
34
+ TRANSLATOR = CODONS.reduce({}) do |memo, pair|
35
+ pair[1].each{|key| memo[key] = pair[0]}
36
+ memo
37
+ end
12
38
 
13
39
  # Search for open reading frames in a graph, in all the paths begining at a set of
14
40
  # nodes through a graph (or a subset defined by range)
@@ -78,7 +104,7 @@ module Bio
78
104
  max_num_paths = options[:max_gapfill_paths]
79
105
  max_num_paths ||= 2196
80
106
  max_cycles = options[:max_cycles] || 1
81
- min_orf_length = options[:minimum_orf_length] || 0
107
+ min_orf_length = options[:min_orf_length] || 0
82
108
 
83
109
  counter = SingleCoherentPathsBetweenNodesFinder::CycleCounter.new(max_cycles)
84
110
  decide_stack = lambda do |to_push|
@@ -474,10 +500,11 @@ module Bio
474
500
  return to_return
475
501
  end
476
502
 
477
- def orf_sequences_from_trails(trails)
478
- to_return = {}
503
+ def orf_sequences_from_trails(trails, min_orf_length=nil)
504
+ to_return = []
479
505
  trails.each do |trail|
480
506
  fwd_sequence, twin_sequence = trail.otrail.sequences_within_path
507
+ trail_length = fwd_sequence.length
481
508
  # forward / twin directions
482
509
  [
483
510
  [fwd_sequence, trail.fwd_orfs_result],
@@ -500,10 +527,12 @@ module Bio
500
527
  end
501
528
  name = "(#{onodes[0].to_shorthand}:#{pair[0].position_in_node}),#{onodes[1...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{pair[1].position_in_node})"
502
529
 
503
- to_return[name] ||= sequence[start_position...end_position]
530
+ to_return.push [name, sequence[start_position...end_position]]
504
531
  end
505
532
  result.initial_stop_markers.each do |marker|
506
533
  end_position = marker.position_in_trail
534
+ start_position = end_position % 3 #trim sequence to multiple of 3
535
+ next if min_orf_length and end_position - start_position < min_orf_length
507
536
 
508
537
  # orf_name
509
538
  last_node = nil
@@ -514,20 +543,29 @@ module Bio
514
543
  end
515
544
  name = "#{onodes[0...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{marker.position_in_node})"
516
545
 
517
- to_return[name] ||= sequence[0...end_position]
546
+ to_return.push [name, sequence[start_position...end_position]]
518
547
  end
519
548
  result.final_start_markers.each do |marker|
520
549
  start_position = marker.position_in_trail - 3
550
+ end_position = (trail_length - start_position) % 3
551
+ next if min_orf_length and trail_length - end_position - start_position < min_orf_length
521
552
 
522
553
  # orf_name
523
554
  onodes = trail.otrail.trail.drop_while{|onode| onode.node != marker.node}
524
555
  name = "(#{onodes[0].to_shorthand}:#{marker.position_in_node}),#{onodes[1..-1].collect{|onode| onode.to_shorthand}.join(',')}"
556
+ to_return.push [name, sequence[start_position..-1-end_position]]
525
557
  end
526
558
  end
527
559
  if result.nil? or (result.start_stop_pairs.empty? and result.final_start_markers.empty? and result.initial_stop_markers.empty?)
528
- name = "#{trail.otrail.to_shorthand}"
560
+ (0..2).each do |frame|
561
+ start_position = frame
562
+ end_position = (trail_length - start_position) % 3
563
+ next if min_orf_length and trail_length - end_position - start_position < min_orf_length
529
564
 
530
- to_return[name] ||= sequence
565
+ # orf_name
566
+ name = "#{trail.otrail.to_shorthand}"
567
+ to_return.push [name, sequence[start_position..-1-end_position]]
568
+ end
531
569
  end
532
570
  end
533
571
  end
@@ -535,6 +573,22 @@ module Bio
535
573
  return to_return
536
574
  end
537
575
 
576
+ def sequence2AA(sequence)
577
+ remaining = sequence
578
+ aa = ""
579
+ while remaining.length > 0
580
+ codon = remaining[0...3]
581
+ log.debug "Found next codon #{codon}" if log.debug?
582
+ if not TRANSLATOR.has_key?(codon)
583
+ raise "Cannot translate invalid codon #{codon} in sequence #{sequence}."
584
+ end
585
+ log.debug "Codon translated to #{TRANSLATOR[codon]}" if log.debug?
586
+ aa += TRANSLATOR[codon]
587
+ remaining = remaining[3..-1]
588
+ end
589
+ return aa
590
+ end
591
+
538
592
  # positions of last base of codons
539
593
  class Marker
540
594
  attr_accessor :position_in_trail, :position_in_node, :node
@@ -34,6 +34,23 @@ module Bio
34
34
  # Enumerable of Enumerables of OrientedNode objects, each list of OrientedNode objects
35
35
  # corresponds to a path that forms the connection
36
36
  attr_accessor :paths
37
+
38
+ # Remove all except the path with maximal coverage from @paths
39
+ def collapse_paths_to_maximal_coverage_path!
40
+ return if @paths.nil? or @paths.empty?
41
+ get_coverage = lambda do |path|
42
+ numerator = 0
43
+ denominator = 0
44
+ path.each do |onode|
45
+ numerator += onode.node.coverage * onode.node.length_alone
46
+ denominator += onode.node.length_alone
47
+ end
48
+ numerator.to_f / denominator
49
+ end
50
+ @paths = [@paths.max do |path1, path2|
51
+ get_coverage.call(path1) <=> get_coverage.call(path2)
52
+ end]
53
+ end
37
54
  end
38
55
 
39
56
  # Given two contigs, return a consensus path and variants of the path.
@@ -293,7 +310,7 @@ module Bio
293
310
  def clustalo(sequences)
294
311
  i = 0
295
312
  stdin = sequences.collect{|s| i+=1; ">#{i}\n#{s}\n"}.join('')
296
- log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" #if log.debug?
313
+ log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" if log.debug?
297
314
  stdout = Bio::Commandeer.run "clustalo -t DNA -i - --output-order=input-order", {:stdin => stdin, :log => log}
298
315
  to_return = []
299
316
  header = true
@@ -101,7 +101,8 @@ class Bio::FinishM::ScaffoldBreaker
101
101
 
102
102
  unless seq.seq.match(/^[ATGCN]+$/i)
103
103
  example = seq.seq.match(/([^ATGCN])/i)[1]
104
- log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}, continuing optimistically, but not quite sure what will happen.. good luck"
104
+ log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}. Replacing them with Ns"
105
+ seq.seq.gsub! /[^ATGCN]/i, 'N'
105
106
  end
106
107
 
107
108
  if seq.seq.match(/^N+$/i)
@@ -2,21 +2,39 @@ class Bio::FinishM::ORFsFinder
2
2
  include Bio::FinishM::Logging
3
3
 
4
4
  DEFAULT_OPTIONS = {
5
- :min_orf_length => 100
5
+ :min_orf_length => 96
6
6
  }
7
7
 
8
8
  def add_options(optparse_object, options)
9
9
  options.merge! Bio::FinishM::Visualise::DEFAULT_OPTIONS
10
10
  options.merge! DEFAULT_OPTIONS
11
- optparse_object.banner = "\nUsage: finishm find_orfs --assembly-???
11
+ optparse_object.banner = "\nUsage: finishm find_orfs [--orf-amino-acids OUTPUT_FAA --orf-nucleotides OUTPUT_FNA]
12
12
 
13
13
  Find possible open reading frames in assembly graph
14
14
  \n\n"
15
15
 
16
- optparse_object.separator "Input genome information"
16
+ optparse_object.separator "\nOutput sequence files\n\n"
17
+ optparse_object.on("--orf-amino-acids OUTPUT_FAA", "Output ORF amino acid sequences [default: orf.faa unless --orf-nucleotides is specified]") do |arg|
18
+ options[:output_faa] = arg
19
+ end
20
+ optparse_object.on("--orf-nucleotides OUTPUT_FNA", "Output ORF nucleotide sequences [default: orf.fna unless --orf-amino-acids is specified]") do |arg|
21
+ options[:output_fna] = arg
22
+ end
23
+
24
+ optparse_object.separator "\nInput genome information"
17
25
  optparse_object.separator "\nIf an assembly is to be done, there must be some definition of reads:\n\n" #TODO improve this help
18
26
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)
19
27
 
28
+ optparse_object.separator "\nOptional arguments:\n\n"
29
+ optparse_object.on("--min-orf-length", "Minimum ORF length [default: 96]") do |arg|
30
+ length = arg.to_i
31
+ if length.to_s != arg or length.nil? or length < 1
32
+ raise "Unable to parse minimum orf length parameter #{arg}, cannot continue"
33
+ end
34
+ options[:min_orf_length] = length
35
+ end
36
+
37
+
20
38
  optparse_object.separator "\nOptional graph-exploration arguments:\n\n"
21
39
  Bio::FinishM::Visualise.new.add_probe_options(optparse_object, options)
22
40
 
@@ -59,7 +77,18 @@ class Bio::FinishM::ORFsFinder
59
77
  end
60
78
 
61
79
  initial_onodes = Bio::FinishM::PathCounter.new.get_leash_start_nodes(finishm_graph, options[:range])
62
- find_orfs_in_graph(finishm_graph, initial_onodes, options)
80
+ orfs = find_orfs_in_graph(finishm_graph, initial_onodes, options)
81
+ log.info "Found #{orfs.length} open reading frames longer than #{options[:min_orf_length]}."
82
+ if not options[:output_fna] and not options[:output_faa]
83
+ options[:output_fna] = 'orfs.fna'
84
+ end
85
+
86
+ if options[:output_fna]
87
+ write_orfs_to_file(orfs, options[:output_fna])
88
+ end
89
+ if options[:output_faa]
90
+ write_orfs_to_file(orfs, options[:output_faa], translate=true)
91
+ end
63
92
  end
64
93
 
65
94
  def find_orfs_in_graph(finishm_graph, initial_onodes, options={})
@@ -73,11 +102,24 @@ class Bio::FinishM::ORFsFinder
73
102
  orf_trails = orfer.find_orfs_in_graph(finishm_graph.graph, initial_paths,
74
103
  options[:min_orf_length], options[:range])
75
104
 
76
- found_orfs = orfer.orf_sequences_from_trails(orf_trails)
105
+ orfer.orf_sequences_from_trails(orf_trails, options[:min_orf_length])
106
+ end
77
107
 
78
- found_orfs.each_pair do |name, sequence|
79
- puts ">#{name}"
80
- puts sequence
108
+ def write_orfs_to_file(found_orfs, orfs_file, translate=false)
109
+ if translate
110
+ translator = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
111
+ end
112
+ File.open(orfs_file,'w') do |f|
113
+ counter = 0
114
+ found_orfs.each do |name_and_sequence|
115
+ counter += 1
116
+ f.puts ">finishm_orf_#{counter} #{name_and_sequence[0]}"
117
+ if translate
118
+ f.puts translator.sequence2AA(name_and_sequence[1][0...-3])
119
+ else
120
+ f.puts name_and_sequence[1]
121
+ end
122
+ end
81
123
  end
82
124
  end
83
125
 
@@ -10,6 +10,7 @@ class Bio::FinishM::RoundUp
10
10
  :gapfill_only => false,
11
11
  :max_explore_nodes => 10000,
12
12
  :max_gapfill_paths => 10,
13
+ :gapfill_with_max_coverage => false,
13
14
  }
14
15
 
15
16
  def add_options(optparse_object, options)
@@ -64,6 +65,9 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
64
65
  optparse_object.on("--max-explore-nodes NUM", Integer, "Only explore this many nodes. If max is reached, do not make connections. [default: #{options[:max_explore_nodes] }]") do |arg|
65
66
  options[:max_explore_nodes] = arg
66
67
  end
68
+ optparse_object.on("--gapfill-with-max-coverage", "When gapfilling, take the path with maximal coverage and do not print variants [default: #{options[:gapfill_with_max_coverage] }]") do
69
+ options[:gapfill_with_max_coverage] = true
70
+ end
67
71
  optparse_object.on("--debug", "Build the graph, then drop to a pry console. [default: #{options[:debug] }]") do
68
72
  options[:debug] = true
69
73
  end
@@ -193,6 +197,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
193
197
  # Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
194
198
  scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
195
199
  else
200
+ acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
196
201
  scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
197
202
  master_graph.graph,
198
203
  scaffold_sequence,
@@ -321,7 +326,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
321
326
  return gapfilled_sequence, num_gapfills, all_variants
322
327
  end
323
328
 
324
- def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths)
329
+ def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths, options)
325
330
  scaffold_sequence = nil
326
331
  gapfilled = -1
327
332
  if aconn.paths.length == 0 or aconn.paths.length > max_gapfill_paths
@@ -329,6 +334,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
329
334
  scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
330
335
  gapfilled = false
331
336
  else
337
+ acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
332
338
  scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
333
339
  master_graph.graph, first_sequence, aconn, second_sequence, master_graph.velvet_sequences
334
340
  )
@@ -38,7 +38,8 @@ class Bio::FinishM::Visualise
38
38
  return validate_argv_length(argv) ||
39
39
  validate_visualisation_options(options) ||
40
40
  validate_probe_options(options) ||
41
- validate_assembly_options(options)
41
+ validate_assembly_options(options) ||
42
+ validate_scaffold_options(options)
42
43
  end
43
44
 
44
45
  def add_visualisation_options(optparse_object, options)
@@ -79,8 +80,8 @@ class Bio::FinishM::Visualise
79
80
 
80
81
  def validate_scaffold_options(options)
81
82
  # If scaffolds are defined, then probe genomes must also be defined
82
- if options[:scaffolds] and !options[:assembly_files]
83
- return "If --scaffolds is defined, so then must --genomes"
83
+ if options[:scaffold_sides] and !options[:assembly_files]
84
+ return "If --scaffolds is given, then --genomes must also be given"
84
85
  end
85
86
  end
86
87
 
@@ -195,6 +195,127 @@ describe "AllOrfs" do
195
195
  res.collect{|result| result.initial_start_markers}.should == [[],[]]
196
196
  end
197
197
 
198
+ it 'should find two same-phase orfs along a trail' do
199
+ graph, = GraphTesting.emit_otrails([
200
+ [1,2,3]
201
+ ])
202
+ graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
203
+ graph.nodes[2].ends_of_kmers_of_node = 'AATAAATGGA' #stop codon 'TAA', start codon 'ATG'
204
+ graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
205
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
206
+
207
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
208
+ problems = orfer.find_all_problems(graph, [initial_path])
209
+ #pp problems
210
+
211
+ paths = orfer.find_orfs_from_problems(problems)
212
+ #pp paths
213
+ GraphTesting.sorted_paths(paths.trails).should == [
214
+ [1,2,3]
215
+ ]
216
+ res = paths.trails[0].fwd_orfs_result
217
+ GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
218
+ [6,15],
219
+ [18,30]
220
+ ]
221
+ GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
222
+ [[1,6],[2,5]],
223
+ [[2,8], [3,10]]
224
+ ]
225
+ res.initial_start_markers.should == []
226
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
227
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
228
+ res.final_start_markers.should == []
229
+ end
230
+
231
+ it 'should end orfs at first stop codon in forward direction' do
232
+ graph, = GraphTesting.emit_otrails([
233
+ [1,2,3]
234
+ ])
235
+ graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
236
+ graph.nodes[2].ends_of_kmers_of_node = 'AATAAAAAGA' #stop codon 'TAA'
237
+ graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
238
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
239
+
240
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
241
+ problems = orfer.find_all_problems(graph, [initial_path])
242
+ #pp problems
243
+
244
+ paths = orfer.find_orfs_from_problems(problems)
245
+ #pp paths
246
+ GraphTesting.sorted_paths(paths.trails).should == [
247
+ [1,2,3]
248
+ ]
249
+ res = paths.trails[0].fwd_orfs_result
250
+ GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
251
+ [6,15]
252
+ ]
253
+ GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
254
+ [[1,6],[2,5]]
255
+ ]
256
+ res.initial_start_markers.should == []
257
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
258
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
259
+ res.final_start_markers.should == []
260
+ end
261
+
262
+ it 'should end orfs at first stop codon in twin direction' do
263
+ graph = GraphTesting.emit([
264
+ [1,2],
265
+ [2,3]
266
+ ])
267
+ graph.nodes[1].ends_of_kmers_of_twin_node = 'TTAGTTTTTT' # stop codon 'TAG'
268
+ graph.nodes[2].ends_of_kmers_of_twin_node = 'TTTAGTTTTT' # stop codon 'TAG'
269
+ graph.nodes[3].ends_of_kmers_of_twin_node = 'TAAATGTTTT' # stop codon 'TAA', start codon 'ATG'
270
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
271
+
272
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
273
+ problems = orfer.find_all_problems(graph, [initial_path])
274
+ #pp problems
275
+
276
+ paths = orfer.find_orfs_from_problems(problems)
277
+ #pp paths
278
+ GraphTesting.sorted_paths(paths.trails).should == [
279
+ [1,2,3]
280
+ ]
281
+ res = paths.trails[0].twin_orfs_result
282
+ GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
283
+ [6,15]
284
+ ]
285
+ GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
286
+ [[3,6],[2,5]]
287
+ ]
288
+ res.initial_start_markers.should == []
289
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
290
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[3,3]]
291
+ res.final_start_markers.should == []
292
+ end
293
+
294
+ it 'should return the first initial stop codon in forward direction' do
295
+ graph = GraphTesting.emit([
296
+ [1,2],
297
+ [2,3]
298
+ ])
299
+ graph.nodes[1].ends_of_kmers_of_node = 'AAATAGAAAA' # stop codon 'TAG'
300
+ graph.nodes[2].ends_of_kmers_of_node = 'AATAGAAAAA' # stop codon 'TAG'
301
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
302
+
303
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
304
+ problems = orfer.find_all_problems(graph, [initial_path])
305
+ #pp problems
306
+
307
+ paths = orfer.find_orfs_from_problems(problems)
308
+ #pp paths
309
+ GraphTesting.sorted_paths(paths.trails).should == [
310
+ [1,2,3]
311
+ ]
312
+ res = paths.trails[0].fwd_orfs_result
313
+ res.start_stop_pairs.should == []
314
+ res.initial_start_markers.should == []
315
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [6]
316
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,6]]
317
+ res.final_start_markers.should == []
318
+ end
198
319
 
199
320
  it 'should respect terminal nodes' do
200
321
  fail '#todo'
@@ -433,11 +554,79 @@ describe "AllOrfs" do
433
554
 
434
555
  paths = orfer.find_orfs_from_problems(problems)
435
556
  #pp paths
436
- orfer.orf_sequences_from_trails(paths.trails).should == {
437
- '(1s:6),2s,(3s:10)' => 'ATGGAAAAAAAAAAAAAAAAAAAATAA',
438
- '1s,2s,3s' => 'T'*30,
439
- ',(1s:3)' => 'TAA'
440
- }
557
+ orfer.orf_sequences_from_trails(paths.trails).should == [
558
+ ['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
559
+ [',(1s:3)', 'TAA'],
560
+ ['1s,2s,3s', 'T'*30],
561
+ ['1s,2s,3s', 'T'*27],
562
+ ['1s,2s,3s', 'T'*27]
563
+ ]
564
+ end
565
+
566
+ it 'should respect minimum orf length' do
567
+ graph = GraphTesting.emit([
568
+ [1,2],
569
+ [2,3]
570
+ ])
571
+ graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
572
+ graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
573
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
574
+
575
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
576
+ problems = orfer.find_all_problems(graph, [initial_path])
577
+
578
+ paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 30)
579
+ orfer.orf_sequences_from_trails(paths.trails, 30).should == [
580
+ ['1s,2s,3s', 'T'*30]
581
+ ]
582
+
583
+ paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 20)
584
+ orfer.orf_sequences_from_trails(paths.trails, 20).should == [
585
+ ['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
586
+ ['1s,2s,3s', 'T'*30],
587
+ ['1s,2s,3s', 'T'*27],
588
+ ['1s,2s,3s', 'T'*27]
589
+ ]
590
+
591
+ paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 0)
592
+ orfer.orf_sequences_from_trails(paths.trails, 0).should == [
593
+ ['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
594
+ [',(1s:3)', 'TAA'],
595
+ ['1s,2s,3s', 'T'*30],
596
+ ['1s,2s,3s', 'T'*27],
597
+ ['1s,2s,3s', 'T'*27]
598
+ ]
599
+
600
+ end
601
+ end
602
+
603
+ describe 'sequence2AA' do
604
+ it 'should return corresponding amino acids for an orf sequence' do
605
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
606
+ orfer.sequence2AA('GCTGCCGCAGCG').should == 'AAAA'
607
+ orfer.sequence2AA('CGTCGCCGACGGAGAAGG').should == 'RRRRRR'
608
+ orfer.sequence2AA('AATAAC').should == 'NN'
609
+ orfer.sequence2AA('GATGAC').should == 'DD'
610
+ orfer.sequence2AA('TGTTGC').should == 'CC'
611
+ orfer.sequence2AA('CAACAG').should == 'QQ'
612
+ orfer.sequence2AA('GAAGAG').should == 'EE'
613
+ orfer.sequence2AA('GGTGGCGGAGGG').should == 'GGGG'
614
+ orfer.sequence2AA('CATCAC').should == 'HH'
615
+ orfer.sequence2AA('ATTATCATA').should == 'III'
616
+ orfer.sequence2AA('TTATTGCTTCTCCTACTG').should == 'LLLLLL'
617
+ orfer.sequence2AA('AAAAAG').should == 'KK'
618
+ orfer.sequence2AA('ATG').should == 'M'
619
+ orfer.sequence2AA('TTTTTC').should == 'FF'
620
+ orfer.sequence2AA('CCTCCCCCACCG').should == 'PPPP'
621
+ orfer.sequence2AA('TCTTCCTCATCGAGTAGC').should == 'SSSSSS'
622
+ orfer.sequence2AA('ACTACCACAACG').should == 'TTTT'
623
+ orfer.sequence2AA('TGG').should == 'W'
624
+ orfer.sequence2AA('TATTAC').should == 'YY'
625
+ orfer.sequence2AA('GTTGTCGTAGTG').should == 'VVVV'
626
+ lambda { orfer.sequence2AA('TAA') }.should raise_error
627
+ lambda { orfer.sequence2AA('TGA') }.should raise_error
628
+ lambda { orfer.sequence2AA('TAG') }.should raise_error
629
+ lambda { orfer.sequence2AA('ABCXYZ') }.should raise_error
441
630
  end
442
631
  end
443
632
  end
@@ -42,7 +42,7 @@ describe "ContigPrinter" do
42
42
  '14S:GTT',
43
43
  ].sort
44
44
  end
45
-
45
+
46
46
  it 'should handle not variants' do
47
47
  seqs = [
48
48
  'ATGAATATGTGCATAGGATT',
@@ -235,7 +235,7 @@ describe "ContigPrinter" do
235
235
  GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
236
236
  GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
237
237
  ]
238
- expected =
238
+ expected =
239
239
  'ATGAACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGAGACCTTCGGGTCTAGTGGCGCACGGGTGCGTAACGCGTGGGAATCTGCCCTTGGGTACGG'+
240
240
  'AATAACAGTTAGAAATGACTGCTAATACCGTATAATGACTTCGGTCCAAAGATTTATCGCCCAGGGATGAGCCCGCGTAGGATTAGCTTGTTGGTGAGGTAAANN'+
241
241
  'NTNNCNNANNNNNNNNNNNNTNNNNNGNNNNNNNNNNNGNTNAGNNNCNNNGNNNNNGNGANNTGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGC'+
@@ -283,9 +283,27 @@ describe "ContigPrinter" do
283
283
  it 'should handle when start_coord is not == 0 and both reads are outwards facing' do
284
284
  raise
285
285
  end
286
-
286
+
287
287
  it 'should handle when the example path is not the same length as the reference path' do
288
288
  fail
289
289
  end
290
290
  end
291
+
292
+ describe 'AnchoredConnection' do
293
+ it 'should collapse_paths_to_maximal_coverage_path!' do
294
+ graph = Bio::Velvet::Graph.parse_from_file(File.join TEST_DATA_DIR, 'contig_printer','1','seq.fa.velvet','LastGraph')
295
+ graph.nodes.length.should == 13
296
+ acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
297
+ acon.start_probe_noded_read = graph.nodes[9].short_reads.select{|nr| nr.read_id == 161}[0] #Found these by using bwa and inspecting the Sequence velvet file
298
+ acon.end_probe_noded_read = graph.nodes[4].short_reads.select{|nr| nr.read_id == 1045}[0]
299
+ acon.start_probe_contig_offset = 2
300
+ acon.end_probe_contig_offset = 3
301
+ acon.paths = [
302
+ GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
303
+ GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
304
+ ]
305
+ acon.collapse_paths_to_maximal_coverage_path!
306
+ acon.paths.collect{|path| path.to_shorthand}.should == [%w(9s 12s 7e 13s 5e 11e 2s 10s 4e).join(',')]
307
+ end
308
+ end
291
309
  end
@@ -141,4 +141,18 @@ describe "ScaffoldBreaker" do
141
141
  brokes[0].sequence.should == seq
142
142
  end
143
143
  end
144
+
145
+ it 'should replace non-ATGC characters with N' do
146
+ breaker = Bio::FinishM::ScaffoldBreaker.new
147
+ Tempfile.open('a') do |tmp|
148
+ tmp.puts '>ab'
149
+ seq = 'AAAAANNNGGGYYYTTNNAA'
150
+ tmp.puts seq
151
+ # 1234567890123456789
152
+ tmp.close
153
+
154
+ brokes = breaker.break_scaffolds(tmp.path)
155
+ brokes[0].sequence.should == 'AAAAANNNGGGNNNTTNNAA'
156
+ end
157
+ end
144
158
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: finishm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J. Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-30 00:00:00.000000000 Z
11
+ date: 2015-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-ipcress