finishm 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d4185d82df8d5de264dee94a05867f051d0332b1
4
- data.tar.gz: 313d7fd065da5b8fb74e3acd1ed0ed6a20f2b827
3
+ metadata.gz: ce5a708f7e7dbd95e357c0e58f94ef29c49d0cbf
4
+ data.tar.gz: ddf1f623a8045f626455e6c25bf78176c792407d
5
5
  SHA512:
6
- metadata.gz: 0e74180aa4d37ee0307d86ab8352d63531612a76be161366d83e4dbc09a7bfc71d42f0152bafc3e503a5ae5bba6bd00040299d2d0a38717266c8a36a5f873a9c
7
- data.tar.gz: 42237938390eb63124b5e9cc33fd01fc69d6ca354117c4745b13980d6f149fbeb0f9759eb36b91701d2f97bfb02a47a9e5cd800837e87d578eb712e06a889581
6
+ metadata.gz: 6d73d9bbbc21761606fc5503d8818792d5a083f766e6a8c62336e17b50505c4c9c1897b79307f16f1e30b9f97b484695baf7ee46e009de5a375ba3660e26ede4
7
+ data.tar.gz: 718ad349ab3c87b1d370ed22eb4d7c68574f67ba6c0ab7a477db94211d12b9920e259c2f0aa05194df0e6e5ca79b5167f22f82708c65d50f2adc613b2a180ee1
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.4
@@ -18,12 +18,16 @@ global = OptionParser.new do |opts|
18
18
  opts.banner = "
19
19
  Usage: #{SCRIPT_NAME} <command> [<arguments>]
20
20
 
21
- FinishM is a collection of tasks related to assembly and metagenome assembly. Available commands:
21
+ FinishM is a collection of tasks related to assembly and metagenome assembly. Common commands:
22
+
23
+ roundup\tImprove a genome by connecting scaffolds and gapfilling
24
+ visualise\tVisualise the DeBruijn graph
25
+
26
+ Other available commands:
22
27
 
23
28
  wander\tTry to connect contigs (experimental)
24
29
  gapfill\tFill assembly gaps (N characters) (experimental)
25
30
  explore\tWhat happens in the graph beyond the end of my contig(s)? (experimental)
26
- visualise\tVisualise the DeBruijn graph (experimental)
27
31
 
28
32
  Commands for PCR finishing:
29
33
 
@@ -2,17 +2,17 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: finishm 0.0.2 ruby lib
5
+ # stub: finishm 0.0.4 ruby lib
6
6
  # stub: ext/mkrf_conf.rb
7
7
 
8
8
  Gem::Specification.new do |s|
9
9
  s.name = "finishm"
10
- s.version = "0.0.2"
10
+ s.version = "0.0.4"
11
11
 
12
12
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
13
13
  s.require_paths = ["lib"]
14
14
  s.authors = ["Ben J. Woodcroft"]
15
- s.date = "2015-07-30"
15
+ s.date = "2015-08-30"
16
16
  s.description = "De-novo assemblies generally only provide draft genomes. FinishM is aimed at improving these draft assemblies."
17
17
  s.email = "donttrustben near gmail.com"
18
18
  s.executables = ["finishm"]
@@ -9,6 +9,32 @@ module Bio
9
9
  CODON_LENGTH = 3
10
10
  START_CODONS = ['ATG']
11
11
  STOP_CODONS = ['TAG', 'TAA', 'TGA']
12
# One-letter amino acid code => DNA codons that encode it.
# The three stop codons (see STOP_CODONS) have no entry here.
CODONS = {
  'A' => %w[GCT GCC GCA GCG],
  'R' => %w[CGT CGC CGA CGG AGA AGG],
  'N' => %w[AAT AAC],
  'D' => %w[GAT GAC],
  'C' => %w[TGT TGC],
  'Q' => %w[CAA CAG],
  'E' => %w[GAA GAG],
  'G' => %w[GGT GGC GGA GGG],
  'H' => %w[CAT CAC],
  'I' => %w[ATT ATC ATA],
  'L' => %w[TTA TTG CTT CTC CTA CTG],
  'K' => %w[AAA AAG],
  'M' => %w[ATG],
  'F' => %w[TTT TTC],
  'P' => %w[CCT CCC CCA CCG],
  'S' => %w[TCT TCC TCA TCG AGT AGC],
  'T' => %w[ACT ACC ACA ACG],
  'W' => %w[TGG],
  'Y' => %w[TAT TAC],
  'V' => %w[GTT GTC GTA GTG]
}.each_value(&:freeze).freeze

# Reverse lookup derived from CODONS: codon => one-letter amino acid code.
TRANSLATOR = CODONS.each_with_object({}) do |(amino_acid, codons), lookup|
  codons.each { |codon| lookup[codon] = amino_acid }
end.freeze
12
38
 
13
39
  # Search for open reading frames in a graph, in all the paths beginning at a set of
14
40
  # nodes through a graph (or a subset defined by range)
@@ -78,7 +104,7 @@ module Bio
78
104
  max_num_paths = options[:max_gapfill_paths]
79
105
  max_num_paths ||= 2196
80
106
  max_cycles = options[:max_cycles] || 1
81
- min_orf_length = options[:minimum_orf_length] || 0
107
+ min_orf_length = options[:min_orf_length] || 0
82
108
 
83
109
  counter = SingleCoherentPathsBetweenNodesFinder::CycleCounter.new(max_cycles)
84
110
  decide_stack = lambda do |to_push|
@@ -474,10 +500,11 @@ module Bio
474
500
  return to_return
475
501
  end
476
502
 
477
- def orf_sequences_from_trails(trails)
478
- to_return = {}
503
+ def orf_sequences_from_trails(trails, min_orf_length=nil)
504
+ to_return = []
479
505
  trails.each do |trail|
480
506
  fwd_sequence, twin_sequence = trail.otrail.sequences_within_path
507
+ trail_length = fwd_sequence.length
481
508
  # forward / twin directions
482
509
  [
483
510
  [fwd_sequence, trail.fwd_orfs_result],
@@ -500,10 +527,12 @@ module Bio
500
527
  end
501
528
  name = "(#{onodes[0].to_shorthand}:#{pair[0].position_in_node}),#{onodes[1...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{pair[1].position_in_node})"
502
529
 
503
- to_return[name] ||= sequence[start_position...end_position]
530
+ to_return.push [name, sequence[start_position...end_position]]
504
531
  end
505
532
  result.initial_stop_markers.each do |marker|
506
533
  end_position = marker.position_in_trail
534
+ start_position = end_position % 3 #trim sequence to multiple of 3
535
+ next if min_orf_length and end_position - start_position < min_orf_length
507
536
 
508
537
  # orf_name
509
538
  last_node = nil
@@ -514,20 +543,29 @@ module Bio
514
543
  end
515
544
  name = "#{onodes[0...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{marker.position_in_node})"
516
545
 
517
- to_return[name] ||= sequence[0...end_position]
546
+ to_return.push [name, sequence[start_position...end_position]]
518
547
  end
519
548
  result.final_start_markers.each do |marker|
520
549
  start_position = marker.position_in_trail - 3
550
+ end_position = (trail_length - start_position) % 3
551
+ next if min_orf_length and trail_length - end_position - start_position < min_orf_length
521
552
 
522
553
  # orf_name
523
554
  onodes = trail.otrail.trail.drop_while{|onode| onode.node != marker.node}
524
555
  name = "(#{onodes[0].to_shorthand}:#{marker.position_in_node}),#{onodes[1..-1].collect{|onode| onode.to_shorthand}.join(',')}"
556
+ to_return.push [name, sequence[start_position..-1-end_position]]
525
557
  end
526
558
  end
527
559
  if result.nil? or (result.start_stop_pairs.empty? and result.final_start_markers.empty? and result.initial_stop_markers.empty?)
528
- name = "#{trail.otrail.to_shorthand}"
560
+ (0..2).each do |frame|
561
+ start_position = frame
562
+ end_position = (trail_length - start_position) % 3
563
+ next if min_orf_length and trail_length - end_position - start_position < min_orf_length
529
564
 
530
- to_return[name] ||= sequence
565
+ # orf_name
566
+ name = "#{trail.otrail.to_shorthand}"
567
+ to_return.push [name, sequence[start_position..-1-end_position]]
568
+ end
531
569
  end
532
570
  end
533
571
  end
@@ -535,6 +573,22 @@ module Bio
535
573
  return to_return
536
574
  end
537
575
 
576
# Translate a nucleotide sequence into a string of one-letter amino acid codes.
#
# The sequence is read as consecutive 3-base codons starting at its first
# base, and each codon is looked up in TRANSLATOR.
#
# Raises a RuntimeError when a codon has no TRANSLATOR entry. That covers any
# codon missing from the table as well as a trailing fragment shorter than
# 3 bases.
def sequence2AA(sequence)
  debugging = log.debug?
  # Single pass over the codons; avoids re-slicing the remaining sequence and
  # re-allocating the result string on every iteration.
  sequence.each_char.each_slice(3).each_with_object(''.dup) do |bases, aa|
    codon = bases.join
    log.debug "Found next codon #{codon}" if debugging
    residue = TRANSLATOR.fetch(codon) do
      raise "Cannot translate invalid codon #{codon} in sequence #{sequence}."
    end
    log.debug "Codon translated to #{residue}" if debugging
    aa << residue
  end
end
591
+
538
592
  # positions of last base of codons
539
593
  class Marker
540
594
  attr_accessor :position_in_trail, :position_in_node, :node
@@ -34,6 +34,23 @@ module Bio
34
34
  # Enumerable of Enumerables of OrientedNode objects, each list of OrientedNode objects
35
35
  # corresponds to a path that forms the connection
36
36
  attr_accessor :paths
37
+
38
# Remove all except the path with maximal coverage from @paths.
#
# A path's coverage is the mean of its nodes' coverage, weighted by each
# node's length_alone. A path whose total length_alone is zero is treated as
# having coverage 0.0, so the comparison never sees NaN.
#
# Does nothing when @paths is nil or empty. Otherwise @paths becomes a
# one-element Array holding the chosen path.
def collapse_paths_to_maximal_coverage_path!
  return if @paths.nil? || @paths.empty?

  # max_by computes each path's coverage once, not once per comparison.
  best_path = @paths.max_by do |path|
    total_length = 0
    weighted_coverage = 0
    path.each do |onode|
      node = onode.node
      weighted_coverage += node.coverage * node.length_alone
      total_length += node.length_alone
    end
    total_length.zero? ? 0.0 : weighted_coverage.to_f / total_length
  end
  @paths = [best_path]
end
37
54
  end
38
55
 
39
56
  # Given two contigs, return a consensus path and variants of the path.
@@ -293,7 +310,7 @@ module Bio
293
310
  def clustalo(sequences)
294
311
  i = 0
295
312
  stdin = sequences.collect{|s| i+=1; ">#{i}\n#{s}\n"}.join('')
296
- log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" #if log.debug?
313
+ log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" if log.debug?
297
314
  stdout = Bio::Commandeer.run "clustalo -t DNA -i - --output-order=input-order", {:stdin => stdin, :log => log}
298
315
  to_return = []
299
316
  header = true
@@ -101,7 +101,8 @@ class Bio::FinishM::ScaffoldBreaker
101
101
 
102
102
  unless seq.seq.match(/^[ATGCN]+$/i)
103
103
  example = seq.seq.match(/([^ATGCN])/i)[1]
104
- log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}, continuing optimistically, but not quite sure what will happen.. good luck"
104
+ log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{example}. Replacing them with Ns"
105
+ seq.seq.gsub! /[^ATGCN]/i, 'N'
105
106
  end
106
107
 
107
108
  if seq.seq.match(/^N+$/i)
@@ -2,21 +2,39 @@ class Bio::FinishM::ORFsFinder
2
2
  include Bio::FinishM::Logging
3
3
 
4
4
  DEFAULT_OPTIONS = {
5
- :min_orf_length => 100
5
+ :min_orf_length => 96
6
6
  }
7
7
 
8
8
  def add_options(optparse_object, options)
9
9
  options.merge! Bio::FinishM::Visualise::DEFAULT_OPTIONS
10
10
  options.merge! DEFAULT_OPTIONS
11
- optparse_object.banner = "\nUsage: finishm find_orfs --assembly-???
11
+ optparse_object.banner = "\nUsage: finishm find_orfs [--orf-amino-acids OUTPUT_FAA --orf-nucleotides OUTPUT_FNA]
12
12
 
13
13
  Find possible open reading frames in assembly graph
14
14
  \n\n"
15
15
 
16
- optparse_object.separator "Input genome information"
16
+ optparse_object.separator "\nOutput sequence files\n\n"
17
+ optparse_object.on("--orf-amino-acids OUTPUT_FAA", "Output ORF amino acid sequences [default: orf.faa unless --orf-nucleotides is specified]") do |arg|
18
+ options[:output_faa] = arg
19
+ end
20
+ optparse_object.on("--orf-nucleotides OUTPUT_FNA", "Output ORF nucleotide sequences [default: orf.fna unless --orf-amino-acids is specified]") do |arg|
21
+ options[:output_fna] = arg
22
+ end
23
+
24
+ optparse_object.separator "\nInput genome information"
17
25
  optparse_object.separator "\nIf an assembly is to be done, there must be some definition of reads:\n\n" #TODO improve this help
18
26
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)
19
27
 
28
+ optparse_object.separator "\nOptional arguments:\n\n"
29
+ optparse_object.on("--min-orf-length", "Minimum ORF length [default: 96]") do |arg|
30
+ length = arg.to_i
31
+ if length.to_s != arg or length.nil? or length < 1
32
+ raise "Unable to parse minimum orf length parameter #{arg}, cannot continue"
33
+ end
34
+ options[:min_orf_length] = length
35
+ end
36
+
37
+
20
38
  optparse_object.separator "\nOptional graph-exploration arguments:\n\n"
21
39
  Bio::FinishM::Visualise.new.add_probe_options(optparse_object, options)
22
40
 
@@ -59,7 +77,18 @@ class Bio::FinishM::ORFsFinder
59
77
  end
60
78
 
61
79
  initial_onodes = Bio::FinishM::PathCounter.new.get_leash_start_nodes(finishm_graph, options[:range])
62
- find_orfs_in_graph(finishm_graph, initial_onodes, options)
80
+ orfs = find_orfs_in_graph(finishm_graph, initial_onodes, options)
81
+ log.info "Found #{orfs.length} open reading frames longer than #{options[:min_orf_length]}."
82
+ if not options[:output_fna] and not options[:output_faa]
83
+ options[:output_fna] = 'orfs.fna'
84
+ end
85
+
86
+ if options[:output_fna]
87
+ write_orfs_to_file(orfs, options[:output_fna])
88
+ end
89
+ if options[:output_faa]
90
+ write_orfs_to_file(orfs, options[:output_faa], translate=true)
91
+ end
63
92
  end
64
93
 
65
94
  def find_orfs_in_graph(finishm_graph, initial_onodes, options={})
@@ -73,11 +102,24 @@ class Bio::FinishM::ORFsFinder
73
102
  orf_trails = orfer.find_orfs_in_graph(finishm_graph.graph, initial_paths,
74
103
  options[:min_orf_length], options[:range])
75
104
 
76
- found_orfs = orfer.orf_sequences_from_trails(orf_trails)
105
+ orfer.orf_sequences_from_trails(orf_trails, options[:min_orf_length])
106
+ end
77
107
 
78
- found_orfs.each_pair do |name, sequence|
79
- puts ">#{name}"
80
- puts sequence
108
# Write ORFs to a FASTA-formatted file, overwriting any existing file.
#
# found_orfs:: Enumerable of [name, nucleotide_sequence] pairs
# orfs_file::  path of the file to write
# translate::  when true, write amino acid sequences instead of nucleotides
#
# Each record header is ">finishm_orf_<n> <name>", with n counting from 1.
def write_orfs_to_file(found_orfs, orfs_file, translate=false)
  translator = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new if translate
  File.open(orfs_file, 'w') do |f|
    found_orfs.each_with_index do |(name, sequence), index|
      f.puts ">finishm_orf_#{index + 1} #{name}"
      if translate
        # The last 3 bases are dropped before translation.
        # NOTE(review): presumably this strips a terminal stop codon; an ORF
        # that does not end in a stop codon would lose a real codon - confirm.
        f.puts translator.sequence2AA(sequence[0...-3])
      else
        f.puts sequence
      end
    end
  end
end
83
125
 
@@ -10,6 +10,7 @@ class Bio::FinishM::RoundUp
10
10
  :gapfill_only => false,
11
11
  :max_explore_nodes => 10000,
12
12
  :max_gapfill_paths => 10,
13
+ :gapfill_with_max_coverage => false,
13
14
  }
14
15
 
15
16
  def add_options(optparse_object, options)
@@ -64,6 +65,9 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
64
65
  optparse_object.on("--max-explore-nodes NUM", Integer, "Only explore this many nodes. If max is reached, do not make connections. [default: #{options[:max_explore_nodes] }]") do |arg|
65
66
  options[:max_explore_nodes] = arg
66
67
  end
68
+ optparse_object.on("--gapfill-with-max-coverage", "When gapfilling, take the path with maximal coverage and do not print variants [default: #{options[:gapfill_with_max_coverage] }]") do
69
+ options[:gapfill_with_max_coverage] = true
70
+ end
67
71
  optparse_object.on("--debug", "Build the graph, then drop to a pry console. [default: #{options[:debug] }]") do
68
72
  options[:debug] = true
69
73
  end
@@ -193,6 +197,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
193
197
  # Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
194
198
  scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
195
199
  else
200
+ acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
196
201
  scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
197
202
  master_graph.graph,
198
203
  scaffold_sequence,
@@ -321,7 +326,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
321
326
  return gapfilled_sequence, num_gapfills, all_variants
322
327
  end
323
328
 
324
- def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths)
329
+ def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths, options)
325
330
  scaffold_sequence = nil
326
331
  gapfilled = -1
327
332
  if aconn.paths.length == 0 or aconn.paths.length > max_gapfill_paths
@@ -329,6 +334,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
329
334
  scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
330
335
  gapfilled = false
331
336
  else
337
+ acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
332
338
  scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
333
339
  master_graph.graph, first_sequence, aconn, second_sequence, master_graph.velvet_sequences
334
340
  )
@@ -38,7 +38,8 @@ class Bio::FinishM::Visualise
38
38
  return validate_argv_length(argv) ||
39
39
  validate_visualisation_options(options) ||
40
40
  validate_probe_options(options) ||
41
- validate_assembly_options(options)
41
+ validate_assembly_options(options) ||
42
+ validate_scaffold_options(options)
42
43
  end
43
44
 
44
45
  def add_visualisation_options(optparse_object, options)
@@ -79,8 +80,8 @@ class Bio::FinishM::Visualise
79
80
 
80
81
  def validate_scaffold_options(options)
81
82
  # If scaffolds are defined, then probe genomes must also be defined
82
- if options[:scaffolds] and !options[:assembly_files]
83
- return "If --scaffolds is defined, so then must --genomes"
83
+ if options[:scaffold_sides] and !options[:assembly_files]
84
+ return "If --scaffolds is given, then --genomes must also be given"
84
85
  end
85
86
  end
86
87
 
@@ -195,6 +195,127 @@ describe "AllOrfs" do
195
195
  res.collect{|result| result.initial_start_markers}.should == [[],[]]
196
196
  end
197
197
 
198
+ it 'should find two same-phase orfs along a trail' do
199
+ graph, = GraphTesting.emit_otrails([
200
+ [1,2,3]
201
+ ])
202
+ graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
203
+ graph.nodes[2].ends_of_kmers_of_node = 'AATAAATGGA' #stop codon 'TAA', start codon 'ATG'
204
+ graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
205
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
206
+
207
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
208
+ problems = orfer.find_all_problems(graph, [initial_path])
209
+ #pp problems
210
+
211
+ paths = orfer.find_orfs_from_problems(problems)
212
+ #pp paths
213
+ GraphTesting.sorted_paths(paths.trails).should == [
214
+ [1,2,3]
215
+ ]
216
+ res = paths.trails[0].fwd_orfs_result
217
+ GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
218
+ [6,15],
219
+ [18,30]
220
+ ]
221
+ GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
222
+ [[1,6],[2,5]],
223
+ [[2,8], [3,10]]
224
+ ]
225
+ res.initial_start_markers.should == []
226
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
227
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
228
+ res.final_start_markers.should == []
229
+ end
230
+
231
+ it 'should end orfs at first stop codon in forward direction' do
232
+ graph, = GraphTesting.emit_otrails([
233
+ [1,2,3]
234
+ ])
235
+ graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
236
+ graph.nodes[2].ends_of_kmers_of_node = 'AATAAAAAGA' #stop codon 'TAA'
237
+ graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
238
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
239
+
240
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
241
+ problems = orfer.find_all_problems(graph, [initial_path])
242
+ #pp problems
243
+
244
+ paths = orfer.find_orfs_from_problems(problems)
245
+ #pp paths
246
+ GraphTesting.sorted_paths(paths.trails).should == [
247
+ [1,2,3]
248
+ ]
249
+ res = paths.trails[0].fwd_orfs_result
250
+ GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
251
+ [6,15]
252
+ ]
253
+ GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
254
+ [[1,6],[2,5]]
255
+ ]
256
+ res.initial_start_markers.should == []
257
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
258
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,3]]
259
+ res.final_start_markers.should == []
260
+ end
261
+
262
+ it 'should end orfs at first stop codon in twin direction' do
263
+ graph = GraphTesting.emit([
264
+ [1,2],
265
+ [2,3]
266
+ ])
267
+ graph.nodes[1].ends_of_kmers_of_twin_node = 'TTAGTTTTTT' # stop codon 'TAG'
268
+ graph.nodes[2].ends_of_kmers_of_twin_node = 'TTTAGTTTTT' # stop codon 'TAG'
269
+ graph.nodes[3].ends_of_kmers_of_twin_node = 'TAAATGTTTT' # stop codon 'TAA', start codon 'ATG'
270
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
271
+
272
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
273
+ problems = orfer.find_all_problems(graph, [initial_path])
274
+ #pp problems
275
+
276
+ paths = orfer.find_orfs_from_problems(problems)
277
+ #pp paths
278
+ GraphTesting.sorted_paths(paths.trails).should == [
279
+ [1,2,3]
280
+ ]
281
+ res = paths.trails[0].twin_orfs_result
282
+ GraphTesting.sorted_marker_pair_positions(res.start_stop_pairs).should == [
283
+ [6,15]
284
+ ]
285
+ GraphTesting.sorted_marker_pair_node_positions(res.start_stop_pairs).should == [
286
+ [[3,6],[2,5]]
287
+ ]
288
+ res.initial_start_markers.should == []
289
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [3]
290
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[3,3]]
291
+ res.final_start_markers.should == []
292
+ end
293
+
294
+ it 'should return the first initial stop codon in forward direction' do
295
+ graph = GraphTesting.emit([
296
+ [1,2],
297
+ [2,3]
298
+ ])
299
+ graph.nodes[1].ends_of_kmers_of_node = 'AAATAGAAAA' # stop codon 'TAG'
300
+ graph.nodes[2].ends_of_kmers_of_node = 'AATAGAAAAA' # stop codon 'TAG'
301
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
302
+
303
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
304
+ problems = orfer.find_all_problems(graph, [initial_path])
305
+ #pp problems
306
+
307
+ paths = orfer.find_orfs_from_problems(problems)
308
+ #pp paths
309
+ GraphTesting.sorted_paths(paths.trails).should == [
310
+ [1,2,3]
311
+ ]
312
+ res = paths.trails[0].fwd_orfs_result
313
+ res.start_stop_pairs.should == []
314
+ res.initial_start_markers.should == []
315
+ GraphTesting.marker_positions(res.initial_stop_markers).should == [6]
316
+ GraphTesting.marker_node_positions(res.initial_stop_markers).should == [[1,6]]
317
+ res.final_start_markers.should == []
318
+ end
198
319
 
199
320
  it 'should respect terminal nodes' do
200
321
  fail '#todo'
@@ -433,11 +554,79 @@ describe "AllOrfs" do
433
554
 
434
555
  paths = orfer.find_orfs_from_problems(problems)
435
556
  #pp paths
436
- orfer.orf_sequences_from_trails(paths.trails).should == {
437
- '(1s:6),2s,(3s:10)' => 'ATGGAAAAAAAAAAAAAAAAAAAATAA',
438
- '1s,2s,3s' => 'T'*30,
439
- ',(1s:3)' => 'TAA'
440
- }
557
+ orfer.orf_sequences_from_trails(paths.trails).should == [
558
+ ['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
559
+ [',(1s:3)', 'TAA'],
560
+ ['1s,2s,3s', 'T'*30],
561
+ ['1s,2s,3s', 'T'*27],
562
+ ['1s,2s,3s', 'T'*27]
563
+ ]
564
+ end
565
+
566
+ it 'should respect minimum orf length' do
567
+ graph = GraphTesting.emit([
568
+ [1,2],
569
+ [2,3]
570
+ ])
571
+ graph.nodes[1].ends_of_kmers_of_node = 'TAAATGGAAA' #stop codon 'TAA', start codon 'ATG'
572
+ graph.nodes[3].ends_of_kmers_of_node = 'AAAAAAATAA' #stop codon 'TAA'
573
+ initial_path = GraphTesting.make_onodes(graph, %w(1s))
574
+
575
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
576
+ problems = orfer.find_all_problems(graph, [initial_path])
577
+
578
+ paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 30)
579
+ orfer.orf_sequences_from_trails(paths.trails, 30).should == [
580
+ ['1s,2s,3s', 'T'*30]
581
+ ]
582
+
583
+ paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 20)
584
+ orfer.orf_sequences_from_trails(paths.trails, 20).should == [
585
+ ['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
586
+ ['1s,2s,3s', 'T'*30],
587
+ ['1s,2s,3s', 'T'*27],
588
+ ['1s,2s,3s', 'T'*27]
589
+ ]
590
+
591
+ paths = orfer.find_orfs_from_problems(problems, :min_orf_length => 0)
592
+ orfer.orf_sequences_from_trails(paths.trails, 0).should == [
593
+ ['(1s:6),2s,(3s:10)', 'ATGGAAAAAAAAAAAAAAAAAAAATAA'],
594
+ [',(1s:3)', 'TAA'],
595
+ ['1s,2s,3s', 'T'*30],
596
+ ['1s,2s,3s', 'T'*27],
597
+ ['1s,2s,3s', 'T'*27]
598
+ ]
599
+
600
+ end
601
+ end
602
+
603
+ describe 'sequence2AA' do
604
+ it 'should return corresponding amino acids for an orf sequence' do
605
+ orfer = Bio::AssemblyGraphAlgorithms::AllOrfsFinder.new
606
+ orfer.sequence2AA('GCTGCCGCAGCG').should == 'AAAA'
607
+ orfer.sequence2AA('CGTCGCCGACGGAGAAGG').should == 'RRRRRR'
608
+ orfer.sequence2AA('AATAAC').should == 'NN'
609
+ orfer.sequence2AA('GATGAC').should == 'DD'
610
+ orfer.sequence2AA('TGTTGC').should == 'CC'
611
+ orfer.sequence2AA('CAACAG').should == 'QQ'
612
+ orfer.sequence2AA('GAAGAG').should == 'EE'
613
+ orfer.sequence2AA('GGTGGCGGAGGG').should == 'GGGG'
614
+ orfer.sequence2AA('CATCAC').should == 'HH'
615
+ orfer.sequence2AA('ATTATCATA').should == 'III'
616
+ orfer.sequence2AA('TTATTGCTTCTCCTACTG').should == 'LLLLLL'
617
+ orfer.sequence2AA('AAAAAG').should == 'KK'
618
+ orfer.sequence2AA('ATG').should == 'M'
619
+ orfer.sequence2AA('TTTTTC').should == 'FF'
620
+ orfer.sequence2AA('CCTCCCCCACCG').should == 'PPPP'
621
+ orfer.sequence2AA('TCTTCCTCATCGAGTAGC').should == 'SSSSSS'
622
+ orfer.sequence2AA('ACTACCACAACG').should == 'TTTT'
623
+ orfer.sequence2AA('TGG').should == 'W'
624
+ orfer.sequence2AA('TATTAC').should == 'YY'
625
+ orfer.sequence2AA('GTTGTCGTAGTG').should == 'VVVV'
626
+ lambda { orfer.sequence2AA('TAA') }.should raise_error
627
+ lambda { orfer.sequence2AA('TGA') }.should raise_error
628
+ lambda { orfer.sequence2AA('TAG') }.should raise_error
629
+ lambda { orfer.sequence2AA('ABCXYZ') }.should raise_error
441
630
  end
442
631
  end
443
632
  end
@@ -42,7 +42,7 @@ describe "ContigPrinter" do
42
42
  '14S:GTT',
43
43
  ].sort
44
44
  end
45
-
45
+
46
46
  it 'should handle not variants' do
47
47
  seqs = [
48
48
  'ATGAATATGTGCATAGGATT',
@@ -235,7 +235,7 @@ describe "ContigPrinter" do
235
235
  GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
236
236
  GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
237
237
  ]
238
- expected =
238
+ expected =
239
239
  'ATGAACGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGAGACCTTCGGGTCTAGTGGCGCACGGGTGCGTAACGCGTGGGAATCTGCCCTTGGGTACGG'+
240
240
  'AATAACAGTTAGAAATGACTGCTAATACCGTATAATGACTTCGGTCCAAAGATTTATCGCCCAGGGATGAGCCCGCGTAGGATTAGCTTGTTGGTGAGGTAAANN'+
241
241
  'NTNNCNNANNNNNNNNNNNNTNNNNNGNNNNNNNNNNNGNTNAGNNNCNNNGNNNNNGNGANNTGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGC'+
@@ -283,9 +283,27 @@ describe "ContigPrinter" do
283
283
  it 'should handle when start_coord is not == 0 and both reads are outwards facing' do
284
284
  raise
285
285
  end
286
-
286
+
287
287
  it 'should handle when the example path is not the same length as the reference path' do
288
288
  fail
289
289
  end
290
290
  end
291
+
292
+ describe 'AnchoredConnection' do
293
+ it 'should collapse_paths_to_maximal_coverage_path!' do
294
+ graph = Bio::Velvet::Graph.parse_from_file(File.join TEST_DATA_DIR, 'contig_printer','1','seq.fa.velvet','LastGraph')
295
+ graph.nodes.length.should == 13
296
+ acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
297
+ acon.start_probe_noded_read = graph.nodes[9].short_reads.select{|nr| nr.read_id == 161}[0] #Found these by using bwa and inspecting the Sequence velvet file
298
+ acon.end_probe_noded_read = graph.nodes[4].short_reads.select{|nr| nr.read_id == 1045}[0]
299
+ acon.start_probe_contig_offset = 2
300
+ acon.end_probe_contig_offset = 3
301
+ acon.paths = [
302
+ GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 11e 2s 10s 4e)),#highest coverage
303
+ GraphTesting.make_onodes(graph, %w(9s 12s 7e 13s 5e 1e 2e 10s 4e)),
304
+ ]
305
+ acon.collapse_paths_to_maximal_coverage_path!
306
+ acon.paths.collect{|path| path.to_shorthand}.should == [%w(9s 12s 7e 13s 5e 11e 2s 10s 4e).join(',')]
307
+ end
308
+ end
291
309
  end
@@ -141,4 +141,18 @@ describe "ScaffoldBreaker" do
141
141
  brokes[0].sequence.should == seq
142
142
  end
143
143
  end
144
+
145
+ it 'should replace non-ATGC characters with N' do
146
+ breaker = Bio::FinishM::ScaffoldBreaker.new
147
+ Tempfile.open('a') do |tmp|
148
+ tmp.puts '>ab'
149
+ seq = 'AAAAANNNGGGYYYTTNNAA'
150
+ tmp.puts seq
151
+ # 1234567890123456789
152
+ tmp.close
153
+
154
+ brokes = breaker.break_scaffolds(tmp.path)
155
+ brokes[0].sequence.should == 'AAAAANNNGGGNNNTTNNAA'
156
+ end
157
+ end
144
158
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: finishm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J. Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-30 00:00:00.000000000 Z
11
+ date: 2015-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-ipcress