finishm 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fd108980dbaa801a2989d17f35c59249cc7b16e2
4
- data.tar.gz: 0533d456de22bcda37b7578f0860b50e96ebf6f6
3
+ metadata.gz: 97ecc9ab04ccb6e227d33d59a4219edfc0ba4614
4
+ data.tar.gz: 931671e63b1b82f20337dac7af39970d87f4c25a
5
5
  SHA512:
6
- metadata.gz: c7c29e7353c0cc55a745a803fbeea5d87196a5e922c8551a0fb04b44bd928cd922d664d3e1cf1ad86539a5f090dfdfc77fad23667b2eda3bab822724c3da9b00
7
- data.tar.gz: 56785e4404f2da18c76e399fc3747f71bb82fb259fcd42851b7559d7902f40b33d19b070f2c605b3b360de0fe182f2d2c4f5b4417a6430b3b2fd526cc561a0d9
6
+ metadata.gz: 48492828900c54d3fcbfd2e4ffae365476861a0cada3ed5b3c31917e9bc7bfc47a06055b1f1891003fad2896c5ce8b6576602a3c5e93c2ffab865001f3126065
7
+ data.tar.gz: 08d57f83a950941b8d202df5df572ee9735ba61a842e55d2c494169d24a248e7930c7034128981a1f8a09e4e09c17c161b6a662d5aa3090b09fc39720892dfe2
data/README.md CHANGED
@@ -50,10 +50,10 @@ cd ../..
50
50
 
51
51
  ## Citation
52
52
 
53
- A manuscript describing the tools described here is currently in preparation. However, FinishM reuses code from velvet and BioRuby, so these tools may be worth citing.
53
+ A manuscript describing the tools described here is currently in preparation. However, FinishM reuses code from velvet, clustalo and BioRuby, so these tools may be worth citing.
54
54
 
55
55
  ## Copyright
56
56
 
57
- Copyright (c) 2012-2014 Ben J. Woodcroft. See LICENSE.txt for
57
+ Copyright (c) 2012-2015 Ben J. Woodcroft. See LICENSE.txt for
58
58
  further details.
59
59
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.6
1
+ 0.0.7
@@ -2,17 +2,17 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: finishm 0.0.6 ruby lib
5
+ # stub: finishm 0.0.7 ruby lib
6
6
  # stub: ext/mkrf_conf.rb
7
7
 
8
8
  Gem::Specification.new do |s|
9
9
  s.name = "finishm"
10
- s.version = "0.0.6"
10
+ s.version = "0.0.7"
11
11
 
12
12
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
13
13
  s.require_paths = ["lib"]
14
14
  s.authors = ["Ben J. Woodcroft"]
15
- s.date = "2015-08-31"
15
+ s.date = "2015-09-07"
16
16
  s.description = "De-novo assemblies generally only provide draft genomes. FinishM is aimed at improving these draft assemblies."
17
17
  s.email = "donttrustben near gmail.com"
18
18
  s.executables = ["finishm"]
@@ -84,10 +84,6 @@ module Bio
84
84
  # beginning and ending probes
85
85
  begin
86
86
  example_path = anchored_connection.paths[0]
87
- path_sequence, variants = sequences_to_variants_conservative(
88
- anchored_connection.paths.collect{|path| path.sequence}
89
- )
90
- log.debug "Reference path has a sequence length #{path_sequence.length}" if log.debug?
91
87
 
92
88
  # Find start index
93
89
  begin_onode = example_path[0]
@@ -108,6 +104,20 @@ module Bio
108
104
  offset_of_begin_probe_on_path = begin_noded_read.offset_from_start_of_node
109
105
  end
110
106
 
107
+ path_sequence, variants = sequences_to_variants_conservative(
108
+ anchored_connection.paths.collect do |path|
109
+ seq = nil
110
+ begin
111
+ seq = path.sequence
112
+ rescue Bio::Velvet::Graph::OrientedNodeTrail::InsufficientLengthException => e
113
+ log.warn "Failed to join two contigs together because of inability to get sequence out of a trail of nodes. In the past this has been caused by low coverage thus making finishM inappropriate, so returning an unconnected contig now. However, this may be legitimate in the case of an unlucky misassembly at both ends of the contigs being joined, so please report this error to the author."
114
+ return nil, nil
115
+ end
116
+ seq
117
+ end
118
+ )
119
+ log.debug "Reference path has a sequence length #{path_sequence.length}" if log.debug?
120
+
111
121
  # Correct variants' positions to be relative to the full contig,
112
122
  # not just the path sequence
113
123
  variants.each do |variant|
@@ -191,27 +191,35 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
191
191
  )
192
192
  second_sequence = genome.scaffolds[contig.sequence_index].contigs[0].sequence
193
193
  log.debug "Found #{aconn.paths.length} connections between #{last_name} and #{current_name}" if log.debug?
194
- if aconn.paths.length == 0
195
- # when this occurs, it is due to there being a circuit in the path, so no paths are printed.
196
- # (at least for now) TODO: this could be improved.
197
- # Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
198
- scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
199
- else
194
+ connected = false
195
+ if aconn.paths.length > 0
200
196
  aconn.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
201
- scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
197
+ scaffold_sequence2, variants = printer.ready_two_contigs_and_connections(
202
198
  master_graph.graph,
203
199
  scaffold_sequence,
204
200
  aconn,
205
201
  rhs_sequence,
206
202
  master_graph.velvet_sequences
207
203
  )
208
- # Print variants
209
- # TODO: need to change coordinates of variants, particularly when >2 contigs are joined?
210
- variants.each do |variant|
211
- variant.reference_name = superscaffold_name
212
- variants_file.puts variant.vcf(scaffold_sequence)
204
+ if !scaffold_sequence2.nil?
205
+ scaffold_sequence = scaffold_sequence2
206
+ connected = true
207
+ # Print variants
208
+ # TODO: need to change coordinates of variants, particularly when >2 contigs are joined?
209
+ variants.each do |variant|
210
+ variant.reference_name = superscaffold_name
211
+ variants_file.puts variant.vcf(scaffold_sequence)
212
+ end
213
213
  end
214
214
  end
215
+ if !connected
216
+ # when this occurs, it is due to there being a circuit in the path, so no paths are printed.
217
+ # (at least for now) TODO: this could be improved.
218
+ # Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
219
+
220
+ # (or, it could be impossible to join because of low coverage resulting in inability to get sequence from the node trail)
221
+ scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
222
+ end
215
223
  end
216
224
  last_contig = contig
217
225
  end
@@ -228,6 +236,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
228
236
  genome.scaffolds[contig.sequence_index].name
229
237
  end
230
238
  output_file.puts ">#{superscaffold_name} #{descriptor} #{scaffold_names.join(':') }"
239
+ scaffold_sequence.gsub! '-', '' #remove dashes since these make things fail downstream
231
240
  output_file.puts scaffold_sequence
232
241
  end
233
242
 
@@ -292,7 +301,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
292
301
  options[:interesting_probes].include?(probe2.number)
293
302
  connections.push gapfiller.gapfill(master_graph, probe1.index, probe2.index, options)
294
303
  end
295
- log.debug "Found #{connections.length} connections" if log.debug?
304
+ log.debug "Found #{connections.length} connections gapfilling in scaffold #{scaffold_index}" if log.debug?
296
305
 
297
306
  all_variants = []
298
307
  num_gapfills = 0
@@ -329,18 +338,22 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
329
338
  def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths, options={})
330
339
  scaffold_sequence = nil
331
340
  gapfilled = -1
332
- if aconn.paths.length == 0 or aconn.paths.length > max_gapfill_paths
333
- # No paths found. Just fill with Ns like it was before
334
- scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
335
- gapfilled = false
336
- else
337
- acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
341
+ if aconn.paths.length != 0 and aconn.paths.length <= max_gapfill_paths
342
+ aconn.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
338
343
  scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
339
344
  master_graph.graph, first_sequence, aconn, second_sequence, master_graph.velvet_sequences
340
345
  )
341
- gapfilled = true
346
+ if !scaffold_sequence.nil? #sometimes it is just impossible even if there is paths
347
+ scaffold_sequence.gsub!('-','') #remove gaps i.e. where the consensus is a gap
348
+ gapfilled = true
349
+ end
342
350
  end
343
- scaffold_sequence.gsub!('-','') #remove gaps i.e. where the consensus is a gap
351
+ if gapfilled != true
352
+ # No paths found, or gapfilling failed at the final step Just fill with Ns like it was before
353
+ scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
354
+ gapfilled = false
355
+ end
356
+
344
357
  return scaffold_sequence, variants, gapfilled
345
358
  end
346
359
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: finishm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J. Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-31 00:00:00.000000000 Z
11
+ date: 2015-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-ipcress