finishm 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/VERSION +1 -1
- data/finishm.gemspec +3 -3
- data/lib/assembly/contig_printer.rb +14 -4
- data/lib/finishm/roundup.rb +34 -21
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 97ecc9ab04ccb6e227d33d59a4219edfc0ba4614
|
4
|
+
data.tar.gz: 931671e63b1b82f20337dac7af39970d87f4c25a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48492828900c54d3fcbfd2e4ffae365476861a0cada3ed5b3c31917e9bc7bfc47a06055b1f1891003fad2896c5ce8b6576602a3c5e93c2ffab865001f3126065
|
7
|
+
data.tar.gz: 08d57f83a950941b8d202df5df572ee9735ba61a842e55d2c494169d24a248e7930c7034128981a1f8a09e4e09c17c161b6a662d5aa3090b09fc39720892dfe2
|
data/README.md
CHANGED
@@ -50,10 +50,10 @@ cd ../..
|
|
50
50
|
|
51
51
|
## Citation
|
52
52
|
|
53
|
-
A manuscript describing the tools described here is currently in preparation. However, FinishM reuses code from velvet and BioRuby, so these tools may be worth citing.
|
53
|
+
A manuscript describing the tools described here is currently in preparation. However, FinishM reuses code from velvet, clustalo and BioRuby, so these tools may be worth citing.
|
54
54
|
|
55
55
|
## Copyright
|
56
56
|
|
57
|
-
Copyright (c) 2012-
|
57
|
+
Copyright (c) 2012-2015 Ben J. Woodcroft. See LICENSE.txt for
|
58
58
|
further details.
|
59
59
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.6
|
1
|
+
0.0.7
|
data/finishm.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: finishm 0.0.6 ruby lib
|
5
|
+
# stub: finishm 0.0.7 ruby lib
|
6
6
|
# stub: ext/mkrf_conf.rb
|
7
7
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = "finishm"
|
10
|
-
s.version = "0.0.6"
|
10
|
+
s.version = "0.0.7"
|
11
11
|
|
12
12
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
13
13
|
s.require_paths = ["lib"]
|
14
14
|
s.authors = ["Ben J. Woodcroft"]
|
15
|
-
s.date = "2015-
|
15
|
+
s.date = "2015-09-07"
|
16
16
|
s.description = "De-novo assemblies generally only provide draft genomes. FinishM is aimed at improving these draft assemblies."
|
17
17
|
s.email = "donttrustben near gmail.com"
|
18
18
|
s.executables = ["finishm"]
|
@@ -84,10 +84,6 @@ module Bio
|
|
84
84
|
# beginning and ending probes
|
85
85
|
begin
|
86
86
|
example_path = anchored_connection.paths[0]
|
87
|
-
path_sequence, variants = sequences_to_variants_conservative(
|
88
|
-
anchored_connection.paths.collect{|path| path.sequence}
|
89
|
-
)
|
90
|
-
log.debug "Reference path has a sequence length #{path_sequence.length}" if log.debug?
|
91
87
|
|
92
88
|
# Find start index
|
93
89
|
begin_onode = example_path[0]
|
@@ -108,6 +104,20 @@ module Bio
|
|
108
104
|
offset_of_begin_probe_on_path = begin_noded_read.offset_from_start_of_node
|
109
105
|
end
|
110
106
|
|
107
|
+
path_sequence, variants = sequences_to_variants_conservative(
|
108
|
+
anchored_connection.paths.collect do |path|
|
109
|
+
seq = nil
|
110
|
+
begin
|
111
|
+
seq = path.sequence
|
112
|
+
rescue Bio::Velvet::Graph::OrientedNodeTrail::InsufficientLengthException => e
|
113
|
+
log.warn "Failed to join two contigs together because of inability to get sequence out of a trail of nodes. In the past this has been caused by low coverage thus making finishM inappropriate, so returning an unconnected contig now. However, this may be legitimate in the case of an unlucky misassembly at both ends of the contigs being joined, so please report this error to the author."
|
114
|
+
return nil, nil
|
115
|
+
end
|
116
|
+
seq
|
117
|
+
end
|
118
|
+
)
|
119
|
+
log.debug "Reference path has a sequence length #{path_sequence.length}" if log.debug?
|
120
|
+
|
111
121
|
# Correct variants' positions to be relative to the full contig,
|
112
122
|
# not just the path sequence
|
113
123
|
variants.each do |variant|
|
data/lib/finishm/roundup.rb
CHANGED
@@ -191,27 +191,35 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
191
191
|
)
|
192
192
|
second_sequence = genome.scaffolds[contig.sequence_index].contigs[0].sequence
|
193
193
|
log.debug "Found #{aconn.paths.length} connections between #{last_name} and #{current_name}" if log.debug?
|
194
|
-
|
195
|
-
|
196
|
-
# (at least for now) TODO: this could be improved.
|
197
|
-
# Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
|
198
|
-
scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
|
199
|
-
else
|
194
|
+
connected = false
|
195
|
+
if aconn.paths.length > 0
|
200
196
|
aconn.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
201
|
-
|
197
|
+
scaffold_sequence2, variants = printer.ready_two_contigs_and_connections(
|
202
198
|
master_graph.graph,
|
203
199
|
scaffold_sequence,
|
204
200
|
aconn,
|
205
201
|
rhs_sequence,
|
206
202
|
master_graph.velvet_sequences
|
207
203
|
)
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
204
|
+
if !scaffold_sequence2.nil?
|
205
|
+
scaffold_sequence = scaffold_sequence2
|
206
|
+
connected = true
|
207
|
+
# Print variants
|
208
|
+
# TODO: need to change coordinates of variants, particularly when >2 contigs are joined?
|
209
|
+
variants.each do |variant|
|
210
|
+
variant.reference_name = superscaffold_name
|
211
|
+
variants_file.puts variant.vcf(scaffold_sequence)
|
212
|
+
end
|
213
213
|
end
|
214
214
|
end
|
215
|
+
if !connected
|
216
|
+
# when this occurs, it is due to there being a circuit in the path, so no paths are printed.
|
217
|
+
# (at least for now) TODO: this could be improved.
|
218
|
+
# Just arbitrarily put in 100 N characters, to denote a join, but no gapfill
|
219
|
+
|
220
|
+
# (or, it could be impossible to join because of low coverage resulting in inability to get sequence from the node trail)
|
221
|
+
scaffold_sequence = scaffold_sequence+('N'*100)+rhs_sequence
|
222
|
+
end
|
215
223
|
end
|
216
224
|
last_contig = contig
|
217
225
|
end
|
@@ -228,6 +236,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
228
236
|
genome.scaffolds[contig.sequence_index].name
|
229
237
|
end
|
230
238
|
output_file.puts ">#{superscaffold_name} #{descriptor} #{scaffold_names.join(':') }"
|
239
|
+
scaffold_sequence.gsub! '-', '' #remove dashes since these make things fail downstream
|
231
240
|
output_file.puts scaffold_sequence
|
232
241
|
end
|
233
242
|
|
@@ -292,7 +301,7 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
292
301
|
options[:interesting_probes].include?(probe2.number)
|
293
302
|
connections.push gapfiller.gapfill(master_graph, probe1.index, probe2.index, options)
|
294
303
|
end
|
295
|
-
log.debug "Found #{connections.length} connections" if log.debug?
|
304
|
+
log.debug "Found #{connections.length} connections gapfilling in scaffold #{scaffold_index}" if log.debug?
|
296
305
|
|
297
306
|
all_variants = []
|
298
307
|
num_gapfills = 0
|
@@ -329,18 +338,22 @@ the finishm_roundup_results directory in FASTA format. The procedure is then rep
|
|
329
338
|
def piece_together_gapfill(printer, master_graph, first_sequence, aconn, second_sequence, gap_length, max_gapfill_paths, options={})
|
330
339
|
scaffold_sequence = nil
|
331
340
|
gapfilled = -1
|
332
|
-
if aconn.paths.length
|
333
|
-
|
334
|
-
scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
|
335
|
-
gapfilled = false
|
336
|
-
else
|
337
|
-
acon.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
341
|
+
if aconn.paths.length != 0 and aconn.paths.length <= max_gapfill_paths
|
342
|
+
aconn.collapse_paths_to_maximal_coverage_path! if options[:gapfill_with_max_coverage]
|
338
343
|
scaffold_sequence, variants = printer.ready_two_contigs_and_connections(
|
339
344
|
master_graph.graph, first_sequence, aconn, second_sequence, master_graph.velvet_sequences
|
340
345
|
)
|
341
|
-
|
346
|
+
if !scaffold_sequence.nil? #sometimes it is just impossible even if there is paths
|
347
|
+
scaffold_sequence.gsub!('-','') #remove gaps i.e. where the consensus is a gap
|
348
|
+
gapfilled = true
|
349
|
+
end
|
342
350
|
end
|
343
|
-
|
351
|
+
if gapfilled != true
|
352
|
+
# No paths found, or gapfilling failed at the final step Just fill with Ns like it was before
|
353
|
+
scaffold_sequence = first_sequence + 'N'*gap_length + second_sequence
|
354
|
+
gapfilled = false
|
355
|
+
end
|
356
|
+
|
344
357
|
return scaffold_sequence, variants, gapfilled
|
345
358
|
end
|
346
359
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: finishm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J. Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-ipcress
|