bio-velvet 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +62 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/lib/bio-velvet.rb +13 -0
- data/lib/bio-velvet/graph.rb +517 -0
- data/lib/bio-velvet/runner.rb +91 -0
- data/spec/bio-velvet_arc_array_spec.rb +123 -0
- data/spec/bio-velvet_graph_spec.rb +290 -0
- data/spec/bio-velvet_runner_spec.rb +67 -0
- data/spec/data/node_sequence/LastGraph +33 -0
- data/spec/data/node_sequence/contigs.fa +75 -0
- data/spec/data/runner_input.fa +18 -0
- data/spec/data/short_node_LastGraph +40 -0
- data/spec/data/short_node_sequence_test_graph +20 -0
- data/spec/data/velvet_test_reads_assembly/Graph +3465 -0
- data/spec/data/velvet_test_reads_assembly/HOWTO_RECREATE +2 -0
- data/spec/data/velvet_test_reads_assembly/LastGraph +3462 -0
- data/spec/data/velvet_test_reads_assembly_read_tracking/Graph2 +45602 -0
- data/spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE +2 -0
- data/spec/data/velvet_test_trail_sequence_assembly/reads1.fa +18 -0
- data/spec/spec_helper.rb +14 -0
- metadata +211 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 798289b36dd93bb47a40f8f5c1e71ecf59305699
|
4
|
+
data.tar.gz: 3b97653d1ca5fd6b62c1ab1f097c3ffe868ce04c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ee227d4e19f9ce09edb22316aea8fa05fde499e27dcec8af4bec6afc7c8bbb7567b081f86cd64d0c34a00ec4d7e7c2202a3336914770ee00e053bf09907cf8f0
|
7
|
+
data.tar.gz: f84054f0cff8d627a57d2431d3af35b19a9941d65da10aee04e38431e436bc28ddef4021904df445d31d59a87cddabe8b61dcc409abd6df1db19bc9f41930fb2
|
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
- rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
script: bundle exec rspec spec/bio-velvet_graph_spec.rb
|
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Fetch gems over TLS rather than plain HTTP.
source "https://rubygems.org"

# Runtime dependencies.
gem 'bio-logger', '>=1.0.1'
gem 'systemu'
gem 'files'
gem 'hopcsv', '>= 0.4.3'
# lib/bio-velvet/graph.rb does `require 'bio'` at load time and uses
# Bio::Sequence::NA, so bioruby is needed at runtime, not only for development.
gem 'bio', '>= 1.4.2'

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", ">= 2.8.0"
  # rdoc was previously listed twice in this group; one entry is enough.
  gem "rdoc", ">= 3.12"
  gem "jeweler", ">= 1.8.4"
  gem "bundler", ">= 1.0.21"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Ben J Woodcroft
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# bio-velvet
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/wwood/bioruby-velvet.png)](http://travis-ci.org/wwood/bioruby-velvet)
|
4
|
+
|
5
|
+
```bio-velvet``` is a [biogem](http://biogems.info) for interacting with the [velvet](http://www.ebi.ac.uk/~zerbino/velvet/) sequence assembler. It includes both a wrapper for the velvet executable and a parser for the 'LastGraph' format files that velvet creates. This gives access to the underlying assembly graph created by velvet.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
To install ```bio-velvet``` and its rubygem dependencies:
|
9
|
+
|
10
|
+
```sh
|
11
|
+
gem install bio-velvet
|
12
|
+
```
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
To run velvet with a kmer length of 87 on a set of single ended reads in ```/path/to/reads.fa```:
|
17
|
+
```ruby
|
18
|
+
require 'bio-velvet'
|
19
|
+
|
20
|
+
velvet_result = Bio::Velvet::Runner.new.velvet(87, '-short /path/to/reads.fa') #=> Bio::Velvet::Result object
|
21
|
+
|
22
|
+
contigs_file = velvet_result.contigs_path #=> path to contigs file as a String
|
23
|
+
lastgraph_file = velvet_result.last_graph_path #=> path to last graph file as a String
|
24
|
+
```
|
25
|
+
|
26
|
+
The graph file can be then parsed from the ```velvet_result```:
|
27
|
+
```ruby
|
28
|
+
graph = velvet_result.last_graph #=> Bio::Velvet::Graph object
|
29
|
+
```
|
30
|
+
In my experience (mostly on complex metagenomes), the graph object itself does not take as much RAM as I initially expected. Most of the hard work has already been done by velvet itself, particularly if the ```-cov_cutoff``` has been set. However parsing in the graph can take many minutes if the LastGraph file is big (>500MB).
|
31
|
+
|
32
|
+
With this graph object you can inspect and interact with the assembly graph, e.g.
|
33
|
+
```ruby
|
34
|
+
graph.hash_length #=> 87
|
35
|
+
graph.nodes #=> Bio::Velvet::Graph::NodeArray object
|
36
|
+
graph.nodes[3] #=> Bio::Velvet::Graph::Node object with node ID 3
|
37
|
+
graph.get_arcs_by_node_id(1, 3) #=> an array of arcs between nodes 1 and 3 (Bio::Velvet::Graph::Arc objects)
|
38
|
+
graph.nodes[5].short_reads #=> array of Bio::Velvet::Graph::NodedRead objects, for read tracking
|
39
|
+
```
|
40
|
+
There is much more that can be done to interact with the graph object and its components - see the [rubydoc](http://rubydoc.info/gems/bio-velvet).
|
41
|
+
|
42
|
+
## Project home page
|
43
|
+
|
44
|
+
For information on the source tree, documentation, examples, issues and
|
45
|
+
how to contribute, see
|
46
|
+
|
47
|
+
http://github.com/wwood/bioruby-velvet
|
48
|
+
|
49
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
50
|
+
|
51
|
+
## Cite
|
52
|
+
|
53
|
+
This code is currently unpublished.
|
54
|
+
|
55
|
+
## Biogems.info
|
56
|
+
|
57
|
+
This Biogem is published at (http://biogems.info/index.html#bio-velvet)
|
58
|
+
|
59
|
+
## Copyright
|
60
|
+
|
61
|
+
Copyright (c) 2013 Ben J Woodcroft. See LICENSE.txt for further details.
|
62
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8

# Build script: wires up Bundler, the jeweler packaging tasks, the RSpec
# tasks (:spec, :rcov) and the RDoc task. The default task runs the specs.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Missing gems: report and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-velvet"
  gem.homepage = "http://github.com/wwood/bioruby-velvet"
  gem.license = "MIT"
  gem.summary = %Q{Parser to work with file formats used in the velvet DNA assembler}
  gem.description = %Q{Parser to work with some file formats used in the velvet DNA assembler}
  gem.email = "donttrustben@gmail.com"
  gem.authors = ["Ben J Woodcroft"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # VERSION ends with a newline; strip it so it does not leak into the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-velvet #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/lib/bio-velvet/graph.rb
ADDED
@@ -0,0 +1,517 @@
|
|
1
|
+
require 'hopcsv'
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
module Velvet
|
6
|
+
# Raised when a requested operation is not supported for the data at hand
# (Node#sequence raises it for nodes that are too short).
# NOTE(review): this inherits from Exception rather than StandardError, so a
# bare `rescue` will not catch it - rescue this class explicitly.
class NotImplementedException < Exception; end
|
7
|
+
|
8
|
+
# Parser for a velvet assembler's graph file (Graph or LastGraph) output from velvetg
|
9
|
+
#
|
10
|
+
# The definition of this file is given in the velvet manual, at
|
11
|
+
# http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
|
12
|
+
class Graph
|
13
|
+
include Bio::Velvet::Logging
|
14
|
+
|
15
|
+
# $NUMBER_OF_NODES $NUMBER_OF_SEQUENCES $HASH_LENGTH
|
16
|
+
attr_accessor :number_of_nodes, :number_of_sequences, :hash_length
|
17
|
+
|
18
|
+
# NodeArray object of all the graph's node objects
|
19
|
+
attr_accessor :nodes
|
20
|
+
|
21
|
+
# Array of Arc objects
|
22
|
+
attr_accessor :arcs
|
23
|
+
|
24
|
+
# Class-level access to the logger: instantiates a Graph and returns the
# result of its instance-level +log+ (presumably provided by the
# Bio::Velvet::Logging mixin - defined outside this file).
def self.log
  new.log
end
|
27
|
+
|
28
|
+
# Parse a graph file from a Graph, Graph2 or LastGraph output file from velvet
|
29
|
+
# into a Bio::Velvet::Graph object
|
30
|
+
# Parses the tab-separated graph file at +path_to_graph_file+ line by line
# with a small state machine:
#
#   :header  -> first line: number of nodes, number of sequences, hash length
#   :nodes_0 -> expecting a NODE line (anything else switches to :arc)
#   :nodes_1 -> the line after NODE: ends of kmers of the node
#   :nodes_2 -> the line after that: ends of kmers of the twin node
#   :arc     -> expecting ARC lines (anything else switches to :nr)
#   :nr      -> NR blocks (read tracking); parsing stops at the first SEQ line
#
# Returns the populated Graph. Raises RuntimeError, with the offending line
# number and row, when a line does not have the expected shape.
def self.parse_from_file(path_to_graph_file)
  graph = self.new
  graph.nodes = NodeArray.new
  graph.arcs = ArcArray.new

  # Fetch the logger once up front instead of calling the class-level +log+
  # for every message.
  logger = log

  state = :header
  current_node = nil
  current_node_direction = nil
  line_number = 0

  Hopcsv.foreach(path_to_graph_file, "\t") do |row|
    line_number += 1

    if state == :header
      # $NUMBER_OF_NODES $NUMBER_OF_SEQUENCES $HASH_LENGTH
      raise "parse exception on header line, this line #{line_number}: #{row.inspect}" unless row.length >= 3
      graph.number_of_nodes = row[0].to_i
      graph.number_of_sequences = row[1].to_i
      graph.hash_length = row[2].to_i
      # Not quite sure what the function of the 4th column is
      state = :nodes_0
      logger.debug "Now parsing velvet graph nodes" if logger.debug?
      next
    end

    if state == :nodes_0
      # NODE $NODE_ID $COV_SHORT1 $O_COV_SHORT1 $COV_SHORT2 $O_COV_SHORT2
      # $ENDS_OF_KMERS_OF_NODE
      # $ENDS_OF_KMERS_OF_TWIN_NODE
      if row[0] == 'NODE'
        raise "Unexpected NODE line on line #{line_number}: #{row.inspect}" unless row.length > 2
        current_node = Node.new
        current_node.node_id = row[1].to_i
        current_node.length = row[2].to_i
        current_node.coverages = row[3...row.length].map { |c| c.to_i }
        current_node.parent_graph = graph
        state = :nodes_1
        raise "Duplicate node name" unless graph.nodes[current_node.node_id].nil?
        graph.nodes[current_node.node_id] = current_node
        next
      else
        state = :arc
        logger.debug "Now parsing velvet graph arcs" if logger.debug?
        # No next here, so that this same line gets parsed as an ARC below
      end
    elsif state == :nodes_1
      # Sometimes nodes can be empty
      row[0] ||= ''
      current_node.ends_of_kmers_of_node = row[0]
      raise "Unexpected nodes_1 type line on line #{line_number}: #{row.inspect}" if row.length != 1
      state = :nodes_2
      next
    elsif state == :nodes_2
      # Sometimes nodes can be empty
      row[0] ||= ''
      raise "Unexpected nodes_2 type line on line #{line_number}: #{row.inspect}" if row.length != 1
      current_node.ends_of_kmers_of_twin_node = row[0]
      state = :nodes_0
      next
    end

    if state == :arc
      if row[0] == 'ARC'
        # ARC $START_NODE $END_NODE $MULTIPLICITY
        raise "Unexpected ARC line on line #{line_number}: #{row.inspect}" unless row.length == 4
        begin_signed = row[1].to_i
        end_signed = row[2].to_i
        arc = Arc.new
        arc.begin_node_id = begin_signed.abs
        arc.end_node_id = end_signed.abs
        arc.multiplicity = row[3].to_i
        # The sign of the node ID encodes direction: positive is forward
        arc.begin_node_direction = (begin_signed > 0)
        arc.end_node_direction = (end_signed > 0)
        graph.arcs.push arc
        next
      else
        state = :nr
        logger.debug "Finished parsing velvet graph arcs. Now parsing the rest of the file" if logger.debug?
        # No next here, so that this same line gets parsed as an NR/SEQ line below
      end
    end

    if state == :nr
      if row[0] == 'SEQ'
        logger.warn "velvet graph parse warning: SEQ lines in the Graph file parsing not implemented yet, tracking of reads now not parsed either"
        break
      end

      # If short reads are tracked, for every node a block of read identifiers:
      # NR $NODE_ID $NUMBER_OF_SHORT_READS
      # $READ_ID $OFFSET_FROM_START_OF_NODE $START_COORD
      # $READ_ID2 etc.
      if row[0] == 'NR'
        raise "Unexpected NR line on line #{line_number}: #{row.inspect}" unless row.length == 3
        node_pm = row[1].to_i
        current_node_direction = node_pm > 0
        current_node = graph.nodes[node_pm.abs]
        # Fail with context rather than a NoMethodError on nil further down
        if current_node.nil?
          raise "NR line refers to unknown node #{node_pm.abs} on line #{line_number}: #{row.inspect}"
        end
        current_node.number_of_short_reads ||= 0
        current_node.number_of_short_reads += row[2].to_i
        next
      else
        raise "Unexpected read tracking line on line #{line_number}: #{row.inspect}" unless row.length == 3
        nr = NodedRead.new
        nr.read_id = row[0].to_i
        nr.offset_from_start_of_node = row[1].to_i
        nr.start_coord = row[2].to_i
        nr.direction = current_node_direction
        current_node.short_reads ||= []
        current_node.short_reads.push nr
        next
      end
    end
  end
  logger.debug "Finished parsing velvet graph file" if logger.debug?

  graph
end
|
144
|
+
|
145
|
+
# Return an array of Arc objects between two nodes (specified by integer IDs),
|
146
|
+
# or an empty array if none exists. There are four possible arcs between
|
147
|
+
# two nodes, connecting their beginnings and ends
|
148
|
+
def get_arcs_by_node_id(node_id1, node_id2)
  # Pure delegation: the arc container does the lookup for the ID pair.
  arc_store = @arcs
  arc_store.get_arcs_by_node_id(node_id1, node_id2)
end
|
151
|
+
|
152
|
+
# Return an array of Arc objects between two nodes (specified by node objects),
|
153
|
+
# or an empty array if none exists. There are four possible arcs between
|
154
|
+
# two nodes, connecting their beginnings and ends
|
155
|
+
def get_arcs_by_node(node1, node2)
  # Resolve both node objects to their IDs, then delegate to the arc container.
  first_id = node1.node_id
  second_id = node2.node_id
  @arcs.get_arcs_by_node_id(first_id, second_id)
end
|
158
|
+
|
159
|
+
# Return the adjacent nodes in the graph that connect to the end of a node
|
160
|
+
def neighbours_off_end(node)
  # Walk every arc touching this node and keep the node on the other side
  # whenever the arc attaches to this node's end.
  @arcs.get_arcs_by_node_id(node.node_id).each_with_object([]) do |arc, found|
    if arc.begin_node_id == node.node_id && arc.begin_node_direction
      # Arc leaves this node in the forward direction
      found << nodes[arc.end_node_id]
    elsif arc.end_node_id == node.node_id && !arc.end_node_direction
      # Arc arrives at this node in the reverse direction
      found << nodes[arc.begin_node_id]
    end
  end
end
|
173
|
+
|
174
|
+
# Return the adjacent nodes in the graph that connect to the start of a node
|
175
|
+
def neighbours_into_start(node)
  # Walk every arc touching this node and keep the node on the other side
  # whenever the arc attaches to this node's start.
  @arcs.get_arcs_by_node_id(node.node_id).each_with_object([]) do |arc, found|
    if arc.end_node_id == node.node_id && arc.end_node_direction
      # Arc arrives at this node in the forward direction
      found << nodes[arc.begin_node_id]
    elsif arc.begin_node_id == node.node_id && !arc.begin_node_direction
      # Arc leaves this node in the reverse direction
      found << nodes[arc.end_node_id]
    end
  end
end
|
187
|
+
|
188
|
+
|
189
|
+
# Deletes nodes and associated arcs from the graph if the block passed
|
190
|
+
# evaluates to true (as in Array#delete_if). Actually the associated arcs
|
191
|
+
# are deleted first, and then the node, so that the graph remains sane at all
|
192
|
+
# times - there are never dangling arcs, as such.
|
193
|
+
#
|
194
|
+
# Returns a [deleted_nodes, deleted_arc] tuple, which are both enumerables,
|
195
|
+
# each in no particular order.
|
196
|
+
def delete_nodes_if(&block)
  deleted_nodes = []
  deleted_arcs = []

  # Iterate over a snapshot so that removing a node never mutates the
  # collection that is currently being enumerated. The block is still
  # evaluated lazily, node by node, so it sees earlier deletions.
  nodes.to_a.each do |node|
    next unless yield(node)

    deleted_nodes.push node

    # Delete the associated arcs first, so there are never dangling arcs
    arcs_to_del = @arcs.get_arcs_by_node_id(node.node_id)
    deleted_arcs.concat arcs_to_del
    arcs_to_del.each do |arc|
      @arcs.delete arc
    end

    # Then delete the node itself
    nodes.delete node
  end

  return deleted_nodes, deleted_arcs
end
|
216
|
+
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
# A container class for a list of Node objects. Can index with 1-offset
|
222
|
+
# IDs, so that they line up with the identifiers in velvet Graph files,
|
223
|
+
# yet respond sensibly to NodeArray#length, etc.
|
224
|
+
class NodeArray
  include Enumerable

  def initialize
    # Nodes are kept in a hash keyed by node ID (not a positional array), so
    # that IDs stay valid when other nodes are deleted.
    @internal_structure = {}
  end

  # Store +value+ (a node) under +node_id+, replacing any previous entry.
  def []=(node_id, value)
    @internal_structure[node_id] = value
  end

  # Return the node stored under +node_id+, or nil if there is none.
  def [](node_id)
    @internal_structure[node_id]
  end

  # Remove +node+, looked up by its node_id. Returns the removed node, or
  # nil if no node with that ID was stored.
  def delete(node)
    @internal_structure.delete node.node_id
  end

  # Number of nodes currently stored.
  def length
    @internal_structure.length
  end

  # Yield each stored node in insertion order. Without a block an Enumerator
  # is returned (previously this raised NoMethodError on nil).
  def each(&block)
    return enum_for(:each) unless block

    @internal_structure.each_value(&block)
  end
end
|
254
|
+
|
255
|
+
class ArcArray
  include Enumerable

  def initialize
    # Hash of sorted [node_id1, node_id2] => Array of arcs between those nodes
    @internal_structure = {}
    # Hash of node_id => Array of the keys above that involve that node
    @node_to_keys = {}
  end

  # Add an arc to the collection. Returns self.
  def push(arc)
    key = [arc.begin_node_id, arc.end_node_id].sort
    unless @internal_structure.key?(key)
      @internal_structure[key] = []
      # Register the key once per distinct node (a self-loop has only one)
      key.uniq.each do |node_id|
        (@node_to_keys[node_id] ||= []).push key
      end
    end
    @internal_structure[key].push arc
    self
  end

  # With one argument, return all arcs into or out of the given node. With
  # two, return all arcs between the two nodes, in either order. Always
  # returns a fresh Array (empty if there are no such arcs), so callers may
  # delete arcs while iterating over the result.
  def get_arcs_by_node_id(node_id1, node_id2=nil)
    if node_id2.nil?
      keys = @node_to_keys[node_id1]
      return [] if keys.nil?

      keys.uniq.flat_map { |key| @internal_structure[key] }
    else
      bucket = @internal_structure[[node_id1, node_id2].sort]
      bucket.nil? ? [] : bucket.dup
    end
  end

  # Remove an arc from the collection. Returns the removed arc, or nil if
  # the arc was not present (previously this raised NoMethodError).
  def delete(arc)
    key = [arc.begin_node_id, arc.end_node_id].sort
    bucket = @internal_structure[key]
    return nil if bucket.nil?

    removed = bucket.delete arc
    # If there are no other arcs with this same key, clean up the indexes too
    if bucket.empty?
      @internal_structure.delete key
      key.uniq.each do |node_id|
        keys = @node_to_keys[node_id]
        next if keys.nil?

        keys.delete key
        @node_to_keys.delete(node_id) if keys.empty?
      end
    end
    removed
  end

  # Total number of arcs stored.
  def length
    @internal_structure.values.inject(0) { |total, bucket| total + bucket.length }
  end

  # Yield every arc. Without a block an Enumerator is returned.
  def each(&block)
    return enum_for(:each) unless block

    @internal_structure.each_value do |bucket|
      bucket.each(&block)
    end
  end
end
|
319
|
+
|
320
|
+
class Node
  include Bio::Velvet::Logging

  attr_accessor :node_id, :coverages, :ends_of_kmers_of_node, :ends_of_kmers_of_twin_node

  # For read tracking
  attr_accessor :number_of_short_reads
  # For read tracking - an array of NodedRead objects
  attr_accessor :short_reads

  # Graph to which this node belongs
  attr_accessor :parent_graph

  # Number of nucleotides in this node if a contig was made from this node alone
  attr_accessor :length

  # The sequence of this node, should a contig be made solely out of this node.
  # The kmer length is that kmer length that was used to create the assembly
  # (the parent graph's hash_length).
  #
  # If this node has a sequence that is 2 or more less than the hash length, then the
  # sequence of this node requires information outside of this object, and gathering
  # that information is not implemented here; NotImplementedException is raised.
  def sequence
    unless sequence?
      raise NotImplementedException, "Attempted to get the sequence of a velvet node that is too short, such that the sequence info is not fully present in the node object"
    end

    # Sequence is the reverse complement of the ends_of_kmers_of_twin_node,
    # followed by the tail of ends_of_kmers_of_node that is needed to reach
    # the corresponding contig's length.
    length_to_get_from_fwd = corresponding_contig_length - @ends_of_kmers_of_twin_node.length
    fwd_length = @ends_of_kmers_of_node.length
    raise "Programming error" if length_to_get_from_fwd > fwd_length

    revcom(@ends_of_kmers_of_twin_node) +
      @ends_of_kmers_of_node[-length_to_get_from_fwd...fwd_length]
  end

  # Number of nucleotides in this node if this contig length is being added to
  # another node's length (nodes overlap)
  def length_alone
    @ends_of_kmers_of_node.length
  end

  # The common length of [ends_of_kmers_of_node and :ends_of_kmers_of_twin_node]
  # is equal to the length of the corresponding contig minus k - 1.
  #
  # This method returns that corresponding contig's length
  def corresponding_contig_length
    @ends_of_kmers_of_node.length + @parent_graph.hash_length - 1
  end

  # Is it possible to extract the sequence of this node? I.e. is it long enough?
  # True when ends_of_kmers_of_node holds at least hash_length - 1 characters.
  def sequence?
    @parent_graph.hash_length - 1 <= @ends_of_kmers_of_node.length
  end

  # The reverse complement of this node's sequence
  def reverse_sequence
    revcom(sequence)
  end

  def to_s
    "Node #{@node_id}: #{@ends_of_kmers_of_node} / #{@ends_of_kmers_of_twin_node}"
  end

  def inspect
    to_s
  end

  # Return the sum of all coverage columns, divided by the length of the node,
  # or nil if the node's length is 0
  def coverage
    return nil if length == 0

    total = 0
    # Coverage columns come in pairs; only take the 0th, 2nd, 4th, etc,
    # don't want the O_cov things
    coverages.each_slice(2) { |short_cov, _o_cov| total += short_cov }
    total.to_f / length
  end

  private

  # Reverse complement of a nucleotide string, upper-cased
  def revcom(seq)
    Bio::Sequence::NA.new(seq).reverse_complement.to_s.upcase
  end
end
|
419
|
+
|
420
|
+
class Arc
  attr_accessor :begin_node_id, :end_node_id, :multiplicity

  # true for forwards direction, false for reverse
  attr_accessor :begin_node_direction, :end_node_direction

  # Returns true if exactly one of the two node directions is forward, false
  # if they are the same. Raises Exception if either direction has not been
  # set to true or false.
  def directions_opposing?
    unless [true, false].include?(@begin_node_direction) && [true, false].include?(@end_node_direction)
      raise Exception, "Node directions not set! Cannot tell whether directions are opposing"
    end

    @begin_node_direction != @end_node_direction
  end

  def begin_node_forward?
    @begin_node_direction
  end

  # Previously read the never-assigned @end_node_forward and so always
  # returned nil.
  def end_node_forward?
    @end_node_direction
  end

  # Returns true if this arc connects the end of the first node
  # to the start of the second node, else false
  def connects_end_to_beginning?(first_node_id, second_node_id)
    # ARC $START_NODE $END_NODE $MULTIPLICITY
    # Note: this one line implicitly represents an arc from node A to B and
    # another, with same multiplicity, from -B to -A.
    (first_node_id == @begin_node_id && second_node_id == @end_node_id &&
      @begin_node_direction == true && @end_node_direction == true) ||
      (first_node_id == @end_node_id && second_node_id == @begin_node_id &&
      @begin_node_direction == false && @end_node_direction == false)
  end

  # Returns true if this arc connects the end of the first node
  # to the end of the second node, else false
  def connects_end_to_end?(first_node_id, second_node_id)
    (first_node_id == @begin_node_id && second_node_id == @end_node_id &&
      @begin_node_direction == true && @end_node_direction == false) ||
      (first_node_id == @end_node_id && second_node_id == @begin_node_id &&
      @begin_node_direction == true && @end_node_direction == false)
  end

  # Returns true if this arc connects the start of the first node
  # to the start of the second node, else false
  def connects_beginning_to_beginning?(first_node_id, second_node_id)
    (first_node_id == @begin_node_id && second_node_id == @end_node_id &&
      @begin_node_direction == false && @end_node_direction == true) ||
      (first_node_id == @end_node_id && second_node_id == @begin_node_id &&
      @begin_node_direction == false && @end_node_direction == true)
  end

  # Returns true if this arc connects the start of the first node
  # to the end of the second node, else false
  def connects_beginning_to_end?(first_node_id, second_node_id)
    (first_node_id == @begin_node_id && second_node_id == @end_node_id &&
      @begin_node_direction == false && @end_node_direction == false) ||
      (first_node_id == @end_node_id && second_node_id == @begin_node_id &&
      @begin_node_direction == true && @end_node_direction == true)
  end

  # Return true if this arc connects the beginning of the node,
  # else false
  def connects_to_beginning?(node_id)
    (node_id == @begin_node_id and !@begin_node_direction) or
      (node_id == @end_node_id and @end_node_direction)
  end

  # Return true if this arc connects the end of the node,
  # else false
  def connects_to_end?(node_id)
    (node_id == @begin_node_id and @begin_node_direction) or
      (node_id == @end_node_id and !@end_node_direction)
  end

  # "<begin> <end> <multiplicity>", with a leading '-' on each node ID whose
  # direction is reverse (false).
  def to_s
    begin_sign = @begin_node_direction == false ? '-' : ''
    end_sign = @end_node_direction == false ? '-' : ''
    "#{begin_sign}#{@begin_node_id} #{end_sign}#{@end_node_id} #{@multiplicity}"
  end
end
|
510
|
+
|
511
|
+
# Tracked read, part of a node
|
512
|
+
class NodedRead
  # Values from the three columns of a read line in an NR block:
  # $READ_ID $OFFSET_FROM_START_OF_NODE $START_COORD
  attr_accessor :read_id
  attr_accessor :offset_from_start_of_node
  attr_accessor :start_coord

  # true if the node ID on the enclosing NR line was positive, else false
  attr_accessor :direction
end
|
515
|
+
end
|
516
|
+
end
|
517
|
+
end
|