bio-velvet 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +62 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/lib/bio-velvet.rb +13 -0
- data/lib/bio-velvet/graph.rb +517 -0
- data/lib/bio-velvet/runner.rb +91 -0
- data/spec/bio-velvet_arc_array_spec.rb +123 -0
- data/spec/bio-velvet_graph_spec.rb +290 -0
- data/spec/bio-velvet_runner_spec.rb +67 -0
- data/spec/data/node_sequence/LastGraph +33 -0
- data/spec/data/node_sequence/contigs.fa +75 -0
- data/spec/data/runner_input.fa +18 -0
- data/spec/data/short_node_LastGraph +40 -0
- data/spec/data/short_node_sequence_test_graph +20 -0
- data/spec/data/velvet_test_reads_assembly/Graph +3465 -0
- data/spec/data/velvet_test_reads_assembly/HOWTO_RECREATE +2 -0
- data/spec/data/velvet_test_reads_assembly/LastGraph +3462 -0
- data/spec/data/velvet_test_reads_assembly_read_tracking/Graph2 +45602 -0
- data/spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE +2 -0
- data/spec/data/velvet_test_trail_sequence_assembly/reads1.fa +18 -0
- data/spec/spec_helper.rb +14 -0
- metadata +211 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 798289b36dd93bb47a40f8f5c1e71ecf59305699
|
4
|
+
data.tar.gz: 3b97653d1ca5fd6b62c1ab1f097c3ffe868ce04c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ee227d4e19f9ce09edb22316aea8fa05fde499e27dcec8af4bec6afc7c8bbb7567b081f86cd64d0c34a00ec4d7e7c2202a3336914770ee00e053bf09907cf8f0
|
7
|
+
data.tar.gz: f84054f0cff8d627a57d2431d3af35b19a9941d65da10aee04e38431e436bc28ddef4021904df445d31d59a87cddabe8b61dcc409abd6df1db19bc9f41930fb2
|
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
- rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
script: bundle exec rspec spec/bio-velvet_graph_spec.rb
|
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Gemfile for bio-velvet. Jeweler reads the dependencies declared here when
# it builds the gemspec (see the Rakefile), so anything the library needs at
# runtime must live in the default group.
source "https://rubygems.org"

gem 'bio-logger', '>=1.0.1'
gem 'systemu'
gem 'files'
gem 'hopcsv', '>= 0.4.3'
# lib/bio-velvet/graph.rb does `require 'bio'`, so this is a runtime
# dependency and not merely a development one.
gem 'bio', '>= 1.4.2'

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", ">= 2.8.0"
  gem "rdoc", ">= 3.12"
  gem "jeweler", ">= 1.8.4"
  gem "bundler", ">= 1.0.21"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2013 Ben J Woodcroft
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# bio-velvet
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/wwood/bioruby-velvet.png)](http://travis-ci.org/wwood/bioruby-velvet)
|
4
|
+
|
5
|
+
```bio-velvet``` is a [biogem](http://biogems.info) for interacting with the [velvet](http://www.ebi.ac.uk/~zerbino/velvet/) sequence assembler. It includes both a wrapper for the velvet executable and a parser for the 'LastGraph' format files that velvet creates. This gives access to the underlying assembly graph created by velvet.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
To install ```bio-velvet``` and its rubygem dependencies:
|
9
|
+
|
10
|
+
```sh
|
11
|
+
gem install bio-velvet
|
12
|
+
```
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
To run velvet with a kmer length of 87 on a set of single ended reads in ```/path/to/reads.fa```:
|
17
|
+
```ruby
|
18
|
+
require 'bio-velvet'
|
19
|
+
|
20
|
+
velvet_result = Bio::Velvet::Runner.new.velvet(87, '-short /path/to/reads.fa') #=> Bio::Velvet::Result object
|
21
|
+
|
22
|
+
contigs_file = velvet_result.contigs_path #=> path to contigs file as a String
|
23
|
+
lastgraph_file = velvet_result.last_graph_path #=> path to last graph file as a String
|
24
|
+
```
|
25
|
+
|
26
|
+
The graph file can then be parsed from the ```velvet_result```:
|
27
|
+
```ruby
|
28
|
+
graph = velvet_result.last_graph #=> Bio::Velvet::Graph object
|
29
|
+
```
|
30
|
+
In my experience (mostly on complex metagenomes), the graph object itself does not take as much RAM as I initially expected. Most of the hard work has already been done by velvet itself, particularly if the ```-cov_cutoff``` has been set. However parsing in the graph can take many minutes if the LastGraph file is big (>500MB).
|
31
|
+
|
32
|
+
With this graph object you can interact with the assembly graph, e.g.
|
33
|
+
```ruby
|
34
|
+
graph.hash_length #=> 87 (the kmer length velvet was run with)
|
35
|
+
graph.nodes #=> Bio::Velvet::Graph::NodeArray object
|
36
|
+
graph.nodes[3] #=> Bio::Velvet::Graph::Node object with node ID 3
|
37
|
+
graph.get_arcs_by_node_id(1, 3) #=> an array of arcs between nodes 1 and 3 (Bio::Velvet::Graph::Arc objects)
|
38
|
+
graph.nodes[5].short_reads #=> array of Bio::Velvet::Graph::NodedRead objects, for read tracking
|
39
|
+
```
|
40
|
+
There is much more that can be done to interact with the graph object and its components - see the [rubydoc](http://rubydoc.info/gems/bio-velvet).
|
41
|
+
|
42
|
+
## Project home page
|
43
|
+
|
44
|
+
For information on the source tree, documentation, examples, issues and
|
45
|
+
how to contribute, see
|
46
|
+
|
47
|
+
http://github.com/wwood/bioruby-velvet
|
48
|
+
|
49
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
50
|
+
|
51
|
+
## Cite
|
52
|
+
|
53
|
+
This code is currently unpublished.
|
54
|
+
|
55
|
+
## Biogems.info
|
56
|
+
|
57
|
+
This Biogem is published at (http://biogems.info/index.html#bio-velvet)
|
58
|
+
|
59
|
+
## Copyright
|
60
|
+
|
61
|
+
Copyright (c) 2013 Ben J Woodcroft. See LICENSE.txt for further details.
|
62
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8

# Rakefile for bio-velvet: wires up the jeweler packaging tasks, the RSpec
# tasks (:spec is the default task) and rdoc generation.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-velvet"
  gem.homepage = "http://github.com/wwood/bioruby-velvet"
  gem.license = "MIT"
  gem.summary = %Q{Parser to work with file formats used in the velvet DNA assembler}
  gem.description = %Q{Parser to work with some file formats used in the velvet DNA assembler}
  gem.email = "donttrustben@gmail.com"
  gem.authors = ["Ben J Woodcroft"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  # Only switch rcov on when the installed RSpec rake task still offers the
  # setter; an unguarded call would abort loading of the whole Rakefile
  # (and with it every other task) if the setter is missing.
  spec.rcov = true if spec.respond_to?(:rcov=)
end

task :default => :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # strip so the trailing newline of the VERSION file stays out of the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-velvet #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/lib/bio-velvet.rb
ADDED
@@ -0,0 +1,517 @@
|
|
1
|
+
require 'hopcsv'
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
module Bio
|
5
|
+
module Velvet
|
6
|
+
class NotImplementedException < Exception; end
|
7
|
+
|
8
|
+
# Parser for a velvet assembler's graph file (Graph or LastGraph) output from velvetg
|
9
|
+
#
|
10
|
+
# The definition of this file is given in the velvet manual, at
|
11
|
+
# http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
|
12
|
+
class Graph
|
13
|
+
include Bio::Velvet::Logging
|
14
|
+
|
15
|
+
# $NUMBER_OF_NODES $NUMBER_OF_SEQUENCES $HASH_LENGTH
|
16
|
+
attr_accessor :number_of_nodes, :number_of_sequences, :hash_length
|
17
|
+
|
18
|
+
# NodeArray object of all the graph's node objects
|
19
|
+
attr_accessor :nodes
|
20
|
+
|
21
|
+
# Array of Arc objects
|
22
|
+
attr_accessor :arcs
|
23
|
+
|
24
|
+
# Class-level logger access: instantiates a throwaway Graph and returns the
# result of its instance-level #log (presumably supplied by the included
# Bio::Velvet::Logging module - confirm there).
def self.log
  new.log
end
|
27
|
+
|
28
|
+
# Parse a graph file (a Graph, Graph2 or LastGraph output file from velvet)
# into a Bio::Velvet::Graph object.
#
# path_to_graph_file - path of the tab-separated graph file to read.
#
# The file is consumed line by line through a small state machine:
# header -> NODE blocks (3 lines each) -> ARC lines -> NR read-tracking
# blocks. Parsing stops (with a warning) at the first SEQ line, because SEQ
# lines are not implemented.
#
# Returns the populated Graph.
# Raises RuntimeError, quoting the offending line number, when a line does
# not have the expected number of columns, when a node ID is repeated, or
# when read-tracking lines refer to a node that was never defined.
def self.parse_from_file(path_to_graph_file)
  graph = self.new
  graph.nodes = NodeArray.new
  graph.arcs = ArcArray.new

  state = :header
  current_node = nil
  current_node_direction = nil
  line_number = 0

  Hopcsv.foreach(path_to_graph_file, "\t") do |row|
    line_number += 1

    if state == :header
      # $NUMBER_OF_NODES $NUMBER_OF_SEQUENCES $HASH_LENGTH
      raise "parse exception on header line, this line #{line_number}: #{row.inspect}" unless row.length >= 3
      graph.number_of_nodes = row[0].to_i
      graph.number_of_sequences = row[1].to_i
      graph.hash_length = row[2].to_i
      # Any columns after the third are ignored.
      state = :nodes_0
      log.debug "Now parsing velvet graph nodes" if log.debug?
      next
    end

    if state == :nodes_0
      # A node is a block of three lines:
      #   NODE <id> <length> <coverage columns...>
      #   $ENDS_OF_KMERS_OF_NODE
      #   $ENDS_OF_KMERS_OF_TWIN_NODE
      if row[0] == 'NODE'
        raise "Unexpected NODE line on line #{line_number}: #{row.inspect}" unless row.length > 2
        current_node = Node.new
        current_node.node_id = row[1].to_i
        current_node.length = row[2].to_i
        current_node.coverages = row[3...row.length].collect { |c| c.to_i }
        current_node.parent_graph = graph
        state = :nodes_1
        raise "Duplicate node name" unless graph.nodes[current_node.node_id].nil?
        graph.nodes[current_node.node_id] = current_node
        next
      else
        state = :arc
        log.debug "Now parsing velvet graph arcs" if log.debug?
        # No `next` here, so that this same line is handled as an ARC below
      end
    elsif state == :nodes_1
      # Sometimes nodes can be empty
      row[0] ||= ''
      current_node.ends_of_kmers_of_node = row[0]
      raise "Unexpected nodes_1 type line on line #{line_number}: #{row.inspect}" if row.length != 1
      state = :nodes_2
      next
    elsif state == :nodes_2
      # Sometimes nodes can be empty
      row[0] ||= ''
      raise "Unexpected nodes_2 type line on line #{line_number}: #{row.inspect}" if row.length != 1
      current_node.ends_of_kmers_of_twin_node = row[0]
      state = :nodes_0
      next
    end

    if state == :arc
      if row[0] == 'ARC'
        # ARC $START_NODE $END_NODE $MULTIPLICITY
        # A negative node ID means the node is traversed in reverse.
        arc = Arc.new
        raise "Unexpected ARC line on line #{line_number}: #{row.inspect}" unless row.length == 4
        begin_signed = row[1].to_i
        end_signed = row[2].to_i
        arc.begin_node_id = begin_signed.abs
        arc.end_node_id = end_signed.abs
        arc.multiplicity = row[3].to_i
        arc.begin_node_direction = (begin_signed > 0)
        arc.end_node_direction = (end_signed > 0)
        graph.arcs.push arc
        next
      else
        state = :nr
        log.debug "Finished parsing velvet graph arcs. Now parsing the rest of the file" if log.debug?
        # No `next` here either: the current line is handled as NR/SEQ below
      end
    end

    if state == :nr
      if row[0] == 'SEQ'
        log.warn "velvet graph parse warning: SEQ lines in the Graph file parsing not implemented yet, tracking of reads now not parsed either"
        break
      end

      # If short reads are tracked, for every node a block of read identifiers:
      #   NR $NODE_ID $NUMBER_OF_SHORT_READS
      #   $READ_ID $OFFSET_FROM_START_OF_NODE $START_COORD
      #   $READ_ID2 etc.
      if row[0] == 'NR'
        raise "Unexpected NR line on line #{line_number}: #{row.inspect}" unless row.length == 3
        node_pm = row[1].to_i
        current_node_direction = node_pm > 0
        current_node = graph.nodes[node_pm.abs]
        # Fail with context instead of a NoMethodError on nil further down
        raise "NR line refers to unknown node on line #{line_number}: #{row.inspect}" if current_node.nil?
        current_node.number_of_short_reads ||= 0
        current_node.number_of_short_reads += row[2].to_i
        next
      else
        raise "Unexpected read tracking line on line #{line_number}: #{row.inspect}" unless row.length == 3
        raise "Read tracking line without a node on line #{line_number}: #{row.inspect}" if current_node.nil?
        nr = NodedRead.new
        nr.read_id = row[0].to_i
        nr.offset_from_start_of_node = row[1].to_i
        nr.start_coord = row[2].to_i
        nr.direction = current_node_direction
        current_node.short_reads ||= []
        current_node.short_reads.push nr
        next
      end
    end
  end
  log.debug "Finished parsing velvet graph file" if log.debug?

  return graph
end
|
144
|
+
|
145
|
+
# Look up the arcs that join two nodes, the nodes being given by their
# integer IDs. The lookup is delegated to the graph's arc container.
#
# Returns an array of Arc objects, empty when the nodes are not connected.
# Up to four arcs are possible between two nodes, since each arc may attach
# to the beginning or to the end of either node.
def get_arcs_by_node_id(node_id1, node_id2)
  arc_store = @arcs
  arc_store.get_arcs_by_node_id(node_id1, node_id2)
end
|
151
|
+
|
152
|
+
# Look up the arcs that join two nodes, the nodes being given as node
# objects (anything responding to #node_id).
#
# Returns an array of Arc objects, empty when the nodes are not connected.
# Up to four arcs are possible between two nodes, since each arc may attach
# to the beginning or to the end of either node.
def get_arcs_by_node(node1, node2)
  first_id = node1.node_id
  second_id = node2.node_id
  @arcs.get_arcs_by_node_id(first_id, second_id)
end
|
158
|
+
|
159
|
+
# Collect the nodes that are attached to the END of the given node.
#
# An arc leaves the end of `node` either when `node` is the arc's begin node
# traversed forwards, or when `node` is the arc's end node traversed in
# reverse. For each such arc the node at the other side is reported, in the
# order the arcs are returned by the arc container.
#
# Returns an array of Node objects (possibly empty).
def neighbours_off_end(node)
  @arcs.get_arcs_by_node_id(node.node_id).each_with_object([]) do |arc, reachable|
    if arc.begin_node_id == node.node_id && arc.begin_node_direction
      # The most intuitive case: node -> other
      reachable.push nodes[arc.end_node_id]
    elsif arc.end_node_id == node.node_id && !arc.end_node_direction
      reachable.push nodes[arc.begin_node_id]
    end
  end
end
|
173
|
+
|
174
|
+
# Collect the nodes that are attached to the START of the given node.
#
# An arc enters the start of `node` either when `node` is the arc's end node
# traversed forwards, or when `node` is the arc's begin node traversed in
# reverse. For each such arc the node at the other side is reported, in the
# order the arcs are returned by the arc container.
#
# Returns an array of Node objects (possibly empty).
def neighbours_into_start(node)
  @arcs.get_arcs_by_node_id(node.node_id).each_with_object([]) do |arc, reachable|
    if arc.end_node_id == node.node_id && arc.end_node_direction
      reachable.push nodes[arc.begin_node_id]
    elsif arc.begin_node_id == node.node_id && !arc.begin_node_direction
      reachable.push nodes[arc.end_node_id]
    end
  end
end
|
187
|
+
|
188
|
+
|
189
|
+
# Remove every node for which the given block returns a true value (as in
# Array#delete_if), together with all arcs touching that node.
#
# For each matching node its arcs are removed first and the node itself
# afterwards, so the graph never holds an arc whose node is already gone.
# The block is evaluated node by node while deletion is in progress, i.e. it
# sees the graph with earlier matches already removed.
#
# Returns a two-element array [deleted_nodes, deleted_arcs]; neither list is
# in any particular order.
def delete_nodes_if(&block)
  removed_nodes = []
  removed_arcs = []
  nodes.each do |candidate|
    next unless yield(candidate)

    removed_nodes.push candidate

    # arcs go first ...
    doomed_arcs = @arcs.get_arcs_by_node_id(candidate.node_id)
    removed_arcs.concat doomed_arcs
    doomed_arcs.each { |arc| @arcs.delete arc }

    # ... then the node itself
    nodes.delete candidate
  end
  [removed_nodes, removed_arcs]
end
|
216
|
+
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
# A container for the Node objects of a graph, indexed by node ID. Because
# velvet node IDs are used directly as keys, graph.nodes[3] is the node whose
# ID is 3, while NodeArray#length still reports the number of nodes actually
# stored (also after deletions).
class NodeArray
  include Enumerable

  def initialize
    # node_id => Node. A hash rather than an array, so that deleting a node
    # does not shift the position of any other node.
    @internal_structure = {}
  end

  # Store +value+ (a Node) under +node_id+, replacing any previous entry.
  def []=(node_id, value)
    @internal_structure[node_id] = value
  end

  # Return the node stored under +node_id+, or nil if there is none.
  def [](node_id)
    @internal_structure[node_id]
  end

  # Remove +node+ (looked up through its #node_id). Returns the removed
  # entry, or nil when no node with that ID was stored.
  def delete(node)
    @internal_structure.delete node.node_id
  end

  # Number of nodes currently stored.
  def length
    @internal_structure.length
  end

  # Yield each stored node in insertion order. Without a block an Enumerator
  # is returned, as is conventional for #each (previously this raised a
  # NoMethodError on nil).
  def each(&block)
    return enum_for(:each) unless block_given?

    @internal_structure.each do |_node_id, node|
      block.yield node
    end
  end
end
|
254
|
+
|
255
|
+
# A container for the Arc objects of a graph, indexed so that the arcs
# between a pair of nodes, or all arcs touching one node, can be looked up
# without scanning every arc.
class ArcArray
  include Enumerable

  def initialize
    # sorted [node_id1, node_id2] => Array of arcs joining those two nodes
    @internal_structure = {}
    # node_id => Array of the keys of @internal_structure involving the node
    # (a key is appended once per pushed arc, so it may repeat)
    @node_to_keys = {}
  end

  # Add +arc+ to the container. The arc must respond to #begin_node_id and
  # #end_node_id.
  def push(arc)
    key = [arc.begin_node_id, arc.end_node_id].sort
    @internal_structure[key] ||= []
    @internal_structure[key].push arc
    @node_to_keys[arc.begin_node_id] ||= []
    @node_to_keys[arc.begin_node_id].push key
    # A self-loop must be registered against its node only once
    unless arc.begin_node_id == arc.end_node_id
      @node_to_keys[arc.end_node_id] ||= []
      @node_to_keys[arc.end_node_id].push key
    end
  end

  # With one argument: return all arcs into or out of the given node.
  # With two arguments: return all arcs between the two given nodes
  # (the order of the two IDs does not matter).
  #
  # Returns an array of arcs, empty when there are none.
  def get_arcs_by_node_id(node_id1, node_id2=nil)
    if node_id2.nil?
      keys = @node_to_keys[node_id1]
      return [] if keys.nil?
      keys.uniq.collect { |key| @internal_structure[key] }.flatten
    else
      @internal_structure[[node_id1, node_id2].sort] || []
    end
  end

  # Remove +arc+ from the container and, when it was the last arc between
  # its two nodes, tidy up the per-node index as well. Deleting an arc that
  # is not stored is a no-op (previously this raised a NoMethodError on nil).
  def delete(arc)
    key = [arc.begin_node_id, arc.end_node_id].sort
    bucket = @internal_structure[key]
    return nil if bucket.nil?

    bucket.delete arc
    return nil unless bucket.empty?

    # No other arc shares this key, so clean up more
    @internal_structure.delete key
    key.uniq.each do |node_id|
      keys = @node_to_keys[node_id]
      next if keys.nil?
      keys.delete key
      @node_to_keys.delete(node_id) if keys.empty?
    end
    nil
  end

  # Total number of arcs stored.
  def length
    @internal_structure.values.inject(0) { |total, arcs| total + arcs.length }
  end

  # Yield every stored arc. Without a block an Enumerator is returned.
  def each(&block)
    return enum_for(:each) unless block_given?

    @internal_structure.each do |_key, arcs|
      arcs.each do |arc|
        block.yield arc
      end
    end
  end
end
|
319
|
+
|
320
|
+
# A single node of a velvet assembly graph, as read from a NODE block (plus,
# when read tracking is on, the NR block) of a graph file.
class Node
  include Bio::Velvet::Logging

  attr_accessor :node_id, :coverages, :ends_of_kmers_of_node, :ends_of_kmers_of_twin_node

  # For read tracking
  attr_accessor :number_of_short_reads
  # For read tracking - an array of NodedRead objects
  attr_accessor :short_reads

  # Graph to which this node belongs
  attr_accessor :parent_graph

  # Number of nucleotides in this node if a contig was made from this node alone
  attr_accessor :length

  # The sequence of this node, should a contig be made solely out of this node.
  # The kmer length used is the hash length of the parent graph.
  #
  # The sequence is the reverse complement of ends_of_kmers_of_twin_node,
  # followed by the tail of ends_of_kmers_of_node that the twin does not
  # already cover.
  #
  # Raises NotImplementedException if the node is too short (see #sequence?):
  # its sequence then requires information from outside this object, and
  # gathering that information is not implemented here.
  def sequence
    unless sequence?
      raise NotImplementedException, "Attempted to get the sequence of a velvet node that is too short, such that the sequence info is not fully present in the node object"
    end

    length_to_get_from_fwd = corresponding_contig_length - @ends_of_kmers_of_twin_node.length
    fwd_length = @ends_of_kmers_of_node.length
    raise "Programming error" if length_to_get_from_fwd > fwd_length
    revcom(@ends_of_kmers_of_twin_node) +
      @ends_of_kmers_of_node[-length_to_get_from_fwd...fwd_length]
  end

  # Number of nucleotides in this node if this contig length is being added to
  # another node's length (nodes overlap)
  def length_alone
    @ends_of_kmers_of_node.length
  end

  # The common length of ends_of_kmers_of_node and ends_of_kmers_of_twin_node
  # is equal to the length of the corresponding contig minus (k - 1).
  #
  # This method returns that corresponding contig's length.
  def corresponding_contig_length
    @ends_of_kmers_of_node.length + @parent_graph.hash_length - 1
  end

  # Is it possible to extract the sequence of this node? True when
  # ends_of_kmers_of_node is at least (hash length - 1) nucleotides long.
  def sequence?
    @parent_graph.hash_length - 1 <= @ends_of_kmers_of_node.length
  end

  # The reverse complement of this node's sequence
  def reverse_sequence
    revcom(sequence)
  end

  def to_s
    "Node #{@node_id}: #{@ends_of_kmers_of_node} / #{@ends_of_kmers_of_twin_node}"
  end

  def inspect
    to_s
  end

  # Return the sum of the coverage columns at even positions (0th, 2nd, 4th,
  # ...; the O_COV columns in between are skipped), divided by the length of
  # the node, or nil if the node's length is 0.
  def coverage
    return nil if length == 0

    total = 0
    coverages.each_with_index do |cov, i|
      total += cov if i.even?
    end
    total.to_f / length
  end

  private

  # Upper-cased reverse complement of a nucleotide string
  def revcom(seq)
    Bio::Sequence::NA.new(seq).reverse_complement.to_s.upcase
  end
end
|
419
|
+
|
420
|
+
# A directed connection between two nodes of the graph, as read from an
#   ARC $START_NODE $END_NODE $MULTIPLICITY
# line. One such line implicitly represents an arc from node A to B and
# another, with the same multiplicity, from -B to -A, which is why the
# connects_*? predicates accept the two nodes in either role.
class Arc
  attr_accessor :begin_node_id, :end_node_id, :multiplicity

  # true for forwards direction, false for reverse
  attr_accessor :begin_node_direction, :end_node_direction

  # Returns true if exactly one of the two nodes is traversed in reverse,
  # false if both have the same direction.
  # Raises Exception if either direction is not set to true or false.
  def directions_opposing?
    unless direction_set?(@begin_node_direction) && direction_set?(@end_node_direction)
      raise Exception, "Node directions not set! Cannot tell whether directions are opposing"
    end
    @begin_node_direction != @end_node_direction
  end

  # Is the begin node traversed forwards?
  def begin_node_forward?
    @begin_node_direction
  end

  # Is the end node traversed forwards?
  def end_node_forward?
    @end_node_direction
  end

  # Returns true if this arc connects the end of the first node
  # to the start of the second node, else false
  def connects_end_to_beginning?(first_node_id, second_node_id)
    connects_with_directions?(first_node_id, second_node_id, [true, true], [false, false])
  end

  # Returns true if this arc connects the end of the first node
  # to the end of the second node, else false
  def connects_end_to_end?(first_node_id, second_node_id)
    connects_with_directions?(first_node_id, second_node_id, [true, false], [true, false])
  end

  # Returns true if this arc connects the start of the first node
  # to the start of the second node, else false
  def connects_beginning_to_beginning?(first_node_id, second_node_id)
    connects_with_directions?(first_node_id, second_node_id, [false, true], [false, true])
  end

  # Returns true if this arc connects the start of the first node
  # to the end of the second node, else false
  def connects_beginning_to_end?(first_node_id, second_node_id)
    connects_with_directions?(first_node_id, second_node_id, [false, false], [true, true])
  end

  # Return true if this arc connects to the beginning of the node,
  # else false
  def connects_to_beginning?(node_id)
    (node_id == @begin_node_id and !@begin_node_direction) or
      (node_id == @end_node_id and @end_node_direction)
  end

  # Return true if this arc connects to the end of the node,
  # else false
  def connects_to_end?(node_id)
    (node_id == @begin_node_id and @begin_node_direction) or
      (node_id == @end_node_id and !@end_node_direction)
  end

  # "<begin> <end> <multiplicity>", a node ID being prefixed with '-' when
  # its direction is false (reverse).
  def to_s
    begin_sign = @begin_node_direction == false ? '-' : ''
    end_sign = @end_node_direction == false ? '-' : ''
    "#{begin_sign}#{@begin_node_id} #{end_sign}#{@end_node_id} #{@multiplicity}"
  end

  private

  # Has a direction been set to an actual boolean?
  def direction_set?(direction)
    direction == true || direction == false
  end

  # Shared test behind the four two-node connects_*? predicates.
  #
  # as_written  - [begin_direction, end_direction] required when the first
  #               node is this arc's begin node and the second its end node.
  # as_reversed - the directions required when the roles are swapped (first
  #               node is the end node, second node is the begin node).
  #
  # Both node IDs are compared with ==; the swapped case used to assign to
  # second_node_id instead of comparing it, so it matched any second node.
  def connects_with_directions?(first_node_id, second_node_id, as_written, as_reversed)
    directions = [@begin_node_direction, @end_node_direction]
    (first_node_id == @begin_node_id && second_node_id == @end_node_id && directions == as_written) ||
      (first_node_id == @end_node_id && second_node_id == @begin_node_id && directions == as_reversed)
  end
end
|
510
|
+
|
511
|
+
# A tracked short read attached to a node (one line of an NR block in the
# graph file). Plain value holder; all fields are read/write.
class NodedRead
  # Identifier of the read (first column of the read line)
  attr_accessor :read_id
  # Second column of the read line
  attr_accessor :offset_from_start_of_node
  # Third column of the read line
  attr_accessor :start_coord
  # true when the node ID on the owning NR line was positive, else false
  attr_accessor :direction
end
|
515
|
+
end
|
516
|
+
end
|
517
|
+
end
|