bio-velvet 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +24 -0
- data/VERSION +1 -1
- data/lib/bio-velvet.rb +1 -0
- data/lib/bio-velvet/graph.rb +3 -1
- data/lib/bio-velvet/sequence_names.rb +56 -0
- data/lib/bio-velvet/sequences.rb +1 -1
- data/spec/{bio-velvet_arc_array_spec.rb → arc_array_spec.rb} +0 -0
- data/spec/{bio-velvet_graph_spec.rb → graph_spec.rb} +13 -0
- data/spec/{bio-velvet_runner_spec.rb → runner_spec.rb} +0 -0
- data/spec/sequence_names_spec.rb +70 -0
- metadata +35 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8098d77f70e2f60c9c4a820198b9e7839cc3f770
|
4
|
+
data.tar.gz: 924a25a11fbacfdcc24f2ec68bf435efbe1a2502
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16960ccbec0e2a781d928171581702cbea22b1374aff05b950af5fee4fd15f4e598c4d5baa4043fe37878b6828ee16c6c7261a4bd747ae2726003c8e0516daba
|
7
|
+
data.tar.gz: b357398f3aa8f8e4ad80d033c7912d17cd2e219bb2c7e1fb97c944a50bdc890b1b1e81dc54b0429e8d8a48a462f51aa4864337f697dc45b308a39626ab884dec
|
data/Gemfile
CHANGED
@@ -4,6 +4,7 @@ gem 'bio-logger', '~>1.0'
|
|
4
4
|
gem 'systemu', '~>2.6'
|
5
5
|
gem 'files', '~>0.3'
|
6
6
|
gem 'hopcsv', '~> 0.4'
|
7
|
+
gem 'bio-commandeer', '~>0.1'
|
7
8
|
|
8
9
|
# Add dependencies to develop your gem here.
|
9
10
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -13,4 +14,5 @@ group :development do
|
|
13
14
|
gem "bundler", "~> 1.0"
|
14
15
|
gem "bio", "~> 1.4"
|
15
16
|
gem "rdoc", "~> 4.1"
|
17
|
+
gem 'pry', '~>0.9'
|
16
18
|
end
|
data/README.md
CHANGED
@@ -51,6 +51,30 @@ graph.nodes[5].noded_reads #=> array of Bio::Velvet::Graph::NodedRead objects, f
|
|
51
51
|
```
|
52
52
|
There is much more that can be done to interact with the graph object and its components - see the [rubydoc](http://rubydoc.info/gems/bio-velvet/Bio/Velvet/Graph).
|
53
53
|
|
54
|
+
### Parsers for `Sequences` and `CnyUnifiedSeq.names` files
|
55
|
+
With default parameters, velvet generates a `Sequences` file that includes read ID information and the sequences themselves.
|
56
|
+
```ruby
|
57
|
+
seqs = Bio::Velvet::Sequences.parse_from_file(File.join velvet_result.result_directory, 'Sequences')
|
58
|
+
seqs[1] #=> 'AAAATTGTCAGACTAGCTATCAGCATATCAGCGCGCATCTCAGACGAGCACTATC'
|
59
|
+
```
|
60
|
+
If the `-create_binary` flag is set when running `velveth`, a names file is generated that encodes the read names and IDs.
|
61
|
+
```ruby
|
62
|
+
entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries(
|
63
|
+
File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
|
64
|
+
['read1','read2']
|
65
|
+
) #=> Hash of read name to Array of CnyUnifiedSeqNamesFileEntry objects
|
66
|
+
entries['read1'] #=> Array of CnyUnifiedSeqNamesFileEntry objects
|
67
|
+
entries['read1'][0].read_id #=> 1 (i.e. '1'.to_i)
|
68
|
+
```
|
69
|
+
When speed is required, grep can come to the rescue (at the cost of some portability)
|
70
|
+
```ruby
|
71
|
+
entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
|
72
|
+
File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
|
73
|
+
['read1','read2']
|
74
|
+
) #=> same returned object as above
|
75
|
+
```
|
76
|
+
The sequences themselves are stored in a separate file when `-create_binary` is used - an interface for this is included in the [bio-velvet_underground](https://github.com/wwood/bioruby-velvet_underground) biogem.
|
77
|
+
|
54
78
|
## Project home page
|
55
79
|
|
56
80
|
Information on the source tree, documentation, examples, issues and
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.3.0
|
data/lib/bio-velvet.rb
CHANGED
data/lib/bio-velvet/graph.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'hopcsv'
|
1
|
+
#require 'hopcsv'
|
2
2
|
require 'bio'
|
3
3
|
require 'tempfile'
|
4
4
|
|
@@ -30,6 +30,7 @@ module Bio
|
|
30
30
|
# into a Bio::Velvet::Graph object
|
31
31
|
#
|
32
32
|
# Options:
|
33
|
+
# * :dont_parse_noded_reads: if true, then parsing of the NR section is skipped
|
33
34
|
# * :interesting_read_ids: If not nil, is a Set of nodes that we are interested in. Reads
|
34
35
|
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
35
36
|
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
@@ -130,6 +131,7 @@ module Bio
|
|
130
131
|
# $READ_ID2 etc.
|
131
132
|
#p row
|
132
133
|
if row[0] == 'NR'
|
134
|
+
break if options[:dont_parse_noded_reads] # We are done if NR things aren't parsed
|
133
135
|
if options[:grep_hack]
|
134
136
|
unless options[:interesting_read_ids] or options[:interesting_node_ids]
|
135
137
|
raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'tempfile'
|
2
|
+
require 'bio-commandeer'
|
3
|
+
require 'hopcsv'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
module Velvet
|
7
|
+
# Methods for dealing with the CnyUnifiedSeq.names file output when
|
8
|
+
# the -create_binary flag is set in velveth.
|
9
|
+
class CnyUnifiedSeqNamesFile
|
10
|
+
# Return a hash of seqname => (Array of CnyUnifiedSeqNamesFileEntry objects)
|
11
|
+
# created by parsing the CnyUnifiedSeq.names file. Sometimes sequences
|
12
|
+
# can be found multiple times e.g. if fwd and rev of a pair is delineated
|
13
|
+
# after a space in the input names.
|
14
|
+
def self.extract_entries(path_to_cny_unified_seq_names_file, entry_names)
|
15
|
+
# Create results hash
|
16
|
+
to_return = {}
|
17
|
+
entry_names.each do |name|
|
18
|
+
to_return[name] = []
|
19
|
+
end
|
20
|
+
|
21
|
+
Hopcsv.foreach(path_to_cny_unified_seq_names_file,"\t") do |row|
|
22
|
+
name = row[0][1...row[0].length] #remove '>' at the start of the name
|
23
|
+
next unless to_return.key?(name) #ignore uninteresting sequences
|
24
|
+
|
25
|
+
entry = CnyUnifiedSeqNamesFileEntry.new
|
26
|
+
entry.name = name
|
27
|
+
entry.read_id = row[1].to_i
|
28
|
+
entry.category = row[2].to_i
|
29
|
+
to_return[name].push entry
|
30
|
+
end
|
31
|
+
return to_return
|
32
|
+
end
|
33
|
+
|
34
|
+
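# These files can be quite big, so this method first uses grep to pre-filter the file down to the requested entry names, then parses only the matching lines.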
|
35
|
+
def self.extract_entries_using_grep_hack(path_to_cny_unified_seq_names_file, entry_names)
|
36
|
+
to_return = nil
|
37
|
+
Tempfile.open('velvet_names_grep_hack_in') do |input|
|
38
|
+
entry_names.each do |name|
|
39
|
+
input.puts ">#{name}\t"
|
40
|
+
end
|
41
|
+
input.close #flush
|
42
|
+
|
43
|
+
Tempfile.open('velvet_names_grep_hack_result') do |output|
|
44
|
+
Bio::Commandeer.run "grep -F -f #{input.path} #{path_to_cny_unified_seq_names_file.inspect} >#{output.path}"
|
45
|
+
to_return = extract_entries output.path, entry_names
|
46
|
+
end
|
47
|
+
end
|
48
|
+
return to_return
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class CnyUnifiedSeqNamesFileEntry
|
53
|
+
attr_accessor :name, :read_id, :category
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
data/lib/bio-velvet/sequences.rb
CHANGED
File without changes
|
@@ -254,6 +254,19 @@ describe "BioVelvet" do
|
|
254
254
|
node.short_reads.collect{|r| r.read_id}.should == [47210]
|
255
255
|
end
|
256
256
|
|
257
|
+
it 'should not parse NR when option is set' do
|
258
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
259
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
260
|
+
{:dont_parse_noded_reads => true}
|
261
|
+
)
|
262
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
263
|
+
|
264
|
+
node = graph.nodes[967]
|
265
|
+
node.short_reads.should == nil
|
266
|
+
node = graph.nodes[951]
|
267
|
+
node.short_reads.should == nil
|
268
|
+
end
|
269
|
+
|
257
270
|
it 'should return sets of arcs by id' do
|
258
271
|
graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
|
259
272
|
# ARC 2 -578 1
|
File without changes
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
include Bio::Velvet
|
5
|
+
|
6
|
+
describe "SeqeunceNames" do
|
7
|
+
it 'should parse a whole file' do
|
8
|
+
string = <<EOF
|
9
|
+
>read1 1 0
|
10
|
+
>read2 2 0
|
11
|
+
EOF
|
12
|
+
Tempfile.open('test') do |tempfile|
|
13
|
+
tempfile.print string
|
14
|
+
tempfile.close
|
15
|
+
|
16
|
+
names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries(
|
17
|
+
tempfile.path,
|
18
|
+
%w(read1 read2)
|
19
|
+
)
|
20
|
+
names.keys.should == %w(read1 read2)
|
21
|
+
names['read1'].kind_of?(Array).should == true
|
22
|
+
names['read1'].length.should == 1
|
23
|
+
names['read1'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
|
24
|
+
names['read1'].collect{|e| e.name}.should == ['read1']
|
25
|
+
names['read1'].collect{|e| e.read_id}.should == [1]
|
26
|
+
names['read1'].collect{|e| e.category}.should == [0]
|
27
|
+
names['read2'].collect{|e| e.name}.should == ['read2']
|
28
|
+
names['read2'].collect{|e| e.read_id}.should == [2]
|
29
|
+
names['read2'].collect{|e| e.category}.should == [0]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'should handle the grep hack' do
|
34
|
+
string = <<EOF
|
35
|
+
>read1 1 0
|
36
|
+
>read2 2 0
|
37
|
+
EOF
|
38
|
+
Tempfile.open('test') do |tempfile|
|
39
|
+
tempfile.print string
|
40
|
+
tempfile.close
|
41
|
+
|
42
|
+
names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
|
43
|
+
tempfile.path,
|
44
|
+
%w(read1 read2)
|
45
|
+
)
|
46
|
+
names.keys.should == %w(read1 read2)
|
47
|
+
names['read1'].kind_of?(Array).should == true
|
48
|
+
names['read1'].length.should == 1
|
49
|
+
names['read1'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
|
50
|
+
names['read1'].collect{|e| e.name}.should == ['read1']
|
51
|
+
names['read1'].collect{|e| e.read_id}.should == [1]
|
52
|
+
names['read1'].collect{|e| e.category}.should == [0]
|
53
|
+
names['read2'].collect{|e| e.name}.should == ['read2']
|
54
|
+
names['read2'].collect{|e| e.read_id}.should == [2]
|
55
|
+
names['read2'].collect{|e| e.category}.should == [0]
|
56
|
+
|
57
|
+
names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
|
58
|
+
tempfile.path,
|
59
|
+
%w(read2)
|
60
|
+
)
|
61
|
+
names.keys.should == %w(read2)
|
62
|
+
names['read2'].kind_of?(Array).should == true
|
63
|
+
names['read2'].length.should == 1
|
64
|
+
names['read2'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
|
65
|
+
names['read2'].collect{|e| e.name}.should == ['read2']
|
66
|
+
names['read2'].collect{|e| e.read_id}.should == [2]
|
67
|
+
names['read2'].collect{|e| e.category}.should == [0]
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-velvet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bio-commandeer
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.1'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.1'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: rspec
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +150,20 @@ dependencies:
|
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '4.1'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: pry
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0.9'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0.9'
|
139
167
|
description: Parser to work with some file formats used in the velvet DNA assembler
|
140
168
|
email: donttrustben@gmail.com
|
141
169
|
executables: []
|
@@ -155,10 +183,9 @@ files:
|
|
155
183
|
- lib/bio-velvet.rb
|
156
184
|
- lib/bio-velvet/graph.rb
|
157
185
|
- lib/bio-velvet/runner.rb
|
186
|
+
- lib/bio-velvet/sequence_names.rb
|
158
187
|
- lib/bio-velvet/sequences.rb
|
159
|
-
- spec/bio-velvet_arc_array_spec.rb
|
160
|
-
- spec/bio-velvet_graph_spec.rb
|
161
|
-
- spec/bio-velvet_runner_spec.rb
|
188
|
+
- spec/arc_array_spec.rb
|
162
189
|
- spec/data/node_sequence/LastGraph
|
163
190
|
- spec/data/node_sequence/contigs.fa
|
164
191
|
- spec/data/runner_input.fa
|
@@ -173,6 +200,9 @@ files:
|
|
173
200
|
- spec/data/velvet_test_reads_assembly_read_tracking/Graph2
|
174
201
|
- spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
|
175
202
|
- spec/data/velvet_test_trail_sequence_assembly/reads1.fa
|
203
|
+
- spec/graph_spec.rb
|
204
|
+
- spec/runner_spec.rb
|
205
|
+
- spec/sequence_names_spec.rb
|
176
206
|
- spec/sequences_spec.rb
|
177
207
|
- spec/spec_helper.rb
|
178
208
|
homepage: http://github.com/wwood/bioruby-velvet
|