bio-velvet 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +24 -0
- data/VERSION +1 -1
- data/lib/bio-velvet.rb +1 -0
- data/lib/bio-velvet/graph.rb +3 -1
- data/lib/bio-velvet/sequence_names.rb +56 -0
- data/lib/bio-velvet/sequences.rb +1 -1
- data/spec/{bio-velvet_arc_array_spec.rb → arc_array_spec.rb} +0 -0
- data/spec/{bio-velvet_graph_spec.rb → graph_spec.rb} +13 -0
- data/spec/{bio-velvet_runner_spec.rb → runner_spec.rb} +0 -0
- data/spec/sequence_names_spec.rb +70 -0
- metadata +35 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8098d77f70e2f60c9c4a820198b9e7839cc3f770
|
4
|
+
data.tar.gz: 924a25a11fbacfdcc24f2ec68bf435efbe1a2502
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16960ccbec0e2a781d928171581702cbea22b1374aff05b950af5fee4fd15f4e598c4d5baa4043fe37878b6828ee16c6c7261a4bd747ae2726003c8e0516daba
|
7
|
+
data.tar.gz: b357398f3aa8f8e4ad80d033c7912d17cd2e219bb2c7e1fb97c944a50bdc890b1b1e81dc54b0429e8d8a48a462f51aa4864337f697dc45b308a39626ab884dec
|
data/Gemfile
CHANGED
@@ -4,6 +4,7 @@ gem 'bio-logger', '~>1.0'
|
|
4
4
|
gem 'systemu', '~>2.6'
|
5
5
|
gem 'files', '~>0.3'
|
6
6
|
gem 'hopcsv', '~> 0.4'
|
7
|
+
gem 'bio-commandeer', '~>0.1'
|
7
8
|
|
8
9
|
# Add dependencies to develop your gem here.
|
9
10
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -13,4 +14,5 @@ group :development do
|
|
13
14
|
gem "bundler", "~> 1.0"
|
14
15
|
gem "bio", "~> 1.4"
|
15
16
|
gem "rdoc", "~> 4.1"
|
17
|
+
gem 'pry', '~>0.9'
|
16
18
|
end
|
data/README.md
CHANGED
@@ -51,6 +51,30 @@ graph.nodes[5].noded_reads #=> array of Bio::Velvet::Graph::NodedRead objects, f
|
|
51
51
|
```
|
52
52
|
There is much more that can be done to interact with the graph object and its components - see the [rubydoc](http://rubydoc.info/gems/bio-velvet/Bio/Velvet/Graph).
|
53
53
|
|
54
|
+
### Parsers for `Sequences` and `CnyUnifiedSeq.names` files
|
55
|
+
With default parameters, velvet generates a `Sequences` file that includes read ID information and the sequences themselves.
|
56
|
+
```ruby
|
57
|
+
seqs = Bio::Velvet::Sequences.parse_from_file(File.join velvet_result.result_directory, 'Sequences')
|
58
|
+
seqs[1] #=> 'AAAATTGTCAGACTAGCTATCAGCATATCAGCGCGCATCTCAGACGAGCACTATC'
|
59
|
+
```
|
60
|
+
If the `-create_binary` flag is set when running `velveth`, a names file is generated that encodes the read names and IDs.
|
61
|
+
```ruby
|
62
|
+
entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries(
|
63
|
+
File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
|
64
|
+
['read1','read2']
|
65
|
+
) #=> Hash of read name to Array of CnyUnifiedSeqNamesFileEntry objects
|
66
|
+
entries['read1'] #=> Array of CnyUnifiedSeqNamesFileEntry objects
|
67
|
+
entries['read1'][0].read_id #=> 1 (i.e. '1'.to_i)
|
68
|
+
```
|
69
|
+
When speed is required, grep can come to the rescue (at the cost of some portability):
|
70
|
+
```ruby
|
71
|
+
entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
|
72
|
+
File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
|
73
|
+
['read1','read2']
|
74
|
+
) #=> same returned object as above
|
75
|
+
```
|
76
|
+
The sequences themselves are stored in a separate file when `-create_binary` is used - an interface for this is included in the [bio-velvet_underground](https://github.com/wwood/bioruby-velvet_underground) biogem.
|
77
|
+
|
54
78
|
## Project home page
|
55
79
|
|
56
80
|
Information on the source tree, documentation, examples, issues and
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
data/lib/bio-velvet.rb
CHANGED
data/lib/bio-velvet/graph.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'hopcsv'
|
1
|
+
#require 'hopcsv'
|
2
2
|
require 'bio'
|
3
3
|
require 'tempfile'
|
4
4
|
|
@@ -30,6 +30,7 @@ module Bio
|
|
30
30
|
# into a Bio::Velvet::Graph object
|
31
31
|
#
|
32
32
|
# Options:
|
33
|
+
# * :dont_parse_noded_reads: if true, then parsing of the NR section is skipped
|
33
34
|
# * :interesting_read_ids: If not nil, is a Set of nodes that we are interested in. Reads
|
34
35
|
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
35
36
|
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
@@ -130,6 +131,7 @@ module Bio
|
|
130
131
|
# $READ_ID2 etc.
|
131
132
|
#p row
|
132
133
|
if row[0] == 'NR'
|
134
|
+
break if options[:dont_parse_noded_reads] # We are done if NR things aren't parsed
|
133
135
|
if options[:grep_hack]
|
134
136
|
unless options[:interesting_read_ids] or options[:interesting_node_ids]
|
135
137
|
raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'tempfile'
|
2
|
+
require 'bio-commandeer'
|
3
|
+
require 'hopcsv'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
module Velvet
|
7
|
+
# Methods for dealing with the CnyUnifiedSeq.names file output when
|
8
|
+
# the -create_binary flag is set in velveth.
|
9
|
+
class CnyUnifiedSeqNamesFile
|
10
|
+
# Return a hash of seqname => (Array of CnyUnifiedSeqNamesFileEntry objects)
|
11
|
+
# created by parsing the CnyUnifiedSeq.names file. Sometimes sequences
|
12
|
+
# can be found multiple times e.g. if fwd and rev of a pair are delineated
|
13
|
+
# after a space in the input names.
|
14
|
+
def self.extract_entries(path_to_cny_unified_seq_names_file, entry_names)
|
15
|
+
# Create results hash
|
16
|
+
to_return = {}
|
17
|
+
entry_names.each do |name|
|
18
|
+
to_return[name] = []
|
19
|
+
end
|
20
|
+
|
21
|
+
Hopcsv.foreach(path_to_cny_unified_seq_names_file,"\t") do |row|
|
22
|
+
name = row[0][1...row[0].length] #remove '>' at the start of the name
|
23
|
+
next unless to_return.key?(name) #ignore uninsteresting sequences
|
24
|
+
|
25
|
+
entry = CnyUnifiedSeqNamesFileEntry.new
|
26
|
+
entry.name = name
|
27
|
+
entry.read_id = row[1].to_i
|
28
|
+
entry.category = row[2].to_i
|
29
|
+
to_return[name].push entry
|
30
|
+
end
|
31
|
+
return to_return
|
32
|
+
end
|
33
|
+
|
34
|
+
# These files can be quite big, so this method uses grep to pre-filter the lines of interest before parsing them with extract_entries.
|
35
|
+
def self.extract_entries_using_grep_hack(path_to_cny_unified_seq_names_file, entry_names)
|
36
|
+
to_return = nil
|
37
|
+
Tempfile.open('velvet_names_grep_hack_in') do |input|
|
38
|
+
entry_names.each do |name|
|
39
|
+
input.puts ">#{name}\t"
|
40
|
+
end
|
41
|
+
input.close #flush
|
42
|
+
|
43
|
+
Tempfile.open('velvet_names_grep_hack_result') do |output|
|
44
|
+
Bio::Commandeer.run "grep -F -f #{input.path} #{path_to_cny_unified_seq_names_file.inspect} >#{output.path}"
|
45
|
+
to_return = extract_entries output.path, entry_names
|
46
|
+
end
|
47
|
+
end
|
48
|
+
return to_return
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class CnyUnifiedSeqNamesFileEntry
|
53
|
+
attr_accessor :name, :read_id, :category
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
data/lib/bio-velvet/sequences.rb
CHANGED
File without changes
|
@@ -254,6 +254,19 @@ describe "BioVelvet" do
|
|
254
254
|
node.short_reads.collect{|r| r.read_id}.should == [47210]
|
255
255
|
end
|
256
256
|
|
257
|
+
it 'should not parse NR when option is set' do
|
258
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
259
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
260
|
+
{:dont_parse_noded_reads => true}
|
261
|
+
)
|
262
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
263
|
+
|
264
|
+
node = graph.nodes[967]
|
265
|
+
node.short_reads.should == nil
|
266
|
+
node = graph.nodes[951]
|
267
|
+
node.short_reads.should == nil
|
268
|
+
end
|
269
|
+
|
257
270
|
it 'should return sets of arcs by id' do
|
258
271
|
graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
|
259
272
|
# ARC 2 -578 1
|
File without changes
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
include Bio::Velvet
|
5
|
+
|
6
|
+
describe "SeqeunceNames" do
|
7
|
+
it 'should parse a whole file' do
|
8
|
+
string = <<EOF
|
9
|
+
>read1 1 0
|
10
|
+
>read2 2 0
|
11
|
+
EOF
|
12
|
+
Tempfile.open('test') do |tempfile|
|
13
|
+
tempfile.print string
|
14
|
+
tempfile.close
|
15
|
+
|
16
|
+
names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries(
|
17
|
+
tempfile.path,
|
18
|
+
%w(read1 read2)
|
19
|
+
)
|
20
|
+
names.keys.should == %w(read1 read2)
|
21
|
+
names['read1'].kind_of?(Array).should == true
|
22
|
+
names['read1'].length.should == 1
|
23
|
+
names['read1'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
|
24
|
+
names['read1'].collect{|e| e.name}.should == ['read1']
|
25
|
+
names['read1'].collect{|e| e.read_id}.should == [1]
|
26
|
+
names['read1'].collect{|e| e.category}.should == [0]
|
27
|
+
names['read2'].collect{|e| e.name}.should == ['read2']
|
28
|
+
names['read2'].collect{|e| e.read_id}.should == [2]
|
29
|
+
names['read2'].collect{|e| e.category}.should == [0]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'should handle the grep hack' do
|
34
|
+
string = <<EOF
|
35
|
+
>read1 1 0
|
36
|
+
>read2 2 0
|
37
|
+
EOF
|
38
|
+
Tempfile.open('test') do |tempfile|
|
39
|
+
tempfile.print string
|
40
|
+
tempfile.close
|
41
|
+
|
42
|
+
names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
|
43
|
+
tempfile.path,
|
44
|
+
%w(read1 read2)
|
45
|
+
)
|
46
|
+
names.keys.should == %w(read1 read2)
|
47
|
+
names['read1'].kind_of?(Array).should == true
|
48
|
+
names['read1'].length.should == 1
|
49
|
+
names['read1'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
|
50
|
+
names['read1'].collect{|e| e.name}.should == ['read1']
|
51
|
+
names['read1'].collect{|e| e.read_id}.should == [1]
|
52
|
+
names['read1'].collect{|e| e.category}.should == [0]
|
53
|
+
names['read2'].collect{|e| e.name}.should == ['read2']
|
54
|
+
names['read2'].collect{|e| e.read_id}.should == [2]
|
55
|
+
names['read2'].collect{|e| e.category}.should == [0]
|
56
|
+
|
57
|
+
names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
|
58
|
+
tempfile.path,
|
59
|
+
%w(read2)
|
60
|
+
)
|
61
|
+
names.keys.should == %w(read2)
|
62
|
+
names['read2'].kind_of?(Array).should == true
|
63
|
+
names['read2'].length.should == 1
|
64
|
+
names['read2'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
|
65
|
+
names['read2'].collect{|e| e.name}.should == ['read2']
|
66
|
+
names['read2'].collect{|e| e.read_id}.should == [2]
|
67
|
+
names['read2'].collect{|e| e.category}.should == [0]
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-velvet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bio-commandeer
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.1'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.1'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: rspec
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +150,20 @@ dependencies:
|
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '4.1'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: pry
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0.9'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0.9'
|
139
167
|
description: Parser to work with some file formats used in the velvet DNA assembler
|
140
168
|
email: donttrustben@gmail.com
|
141
169
|
executables: []
|
@@ -155,10 +183,9 @@ files:
|
|
155
183
|
- lib/bio-velvet.rb
|
156
184
|
- lib/bio-velvet/graph.rb
|
157
185
|
- lib/bio-velvet/runner.rb
|
186
|
+
- lib/bio-velvet/sequence_names.rb
|
158
187
|
- lib/bio-velvet/sequences.rb
|
159
|
-
- spec/
|
160
|
-
- spec/bio-velvet_graph_spec.rb
|
161
|
-
- spec/bio-velvet_runner_spec.rb
|
188
|
+
- spec/arc_array_spec.rb
|
162
189
|
- spec/data/node_sequence/LastGraph
|
163
190
|
- spec/data/node_sequence/contigs.fa
|
164
191
|
- spec/data/runner_input.fa
|
@@ -173,6 +200,9 @@ files:
|
|
173
200
|
- spec/data/velvet_test_reads_assembly_read_tracking/Graph2
|
174
201
|
- spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
|
175
202
|
- spec/data/velvet_test_trail_sequence_assembly/reads1.fa
|
203
|
+
- spec/graph_spec.rb
|
204
|
+
- spec/runner_spec.rb
|
205
|
+
- spec/sequence_names_spec.rb
|
176
206
|
- spec/sequences_spec.rb
|
177
207
|
- spec/spec_helper.rb
|
178
208
|
homepage: http://github.com/wwood/bioruby-velvet
|