bio-velvet 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f88fe95bdfceafceb78b65c763675780ee12f1cf
4
- data.tar.gz: 40f0b257c9e15ca861c844a51dbce319ee17453b
3
+ metadata.gz: 8098d77f70e2f60c9c4a820198b9e7839cc3f770
4
+ data.tar.gz: 924a25a11fbacfdcc24f2ec68bf435efbe1a2502
5
5
  SHA512:
6
- metadata.gz: d4de2138c28eb006fd872e518f7aa5172c04016aded296aeca1bdbeb27e98c7b99fa6f64b5fcf2252c0eff7dacccf08d9b2b4db3e38f05256d12b270cbc3fe19
7
- data.tar.gz: f2fe259725937d1995f5a0269a3b6a3a2c3a55b8ecb30abc8e7f54dadfe6311a4d43a3df3bd588c49f3567a7ca9a1846885f0d51bbfb6d66bd84e3cf3b00937c
6
+ metadata.gz: 16960ccbec0e2a781d928171581702cbea22b1374aff05b950af5fee4fd15f4e598c4d5baa4043fe37878b6828ee16c6c7261a4bd747ae2726003c8e0516daba
7
+ data.tar.gz: b357398f3aa8f8e4ad80d033c7912d17cd2e219bb2c7e1fb97c944a50bdc890b1b1e81dc54b0429e8d8a48a462f51aa4864337f697dc45b308a39626ab884dec
data/Gemfile CHANGED
@@ -4,6 +4,7 @@ gem 'bio-logger', '~>1.0'
4
4
  gem 'systemu', '~>2.6'
5
5
  gem 'files', '~>0.3'
6
6
  gem 'hopcsv', '~> 0.4'
7
+ gem 'bio-commandeer', '~>0.1'
7
8
 
8
9
  # Add dependencies to develop your gem here.
9
10
  # Include everything needed to run rake, tests, features, etc.
@@ -13,4 +14,5 @@ group :development do
13
14
  gem "bundler", "~> 1.0"
14
15
  gem "bio", "~> 1.4"
15
16
  gem "rdoc", "~> 4.1"
17
+ gem 'pry', '~>0.9'
16
18
  end
data/README.md CHANGED
@@ -51,6 +51,30 @@ graph.nodes[5].noded_reads #=> array of Bio::Velvet::Graph::NodedRead objects, f
51
51
  ```
52
52
  There is much more that can be done to interact with the graph object and its components - see the [rubydoc](http://rubydoc.info/gems/bio-velvet/Bio/Velvet/Graph).
53
53
 
54
+ ### Parsers for `Sequences` and `CnyUnifiedSeq.names` files
55
+ With default parameters, velvet generates a `Sequences` file that includes read ID information and the sequences themselves.
56
+ ```ruby
57
+ seqs = Bio::Velvet::Sequences.parse_from_file(File.join velvet_result.result_directory, 'Sequences')
58
+ seqs[1] => 'AAAATTGTCAGACTAGCTATCAGCATATCAGCGCGCATCTCAGACGAGCACTATC'
59
+ ```
60
+ If the `-create_binary` flag is set when running `velveth`, a names file is generated that encodes the read names and IDs.
61
+ ```ruby
62
+ entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries(
63
+ File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
64
+ ['read1','read2']
65
+ ) #=> Hash of read name to Array of CnyUnifiedSeqNamesFileEntry objects
66
+ entries['read1'] #=> Array of CnyUnifiedSeqNamesFileEntry objects
67
+ entries['read1'][0].read_id #=> 1 (i.e. '1'.to_i)
68
+ ```
69
+ When speed is required, grep can come to the rescue (at the cost of some portability):
70
+ ```ruby
71
+ entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
72
+ File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
73
+ ['read1','read2']
74
+ ) #=> same returned object as above
75
+ ```
76
+ The sequences themselves are stored in a separate file when `-create_binary` is used - an interface for this is included in the [bio-velvet_underground](https://github.com/wwood/bioruby-velvet_underground) biogem.
77
+
54
78
  ## Project home page
55
79
 
56
80
  Information on the source tree, documentation, examples, issues and
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -12,3 +12,4 @@ end
12
12
  require 'bio-velvet/graph'
13
13
  require 'bio-velvet/runner'
14
14
  require 'bio-velvet/sequences'
15
+ require 'bio-velvet/sequence_names'
@@ -1,4 +1,4 @@
1
- require 'hopcsv'
1
+ #require 'hopcsv'
2
2
  require 'bio'
3
3
  require 'tempfile'
4
4
 
@@ -30,6 +30,7 @@ module Bio
30
30
  # into a Bio::Velvet::Graph object
31
31
  #
32
32
  # Options:
33
+ # * :dont_parse_noded_reads: if true, then parsing of the NR section is skipped
33
34
  # * :interesting_read_ids: If not nil, is a Set of nodes that we are interested in. Reads
34
35
  # not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
35
36
  # nodes and edges are parsed in. Using this option saves both memory and CPU.
@@ -130,6 +131,7 @@ module Bio
130
131
  # $READ_ID2 etc.
131
132
  #p row
132
133
  if row[0] == 'NR'
134
+ break if options[:dont_parse_noded_reads] # We are done if NR things aren't parsed
133
135
  if options[:grep_hack]
134
136
  unless options[:interesting_read_ids] or options[:interesting_node_ids]
135
137
  raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
@@ -0,0 +1,56 @@
1
+ require 'tempfile'
2
+ require 'bio-commandeer'
3
+ require 'hopcsv'
4
+
5
+ module Bio
6
+ module Velvet
7
+ # Methods for dealing with the CnyUnifiedSeq.names file output when
8
+ # the -create_binary flag is set in velveth.
9
+ class CnyUnifiedSeqNamesFile
10
+ # Return a hash of seqname => (Array of CnyUnifiedSeqNamesFileEntry objects)
11
+ # created by parsing the CnyUnifiedSeq.names file. Sometimes sequences
12
+ # can be found multiple times e.g. if fwd and rev of a pair is delineated
13
+ # after a space in the input names.
14
+ def self.extract_entries(path_to_cny_unified_seq_names_file, entry_names)
15
+ # Create results hash
16
+ to_return = {}
17
+ entry_names.each do |name|
18
+ to_return[name] = []
19
+ end
20
+
21
+ Hopcsv.foreach(path_to_cny_unified_seq_names_file,"\t") do |row|
22
+ name = row[0][1...row[0].length] #remove '>' at the start of the name
23
+ next unless to_return.key?(name) #ignore uninteresting sequences
24
+
25
+ entry = CnyUnifiedSeqNamesFileEntry.new
26
+ entry.name = name
27
+ entry.read_id = row[1].to_i
28
+ entry.category = row[2].to_i
29
+ to_return[name].push entry
30
+ end
31
+ return to_return
32
+ end
33
+
34
+ # These files can be quite big, so this method pre-filters them with grep before parsing.
35
+ def self.extract_entries_using_grep_hack(path_to_cny_unified_seq_names_file, entry_names)
36
+ to_return = nil
37
+ Tempfile.open('velvet_names_grep_hack_in') do |input|
38
+ entry_names.each do |name|
39
+ input.puts ">#{name}\t"
40
+ end
41
+ input.close #flush
42
+
43
+ Tempfile.open('velvet_names_grep_hack_result') do |output|
44
+ Bio::Commandeer.run "grep -F -f #{input.path} #{path_to_cny_unified_seq_names_file.inspect} >#{output.path}"
45
+ to_return = extract_entries output.path, entry_names
46
+ end
47
+ end
48
+ return to_return
49
+ end
50
+ end
51
+
52
+ class CnyUnifiedSeqNamesFileEntry
53
+ attr_accessor :name, :read_id, :category
54
+ end
55
+ end
56
+ end
@@ -1,4 +1,4 @@
1
- require 'hopcsv'
1
+ #require 'hopcsv'
2
2
  require 'bio'
3
3
  require 'tempfile'
4
4
 
@@ -254,6 +254,19 @@ describe "BioVelvet" do
254
254
  node.short_reads.collect{|r| r.read_id}.should == [47210]
255
255
  end
256
256
 
257
+ it 'should not parse NR when option is set' do
258
+ graph = Bio::Velvet::Graph.parse_from_file(
259
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
260
+ {:dont_parse_noded_reads => true}
261
+ )
262
+ graph.should be_kind_of(Bio::Velvet::Graph)
263
+
264
+ node = graph.nodes[967]
265
+ node.short_reads.should == nil
266
+ node = graph.nodes[951]
267
+ node.short_reads.should == nil
268
+ end
269
+
257
270
  it 'should return sets of arcs by id' do
258
271
  graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
259
272
  # ARC 2 -578 1
@@ -0,0 +1,70 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'bio'
3
+
4
+ include Bio::Velvet
5
+
6
+ describe "SeqeunceNames" do
7
+ it 'should parse a whole file' do
8
+ string = <<EOF
9
+ >read1 1 0
10
+ >read2 2 0
11
+ EOF
12
+ Tempfile.open('test') do |tempfile|
13
+ tempfile.print string
14
+ tempfile.close
15
+
16
+ names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries(
17
+ tempfile.path,
18
+ %w(read1 read2)
19
+ )
20
+ names.keys.should == %w(read1 read2)
21
+ names['read1'].kind_of?(Array).should == true
22
+ names['read1'].length.should == 1
23
+ names['read1'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
24
+ names['read1'].collect{|e| e.name}.should == ['read1']
25
+ names['read1'].collect{|e| e.read_id}.should == [1]
26
+ names['read1'].collect{|e| e.category}.should == [0]
27
+ names['read2'].collect{|e| e.name}.should == ['read2']
28
+ names['read2'].collect{|e| e.read_id}.should == [2]
29
+ names['read2'].collect{|e| e.category}.should == [0]
30
+ end
31
+ end
32
+
33
+ it 'should handle the grep hack' do
34
+ string = <<EOF
35
+ >read1 1 0
36
+ >read2 2 0
37
+ EOF
38
+ Tempfile.open('test') do |tempfile|
39
+ tempfile.print string
40
+ tempfile.close
41
+
42
+ names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
43
+ tempfile.path,
44
+ %w(read1 read2)
45
+ )
46
+ names.keys.should == %w(read1 read2)
47
+ names['read1'].kind_of?(Array).should == true
48
+ names['read1'].length.should == 1
49
+ names['read1'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
50
+ names['read1'].collect{|e| e.name}.should == ['read1']
51
+ names['read1'].collect{|e| e.read_id}.should == [1]
52
+ names['read1'].collect{|e| e.category}.should == [0]
53
+ names['read2'].collect{|e| e.name}.should == ['read2']
54
+ names['read2'].collect{|e| e.read_id}.should == [2]
55
+ names['read2'].collect{|e| e.category}.should == [0]
56
+
57
+ names = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
58
+ tempfile.path,
59
+ %w(read2)
60
+ )
61
+ names.keys.should == %w(read2)
62
+ names['read2'].kind_of?(Array).should == true
63
+ names['read2'].length.should == 1
64
+ names['read2'][0].kind_of?(Bio::Velvet::CnyUnifiedSeqNamesFileEntry).should == true
65
+ names['read2'].collect{|e| e.name}.should == ['read2']
66
+ names['read2'].collect{|e| e.read_id}.should == [2]
67
+ names['read2'].collect{|e| e.category}.should == [0]
68
+ end
69
+ end
70
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-velvet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-01 00:00:00.000000000 Z
11
+ date: 2014-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-logger
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bio-commandeer
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.1'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: rspec
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,20 @@ dependencies:
136
150
  - - "~>"
137
151
  - !ruby/object:Gem::Version
138
152
  version: '4.1'
153
+ - !ruby/object:Gem::Dependency
154
+ name: pry
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '0.9'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '0.9'
139
167
  description: Parser to work with some file formats used in the velvet DNA assembler
140
168
  email: donttrustben@gmail.com
141
169
  executables: []
@@ -155,10 +183,9 @@ files:
155
183
  - lib/bio-velvet.rb
156
184
  - lib/bio-velvet/graph.rb
157
185
  - lib/bio-velvet/runner.rb
186
+ - lib/bio-velvet/sequence_names.rb
158
187
  - lib/bio-velvet/sequences.rb
159
- - spec/bio-velvet_arc_array_spec.rb
160
- - spec/bio-velvet_graph_spec.rb
161
- - spec/bio-velvet_runner_spec.rb
188
+ - spec/arc_array_spec.rb
162
189
  - spec/data/node_sequence/LastGraph
163
190
  - spec/data/node_sequence/contigs.fa
164
191
  - spec/data/runner_input.fa
@@ -173,6 +200,9 @@ files:
173
200
  - spec/data/velvet_test_reads_assembly_read_tracking/Graph2
174
201
  - spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
175
202
  - spec/data/velvet_test_trail_sequence_assembly/reads1.fa
203
+ - spec/graph_spec.rb
204
+ - spec/runner_spec.rb
205
+ - spec/sequence_names_spec.rb
176
206
  - spec/sequences_spec.rb
177
207
  - spec/spec_helper.rb
178
208
  homepage: http://github.com/wwood/bioruby-velvet