bio-velvet 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/bio-velvet.rb +1 -0
- data/lib/bio-velvet/graph.rb +143 -8
- data/lib/bio-velvet/runner.rb +1 -1
- data/lib/bio-velvet/sequences.rb +121 -0
- data/spec/bio-velvet_graph_spec.rb +137 -1
- data/spec/data/sequence_spec/5seqs.fa +24 -0
- data/spec/data/sequence_spec/5seqs.fa.Sequences +36 -0
- data/spec/data/sequence_spec/Sequences +18 -0
- data/spec/sequences_spec.rb +113 -0
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f88fe95bdfceafceb78b65c763675780ee12f1cf
|
4
|
+
data.tar.gz: 40f0b257c9e15ca861c844a51dbce319ee17453b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4de2138c28eb006fd872e518f7aa5172c04016aded296aeca1bdbeb27e98c7b99fa6f64b5fcf2252c0eff7dacccf08d9b2b4db3e38f05256d12b270cbc3fe19
|
7
|
+
data.tar.gz: f2fe259725937d1995f5a0269a3b6a3a2c3a55b8ecb30abc8e7f54dadfe6311a4d43a3df3bd588c49f3567a7ca9a1846885f0d51bbfb6d66bd84e3cf3b00937c
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/lib/bio-velvet.rb
CHANGED
data/lib/bio-velvet/graph.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'hopcsv'
|
2
2
|
require 'bio'
|
3
|
+
require 'tempfile'
|
3
4
|
|
4
5
|
module Bio
|
5
6
|
module Velvet
|
@@ -32,6 +33,13 @@ module Bio
|
|
32
33
|
# * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
|
33
34
|
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
34
35
|
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
36
|
+
# * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
|
37
|
+
# rather than particular reads.
|
38
|
+
# * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
|
39
|
+
# hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
|
40
|
+
# to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
|
41
|
+
# not be particularly future-proof. The value of this option is the amount of context coming out of grep
|
42
|
+
# (the -A and -B flags). Using 500 should probably work for most circumstances; if not, an Exception will be raised.
|
35
43
|
def self.parse_from_file(path_to_graph_file, options={})
|
36
44
|
graph = self.new
|
37
45
|
state = :header
|
@@ -122,17 +130,26 @@ module Bio
|
|
122
130
|
# $READ_ID2 etc.
|
123
131
|
#p row
|
124
132
|
if row[0] == 'NR'
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
133
|
+
if options[:grep_hack]
|
134
|
+
unless options[:interesting_read_ids] or options[:interesting_node_ids]
|
135
|
+
raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
|
136
|
+
end
|
137
|
+
apply_grep_hack graph, path_to_graph_file, options[:interesting_read_ids], options[:interesting_node_ids], options[:grep_hack]
|
138
|
+
break #no more parsing is required
|
139
|
+
else
|
140
|
+
raise unless row.length == 3
|
141
|
+
node_pm = row[1].to_i
|
142
|
+
current_node_direction = node_pm > 0
|
143
|
+
current_node = graph.nodes[node_pm.abs]
|
144
|
+
current_node.number_of_short_reads ||= 0
|
145
|
+
current_node.number_of_short_reads += row[2].to_i
|
146
|
+
next
|
147
|
+
end
|
132
148
|
else
|
133
149
|
raise unless row.length == 3
|
134
150
|
read_id = row[0].to_i
|
135
|
-
if options[:
|
151
|
+
if (options[:interesting_node_ids] and !options[:interesting_node_ids].include?(current_node.node_id)) or
|
152
|
+
(options[:interesting_read_ids] and !options[:interesting_read_ids].include?(read_id))
|
136
153
|
# We have come across an uninteresting read. Ignore it.
|
137
154
|
next
|
138
155
|
end
|
@@ -224,6 +241,33 @@ module Bio
|
|
224
241
|
return deleted_nodes, deleted_arcs.flatten
|
225
242
|
end
|
226
243
|
|
244
|
+
# Add more noded reads to this already parsed graph. There is
|
245
|
+
# no guarantee that old NodedRead information is preserved, or removed.
|
246
|
+
#
|
247
|
+
# Options:
|
248
|
+
# * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
|
249
|
+
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
250
|
+
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
251
|
+
# * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
|
252
|
+
# rather than particular reads.
|
253
|
+
# * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
|
254
|
+
# hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
|
255
|
+
# to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
|
256
|
+
# not be particularly future-proof. The value of this option is the amount of context coming out of grep
|
257
|
+
# (the -A and -B flags). Using 500 should probably work for most circumstances; if not, an Exception will be raised.
|
258
|
+
def parse_additional_noded_reads(path_to_graph_file, options)
|
259
|
+
grep_context = options[:grep_hack]
|
260
|
+
if grep_context.nil?
|
261
|
+
raise "Calling Graph#parse_additional_noded_reads without specifying :grep_hack is currently not implemented"
|
262
|
+
end
|
263
|
+
self.class.apply_grep_hack(self,
|
264
|
+
path_to_graph_file,
|
265
|
+
options[:interesting_read_ids],
|
266
|
+
options[:interesting_node_ids],
|
267
|
+
grep_context
|
268
|
+
)
|
269
|
+
end
|
270
|
+
|
227
271
|
|
228
272
|
|
229
273
|
|
@@ -522,6 +566,97 @@ module Bio
|
|
522
566
|
class NodedRead
|
523
567
|
attr_accessor :read_id, :offset_from_start_of_node, :start_coord, :direction
|
524
568
|
end
|
569
|
+
|
570
|
+
private
|
571
|
+
def self.apply_grep_hack(graph, path_to_graph_file, interesting_read_ids, interesting_node_ids, grep_context)
|
572
|
+
interesting_read_ids ||= []
|
573
|
+
interesting_node_ids ||= []
|
574
|
+
if interesting_read_ids.empty? and interesting_node_ids.empty?
|
575
|
+
log.debug "Nothing to grep for in grep hack" if log.debug?
|
576
|
+
return
|
577
|
+
end
|
578
|
+
|
579
|
+
Tempfile.open('grep_v_hack') do |tempfile|
|
580
|
+
# Create a file to pass to grep -f
|
581
|
+
unless interesting_read_ids.nil?
|
582
|
+
interesting_read_ids.each do |read_id|
|
583
|
+
tempfile.puts "^#{read_id}\t"
|
584
|
+
end
|
585
|
+
end
|
586
|
+
unless interesting_node_ids.nil?
|
587
|
+
interesting_node_ids.each do |node_id|
|
588
|
+
tempfile.puts "^NR\t#{node_id}\t"
|
589
|
+
tempfile.puts "^NR\t-#{node_id}\t"
|
590
|
+
end
|
591
|
+
end
|
592
|
+
tempfile.close
|
593
|
+
|
594
|
+
cmd = "grep -B #{grep_context.inspect} -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_graph_file.inspect}"
|
595
|
+
# TODO: make this call more robust
|
596
|
+
# grep_result = Bio::Commandeer.run cmd
|
597
|
+
s, grep_result, stderr = systemu cmd
|
598
|
+
|
599
|
+
# Parse the grepped out results
|
600
|
+
current_node = nil
|
601
|
+
current_node_direction = nil
|
602
|
+
in_nr_section = false
|
603
|
+
grep_result.each_line do |line|
|
604
|
+
row = line.split("\t")
|
605
|
+
if in_nr_section == false
|
606
|
+
# If there is a lot of context then the context includes ARC definitions etc. Skip past this.
|
607
|
+
if row[0] == 'NR'
|
608
|
+
in_nr_section = true
|
609
|
+
elsif row[0] == '--'
|
610
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
611
|
+
else
|
612
|
+
next #skip to next line, waiting to get into NR section
|
613
|
+
end
|
614
|
+
end
|
615
|
+
|
616
|
+
if line == "--\n" #the break introduced by grep
|
617
|
+
# If we encounter a grep break, but haven't assigned any nodes, then that's not good enough
|
618
|
+
if current_node.nil?
|
619
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
620
|
+
end
|
621
|
+
# reset the parsing situation
|
622
|
+
current_node = nil
|
623
|
+
elsif row[0] == 'NR'
|
624
|
+
raise unless row.length == 3
|
625
|
+
node_pm = row[1].to_i
|
626
|
+
current_node_direction = node_pm > 0
|
627
|
+
current_node = graph.nodes[node_pm.abs]
|
628
|
+
current_node.number_of_short_reads ||= 0
|
629
|
+
current_node.number_of_short_reads += row[2].to_i
|
630
|
+
next
|
631
|
+
else
|
632
|
+
raise unless row.length == 3
|
633
|
+
read_id = row[0].to_i
|
634
|
+
if (current_node.nil? or !interesting_node_ids.include?(current_node.node_id)) and
|
635
|
+
!interesting_read_ids.include?(read_id)
|
636
|
+
# We have come across an uninteresting read. Ignore it.
|
637
|
+
next
|
638
|
+
end
|
639
|
+
if current_node.nil?
|
640
|
+
# Came across a high coverage node, and grep isn't giving enough context. Hopefully this won't happen much
|
641
|
+
# particularly if the reads you are interested in are given to velvet first
|
642
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
643
|
+
end
|
644
|
+
nr = NodedRead.new
|
645
|
+
nr.read_id = read_id
|
646
|
+
nr.offset_from_start_of_node = row[1].to_i
|
647
|
+
nr.start_coord = row[2].to_i
|
648
|
+
nr.direction = current_node_direction
|
649
|
+
current_node.short_reads ||= []
|
650
|
+
current_node.short_reads.push nr
|
651
|
+
next
|
652
|
+
end
|
653
|
+
end
|
654
|
+
|
655
|
+
if current_node.nil?
|
656
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
657
|
+
end
|
658
|
+
end
|
659
|
+
end
|
525
660
|
end
|
526
661
|
end
|
527
662
|
end
|
data/lib/bio-velvet/runner.rb
CHANGED
@@ -85,7 +85,7 @@ module Bio
|
|
85
85
|
# Return a Bio::Velvet::Graph object built from the LastGraph file.
|
86
86
|
# The options for parsing are as per Bio::Velvet::Graph#parse_from_file
|
87
87
|
def last_graph(options=nil)
|
88
|
-
Bio::Velvet::Graph.parse_from_file(last_graph_path)
|
88
|
+
Bio::Velvet::Graph.parse_from_file(last_graph_path, options)
|
89
89
|
end
|
90
90
|
end
|
91
91
|
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'hopcsv'
|
2
|
+
require 'bio'
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
module Velvet
|
7
|
+
# Parser and container class for textual Sequence files
|
8
|
+
#
|
9
|
+
# After parsing, the result is a hash of read_id => sequence
|
10
|
+
# where read_id is an Integer and sequence a String
|
11
|
+
#
|
12
|
+
# The definition of this file is given in the velvet manual, at
|
13
|
+
# http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
|
14
|
+
class Sequences < Hash
|
15
|
+
include Bio::Velvet::Logging
|
16
|
+
|
17
|
+
def self.log
|
18
|
+
self.new.log
|
19
|
+
end
|
20
|
+
|
21
|
+
# Options:
|
22
|
+
# * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
|
23
|
+
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
24
|
+
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
25
|
+
# * :apply_grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
|
26
|
+
# hacky method is applied to the graph file, so only sequence data of interesting_read_ids is presented
|
27
|
+
# to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
|
28
|
+
# not be particularly future-proof. The value of this option is the amount of context coming out of grep
|
29
|
+
# (the -A flag). In the Sequence file the sequences are wrapped at 60 characters, so you'll need at
|
30
|
+
# least (longest_sequence_length / 60) + 2 amount of context. The reason for adding 2 is that the
|
31
|
+
# parser will then be able to detect insufficient context and raise an Exception, without
|
32
|
+
# throwing up false positive Exceptions.
|
33
|
+
def self.parse_from_file(path_to_sequence_file, options={})
|
34
|
+
seq_object = Bio::Velvet::Sequences.new
|
35
|
+
|
36
|
+
if options[:apply_grep_hack]
|
37
|
+
apply_grep_hack(seq_object, path_to_sequence_file, options[:interesting_read_ids], options[:apply_grep_hack])
|
38
|
+
else
|
39
|
+
# Parse all the sequences
|
40
|
+
Bio::FlatFile.foreach(path_to_sequence_file) do |seq|
|
41
|
+
read_id = seq.definition.split("\t")[1].to_i
|
42
|
+
if options[:interesting_read_ids].nil? or options[:interesting_read_ids].include?(read_id)
|
43
|
+
seq_object[read_id] = seq.seq.to_s
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
log.info "Read in #{seq_object.length} velvet stored sequences"
|
48
|
+
return seq_object
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
# Add the interesting sequences to the hash
|
53
|
+
def self.apply_grep_hack(seq_object, path_to_sequence_file, interesting_read_ids, grep_context)
|
54
|
+
return if interesting_read_ids.nil? or interesting_read_ids.empty?
|
55
|
+
|
56
|
+
Tempfile.open('grep_v_hack') do |tempfile|
|
57
|
+
# Create a file to pass to grep -f
|
58
|
+
unless interesting_read_ids.nil?
|
59
|
+
interesting_read_ids.each do |read_id|
|
60
|
+
tempfile.puts "\t#{read_id}\t" #the read_id is the second field of the header
|
61
|
+
end
|
62
|
+
end
|
63
|
+
tempfile.close
|
64
|
+
|
65
|
+
cmd = "grep -F -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_sequence_file.inspect}"
|
66
|
+
# TODO: make this call more robust
|
67
|
+
# grep_result = Bio::Commandeer.run cmd
|
68
|
+
s, grep_result, stderr = systemu cmd
|
69
|
+
|
70
|
+
# Parse the grepped out results
|
71
|
+
current_read_id = nil
|
72
|
+
current_seq = nil
|
73
|
+
last_sequence_line_length = nil
|
74
|
+
|
75
|
+
add_last_sequence = lambda do
|
76
|
+
if current_read_id
|
77
|
+
seq_object[current_read_id] = current_seq
|
78
|
+
end
|
79
|
+
end
|
80
|
+
grep_result.each_line do |line|
|
81
|
+
line.chomp!
|
82
|
+
if line[0] == '>'
|
83
|
+
# Process the last sequence
|
84
|
+
add_last_sequence.call unless current_read_id.nil?
|
85
|
+
|
86
|
+
# Assume the real sequence name contains no tabs
|
87
|
+
read_id = line.split("\t")[1]
|
88
|
+
raise "Unable to parse velvet Sequence file at this line #{line}" if read_id.nil?
|
89
|
+
read_id = read_id.to_i
|
90
|
+
if interesting_read_ids.include?(read_id)
|
91
|
+
# if current_read_id is nil, then we know we are uninterested in this sequence
|
92
|
+
current_read_id = read_id
|
93
|
+
else
|
94
|
+
current_read_id = nil
|
95
|
+
end
|
96
|
+
current_seq = nil
|
97
|
+
elsif line == '--'
|
98
|
+
# grep demarker.
|
99
|
+
add_last_sequence.call unless current_read_id.nil?
|
100
|
+
if last_sequence_line_length == 60
|
101
|
+
raise "Parsing exception when parsing velvet Sequence file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
102
|
+
end
|
103
|
+
else
|
104
|
+
# plain old sequence
|
105
|
+
unless current_read_id.nil?
|
106
|
+
current_seq ||= ''
|
107
|
+
current_seq += line
|
108
|
+
last_sequence_line_length = line.length
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
# process the last sequence
|
113
|
+
if last_sequence_line_length == 60
|
114
|
+
raise "Parsing exception when parsing velvet Sequence file at the end of the file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
115
|
+
end
|
116
|
+
add_last_sequence.call
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -66,7 +66,7 @@ describe "BioVelvet" do
|
|
66
66
|
# 49981 0 0
|
67
67
|
node = graph.nodes[967]
|
68
68
|
node.short_reads.nil?.should eq(false)
|
69
|
-
node.short_reads.length.should eq(2)
|
69
|
+
node.short_reads.length.should eq(2)
|
70
70
|
node.short_reads[0].read_id.should eq(49982)
|
71
71
|
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
72
72
|
node.short_reads[0].start_coord.should eq(0)
|
@@ -118,6 +118,142 @@ describe "BioVelvet" do
|
|
118
118
|
node.short_reads[0].offset_from_start_of_node.should eq(41)
|
119
119
|
end
|
120
120
|
|
121
|
+
it 'should be able to parse a read tracked graph, using the grep hack correct context' do
|
122
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
123
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
124
|
+
{:interesting_read_ids => Set.new([47223]),
|
125
|
+
:grep_hack => 5}
|
126
|
+
)
|
127
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
128
|
+
|
129
|
+
graph.number_of_nodes.should eq(967)
|
130
|
+
graph.number_of_sequences.should eq(50000)
|
131
|
+
graph.hash_length.should eq(31)
|
132
|
+
|
133
|
+
# NR -951 2
|
134
|
+
#47210 0 0
|
135
|
+
#47223 41 0
|
136
|
+
# ====later
|
137
|
+
# NR 951 2
|
138
|
+
# 47209 54 0
|
139
|
+
# 47224 0 0
|
140
|
+
node = graph.nodes[951]
|
141
|
+
node.short_reads.length.should eq(1)
|
142
|
+
# node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
|
143
|
+
node.short_reads[0].read_id.should eq(47223)
|
144
|
+
node.short_reads[0].offset_from_start_of_node.should eq(41)
|
145
|
+
end
|
146
|
+
|
147
|
+
it 'should be able to parse a read tracked graph, using the grep hack context beyond NR' do
|
148
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
149
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
150
|
+
{:interesting_read_ids => Set.new([47224]),
|
151
|
+
:grep_hack => 500}
|
152
|
+
)
|
153
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
154
|
+
|
155
|
+
graph.number_of_nodes.should eq(967)
|
156
|
+
graph.number_of_sequences.should eq(50000)
|
157
|
+
graph.hash_length.should eq(31)
|
158
|
+
|
159
|
+
# NR -951 2
|
160
|
+
#47210 0 0
|
161
|
+
#47223 41 0
|
162
|
+
# ====later
|
163
|
+
# NR 951 2
|
164
|
+
# 47209 54 0
|
165
|
+
# 47224 0 0
|
166
|
+
node = graph.nodes[951]
|
167
|
+
node.short_reads.length.should eq(1)
|
168
|
+
# node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
|
169
|
+
node.short_reads[0].read_id.should eq(47224)
|
170
|
+
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
171
|
+
end
|
172
|
+
|
173
|
+
it 'should be able to parse a read tracked graph, using the grep hack insufficient context' do
|
174
|
+
expect {Bio::Velvet::Graph.parse_from_file(
|
175
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
176
|
+
{:interesting_read_ids => Set.new([47224]),
|
177
|
+
:grep_hack => 0}
|
178
|
+
)}.to raise_error
|
179
|
+
end
|
180
|
+
|
181
|
+
it 'should grep target nodes with grep hack' do
|
182
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
183
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
184
|
+
{:interesting_node_ids => Set.new([951]),
|
185
|
+
:grep_hack => 500}
|
186
|
+
)
|
187
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
188
|
+
|
189
|
+
graph.number_of_nodes.should eq(967)
|
190
|
+
graph.number_of_sequences.should eq(50000)
|
191
|
+
graph.hash_length.should eq(31)
|
192
|
+
|
193
|
+
# NR -951 2
|
194
|
+
#47210 0 0
|
195
|
+
#47223 41 0
|
196
|
+
# ====later
|
197
|
+
# NR 951 2
|
198
|
+
# 47209 54 0
|
199
|
+
# 47224 0 0
|
200
|
+
node = graph.nodes[951]
|
201
|
+
node.short_reads.length.should eq(4)
|
202
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
|
203
|
+
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
204
|
+
end
|
205
|
+
|
206
|
+
it 'should grep target nodes without grep hack' do
|
207
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
208
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
209
|
+
{:interesting_node_ids => Set.new([951])}
|
210
|
+
)
|
211
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
212
|
+
|
213
|
+
graph.number_of_nodes.should eq(967)
|
214
|
+
graph.number_of_sequences.should eq(50000)
|
215
|
+
graph.hash_length.should eq(31)
|
216
|
+
|
217
|
+
# NR -951 2
|
218
|
+
#47210 0 0
|
219
|
+
#47223 41 0
|
220
|
+
# ====later
|
221
|
+
# NR 951 2
|
222
|
+
# 47209 54 0
|
223
|
+
# 47224 0 0
|
224
|
+
node = graph.nodes[951]
|
225
|
+
node.short_reads.length.should eq(4)
|
226
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
|
227
|
+
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should parse_additional_noded_reads with interesting_node_ids' do
|
231
|
+
graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
|
232
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
233
|
+
graph_file,
|
234
|
+
{:interesting_node_ids => [] }
|
235
|
+
)
|
236
|
+
graph.nodes[951].short_reads.should == nil
|
237
|
+
|
238
|
+
graph.parse_additional_noded_reads(graph_file, :interesting_node_ids => [951], :grep_hack => 500)
|
239
|
+
node = graph.nodes[951]
|
240
|
+
node.short_reads.length.should eq(4)
|
241
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
|
242
|
+
end
|
243
|
+
|
244
|
+
it 'should parse_additional_noded_reads with interesting_read_ids' do
|
245
|
+
graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
|
246
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
247
|
+
graph_file,
|
248
|
+
{:interesting_node_ids => [] }
|
249
|
+
)
|
250
|
+
graph.nodes[951].short_reads.should == nil
|
251
|
+
|
252
|
+
graph.parse_additional_noded_reads(graph_file, :interesting_read_ids => [47210], :grep_hack => 2)
|
253
|
+
node = graph.nodes[951]
|
254
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210]
|
255
|
+
end
|
256
|
+
|
121
257
|
it 'should return sets of arcs by id' do
|
122
258
|
graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
|
123
259
|
# ARC 2 -578 1
|
@@ -0,0 +1,24 @@
|
|
1
|
+
>1_1
|
2
|
+
CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATGTCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
|
3
|
+
>1_2
|
4
|
+
AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
|
5
|
+
>2_1
|
6
|
+
AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
|
7
|
+
>2_2
|
8
|
+
GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
|
9
|
+
>3_1
|
10
|
+
AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
|
11
|
+
>3_2
|
12
|
+
CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCGTGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
|
13
|
+
>4_1
|
14
|
+
AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGACTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
|
15
|
+
>4_2
|
16
|
+
CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTCGTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
|
17
|
+
>5_1
|
18
|
+
ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
|
19
|
+
>5_2
|
20
|
+
GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
|
21
|
+
>6_1
|
22
|
+
GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
|
23
|
+
>6_2
|
24
|
+
CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
|
@@ -0,0 +1,36 @@
|
|
1
|
+
>1_1 1 0
|
2
|
+
CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATG
|
3
|
+
TCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
|
4
|
+
>1_2 2 0
|
5
|
+
AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGT
|
6
|
+
GGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
|
7
|
+
>2_1 3 0
|
8
|
+
AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGC
|
9
|
+
AACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
|
10
|
+
>2_2 4 0
|
11
|
+
GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCC
|
12
|
+
CAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
|
13
|
+
>3_1 5 0
|
14
|
+
AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGA
|
15
|
+
AGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
|
16
|
+
>3_2 6 0
|
17
|
+
CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCG
|
18
|
+
TGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
|
19
|
+
>4_1 7 0
|
20
|
+
AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGA
|
21
|
+
CTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
|
22
|
+
>4_2 8 0
|
23
|
+
CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTC
|
24
|
+
GTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
|
25
|
+
>5_1 9 0
|
26
|
+
ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTA
|
27
|
+
TTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
|
28
|
+
>5_2 10 0
|
29
|
+
GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACG
|
30
|
+
GCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
|
31
|
+
>6_1 11 0
|
32
|
+
GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATT
|
33
|
+
GGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
|
34
|
+
>6_2 12 0
|
35
|
+
CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAG
|
36
|
+
TTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
|
@@ -0,0 +1,18 @@
|
|
1
|
+
>read1 1 0
|
2
|
+
CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
|
3
|
+
TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
|
4
|
+
ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
|
5
|
+
CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
|
6
|
+
GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
|
7
|
+
CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
|
8
|
+
ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
|
9
|
+
GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
|
10
|
+
ACTATGCTGGTATTTCACTTCCAGGTACAGG
|
11
|
+
>read2 2 0
|
12
|
+
ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
|
13
|
+
CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
|
14
|
+
ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
|
15
|
+
ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
|
16
|
+
TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
|
17
|
+
TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
|
18
|
+
TAATTAAATAAAAATATACTTCTTCTATTTTTAT
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
class String
|
5
|
+
def revcom
|
6
|
+
Bio::Sequences::NA.new(self).reverse_complement.to_s.upcase
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
seq1_sequence = 'CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
|
11
|
+
TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
|
12
|
+
ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
|
13
|
+
CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
|
14
|
+
GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
|
15
|
+
CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
|
16
|
+
ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
|
17
|
+
GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
|
18
|
+
ACTATGCTGGTATTTCACTTCCAGGTACAGG'.gsub("\n",'')
|
19
|
+
seq2_sequence = 'ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
|
20
|
+
CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
|
21
|
+
ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
|
22
|
+
ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
|
23
|
+
TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
|
24
|
+
TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
|
25
|
+
TAATTAAATAAAAATATACTTCTTCTATTTTTAT'.gsub("\n",'')
|
26
|
+
|
27
|
+
describe "BioVelvet" do
|
28
|
+
it "should be able to parse a whole Sequences file" do
|
29
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences')
|
30
|
+
seqs.should be_kind_of(Bio::Velvet::Sequences)
|
31
|
+
seqs.should be_kind_of(Hash)
|
32
|
+
seqs.keys.should == [1,2]
|
33
|
+
seqs[1].should == seq1_sequence
|
34
|
+
seqs[2].should == seq2_sequence
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should be able to read in an interesting seq only' do
|
38
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
39
|
+
:interesting_read_ids => [1]
|
40
|
+
seqs.keys.should == [1]
|
41
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
42
|
+
:interesting_read_ids => [2]
|
43
|
+
seqs.keys.should == [2]
|
44
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
45
|
+
:interesting_read_ids => [1,2]
|
46
|
+
seqs.keys.should == [1,2]
|
47
|
+
seqs[1].should == seq1_sequence
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'should be able to apply the grep hack' do
|
51
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
52
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 500}
|
53
|
+
seqs.keys.should == [1]
|
54
|
+
seqs[1].should == seq1_sequence
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should be able to apply the grep hack when there is only just enough context' do
|
58
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
59
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 9}
|
60
|
+
seqs.keys.should == [1]
|
61
|
+
seqs[1].should == seq1_sequence
|
62
|
+
end
|
63
|
+
|
64
|
+
it 'should warn when insufficient context is given' do
|
65
|
+
expect {
|
66
|
+
Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
67
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 8}
|
68
|
+
}.to raise_error
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should be able to handle multiple separated read ids with the grep hack' do
|
72
|
+
s = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
|
73
|
+
s = ['']+s.values
|
74
|
+
seq_file = File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
|
75
|
+
|
76
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
77
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 9}
|
78
|
+
seqs.keys.should == [1]
|
79
|
+
seqs[1].should == s[1]
|
80
|
+
|
81
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
82
|
+
{:interesting_read_ids => [1,2], :apply_grep_hack => 9}
|
83
|
+
seqs.keys.should == [1,2]
|
84
|
+
seqs.values.should == s[1..2]
|
85
|
+
|
86
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
87
|
+
{:interesting_read_ids => [1,2], :apply_grep_hack => 2}
|
88
|
+
seqs.keys.should == [1,2]
|
89
|
+
seqs.values.should == s[1..2]
|
90
|
+
|
91
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
92
|
+
{:interesting_read_ids => [1,5], :apply_grep_hack => 2}
|
93
|
+
seqs.keys.should == [1,5]
|
94
|
+
seqs.values.should == [s[1],s[5]]
|
95
|
+
|
96
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
97
|
+
{:interesting_read_ids => [1,5], :apply_grep_hack => 3}
|
98
|
+
seqs.keys.should == [1,5]
|
99
|
+
seqs.values.should == [s[1],s[5]]
|
100
|
+
|
101
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
102
|
+
{:interesting_read_ids => [1,12], :apply_grep_hack => 3}
|
103
|
+
seqs.keys.should == [1,12]
|
104
|
+
seqs.values.should == [s[1],s[12]]
|
105
|
+
|
106
|
+
expect {
|
107
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
108
|
+
{:interesting_read_ids => [12], :apply_grep_hack => 1}
|
109
|
+
seqs.keys.should == [1,12]
|
110
|
+
seqs.values.should == [s[1],s[12]]
|
111
|
+
}.to raise_error
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-velvet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01
|
11
|
+
date: 2014-04-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -155,12 +155,16 @@ files:
|
|
155
155
|
- lib/bio-velvet.rb
|
156
156
|
- lib/bio-velvet/graph.rb
|
157
157
|
- lib/bio-velvet/runner.rb
|
158
|
+
- lib/bio-velvet/sequences.rb
|
158
159
|
- spec/bio-velvet_arc_array_spec.rb
|
159
160
|
- spec/bio-velvet_graph_spec.rb
|
160
161
|
- spec/bio-velvet_runner_spec.rb
|
161
162
|
- spec/data/node_sequence/LastGraph
|
162
163
|
- spec/data/node_sequence/contigs.fa
|
163
164
|
- spec/data/runner_input.fa
|
165
|
+
- spec/data/sequence_spec/5seqs.fa
|
166
|
+
- spec/data/sequence_spec/5seqs.fa.Sequences
|
167
|
+
- spec/data/sequence_spec/Sequences
|
164
168
|
- spec/data/short_node_LastGraph
|
165
169
|
- spec/data/short_node_sequence_test_graph
|
166
170
|
- spec/data/velvet_test_reads_assembly/Graph
|
@@ -169,6 +173,7 @@ files:
|
|
169
173
|
- spec/data/velvet_test_reads_assembly_read_tracking/Graph2
|
170
174
|
- spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
|
171
175
|
- spec/data/velvet_test_trail_sequence_assembly/reads1.fa
|
176
|
+
- spec/sequences_spec.rb
|
172
177
|
- spec/spec_helper.rb
|
173
178
|
homepage: http://github.com/wwood/bioruby-velvet
|
174
179
|
licenses:
|
@@ -190,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
195
|
version: '0'
|
191
196
|
requirements: []
|
192
197
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.2.
|
198
|
+
rubygems_version: 2.2.2
|
194
199
|
signing_key:
|
195
200
|
specification_version: 4
|
196
201
|
summary: Parser to work with file formats used in the velvet DNA assembler
|