bio-velvet 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/bio-velvet.rb +1 -0
- data/lib/bio-velvet/graph.rb +143 -8
- data/lib/bio-velvet/runner.rb +1 -1
- data/lib/bio-velvet/sequences.rb +121 -0
- data/spec/bio-velvet_graph_spec.rb +137 -1
- data/spec/data/sequence_spec/5seqs.fa +24 -0
- data/spec/data/sequence_spec/5seqs.fa.Sequences +36 -0
- data/spec/data/sequence_spec/Sequences +18 -0
- data/spec/sequences_spec.rb +113 -0
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f88fe95bdfceafceb78b65c763675780ee12f1cf
|
4
|
+
data.tar.gz: 40f0b257c9e15ca861c844a51dbce319ee17453b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4de2138c28eb006fd872e518f7aa5172c04016aded296aeca1bdbeb27e98c7b99fa6f64b5fcf2252c0eff7dacccf08d9b2b4db3e38f05256d12b270cbc3fe19
|
7
|
+
data.tar.gz: f2fe259725937d1995f5a0269a3b6a3a2c3a55b8ecb30abc8e7f54dadfe6311a4d43a3df3bd588c49f3567a7ca9a1846885f0d51bbfb6d66bd84e3cf3b00937c
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/lib/bio-velvet.rb
CHANGED
data/lib/bio-velvet/graph.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'hopcsv'
|
2
2
|
require 'bio'
|
3
|
+
require 'tempfile'
|
3
4
|
|
4
5
|
module Bio
|
5
6
|
module Velvet
|
@@ -32,6 +33,13 @@ module Bio
|
|
32
33
|
# * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
|
33
34
|
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
34
35
|
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
36
|
+
# * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
|
37
|
+
# rather than particular reads.
|
38
|
+
# * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
|
39
|
+
# hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
|
40
|
+
# to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
|
41
|
+
# not be particularly future-proof. The value of this option is the amount of context coming out of grep
|
42
|
+
# (the -A and -B flags). Using 500 should probably work for most circumstances, if not an Exception will be raised.
|
35
43
|
def self.parse_from_file(path_to_graph_file, options={})
|
36
44
|
graph = self.new
|
37
45
|
state = :header
|
@@ -122,17 +130,26 @@ module Bio
|
|
122
130
|
# $READ_ID2 etc.
|
123
131
|
#p row
|
124
132
|
if row[0] == 'NR'
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
133
|
+
if options[:grep_hack]
|
134
|
+
unless options[:interesting_read_ids] or options[:interesting_node_ids]
|
135
|
+
raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
|
136
|
+
end
|
137
|
+
apply_grep_hack graph, path_to_graph_file, options[:interesting_read_ids], options[:interesting_node_ids], options[:grep_hack]
|
138
|
+
break #no more parsing is required
|
139
|
+
else
|
140
|
+
raise unless row.length == 3
|
141
|
+
node_pm = row[1].to_i
|
142
|
+
current_node_direction = node_pm > 0
|
143
|
+
current_node = graph.nodes[node_pm.abs]
|
144
|
+
current_node.number_of_short_reads ||= 0
|
145
|
+
current_node.number_of_short_reads += row[2].to_i
|
146
|
+
next
|
147
|
+
end
|
132
148
|
else
|
133
149
|
raise unless row.length == 3
|
134
150
|
read_id = row[0].to_i
|
135
|
-
if options[:
|
151
|
+
if (options[:interesting_node_ids] and !options[:interesting_node_ids].include?(current_node.node_id)) or
|
152
|
+
(options[:interesting_read_ids] and !options[:interesting_read_ids].include?(read_id))
|
136
153
|
# We have come across an uninteresting read. Ignore it.
|
137
154
|
next
|
138
155
|
end
|
@@ -224,6 +241,33 @@ module Bio
|
|
224
241
|
return deleted_nodes, deleted_arcs.flatten
|
225
242
|
end
|
226
243
|
|
244
|
+
# Add more noded reads to this already parsed graph. There is
|
245
|
+
# no guarantee that old NodedRead information is preserved, or removed.
|
246
|
+
#
|
247
|
+
# Options:
|
248
|
+
# * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
|
249
|
+
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
250
|
+
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
251
|
+
# * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
|
252
|
+
# rather than particular reads.
|
253
|
+
# * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
|
254
|
+
# hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
|
255
|
+
# to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
|
256
|
+
# not be particularly future-proof. The value of this option is the amount of context coming out of grep
|
257
|
+
# (the -A and -B flags). Using 500 should probably work for most circumstances, if not an Exception will be raised.
|
258
|
+
def parse_additional_noded_reads(path_to_graph_file, options)
|
259
|
+
grep_context = options[:grep_hack]
|
260
|
+
if grep_context.nil?
|
261
|
+
raise "Calling Graph#parse_additional_noded_reads without specifying :grep_hack is currently not implemented"
|
262
|
+
end
|
263
|
+
self.class.apply_grep_hack(self,
|
264
|
+
path_to_graph_file,
|
265
|
+
options[:interesting_read_ids],
|
266
|
+
options[:interesting_node_ids],
|
267
|
+
grep_context
|
268
|
+
)
|
269
|
+
end
|
270
|
+
|
227
271
|
|
228
272
|
|
229
273
|
|
@@ -522,6 +566,97 @@ module Bio
|
|
522
566
|
class NodedRead
|
523
567
|
attr_accessor :read_id, :offset_from_start_of_node, :start_coord, :direction
|
524
568
|
end
|
569
|
+
|
570
|
+
private
|
571
|
+
def self.apply_grep_hack(graph, path_to_graph_file, interesting_read_ids, interesting_node_ids, grep_context)
|
572
|
+
interesting_read_ids ||= []
|
573
|
+
interesting_node_ids ||= []
|
574
|
+
if interesting_read_ids.empty? and interesting_node_ids.empty?
|
575
|
+
log.debug "Nothing to grep for in grep hack" if log.debug?
|
576
|
+
return
|
577
|
+
end
|
578
|
+
|
579
|
+
Tempfile.open('grep_v_hack') do |tempfile|
|
580
|
+
# Create a file to pass to grep -f
|
581
|
+
unless interesting_read_ids.nil?
|
582
|
+
interesting_read_ids.each do |read_id|
|
583
|
+
tempfile.puts "^#{read_id}\t"
|
584
|
+
end
|
585
|
+
end
|
586
|
+
unless interesting_node_ids.nil?
|
587
|
+
interesting_node_ids.each do |node_id|
|
588
|
+
tempfile.puts "^NR\t#{node_id}\t"
|
589
|
+
tempfile.puts "^NR\t-#{node_id}\t"
|
590
|
+
end
|
591
|
+
end
|
592
|
+
tempfile.close
|
593
|
+
|
594
|
+
cmd = "grep -B #{grep_context.inspect} -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_graph_file.inspect}"
|
595
|
+
# TODO: make this call more robust
|
596
|
+
# grep_result = Bio::Commandeer.run cmd
|
597
|
+
s, grep_result, stderr = systemu cmd
|
598
|
+
|
599
|
+
# Parse the grepped out results
|
600
|
+
current_node = nil
|
601
|
+
current_node_direction = nil
|
602
|
+
in_nr_section = false
|
603
|
+
grep_result.each_line do |line|
|
604
|
+
row = line.split("\t")
|
605
|
+
if in_nr_section == false
|
606
|
+
# If there is a lot of context then the context includes ARC definitions etc. Skip past this.
|
607
|
+
if row[0] == 'NR'
|
608
|
+
in_nr_section = true
|
609
|
+
elsif row[0] == '--'
|
610
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
611
|
+
else
|
612
|
+
next #skip to next line, waiting to get into NR section
|
613
|
+
end
|
614
|
+
end
|
615
|
+
|
616
|
+
if line == "--\n" #the break introduced by grep
|
617
|
+
# If we encounter a grep break, but haven't assigned any nodes, then that's not good enough
|
618
|
+
if current_node.nil?
|
619
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
620
|
+
end
|
621
|
+
# reset the parsing situation
|
622
|
+
current_node = nil
|
623
|
+
elsif row[0] == 'NR'
|
624
|
+
raise unless row.length == 3
|
625
|
+
node_pm = row[1].to_i
|
626
|
+
current_node_direction = node_pm > 0
|
627
|
+
current_node = graph.nodes[node_pm.abs]
|
628
|
+
current_node.number_of_short_reads ||= 0
|
629
|
+
current_node.number_of_short_reads += row[2].to_i
|
630
|
+
next
|
631
|
+
else
|
632
|
+
raise unless row.length == 3
|
633
|
+
read_id = row[0].to_i
|
634
|
+
if (current_node.nil? or !interesting_node_ids.include?(current_node.node_id)) and
|
635
|
+
!interesting_read_ids.include?(read_id)
|
636
|
+
# We have come across an uninteresting read. Ignore it.
|
637
|
+
next
|
638
|
+
end
|
639
|
+
if current_node.nil?
|
640
|
+
# Came across a high coverage node, and grep isn't giving enough context. Hopefully this won't happen much
|
641
|
+
# particularly if the reads you are interested in are given to velvet first
|
642
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
643
|
+
end
|
644
|
+
nr = NodedRead.new
|
645
|
+
nr.read_id = read_id
|
646
|
+
nr.offset_from_start_of_node = row[1].to_i
|
647
|
+
nr.start_coord = row[2].to_i
|
648
|
+
nr.direction = current_node_direction
|
649
|
+
current_node.short_reads ||= []
|
650
|
+
current_node.short_reads.push nr
|
651
|
+
next
|
652
|
+
end
|
653
|
+
end
|
654
|
+
|
655
|
+
if current_node.nil?
|
656
|
+
raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
657
|
+
end
|
658
|
+
end
|
659
|
+
end
|
525
660
|
end
|
526
661
|
end
|
527
662
|
end
|
data/lib/bio-velvet/runner.rb
CHANGED
@@ -85,7 +85,7 @@ module Bio
|
|
85
85
|
# Return a Bio::Velvet::Graph object built from the LastGraph file.
|
86
86
|
# The options for parsing are as per Bio::Velvet::Graph#parse_from_file
|
87
87
|
def last_graph(options=nil)
|
88
|
-
Bio::Velvet::Graph.parse_from_file(last_graph_path)
|
88
|
+
Bio::Velvet::Graph.parse_from_file(last_graph_path, options)
|
89
89
|
end
|
90
90
|
end
|
91
91
|
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'hopcsv'
|
2
|
+
require 'bio'
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
module Bio
|
6
|
+
module Velvet
|
7
|
+
# Parser and container class for textual Sequence files
|
8
|
+
#
|
9
|
+
# After parsing, the result is a hash of read_id => sequence
|
10
|
+
# where read_id is an Integer and sequence a String
|
11
|
+
#
|
12
|
+
# The definition of this file is given in the velvet manual, at
|
13
|
+
# http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
|
14
|
+
class Sequences < Hash
|
15
|
+
include Bio::Velvet::Logging
|
16
|
+
|
17
|
+
def self.log
|
18
|
+
self.new.log
|
19
|
+
end
|
20
|
+
|
21
|
+
# Options:
|
22
|
+
# * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
|
23
|
+
# not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
|
24
|
+
# nodes and edges are parsed in. Using this option saves both memory and CPU.
|
25
|
+
# * :apply_grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
|
26
|
+
# hacky method is applied to the sequence file, so only sequence data of interesting_read_ids is presented
|
27
|
+
# to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
|
28
|
+
# not be particularly future-proof. The value of this option is the amount of context coming out of grep
|
29
|
+
# (the -A flag). In the Sequence file the sequences are wrapped at 60 characters, so you'll need at
|
30
|
+
# least (longest_sequence_length / 60) + 2 amount of context. The reason for adding 2 is that the
|
31
|
+
# parser will then be able to detect insufficient context and raise an Exception, without
|
32
|
+
# throwing up false positive Exceptions.
|
33
|
+
def self.parse_from_file(path_to_sequence_file, options={})
|
34
|
+
seq_object = Bio::Velvet::Sequences.new
|
35
|
+
|
36
|
+
if options[:apply_grep_hack]
|
37
|
+
apply_grep_hack(seq_object, path_to_sequence_file, options[:interesting_read_ids], options[:apply_grep_hack])
|
38
|
+
else
|
39
|
+
# Parse all the sequences
|
40
|
+
Bio::FlatFile.foreach(path_to_sequence_file) do |seq|
|
41
|
+
read_id = seq.definition.split("\t")[1].to_i
|
42
|
+
if options[:interesting_read_ids].nil? or options[:interesting_read_ids].include?(read_id)
|
43
|
+
seq_object[read_id] = seq.seq.to_s
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
log.info "Read in #{seq_object.length} velvet stored sequences"
|
48
|
+
return seq_object
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
# Add the interesting sequences to the hash
|
53
|
+
def self.apply_grep_hack(seq_object, path_to_sequence_file, interesting_read_ids, grep_context)
|
54
|
+
return if interesting_read_ids.nil? or interesting_read_ids.empty?
|
55
|
+
|
56
|
+
Tempfile.open('grep_v_hack') do |tempfile|
|
57
|
+
# Create a file to pass to grep -f
|
58
|
+
unless interesting_read_ids.nil?
|
59
|
+
interesting_read_ids.each do |read_id|
|
60
|
+
tempfile.puts "\t#{read_id}\t" #the read_id is the second field of the header
|
61
|
+
end
|
62
|
+
end
|
63
|
+
tempfile.close
|
64
|
+
|
65
|
+
cmd = "grep -F -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_sequence_file.inspect}"
|
66
|
+
# TODO: make this call more robust
|
67
|
+
# grep_result = Bio::Commandeer.run cmd
|
68
|
+
s, grep_result, stderr = systemu cmd
|
69
|
+
|
70
|
+
# Parse the grepped out results
|
71
|
+
current_read_id = nil
|
72
|
+
current_seq = nil
|
73
|
+
last_sequence_line_length = nil
|
74
|
+
|
75
|
+
add_last_sequence = lambda do
|
76
|
+
if current_read_id
|
77
|
+
seq_object[current_read_id] = current_seq
|
78
|
+
end
|
79
|
+
end
|
80
|
+
grep_result.each_line do |line|
|
81
|
+
line.chomp!
|
82
|
+
if line[0] == '>'
|
83
|
+
# Process the last sequence
|
84
|
+
add_last_sequence.call unless current_read_id.nil?
|
85
|
+
|
86
|
+
# Assume the real sequence name contains no tabs
|
87
|
+
read_id = line.split("\t")[1]
|
88
|
+
raise "Unable to parse velvet Sequence file at this line #{line}" if read_id.nil?
|
89
|
+
read_id = read_id.to_i
|
90
|
+
if interesting_read_ids.include?(read_id)
|
91
|
+
# if current_read_id is nil, then we know we are uninterested in this sequence
|
92
|
+
current_read_id = read_id
|
93
|
+
else
|
94
|
+
current_read_id = nil
|
95
|
+
end
|
96
|
+
current_seq = nil
|
97
|
+
elsif line == '--'
|
98
|
+
# grep demarker.
|
99
|
+
add_last_sequence.call unless current_read_id.nil?
|
100
|
+
if last_sequence_line_length == 60
|
101
|
+
raise "Parsing exception when parsing velvet Sequence file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
102
|
+
end
|
103
|
+
else
|
104
|
+
# plain old sequence
|
105
|
+
unless current_read_id.nil?
|
106
|
+
current_seq ||= ''
|
107
|
+
current_seq += line
|
108
|
+
last_sequence_line_length = line.length
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
# process the last sequence
|
113
|
+
if last_sequence_line_length == 60
|
114
|
+
raise "Parsing exception when parsing velvet Sequence file at the end of the file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
|
115
|
+
end
|
116
|
+
add_last_sequence.call
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -66,7 +66,7 @@ describe "BioVelvet" do
|
|
66
66
|
# 49981 0 0
|
67
67
|
node = graph.nodes[967]
|
68
68
|
node.short_reads.nil?.should eq(false)
|
69
|
-
node.short_reads.length.should eq(2)
|
69
|
+
node.short_reads.length.should eq(2)
|
70
70
|
node.short_reads[0].read_id.should eq(49982)
|
71
71
|
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
72
72
|
node.short_reads[0].start_coord.should eq(0)
|
@@ -118,6 +118,142 @@ describe "BioVelvet" do
|
|
118
118
|
node.short_reads[0].offset_from_start_of_node.should eq(41)
|
119
119
|
end
|
120
120
|
|
121
|
+
it 'should be able to parse a read tracked graph, using the grep hack correct context' do
|
122
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
123
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
124
|
+
{:interesting_read_ids => Set.new([47223]),
|
125
|
+
:grep_hack => 5}
|
126
|
+
)
|
127
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
128
|
+
|
129
|
+
graph.number_of_nodes.should eq(967)
|
130
|
+
graph.number_of_sequences.should eq(50000)
|
131
|
+
graph.hash_length.should eq(31)
|
132
|
+
|
133
|
+
# NR -951 2
|
134
|
+
#47210 0 0
|
135
|
+
#47223 41 0
|
136
|
+
# ====later
|
137
|
+
# NR 951 2
|
138
|
+
# 47209 54 0
|
139
|
+
# 47224 0 0
|
140
|
+
node = graph.nodes[951]
|
141
|
+
node.short_reads.length.should eq(1)
|
142
|
+
# node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
|
143
|
+
node.short_reads[0].read_id.should eq(47223)
|
144
|
+
node.short_reads[0].offset_from_start_of_node.should eq(41)
|
145
|
+
end
|
146
|
+
|
147
|
+
it 'should be able to parse a read tracked graph, using the grep hack context beyond NR' do
|
148
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
149
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
150
|
+
{:interesting_read_ids => Set.new([47224]),
|
151
|
+
:grep_hack => 500}
|
152
|
+
)
|
153
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
154
|
+
|
155
|
+
graph.number_of_nodes.should eq(967)
|
156
|
+
graph.number_of_sequences.should eq(50000)
|
157
|
+
graph.hash_length.should eq(31)
|
158
|
+
|
159
|
+
# NR -951 2
|
160
|
+
#47210 0 0
|
161
|
+
#47223 41 0
|
162
|
+
# ====later
|
163
|
+
# NR 951 2
|
164
|
+
# 47209 54 0
|
165
|
+
# 47224 0 0
|
166
|
+
node = graph.nodes[951]
|
167
|
+
node.short_reads.length.should eq(1)
|
168
|
+
# node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
|
169
|
+
node.short_reads[0].read_id.should eq(47224)
|
170
|
+
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
171
|
+
end
|
172
|
+
|
173
|
+
it 'should be able to parse a read tracked graph, using the grep hack insufficient context' do
|
174
|
+
expect {Bio::Velvet::Graph.parse_from_file(
|
175
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
176
|
+
{:interesting_read_ids => Set.new([47224]),
|
177
|
+
:grep_hack => 0}
|
178
|
+
)}.to raise_error
|
179
|
+
end
|
180
|
+
|
181
|
+
it 'should grep target nodes with grep hack' do
|
182
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
183
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
184
|
+
{:interesting_node_ids => Set.new([951]),
|
185
|
+
:grep_hack => 500}
|
186
|
+
)
|
187
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
188
|
+
|
189
|
+
graph.number_of_nodes.should eq(967)
|
190
|
+
graph.number_of_sequences.should eq(50000)
|
191
|
+
graph.hash_length.should eq(31)
|
192
|
+
|
193
|
+
# NR -951 2
|
194
|
+
#47210 0 0
|
195
|
+
#47223 41 0
|
196
|
+
# ====later
|
197
|
+
# NR 951 2
|
198
|
+
# 47209 54 0
|
199
|
+
# 47224 0 0
|
200
|
+
node = graph.nodes[951]
|
201
|
+
node.short_reads.length.should eq(4)
|
202
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
|
203
|
+
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
204
|
+
end
|
205
|
+
|
206
|
+
it 'should grep target nodes without grep hack' do
|
207
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
208
|
+
File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
|
209
|
+
{:interesting_node_ids => Set.new([951])}
|
210
|
+
)
|
211
|
+
graph.should be_kind_of(Bio::Velvet::Graph)
|
212
|
+
|
213
|
+
graph.number_of_nodes.should eq(967)
|
214
|
+
graph.number_of_sequences.should eq(50000)
|
215
|
+
graph.hash_length.should eq(31)
|
216
|
+
|
217
|
+
# NR -951 2
|
218
|
+
#47210 0 0
|
219
|
+
#47223 41 0
|
220
|
+
# ====later
|
221
|
+
# NR 951 2
|
222
|
+
# 47209 54 0
|
223
|
+
# 47224 0 0
|
224
|
+
node = graph.nodes[951]
|
225
|
+
node.short_reads.length.should eq(4)
|
226
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
|
227
|
+
node.short_reads[0].offset_from_start_of_node.should eq(0)
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should parse_additional_noded_reads with interesting_node_ids' do
|
231
|
+
graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
|
232
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
233
|
+
graph_file,
|
234
|
+
{:interesting_node_ids => [] }
|
235
|
+
)
|
236
|
+
graph.nodes[951].short_reads.should == nil
|
237
|
+
|
238
|
+
graph.parse_additional_noded_reads(graph_file, :interesting_node_ids => [951], :grep_hack => 500)
|
239
|
+
node = graph.nodes[951]
|
240
|
+
node.short_reads.length.should eq(4)
|
241
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
|
242
|
+
end
|
243
|
+
|
244
|
+
it 'should parse_additional_noded_reads with interesting_read_ids' do
|
245
|
+
graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
|
246
|
+
graph = Bio::Velvet::Graph.parse_from_file(
|
247
|
+
graph_file,
|
248
|
+
{:interesting_node_ids => [] }
|
249
|
+
)
|
250
|
+
graph.nodes[951].short_reads.should == nil
|
251
|
+
|
252
|
+
graph.parse_additional_noded_reads(graph_file, :interesting_read_ids => [47210], :grep_hack => 2)
|
253
|
+
node = graph.nodes[951]
|
254
|
+
node.short_reads.collect{|r| r.read_id}.should == [47210]
|
255
|
+
end
|
256
|
+
|
121
257
|
it 'should return sets of arcs by id' do
|
122
258
|
graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
|
123
259
|
# ARC 2 -578 1
|
@@ -0,0 +1,24 @@
|
|
1
|
+
>1_1
|
2
|
+
CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATGTCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
|
3
|
+
>1_2
|
4
|
+
AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
|
5
|
+
>2_1
|
6
|
+
AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
|
7
|
+
>2_2
|
8
|
+
GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
|
9
|
+
>3_1
|
10
|
+
AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
|
11
|
+
>3_2
|
12
|
+
CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCGTGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
|
13
|
+
>4_1
|
14
|
+
AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGACTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
|
15
|
+
>4_2
|
16
|
+
CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTCGTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
|
17
|
+
>5_1
|
18
|
+
ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
|
19
|
+
>5_2
|
20
|
+
GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
|
21
|
+
>6_1
|
22
|
+
GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
|
23
|
+
>6_2
|
24
|
+
CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
|
@@ -0,0 +1,36 @@
|
|
1
|
+
>1_1 1 0
|
2
|
+
CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATG
|
3
|
+
TCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
|
4
|
+
>1_2 2 0
|
5
|
+
AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGT
|
6
|
+
GGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
|
7
|
+
>2_1 3 0
|
8
|
+
AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGC
|
9
|
+
AACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
|
10
|
+
>2_2 4 0
|
11
|
+
GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCC
|
12
|
+
CAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
|
13
|
+
>3_1 5 0
|
14
|
+
AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGA
|
15
|
+
AGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
|
16
|
+
>3_2 6 0
|
17
|
+
CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCG
|
18
|
+
TGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
|
19
|
+
>4_1 7 0
|
20
|
+
AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGA
|
21
|
+
CTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
|
22
|
+
>4_2 8 0
|
23
|
+
CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTC
|
24
|
+
GTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
|
25
|
+
>5_1 9 0
|
26
|
+
ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTA
|
27
|
+
TTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
|
28
|
+
>5_2 10 0
|
29
|
+
GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACG
|
30
|
+
GCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
|
31
|
+
>6_1 11 0
|
32
|
+
GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATT
|
33
|
+
GGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
|
34
|
+
>6_2 12 0
|
35
|
+
CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAG
|
36
|
+
TTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
|
@@ -0,0 +1,18 @@
|
|
1
|
+
>read1 1 0
|
2
|
+
CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
|
3
|
+
TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
|
4
|
+
ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
|
5
|
+
CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
|
6
|
+
GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
|
7
|
+
CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
|
8
|
+
ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
|
9
|
+
GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
|
10
|
+
ACTATGCTGGTATTTCACTTCCAGGTACAGG
|
11
|
+
>read2 2 0
|
12
|
+
ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
|
13
|
+
CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
|
14
|
+
ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
|
15
|
+
ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
|
16
|
+
TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
|
17
|
+
TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
|
18
|
+
TAATTAAATAAAAATATACTTCTTCTATTTTTAT
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
class String
|
5
|
+
def revcom
|
6
|
+
Bio::Sequences::NA.new(self).reverse_complement.to_s.upcase
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
seq1_sequence = 'CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
|
11
|
+
TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
|
12
|
+
ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
|
13
|
+
CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
|
14
|
+
GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
|
15
|
+
CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
|
16
|
+
ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
|
17
|
+
GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
|
18
|
+
ACTATGCTGGTATTTCACTTCCAGGTACAGG'.gsub("\n",'')
|
19
|
+
seq2_sequence = 'ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
|
20
|
+
CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
|
21
|
+
ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
|
22
|
+
ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
|
23
|
+
TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
|
24
|
+
TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
|
25
|
+
TAATTAAATAAAAATATACTTCTTCTATTTTTAT'.gsub("\n",'')
|
26
|
+
|
27
|
+
describe "BioVelvet" do
|
28
|
+
it "should be able to parse a whole Sequences file" do
|
29
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences')
|
30
|
+
seqs.should be_kind_of(Bio::Velvet::Sequences)
|
31
|
+
seqs.should be_kind_of(Hash)
|
32
|
+
seqs.keys.should == [1,2]
|
33
|
+
seqs[1].should == seq1_sequence
|
34
|
+
seqs[2].should == seq2_sequence
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should be able to read in an interesting seq only' do
|
38
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
39
|
+
:interesting_read_ids => [1]
|
40
|
+
seqs.keys.should == [1]
|
41
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
42
|
+
:interesting_read_ids => [2]
|
43
|
+
seqs.keys.should == [2]
|
44
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
45
|
+
:interesting_read_ids => [1,2]
|
46
|
+
seqs.keys.should == [1,2]
|
47
|
+
seqs[1].should == seq1_sequence
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'should be able to apply the grep hack' do
|
51
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
52
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 500}
|
53
|
+
seqs.keys.should == [1]
|
54
|
+
seqs[1].should == seq1_sequence
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should be able to apply the grep hack when there is only just enough context' do
|
58
|
+
seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
59
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 9}
|
60
|
+
seqs.keys.should == [1]
|
61
|
+
seqs[1].should == seq1_sequence
|
62
|
+
end
|
63
|
+
|
64
|
+
it 'should warn when insufficient context is given' do
|
65
|
+
expect {
|
66
|
+
Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
|
67
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 8}
|
68
|
+
}.to raise_error
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should be able to handle multiple separated read ids with the grep hack' do
|
72
|
+
s = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
|
73
|
+
s = ['']+s.values
|
74
|
+
seq_file = File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
|
75
|
+
|
76
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
77
|
+
{:interesting_read_ids => [1], :apply_grep_hack => 9}
|
78
|
+
seqs.keys.should == [1]
|
79
|
+
seqs[1].should == s[1]
|
80
|
+
|
81
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
82
|
+
{:interesting_read_ids => [1,2], :apply_grep_hack => 9}
|
83
|
+
seqs.keys.should == [1,2]
|
84
|
+
seqs.values.should == s[1..2]
|
85
|
+
|
86
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
87
|
+
{:interesting_read_ids => [1,2], :apply_grep_hack => 2}
|
88
|
+
seqs.keys.should == [1,2]
|
89
|
+
seqs.values.should == s[1..2]
|
90
|
+
|
91
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
92
|
+
{:interesting_read_ids => [1,5], :apply_grep_hack => 2}
|
93
|
+
seqs.keys.should == [1,5]
|
94
|
+
seqs.values.should == [s[1],s[5]]
|
95
|
+
|
96
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
97
|
+
{:interesting_read_ids => [1,5], :apply_grep_hack => 3}
|
98
|
+
seqs.keys.should == [1,5]
|
99
|
+
seqs.values.should == [s[1],s[5]]
|
100
|
+
|
101
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
102
|
+
{:interesting_read_ids => [1,12], :apply_grep_hack => 3}
|
103
|
+
seqs.keys.should == [1,12]
|
104
|
+
seqs.values.should == [s[1],s[12]]
|
105
|
+
|
106
|
+
expect {
|
107
|
+
seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
|
108
|
+
{:interesting_read_ids => [12], :apply_grep_hack => 1}
|
109
|
+
seqs.keys.should == [1,12]
|
110
|
+
seqs.values.should == [s[1],s[12]]
|
111
|
+
}.to raise_error
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-velvet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01
|
11
|
+
date: 2014-04-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -155,12 +155,16 @@ files:
|
|
155
155
|
- lib/bio-velvet.rb
|
156
156
|
- lib/bio-velvet/graph.rb
|
157
157
|
- lib/bio-velvet/runner.rb
|
158
|
+
- lib/bio-velvet/sequences.rb
|
158
159
|
- spec/bio-velvet_arc_array_spec.rb
|
159
160
|
- spec/bio-velvet_graph_spec.rb
|
160
161
|
- spec/bio-velvet_runner_spec.rb
|
161
162
|
- spec/data/node_sequence/LastGraph
|
162
163
|
- spec/data/node_sequence/contigs.fa
|
163
164
|
- spec/data/runner_input.fa
|
165
|
+
- spec/data/sequence_spec/5seqs.fa
|
166
|
+
- spec/data/sequence_spec/5seqs.fa.Sequences
|
167
|
+
- spec/data/sequence_spec/Sequences
|
164
168
|
- spec/data/short_node_LastGraph
|
165
169
|
- spec/data/short_node_sequence_test_graph
|
166
170
|
- spec/data/velvet_test_reads_assembly/Graph
|
@@ -169,6 +173,7 @@ files:
|
|
169
173
|
- spec/data/velvet_test_reads_assembly_read_tracking/Graph2
|
170
174
|
- spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
|
171
175
|
- spec/data/velvet_test_trail_sequence_assembly/reads1.fa
|
176
|
+
- spec/sequences_spec.rb
|
172
177
|
- spec/spec_helper.rb
|
173
178
|
homepage: http://github.com/wwood/bioruby-velvet
|
174
179
|
licenses:
|
@@ -190,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
195
|
version: '0'
|
191
196
|
requirements: []
|
192
197
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.2.
|
198
|
+
rubygems_version: 2.2.2
|
194
199
|
signing_key:
|
195
200
|
specification_version: 4
|
196
201
|
summary: Parser to work with file formats used in the velvet DNA assembler
|