bio-velvet 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4150c97e0ffba2bebefade9a57dca252344e3550
4
- data.tar.gz: ce80f5a1d0ae5443ab707f1cd93efa576ab27fcf
3
+ metadata.gz: f88fe95bdfceafceb78b65c763675780ee12f1cf
4
+ data.tar.gz: 40f0b257c9e15ca861c844a51dbce319ee17453b
5
5
  SHA512:
6
- metadata.gz: 571473ce789272839ba47759af5f2c5a6e6cdfc97f206b21112549b25ac897eee7f1cad1b7cf08d59a703c448c75e50627e0c8e079cc477fcb2ea5632d6aaf9c
7
- data.tar.gz: 42d86334301a5fa559c3de76144951717bd20514b3025b594c052e99ce17414a0acb9d68a2215bef93b48b16f80ee39e926959ee2c0efad61543152d1263ee75
6
+ metadata.gz: d4de2138c28eb006fd872e518f7aa5172c04016aded296aeca1bdbeb27e98c7b99fa6f64b5fcf2252c0eff7dacccf08d9b2b4db3e38f05256d12b270cbc3fe19
7
+ data.tar.gz: f2fe259725937d1995f5a0269a3b6a3a2c3a55b8ecb30abc8e7f54dadfe6311a4d43a3df3bd588c49f3567a7ca9a1846885f0d51bbfb6d66bd84e3cf3b00937c
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/lib/bio-velvet.rb CHANGED
@@ -11,3 +11,4 @@ end
11
11
 
12
12
  require 'bio-velvet/graph'
13
13
  require 'bio-velvet/runner'
14
+ require 'bio-velvet/sequences'
@@ -1,5 +1,6 @@
1
1
  require 'hopcsv'
2
2
  require 'bio'
3
+ require 'tempfile'
3
4
 
4
5
  module Bio
5
6
  module Velvet
@@ -32,6 +33,13 @@ module Bio
32
33
  # * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
33
34
  # not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
34
35
  # nodes and edges are parsed in. Using this option saves both memory and CPU.
36
+ # * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
37
+ # rather than particular reads.
38
+ # * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
39
+ # hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
40
+ # to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
41
+ # not be particularly future-proof. The value of this option is the amount of context coming out of grep
42
+ # (the -A and -B flags). Using 500 should probably work for most circumstances, if not an Exception will be raised.
35
43
  def self.parse_from_file(path_to_graph_file, options={})
36
44
  graph = self.new
37
45
  state = :header
@@ -122,17 +130,26 @@ module Bio
122
130
  # $READ_ID2 etc.
123
131
  #p row
124
132
  if row[0] == 'NR'
125
- raise unless row.length == 3
126
- node_pm = row[1].to_i
127
- current_node_direction = node_pm > 0
128
- current_node = graph.nodes[node_pm.abs]
129
- current_node.number_of_short_reads ||= 0
130
- current_node.number_of_short_reads += row[2].to_i
131
- next
133
+ if options[:grep_hack]
134
+ unless options[:interesting_read_ids] or options[:interesting_node_ids]
135
+ raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
136
+ end
137
+ apply_grep_hack graph, path_to_graph_file, options[:interesting_read_ids], options[:interesting_node_ids], options[:grep_hack]
138
+ break #no more parsing is required
139
+ else
140
+ raise unless row.length == 3
141
+ node_pm = row[1].to_i
142
+ current_node_direction = node_pm > 0
143
+ current_node = graph.nodes[node_pm.abs]
144
+ current_node.number_of_short_reads ||= 0
145
+ current_node.number_of_short_reads += row[2].to_i
146
+ next
147
+ end
132
148
  else
133
149
  raise unless row.length == 3
134
150
  read_id = row[0].to_i
135
- if options[:interesting_read_ids] and !options[:interesting_read_ids].include?(read_id)
151
+ if (options[:interesting_node_ids] and !options[:interesting_node_ids].include?(current_node.node_id)) or
152
+ (options[:interesting_read_ids] and !options[:interesting_read_ids].include?(read_id))
136
153
  # We have come across an uninteresting read. Ignore it.
137
154
  next
138
155
  end
@@ -224,6 +241,33 @@ module Bio
224
241
  return deleted_nodes, deleted_arcs.flatten
225
242
  end
226
243
 
244
+ # Add more noded reads to this already parsed graph. There is
245
+ # no guarantee that old NodedRead information is preserved, or removed.
246
+ #
247
+ # Options:
248
+ # * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
249
+ # not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
250
+ # nodes and edges are parsed in. Using this option saves both memory and CPU.
251
+ # * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
252
+ # rather than particular reads.
253
+ # * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
254
+ # hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
255
+ # to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
256
+ # not be particularly future-proof. The value of this option is the amount of context coming out of grep
257
+ # (the -A and -B flags). Using 500 should probably work for most circumstances, if not an Exception will be raised.
258
+ def parse_additional_noded_reads(path_to_graph_file, options)
259
+ grep_context = options[:grep_hack]
260
+ if grep_context.nil?
261
+ raise "Calling Graph#parse_additional_noded_reads without specifying :grep_hack is currently not implemented"
262
+ end
263
+ self.class.apply_grep_hack(self,
264
+ path_to_graph_file,
265
+ options[:interesting_read_ids],
266
+ options[:interesting_node_ids],
267
+ grep_context
268
+ )
269
+ end
270
+
227
271
 
228
272
 
229
273
 
@@ -522,6 +566,97 @@ module Bio
522
566
  class NodedRead
523
567
  attr_accessor :read_id, :offset_from_start_of_node, :start_coord, :direction
524
568
  end
569
+
570
+ private
571
+ def self.apply_grep_hack(graph, path_to_graph_file, interesting_read_ids, interesting_node_ids, grep_context)
572
+ interesting_read_ids ||= []
573
+ interesting_node_ids ||= []
574
+ if interesting_read_ids.empty? and interesting_node_ids.empty?
575
+ log.debug "Nothing to grep for in grep hack" if log.debug?
576
+ return
577
+ end
578
+
579
+ Tempfile.open('grep_v_hack') do |tempfile|
580
+ # Create a file to pass to grep -f
581
+ unless interesting_read_ids.nil?
582
+ interesting_read_ids.each do |read_id|
583
+ tempfile.puts "^#{read_id}\t"
584
+ end
585
+ end
586
+ unless interesting_node_ids.nil?
587
+ interesting_node_ids.each do |node_id|
588
+ tempfile.puts "^NR\t#{node_id}\t"
589
+ tempfile.puts "^NR\t-#{node_id}\t"
590
+ end
591
+ end
592
+ tempfile.close
593
+
594
+ cmd = "grep -B #{grep_context.inspect} -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_graph_file.inspect}"
595
+ # TODO: make this call more robust
596
+ # grep_result = Bio::Commandeer.run cmd
597
+ s, grep_result, stderr = systemu cmd
598
+
599
+ # Parse the grepped out results
600
+ current_node = nil
601
+ current_node_direction = nil
602
+ in_nr_section = false
603
+ grep_result.each_line do |line|
604
+ row = line.split("\t")
605
+ if in_nr_section == false
606
+ # If there is a lot of context then the context includes ARC definitions etc. Skip past this.
607
+ if row[0] == 'NR'
608
+ in_nr_section = true
609
+ elsif row[0] == '--'
610
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
611
+ else
612
+ next #skip to next line, waiting to get into NR section
613
+ end
614
+ end
615
+
616
+ if line == "--\n" #the break introduced by grep
617
+ # If we encounter a grep break, but haven't assigned any nodes, then that's not good enough
618
+ if current_node.nil?
619
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
620
+ end
621
+ # reset the parsing situation
622
+ current_node = nil
623
+ elsif row[0] == 'NR'
624
+ raise unless row.length == 3
625
+ node_pm = row[1].to_i
626
+ current_node_direction = node_pm > 0
627
+ current_node = graph.nodes[node_pm.abs]
628
+ current_node.number_of_short_reads ||= 0
629
+ current_node.number_of_short_reads += row[2].to_i
630
+ next
631
+ else
632
+ raise unless row.length == 3
633
+ read_id = row[0].to_i
634
+ if (current_node.nil? or !interesting_node_ids.include?(current_node.node_id)) and
635
+ !interesting_read_ids.include?(read_id)
636
+ # We have come across an uninteresting read. Ignore it.
637
+ next
638
+ end
639
+ if current_node.nil?
640
+ # Came across a high coverage node, and grep isn't giving enough context. Hopefully this won't happen much
641
+ # particularly if the reads you are interested in are given to velvet first
642
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
643
+ end
644
+ nr = NodedRead.new
645
+ nr.read_id = read_id
646
+ nr.offset_from_start_of_node = row[1].to_i
647
+ nr.start_coord = row[2].to_i
648
+ nr.direction = current_node_direction
649
+ current_node.short_reads ||= []
650
+ current_node.short_reads.push nr
651
+ next
652
+ end
653
+ end
654
+
655
+ if current_node.nil?
656
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
657
+ end
658
+ end
659
+ end
525
660
  end
526
661
  end
527
662
  end
@@ -85,7 +85,7 @@ module Bio
85
85
  # Return a Bio::Velvet::Graph object built from the LastGraph file.
86
86
  # The options for parsing are as per Bio::Velvet::Graph#parse_from_file
87
87
  def last_graph(options=nil)
88
- Bio::Velvet::Graph.parse_from_file(last_graph_path)
88
+ Bio::Velvet::Graph.parse_from_file(last_graph_path, options)
89
89
  end
90
90
  end
91
91
  end
@@ -0,0 +1,121 @@
1
+ require 'hopcsv'
2
+ require 'bio'
3
+ require 'tempfile'
4
+
5
+ module Bio
6
+ module Velvet
7
+ # Parser and container class for textual Sequence files
8
+ #
9
+ # After parsing, the result is a hash of read_id => sequence
10
+ # where read_id is an Integer and sequence a String
11
+ #
12
+ # The definition of this file is given in the velvet manual, at
13
+ # http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
14
+ class Sequences < Hash
15
+ include Bio::Velvet::Logging
16
+
17
+ def self.log
18
+ self.new.log
19
+ end
20
+
21
+ # Options:
22
+ # * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Sequences
23
+ # of reads not of interest are not stored in the returned hash, which saves memory. When
24
+ # :apply_grep_hack is given and this option is nil or empty, no sequences are read in at all.
25
+ # * :apply_grep_hack: to make the parsing of sequences go even faster, a grep-based, rather
26
+ # hacky method is applied to the sequence file, so only sequence data of interesting_read_ids is presented
27
+ # to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
28
+ # not be particularly future-proof. The value of this option is the amount of context coming out of grep
29
+ # (the -A flag). In the Sequence file the sequences are wrapped at 60 characters, so you'll need at
30
+ # least (longest_sequence_length / 60) + 2 amount of context. The reason for adding 2 is that the
31
+ # parser will then be able to detect insufficient context and raise an Exception, without
32
+ # throwing up false positive Exceptions.
33
+ def self.parse_from_file(path_to_sequence_file, options={})
34
+ seq_object = Bio::Velvet::Sequences.new
35
+
36
+ if options[:apply_grep_hack]
37
+ apply_grep_hack(seq_object, path_to_sequence_file, options[:interesting_read_ids], options[:apply_grep_hack])
38
+ else
39
+ # Parse all the sequences
40
+ Bio::FlatFile.foreach(path_to_sequence_file) do |seq|
41
+ read_id = seq.definition.split("\t")[1].to_i
42
+ if options[:interesting_read_ids].nil? or options[:interesting_read_ids].include?(read_id)
43
+ seq_object[read_id] = seq.seq.to_s
44
+ end
45
+ end
46
+ end
47
+ log.info "Read in #{seq_object.length} velvet stored sequences"
48
+ return seq_object
49
+ end
50
+
51
+ private
52
+ # Add the interesting sequences to the hash
53
+ def self.apply_grep_hack(seq_object, path_to_sequence_file, interesting_read_ids, grep_context)
54
+ return if interesting_read_ids.nil? or interesting_read_ids.empty?
55
+
56
+ Tempfile.open('grep_v_hack') do |tempfile|
57
+ # Create a file to pass to grep -f
58
+ unless interesting_read_ids.nil?
59
+ interesting_read_ids.each do |read_id|
60
+ tempfile.puts "\t#{read_id}\t" #the read_id is the second field of the header
61
+ end
62
+ end
63
+ tempfile.close
64
+
65
+ cmd = "grep -F -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_sequence_file.inspect}"
66
+ # TODO: make this call more robust
67
+ # grep_result = Bio::Commandeer.run cmd
68
+ s, grep_result, stderr = systemu cmd
69
+
70
+ # Parse the grepped out results
71
+ current_read_id = nil
72
+ current_seq = nil
73
+ last_sequence_line_length = nil
74
+
75
+ add_last_sequence = lambda do
76
+ if current_read_id
77
+ seq_object[current_read_id] = current_seq
78
+ end
79
+ end
80
+ grep_result.each_line do |line|
81
+ line.chomp!
82
+ if line[0] == '>'
83
+ # Process the last sequence
84
+ add_last_sequence.call unless current_read_id.nil?
85
+
86
+ # Assume the real sequence name contains no tabs
87
+ read_id = line.split("\t")[1]
88
+ raise "Unable to parse velvet Sequence file at this line #{line}" if read_id.nil?
89
+ read_id = read_id.to_i
90
+ if interesting_read_ids.include?(read_id)
91
+ # if current_read_id is nil, then we know we are uninterested in this sequence
92
+ current_read_id = read_id
93
+ else
94
+ current_read_id = nil
95
+ end
96
+ current_seq = nil
97
+ elsif line == '--'
98
+ # grep demarker.
99
+ add_last_sequence.call unless current_read_id.nil?
100
+ if last_sequence_line_length == 60
101
+ raise "Parsing exception when parsing velvet Sequence file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
102
+ end
103
+ else
104
+ # plain old sequence
105
+ unless current_read_id.nil?
106
+ current_seq ||= ''
107
+ current_seq += line
108
+ last_sequence_line_length = line.length
109
+ end
110
+ end
111
+ end
112
+ # process the last sequence
113
+ if last_sequence_line_length == 60
114
+ raise "Parsing exception when parsing velvet Sequence file at the end of the file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
115
+ end
116
+ add_last_sequence.call
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -66,7 +66,7 @@ describe "BioVelvet" do
66
66
  # 49981 0 0
67
67
  node = graph.nodes[967]
68
68
  node.short_reads.nil?.should eq(false)
69
- node.short_reads.length.should eq(2), node.inspect
69
+ node.short_reads.length.should eq(2)
70
70
  node.short_reads[0].read_id.should eq(49982)
71
71
  node.short_reads[0].offset_from_start_of_node.should eq(0)
72
72
  node.short_reads[0].start_coord.should eq(0)
@@ -118,6 +118,142 @@ describe "BioVelvet" do
118
118
  node.short_reads[0].offset_from_start_of_node.should eq(41)
119
119
  end
120
120
 
121
+ it 'should be able to parse a read tracked graph, using the grep hack correct context' do
122
+ graph = Bio::Velvet::Graph.parse_from_file(
123
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
124
+ {:interesting_read_ids => Set.new([47223]),
125
+ :grep_hack => 5}
126
+ )
127
+ graph.should be_kind_of(Bio::Velvet::Graph)
128
+
129
+ graph.number_of_nodes.should eq(967)
130
+ graph.number_of_sequences.should eq(50000)
131
+ graph.hash_length.should eq(31)
132
+
133
+ # NR -951 2
134
+ #47210 0 0
135
+ #47223 41 0
136
+ # ====later
137
+ # NR 951 2
138
+ # 47209 54 0
139
+ # 47224 0 0
140
+ node = graph.nodes[951]
141
+ node.short_reads.length.should eq(1)
142
+ # node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
143
+ node.short_reads[0].read_id.should eq(47223)
144
+ node.short_reads[0].offset_from_start_of_node.should eq(41)
145
+ end
146
+
147
+ it 'should be able to parse a read tracked graph, using the grep hack context beyond NR' do
148
+ graph = Bio::Velvet::Graph.parse_from_file(
149
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
150
+ {:interesting_read_ids => Set.new([47224]),
151
+ :grep_hack => 500}
152
+ )
153
+ graph.should be_kind_of(Bio::Velvet::Graph)
154
+
155
+ graph.number_of_nodes.should eq(967)
156
+ graph.number_of_sequences.should eq(50000)
157
+ graph.hash_length.should eq(31)
158
+
159
+ # NR -951 2
160
+ #47210 0 0
161
+ #47223 41 0
162
+ # ====later
163
+ # NR 951 2
164
+ # 47209 54 0
165
+ # 47224 0 0
166
+ node = graph.nodes[951]
167
+ node.short_reads.length.should eq(1)
168
+ # node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
169
+ node.short_reads[0].read_id.should eq(47224)
170
+ node.short_reads[0].offset_from_start_of_node.should eq(0)
171
+ end
172
+
173
+ it 'should be able to parse a read tracked graph, using the grep hack insufficient context' do
174
+ expect {Bio::Velvet::Graph.parse_from_file(
175
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
176
+ {:interesting_read_ids => Set.new([47224]),
177
+ :grep_hack => 0}
178
+ )}.to raise_error
179
+ end
180
+
181
+ it 'should grep target nodes with grep hack' do
182
+ graph = Bio::Velvet::Graph.parse_from_file(
183
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
184
+ {:interesting_node_ids => Set.new([951]),
185
+ :grep_hack => 500}
186
+ )
187
+ graph.should be_kind_of(Bio::Velvet::Graph)
188
+
189
+ graph.number_of_nodes.should eq(967)
190
+ graph.number_of_sequences.should eq(50000)
191
+ graph.hash_length.should eq(31)
192
+
193
+ # NR -951 2
194
+ #47210 0 0
195
+ #47223 41 0
196
+ # ====later
197
+ # NR 951 2
198
+ # 47209 54 0
199
+ # 47224 0 0
200
+ node = graph.nodes[951]
201
+ node.short_reads.length.should eq(4)
202
+ node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
203
+ node.short_reads[0].offset_from_start_of_node.should eq(0)
204
+ end
205
+
206
+ it 'should grep target nodes without grep hack' do
207
+ graph = Bio::Velvet::Graph.parse_from_file(
208
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
209
+ {:interesting_node_ids => Set.new([951])}
210
+ )
211
+ graph.should be_kind_of(Bio::Velvet::Graph)
212
+
213
+ graph.number_of_nodes.should eq(967)
214
+ graph.number_of_sequences.should eq(50000)
215
+ graph.hash_length.should eq(31)
216
+
217
+ # NR -951 2
218
+ #47210 0 0
219
+ #47223 41 0
220
+ # ====later
221
+ # NR 951 2
222
+ # 47209 54 0
223
+ # 47224 0 0
224
+ node = graph.nodes[951]
225
+ node.short_reads.length.should eq(4)
226
+ node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
227
+ node.short_reads[0].offset_from_start_of_node.should eq(0)
228
+ end
229
+
230
+ it 'should parse_additional_noded_reads with interesting_node_ids' do
231
+ graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
232
+ graph = Bio::Velvet::Graph.parse_from_file(
233
+ graph_file,
234
+ {:interesting_node_ids => [] }
235
+ )
236
+ graph.nodes[951].short_reads.should == nil
237
+
238
+ graph.parse_additional_noded_reads(graph_file, :interesting_node_ids => [951], :grep_hack => 500)
239
+ node = graph.nodes[951]
240
+ node.short_reads.length.should eq(4)
241
+ node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
242
+ end
243
+
244
+ it 'should parse_additional_noded_reads with interesting_read_ids' do
245
+ graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
246
+ graph = Bio::Velvet::Graph.parse_from_file(
247
+ graph_file,
248
+ {:interesting_node_ids => [] }
249
+ )
250
+ graph.nodes[951].short_reads.should == nil
251
+
252
+ graph.parse_additional_noded_reads(graph_file, :interesting_read_ids => [47210], :grep_hack => 2)
253
+ node = graph.nodes[951]
254
+ node.short_reads.collect{|r| r.read_id}.should == [47210]
255
+ end
256
+
121
257
  it 'should return sets of arcs by id' do
122
258
  graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
123
259
  # ARC 2 -578 1
@@ -0,0 +1,24 @@
1
+ >1_1
2
+ CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATGTCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
3
+ >1_2
4
+ AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
5
+ >2_1
6
+ AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
7
+ >2_2
8
+ GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
9
+ >3_1
10
+ AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
11
+ >3_2
12
+ CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCGTGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
13
+ >4_1
14
+ AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGACTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
15
+ >4_2
16
+ CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTCGTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
17
+ >5_1
18
+ ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
19
+ >5_2
20
+ GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
21
+ >6_1
22
+ GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
23
+ >6_2
24
+ CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
@@ -0,0 +1,36 @@
1
+ >1_1 1 0
2
+ CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATG
3
+ TCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
4
+ >1_2 2 0
5
+ AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGT
6
+ GGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
7
+ >2_1 3 0
8
+ AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGC
9
+ AACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
10
+ >2_2 4 0
11
+ GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCC
12
+ CAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
13
+ >3_1 5 0
14
+ AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGA
15
+ AGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
16
+ >3_2 6 0
17
+ CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCG
18
+ TGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
19
+ >4_1 7 0
20
+ AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGA
21
+ CTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
22
+ >4_2 8 0
23
+ CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTC
24
+ GTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
25
+ >5_1 9 0
26
+ ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTA
27
+ TTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
28
+ >5_2 10 0
29
+ GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACG
30
+ GCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
31
+ >6_1 11 0
32
+ GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATT
33
+ GGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
34
+ >6_2 12 0
35
+ CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAG
36
+ TTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
@@ -0,0 +1,18 @@
1
+ >read1 1 0
2
+ CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
3
+ TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
4
+ ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
5
+ CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
6
+ GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
7
+ CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
8
+ ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
9
+ GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
10
+ ACTATGCTGGTATTTCACTTCCAGGTACAGG
11
+ >read2 2 0
12
+ ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
13
+ CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
14
+ ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
15
+ ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
16
+ TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
17
+ TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
18
+ TAATTAAATAAAAATATACTTCTTCTATTTTTAT
@@ -0,0 +1,113 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'bio'
3
+
4
+ class String
5
+ def revcom
6
+ Bio::Sequences::NA.new(self).reverse_complement.to_s.upcase
7
+ end
8
+ end
9
+
10
+ seq1_sequence = 'CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
11
+ TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
12
+ ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
13
+ CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
14
+ GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
15
+ CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
16
+ ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
17
+ GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
18
+ ACTATGCTGGTATTTCACTTCCAGGTACAGG'.gsub("\n",'')
19
+ seq2_sequence = 'ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
20
+ CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
21
+ ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
22
+ ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
23
+ TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
24
+ TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
25
+ TAATTAAATAAAAATATACTTCTTCTATTTTTAT'.gsub("\n",'')
26
+
27
+ describe "BioVelvet" do
28
+ it "should be able to parse a whole Sequences file" do
29
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences')
30
+ seqs.should be_kind_of(Bio::Velvet::Sequences)
31
+ seqs.should be_kind_of(Hash)
32
+ seqs.keys.should == [1,2]
33
+ seqs[1].should == seq1_sequence
34
+ seqs[2].should == seq2_sequence
35
+ end
36
+
37
+ it 'should be able to read in an interesting seq only' do
38
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
39
+ :interesting_read_ids => [1]
40
+ seqs.keys.should == [1]
41
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
42
+ :interesting_read_ids => [2]
43
+ seqs.keys.should == [2]
44
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
45
+ :interesting_read_ids => [1,2]
46
+ seqs.keys.should == [1,2]
47
+ seqs[1].should == seq1_sequence
48
+ end
49
+
50
+ it 'should be able to apply the grep hack' do
51
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
52
+ {:interesting_read_ids => [1], :apply_grep_hack => 500}
53
+ seqs.keys.should == [1]
54
+ seqs[1].should == seq1_sequence
55
+ end
56
+
57
+ it 'should be able to apply the grep hack when there is only just enough context' do
58
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
59
+ {:interesting_read_ids => [1], :apply_grep_hack => 9}
60
+ seqs.keys.should == [1]
61
+ seqs[1].should == seq1_sequence
62
+ end
63
+
64
+ it 'should warn when insufficient context is given' do
65
+ expect {
66
+ Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
67
+ {:interesting_read_ids => [1], :apply_grep_hack => 8}
68
+ }.to raise_error
69
+ end
70
+
71
+ it 'should be able to handle multiple separated read ids with the grep hack' do
72
+ s = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
73
+ s = ['']+s.values
74
+ seq_file = File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
75
+
76
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
77
+ {:interesting_read_ids => [1], :apply_grep_hack => 9}
78
+ seqs.keys.should == [1]
79
+ seqs[1].should == s[1]
80
+
81
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
82
+ {:interesting_read_ids => [1,2], :apply_grep_hack => 9}
83
+ seqs.keys.should == [1,2]
84
+ seqs.values.should == s[1..2]
85
+
86
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
87
+ {:interesting_read_ids => [1,2], :apply_grep_hack => 2}
88
+ seqs.keys.should == [1,2]
89
+ seqs.values.should == s[1..2]
90
+
91
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
92
+ {:interesting_read_ids => [1,5], :apply_grep_hack => 2}
93
+ seqs.keys.should == [1,5]
94
+ seqs.values.should == [s[1],s[5]]
95
+
96
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
97
+ {:interesting_read_ids => [1,5], :apply_grep_hack => 3}
98
+ seqs.keys.should == [1,5]
99
+ seqs.values.should == [s[1],s[5]]
100
+
101
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
102
+ {:interesting_read_ids => [1,12], :apply_grep_hack => 3}
103
+ seqs.keys.should == [1,12]
104
+ seqs.values.should == [s[1],s[12]]
105
+
106
+ expect {
107
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
108
+ {:interesting_read_ids => [12], :apply_grep_hack => 1}
109
+ seqs.keys.should == [1,12]
110
+ seqs.values.should == [s[1],s[12]]
111
+ }.to raise_error
112
+ end
113
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-velvet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-06 00:00:00.000000000 Z
11
+ date: 2014-04-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-logger
@@ -155,12 +155,16 @@ files:
155
155
  - lib/bio-velvet.rb
156
156
  - lib/bio-velvet/graph.rb
157
157
  - lib/bio-velvet/runner.rb
158
+ - lib/bio-velvet/sequences.rb
158
159
  - spec/bio-velvet_arc_array_spec.rb
159
160
  - spec/bio-velvet_graph_spec.rb
160
161
  - spec/bio-velvet_runner_spec.rb
161
162
  - spec/data/node_sequence/LastGraph
162
163
  - spec/data/node_sequence/contigs.fa
163
164
  - spec/data/runner_input.fa
165
+ - spec/data/sequence_spec/5seqs.fa
166
+ - spec/data/sequence_spec/5seqs.fa.Sequences
167
+ - spec/data/sequence_spec/Sequences
164
168
  - spec/data/short_node_LastGraph
165
169
  - spec/data/short_node_sequence_test_graph
166
170
  - spec/data/velvet_test_reads_assembly/Graph
@@ -169,6 +173,7 @@ files:
169
173
  - spec/data/velvet_test_reads_assembly_read_tracking/Graph2
170
174
  - spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
171
175
  - spec/data/velvet_test_trail_sequence_assembly/reads1.fa
176
+ - spec/sequences_spec.rb
172
177
  - spec/spec_helper.rb
173
178
  homepage: http://github.com/wwood/bioruby-velvet
174
179
  licenses:
@@ -190,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
190
195
  version: '0'
191
196
  requirements: []
192
197
  rubyforge_project:
193
- rubygems_version: 2.2.0
198
+ rubygems_version: 2.2.2
194
199
  signing_key:
195
200
  specification_version: 4
196
201
  summary: Parser to work with file formats used in the velvet DNA assembler