bio-velvet 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4150c97e0ffba2bebefade9a57dca252344e3550
4
- data.tar.gz: ce80f5a1d0ae5443ab707f1cd93efa576ab27fcf
3
+ metadata.gz: f88fe95bdfceafceb78b65c763675780ee12f1cf
4
+ data.tar.gz: 40f0b257c9e15ca861c844a51dbce319ee17453b
5
5
  SHA512:
6
- metadata.gz: 571473ce789272839ba47759af5f2c5a6e6cdfc97f206b21112549b25ac897eee7f1cad1b7cf08d59a703c448c75e50627e0c8e079cc477fcb2ea5632d6aaf9c
7
- data.tar.gz: 42d86334301a5fa559c3de76144951717bd20514b3025b594c052e99ce17414a0acb9d68a2215bef93b48b16f80ee39e926959ee2c0efad61543152d1263ee75
6
+ metadata.gz: d4de2138c28eb006fd872e518f7aa5172c04016aded296aeca1bdbeb27e98c7b99fa6f64b5fcf2252c0eff7dacccf08d9b2b4db3e38f05256d12b270cbc3fe19
7
+ data.tar.gz: f2fe259725937d1995f5a0269a3b6a3a2c3a55b8ecb30abc8e7f54dadfe6311a4d43a3df3bd588c49f3567a7ca9a1846885f0d51bbfb6d66bd84e3cf3b00937c
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/lib/bio-velvet.rb CHANGED
@@ -11,3 +11,4 @@ end
11
11
 
12
12
  require 'bio-velvet/graph'
13
13
  require 'bio-velvet/runner'
14
+ require 'bio-velvet/sequences'
@@ -1,5 +1,6 @@
1
1
  require 'hopcsv'
2
2
  require 'bio'
3
+ require 'tempfile'
3
4
 
4
5
  module Bio
5
6
  module Velvet
@@ -32,6 +33,13 @@ module Bio
32
33
  # * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
33
34
  # not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
34
35
  # nodes and edges are parsed in. Using this option saves both memory and CPU.
36
+ # * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
37
+ # rather than particular reads.
38
+ # * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
39
+ # hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
40
+ # to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
41
+ # not be particularly future-proof. The value of this option is the amount of context coming out of grep
42
+ # (the -A and -B flags). Using 500 should probably work for most circumstances, if not an Exception will be raised.
35
43
  def self.parse_from_file(path_to_graph_file, options={})
36
44
  graph = self.new
37
45
  state = :header
@@ -122,17 +130,26 @@ module Bio
122
130
  # $READ_ID2 etc.
123
131
  #p row
124
132
  if row[0] == 'NR'
125
- raise unless row.length == 3
126
- node_pm = row[1].to_i
127
- current_node_direction = node_pm > 0
128
- current_node = graph.nodes[node_pm.abs]
129
- current_node.number_of_short_reads ||= 0
130
- current_node.number_of_short_reads += row[2].to_i
131
- next
133
+ if options[:grep_hack]
134
+ unless options[:interesting_read_ids] or options[:interesting_node_ids]
135
+ raise "Programming error using bio-velvet: if :grep_hack is specified, then :interesting_read_ids or :interesting_node_ids must also be"
136
+ end
137
+ apply_grep_hack graph, path_to_graph_file, options[:interesting_read_ids], options[:interesting_node_ids], options[:grep_hack]
138
+ break #no more parsing is required
139
+ else
140
+ raise unless row.length == 3
141
+ node_pm = row[1].to_i
142
+ current_node_direction = node_pm > 0
143
+ current_node = graph.nodes[node_pm.abs]
144
+ current_node.number_of_short_reads ||= 0
145
+ current_node.number_of_short_reads += row[2].to_i
146
+ next
147
+ end
132
148
  else
133
149
  raise unless row.length == 3
134
150
  read_id = row[0].to_i
135
- if options[:interesting_read_ids] and !options[:interesting_read_ids].include?(read_id)
151
+ if (options[:interesting_node_ids] and !options[:interesting_node_ids].include?(current_node.node_id)) or
152
+ (options[:interesting_read_ids] and !options[:interesting_read_ids].include?(read_id))
136
153
  # We have come across an uninteresting read. Ignore it.
137
154
  next
138
155
  end
@@ -224,6 +241,33 @@ module Bio
224
241
  return deleted_nodes, deleted_arcs.flatten
225
242
  end
226
243
 
244
+ # Add more noded reads to this already parsed graph. There is
245
+ # no guarantee that old NodedRead information is preserved, or removed.
246
+ #
247
+ # Options:
248
+ # * :interesting_read_ids: If not nil, is a Set of read IDs that we are interested in. Reads
249
+ # not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
250
+ # nodes and edges are parsed in. Using this option saves both memory and CPU.
251
+ # * :interesting_node_ids: like :interesting_read_ids except it allows targeting of particular nodes
252
+ # rather than particular reads.
253
+ # * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
254
+ # hacky method is applied to the graph file, so only NR data of interesting_read_ids is presented
255
+ # to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
256
+ # not be particularly future-proof. The value of this option is the amount of context coming out of grep
257
+ # (the -A and -B flags). Using 500 should probably work for most circumstances, if not an Exception will be raised.
258
+ def parse_additional_noded_reads(path_to_graph_file, options)
259
+ grep_context = options[:grep_hack]
260
+ if grep_context.nil?
261
+ raise "Calling Graph#parse_additional_noded_reads without specifying :grep_hack is currently not implemented"
262
+ end
263
+ self.class.apply_grep_hack(self,
264
+ path_to_graph_file,
265
+ options[:interesting_read_ids],
266
+ options[:interesting_node_ids],
267
+ grep_context
268
+ )
269
+ end
270
+
227
271
 
228
272
 
229
273
 
@@ -522,6 +566,97 @@ module Bio
522
566
  class NodedRead
523
567
  attr_accessor :read_id, :offset_from_start_of_node, :start_coord, :direction
524
568
  end
569
+
570
+ private
571
+ def self.apply_grep_hack(graph, path_to_graph_file, interesting_read_ids, interesting_node_ids, grep_context)
572
+ interesting_read_ids ||= []
573
+ interesting_node_ids ||= []
574
+ if interesting_read_ids.empty? and interesting_node_ids.empty?
575
+ log.debug "Nothing to grep for in grep hack" if log.debug?
576
+ return
577
+ end
578
+
579
+ Tempfile.open('grep_v_hack') do |tempfile|
580
+ # Create a file to pass to grep -f
581
+ unless interesting_read_ids.nil?
582
+ interesting_read_ids.each do |read_id|
583
+ tempfile.puts "^#{read_id}\t"
584
+ end
585
+ end
586
+ unless interesting_node_ids.nil?
587
+ interesting_node_ids.each do |node_id|
588
+ tempfile.puts "^NR\t#{node_id}\t"
589
+ tempfile.puts "^NR\t-#{node_id}\t"
590
+ end
591
+ end
592
+ tempfile.close
593
+
594
+ cmd = "grep -B #{grep_context.inspect} -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_graph_file.inspect}"
595
+ # TODO: make this call more robust
596
+ # grep_result = Bio::Commandeer.run cmd
597
+ s, grep_result, stderr = systemu cmd
598
+
599
+ # Parse the grepped out results
600
+ current_node = nil
601
+ current_node_direction = nil
602
+ in_nr_section = false
603
+ grep_result.each_line do |line|
604
+ row = line.split("\t")
605
+ if in_nr_section == false
606
+ # If there is a lot of context then the context includes ARC definitions etc. Skip past this.
607
+ if row[0] == 'NR'
608
+ in_nr_section = true
609
+ elsif row[0] == '--'
610
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
611
+ else
612
+ next #skip to next line, waiting to ge into NR section
613
+ end
614
+ end
615
+
616
+ if line == "--\n" #the break introduced by grep
617
+ # If we encounter a grep break, but haven't assigned any nodes, then that's not good enough
618
+ if current_node.nil?
619
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
620
+ end
621
+ # reset the parsing situation
622
+ current_node = nil
623
+ elsif row[0] == 'NR'
624
+ raise unless row.length == 3
625
+ node_pm = row[1].to_i
626
+ current_node_direction = node_pm > 0
627
+ current_node = graph.nodes[node_pm.abs]
628
+ current_node.number_of_short_reads ||= 0
629
+ current_node.number_of_short_reads += row[2].to_i
630
+ next
631
+ else
632
+ raise unless row.length == 3
633
+ read_id = row[0].to_i
634
+ if (current_node.nil? or !interesting_node_ids.include?(current_node.node_id)) and
635
+ !interesting_read_ids.include?(read_id)
636
+ # We have come across an uninteresting read. Ignore it.
637
+ next
638
+ end
639
+ if current_node.nil?
640
+ # Came across a high coverage node, and grep isn't giving enough context. Hopefully this won't happen much
641
+ # particularly if the reads you are interested in are given to velvet first
642
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
643
+ end
644
+ nr = NodedRead.new
645
+ nr.read_id = read_id
646
+ nr.offset_from_start_of_node = row[1].to_i
647
+ nr.start_coord = row[2].to_i
648
+ nr.direction = current_node_direction
649
+ current_node.short_reads ||= []
650
+ current_node.short_reads.push nr
651
+ next
652
+ end
653
+ end
654
+
655
+ if current_node.nil?
656
+ raise "Parsing exception - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
657
+ end
658
+ end
659
+ end
525
660
  end
526
661
  end
527
662
  end
@@ -85,7 +85,7 @@ module Bio
85
85
  # Return a Bio::Velvet::Graph object built from the LastGraph file.
86
86
  # The options for parsing are as per Bio::Velvet::Graph#parse_from_file
87
87
  def last_graph(options=nil)
88
- Bio::Velvet::Graph.parse_from_file(last_graph_path)
88
+ Bio::Velvet::Graph.parse_from_file(last_graph_path, options)
89
89
  end
90
90
  end
91
91
  end
@@ -0,0 +1,121 @@
1
+ require 'hopcsv'
2
+ require 'bio'
3
+ require 'tempfile'
4
+
5
+ module Bio
6
+ module Velvet
7
+ # Parser and container class for textual Sequence files
8
+ #
9
+ # After parsing, the result is a hash of read_id => sequence
10
+ # where read_id is an Integer and sequence a String
11
+ #
12
+ # The definition of this file is given in the velvet manual, at
13
+ # http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
14
+ class Sequences < Hash
15
+ include Bio::Velvet::Logging
16
+
17
+ def self.log
18
+ self.new.log
19
+ end
20
+
21
+ # Options:
22
+ # * :interesting_read_ids: If not nil, is a Set of nodes that we are interested in. Reads
23
+ # not of interest will not be parsed in (the NR part of the velvet LastGraph file). Regardless all
24
+ # nodes and edges are parsed in. Using this options saves both memory and CPU.
25
+ # * :grep_hack: to make the parsing of read associations go even faster, a grep-based, rather
26
+ # hacky method is applied to the graph file, so only sequence data of interesting_read_ids is presented
27
+ # to the parser. This can save days of parsing time, but is a bit of a hack and its usage may
28
+ # not be particularly future-proof. The value of this option is the amount of context coming out of grep
29
+ # (the -A flag). In the Sequence file the sequences are wrapped at 60 characters, so you'll need at
30
+ # least (longest_sequence_length / 60) + 2 amount of context. The reason for adding 2 is that the
31
+ # parser will then be able to detect insufficient context and raise an Exception, without
32
+ # throwing up false positive Exceptions.
33
+ def self.parse_from_file(path_to_sequence_file, options={})
34
+ seq_object = Bio::Velvet::Sequences.new
35
+
36
+ if options[:apply_grep_hack]
37
+ apply_grep_hack(seq_object, path_to_sequence_file, options[:interesting_read_ids], options[:apply_grep_hack])
38
+ else
39
+ # Parse all the sequences
40
+ Bio::FlatFile.foreach(path_to_sequence_file) do |seq|
41
+ read_id = seq.definition.split("\t")[1].to_i
42
+ if options[:interesting_read_ids].nil? or options[:interesting_read_ids].include?(read_id)
43
+ seq_object[read_id] = seq.seq.to_s
44
+ end
45
+ end
46
+ end
47
+ log.info "Read in #{seq_object.length} velvet stored sequences"
48
+ return seq_object
49
+ end
50
+
51
+ private
52
+ # Add the interesting sequences to the hash
53
+ def self.apply_grep_hack(seq_object, path_to_sequence_file, interesting_read_ids, grep_context)
54
+ return if interesting_read_ids.nil? or interesting_read_ids.empty?
55
+
56
+ Tempfile.open('grep_v_hack') do |tempfile|
57
+ # Create a file to pass to grep -f
58
+ unless interesting_read_ids.nil?
59
+ interesting_read_ids.each do |read_id|
60
+ tempfile.puts "\t#{read_id}\t" #the read_id is the second field of the header
61
+ end
62
+ end
63
+ tempfile.close
64
+
65
+ cmd = "grep -F -A #{grep_context.inspect} -f #{tempfile.path} #{path_to_sequence_file.inspect}"
66
+ # TODO: make this call more robust
67
+ # grep_result = Bio::Commandeer.run cmd
68
+ s, grep_result, stderr = systemu cmd
69
+
70
+ # Parse the grepped out results
71
+ current_read_id = nil
72
+ current_seq = nil
73
+ last_sequence_line_length = nil
74
+
75
+ add_last_sequence = lambda do
76
+ if current_read_id
77
+ seq_object[current_read_id] = current_seq
78
+ end
79
+ end
80
+ grep_result.each_line do |line|
81
+ line.chomp!
82
+ if line[0] == '>'
83
+ # Process the last sequence
84
+ add_last_sequence.call unless current_read_id.nil?
85
+
86
+ # Assume the real sequence name contains no tabs
87
+ read_id = line.split("\t")[1]
88
+ raise "Unable to parse velvet Sequence file at this line #{line}" if read_id.nil?
89
+ read_id = read_id.to_i
90
+ if interesting_read_ids.include?(read_id)
91
+ # if current_read_id is nil, then we know we are uninterested in this sequence
92
+ current_read_id = read_id
93
+ else
94
+ current_read_id = nil
95
+ end
96
+ current_seq = nil
97
+ elsif line == '--'
98
+ # grep demarker.
99
+ add_last_sequence.call unless current_read_id.nil?
100
+ if last_sequence_line_length == 60
101
+ raise "Parsing exception when parsing velvet Sequence file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
102
+ end
103
+ else
104
+ # plain old sequence
105
+ unless current_read_id.nil?
106
+ current_seq ||= ''
107
+ current_seq += line
108
+ last_sequence_line_length = line.length
109
+ end
110
+ end
111
+ end
112
+ # process the last sequence
113
+ if last_sequence_line_length == 60
114
+ raise "Parsing exception when parsing velvet Sequence file at the end of the file - grep hack too hacky. Sorry. Try modifying the code to increase the default amount of context grep is giving"
115
+ end
116
+ add_last_sequence.call
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -66,7 +66,7 @@ describe "BioVelvet" do
66
66
  # 49981 0 0
67
67
  node = graph.nodes[967]
68
68
  node.short_reads.nil?.should eq(false)
69
- node.short_reads.length.should eq(2), node.inspect
69
+ node.short_reads.length.should eq(2)
70
70
  node.short_reads[0].read_id.should eq(49982)
71
71
  node.short_reads[0].offset_from_start_of_node.should eq(0)
72
72
  node.short_reads[0].start_coord.should eq(0)
@@ -118,6 +118,142 @@ describe "BioVelvet" do
118
118
  node.short_reads[0].offset_from_start_of_node.should eq(41)
119
119
  end
120
120
 
121
+ it 'should be able to parse a read tracked graph, using the grep hack correct context' do
122
+ graph = Bio::Velvet::Graph.parse_from_file(
123
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
124
+ {:interesting_read_ids => Set.new([47223]),
125
+ :grep_hack => 5}
126
+ )
127
+ graph.should be_kind_of(Bio::Velvet::Graph)
128
+
129
+ graph.number_of_nodes.should eq(967)
130
+ graph.number_of_sequences.should eq(50000)
131
+ graph.hash_length.should eq(31)
132
+
133
+ # NR -951 2
134
+ #47210 0 0
135
+ #47223 41 0
136
+ # ====later
137
+ # NR 951 2
138
+ # 47209 54 0
139
+ # 47224 0 0
140
+ node = graph.nodes[951]
141
+ node.short_reads.length.should eq(1)
142
+ # node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
143
+ node.short_reads[0].read_id.should eq(47223)
144
+ node.short_reads[0].offset_from_start_of_node.should eq(41)
145
+ end
146
+
147
+ it 'should be able to parse a read tracked graph, using the grep hack context beyond NR' do
148
+ graph = Bio::Velvet::Graph.parse_from_file(
149
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
150
+ {:interesting_read_ids => Set.new([47224]),
151
+ :grep_hack => 500}
152
+ )
153
+ graph.should be_kind_of(Bio::Velvet::Graph)
154
+
155
+ graph.number_of_nodes.should eq(967)
156
+ graph.number_of_sequences.should eq(50000)
157
+ graph.hash_length.should eq(31)
158
+
159
+ # NR -951 2
160
+ #47210 0 0
161
+ #47223 41 0
162
+ # ====later
163
+ # NR 951 2
164
+ # 47209 54 0
165
+ # 47224 0 0
166
+ node = graph.nodes[951]
167
+ node.short_reads.length.should eq(1)
168
+ # node.number_of_short_reads.should eq(4) #all bets off on this now, because not everything is parsed
169
+ node.short_reads[0].read_id.should eq(47224)
170
+ node.short_reads[0].offset_from_start_of_node.should eq(0)
171
+ end
172
+
173
+ it 'should be able to parse a read tracked graph, using the grep hack insufficient context' do
174
+ expect {Bio::Velvet::Graph.parse_from_file(
175
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
176
+ {:interesting_read_ids => Set.new([47224]),
177
+ :grep_hack => 0}
178
+ )}.to raise_error
179
+ end
180
+
181
+ it 'should grep target nodes with grep hack' do
182
+ graph = Bio::Velvet::Graph.parse_from_file(
183
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
184
+ {:interesting_node_ids => Set.new([951]),
185
+ :grep_hack => 500}
186
+ )
187
+ graph.should be_kind_of(Bio::Velvet::Graph)
188
+
189
+ graph.number_of_nodes.should eq(967)
190
+ graph.number_of_sequences.should eq(50000)
191
+ graph.hash_length.should eq(31)
192
+
193
+ # NR -951 2
194
+ #47210 0 0
195
+ #47223 41 0
196
+ # ====later
197
+ # NR 951 2
198
+ # 47209 54 0
199
+ # 47224 0 0
200
+ node = graph.nodes[951]
201
+ node.short_reads.length.should eq(4)
202
+ node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
203
+ node.short_reads[0].offset_from_start_of_node.should eq(0)
204
+ end
205
+
206
+ it 'should grep target nodes without grep hack' do
207
+ graph = Bio::Velvet::Graph.parse_from_file(
208
+ File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2'),
209
+ {:interesting_node_ids => Set.new([951])}
210
+ )
211
+ graph.should be_kind_of(Bio::Velvet::Graph)
212
+
213
+ graph.number_of_nodes.should eq(967)
214
+ graph.number_of_sequences.should eq(50000)
215
+ graph.hash_length.should eq(31)
216
+
217
+ # NR -951 2
218
+ #47210 0 0
219
+ #47223 41 0
220
+ # ====later
221
+ # NR 951 2
222
+ # 47209 54 0
223
+ # 47224 0 0
224
+ node = graph.nodes[951]
225
+ node.short_reads.length.should eq(4)
226
+ node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
227
+ node.short_reads[0].offset_from_start_of_node.should eq(0)
228
+ end
229
+
230
+ it 'should parse_additional_noded_reads with interesting_node_ids' do
231
+ graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
232
+ graph = Bio::Velvet::Graph.parse_from_file(
233
+ graph_file,
234
+ {:interesting_node_ids => [] }
235
+ )
236
+ graph.nodes[951].short_reads.should == nil
237
+
238
+ graph.parse_additional_noded_reads(graph_file, :interesting_node_ids => [951], :grep_hack => 500)
239
+ node = graph.nodes[951]
240
+ node.short_reads.length.should eq(4)
241
+ node.short_reads.collect{|r| r.read_id}.should == [47210, 47223, 47209, 47224]
242
+ end
243
+
244
+ it 'should parse_additional_noded_reads with interesting_read_ids' do
245
+ graph_file = File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly_read_tracking','Graph2')
246
+ graph = Bio::Velvet::Graph.parse_from_file(
247
+ graph_file,
248
+ {:interesting_node_ids => [] }
249
+ )
250
+ graph.nodes[951].short_reads.should == nil
251
+
252
+ graph.parse_additional_noded_reads(graph_file, :interesting_read_ids => [47210], :grep_hack => 2)
253
+ node = graph.nodes[951]
254
+ node.short_reads.collect{|r| r.read_id}.should == [47210]
255
+ end
256
+
121
257
  it 'should return sets of arcs by id' do
122
258
  graph = Bio::Velvet::Graph.parse_from_file File.join(TEST_DATA_DIR, 'velvet_test_reads_assembly','LastGraph')
123
259
  # ARC 2 -578 1
@@ -0,0 +1,24 @@
1
+ >1_1
2
+ CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATGTCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
3
+ >1_2
4
+ AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
5
+ >2_1
6
+ AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
7
+ >2_2
8
+ GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
9
+ >3_1
10
+ AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
11
+ >3_2
12
+ CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCGTGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
13
+ >4_1
14
+ AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGACTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
15
+ >4_2
16
+ CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTCGTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
17
+ >5_1
18
+ ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
19
+ >5_2
20
+ GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
21
+ >6_1
22
+ GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
23
+ >6_2
24
+ CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
@@ -0,0 +1,36 @@
1
+ >1_1 1 0
2
+ CAGCACCTGTGCTGGCGCTCCGAAGAGGGGCCGATGTTTCCACCGGCTTGCACCAGCATG
3
+ TCAAGCCCAGGTAAGGTTCTTCGCGTTGCTTCGAATTAAA
4
+ >1_2 2 0
5
+ AGGGTGCGAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGCCCGTTAAGT
6
+ GGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGT
7
+ >2_1 3 0
8
+ AGTCGAACGGGACTGGGGGCAACTCCAGTTCAGTGGCAGACGGGTGCGTAACACGTGAGC
9
+ AACTTGTCCGACGGCGGGGGATAGCCGGCCCAACGGCCGG
10
+ >2_2 4 0
11
+ GGAGTTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCTACGCGCCCTTTACGCC
12
+ CAGTGATTCCGGACAACGCTCGCACCCTCCGTATTACCGC
13
+ >3_1 5 0
14
+ AGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGA
15
+ AGAACCTTACCTGGGCTTGACATGCTGGTGCAAGCCGGTG
16
+ >3_2 6 0
17
+ CCGGCTTCCATGGCGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCG
18
+ TGGCTGATCCGCGATTACTAGCGATTCCGGCTTCATGCCG
19
+ >4_1 7 0
20
+ AGGTGGCCCGTTAAGTGGCTGGTGAAATCCCGGGGCTCAACTCCGGGGCTGCCGGTCAGA
21
+ CTGGCGAGCTAGAGCACGGTAGGGGCAGATGGAATTCCCG
22
+ >4_2 8 0
23
+ CGGCAGTCCCCCCAGAGTCCCCGGCCGAACCGCTGGCAACTGGGAGCGAGAGTTGCGCTC
24
+ GTTGCGGGACTTAACCCAACATCTCACGACACGAGCTGAC
25
+ >5_1 9 0
26
+ ACGGGCCACCTACGCGCCCTTTACGCCCAGTGATTCCGGACAACGCTCGCACCCTCCGTA
27
+ TTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCCTT
28
+ >5_2 10 0
29
+ GCAGACGGGTGCGTAACACGTGAGCAACTTGTCCGACGGCGGGGGATAGCCGGCCCAACG
30
+ GCCGGGTAATACCGCGTACGCTCGTTTAGGGACATCCCTG
31
+ >6_1 11 0
32
+ GGGTAACGGCCCACCAAGGCGACGACGGGTAGCTGGTCTGAGAGGATGGCCAGCCACATT
33
+ GGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAG
34
+ >6_2 12 0
35
+ CGGGAATTCCATCTGCCCCTACCGTGCTCTAGCTCGCCAGTCTGACCGGCAGCCCCGGAG
36
+ TTGAGCCCCGGGATTTCACCAGCCACTTAACGGGCCACCT
@@ -0,0 +1,18 @@
1
+ >read1 1 0
2
+ CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
3
+ TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
4
+ ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
5
+ CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
6
+ GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
7
+ CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
8
+ ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
9
+ GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
10
+ ACTATGCTGGTATTTCACTTCCAGGTACAGG
11
+ >read2 2 0
12
+ ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
13
+ CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
14
+ ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
15
+ ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
16
+ TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
17
+ TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
18
+ TAATTAAATAAAAATATACTTCTTCTATTTTTAT
@@ -0,0 +1,113 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'bio'
3
+
4
+ class String
5
+ def revcom
6
+ Bio::Sequences::NA.new(self).reverse_complement.to_s.upcase
7
+ end
8
+ end
9
+
10
+ seq1_sequence = 'CACTTATCTCTACCAAAGATCACGATTTAGAATCAAACTATAAAGTTTTAGAAGATAAAG
11
+ TAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAG
12
+ ATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATA
13
+ CGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATG
14
+ GACGAGTTATATTTACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTC
15
+ CTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATG
16
+ ATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAA
17
+ GTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAA
18
+ ACTATGCTGGTATTTCACTTCCAGGTACAGG'.gsub("\n",'')
19
+ seq2_sequence = 'ACTGGTTTAAAAGAAGGAGATTACTTTATAAAAGAAGAAAAAGCTCCTAAAGGGTATAGC
20
+ CTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTAT
21
+ ACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAAT
22
+ ATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATT
23
+ TCACTTCCAGGTACAGGTGGAATTGGAACAGATGGATTCATTAAAATAGGGCTAGTTTTA
24
+ TTAGGGGTTGTTATTATTCTAGGTGCAGGATATGTTGTCTTAGATAAAAGAAAGAGAATT
25
+ TAATTAAATAAAAATATACTTCTTCTATTTTTAT'.gsub("\n",'')
26
+
27
+ describe "BioVelvet" do
28
+ it "should be able to parse a whole Sequences file" do
29
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences')
30
+ seqs.should be_kind_of(Bio::Velvet::Sequences)
31
+ seqs.should be_kind_of(Hash)
32
+ seqs.keys.should == [1,2]
33
+ seqs[1].should == seq1_sequence
34
+ seqs[2].should == seq2_sequence
35
+ end
36
+
37
+ it 'should be able to read in an interesting seq only' do
38
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
39
+ :interesting_read_ids => [1]
40
+ seqs.keys.should == [1]
41
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
42
+ :interesting_read_ids => [2]
43
+ seqs.keys.should == [2]
44
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
45
+ :interesting_read_ids => [1,2]
46
+ seqs.keys.should == [1,2]
47
+ seqs[1].should == seq1_sequence
48
+ end
49
+
50
+ it 'should be able to apply the grep hack' do
51
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
52
+ {:interesting_read_ids => [1], :apply_grep_hack => 500}
53
+ seqs.keys.should == [1]
54
+ seqs[1].should == seq1_sequence
55
+ end
56
+
57
+ it 'should be able to apply the grep hack when there is only just enough context' do
58
+ seqs = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
59
+ {:interesting_read_ids => [1], :apply_grep_hack => 9}
60
+ seqs.keys.should == [1]
61
+ seqs[1].should == seq1_sequence
62
+ end
63
+
64
+ it 'should warn when insufficient context is given' do
65
+ expect {
66
+ Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','Sequences'),
67
+ {:interesting_read_ids => [1], :apply_grep_hack => 8}
68
+ }.to raise_error
69
+ end
70
+
71
+ it 'should be able to handle multiple separated read ids with the grep hack' do
72
+ s = Bio::Velvet::Sequences.parse_from_file File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
73
+ s = ['']+s.values
74
+ seq_file = File.join(TEST_DATA_DIR, 'sequence_spec','5seqs.fa.Sequences')
75
+
76
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
77
+ {:interesting_read_ids => [1], :apply_grep_hack => 9}
78
+ seqs.keys.should == [1]
79
+ seqs[1].should == s[1]
80
+
81
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
82
+ {:interesting_read_ids => [1,2], :apply_grep_hack => 9}
83
+ seqs.keys.should == [1,2]
84
+ seqs.values.should == s[1..2]
85
+
86
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
87
+ {:interesting_read_ids => [1,2], :apply_grep_hack => 2}
88
+ seqs.keys.should == [1,2]
89
+ seqs.values.should == s[1..2]
90
+
91
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
92
+ {:interesting_read_ids => [1,5], :apply_grep_hack => 2}
93
+ seqs.keys.should == [1,5]
94
+ seqs.values.should == [s[1],s[5]]
95
+
96
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
97
+ {:interesting_read_ids => [1,5], :apply_grep_hack => 3}
98
+ seqs.keys.should == [1,5]
99
+ seqs.values.should == [s[1],s[5]]
100
+
101
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
102
+ {:interesting_read_ids => [1,12], :apply_grep_hack => 3}
103
+ seqs.keys.should == [1,12]
104
+ seqs.values.should == [s[1],s[12]]
105
+
106
+ expect {
107
+ seqs = Bio::Velvet::Sequences.parse_from_file seq_file,
108
+ {:interesting_read_ids => [12], :apply_grep_hack => 1}
109
+ seqs.keys.should == [1,12]
110
+ seqs.values.should == [s[1],s[12]]
111
+ }.to raise_error
112
+ end
113
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-velvet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-06 00:00:00.000000000 Z
11
+ date: 2014-04-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-logger
@@ -155,12 +155,16 @@ files:
155
155
  - lib/bio-velvet.rb
156
156
  - lib/bio-velvet/graph.rb
157
157
  - lib/bio-velvet/runner.rb
158
+ - lib/bio-velvet/sequences.rb
158
159
  - spec/bio-velvet_arc_array_spec.rb
159
160
  - spec/bio-velvet_graph_spec.rb
160
161
  - spec/bio-velvet_runner_spec.rb
161
162
  - spec/data/node_sequence/LastGraph
162
163
  - spec/data/node_sequence/contigs.fa
163
164
  - spec/data/runner_input.fa
165
+ - spec/data/sequence_spec/5seqs.fa
166
+ - spec/data/sequence_spec/5seqs.fa.Sequences
167
+ - spec/data/sequence_spec/Sequences
164
168
  - spec/data/short_node_LastGraph
165
169
  - spec/data/short_node_sequence_test_graph
166
170
  - spec/data/velvet_test_reads_assembly/Graph
@@ -169,6 +173,7 @@ files:
169
173
  - spec/data/velvet_test_reads_assembly_read_tracking/Graph2
170
174
  - spec/data/velvet_test_reads_assembly_read_tracking/HOWTO_RECREATE
171
175
  - spec/data/velvet_test_trail_sequence_assembly/reads1.fa
176
+ - spec/sequences_spec.rb
172
177
  - spec/spec_helper.rb
173
178
  homepage: http://github.com/wwood/bioruby-velvet
174
179
  licenses:
@@ -190,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
190
195
  version: '0'
191
196
  requirements: []
192
197
  rubyforge_project:
193
- rubygems_version: 2.2.0
198
+ rubygems_version: 2.2.2
194
199
  signing_key:
195
200
  specification_version: 4
196
201
  summary: Parser to work with file formats used in the velvet DNA assembler