bio-velvet_underground 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,33 @@
1
+ # This constants file is included during the installation make process, and
2
+ # so cannot rely on the rest of the bio-velvet_underground code
3
module Bio
  module Velvet
    class Underground
      # MAXKMERLENGTH of the default build; its library file name carries no
      # "-maxkmer" suffix (see library_location_of).
      DEFAULT_MAXKMERLENGTH = 31

      # Different versions of velvet are compiled on installation of bio-velvet_underground.
      # These are the different MAXKMERLENGTH parameters that are given to the velvet Makefile.
      # See the velvet manual for more information on this.
      #
      # Returns a fresh Array of Integers on every call.
      def self.max_kmers
        [31, 63, 127, 255]
      end

      # Where is the library given the max_kmer_length
      #
      # max_kmer_length: one of max_kmers, or nil for the default build.
      #
      # Returns the path of the shared library, inside the 'external'
      # directory that sits next to this file.
      # Raises RuntimeError when max_kmer_length is neither nil nor one of
      # max_kmers.
      def self.library_location_of(max_kmer_length=nil)
        known = max_kmer_length.nil? || max_kmers.include?(max_kmer_length)
        raise "bad max kmer length #{max_kmer_length}" unless known

        # Only non-default builds get a suffix in their file name.
        default_build = max_kmer_length.nil? || max_kmer_length == DEFAULT_MAXKMERLENGTH
        suffix = default_build ? '' : "-maxkmer#{max_kmer_length}"

        File.join(File.dirname(__FILE__), 'external', "libvelvet#{suffix}.so.1.0")
      end
    end
  end
end
@@ -0,0 +1,262 @@
1
+ require 'csv'
2
+
3
+ class Bio::Velvet::Underground
4
+
5
# Ruby view of a velvet graph held in native memory. Every query goes
# through the FFI functions and structs attached to
# Bio::Velvet::Underground.
class Graph
  attr_accessor :internal_graph_struct

  # Use parse_from_file, not #new
  #
  # graph_struct: struct-like object answering [:nodeCount] and
  # [:wordLength] (a GraphStruct when built by parse_from_file).
  def initialize(graph_struct)
    @internal_graph_struct = graph_struct
  end

  # Read in a graph from a file
  #
  # path: path to the graph file. Its first line must be tab-separated
  # with the hash (kmer) length in the third column.
  #
  # Returns a Graph. Raises RuntimeError when the file is empty, when the
  # first line has fewer than three columns, when the hash length is
  # outside 1..max_kmers.max, or when the native import hands back a NULL
  # pointer.
  def self.parse_from_file(path)
    # First read the first line of the file to determine which underground velvet library to load
    hash_length = nil
    CSV.foreach(path, :col_sep => "\t") do |row|
      raise "Badly formatted graph file" if row.length < 3
      hash_length = row[2].to_i
      if hash_length < 1 || hash_length > Bio::Velvet::Underground.max_kmers.max
        raise "unable to load velvet shared library for kmer length `#{hash_length}'"
      end
      break # only the header line is needed here
    end
    raise "No lines in graph file `#{path}', is it really a velvet LastGraph-type file?" if hash_length.nil?

    # setup FFI in the underground base class with the correct kmer length
    Bio::Velvet::Underground.attach_shared_library(:kmer => hash_length)

    # Using the loaded velvet library, do the actual import of the graph
    pointer = Bio::Velvet::Underground.importGraph path
    # Wrapping a NULL pointer would only fail later, on first field access.
    raise "velvet was unable to import a graph from `#{path}'" if pointer.null?

    Graph.new Bio::Velvet::Underground::GraphStruct.new(pointer)
  end

  # Returns a NodeArray giving access to the nodes of this graph.
  def nodes
    NodeArray.new self
  end

  # Returns the :nodeCount field of the underlying graph struct.
  def node_count
    @internal_graph_struct[:nodeCount]
  end

  # Returns the :wordLength field of the underlying graph struct.
  def hash_length
    @internal_graph_struct[:wordLength]
  end


  # Enumerable collection of the graph's nodes, indexed by node ID
  # starting at 1.
  class NodeArray
    include Enumerable

    def initialize(graph)
      @graph = graph
    end

    # Yields a Node for each node ID from 1 to length. Returns an
    # Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      (1..length).each do |node_id|
        yield self[node_id]
      end
    end

    # Number of nodes, as reported by the native nodeCount function.
    def length
      Bio::Velvet::Underground.nodeCount @graph.internal_graph_struct
    end

    # Returns the Node with the given ID, or nil when node_id is below 1
    # or above the graph's :nodeCount.
    def [](node_id)
      return nil if node_id < 1 || node_id > @graph.internal_graph_struct[:nodeCount]

      pointer = Bio::Velvet::Underground.getNodeInGraph @graph.internal_graph_struct, node_id
      Node.new(@graph, Bio::Velvet::Underground::NodeStruct.new(pointer))
    end
  end

  # One node of the graph, wrapping a NodeStruct.
  class Node
    # Letters indexed by the value getNucleotideInNode returns.
    NUCLEOTIDES = %w(A C G T)

    attr_accessor :internal_node_struct

    def initialize(graph, node_struct)
      @graph = graph
      @internal_node_struct = node_struct
    end

    # Returns the :ID field of the node struct.
    def node_id
      @internal_node_struct[:ID]
    end

    # Returns the :length field of the node struct.
    def length_alone
      @internal_node_struct[:length]
    end

    # Returns [virtualCoverage1, virtualCoverage2] from the node struct.
    def coverages
      [
        @internal_node_struct[:virtualCoverage1],
        @internal_node_struct[:virtualCoverage2],
      ]
    end

    # Returns a String with one letter per position 0...length_alone,
    # each looked up from getNucleotideInNode.
    def ends_of_kmers_of_node
      (0...length_alone).map do |i|
        NUCLEOTIDES[Bio::Velvet::Underground.getNucleotideInNode(@internal_node_struct, i)]
      end.join
    end

    # Same as ends_of_kmers_of_node, but for the twin node.
    def ends_of_kmers_of_twin_node
      twin.ends_of_kmers_of_node
    end

    # Returns the Node wrapping the result of getTwinNode. The wrapper is
    # created on first use and then reused.
    def twin
      @twin ||= Node.new(
        @graph,
        Bio::Velvet::Underground::NodeStruct.new(
          Bio::Velvet::Underground.getTwinNode(@internal_node_struct)
        )
      )
    end

    # Returns an Array of NodedRead (direction true), one per
    # ShortReadMarker reported for this node by getNodeReads /
    # getNodeReadCount.
    def fwd_short_reads
      array_start_pointer = Bio::Velvet::Underground.getNodeReads @internal_node_struct, @graph.internal_graph_struct
      num_short_reads = Bio::Velvet::Underground.getNodeReadCount @internal_node_struct, @graph.internal_graph_struct
      marker_size = Bio::Velvet::Underground::ShortReadMarker.size

      (0...num_short_reads).map do |i|
        # Use the fact that FFI pointers can do pointer arithmetic
        marker = Bio::Velvet::Underground::ShortReadMarker.new(array_start_pointer + (i * marker_size))
        NodedRead.new marker, true
      end
    end

    # Returns the twin node's fwd_short_reads (their direction is still
    # true; see short_reads).
    def rev_short_reads
      twin.fwd_short_reads
    end

    # Returns the forward reads followed by the twin's reads, the latter
    # with direction set to false.
    def short_reads
      fwd_short_reads + rev_short_reads.each { |read| read.direction = false }
    end

  end

  # TODO: this class is currently unimplemented.
  class ArcArray
    def initialize(graph_struct)
      @internal_graph_struct = graph_struct
    end

    # Not implemented: always raises RuntimeError.
    def get_arcs_by_node_id(node_id1, node_id2=nil)
      raise "ArcArray#get_arcs_by_node_id is not implemented"
    end
  end

  # A short read attached to a node, wrapping a ShortReadMarker.
  # direction is true for reads taken from the node itself and false for
  # reads that Node#short_reads took from the twin node.
  class NodedRead
    attr_accessor :direction

    def initialize(short_read_struct, direction)
      @internal_short_read_struct = short_read_struct
      @direction = direction
    end

    # Returns the :readID field of the marker.
    def read_id
      @internal_short_read_struct[:readID]
    end

    # Returns the :position field of the marker.
    def offset_from_start_of_node
      @internal_short_read_struct[:position]
    end

    # Returns the :offset field of the marker.
    def start_coord
      @internal_short_read_struct[:offset]
    end
  end
end
179
+
180
+
181
+ private
182
# FFI mirror of velvet's `struct graph_st`. The C declaration of each field
# is quoted next to it.
class GraphStruct < FFI::Struct
  #class struct graph_st {
  layout :nodes, :pointer, # Node **nodes;
  :arcLookupTable, :pointer, # Arc **arcLookupTable;
  :nodeReads, :pointer, # ShortReadMarker **nodeReads;
  :nodeReadCounts, :pointer, # IDnum *nodeReadCounts;
  :gapMarkers, :pointer, # GapMarker **gapMarkers;
  #TODO: here default compilation settings are assumed (CATEGORIES=2) - probably not a future-proof assumption
  :insertLengths0, :int64, # Coordinate insertLengths[CATEGORIES + 1];
  :insertLengths1, :int64, # Coordinate insertLengths[CATEGORIES + 1];
  :insertLengths2, :int64, # Coordinate insertLengths[CATEGORIES + 1];
  # These are C doubles, not pointers: declaring them as :double makes reads
  # return Floats and keeps the following fields at the right offsets even
  # where a pointer is not 8 bytes wide.
  :insertLengths_var0, :double, # double insertLengths_var[CATEGORIES + 1];
  :insertLengths_var1, :double, # double insertLengths_var[CATEGORIES + 1];
  :insertLengths_var2, :double, # double insertLengths_var[CATEGORIES + 1];
  :sequenceCount, :int32, # IDnum sequenceCount;
  :nodeCount, :int32, # IDnum nodeCount;
  :wordLength, :int, # int wordLength;
  :double_stranded, :bool # boolean double_stranded;
end
201
+
202
# FFI mirror of velvet's packed `struct node_st`; the C declaration of each
# member is quoted above the matching field.
class NodeStruct < FFI::Struct
  # The C struct is declared ATTRIBUTE_PACKED, so pack all members on a
  # 1 byte boundary.
  pack 1

  layout(
    # Node *twinNode; // 64
    :twinNode, :pointer,
    # Arc *arc; // 64
    :arc, :pointer,
    # Descriptor *descriptor; // 64
    :descriptor, :pointer,
    # PassageMarkerI marker; // 32
    :marker, :uint32,
    # IDnum length; // 32
    :length, :int32,
    # IDnum virtualCoverage[CATEGORIES]; // 32 * 2
    :virtualCoverage1, :int32,
    :virtualCoverage2, :int32,
    # IDnum originalVirtualCoverage[CATEGORIES]; // 32 * 2
    :originalVirtualCoverage1, :int32,
    :originalVirtualCoverage2, :int32,
    # IDnum ID; // 32
    :ID, :int32,
    # IDnum arcCount; // 32
    :arcCount, :int32,
    # boolean status; // 1
    :status, :int8,
    # boolean uniqueness; // 1
    :uniqueness, :int8
  )
end
220
+
221
# FFI mirror of velvet's packed `struct arc_st`; the C declaration of each
# member is quoted above the matching field.
class ArcStruct < FFI::Struct
  # The C struct is declared ATTRIBUTE_PACKED, so pack all members on a
  # 1 byte boundary.
  pack 1

  layout(
    # Arc *twinArc; // 64
    :twinArc, :pointer,
    # Arc *next; // 64
    :next, :pointer,
    # Arc *previous; // 64
    :previous, :pointer,
    # Arc *nextInLookupTable; // 64
    :nextInLookupTable, :pointer,
    # Node *destination; // 64
    :destination, :pointer,
    # IDnum multiplicity; // 32
    :multiplicity, :int32
  )
end
232
+
233
# FFI mirror of velvet's packed `struct shortReadMarker_st`; the C
# declaration of each member is quoted above the matching field.
class ShortReadMarker < FFI::Struct
  # The C struct is declared ATTRIBUTE_PACKED, so pack all members on a
  # 1 byte boundary.
  pack 1

  layout(
    # IDnum position;
    :position, :int32,
    # IDnum readID;
    :readID, :int32,
    # ShortLength offset;
    :offset, :int16
  )
end
241
+
242
# Attach the velvet graph functions used by Graph and its nested classes.
# Each call is preceded by the C prototype it binds, where one is known.
def self.attach_graph_functions
  attach_function :importGraph, [:string], :pointer
  attach_function :nodeCount, [:pointer], :int32
  # Arc *getArcBetweenNodes(Node * originNode, Node * destinationNode,
  # Graph * graph)
  attach_function :getArcBetweenNodes, [:pointer, :pointer, :pointer], :pointer

  # Nucleotide getNucleotideInNode(Node * node, Coordinate index) {
  # Coordinate is mapped to :int64 in GraphStruct (insertLengths), so the
  # index argument uses the same width here.
  attach_function :getNucleotideInNode, [:pointer, :int64], :char
  # IDnum getNodeID(Node * node)
  # Node *getNodeInGraph(Graph * graph, IDnum nodeID)
  attach_function :getNodeInGraph, [:pointer, :int32], :pointer
  # Node *getTwinNode(Node * node);
  attach_function :getTwinNode, [:pointer], :pointer

  # ShortReadMarker *getNodeReads(Node * node, Graph * graph);
  attach_function :getNodeReads, [:pointer, :pointer], :pointer
  # IDnum getNodeReadCount(Node * node, Graph * graph);
  attach_function :getNodeReadCount, [:pointer, :pointer], :int32
end
262
+ end
@@ -0,0 +1,59 @@
1
+ class Bio::Velvet::Underground
2
class Runner
  # Run velveth and velvetg, selecting the most memory efficient library for the purpose
  #
  # kmer: kmer size (integer)
  # velveth_options: Array of string options to velveth as on the cmdline, excluding the directory and kmer
  # velvetg_options: Array of string options to velvetg as on the cmdline, excluding the directory
  # options: other options:
  # :velvet_directory: where to run the velvets. Required (currently).
  #
  # Returns 0. Raises RuntimeError when :velvet_directory is missing or when
  # either velveth or velvetg returns a non-zero value.
  def self.run(kmer, velveth_options, velvetg_options=[], options={})
    #load library with appropriate kmer size
    Bio::Velvet::Underground.attach_shared_library(:kmer => kmer)

    velvet_directory = options[:velvet_directory]
    raise "Need options[:velvet_directory] to run velvet" if velvet_directory.nil?

    # velveth
    velveth_arguments = ['velveth', velvet_directory, kmer.to_s] + velveth_options.to_a
    returned = with_argv(velveth_arguments) do |argc, argv|
      Bio::Velvet::Underground.velveth argc, argv
    end
    raise "Error running velveth (#{returned})" unless returned == 0

    # velvetg
    velvetg_arguments = ['velvetg', velvet_directory] + velvetg_options.to_a
    returned = with_argv(velvetg_arguments) do |argc, argv|
      Bio::Velvet::Underground.velvetg argc, argv
    end
    raise "Error running velvetg (#{returned})" unless returned == 0

    0
  end

  # Build a NULL-terminated, C-style argv from an Array of Strings and yield
  # (argc, argv) to the block, returning the block's value.
  #
  # Can't just pass a regular Ruby array of strings, as explained at
  # http://zegoggl.es/2009/05/ruby-ffi-recipes.html
  def self.with_argv(arguments)
    # These buffers stay referenced by this local for the whole yield, so
    # they remain reachable while the native call reads them.
    string_pointers = arguments.map { |argument| FFI::MemoryPointer.from_string(argument) }

    # One extra slot for the terminating NULL.
    argv = FFI::MemoryPointer.new(:pointer, string_pointers.length + 1)
    (string_pointers + [nil]).each_with_index do |pointer, i|
      argv[i].put_pointer(0, pointer)
    end

    yield string_pointers.length, argv
  end
  private_class_method :with_argv
end
54
+
55
# Attach the velveth and velvetg entry points. Both are bound with the same
# signature: an :int32 and a :pointer in, an :int32 out (used by Runner.run
# as argc, argv and the return status).
def self.attach_runner_functions
  signature = [[:int32, :pointer], :int32]
  attach_function(:velveth, *signature)
  attach_function(:velvetg, *signature)
end
59
+ end
@@ -24,4 +24,16 @@ describe "binary sequence store" do
24
24
  seqs[78]
25
25
  }.to raise_error
26
26
  end
27
+
28
it 'should be able to understand mates' do
  store = Bio::Velvet::Underground::BinarySequenceStore.new(
    File.join(TEST_DATA_DIR, '2', 'CnyUnifiedSeq')
  )

  # read ID => whether it is expected to be the second read of its pair
  { 1 => false, 2 => true, 5 => false }.each do |read_id, second|
    store.is_second_in_pair?(read_id).should == second
  end

  # read ID => expected ID of its mate
  { 1 => 2, 2 => 1, 5 => 6, 6 => 5 }.each do |read_id, mate_id|
    store.pair_id(read_id).should == mate_id
  end
end
27
39
  end
Binary file
@@ -0,0 +1,40 @@
1
+ 4 5 31 1
2
+ NODE 1 228 1140 1140 0 0
3
+ AATCAAACTATAAAGTTTTAGAAGATAAAGTAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAGATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATACGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATGGACGAGTTATATTTACTG
4
+ TTACACCCTTTCCTTTTAATAAAACCACATTTCCAGTAGAATCATACGTATATTTACCAATTACATTACCATTTTTATCCCTAACAGAAAAAGCTGCGCCTGCAAGATCTATTGAAATATTTTCTGAATCTACTTTTTTAACTCCGAATCCCCATGTATAAGTTGTTACTTTATCTTCTAAAACTTTATAGTTTGATTCTAAATCGTGATCTTTGGTAGAGATAAGTG
5
+ NODE 2 29 58 58 0 0
6
+ GTTTAAAAGAAGGAGATTACTTTATAAAA
7
+ AGTAAATATAACTCGTCCATTTTTATCAG
8
+ NODE 3 224 1120 1120 0 0
9
+ GAAGAAAAAGCTCCTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATTTCACTTCCAGGTACAGG
10
+ TTTTTAATTTGTACATTAAATAATACATTGCCATCATTCATAGTAATATTATTTATTATACTTCCAGCTTCATTGCCATTAGTTACAGATATAGTTGCTTGACCAGTATACTCTCCATTATCATCTTTTTGAGCTGTTATAGTAACTTTTACTGGTTCTTTTAAAAGGCTATACCCTTTAGGAGCTTTTTCTTCTTTTATAAAGTAATCTCCTTCTTTTAAACC
11
+ NODE 4 38 114 114 0 0
12
+ CGGGGGGGGGTTTAAAAGAAGGAGATTACTTTATAAAA
13
+ CCCCCCCGCAGTAAATATAACTCGTCCATTTTTATCAG
14
+ ARC 1 2 2
15
+ ARC 1 4 3
16
+ ARC 2 3 2
17
+ ARC -3 -4 3
18
+ NR -4 1
19
+ 5 0 224
20
+ NR -3 2
21
+ 3 0 0
22
+ 5 0 0
23
+ NR -2 1
24
+ 3 0 224
25
+ NR -1 2
26
+ 3 0 253
27
+ 5 0 262
28
+ NR 1 3
29
+ 1 0 0
30
+ 2 0 0
31
+ 4 0 0
32
+ NR 2 1
33
+ 1 0 228
34
+ NR 3 3
35
+ 1 0 257
36
+ 2 0 266
37
+ 4 0 266
38
+ NR 4 2
39
+ 2 0 228
40
+ 4 0 228
@@ -0,0 +1,40 @@
1
+ 4 5 31 1
2
+ NODE 1 228 1140 1140 0 0
3
+ AATCAAACTATAAAGTTTTAGAAGATAAAGTAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAGATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATACGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATGGACGAGTTATATTTACTG
4
+ TTACACCCTTTCCTTTTAATAAAACCACATTTCCAGTAGAATCATACGTATATTTACCAATTACATTACCATTTTTATCCCTAACAGAAAAAGCTGCGCCTGCAAGATCTATTGAAATATTTTCTGAATCTACTTTTTTAACTCCGAATCCCCATGTATAAGTTGTTACTTTATCTTCTAAAACTTTATAGTTTGATTCTAAATCGTGATCTTTGGTAGAGATAAGTG
5
+ NODE 2 29 58 58 0 0
6
+ GTTTAAAAGAAGGAGATTACTTTATAAAA
7
+ AGTAAATATAACTCGTCCATTTTTATCAG
8
+ NODE 3 224 1120 1120 0 0
9
+ GAAGAAAAAGCTCCTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATTTCACTTCCAGGTACAGG
10
+ TTTTTAATTTGTACATTAAATAATACATTGCCATCATTCATAGTAATATTATTTATTATACTTCCAGCTTCATTGCCATTAGTTACAGATATAGTTGCTTGACCAGTATACTCTCCATTATCATCTTTTTGAGCTGTTATAGTAACTTTTACTGGTTCTTTTAAAAGGCTATACCCTTTAGGAGCTTTTTCTTCTTTTATAAAGTAATCTCCTTCTTTTAAACC
11
+ NODE 4 38 114 114 0 0
12
+ CGGGGGGGGGTTTAAAAGAAGGAGATTACTTTATAAAA
13
+ CCCCCCCGCAGTAAATATAACTCGTCCATTTTTATCAG
14
+ ARC 1 2 2
15
+ ARC 1 4 3
16
+ ARC 2 3 2
17
+ ARC -3 -4 3
18
+ NR -4 1
19
+ 5 0 224
20
+ NR -3 2
21
+ 3 0 0
22
+ 5 0 0
23
+ NR -2 1
24
+ 3 0 224
25
+ NR -1 2
26
+ 3 0 253
27
+ 5 0 262
28
+ NR 1 3
29
+ 1 0 0
30
+ 2 0 0
31
+ 4 0 0
32
+ NR 2 1
33
+ 1 0 228
34
+ NR 3 3
35
+ 1 0 257
36
+ 2 0 266
37
+ 4 0 266
38
+ NR 4 2
39
+ 2 0 228
40
+ 4 0 228