bio-velvet_underground 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ # This constants file is included during the installation make process, and
2
+ # so cannot rely on the rest of the bio-velvet_underground code
3
+ module Bio
4
+ module Velvet
5
+ class Underground
6
+ DEFAULT_MAXKMERLENGTH=31
7
+
8
+ # Different versions of velvet are compiled on installation of bio-velvet_underground.
9
+ # These are the different MAXKMERLENGTH parameters that are given to the velvet Makefile.
10
+ # See the velvet manual for more information on this.
11
+ def self.max_kmers
12
+ [31,63,127,255]
13
+ end
14
+
15
+ # Returns the path of the velvet shared library built for the given max_kmer_length (nil or the default length selects the unsuffixed library)
16
+ def self.library_location_of(max_kmer_length=nil)
17
+ if !Bio::Velvet::Underground.max_kmers.include?(max_kmer_length) and
18
+ !max_kmer_length.nil?
19
+ raise "bad max kmer length #{max_kmer_length}"
20
+ end
21
+
22
+ extras = []
23
+ if !max_kmer_length.nil? and max_kmer_length != DEFAULT_MAXKMERLENGTH
24
+ extras.push "-maxkmer#{max_kmer_length}"
25
+ end
26
+ return File.join(
27
+ File.dirname(__FILE__),
28
+ 'external',
29
+ "libvelvet#{extras.join('') }.so.1.0")
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,262 @@
1
+ require 'csv'
2
+
3
+ class Bio::Velvet::Underground
4
+
5
+ class Graph
6
+ attr_accessor :internal_graph_struct
7
+
8
+ # Use parse_from_file, not #new
9
+ def initialize(graph_struct)
10
+ @internal_graph_struct = graph_struct
11
+ end
12
+
13
+ # Read in a graph from a file
14
+ def self.parse_from_file(path)
15
+ # First read the first line of the file to determine which underground velvet library to load
16
+ hash_length = nil
17
+ CSV.foreach(path, :col_sep => "\t") do |row|
18
+ raise "Badly formatted graph file" if row.length < 3
19
+ hash_length = row[2].to_i
20
+ if hash_length < 1 or hash_length > Bio::Velvet::Underground.max_kmers.max
21
+ raise "unable to load velvet shared library for kmer length `#{hash_length}'"
22
+ end
23
+ break
24
+ end
25
+ raise "No lines in graph file `#{path}', is it really a velvet LastGraph-type file?" if hash_length.nil?
26
+
27
+ # setup FFI in the underground base class with the correct kmer length
28
+ Bio::Velvet::Underground.attach_shared_library(:kmer => hash_length)
29
+
30
+ # Using the loaded velvet library, do the actual import of the graph
31
+ pointer = Bio::Velvet::Underground.importGraph path
32
+ struct = Bio::Velvet::Underground::GraphStruct.new pointer
33
+ Graph.new struct
34
+ end
35
+
36
+ def nodes
37
+ NodeArray.new self
38
+ end
39
+
40
+ def node_count
41
+ @internal_graph_struct[:nodeCount]
42
+ end
43
+
44
+ def hash_length
45
+ @internal_graph_struct[:wordLength]
46
+ end
47
+
48
+
49
+ class NodeArray
50
+ include Enumerable
51
+
52
+ def initialize(graph)
53
+ @graph = graph
54
+ end
55
+
56
+ def each
57
+ (1..length).each do |node_id|
58
+ yield self[node_id]
59
+ end
60
+ end
61
+
62
+ def length
63
+ Bio::Velvet::Underground.nodeCount @graph.internal_graph_struct
64
+ end
65
+
66
+ def [](node_id)
67
+ return nil if node_id < 1 or node_id > @graph.internal_graph_struct[:nodeCount]
68
+ pointer = Bio::Velvet::Underground.getNodeInGraph @graph.internal_graph_struct, node_id
69
+ node_struct = Bio::Velvet::Underground::NodeStruct.new pointer
70
+ Node.new(@graph, node_struct)
71
+ end
72
+ end
73
+
74
+ class Node
75
+ attr_accessor :internal_node_struct
76
+
77
+ def initialize(graph, node_struct)
78
+ @graph = graph
79
+ @internal_node_struct = node_struct
80
+ end
81
+
82
+ def node_id
83
+ @internal_node_struct[:ID]
84
+ end
85
+
86
+ def length_alone
87
+ @internal_node_struct[:length]
88
+ end
89
+
90
+ def coverages
91
+ [
92
+ @internal_node_struct[:virtualCoverage1],
93
+ @internal_node_struct[:virtualCoverage2],
94
+ ]
95
+ end
96
+
97
+ def ends_of_kmers_of_node
98
+ seq = []
99
+ key = %w(A C G T)
100
+ 0.upto(length_alone-1) do |i|
101
+ n = Bio::Velvet::Underground.getNucleotideInNode(@internal_node_struct, i)
102
+ seq.push key[n]
103
+ end
104
+ return seq.join
105
+ end
106
+
107
+ def ends_of_kmers_of_twin_node
108
+ twin.ends_of_kmers_of_node
109
+ end
110
+
111
+ def twin
112
+ return @twin unless @twin.nil?
113
+
114
+ twin_pointer = Bio::Velvet::Underground.getTwinNode(@internal_node_struct)
115
+ @twin = Bio::Velvet::Underground::Graph::Node.new(
116
+ @graph,
117
+ Bio::Velvet::Underground::NodeStruct.new(twin_pointer)
118
+ )
119
+ end
120
+
121
+ def fwd_short_reads
122
+ array_start_pointer = Bio::Velvet::Underground.getNodeReads @internal_node_struct, @graph.internal_graph_struct
123
+ num_short_reads = Bio::Velvet::Underground.getNodeReadCount @internal_node_struct, @graph.internal_graph_struct
124
+ short_reads = (0...num_short_reads).collect do |i|
125
+ # Use the fact that FFI pointers can do pointer arithmetic
126
+ pointer = array_start_pointer+(i*Bio::Velvet::Underground::ShortReadMarker.size)
127
+ NodedRead.new Bio::Velvet::Underground::ShortReadMarker.new(pointer), true
128
+ end
129
+ return short_reads
130
+ end
131
+
132
+ def rev_short_reads
133
+ twin.fwd_short_reads
134
+ end
135
+
136
+ def short_reads
137
+ reads = fwd_short_reads
138
+ rev_short_reads.each do |read|
139
+ read.direction = false
140
+ reads.push read
141
+ end
142
+ return reads
143
+ end
144
+
145
+ end
146
+
147
+ # TODO: this class is currently unimplemented.
148
+ class ArcArray
149
+ def initialize(graph_struct)
150
+ @internal_graph_struct = graph_struct
151
+ end
152
+
153
+ def get_arcs_by_node_id(node_id1, node_id2=nil)
154
+ raise
155
+ end
156
+ end
157
+
158
+ class NodedRead
159
+ attr_accessor :direction
160
+
161
+ def initialize(short_read_struct, direction)
162
+ @internal_short_read_struct = short_read_struct
163
+ @direction = direction
164
+ end
165
+
166
+ def read_id
167
+ @internal_short_read_struct[:readID]
168
+ end
169
+
170
+ def offset_from_start_of_node
171
+ @internal_short_read_struct[:position]
172
+ end
173
+
174
+ def start_coord
175
+ @internal_short_read_struct[:offset]
176
+ end
177
+ end
178
+ end
179
+
180
+
181
+ private
182
+ class GraphStruct < FFI::Struct
183
+ # struct graph_st {
184
+ layout :nodes, :pointer, # Node **nodes;
185
+ :arcLookupTable, :pointer, # Arc **arcLookupTable;
186
+ :nodeReads, :pointer, # ShortReadMarker **nodeReads;
187
+ :nodeReadCounts, :pointer, # IDnum *nodeReadCounts;
188
+ :gapMarkers, :pointer, # GapMarker **gapMarkers;
189
+ #TODO: here default compilation settings are assumed (CATEGORIES=2) - probably not a future-proof assumption
190
+ :insertLengths0, :int64, # Coordinate insertLengths[CATEGORIES + 1];
191
+ :insertLengths1, :int64, # Coordinate insertLengths[CATEGORIES + 1];
192
+ :insertLengths2, :int64, # Coordinate insertLengths[CATEGORIES + 1];
193
+ :insertLengths_var0, :pointer, # double insertLengths_var[CATEGORIES + 1];
194
+ :insertLengths_var1, :pointer, # double insertLengths_var[CATEGORIES + 1];
195
+ :insertLengths_var2, :pointer, # double insertLengths_var[CATEGORIES + 1];
196
+ :sequenceCount, :int32, # IDnum sequenceCount;
197
+ :nodeCount, :int32, # IDnum nodeCount;
198
+ :wordLength, :int, # int wordLength;
199
+ :double_stranded, :bool # boolean double_stranded;
200
+ end
201
+
202
+ class NodeStruct < FFI::Struct
203
+ pack 1 # pack all members on a 1 byte boundary
204
+ # struct node_st {
205
+ layout :twinNode, :pointer, # Node *twinNode; // 64
206
+ :arc, :pointer, # Arc *arc; // 64
207
+ :descriptor, :pointer, # Descriptor *descriptor; // 64
208
+ :marker, :uint32, # PassageMarkerI marker; // 32
209
+ :length, :int32, # IDnum length; // 32
210
+ :virtualCoverage1, :int32, # IDnum virtualCoverage[CATEGORIES]; // 32 * 2
211
+ :virtualCoverage2, :int32,
212
+ :originalVirtualCoverage1, :int32, # IDnum originalVirtualCoverage[CATEGORIES]; // 32 * 2
213
+ :originalVirtualCoverage2, :int32,
214
+ :ID, :int32, # IDnum ID; // 32
215
+ :arcCount, :int32, # IDnum arcCount; // 32
216
+ :status, :int8, # boolean status; // 1
217
+ :uniqueness, :int8 # boolean uniqueness; // 1
218
+ #} ATTRIBUTE_PACKED;
219
+ end
220
+
221
+ class ArcStruct < FFI::Struct
222
+ pack 1 # pack all members on a 1 byte boundary
223
+ # struct arc_st {
224
+ layout :twinArc, :pointer, # Arc *twinArc; // 64
225
+ :next, :pointer, # Arc *next; // 64
226
+ :previous, :pointer, # Arc *previous; // 64
227
+ :nextInLookupTable, :pointer, # Arc *nextInLookupTable; // 64
228
+ :destination, :pointer, # Node *destination; // 64
229
+ :multiplicity, :int32 # IDnum multiplicity; // 32
230
+ # } ATTRIBUTE_PACKED;
231
+ end
232
+
233
+ class ShortReadMarker < FFI::Struct
234
+ pack 1 # pack all members on a 1 byte boundary
235
+ # struct shortReadMarker_st {
236
+ layout :position, :int32, # IDnum position;
237
+ :readID, :int32, # IDnum readID;
238
+ :offset, :int16 # ShortLength offset;
239
+ # } ATTRIBUTE_PACKED;
240
+ end
241
+
242
+ def self.attach_graph_functions
243
+ attach_function :importGraph, [:string], :pointer
244
+ attach_function :nodeCount, [:pointer], :int32
245
+ # Arc *getArcBetweenNodes(Node * originNode, Node * destinationNode,
246
+ # Graph * graph)
247
+ attach_function :getArcBetweenNodes, [:pointer, :pointer, :pointer], :pointer
248
+
249
+ # Nucleotide getNucleotideInNode(Node * node, Coordinate index) {
250
+ attach_function :getNucleotideInNode, [:pointer, :int32], :char
251
+ # IDnum getNodeID(Node * node)
252
+ # Node *getNodeInGraph(Graph * graph, IDnum nodeID)
253
+ attach_function :getNodeInGraph, [:pointer, :int32], :pointer
254
+ # Node *getTwinNode(Node * node);
255
+ attach_function :getTwinNode, [:pointer], :pointer
256
+
257
+ # ShortReadMarker *getNodeReads(Node * node, Graph * graph);
258
+ attach_function :getNodeReads, [:pointer, :pointer], :pointer
259
+ # IDnum getNodeReadCount(Node * node, Graph * graph);
260
+ attach_function :getNodeReadCount, [:pointer, :pointer], :int32
261
+ end
262
+ end
@@ -0,0 +1,59 @@
1
+ class Bio::Velvet::Underground
2
+ class Runner
3
+ # Run velveth and velvetg, selecting the most memory efficient library for the purpose
4
+ #
5
+ # kmer size: (integer)
6
+ # velvet_directory: where to run velvet
7
+ # velveth_options: Array of string options to velveth as on the cmdline, excluding the directory and kmer
8
+ # velvetg_options: Array of string options to velvetg as on the cmdline, excluding the directory
9
+ # options: other options:
10
+ # :velvet_directory: where to run the velvets. Required (currently).
11
+ def self.run(kmer, velveth_options, velvetg_options=[], options={})
12
+ #load library with appropriate kmer size
13
+ Bio::Velvet::Underground.attach_shared_library(:kmer => kmer)
14
+
15
+ velvet_directory = options[:velvet_directory]
16
+ raise "Need options[:velvet_directory] to run velvet" if velvet_directory.nil?
17
+
18
+ # velveth
19
+ # Can't just pass a regular Ruby array of strings, as explained at
20
+ # http://zegoggl.es/2009/05/ruby-ffi-recipes.html
21
+ velveth_array_of_strings = []
22
+ velveth_array_of_strings << FFI::MemoryPointer.from_string('velveth')
23
+ velveth_array_of_strings << FFI::MemoryPointer.from_string(velvet_directory)
24
+ velveth_array_of_strings << FFI::MemoryPointer.from_string(kmer.to_s)
25
+ velveth_options.each do |o|
26
+ velveth_array_of_strings << FFI::MemoryPointer.from_string(o)
27
+ end
28
+ velveth_array_of_strings << nil
29
+ argv = FFI::MemoryPointer.new(:pointer, velveth_array_of_strings.length)
30
+ velveth_array_of_strings.each_with_index do |p, i|
31
+ argv[i].put_pointer(0, p)
32
+ end
33
+ returned = Bio::Velvet::Underground.velveth velveth_array_of_strings.length-1, argv
34
+ raise "Error running velveth (#{returned})" unless returned == 0
35
+
36
+ # velvetg
37
+ velvetg_array_of_strings = []
38
+ velvetg_array_of_strings << FFI::MemoryPointer.from_string('velvetg')
39
+ velvetg_array_of_strings << FFI::MemoryPointer.from_string(velvet_directory)
40
+ velvetg_options.each do |o|
41
+ velvetg_array_of_strings << FFI::MemoryPointer.from_string(o)
42
+ end
43
+ velvetg_array_of_strings << nil
44
+ argv = FFI::MemoryPointer.new(:pointer, velvetg_array_of_strings.length)
45
+ velvetg_array_of_strings.each_with_index do |p, i|
46
+ argv[i].put_pointer(0, p)
47
+ end
48
+ returned = Bio::Velvet::Underground.velvetg velvetg_array_of_strings.length-1, argv
49
+ raise "Error running velvetg (#{returned})" unless returned == 0
50
+
51
+ return 0
52
+ end
53
+ end
54
+
55
+ def self.attach_runner_functions
56
+ attach_function :velveth, [:int32, :pointer], :int32
57
+ attach_function :velvetg, [:int32, :pointer], :int32
58
+ end
59
+ end
@@ -24,4 +24,16 @@ describe "binary sequence store" do
24
24
  seqs[78]
25
25
  }.to raise_error
26
26
  end
27
+
28
+ it 'should be able to understand mates' do
29
+ path = File.join TEST_DATA_DIR, '2', 'CnyUnifiedSeq'
30
+ seqs = Bio::Velvet::Underground::BinarySequenceStore.new path
31
+ seqs.is_second_in_pair?(1).should == false
32
+ seqs.is_second_in_pair?(2).should == true
33
+ seqs.is_second_in_pair?(5).should == false
34
+ seqs.pair_id(1).should == 2
35
+ seqs.pair_id(2).should == 1
36
+ seqs.pair_id(5).should == 6
37
+ seqs.pair_id(6).should == 5
38
+ end
27
39
  end
Binary file
@@ -0,0 +1,40 @@
1
+ 4 5 31 1
2
+ NODE 1 228 1140 1140 0 0
3
+ AATCAAACTATAAAGTTTTAGAAGATAAAGTAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAGATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATACGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATGGACGAGTTATATTTACTG
4
+ TTACACCCTTTCCTTTTAATAAAACCACATTTCCAGTAGAATCATACGTATATTTACCAATTACATTACCATTTTTATCCCTAACAGAAAAAGCTGCGCCTGCAAGATCTATTGAAATATTTTCTGAATCTACTTTTTTAACTCCGAATCCCCATGTATAAGTTGTTACTTTATCTTCTAAAACTTTATAGTTTGATTCTAAATCGTGATCTTTGGTAGAGATAAGTG
5
+ NODE 2 29 58 58 0 0
6
+ GTTTAAAAGAAGGAGATTACTTTATAAAA
7
+ AGTAAATATAACTCGTCCATTTTTATCAG
8
+ NODE 3 224 1120 1120 0 0
9
+ GAAGAAAAAGCTCCTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATTTCACTTCCAGGTACAGG
10
+ TTTTTAATTTGTACATTAAATAATACATTGCCATCATTCATAGTAATATTATTTATTATACTTCCAGCTTCATTGCCATTAGTTACAGATATAGTTGCTTGACCAGTATACTCTCCATTATCATCTTTTTGAGCTGTTATAGTAACTTTTACTGGTTCTTTTAAAAGGCTATACCCTTTAGGAGCTTTTTCTTCTTTTATAAAGTAATCTCCTTCTTTTAAACC
11
+ NODE 4 38 114 114 0 0
12
+ CGGGGGGGGGTTTAAAAGAAGGAGATTACTTTATAAAA
13
+ CCCCCCCGCAGTAAATATAACTCGTCCATTTTTATCAG
14
+ ARC 1 2 2
15
+ ARC 1 4 3
16
+ ARC 2 3 2
17
+ ARC -3 -4 3
18
+ NR -4 1
19
+ 5 0 224
20
+ NR -3 2
21
+ 3 0 0
22
+ 5 0 0
23
+ NR -2 1
24
+ 3 0 224
25
+ NR -1 2
26
+ 3 0 253
27
+ 5 0 262
28
+ NR 1 3
29
+ 1 0 0
30
+ 2 0 0
31
+ 4 0 0
32
+ NR 2 1
33
+ 1 0 228
34
+ NR 3 3
35
+ 1 0 257
36
+ 2 0 266
37
+ 4 0 266
38
+ NR 4 2
39
+ 2 0 228
40
+ 4 0 228
@@ -0,0 +1,40 @@
1
+ 4 5 31 1
2
+ NODE 1 228 1140 1140 0 0
3
+ AATCAAACTATAAAGTTTTAGAAGATAAAGTAACAACTTATACATGGGGATTCGGAGTTAAAAAAGTAGATTCAGAAAATATTTCAATAGATCTTGCAGGCGCAGCTTTTTCTGTTAGGGATAAAAATGGTAATGTAATTGGTAAATATACGTATGATTCTACTGGAAATGTGGTTTTATTAAAAGGAAAGGGTGTAACTGATAAAAATGGACGAGTTATATTTACTG
4
+ TTACACCCTTTCCTTTTAATAAAACCACATTTCCAGTAGAATCATACGTATATTTACCAATTACATTACCATTTTTATCCCTAACAGAAAAAGCTGCGCCTGCAAGATCTATTGAAATATTTTCTGAATCTACTTTTTTAACTCCGAATCCCCATGTATAAGTTGTTACTTTATCTTCTAAAACTTTATAGTTTGATTCTAAATCGTGATCTTTGGTAGAGATAAGTG
5
+ NODE 2 29 58 58 0 0
6
+ GTTTAAAAGAAGGAGATTACTTTATAAAA
7
+ AGTAAATATAACTCGTCCATTTTTATCAG
8
+ NODE 3 224 1120 1120 0 0
9
+ GAAGAAAAAGCTCCTAAAGGGTATAGCCTTTTAAAAGAACCAGTAAAAGTTACTATAACAGCTCAAAAAGATGATAATGGAGAGTATACTGGTCAAGCAACTATATCTGTAACTAATGGCAATGAAGCTGGAAGTATAATAAATAATATTACTATGAATGATGGCAATGTATTATTTAATGTACAAATTAAAAACTATGCTGGTATTTCACTTCCAGGTACAGG
10
+ TTTTTAATTTGTACATTAAATAATACATTGCCATCATTCATAGTAATATTATTTATTATACTTCCAGCTTCATTGCCATTAGTTACAGATATAGTTGCTTGACCAGTATACTCTCCATTATCATCTTTTTGAGCTGTTATAGTAACTTTTACTGGTTCTTTTAAAAGGCTATACCCTTTAGGAGCTTTTTCTTCTTTTATAAAGTAATCTCCTTCTTTTAAACC
11
+ NODE 4 38 114 114 0 0
12
+ CGGGGGGGGGTTTAAAAGAAGGAGATTACTTTATAAAA
13
+ CCCCCCCGCAGTAAATATAACTCGTCCATTTTTATCAG
14
+ ARC 1 2 2
15
+ ARC 1 4 3
16
+ ARC 2 3 2
17
+ ARC -3 -4 3
18
+ NR -4 1
19
+ 5 0 224
20
+ NR -3 2
21
+ 3 0 0
22
+ 5 0 0
23
+ NR -2 1
24
+ 3 0 224
25
+ NR -1 2
26
+ 3 0 253
27
+ 5 0 262
28
+ NR 1 3
29
+ 1 0 0
30
+ 2 0 0
31
+ 4 0 0
32
+ NR 2 1
33
+ 1 0 228
34
+ NR 3 3
35
+ 1 0 257
36
+ 2 0 266
37
+ 4 0 266
38
+ NR 4 2
39
+ 2 0 228
40
+ 4 0 228