bio-velvet_underground 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,6 +68,8 @@ static void printUsage()
68
68
  puts("\t-paired_exp_fraction <double>\t: remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1)");
69
69
  puts("\t-shortMatePaired* <yes|no>\t: for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no)");
70
70
  puts("\t-conserveLong <yes|no>\t\t: preserve sequences with long reads in them (default no)");
71
+ puts("\t-clip_tips <yes|no>\t\t: do tip clipping on pre-graph (default yes)");
72
+ puts("\t-tour_bus <yes|no>\t\t: apply the tour bus algorithm (default yes)");
71
73
  puts("");
72
74
  puts("Output:");
73
75
  puts("\tdirectory/contigs.fa\t\t: fasta file of contigs longer than twice hash length");
@@ -76,7 +78,7 @@ static void printUsage()
76
78
  puts("\tdirectory/velvet_asm.afg\t: (if requested) AMOS compatible assembly file");
77
79
  }
78
80
 
79
- int main(int argc, char **argv)
81
+ int velvetg(int argc, char **argv)
80
82
  {
81
83
  ReadSet *sequences = NULL;
82
84
  RoadMapArray *rdmaps;
@@ -86,7 +88,9 @@ int main(int argc, char **argv)
86
88
  *preGraphFilename, *seqFilename, *roadmapFilename,
87
89
  *lowCovContigsFilename, *highCovContigsFilename;
88
90
  double coverageCutoff = -1;
89
- double longCoverageCutoff = -1;
91
+ boolean doClipTips = true;
92
+ boolean doTourBus = true;
93
+ double longCoverageCutoff = -1;
90
94
  double maxCoverageCutoff = -1;
91
95
  double expectedCoverage = -1;
92
96
  Coordinate minContigLength = -1;
@@ -164,7 +168,7 @@ int main(int argc, char **argv)
164
168
  return 0;
165
169
  }
166
170
 
167
- // Memory allocation
171
+ // Memory allocation
168
172
  directory = argv[1];
169
173
  graphFilename = mallocOrExit(strlen(directory) + 100, char);
170
174
  connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char);
@@ -181,9 +185,9 @@ int main(int argc, char **argv)
181
185
  if (arg_index >= argc) {
182
186
  velvetLog("Unusual number of arguments!\n");
183
187
  printUsage();
184
- #ifdef DEBUG
188
+ #ifdef DEBUG
185
189
  abort();
186
- #endif
190
+ #endif
187
191
  exit(1);
188
192
  }
189
193
 
@@ -210,9 +214,9 @@ int main(int argc, char **argv)
210
214
  if (insertLength[0] < 0) {
211
215
  velvetLog("Invalid insert length: %lli\n",
212
216
  (long long) insertLength[0]);
213
- #ifdef DEBUG
217
+ #ifdef DEBUG
214
218
  abort();
215
- #endif
219
+ #endif
216
220
  exit(1);
217
221
  }
218
222
  } else if (strcmp(arg, "-ins_length_sd") == 0) {
@@ -221,9 +225,9 @@ int main(int argc, char **argv)
221
225
  if (std_dev[0] < 0) {
222
226
  velvetLog("Invalid std deviation: %lli\n",
223
227
  (long long) std_dev[0]);
224
- #ifdef DEBUG
228
+ #ifdef DEBUG
225
229
  abort();
226
- #endif
230
+ #endif
227
231
  exit(1);
228
232
  }
229
233
  } else if (strcmp(arg, "-ins_length_long") == 0) {
@@ -238,9 +242,9 @@ int main(int argc, char **argv)
238
242
  cat = (Category) short_var;
239
243
  if (cat < 1 || cat > CATEGORIES) {
240
244
  velvetLog("Unknown option: %s\n", arg);
241
- #ifdef DEBUG
245
+ #ifdef DEBUG
242
246
  abort();
243
- #endif
247
+ #endif
244
248
  exit(1);
245
249
  }
246
250
  sscanf(argv[arg_index], "%lli", &longlong_var);
@@ -248,9 +252,9 @@ int main(int argc, char **argv)
248
252
  if (insertLength[cat - 1] < 0) {
249
253
  velvetLog("Invalid insert length: %lli\n",
250
254
  (long long) insertLength[cat - 1]);
251
- #ifdef DEBUG
255
+ #ifdef DEBUG
252
256
  abort();
253
- #endif
257
+ #endif
254
258
  exit(1);
255
259
  }
256
260
  } else if (strncmp(arg, "-ins_length", 11) == 0) {
@@ -258,9 +262,9 @@ int main(int argc, char **argv)
258
262
  cat = (Category) short_var;
259
263
  if (cat < 1 || cat > CATEGORIES) {
260
264
  velvetLog("Unknown option: %s\n", arg);
261
- #ifdef DEBUG
265
+ #ifdef DEBUG
262
266
  abort();
263
- #endif
267
+ #endif
264
268
  exit(1);
265
269
  }
266
270
  sscanf(argv[arg_index], "%lli", &longlong_var);
@@ -268,9 +272,9 @@ int main(int argc, char **argv)
268
272
  if (std_dev[cat - 1] < 0) {
269
273
  velvetLog("Invalid std deviation: %lli\n",
270
274
  (long long) std_dev[cat - 1]);
271
- #ifdef DEBUG
275
+ #ifdef DEBUG
272
276
  abort();
273
- #endif
277
+ #endif
274
278
  exit(1);
275
279
  }
276
280
  } else if (strcmp(arg, "-read_trkg") == 0) {
@@ -353,6 +357,12 @@ int main(int argc, char **argv)
353
357
  exit(1);
354
358
  }
355
359
  shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0);
360
+ } else if (strcmp(arg,"-clip_tips") == 0){
361
+ if (strcmp(argv[arg_index], "no") == 0)
362
+ doClipTips = false;
363
+ } else if (strcmp(arg,"-tour_bus") == 0){
364
+ if (strcmp(argv[arg_index], "no") == 0)
365
+ doTourBus = false;
356
366
  } else if (strcmp(arg, "--help") == 0) {
357
367
  printUsage();
358
368
  return 0;
@@ -452,7 +462,8 @@ int main(int argc, char **argv)
452
462
 
453
463
  sequenceLengths =
454
464
  getSequenceLengths(sequences, getWordLength(graph));
455
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
465
+ if (doTourBus)
466
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
456
467
  exportGraph(graphFilename, graph, sequences->tSequences);
457
468
  } else if ((file = fopen(preGraphFilename, "r")) != NULL) {
458
469
  fclose(file);
@@ -468,7 +479,8 @@ int main(int argc, char **argv)
468
479
  roadmapFilename, readTracking, accelerationBits);
469
480
  sequenceLengths =
470
481
  getSequenceLengths(sequences, getWordLength(graph));
471
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
482
+ if (doTourBus)
483
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
472
484
  exportGraph(graphFilename, graph, sequences->tSequences);
473
485
  } else if ((file = fopen(roadmapFilename, "r")) != NULL) {
474
486
  fclose(file);
@@ -509,7 +521,7 @@ int main(int argc, char **argv)
509
521
  }
510
522
  preGraph = newPreGraph_pg(rdmaps, seqReadInfo);
511
523
  concatenatePreGraph_pg(preGraph);
512
- if (!conserveLong)
524
+ if (!conserveLong && doClipTips)
513
525
  clipTips_pg(preGraph);
514
526
  exportPreGraph_pg(preGraphFilename, preGraph);
515
527
  destroyPreGraph_pg(preGraph);
@@ -523,13 +535,14 @@ int main(int argc, char **argv)
523
535
  roadmapFilename, readTracking, accelerationBits);
524
536
  sequenceLengths =
525
537
  getSequenceLengths(sequences, getWordLength(graph));
526
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
538
+ if (doTourBus)
539
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
527
540
  exportGraph(graphFilename, graph, sequences->tSequences);
528
541
  } else {
529
542
  velvetLog("No Roadmap file to build upon! Please run velveth (see manual)\n");
530
- #ifdef DEBUG
543
+ #ifdef DEBUG
531
544
  abort();
532
- #endif
545
+ #endif
533
546
  exit(1);
534
547
  }
535
548
 
@@ -553,11 +566,11 @@ int main(int argc, char **argv)
553
566
  coverageCutoff = expectedCoverage / 2;
554
567
  estimateCutoff = true;
555
568
  }
556
- } else {
569
+ } else {
557
570
  estimateCoverage = false;
558
- if (coverageCutoff < 0 && estimateCutoff)
571
+ if (coverageCutoff < 0 && estimateCutoff)
559
572
  coverageCutoff = estimated_cov(graph, directory) / 2;
560
- else
573
+ else
561
574
  estimateCutoff = false;
562
575
  }
563
576
 
@@ -580,7 +593,7 @@ int main(int argc, char **argv)
580
593
  if (minContigLength < 2 * getWordLength(graph))
581
594
  minContigKmerLength = getWordLength(graph);
582
595
  else
583
- minContigKmerLength = minContigLength - getWordLength(graph) + 1;
596
+ minContigKmerLength = minContigLength - getWordLength(graph) + 1;
584
597
 
585
598
  dubious =
586
599
  removeLowCoverageNodesAndDenounceDubiousReads(graph,
@@ -599,7 +612,8 @@ int main(int argc, char **argv)
599
612
  lowCovContigsFilename);
600
613
 
601
614
  removeHighCoverageNodes(graph, maxCoverageCutoff, exportFilteredNodes, minContigKmerLength, highCovContigsFilename);
602
- clipTipsHard(graph, conserveLong);
615
+ if (doClipTips)
616
+ clipTipsHard(graph, conserveLong);
603
617
 
604
618
  if (sequences->readCount > 0 && sequences->categories[0] == REFERENCE)
605
619
  removeLowArcs(graph, coverageCutoff);
@@ -635,7 +649,7 @@ int main(int argc, char **argv)
635
649
  strcpy(graphFilename, directory);
636
650
  strcat(graphFilename, "/contigs.fa");
637
651
  sequenceLengths = getSequenceLengths(sequences, getWordLength(graph));
638
- exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
652
+ exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
639
653
 
640
654
  if (exportAlignments) {
641
655
  strcpy(graphFilename, directory);
@@ -663,9 +677,9 @@ int main(int argc, char **argv)
663
677
  if (unusedReads)
664
678
  exportUnusedReads(graph, sequences, minContigKmerLength, directory);
665
679
 
666
- if (estimateCoverage)
680
+ if (estimateCoverage)
667
681
  velvetLog("Estimated Coverage = %f\n", expectedCoverage);
668
- if (estimateCutoff)
682
+ if (estimateCutoff)
669
683
  velvetLog("Estimated Coverage cutoff = %f\n", coverageCutoff);
670
684
 
671
685
  logFinalStats(graph, minContigKmerLength, directory);
@@ -673,25 +687,25 @@ int main(int argc, char **argv)
673
687
  if (clean > 0) {
674
688
  strcpy(graphFilename, directory);
675
689
  strcat(graphFilename, "/Roadmaps");
676
- remove(graphFilename);
690
+ remove(graphFilename);
677
691
 
678
692
  strcpy(graphFilename, directory);
679
693
  strcat(graphFilename, "/LastGraph");
680
- remove(graphFilename);
681
- }
694
+ remove(graphFilename);
695
+ }
682
696
 
683
697
  if (clean > 1) {
684
698
  strcpy(graphFilename, directory);
685
699
  strcat(graphFilename, "/Sequences");
686
- remove(graphFilename);
700
+ remove(graphFilename);
687
701
 
688
702
  strcpy(graphFilename, directory);
689
703
  strcat(graphFilename, "/Graph2");
690
- remove(graphFilename);
704
+ remove(graphFilename);
691
705
 
692
706
  strcpy(graphFilename, directory);
693
707
  strcat(graphFilename, "/Graph");
694
- remove(graphFilename);
708
+ remove(graphFilename);
695
709
  }
696
710
 
697
711
  free(sequenceLengths);
@@ -1,5 +1,5 @@
1
1
  /*
2
- Copyright 2009 John Marshall (jm18@sanger.ac.uk)
2
+ Copyright 2009 John Marshall (jm18@sanger.ac.uk)
3
3
 
4
4
  This file is part of Velvet.
5
5
 
@@ -89,14 +89,15 @@ void exitErrorf(int exitStatus, boolean showErrno, const char *format, ...)
89
89
  fprintf(stderr, "\n");
90
90
  va_end(args);
91
91
 
92
- #ifdef DEBUG
92
+ #ifdef DEBUG
93
93
  abort();
94
- #endif
94
+ #endif
95
95
  exit(exitStatus);
96
96
  }
97
97
 
98
98
  void velvetLog(const char *format, ...)
99
99
  {
100
+ /* Don't print anything as it interferes with code bound through bioruby-velvet_underground
100
101
  static boolean timeIsSet = false;
101
102
  static struct timeval tvStart;
102
103
  struct timeval tvNow;
@@ -117,14 +118,14 @@ void velvetLog(const char *format, ...)
117
118
  vprintf(format, args);
118
119
  va_end(args);
119
120
 
120
- #ifdef DEBUG
121
+ #ifdef DEBUG
121
122
  fflush(stdout);
122
- #endif
123
+ #endif*/
123
124
  }
124
125
 
125
- void velvetFprintf(FILE * file, const char * format, ...)
126
+ void velvetFprintf(FILE * file, const char * format, ...)
126
127
  {
127
- va_list args;
128
+ va_list args;
128
129
 
129
130
  va_start(args, format);
130
131
  if (vfprintf(file, format, args) < 0) {
@@ -132,9 +133,9 @@ void velvetFprintf(FILE * file, const char * format, ...)
132
133
  fprintf(stderr, "%s: ", programName);
133
134
  fprintf(stderr, "Could not write into file\n");
134
135
  va_end(args);
135
- #ifdef DEBUG
136
+ #ifdef DEBUG
136
137
  abort();
137
- #endif
138
+ #endif
138
139
  exit(EXIT_FAILURE);
139
140
  }
140
141
  va_end(args);
@@ -1,12 +1,56 @@
1
- # Please require your code below, respecting the naming conventions in the
2
- # bioruby directory tree.
3
- #
4
- # For example, say you have a plugin named bio-plugin, the only uncommented
5
- # line in this file would be
6
- #
7
- # require 'bio/bio-plugin/plugin'
8
- #
9
- # In this file only require other files. Avoid other source code.
10
-
11
- require 'bio-velvet_underground/velvet_underground.rb'
1
+ require 'ffi'
2
+
3
+ require 'bio-velvet_underground/constants'
4
+ require 'bio-logger'
5
+
6
+ Bio::Log::LoggerPlus.new('bio-velvet_underground')
7
+ module Bio
8
+ module Velvet
9
+ module UndergroundLogging
10
+ def log
11
+ Bio::Log::LoggerPlus['bio-velvet_underground']
12
+ end
13
+ end
14
+
15
+ class Underground
16
+ extend FFI::Library
17
+ include Bio::Velvet::UndergroundLogging
18
+ def self.log
19
+ Bio::Log::LoggerPlus['bio-velvet_underground']
20
+ end
21
+
22
+ # Return the smallest available maximum kmer length that is greater than or
23
+ # equal to the given graph hash length, e.g. 29 => 31, 31 => 31, 33 => 63.
24
+ def self.compilation_max_kmer(graph_hash_length)
25
+ max_kmers.select{|k| graph_hash_length<=k}.min
26
+ end
27
+
28
+ # Attach the appropriate velvet shared library with FFI. Options:
29
+ # :kmer: attach a library compiled with at least this maximum kmer length
30
+ def self.attach_shared_library(velvet_compilation_options={})
31
+ max_kmer_length = nil
32
+ given_kmer = velvet_compilation_options[:kmer]
33
+ if !given_kmer.nil?
34
+ max_kmer_length = compilation_max_kmer(given_kmer)
35
+ raise "No installed velvet library available for max kmer #{given_kmer}" if max_kmer_length.nil?
36
+ end
37
+ log.debug "Found max kmer length #{max_kmer_length} to load with the velvet library"
38
+
39
+ # Set the ffi library path to the correct velvet one
40
+ lib_location = self.library_location_of(max_kmer_length)
41
+ log.debug "Loading velvet underground FFI library #{lib_location}.."
42
+ ffi_lib lib_location
43
+ log.debug "Velvet library loaded."
44
+
45
+ attach_graph_functions
46
+ attach_binary_sequence_functions
47
+ attach_runner_functions
48
+ end
49
+ end
50
+ end
51
+ end
52
+
53
+ require 'bio-velvet_underground/binary_sequence_store'
54
+ require 'bio-velvet_underground/graph'
55
+ require 'bio-velvet_underground/runner'
12
56
 
@@ -0,0 +1,86 @@
1
+ class Bio::Velvet::Underground
2
+ class BinarySequenceStore
3
+ # Parse a CnyUnifiedSeq file so that its sequences can be accessed by read ID
4
+ def initialize(cny_unified_seq_file)
5
+ Bio::Velvet::Underground.attach_shared_library
6
+ readset_pointer = Bio::Velvet::Underground.importCnyReadSet cny_unified_seq_file
7
+ @readset = Bio::Velvet::Underground::ReadSet.new(readset_pointer)
8
+ end
9
+
10
+ # Return a sequence from the store given its read ID.
11
+ def [](sequence_id)
12
+ if sequence_id==0 or sequence_id > @readset[:readCount]
13
+ raise "Invalid sequence_id #{sequence_id}"
14
+ end
15
+
16
+ pointer = Bio::Velvet::Underground.getTightStringInArray(
17
+ @readset[:tSequences], sequence_id-1
18
+ )
19
+ Bio::Velvet::Underground.readTightString pointer
20
+ end
21
+
22
+ # Number of sequences in this store
23
+ def length
24
+ @readset[:readCount]
25
+ end
26
+
27
+ # Returns true if the sequence ID refers to the
28
+ # second in a pair of sequences.
29
+ def is_second_in_pair?(sequence_id)
30
+ if sequence_id==0 or sequence_id > @readset[:readCount]
31
+ raise "Invalid sequence_id #{sequence_id}"
32
+ end
33
+ Bio::Velvet::Underground.isSecondInPair @readset, sequence_id-1
34
+ end
35
+
36
+ # Returns the ID of the read paired with the given sequence_id (the previous ID if it is second in its pair, otherwise the next ID)
37
+ def pair_id(sequence_id)
38
+ if is_second_in_pair?(sequence_id)
39
+ sequence_id-1
40
+ else
41
+ sequence_id+1
42
+ end
43
+ end
44
+ end
45
+
46
+ private
47
+ # struct readSet_st {
48
+ # char **sequences;
49
+ # TightString *tSequences;
50
+ # char **labels;
51
+ # char *tSeqMem;
52
+ # Quality **confidenceScores;
53
+ # Probability **kmerProbabilities;
54
+ # IDnum *mateReads;
55
+ # Category *categories;
56
+ # unsigned char *secondInPair;
57
+ # IDnum readCount;
58
+ # };
59
+ class ReadSet < FFI::Struct
60
+ layout :sequences, :pointer, # char **sequences;
61
+ :tSequences, :pointer, # TightString *tSequences;
62
+ :labels, :pointer, # char **labels;
63
+ :tSeqMem, :pointer, # char *tSeqMem; #TODO: they don't really mean char* here - meant as an unsigned short?
64
+ :confidenceScores, :pointer, # Quality **confidenceScores;
65
+ :kmerProbabilities, :pointer, # Probability **kmerProbabilities;
66
+ :mateReads, :pointer, # IDnum *mateReads;
67
+ :categories, :pointer, # Category *categories;
68
+ :secondInPair, :pointer, # unsigned char *secondInPair;
69
+ :readCount, :int32 # IDnum readCount;
70
+ end
71
+
72
+ def self.attach_binary_sequence_functions
73
+ # ReadSet *importCnyReadSet(char *filename);
74
+ attach_function :importCnyReadSet, [:string], :pointer
75
+
76
+ # char *readTightString(TightString * tString); #tightString.h
77
+ attach_function :readTightString, [:pointer], :string
78
+
79
+ # TightString *getTightStringInArray(TightString * tString,
80
+ # IDnum position);
81
+ attach_function :getTightStringInArray, [:pointer, :int32], :pointer
82
+
83
+ # boolean isSecondInPair(ReadSet * reads, IDnum index);
84
+ attach_function :isSecondInPair, [:pointer, :int32], :bool
85
+ end
86
+ end