bio-velvet_underground 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -68,6 +68,8 @@ static void printUsage()
68
68
  puts("\t-paired_exp_fraction <double>\t: remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1)");
69
69
  puts("\t-shortMatePaired* <yes|no>\t: for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no)");
70
70
  puts("\t-conserveLong <yes|no>\t\t: preserve sequences with long reads in them (default no)");
71
+ puts("\t-clip_tips <yes|no>\t\t: do tip clipping on pre-graph (default yes)");
72
+ puts("\t-tour_bus <yes|no>\t\t: apply the tour bus algorithm (default yes)");
71
73
  puts("");
72
74
  puts("Output:");
73
75
  puts("\tdirectory/contigs.fa\t\t: fasta file of contigs longer than twice hash length");
@@ -76,7 +78,7 @@ static void printUsage()
76
78
  puts("\tdirectory/velvet_asm.afg\t: (if requested) AMOS compatible assembly file");
77
79
  }
78
80
 
79
- int main(int argc, char **argv)
81
+ int velvetg(int argc, char **argv)
80
82
  {
81
83
  ReadSet *sequences = NULL;
82
84
  RoadMapArray *rdmaps;
@@ -86,7 +88,9 @@ int main(int argc, char **argv)
86
88
  *preGraphFilename, *seqFilename, *roadmapFilename,
87
89
  *lowCovContigsFilename, *highCovContigsFilename;
88
90
  double coverageCutoff = -1;
89
- double longCoverageCutoff = -1;
91
+ boolean doClipTips = true;
92
+ boolean doTourBus = true;
93
+ double longCoverageCutoff = -1;
90
94
  double maxCoverageCutoff = -1;
91
95
  double expectedCoverage = -1;
92
96
  Coordinate minContigLength = -1;
@@ -164,7 +168,7 @@ int main(int argc, char **argv)
164
168
  return 0;
165
169
  }
166
170
 
167
- // Memory allocation
171
+ // Memory allocation
168
172
  directory = argv[1];
169
173
  graphFilename = mallocOrExit(strlen(directory) + 100, char);
170
174
  connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char);
@@ -181,9 +185,9 @@ int main(int argc, char **argv)
181
185
  if (arg_index >= argc) {
182
186
  velvetLog("Unusual number of arguments!\n");
183
187
  printUsage();
184
- #ifdef DEBUG
188
+ #ifdef DEBUG
185
189
  abort();
186
- #endif
190
+ #endif
187
191
  exit(1);
188
192
  }
189
193
 
@@ -210,9 +214,9 @@ int main(int argc, char **argv)
210
214
  if (insertLength[0] < 0) {
211
215
  velvetLog("Invalid insert length: %lli\n",
212
216
  (long long) insertLength[0]);
213
- #ifdef DEBUG
217
+ #ifdef DEBUG
214
218
  abort();
215
- #endif
219
+ #endif
216
220
  exit(1);
217
221
  }
218
222
  } else if (strcmp(arg, "-ins_length_sd") == 0) {
@@ -221,9 +225,9 @@ int main(int argc, char **argv)
221
225
  if (std_dev[0] < 0) {
222
226
  velvetLog("Invalid std deviation: %lli\n",
223
227
  (long long) std_dev[0]);
224
- #ifdef DEBUG
228
+ #ifdef DEBUG
225
229
  abort();
226
- #endif
230
+ #endif
227
231
  exit(1);
228
232
  }
229
233
  } else if (strcmp(arg, "-ins_length_long") == 0) {
@@ -238,9 +242,9 @@ int main(int argc, char **argv)
238
242
  cat = (Category) short_var;
239
243
  if (cat < 1 || cat > CATEGORIES) {
240
244
  velvetLog("Unknown option: %s\n", arg);
241
- #ifdef DEBUG
245
+ #ifdef DEBUG
242
246
  abort();
243
- #endif
247
+ #endif
244
248
  exit(1);
245
249
  }
246
250
  sscanf(argv[arg_index], "%lli", &longlong_var);
@@ -248,9 +252,9 @@ int main(int argc, char **argv)
248
252
  if (insertLength[cat - 1] < 0) {
249
253
  velvetLog("Invalid insert length: %lli\n",
250
254
  (long long) insertLength[cat - 1]);
251
- #ifdef DEBUG
255
+ #ifdef DEBUG
252
256
  abort();
253
- #endif
257
+ #endif
254
258
  exit(1);
255
259
  }
256
260
  } else if (strncmp(arg, "-ins_length", 11) == 0) {
@@ -258,9 +262,9 @@ int main(int argc, char **argv)
258
262
  cat = (Category) short_var;
259
263
  if (cat < 1 || cat > CATEGORIES) {
260
264
  velvetLog("Unknown option: %s\n", arg);
261
- #ifdef DEBUG
265
+ #ifdef DEBUG
262
266
  abort();
263
- #endif
267
+ #endif
264
268
  exit(1);
265
269
  }
266
270
  sscanf(argv[arg_index], "%lli", &longlong_var);
@@ -268,9 +272,9 @@ int main(int argc, char **argv)
268
272
  if (std_dev[cat - 1] < 0) {
269
273
  velvetLog("Invalid std deviation: %lli\n",
270
274
  (long long) std_dev[cat - 1]);
271
- #ifdef DEBUG
275
+ #ifdef DEBUG
272
276
  abort();
273
- #endif
277
+ #endif
274
278
  exit(1);
275
279
  }
276
280
  } else if (strcmp(arg, "-read_trkg") == 0) {
@@ -353,6 +357,12 @@ int main(int argc, char **argv)
353
357
  exit(1);
354
358
  }
355
359
  shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0);
360
+ } else if (strcmp(arg,"-clip_tips") == 0){
361
+ if (strcmp(argv[arg_index], "no") == 0)
362
+ doClipTips = false;
363
+ } else if (strcmp(arg,"-tour_bus") == 0){
364
+ if (strcmp(argv[arg_index], "no") == 0)
365
+ doTourBus = false;
356
366
  } else if (strcmp(arg, "--help") == 0) {
357
367
  printUsage();
358
368
  return 0;
@@ -452,7 +462,8 @@ int main(int argc, char **argv)
452
462
 
453
463
  sequenceLengths =
454
464
  getSequenceLengths(sequences, getWordLength(graph));
455
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
465
+ if (doTourBus)
466
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
456
467
  exportGraph(graphFilename, graph, sequences->tSequences);
457
468
  } else if ((file = fopen(preGraphFilename, "r")) != NULL) {
458
469
  fclose(file);
@@ -468,7 +479,8 @@ int main(int argc, char **argv)
468
479
  roadmapFilename, readTracking, accelerationBits);
469
480
  sequenceLengths =
470
481
  getSequenceLengths(sequences, getWordLength(graph));
471
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
482
+ if (doTourBus)
483
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
472
484
  exportGraph(graphFilename, graph, sequences->tSequences);
473
485
  } else if ((file = fopen(roadmapFilename, "r")) != NULL) {
474
486
  fclose(file);
@@ -509,7 +521,7 @@ int main(int argc, char **argv)
509
521
  }
510
522
  preGraph = newPreGraph_pg(rdmaps, seqReadInfo);
511
523
  concatenatePreGraph_pg(preGraph);
512
- if (!conserveLong)
524
+ if (!conserveLong && doClipTips)
513
525
  clipTips_pg(preGraph);
514
526
  exportPreGraph_pg(preGraphFilename, preGraph);
515
527
  destroyPreGraph_pg(preGraph);
@@ -523,13 +535,14 @@ int main(int argc, char **argv)
523
535
  roadmapFilename, readTracking, accelerationBits);
524
536
  sequenceLengths =
525
537
  getSequenceLengths(sequences, getWordLength(graph));
526
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
538
+ if (doTourBus)
539
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
527
540
  exportGraph(graphFilename, graph, sequences->tSequences);
528
541
  } else {
529
542
  velvetLog("No Roadmap file to build upon! Please run velveth (see manual)\n");
530
- #ifdef DEBUG
543
+ #ifdef DEBUG
531
544
  abort();
532
- #endif
545
+ #endif
533
546
  exit(1);
534
547
  }
535
548
 
@@ -553,11 +566,11 @@ int main(int argc, char **argv)
553
566
  coverageCutoff = expectedCoverage / 2;
554
567
  estimateCutoff = true;
555
568
  }
556
- } else {
569
+ } else {
557
570
  estimateCoverage = false;
558
- if (coverageCutoff < 0 && estimateCutoff)
571
+ if (coverageCutoff < 0 && estimateCutoff)
559
572
  coverageCutoff = estimated_cov(graph, directory) / 2;
560
- else
573
+ else
561
574
  estimateCutoff = false;
562
575
  }
563
576
 
@@ -580,7 +593,7 @@ int main(int argc, char **argv)
580
593
  if (minContigLength < 2 * getWordLength(graph))
581
594
  minContigKmerLength = getWordLength(graph);
582
595
  else
583
- minContigKmerLength = minContigLength - getWordLength(graph) + 1;
596
+ minContigKmerLength = minContigLength - getWordLength(graph) + 1;
584
597
 
585
598
  dubious =
586
599
  removeLowCoverageNodesAndDenounceDubiousReads(graph,
@@ -599,7 +612,8 @@ int main(int argc, char **argv)
599
612
  lowCovContigsFilename);
600
613
 
601
614
  removeHighCoverageNodes(graph, maxCoverageCutoff, exportFilteredNodes, minContigKmerLength, highCovContigsFilename);
602
- clipTipsHard(graph, conserveLong);
615
+ if (doClipTips)
616
+ clipTipsHard(graph, conserveLong);
603
617
 
604
618
  if (sequences->readCount > 0 && sequences->categories[0] == REFERENCE)
605
619
  removeLowArcs(graph, coverageCutoff);
@@ -635,7 +649,7 @@ int main(int argc, char **argv)
635
649
  strcpy(graphFilename, directory);
636
650
  strcat(graphFilename, "/contigs.fa");
637
651
  sequenceLengths = getSequenceLengths(sequences, getWordLength(graph));
638
- exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
652
+ exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
639
653
 
640
654
  if (exportAlignments) {
641
655
  strcpy(graphFilename, directory);
@@ -663,9 +677,9 @@ int main(int argc, char **argv)
663
677
  if (unusedReads)
664
678
  exportUnusedReads(graph, sequences, minContigKmerLength, directory);
665
679
 
666
- if (estimateCoverage)
680
+ if (estimateCoverage)
667
681
  velvetLog("Estimated Coverage = %f\n", expectedCoverage);
668
- if (estimateCutoff)
682
+ if (estimateCutoff)
669
683
  velvetLog("Estimated Coverage cutoff = %f\n", coverageCutoff);
670
684
 
671
685
  logFinalStats(graph, minContigKmerLength, directory);
@@ -673,25 +687,25 @@ int main(int argc, char **argv)
673
687
  if (clean > 0) {
674
688
  strcpy(graphFilename, directory);
675
689
  strcat(graphFilename, "/Roadmaps");
676
- remove(graphFilename);
690
+ remove(graphFilename);
677
691
 
678
692
  strcpy(graphFilename, directory);
679
693
  strcat(graphFilename, "/LastGraph");
680
- remove(graphFilename);
681
- }
694
+ remove(graphFilename);
695
+ }
682
696
 
683
697
  if (clean > 1) {
684
698
  strcpy(graphFilename, directory);
685
699
  strcat(graphFilename, "/Sequences");
686
- remove(graphFilename);
700
+ remove(graphFilename);
687
701
 
688
702
  strcpy(graphFilename, directory);
689
703
  strcat(graphFilename, "/Graph2");
690
- remove(graphFilename);
704
+ remove(graphFilename);
691
705
 
692
706
  strcpy(graphFilename, directory);
693
707
  strcat(graphFilename, "/Graph");
694
- remove(graphFilename);
708
+ remove(graphFilename);
695
709
  }
696
710
 
697
711
  free(sequenceLengths);
@@ -1,5 +1,5 @@
1
1
  /*
2
- Copyright 2009 John Marshall (jm18@sanger.ac.uk)
2
+ Copyright 2009 John Marshall (jm18@sanger.ac.uk)
3
3
 
4
4
  This file is part of Velvet.
5
5
 
@@ -89,14 +89,15 @@ void exitErrorf(int exitStatus, boolean showErrno, const char *format, ...)
89
89
  fprintf(stderr, "\n");
90
90
  va_end(args);
91
91
 
92
- #ifdef DEBUG
92
+ #ifdef DEBUG
93
93
  abort();
94
- #endif
94
+ #endif
95
95
  exit(exitStatus);
96
96
  }
97
97
 
98
98
  void velvetLog(const char *format, ...)
99
99
  {
100
+ /* Don't print anything as it interferes with code bound through bioruby-velvet_underground
100
101
  static boolean timeIsSet = false;
101
102
  static struct timeval tvStart;
102
103
  struct timeval tvNow;
@@ -117,14 +118,14 @@ void velvetLog(const char *format, ...)
117
118
  vprintf(format, args);
118
119
  va_end(args);
119
120
 
120
- #ifdef DEBUG
121
+ #ifdef DEBUG
121
122
  fflush(stdout);
122
- #endif
123
+ #endif*/
123
124
  }
124
125
 
125
- void velvetFprintf(FILE * file, const char * format, ...)
126
+ void velvetFprintf(FILE * file, const char * format, ...)
126
127
  {
127
- va_list args;
128
+ va_list args;
128
129
 
129
130
  va_start(args, format);
130
131
  if (vfprintf(file, format, args) < 0) {
@@ -132,9 +133,9 @@ void velvetFprintf(FILE * file, const char * format, ...)
132
133
  fprintf(stderr, "%s: ", programName);
133
134
  fprintf(stderr, "Could not write into file\n");
134
135
  va_end(args);
135
- #ifdef DEBUG
136
+ #ifdef DEBUG
136
137
  abort();
137
- #endif
138
+ #endif
138
139
  exit(EXIT_FAILURE);
139
140
  }
140
141
  va_end(args);
@@ -1,12 +1,56 @@
1
- # Please require your code below, respecting the naming conventions in the
2
- # bioruby directory tree.
3
- #
4
- # For example, say you have a plugin named bio-plugin, the only uncommented
5
- # line in this file would be
6
- #
7
- # require 'bio/bio-plugin/plugin'
8
- #
9
- # In this file only require other files. Avoid other source code.
10
-
11
- require 'bio-velvet_underground/velvet_underground.rb'
1
+ require 'ffi'
2
+
3
+ require 'bio-velvet_underground/constants'
4
+ require 'bio-logger'
5
+
6
+ Bio::Log::LoggerPlus.new('bio-velvet_underground')
7
+ module Bio
8
+ module Velvet
9
+ module UndergroundLogging
10
+ def log
11
+ Bio::Log::LoggerPlus['bio-velvet_underground']
12
+ end
13
+ end
14
+
15
+ class Underground
16
+ extend FFI::Library
17
+ include Bio::Velvet::UndergroundLogging
18
+ def self.log
19
+ Bio::Log::LoggerPlus['bio-velvet_underground']
20
+ end
21
+
22
+ # Return the minimum kmer length greater than or equal to the given
23
+ # graph hash length e.g. 29 => 31, 31 => 31, 33 => 63.
24
+ def self.compilation_max_kmer(graph_hash_length)
25
+ max_kmers.select{|k| graph_hash_length<=k}.min
26
+ end
27
+
28
+ # Attach the correct shared velvet library with ffi. Options:
29
+ # :kmer: attach a library whose maximum kmer length is at least this value
30
+ def self.attach_shared_library(velvet_compilation_options={})
31
+ max_kmer_length = nil
32
+ given_kmer = velvet_compilation_options[:kmer]
33
+ if !given_kmer.nil?
34
+ max_kmer_length = compilation_max_kmer(given_kmer)
35
+ raise "No installed velvet library available for max kmer #{given_kmer}" if max_kmer_length.nil?
36
+ end
37
+ log.debug "Found max kmer length #{max_kmer_length} to load with the velvet library"
38
+
39
+ # Set the ffi library path to the correct velvet one
40
+ lib_location = self.library_location_of(max_kmer_length)
41
+ log.debug "Loading velvet underground FFI library #{lib_location}.."
42
+ ffi_lib lib_location
43
+ log.debug "Velvet library loaded."
44
+
45
+ attach_graph_functions
46
+ attach_binary_sequence_functions
47
+ attach_runner_functions
48
+ end
49
+ end
50
+ end
51
+ end
52
+
53
+ require 'bio-velvet_underground/binary_sequence_store'
54
+ require 'bio-velvet_underground/graph'
55
+ require 'bio-velvet_underground/runner'
12
56
 
@@ -0,0 +1,86 @@
1
+ class Bio::Velvet::Underground
2
+ class BinarySequenceStore
3
+ # Parse a CnyUnifiedSeq file so that its sequences can be accessed
4
+ def initialize(cny_unified_seq_file)
5
+ Bio::Velvet::Underground.attach_shared_library
6
+ readset_pointer = Bio::Velvet::Underground.importCnyReadSet cny_unified_seq_file
7
+ @readset = Bio::Velvet::Underground::ReadSet.new(readset_pointer)
8
+ end
9
+
10
+ # Return a sequence from the store given its read ID.
11
+ def [](sequence_id)
12
+ if sequence_id==0 or sequence_id > @readset[:readCount]
13
+ raise "Invalid sequence_id #{sequence_id}"
14
+ end
15
+
16
+ pointer = Bio::Velvet::Underground.getTightStringInArray(
17
+ @readset[:tSequences], sequence_id-1
18
+ )
19
+ Bio::Velvet::Underground.readTightString pointer
20
+ end
21
+
22
+ # Number of sequences in this store
23
+ def length
24
+ @readset[:readCount]
25
+ end
26
+
27
+ # Returns true if the sequence ID refers to the
28
+ # second in a pair of sequences.
29
+ def is_second_in_pair?(sequence_id)
30
+ if sequence_id==0 or sequence_id > @readset[:readCount]
31
+ raise "Invalid sequence_id #{sequence_id}"
32
+ end
33
+ Bio::Velvet::Underground.isSecondInPair @readset, sequence_id-1
34
+ end
35
+
36
+ # Returns the ID of the given sequence_id's pair
37
+ def pair_id(sequence_id)
38
+ if is_second_in_pair?(sequence_id)
39
+ sequence_id-1
40
+ else
41
+ sequence_id+1
42
+ end
43
+ end
44
+ end
45
+
46
+ private
47
+ # struct readSet_st {
48
+ # char **sequences;
49
+ # TightString *tSequences;
50
+ # char **labels;
51
+ # char *tSeqMem;
52
+ # Quality **confidenceScores;
53
+ # Probability **kmerProbabilities;
54
+ # IDnum *mateReads;
55
+ # Category *categories;
56
+ # unsigned char *secondInPair;
57
+ # IDnum readCount;
58
+ # };
59
+ class ReadSet < FFI::Struct
60
+ layout :sequences, :pointer, # char **sequences;
61
+ :tSequences, :pointer, # TightString *tSequences;
62
+ :labels, :pointer, # char **labels;
63
+ :tSeqMem, :pointer, # char *tSeqMem; #TODO: they don't really mean char* here - meant as an unsigned short?
64
+ :confidenceScores, :pointer, # Quality **confidenceScores;
65
+ :kmerProbabilities, :pointer, # Probability **kmerProbabilities;
66
+ :mateReads, :pointer, # IDnum *mateReads;
67
+ :categories, :pointer, # Category *categories;
68
+ :secondInPair, :pointer, # unsigned char *secondInPair;
69
+ :readCount, :int32 # IDnum readCount;
70
+ end
71
+
72
+ def self.attach_binary_sequence_functions
73
+ # ReadSet *importCnyReadSet(char *filename);
74
+ attach_function :importCnyReadSet, [:string], :pointer
75
+
76
+ # char *readTightString(TightString * tString); #tightString.h
77
+ attach_function :readTightString, [:pointer], :string
78
+
79
+ # TightString *getTightStringInArray(TightString * tString,
80
+ # IDnum position);
81
+ attach_function :getTightStringInArray, [:pointer, :int32], :pointer
82
+
83
+ # boolean isSecondInPair(ReadSet * reads, IDnum index);
84
+ attach_function :isSecondInPair, [:pointer, :int32], :bool
85
+ end
86
+ end