bio-velvet_underground 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/src/src/run.c CHANGED
@@ -39,7 +39,7 @@ static void printUsage()
39
39
  printf("\thash_length\t: EITHER an odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
40
40
  printf("\t\t\t: OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
41
41
  puts("\t\t\t\tand s is a step (even number). Velvet will then hash from k=m to k=M with a step of s");
42
- puts("\tfilename\t: path to sequence file or - for standard input");
42
+ puts("\tfilename\t: path to sequence file or - for standard input");
43
43
  puts("");
44
44
  puts("File format options:");
45
45
  puts("\t-fasta\t-fastq\t-raw\t-fasta.gz\t-fastq.gz\t-raw.gz\t-sam\t-bam\t-fmtAuto");
@@ -52,7 +52,7 @@ static void printUsage()
52
52
  puts("Read type options:");
53
53
  puts("\t-short\t-shortPaired");
54
54
  #if CATEGORIES <= 5
55
- Category cat;
55
+ Category cat;
56
56
  for (cat = 2; cat <= CATEGORIES; cat++)
57
57
  printf("\t-short%i\t-shortPaired%i\n", cat, cat);
58
58
  #else
@@ -93,7 +93,7 @@ static void printUsage()
93
93
  puts("\t\t[Both files are picked up by graph, so please leave them there]");
94
94
  }
95
95
 
96
- int velveth(int argc, char **argv)
96
+ int main(int argc, char **argv)
97
97
  {
98
98
  ReadSet *allSequences = NULL;
99
99
  SplayTable *splayTable;
@@ -161,18 +161,18 @@ int velveth(int argc, char **argv)
161
161
  ("Velvet can't handle k-mers as long as %i! We'll stick to %i if you don't mind.\n",
162
162
  hashLength, MAXKMERLENGTH);
163
163
  hashLength = MAXKMERLENGTH;
164
- }
164
+ }
165
165
  if (hashLength <= 0) {
166
166
  velvetLog("Invalid hash length: %s\n", argv[2]);
167
167
  printUsage();
168
168
  return 0;
169
- }
169
+ }
170
170
  if (hashLength % 2 == 0) {
171
171
  velvetLog
172
172
  ("Velvet can't work with even length k-mers, such as %i. We'll use %i instead, if you don't mind.\n",
173
173
  hashLength, hashLength - 1);
174
174
  hashLength--;
175
- }
175
+ }
176
176
 
177
177
  if (multiple_kmers) {
178
178
  if (hashLengthMax > MAXKMERLENGTH + 1) {
@@ -180,12 +180,12 @@ int velveth(int argc, char **argv)
180
180
  ("Velvet can't handle k-mers as long as %i! We'll stick to %i if you don't mind.\n",
181
181
  hashLengthMax, MAXKMERLENGTH + 1);
182
182
  hashLengthMax = MAXKMERLENGTH + 1;
183
- }
183
+ }
184
184
  if (hashLengthMax <= hashLength) {
185
185
  velvetLog("hashLengthMin < hashLengthMax is required %s", argv[2]);
186
186
  printUsage();
187
187
  return 0;
188
- }
188
+ }
189
189
 
190
190
  if (hashLengthStep <= 0) {
191
191
  velvetLog("Non-positive hash length! Setting it to 2\n");
@@ -215,7 +215,7 @@ int velveth(int argc, char **argv)
215
215
  sprintf(buf,"%s_%d",argv[1],h);
216
216
  directory = mallocOrExit(strlen(buf) + 100, char);
217
217
  strcpy(directory,buf);
218
- } else
218
+ } else
219
219
  directory = argv[1];
220
220
 
221
221
  filename = mallocOrExit(strlen(directory) + 100, char);
data/ext/src/src/run2.c CHANGED
@@ -68,8 +68,6 @@ static void printUsage()
68
68
  puts("\t-paired_exp_fraction <double>\t: remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1)");
69
69
  puts("\t-shortMatePaired* <yes|no>\t: for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no)");
70
70
  puts("\t-conserveLong <yes|no>\t\t: preserve sequences with long reads in them (default no)");
71
- puts("\t-clip_tips <yes|no>\t\t: do tip clipping on pre-graph (default yes)");
72
- puts("\t-tour_bus <yes|no>\t\t: apply the tour bus algorithm (default yes)");
73
71
  puts("");
74
72
  puts("Output:");
75
73
  puts("\tdirectory/contigs.fa\t\t: fasta file of contigs longer than twice hash length");
@@ -78,7 +76,7 @@ static void printUsage()
78
76
  puts("\tdirectory/velvet_asm.afg\t: (if requested) AMOS compatible assembly file");
79
77
  }
80
78
 
81
- int velvetg(int argc, char **argv)
79
+ int main(int argc, char **argv)
82
80
  {
83
81
  ReadSet *sequences = NULL;
84
82
  RoadMapArray *rdmaps;
@@ -88,9 +86,7 @@ int velvetg(int argc, char **argv)
88
86
  *preGraphFilename, *seqFilename, *roadmapFilename,
89
87
  *lowCovContigsFilename, *highCovContigsFilename;
90
88
  double coverageCutoff = -1;
91
- boolean doClipTips = true;
92
- boolean doTourBus = true;
93
- double longCoverageCutoff = -1;
89
+ double longCoverageCutoff = -1;
94
90
  double maxCoverageCutoff = -1;
95
91
  double expectedCoverage = -1;
96
92
  Coordinate minContigLength = -1;
@@ -168,7 +164,7 @@ int velvetg(int argc, char **argv)
168
164
  return 0;
169
165
  }
170
166
 
171
- // Memory allocation
167
+ // Memory allocation
172
168
  directory = argv[1];
173
169
  graphFilename = mallocOrExit(strlen(directory) + 100, char);
174
170
  connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char);
@@ -185,9 +181,9 @@ int velvetg(int argc, char **argv)
185
181
  if (arg_index >= argc) {
186
182
  velvetLog("Unusual number of arguments!\n");
187
183
  printUsage();
188
- #ifdef DEBUG
184
+ #ifdef DEBUG
189
185
  abort();
190
- #endif
186
+ #endif
191
187
  exit(1);
192
188
  }
193
189
 
@@ -214,9 +210,9 @@ int velvetg(int argc, char **argv)
214
210
  if (insertLength[0] < 0) {
215
211
  velvetLog("Invalid insert length: %lli\n",
216
212
  (long long) insertLength[0]);
217
- #ifdef DEBUG
213
+ #ifdef DEBUG
218
214
  abort();
219
- #endif
215
+ #endif
220
216
  exit(1);
221
217
  }
222
218
  } else if (strcmp(arg, "-ins_length_sd") == 0) {
@@ -225,9 +221,9 @@ int velvetg(int argc, char **argv)
225
221
  if (std_dev[0] < 0) {
226
222
  velvetLog("Invalid std deviation: %lli\n",
227
223
  (long long) std_dev[0]);
228
- #ifdef DEBUG
224
+ #ifdef DEBUG
229
225
  abort();
230
- #endif
226
+ #endif
231
227
  exit(1);
232
228
  }
233
229
  } else if (strcmp(arg, "-ins_length_long") == 0) {
@@ -242,9 +238,9 @@ int velvetg(int argc, char **argv)
242
238
  cat = (Category) short_var;
243
239
  if (cat < 1 || cat > CATEGORIES) {
244
240
  velvetLog("Unknown option: %s\n", arg);
245
- #ifdef DEBUG
241
+ #ifdef DEBUG
246
242
  abort();
247
- #endif
243
+ #endif
248
244
  exit(1);
249
245
  }
250
246
  sscanf(argv[arg_index], "%lli", &longlong_var);
@@ -252,9 +248,9 @@ int velvetg(int argc, char **argv)
252
248
  if (insertLength[cat - 1] < 0) {
253
249
  velvetLog("Invalid insert length: %lli\n",
254
250
  (long long) insertLength[cat - 1]);
255
- #ifdef DEBUG
251
+ #ifdef DEBUG
256
252
  abort();
257
- #endif
253
+ #endif
258
254
  exit(1);
259
255
  }
260
256
  } else if (strncmp(arg, "-ins_length", 11) == 0) {
@@ -262,9 +258,9 @@ int velvetg(int argc, char **argv)
262
258
  cat = (Category) short_var;
263
259
  if (cat < 1 || cat > CATEGORIES) {
264
260
  velvetLog("Unknown option: %s\n", arg);
265
- #ifdef DEBUG
261
+ #ifdef DEBUG
266
262
  abort();
267
- #endif
263
+ #endif
268
264
  exit(1);
269
265
  }
270
266
  sscanf(argv[arg_index], "%lli", &longlong_var);
@@ -272,9 +268,9 @@ int velvetg(int argc, char **argv)
272
268
  if (std_dev[cat - 1] < 0) {
273
269
  velvetLog("Invalid std deviation: %lli\n",
274
270
  (long long) std_dev[cat - 1]);
275
- #ifdef DEBUG
271
+ #ifdef DEBUG
276
272
  abort();
277
- #endif
273
+ #endif
278
274
  exit(1);
279
275
  }
280
276
  } else if (strcmp(arg, "-read_trkg") == 0) {
@@ -357,12 +353,6 @@ int velvetg(int argc, char **argv)
357
353
  exit(1);
358
354
  }
359
355
  shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0);
360
- } else if (strcmp(arg,"-clip_tips") == 0){
361
- if (strcmp(argv[arg_index], "no") == 0)
362
- doClipTips = false;
363
- } else if (strcmp(arg,"-tour_bus") == 0){
364
- if (strcmp(argv[arg_index], "no") == 0)
365
- doTourBus = false;
366
356
  } else if (strcmp(arg, "--help") == 0) {
367
357
  printUsage();
368
358
  return 0;
@@ -462,8 +452,7 @@ int velvetg(int argc, char **argv)
462
452
 
463
453
  sequenceLengths =
464
454
  getSequenceLengths(sequences, getWordLength(graph));
465
- if (doTourBus)
466
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
455
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
467
456
  exportGraph(graphFilename, graph, sequences->tSequences);
468
457
  } else if ((file = fopen(preGraphFilename, "r")) != NULL) {
469
458
  fclose(file);
@@ -479,8 +468,7 @@ int velvetg(int argc, char **argv)
479
468
  roadmapFilename, readTracking, accelerationBits);
480
469
  sequenceLengths =
481
470
  getSequenceLengths(sequences, getWordLength(graph));
482
- if (doTourBus)
483
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
471
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
484
472
  exportGraph(graphFilename, graph, sequences->tSequences);
485
473
  } else if ((file = fopen(roadmapFilename, "r")) != NULL) {
486
474
  fclose(file);
@@ -521,7 +509,7 @@ int velvetg(int argc, char **argv)
521
509
  }
522
510
  preGraph = newPreGraph_pg(rdmaps, seqReadInfo);
523
511
  concatenatePreGraph_pg(preGraph);
524
- if (!conserveLong && doClipTips)
512
+ if (!conserveLong)
525
513
  clipTips_pg(preGraph);
526
514
  exportPreGraph_pg(preGraphFilename, preGraph);
527
515
  destroyPreGraph_pg(preGraph);
@@ -535,14 +523,13 @@ int velvetg(int argc, char **argv)
535
523
  roadmapFilename, readTracking, accelerationBits);
536
524
  sequenceLengths =
537
525
  getSequenceLengths(sequences, getWordLength(graph));
538
- if (doTourBus)
539
- correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
526
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
540
527
  exportGraph(graphFilename, graph, sequences->tSequences);
541
528
  } else {
542
529
  velvetLog("No Roadmap file to build upon! Please run velveth (see manual)\n");
543
- #ifdef DEBUG
530
+ #ifdef DEBUG
544
531
  abort();
545
- #endif
532
+ #endif
546
533
  exit(1);
547
534
  }
548
535
 
@@ -566,11 +553,11 @@ int velvetg(int argc, char **argv)
566
553
  coverageCutoff = expectedCoverage / 2;
567
554
  estimateCutoff = true;
568
555
  }
569
- } else {
556
+ } else {
570
557
  estimateCoverage = false;
571
- if (coverageCutoff < 0 && estimateCutoff)
558
+ if (coverageCutoff < 0 && estimateCutoff)
572
559
  coverageCutoff = estimated_cov(graph, directory) / 2;
573
- else
560
+ else
574
561
  estimateCutoff = false;
575
562
  }
576
563
 
@@ -593,7 +580,7 @@ int velvetg(int argc, char **argv)
593
580
  if (minContigLength < 2 * getWordLength(graph))
594
581
  minContigKmerLength = getWordLength(graph);
595
582
  else
596
- minContigKmerLength = minContigLength - getWordLength(graph) + 1;
583
+ minContigKmerLength = minContigLength - getWordLength(graph) + 1;
597
584
 
598
585
  dubious =
599
586
  removeLowCoverageNodesAndDenounceDubiousReads(graph,
@@ -612,8 +599,7 @@ int velvetg(int argc, char **argv)
612
599
  lowCovContigsFilename);
613
600
 
614
601
  removeHighCoverageNodes(graph, maxCoverageCutoff, exportFilteredNodes, minContigKmerLength, highCovContigsFilename);
615
- if (doClipTips)
616
- clipTipsHard(graph, conserveLong);
602
+ clipTipsHard(graph, conserveLong);
617
603
 
618
604
  if (sequences->readCount > 0 && sequences->categories[0] == REFERENCE)
619
605
  removeLowArcs(graph, coverageCutoff);
@@ -649,7 +635,7 @@ int velvetg(int argc, char **argv)
649
635
  strcpy(graphFilename, directory);
650
636
  strcat(graphFilename, "/contigs.fa");
651
637
  sequenceLengths = getSequenceLengths(sequences, getWordLength(graph));
652
- exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
638
+ exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
653
639
 
654
640
  if (exportAlignments) {
655
641
  strcpy(graphFilename, directory);
@@ -677,9 +663,9 @@ int velvetg(int argc, char **argv)
677
663
  if (unusedReads)
678
664
  exportUnusedReads(graph, sequences, minContigKmerLength, directory);
679
665
 
680
- if (estimateCoverage)
666
+ if (estimateCoverage)
681
667
  velvetLog("Estimated Coverage = %f\n", expectedCoverage);
682
- if (estimateCutoff)
668
+ if (estimateCutoff)
683
669
  velvetLog("Estimated Coverage cutoff = %f\n", coverageCutoff);
684
670
 
685
671
  logFinalStats(graph, minContigKmerLength, directory);
@@ -687,25 +673,25 @@ int velvetg(int argc, char **argv)
687
673
  if (clean > 0) {
688
674
  strcpy(graphFilename, directory);
689
675
  strcat(graphFilename, "/Roadmaps");
690
- remove(graphFilename);
676
+ remove(graphFilename);
691
677
 
692
678
  strcpy(graphFilename, directory);
693
679
  strcat(graphFilename, "/LastGraph");
694
- remove(graphFilename);
695
- }
680
+ remove(graphFilename);
681
+ }
696
682
 
697
683
  if (clean > 1) {
698
684
  strcpy(graphFilename, directory);
699
685
  strcat(graphFilename, "/Sequences");
700
- remove(graphFilename);
686
+ remove(graphFilename);
701
687
 
702
688
  strcpy(graphFilename, directory);
703
689
  strcat(graphFilename, "/Graph2");
704
- remove(graphFilename);
690
+ remove(graphFilename);
705
691
 
706
692
  strcpy(graphFilename, directory);
707
693
  strcat(graphFilename, "/Graph");
708
- remove(graphFilename);
694
+ remove(graphFilename);
709
695
  }
710
696
 
711
697
  free(sequenceLengths);
@@ -18,7 +18,7 @@ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
18
18
  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
 
20
20
  */
21
- #ifndef _SSCAFFOLD_H_
21
+ #ifndef _SCAFFOLD_H_
22
22
  #define _SCAFFOLD_H_
23
23
 
24
24
  typedef struct connection_st Connection;
@@ -1,5 +1,5 @@
1
1
  /*
2
- Copyright 2009 John Marshall (jm18@sanger.ac.uk)
2
+ Copyright 2009 John Marshall (jm18@sanger.ac.uk)
3
3
 
4
4
  This file is part of Velvet.
5
5
 
@@ -89,15 +89,14 @@ void exitErrorf(int exitStatus, boolean showErrno, const char *format, ...)
89
89
  fprintf(stderr, "\n");
90
90
  va_end(args);
91
91
 
92
- #ifdef DEBUG
92
+ #ifdef DEBUG
93
93
  abort();
94
- #endif
94
+ #endif
95
95
  exit(exitStatus);
96
96
  }
97
97
 
98
98
  void velvetLog(const char *format, ...)
99
99
  {
100
- /* Don't print anything as it interferes with code bound through bioruby-velvet_underground
101
100
  static boolean timeIsSet = false;
102
101
  static struct timeval tvStart;
103
102
  struct timeval tvNow;
@@ -118,14 +117,14 @@ void velvetLog(const char *format, ...)
118
117
  vprintf(format, args);
119
118
  va_end(args);
120
119
 
121
- #ifdef DEBUG
120
+ #ifdef DEBUG
122
121
  fflush(stdout);
123
- #endif*/
122
+ #endif
124
123
  }
125
124
 
126
- void velvetFprintf(FILE * file, const char * format, ...)
125
+ void velvetFprintf(FILE * file, const char * format, ...)
127
126
  {
128
- va_list args;
127
+ va_list args;
129
128
 
130
129
  va_start(args, format);
131
130
  if (vfprintf(file, format, args) < 0) {
@@ -133,9 +132,9 @@ void velvetFprintf(FILE * file, const char * format, ...)
133
132
  fprintf(stderr, "%s: ", programName);
134
133
  fprintf(stderr, "Could not write into file\n");
135
134
  va_end(args);
136
- #ifdef DEBUG
135
+ #ifdef DEBUG
137
136
  abort();
138
- #endif
137
+ #endif
139
138
  exit(EXIT_FAILURE);
140
139
  }
141
140
  va_end(args);
@@ -24,6 +24,18 @@ class Bio::Velvet::Underground
24
24
  @readset[:readCount]
25
25
  end
26
26
 
27
+ # Return true if paired, else false
28
+ def paired?(sequence_id)
29
+ cat = FFI::Pointer.new(:int8, @readset[:categories])[sequence_id-1].read_int8
30
+ if cat == 0
31
+ return false
32
+ elsif cat == 1
33
+ return true
34
+ else
35
+ raise "Unexpected velvet sequence category found: #{cat}"
36
+ end
37
+ end
38
+
27
39
  # Returns true if the sequence ID refers to the
28
40
  # second in a pair of sequences.
29
41
  def is_second_in_pair?(sequence_id)
@@ -33,8 +45,10 @@ class Bio::Velvet::Underground
33
45
  Bio::Velvet::Underground.isSecondInPair @readset, sequence_id-1
34
46
  end
35
47
 
36
- # Returns the ID of the given sequence_id's pair
48
+ # Returns the ID of the given sequence_id's pair, or nil if it is not a
49
+ # paired sequence
37
50
  def pair_id(sequence_id)
51
+ return nil unless paired?(sequence_id)
38
52
  if is_second_in_pair?(sequence_id)
39
53
  sequence_id-1
40
54
  else
@@ -80,6 +94,9 @@ class Bio::Velvet::Underground
80
94
  # IDnum position);
81
95
  attach_function :getTightStringInArray, [:pointer, :int32], :pointer
82
96
 
97
+ # int pairedCategories(ReadSet * reads);
98
+ attach_function :pairedCategories, [:pointer], :int
99
+
83
100
  # boolean isSecondInPair(ReadSet * reads, IDnum index);
84
101
  attach_function :isSecondInPair, [:pointer, :int32], :bool
85
102
  end
@@ -119,14 +119,16 @@ class Bio::Velvet::Underground
119
119
  end
120
120
 
121
121
  def fwd_short_reads
122
+ return @short_reads unless @short_reads.nil?
122
123
  array_start_pointer = Bio::Velvet::Underground.getNodeReads @internal_node_struct, @graph.internal_graph_struct
123
124
  num_short_reads = Bio::Velvet::Underground.getNodeReadCount @internal_node_struct, @graph.internal_graph_struct
124
- short_reads = (0...num_short_reads).collect do |i|
125
+ struct_size = Bio::Velvet::Underground::ShortReadMarker.size #calculate once for performance
126
+ @short_reads = 0.step(num_short_reads-1, 1).collect do |i|
125
127
  # Use the fact that FFI pointers can do pointer arithmetic
126
- pointer = array_start_pointer+(i*Bio::Velvet::Underground::ShortReadMarker.size)
128
+ pointer = array_start_pointer+(i*struct_size)
127
129
  NodedRead.new Bio::Velvet::Underground::ShortReadMarker.new(pointer), true
128
130
  end
129
- return short_reads
131
+ return @short_reads
130
132
  end
131
133
 
132
134
  def rev_short_reads
@@ -141,7 +143,6 @@ class Bio::Velvet::Underground
141
143
  end
142
144
  return reads
143
145
  end
144
-
145
146
  end
146
147
 
147
148
  # TODO: this class is currently unimplemented.