finishm 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (554) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,318 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <sys/stat.h>
24
+ #if defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
25
+ #include <uce-dirent.h>
26
+ #define Arc v_Arc
27
+ #else
28
+ #include <dirent.h>
29
+ #endif
30
+
31
+ #include "run.h"
32
+
33
+ static void printUsage()
34
+ {
35
+ puts("Usage:");
36
+ puts("./velveth directory hash_length {[-file_format][-read_type][-separate|-interleaved] filename1 [filename2 ...]} {...} [options]");
37
+ puts("");
38
+ puts("\tdirectory\t: directory name for output files");
39
+ printf("\thash_length\t: EITHER an odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
40
+ printf("\t\t\t: OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
41
+ puts("\t\t\t\tand s is a step (even number). Velvet will then hash from k=m to k=M with a step of s");
42
+ puts("\tfilename\t: path to sequence file or - for standard input");
43
+ puts("");
44
+ puts("File format options:");
45
+ puts("\t-fasta\t-fastq\t-raw\t-fasta.gz\t-fastq.gz\t-raw.gz\t-sam\t-bam\t-fmtAuto");
46
+ puts("\t(Note: -fmtAuto will detect fasta or fastq, and will try the following programs for decompression : gunzip, pbunzip2, bunzip2");
47
+ puts("");
48
+ puts("File layout options for paired reads (only for fasta and fastq formats):");
49
+ puts("\t-interleaved\t: File contains paired reads interleaved in the one file (default)");
50
+ puts("\t-separate\t: Read 2 separate files for paired reads");
51
+ puts("");
52
+ puts("Read type options:");
53
+ puts("\t-short\t-shortPaired");
54
+ #if CATEGORIES <= 5
55
+ Category cat;
56
+ for (cat = 2; cat <= CATEGORIES; cat++)
57
+ printf("\t-short%i\t-shortPaired%i\n", cat, cat);
58
+ #else
59
+ puts("\t...");
60
+ printf("\t-short%i\t-shortPaired%i\n", CATEGORIES - 1, CATEGORIES - 1);
61
+ printf("\t-short%i\t-shortPaired%i\n", CATEGORIES, CATEGORIES);
62
+ #endif
63
+ puts("\t-long\t-longPaired");
64
+ puts("\t-reference");
65
+ puts("");
66
+ puts("Options:");
67
+ puts("\t-strand_specific\t: for strand specific transcriptome sequencing data (default: off)");
68
+ puts("\t-reuse_Sequences\t: reuse Sequences file (or link) already in directory (no need to provide original filenames in this case (default: off)");
69
+ puts("\t-reuse_binary\t: reuse binary sequences file (or link) already in directory (no need to provide original filenames in this case (default: off)");
70
+ puts("\t-noHash\t\t\t: simply prepare Sequences file, do not hash reads or prepare Roadmaps file (default: off)");
71
+ puts("\t-create_binary \t: create binary CnyUnifiedSeq file (default: off)");
72
+ puts("");
73
+ puts("Synopsis:");
74
+ puts("");
75
+ puts("- Short single end reads:");
76
+ puts("\tvelveth Assem 29 -short -fastq s_1_sequence.txt");
77
+ puts("");
78
+ puts("- Paired-end short reads (remember to interleave paired reads):");
79
+ puts("\tvelveth Assem 31 -shortPaired -fasta interleaved.fna");
80
+ puts("");
81
+ puts("- Paired-end short reads (using separate files for the paired reads)");
82
+ puts("\tvelveth Assem 31 -shortPaired -fasta -separate left.fa right.fa");
83
+ puts("");
84
+ puts("- Two channels and some long reads:");
85
+ puts("\tvelveth Assem 43 -short -fastq unmapped.fna -longPaired -fasta SangerReads.fasta");
86
+ puts("");
87
+ puts("- Three channels:");
88
+ puts("\tvelveth Assem 35 -shortPaired -fasta pe_lib1.fasta -shortPaired2 pe_lib2.fasta -short3 se_lib1.fa");
89
+ puts("");
90
+ puts("Output:");
91
+ puts("\tdirectory/Roadmaps");
92
+ puts("\tdirectory/Sequences");
93
+ puts("\t\t[Both files are picked up by graph, so please leave them there]");
94
+ }
95
+
96
+ int main(int argc, char **argv)
97
+ {
98
+ ReadSet *allSequences = NULL;
99
+ SplayTable *splayTable;
100
+ int hashLength, hashLengthStep, hashLengthMax, h;
101
+ char *directory, *filename, *seqFilename, *baseSeqName, *buf;
102
+ char * token;
103
+ boolean double_strand = true;
104
+ boolean noHash = false;
105
+ boolean multiple_kmers = false;
106
+ char buffer[100];
107
+ DIR *dir;
108
+
109
+ setProgramName("velveth");
110
+
111
+ if (argc < 4) {
112
+ printf("velveth - simple hashing program\n");
113
+ printf("Version %i.%i.%2.2i%s\n", VERSION_NUMBER,
114
+ RELEASE_NUMBER, UPDATE_NUMBER, VERSION_BRANCH);
115
+ printf("\nCopyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)\n");
116
+ printf("This is free software; see the source for copying conditions. There is NO\n");
117
+ printf("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\n");
118
+ printf("Compilation settings:\n");
119
+ printf("CATEGORIES = %i\n", CATEGORIES);
120
+ printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
121
+ #ifdef _OPENMP
122
+ puts("OPENMP");
123
+ #endif
124
+ #ifdef LONGSEQUENCES
125
+ puts("LONGSEQUENCES");
126
+ #endif
127
+ #ifdef BIGASSEMBLY
128
+ puts("BIGASSEMBLY");
129
+ #endif
130
+ #ifdef COLOR
131
+ puts("COLOR");
132
+ #endif
133
+ #ifdef DEBUG
134
+ puts("DEBUG");
135
+ #endif
136
+ printf("\n");
137
+ printUsage();
138
+ return 0;
139
+ }
140
+
141
+ strcpy(buffer, argv[2]);
142
+ token = strtok(buffer, ",");
143
+ hashLength = atoi(token);
144
+ token = strtok(NULL, ",");
145
+ if (token == NULL) {
146
+ multiple_kmers = false;
147
+ hashLengthMax = hashLength + 1;
148
+ } else {
149
+ multiple_kmers = true;
150
+ hashLengthMax = atoi(token);
151
+ }
152
+ token = strtok(NULL, ",");
153
+ if (token == NULL) {
154
+ hashLengthStep = 2;
155
+ } else {
156
+ hashLengthStep = atoi(token);
157
+ }
158
+
159
+ if (hashLength > MAXKMERLENGTH) {
160
+ velvetLog
161
+ ("Velvet can't handle k-mers as long as %i! We'll stick to %i if you don't mind.\n",
162
+ hashLength, MAXKMERLENGTH);
163
+ hashLength = MAXKMERLENGTH;
164
+ }
165
+ if (hashLength <= 0) {
166
+ velvetLog("Invalid hash length: %s\n", argv[2]);
167
+ printUsage();
168
+ return 0;
169
+ }
170
+ if (hashLength % 2 == 0) {
171
+ velvetLog
172
+ ("Velvet can't work with even length k-mers, such as %i. We'll use %i instead, if you don't mind.\n",
173
+ hashLength, hashLength - 1);
174
+ hashLength--;
175
+ }
176
+
177
+ if (multiple_kmers) {
178
+ if (hashLengthMax > MAXKMERLENGTH + 1) {
179
+ velvetLog
180
+ ("Velvet can't handle k-mers as long as %i! We'll stick to %i if you don't mind.\n",
181
+ hashLengthMax, MAXKMERLENGTH + 1);
182
+ hashLengthMax = MAXKMERLENGTH + 1;
183
+ }
184
+ if (hashLengthMax <= hashLength) {
185
+ velvetLog("hashLengthMin < hashLengthMax is required %s", argv[2]);
186
+ printUsage();
187
+ return 0;
188
+ }
189
+
190
+ if (hashLengthStep <= 0) {
191
+ velvetLog("Non-positive hash length! Setting it to 2\n");
192
+ hashLengthStep = 2;
193
+ }
194
+ if (hashLengthStep % 2 == 1) {
195
+ velvetLog
196
+ ("Velvet can't work with an odd length k-mer step, such as %i. We'll use %i instead, if you don't mind.\n",
197
+ hashLengthStep, hashLengthStep + 1);
198
+ hashLengthStep++;
199
+ }
200
+ }
201
+
202
+ // check if binary sequences should be used
203
+ int argIndex;
204
+ for (argIndex = 3; argIndex < argc; argIndex++)
205
+ if (strcmp(argv[argIndex], "-create_binary") == 0 || strcmp(argv[argIndex], "-reuse_binary") == 0)
206
+ setCreateBinary(true);
207
+
208
+ for (h = hashLength; h < hashLengthMax; h += hashLengthStep) {
209
+
210
+ resetWordFilter(h);
211
+
212
+ buf = mallocOrExit(2 * strlen(argv[1]) + 500, char);
213
+
214
+ if ( multiple_kmers ) {
215
+ sprintf(buf,"%s_%d",argv[1],h);
216
+ directory = mallocOrExit(strlen(buf) + 100, char);
217
+ strcpy(directory,buf);
218
+ } else
219
+ directory = argv[1];
220
+
221
+ filename = mallocOrExit(strlen(directory) + 100, char);
222
+ seqFilename = mallocOrExit(strlen(directory) + 100, char);
223
+ baseSeqName = mallocOrExit(100, char);
224
+
225
+ dir = opendir(directory);
226
+
227
+ if (dir == NULL)
228
+ mkdir(directory, 0777);
229
+ else {
230
+ sprintf(buf, "%s/PreGraph", directory);
231
+ remove(buf);
232
+ sprintf(buf, "%s/Graph", directory);
233
+ remove(buf);
234
+ sprintf(buf, "%s/Graph2", directory);
235
+ remove(buf);
236
+ sprintf(buf, "%s/Graph3", directory);
237
+ remove(buf);
238
+ sprintf(buf, "%s/Graph4", directory);
239
+ remove(buf);
240
+ }
241
+
242
+ logInstructions(argc, argv, directory);
243
+
244
+ strcpy(seqFilename, directory);
245
+ if (isCreateBinary()) {
246
+ // use the CNY unified seq writer
247
+ strcpy(baseSeqName, "/CnyUnifiedSeq");
248
+ // remove other style sequences file
249
+ sprintf(buf, "%s/Sequences", directory);
250
+ remove(buf);
251
+ } else {
252
+ strcpy(baseSeqName, "/Sequences");
253
+ // remove other style sequences file
254
+ sprintf(buf, "%s/CnyUnifiedSeq", directory);
255
+ remove(buf);
256
+ sprintf(buf, "%s/CnyUnifiedSeq.names", directory);
257
+ remove(buf);
258
+ }
259
+ strcat(seqFilename, baseSeqName);
260
+
261
+ if ( h == hashLength ) {
262
+ parseDataAndReadFiles(seqFilename, argc - 2, &(argv[2]), &double_strand, &noHash);
263
+ } else {
264
+ sprintf(buf,"rm -f %s",seqFilename);
265
+ if (system(buf)) {
266
+ velvetLog("Command failed!\n");
267
+ velvetLog("%s\n", buf);
268
+ #ifdef DEBUG
269
+ abort();
270
+ #endif
271
+ exit(1);
272
+ }
273
+ if (argv[1][0] == '/')
274
+ sprintf(buf,"ln -s %s_%d%s %s",argv[1],hashLength,baseSeqName,seqFilename);
275
+ else
276
+ sprintf(buf,"ln -s `pwd`/%s_%d%s %s",argv[1],hashLength,baseSeqName,seqFilename);
277
+ if (system(buf)) {
278
+ velvetLog("Command failed!\n");
279
+ velvetLog("%s\n", buf);
280
+ #ifdef DEBUG
281
+ abort();
282
+ #endif
283
+ exit(1);
284
+ }
285
+ }
286
+
287
+ if (noHash)
288
+ continue;
289
+
290
+ splayTable = newSplayTable(h, double_strand);
291
+ if (isCreateBinary()) {
292
+ allSequences = importCnyReadSet(seqFilename);
293
+ } else {
294
+ allSequences = importReadSet(seqFilename);
295
+ }
296
+ velvetLog("%li sequences in total.\n", (long) allSequences->readCount);
297
+
298
+ strcpy(filename, directory);
299
+ strcat(filename, "/Roadmaps");
300
+ inputSequenceArrayIntoSplayTableAndArchive(allSequences,
301
+ splayTable, filename, seqFilename);
302
+
303
+ destroySplayTable(splayTable);
304
+ if (dir)
305
+ closedir(dir);
306
+ if (directory != argv[1])
307
+ free(directory);
308
+ free(filename);
309
+ free(seqFilename);
310
+ free(baseSeqName);
311
+ free(buf);
312
+ if (allSequences) {
313
+ destroyReadSet(allSequences);
314
+ }
315
+ }
316
+
317
+ return 0;
318
+ }
@@ -0,0 +1,52 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ // Compilation
22
+ #include "globals.h"
23
+
24
+ // Utilities
25
+ #include "graphStats.h"
26
+ #include "utility.h"
27
+
28
+ // Datastructures
29
+ #include "kmer.h"
30
+ #include "readSet.h"
31
+ #include "tightString.h"
32
+ #include "roadMap.h"
33
+ #include "splayTable.h"
34
+ #include "graph.h"
35
+ #include "scaffold.h"
36
+ #include "binarySequences.h"
37
+
38
+ // PreGraph operations
39
+ #include "preGraph.h"
40
+ #include "preGraphConstruction.h"
41
+ #include "concatenatedPreGraph.h"
42
+
43
+ // Graph operations
44
+ #include "graph.h"
45
+ #include "graphReConstruction.h"
46
+ #include "concatenatedGraph.h"
47
+ #include "correctedGraph.h"
48
+ #include "locallyCorrectedGraph.h"
49
+
50
+ // Repeat resolution
51
+ #include "readCoherentGraph.h"
52
+ #include "shortReadPairs.h"
@@ -0,0 +1,744 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <unistd.h>
25
+
26
+ #include "run.h"
27
+
28
+ #include "binarySequences.h"
29
+ #include "globals.h"
30
+ #include "readToNode.h"
31
+
32
/*
 * Write the velvetg help text to standard output.
 *
 * The text is kept as a table of lines; each entry is emitted with puts(),
 * which appends the trailing newline.
 */
static void printUsage()
{
	static const char *const usageLines[] = {
		"Usage:",
		"./velvetg directory [options]",
		"",
		"\tdirectory\t\t\t: working directory name",
		"",
		"Standard options:",
		"\t-cov_cutoff <floating-point|auto>\t: removal of low coverage nodes AFTER tour bus or allow the system to infer it",
		"\t\t(default: no removal)",
		"\t-ins_length <integer>\t\t: expected distance between two paired end reads (default: no read pairing)",
		"\t-read_trkg <yes|no>\t\t: tracking of short read positions in assembly (default: no tracking)",
		"\t-min_contig_lgth <integer>\t: minimum contig length exported to contigs.fa file (default: hash length * 2)",
		"\t-amos_file <yes|no>\t\t: export assembly to AMOS file (default: no export)",
		"\t-exp_cov <floating point|auto>\t: expected coverage of unique regions or allow the system to infer it",
		"\t\t(default: no long or paired-end read resolution)",
		"\t-long_cov_cutoff <floating-point>: removal of nodes with low long-read coverage AFTER tour bus",
		"\t\t(default: no removal)",
		"",
		"Advanced options:",
		"\t-ins_length* <integer>\t\t: expected distance between two paired-end reads in the respective short-read dataset (default: no read pairing)",
		"\t-ins_length_long <integer>\t: expected distance between two long paired-end reads (default: no read pairing)",
		"\t-ins_length*_sd <integer>\t: est. standard deviation of respective dataset (default: 10% of corresponding length)",
		"\t\t[replace '*' by nothing, '2' or '_long' as necessary]",
		"\t-scaffolding <yes|no>\t\t: scaffolding of contigs used paired end information (default: on)",
		"\t-max_branch_length <integer>\t: maximum length in base pair of bubble (default: 100)",
		"\t-max_divergence <floating-point>: maximum divergence rate between two branches in a bubble (default: 0.2)",
		"\t-max_gap_count <integer>\t: maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3)",
		"\t-min_pair_count <integer>\t: minimum number of paired end connections to justify the scaffolding of two long contigs (default: 5)",
		"\t-max_coverage <floating point>\t: removal of high coverage nodes AFTER tour bus (default: no removal)",
		"\t-coverage_mask <int>\t: minimum coverage required for confident regions of contigs (default: 1)",
		"\t-long_mult_cutoff <int>\t\t: minimum number of long reads required to merge contigs (default: 2)",
		"\t-unused_reads <yes|no>\t\t: export unused reads in UnusedReads.fa file (default: no)",
		"\t-alignments <yes|no>\t\t: export a summary of contig alignment to the reference sequences (default: no)",
		"\t-exportFiltered <yes|no>\t: export the long nodes which were eliminated by the coverage filters (default: no)",
		"\t-clean <yes|no>\t\t\t: remove all the intermediary files which are useless for recalculation (default : no)",
		"\t-very_clean <yes|no>\t\t: remove all the intermediary files (no recalculation possible) (default: no)",
		"\t-paired_exp_fraction <double>\t: remove all the paired end connections which less than the specified fraction of the expected count (default: 0.1)",
		"\t-shortMatePaired* <yes|no>\t: for mate-pair libraries, indicate that the library might be contaminated with paired-end reads (default no)",
		"\t-conserveLong <yes|no>\t\t: preserve sequences with long reads in them (default no)",
		"\t-tour_bus <yes|no>\t\t: apply the tour bus algorithm (default yes)",
		"\t-read_to_node_binary <yes|no>\t: create binary ReadToNode file for looking up which nodes contain each read (default: no)",
		"",
		"Output:",
		"\tdirectory/contigs.fa\t\t: fasta file of contigs longer than twice hash length",
		"\tdirectory/stats.txt\t\t: stats file (tab-spaced) useful for determining appropriate coverage cutoff",
		"\tdirectory/LastGraph\t\t: special formatted file with all the information on the final graph",
		"\tdirectory/velvet_asm.afg\t: (if requested) AMOS compatible assembly file"
	};
	const size_t lineCount = sizeof(usageLines) / sizeof(usageLines[0]);
	size_t idx;

	for (idx = 0; idx < lineCount; idx++)
		puts(usageLines[idx]);
}
81
+
82
+ int main(int argc, char **argv)
83
+ {
84
+ ReadSet *sequences = NULL;
85
+ RoadMapArray *rdmaps;
86
+ PreGraph *preGraph;
87
+ Graph *graph;
88
+ char *directory, *graphFilename, *connectedGraphFilename,
89
+ *preGraphFilename, *seqFilename, *roadmapFilename,
90
+ *lowCovContigsFilename, *highCovContigsFilename;
91
+ double coverageCutoff = -1;
92
+ boolean doTourBus = true;
93
+ boolean createReadToNodeFile = false;
94
+ double longCoverageCutoff = -1;
95
+ double maxCoverageCutoff = -1;
96
+ double expectedCoverage = -1;
97
+ Coordinate minContigLength = -1;
98
+ Coordinate minContigKmerLength;
99
+ boolean *dubious = NULL;
100
+ Coordinate insertLength[CATEGORIES];
101
+ Coordinate insertLengthLong = -1;
102
+ Coordinate std_dev[CATEGORIES];
103
+ Coordinate std_dev_long = -1;
104
+ short int accelerationBits = 24;
105
+ boolean readTracking = false;
106
+ boolean exportAssembly = false;
107
+ boolean unusedReads = false;
108
+ boolean estimateCoverage = false;
109
+ boolean estimateCutoff = false;
110
+ boolean exportAlignments = false;
111
+ FILE *file;
112
+ int arg_index, arg_int;
113
+ double arg_double;
114
+ char *arg;
115
+ ShortLength *sequenceLengths = NULL;
116
+ Category cat;
117
+ boolean scaffolding = true;
118
+ int pebbleRounds = 1;
119
+ long long longlong_var;
120
+ short int short_var;
121
+ boolean exportFilteredNodes = false;
122
+ int clean = 0;
123
+ boolean conserveLong = false;
124
+ boolean shadows[CATEGORIES];
125
+ int coverageMask = 1;
126
+ SequencesReader *seqReadInfo = NULL;
127
+
128
+ setProgramName("velvetg");
129
+
130
+ for (cat = 0; cat < CATEGORIES; cat++) {
131
+ insertLength[cat] = -1;
132
+ std_dev[cat] = -1;
133
+ shadows[cat] = false;
134
+ }
135
+
136
+ // Error message
137
+ if (argc == 1) {
138
+ puts("velvetg - de Bruijn graph construction, error removal and repeat resolution");
139
+ printf("Version %i.%i.%2.2i%s\n", VERSION_NUMBER,
140
+ RELEASE_NUMBER, UPDATE_NUMBER, VERSION_BRANCH);
141
+ puts("Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)");
142
+ puts("This is free software; see the source for copying conditions. There is NO");
143
+ puts("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.");
144
+ puts("Compilation settings:");
145
+ printf("CATEGORIES = %i\n", CATEGORIES);
146
+ printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
147
+ #ifdef _OPENMP
148
+ puts("OPENMP");
149
+ #endif
150
+ #ifdef LONGSEQUENCES
151
+ puts("LONGSEQUENCES");
152
+ #endif
153
+ #ifdef BIGASSEMBLY
154
+ puts("BIGASSEMBLY");
155
+ #endif
156
+ #ifdef COLOR
157
+ puts("COLOR");
158
+ #endif
159
+ #ifdef DEBUG
160
+ puts("DEBUG");
161
+ #endif
162
+ puts("");
163
+ printUsage();
164
+ return 1;
165
+ }
166
+
167
+ if (strcmp(argv[1], "--help") == 0) {
168
+ printUsage();
169
+ return 0;
170
+ }
171
+
172
+ // Memory allocation
173
+ directory = argv[1];
174
+ graphFilename = mallocOrExit(strlen(directory) + 100, char);
175
+ connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char);
176
+ preGraphFilename =
177
+ mallocOrExit(strlen(directory) + 100, char);
178
+ roadmapFilename = mallocOrExit(strlen(directory) + 100, char);
179
+ seqFilename = mallocOrExit(strlen(directory) + 100, char);
180
+ lowCovContigsFilename = mallocOrExit(strlen(directory) + 100, char);
181
+ highCovContigsFilename = mallocOrExit(strlen(directory) + 100, char);
182
+
183
+ // Argument parsing
184
+ for (arg_index = 2; arg_index < argc; arg_index++) {
185
+ arg = argv[arg_index++];
186
+ if (arg_index >= argc) {
187
+ velvetLog("Unusual number of arguments!\n");
188
+ printUsage();
189
+ #ifdef DEBUG
190
+ abort();
191
+ #endif
192
+ exit(1);
193
+ }
194
+
195
+ if (strcmp(arg, "-cov_cutoff") == 0) {
196
+ if (strcmp(argv[arg_index], "auto") == 0) {
197
+ estimateCutoff = true;
198
+ } else {
199
+ sscanf(argv[arg_index], "%lf", &coverageCutoff);
200
+ }
201
+ } else if (strcmp(arg, "-long_cov_cutoff") == 0) {
202
+ sscanf(argv[arg_index], "%lf", &longCoverageCutoff);
203
+ } else if (strcmp(arg, "-exp_cov") == 0) {
204
+ if (strcmp(argv[arg_index], "auto") == 0) {
205
+ estimateCoverage = true;
206
+ readTracking = true;
207
+ } else {
208
+ sscanf(argv[arg_index], "%lf", &expectedCoverage);
209
+ if (expectedCoverage > 0)
210
+ readTracking = true;
211
+ }
212
+ } else if (strcmp(arg, "-ins_length") == 0) {
213
+ sscanf(argv[arg_index], "%lli", &longlong_var);
214
+ insertLength[0] = (Coordinate) longlong_var;
215
+ if (insertLength[0] < 0) {
216
+ velvetLog("Invalid insert length: %lli\n",
217
+ (long long) insertLength[0]);
218
+ #ifdef DEBUG
219
+ abort();
220
+ #endif
221
+ exit(1);
222
+ }
223
+ } else if (strcmp(arg, "-ins_length_sd") == 0) {
224
+ sscanf(argv[arg_index], "%lli", &longlong_var);
225
+ std_dev[0] = (Coordinate) longlong_var;
226
+ if (std_dev[0] < 0) {
227
+ velvetLog("Invalid std deviation: %lli\n",
228
+ (long long) std_dev[0]);
229
+ #ifdef DEBUG
230
+ abort();
231
+ #endif
232
+ exit(1);
233
+ }
234
+ } else if (strcmp(arg, "-ins_length_long") == 0) {
235
+ sscanf(argv[arg_index], "%lli", &longlong_var);
236
+ insertLengthLong = (Coordinate) longlong_var;
237
+ } else if (strcmp(arg, "-ins_length_long_sd") == 0) {
238
+ sscanf(argv[arg_index], "%lli", &longlong_var);
239
+ std_dev_long = (Coordinate) longlong_var;
240
+ } else if (strncmp(arg, "-ins_length", 11) == 0
241
+ && strchr(arg, 'd') == NULL) {
242
+ sscanf(arg, "-ins_length%hi", &short_var);
243
+ cat = (Category) short_var;
244
+ if (cat < 1 || cat > CATEGORIES) {
245
+ velvetLog("Unknown option: %s\n", arg);
246
+ #ifdef DEBUG
247
+ abort();
248
+ #endif
249
+ exit(1);
250
+ }
251
+ sscanf(argv[arg_index], "%lli", &longlong_var);
252
+ insertLength[cat - 1] = (Coordinate) longlong_var;
253
+ if (insertLength[cat - 1] < 0) {
254
+ velvetLog("Invalid insert length: %lli\n",
255
+ (long long) insertLength[cat - 1]);
256
+ #ifdef DEBUG
257
+ abort();
258
+ #endif
259
+ exit(1);
260
+ }
261
+ } else if (strncmp(arg, "-ins_length", 11) == 0) {
262
+ sscanf(arg, "-ins_length%hi_sd", &short_var);
263
+ cat = (Category) short_var;
264
+ if (cat < 1 || cat > CATEGORIES) {
265
+ velvetLog("Unknown option: %s\n", arg);
266
+ #ifdef DEBUG
267
+ abort();
268
+ #endif
269
+ exit(1);
270
+ }
271
+ sscanf(argv[arg_index], "%lli", &longlong_var);
272
+ std_dev[cat - 1] = (Coordinate) longlong_var;
273
+ if (std_dev[cat - 1] < 0) {
274
+ velvetLog("Invalid std deviation: %lli\n",
275
+ (long long) std_dev[cat - 1]);
276
+ #ifdef DEBUG
277
+ abort();
278
+ #endif
279
+ exit(1);
280
+ }
281
+ } else if (strcmp(arg, "-read_trkg") == 0) {
282
+ readTracking =
283
+ (strcmp(argv[arg_index], "yes") == 0);
284
+ } else if (strcmp(arg, "-scaffolding") == 0) {
285
+ scaffolding =
286
+ (strcmp(argv[arg_index], "yes") == 0);
287
+ } else if (strcmp(arg, "-exportFiltered") == 0) {
288
+ exportFilteredNodes =
289
+ (strcmp(argv[arg_index], "yes") == 0);
290
+ } else if (strcmp(arg, "-amos_file") == 0) {
291
+ exportAssembly =
292
+ (strcmp(argv[arg_index], "yes") == 0);
293
+ } else if (strcmp(arg, "-alignments") == 0) {
294
+ exportAlignments =
295
+ (strcmp(argv[arg_index], "yes") == 0);
296
+ } else if (strcmp(arg, "-min_contig_lgth") == 0) {
297
+ sscanf(argv[arg_index], "%lli", &longlong_var);
298
+ minContigLength = (Coordinate) longlong_var;
299
+ } else if (strcmp(arg, "-coverage_mask") == 0) {
300
+ sscanf(argv[arg_index], "%lli", &longlong_var);
301
+ coverageMask = (IDnum) longlong_var;
302
+ } else if (strcmp(arg, "-accel_bits") == 0) {
303
+ sscanf(argv[arg_index], "%hi", &accelerationBits);
304
+ if (accelerationBits < 0) {
305
+ velvetLog
306
+ ("Illegal acceleration parameter: %s\n",
307
+ argv[arg_index]);
308
+ printUsage();
309
+ return -1;
310
+ }
311
+ } else if (strcmp(arg, "-max_branch_length") == 0) {
312
+ sscanf(argv[arg_index], "%i", &arg_int);
313
+ setMaxReadLength(arg_int);
314
+ setLocalMaxReadLength(arg_int);
315
+ } else if (strcmp(arg, "-max_divergence") == 0) {
316
+ sscanf(argv[arg_index], "%lf", &arg_double);
317
+ setMaxDivergence(arg_double);
318
+ setLocalMaxDivergence(arg_double);
319
+ } else if (strcmp(arg, "-max_gap_count") == 0) {
320
+ sscanf(argv[arg_index], "%i", &arg_int);
321
+ setMaxGaps(arg_int);
322
+ setLocalMaxGaps(arg_int);
323
+ } else if (strcmp(arg, "-min_pair_count") == 0) {
324
+ sscanf(argv[arg_index], "%i", &arg_int);
325
+ setUnreliableConnectionCutoff(arg_int);
326
+ } else if (strcmp(arg, "-max_coverage") == 0) {
327
+ sscanf(argv[arg_index], "%lf", &maxCoverageCutoff);
328
+ } else if (strcmp(arg, "-long_mult_cutoff") == 0) {
329
+ sscanf(argv[arg_index], "%i", &arg_int);
330
+ setMultiplicityCutoff(arg_int);
331
+ } else if (strcmp(arg, "-paired_exp_fraction") == 0) {
332
+ sscanf(argv[arg_index], "%lf", &arg_double);
333
+ setPairedExpFraction(arg_double);
334
+ } else if (strcmp(arg, "-clean") == 0) {
335
+ if (strcmp(argv[arg_index], "yes") == 0)
336
+ clean = 1;
337
+ } else if (strcmp(arg, "-very_clean") == 0) {
338
+ if (strcmp(argv[arg_index], "yes") == 0)
339
+ clean = 2;
340
+ } else if (strcmp(arg, "-conserveLong") == 0) {
341
+ if (strcmp(argv[arg_index], "yes") == 0)
342
+ conserveLong = 2;
343
+ } else if (strcmp(arg, "-unused_reads") == 0) {
344
+ unusedReads =
345
+ (strcmp(argv[arg_index], "yes") == 0);
346
+ if (unusedReads)
347
+ readTracking = true;
348
+ } else if (strcmp(arg, "-shortMatePaired") == 0) {
349
+ shadows[0] = (strcmp(argv[arg_index], "yes") == 0);
350
+ } else if (strncmp(arg, "-shortMatePaired", 16) == 0) {
351
+ sscanf(arg, "-shortMatePaired%hi", &short_var);
352
+ cat = (Category) short_var;
353
+ if (cat < 1 || cat > CATEGORIES) {
354
+ velvetLog("Unknown option: %s\n", arg);
355
+ #ifdef DEBUG
356
+ abort();
357
+ #endif
358
+ exit(1);
359
+ }
360
+ shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0);
361
+ } else if (strcmp(arg,"-tour_bus") == 0){
362
+ if (strcmp(argv[arg_index], "no") == 0)
363
+ doTourBus = false;
364
+ } else if (strcmp(arg, "-read_to_node_binary") == 0){
365
+ createReadToNodeFile =
366
+ (strcmp(argv[arg_index], "yes") == 0);
367
+ } else if (strcmp(arg, "--help") == 0) {
368
+ printUsage();
369
+ return 0;
370
+ } else {
371
+ velvetLog("Unknown option: %s;\n", arg);
372
+ printUsage();
373
+ return 1;
374
+ }
375
+ }
376
+
377
+ // Bookkeeping
378
+ logInstructions(argc, argv, directory);
379
+
380
+ seqReadInfo = callocOrExit(1, SequencesReader);
381
+ strcpy(seqFilename, directory);
382
+ // if binary CnyUnifiedSeq exists, use it. Otherwise try Sequences
383
+ strcat(seqFilename, "/CnyUnifiedSeq");
384
+ if (access(seqFilename, R_OK) == 0) {
385
+ seqReadInfo->m_bIsBinary = true;
386
+ } else {
387
+ seqReadInfo->m_bIsBinary = false;
388
+ strcpy(seqFilename, directory);
389
+ strcat(seqFilename, "/Sequences");
390
+ }
391
+ seqReadInfo->m_seqFilename = seqFilename;
392
+ strcpy(roadmapFilename, directory);
393
+ strcat(roadmapFilename, "/Roadmaps");
394
+
395
+ strcpy(preGraphFilename, directory);
396
+ strcat(preGraphFilename, "/PreGraph");
397
+
398
+ strcpy(connectedGraphFilename, directory);
399
+ strcat(connectedGraphFilename, "/ConnectedGraph");
400
+
401
+ if (!readTracking) {
402
+ strcpy(graphFilename, directory);
403
+ strcat(graphFilename, "/Graph");
404
+ } else {
405
+ strcpy(graphFilename, directory);
406
+ strcat(graphFilename, "/Graph2");
407
+ }
408
+
409
+ strcpy(lowCovContigsFilename, directory);
410
+ strcat(lowCovContigsFilename, "/lowCoverageContigs.fa");
411
+
412
+ strcpy(highCovContigsFilename, directory);
413
+ strcat(highCovContigsFilename, "/highCoverageContigs.fa");
414
+
415
+ // Check consistency of arguments
416
+ if (createReadToNodeFile && !readTracking){
417
+ velvetLog("When -create_read_to_node_binary is used, -read_trkg must also be used\n");
418
+ exit(1);
419
+ }
420
+
421
+ // Graph uploading or creation
422
+ if ((file = fopen(graphFilename, "r")) != NULL) {
423
+ fclose(file);
424
+
425
+ graph = importGraph(graphFilename);
426
+
427
+ } else if ((file = fopen(connectedGraphFilename, "r")) != NULL) {
428
+ fclose(file);
429
+ if (seqReadInfo->m_bIsBinary) {
430
+
431
+ sequences = importCnyReadSet(seqFilename);
432
+
433
+ #if 0
434
+ // compare to velvet's version of a seq
435
+ ReadSet *compareSequences = NULL;
436
+ compareSeqFilename = mallocOrExit(strlen(directory) + 100, char);
437
+ strcpy(compareSeqFilename, directory);
438
+ strcat(compareSeqFilename, "/Sequences");
439
+ compareSequences = importReadSet(compareSeqFilename);
440
+ convertSequences(compareSequences);
441
+ if (sequences->readCount != compareSequences->readCount) {
442
+ printf("read count mismatch\n");
443
+ exit(1);
444
+ }
445
+ int i;
446
+ for (i = 0; i < sequences->readCount; i++) {
447
+ TightString *tString = getTightStringInArray(sequences->tSequences, i);
448
+ TightString *tStringCmp = getTightStringInArray(compareSequences->tSequences, i);
449
+ if (getLength(tString) != getLength(tStringCmp)) {
450
+ printf("sequence %d len mismatch\n", i);
451
+ exit(1);
452
+ }
453
+ if (strcmp(readTightString(tString), readTightString(tStringCmp)) != 0) {
454
+ printf("sequence %d cmp mismatch\n", i);
455
+ printf("seq %s != cmp %s\n", readTightString(tString), readTightString(tStringCmp));
456
+ exit(1);
457
+ }
458
+ }
459
+ #endif
460
+ } else {
461
+ sequences = importReadSet(seqFilename);
462
+ convertSequences(sequences);
463
+ }
464
+ seqReadInfo->m_sequences = sequences;
465
+
466
+ graph =
467
+ importConnectedGraph(connectedGraphFilename, sequences,
468
+ roadmapFilename, readTracking, accelerationBits);
469
+
470
+ sequenceLengths =
471
+ getSequenceLengths(sequences, getWordLength(graph));
472
+ if (doTourBus)
473
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
474
+ exportGraph(graphFilename, graph, sequences->tSequences);
475
+ } else if ((file = fopen(preGraphFilename, "r")) != NULL) {
476
+ fclose(file);
477
+ if (seqReadInfo->m_bIsBinary) {
478
+ sequences = importCnyReadSet(seqFilename);
479
+ } else {
480
+ sequences = importReadSet(seqFilename);
481
+ convertSequences(sequences);
482
+ }
483
+ seqReadInfo->m_sequences = sequences;
484
+ graph =
485
+ importPreGraph(preGraphFilename, sequences,
486
+ roadmapFilename, readTracking, accelerationBits);
487
+ sequenceLengths =
488
+ getSequenceLengths(sequences, getWordLength(graph));
489
+ if (doTourBus)
490
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
491
+ exportGraph(graphFilename, graph, sequences->tSequences);
492
+ } else if ((file = fopen(roadmapFilename, "r")) != NULL) {
493
+ fclose(file);
494
+
495
+ rdmaps = importRoadMapArray(roadmapFilename);
496
+ if (seqReadInfo->m_bIsBinary) {
497
+ // pull in sequences first and use in preGraph
498
+ sequences = importCnyReadSet(seqFilename);
499
+ seqReadInfo->m_sequences = sequences;
500
+ #if 0
501
+ // compare to velvet's version of a seq
502
+ ReadSet *compareSequences = NULL;
503
+ char *compareSeqFilename = mallocOrExit(strlen(directory) + 100, char);
504
+ strcpy(compareSeqFilename, directory);
505
+ strcat(compareSeqFilename, "/Sequences");
506
+ compareSequences = importReadSet(compareSeqFilename);
507
+ convertSequences(compareSequences);
508
+ if (sequences->readCount != compareSequences->readCount) {
509
+ printf("read count mismatch\n");
510
+ exit(1);
511
+ }
512
+ int i;
513
+ for (i = 0; i < sequences->readCount; i++) {
514
+ TightString *tString = getTightStringInArray(sequences->tSequences, i);
515
+ TightString *tStringCmp = getTightStringInArray(compareSequences->tSequences, i);
516
+ if (getLength(tString) != getLength(tStringCmp)) {
517
+ printf("sequence %d len mismatch\n", i);
518
+ exit(1);
519
+ }
520
+ if (strcmp(readTightString(tString), readTightString(tStringCmp)) != 0) {
521
+ printf("sequence %d cmp mismatch\n", i);
522
+ printf("seq %s != cmp %s\n", readTightString(tString), readTightString(tStringCmp));
523
+ exit(1);
524
+ }
525
+ }
526
+ printf("sequence files match!\n");
527
+ #endif
528
+ }
529
+ preGraph = newPreGraph_pg(rdmaps, seqReadInfo);
530
+ concatenatePreGraph_pg(preGraph);
531
+ if (!conserveLong)
532
+ clipTips_pg(preGraph);
533
+ exportPreGraph_pg(preGraphFilename, preGraph);
534
+ destroyPreGraph_pg(preGraph);
535
+ if (!seqReadInfo->m_bIsBinary) {
536
+ sequences = importReadSet(seqFilename);
537
+ convertSequences(sequences);
538
+ seqReadInfo->m_sequences = sequences;
539
+ }
540
+ graph =
541
+ importPreGraph(preGraphFilename, sequences,
542
+ roadmapFilename, readTracking, accelerationBits);
543
+ sequenceLengths =
544
+ getSequenceLengths(sequences, getWordLength(graph));
545
+ if (doTourBus)
546
+ correctGraph(graph, sequenceLengths, sequences->categories, conserveLong);
547
+ exportGraph(graphFilename, graph, sequences->tSequences);
548
+ } else {
549
+ velvetLog("No Roadmap file to build upon! Please run velveth (see manual)\n");
550
+ #ifdef DEBUG
551
+ abort();
552
+ #endif
553
+ exit(1);
554
+ }
555
+
556
+ // Set insert lengths and their standard deviations
557
+ for (cat = 0; cat < CATEGORIES; cat++) {
558
+ if (insertLength[cat] > -1 && std_dev[cat] < 0)
559
+ std_dev[cat] = insertLength[cat] / 10;
560
+ setInsertLengths(graph, cat,
561
+ insertLength[cat], std_dev[cat]);
562
+ }
563
+
564
+ if (insertLengthLong > -1 && std_dev_long < 0)
565
+ std_dev_long = insertLengthLong / 10;
566
+ setInsertLengths(graph, CATEGORIES,
567
+ insertLengthLong, std_dev_long);
568
+
569
+ // Coverage cutoff
570
+ if (expectedCoverage < 0 && estimateCoverage == true) {
571
+ expectedCoverage = estimated_cov(graph, directory);
572
+ if (coverageCutoff < 0) {
573
+ coverageCutoff = expectedCoverage / 2;
574
+ estimateCutoff = true;
575
+ }
576
+ } else {
577
+ estimateCoverage = false;
578
+ if (coverageCutoff < 0 && estimateCutoff)
579
+ coverageCutoff = estimated_cov(graph, directory) / 2;
580
+ else
581
+ estimateCutoff = false;
582
+ }
583
+
584
+ if (coverageCutoff < 0) {
585
+ velvetLog("WARNING: NO COVERAGE CUTOFF PROVIDED\n");
586
+ velvetLog("Velvet will probably leave behind many detectable errors\n");
587
+ velvetLog("See manual for instructions on how to set the coverage cutoff parameter\n");
588
+ }
589
+
590
+ if (sequences == NULL) {
591
+ if (seqReadInfo->m_bIsBinary) {
592
+ sequences = importCnyReadSet(seqFilename);
593
+ } else {
594
+ sequences = importReadSet(seqFilename);
595
+ convertSequences(sequences);
596
+ }
597
+ seqReadInfo->m_sequences = sequences;
598
+ }
599
+
600
+ if (minContigLength < 2 * getWordLength(graph))
601
+ minContigKmerLength = getWordLength(graph);
602
+ else
603
+ minContigKmerLength = minContigLength - getWordLength(graph) + 1;
604
+
605
+ // Check
606
+
607
+ dubious =
608
+ removeLowCoverageNodesAndDenounceDubiousReads(graph,
609
+ coverageCutoff,
610
+ sequences,
611
+ exportFilteredNodes,
612
+ minContigKmerLength,
613
+ lowCovContigsFilename);
614
+
615
+ removeLowLongCoverageNodesAndDenounceDubiousReads(graph,
616
+ longCoverageCutoff,
617
+ sequences,
618
+ dubious,
619
+ exportFilteredNodes,
620
+ minContigKmerLength,
621
+ lowCovContigsFilename);
622
+
623
+ removeHighCoverageNodes(graph, maxCoverageCutoff, exportFilteredNodes, minContigKmerLength, highCovContigsFilename);
624
+ clipTipsHard(graph, conserveLong);
625
+
626
+ if (sequences->readCount > 0 && sequences->categories[0] == REFERENCE)
627
+ removeLowArcs(graph, coverageCutoff);
628
+
629
+ if (expectedCoverage > 0) {
630
+
631
+ // Mixed length sequencing
632
+ readCoherentGraph(graph, isUniqueSolexa, expectedCoverage,
633
+ sequences);
634
+
635
+ // Paired end resolution
636
+ createReadPairingArray(sequences);
637
+ pebbleRounds += pairedCategories(sequences);
638
+ detachDubiousReads(sequences, dubious);
639
+ activateGapMarkers(graph);
640
+
641
+ for ( ;pebbleRounds > 0; pebbleRounds--)
642
+ exploitShortReadPairs(graph, sequences, dubious, shadows, scaffolding);
643
+
644
+ } else {
645
+ velvetLog("WARNING: NO EXPECTED COVERAGE PROVIDED\n");
646
+ velvetLog("Velvet will be unable to resolve any repeats\n");
647
+ velvetLog("See manual for instructions on how to set the expected coverage parameter\n");
648
+ }
649
+
650
+ if (dubious)
651
+ free(dubious);
652
+
653
+ concatenateGraph(graph);
654
+
655
+ removeLowCoverageReferenceNodes(graph, coverageCutoff, longCoverageCutoff, sequences);
656
+
657
+ strcpy(graphFilename, directory);
658
+ strcat(graphFilename, "/contigs.fa");
659
+ sequenceLengths = getSequenceLengths(sequences, getWordLength(graph));
660
+ exportLongNodeSequences(graphFilename, graph, minContigKmerLength, sequences, sequenceLengths, coverageMask);
661
+
662
+ if (exportAlignments) {
663
+ strcpy(graphFilename, directory);
664
+ strcat(graphFilename, "/contig-alignments.psa");
665
+ exportLongNodeMappings(graphFilename, graph, sequences,
666
+ minContigKmerLength, seqReadInfo);
667
+ }
668
+
669
+ strcpy(graphFilename, directory);
670
+ strcat(graphFilename, "/stats.txt");
671
+ displayGeneralStatistics(graph, graphFilename, sequences);
672
+
673
+ if (clean == 0) {
674
+ strcpy(graphFilename, directory);
675
+ strcat(graphFilename, "/LastGraph");
676
+ exportGraph(graphFilename, graph, sequences->tSequences);
677
+ }
678
+
679
+ if (exportAssembly) {
680
+ strcpy(graphFilename, directory);
681
+ strcat(graphFilename, "/velvet_asm.afg");
682
+ exportAMOSContigs(graphFilename, graph, minContigKmerLength, sequences);
683
+ }
684
+
685
+ if (unusedReads)
686
+ exportUnusedReads(graph, sequences, minContigKmerLength, directory);
687
+
688
+ if (estimateCoverage)
689
+ velvetLog("Estimated Coverage = %f\n", expectedCoverage);
690
+ if (estimateCutoff)
691
+ velvetLog("Estimated Coverage cutoff = %f\n", coverageCutoff);
692
+
693
+ if (createReadToNodeFile) {
694
+ velvetLog("Creating and writing ReadToNode binary file..\n");
695
+ strcpy(graphFilename, directory);
696
+ strcat(graphFilename, "/ReadToNode.bin");
697
+ ReadIdToNodeIdLookupTable* lookupTable = createReadToNode(graph);
698
+ writeReadIdToNodeIdLookupTable(graphFilename, lookupTable);
699
+ destroyReadIdToNodeIdLookupTable(lookupTable);
700
+ velvetLog("Finished writing ReadToNode binary file\n");
701
+ }
702
+
703
+ logFinalStats(graph, minContigKmerLength, directory);
704
+
705
+ if (clean > 0) {
706
+ strcpy(graphFilename, directory);
707
+ strcat(graphFilename, "/Roadmaps");
708
+ remove(graphFilename);
709
+
710
+ strcpy(graphFilename, directory);
711
+ strcat(graphFilename, "/LastGraph");
712
+ remove(graphFilename);
713
+ }
714
+
715
+ if (clean > 1) {
716
+ strcpy(graphFilename, directory);
717
+ strcat(graphFilename, "/Sequences");
718
+ remove(graphFilename);
719
+
720
+ strcpy(graphFilename, directory);
721
+ strcat(graphFilename, "/Graph2");
722
+ remove(graphFilename);
723
+
724
+ strcpy(graphFilename, directory);
725
+ strcat(graphFilename, "/Graph");
726
+ remove(graphFilename);
727
+ }
728
+
729
+ free(sequenceLengths);
730
+ destroyGraph(graph);
731
+ free(graphFilename);
732
+ free(connectedGraphFilename);
733
+ free(preGraphFilename);
734
+ free(seqFilename);
735
+ free(roadmapFilename);
736
+ free(lowCovContigsFilename);
737
+ free(highCovContigsFilename);
738
+ destroyReadSet(sequences);
739
+ if (seqReadInfo) {
740
+ free(seqReadInfo);
741
+ }
742
+
743
+ return 0;
744
+ }