finishm 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (554)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,233 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _GRAPH_H_
22
+ #define _GRAPH_H_
23
+
24
+ ////////////////////////////////////////////////////////////
25
+ // Node functions
26
+ ////////////////////////////////////////////////////////////
27
+
28
+ //Creators/destructor
29
+ Node *newNode(IDnum sequenceID, Coordinate start, Coordinate finish,
30
+ Coordinate offset, IDnum ID, TightString * sequences,
31
+ int WORDLENGTH);
32
+ Node *emptyNode();
33
+ void destroyNode(Node * node, Graph * graph);
34
+
35
+ // Locator
36
+ IDnum getNodeID(Node * node);
37
+ Node *getNodeInGraph(Graph * graph, IDnum nodeID);
38
+ Node *getTwinNode(Node * node);
39
+
40
+ // Arc info
41
+ int arcCount(Node * node);
42
+ int simpleArcCount(Node * node);
43
+ Arc *getArc(Node * node);
44
+ boolean hasSingleArc(Node * node);
45
+
46
+ // Descriptor
47
+ Coordinate getNodeLength(Node * node);
48
+ void appendDescriptors(Node * target, Node * source);
49
+ void directlyAppendDescriptors(Node * target, Node * sourcei, Coordinate totalLength);
50
+ void appendSequence(Node * node, TightString * reads,
51
+ PassageMarkerI guide, Graph * graph);
52
+ void splitNodeDescriptor(Node * source, Node * target, Coordinate offset);
53
+ void reduceNode(Node * node);
54
+ void reallocateNodeDescriptor(Node * node, Coordinate length);
55
+ Nucleotide getNucleotideInNode(Node * node, Coordinate index);
56
+
57
+ // Passage markers
58
+ void insertPassageMarker(PassageMarkerI marker, Node * destination);
59
+ PassageMarkerI getMarker(Node * node);
60
+ void setMarker(Node * node, PassageMarkerI marker);
61
+ IDnum markerCount(Node * node);
62
+
63
+ // Short read marker creation
64
+ void incrementReadStartCount(Node * node, Graph * graph);
65
+ void addReadStart(Node * node, IDnum seqID, Coordinate position,
66
+ Graph * graph, Coordinate offset);
67
+ void blurLastShortReadMarker(Node * node, Graph * graph);
68
+
69
+ // Short read marker handling
70
+ ShortReadMarker *getNodeReads(Node * node, Graph * graph);
71
+ IDnum getNodeReadCount(Node * node, Graph * graph);
72
+ ShortReadMarker *commonNodeReads(Node * nodeA, Node * nodeB, Graph * graph,
73
+ IDnum * length);
74
+ ShortReadMarker *extractBackOfNodeReads(Node * node, Coordinate breakpoint,
75
+ Graph * graph, IDnum * length,
76
+ PassageMarkerI sourceMarker,
77
+ ShortLength * sequenceLengths);
78
+ ShortReadMarker *extractFrontOfNodeReads(Node * node,
79
+ Coordinate breakpoint,
80
+ Graph * graph, IDnum * length,
81
+ PassageMarkerI sourceMarker,
82
+ ShortLength * sequenceLengths);
83
+
84
+ // Short read marker moving around
85
+ void foldSymmetricalNodeReads(Node * node, Graph * graph);
86
+ void spreadReadIDs(ShortReadMarker * reads, IDnum readCount, Node * node,
87
+ Graph * graph);
88
+ void injectShortReads(ShortReadMarker * sourceArray, IDnum sourceLength,
89
+ Node * target, Graph * graph);
90
+ void mergeNodeReads(Node * target, Node * source, Graph * graph);
91
+
92
+ #ifndef SINGLE_COV_CAT
93
+ // Virtual coverage
94
+ void setVirtualCoverage(Node * node, Category category,
95
+ Coordinate coverage);
96
+ void incrementVirtualCoverage(Node * node, Category category,
97
+ Coordinate coverage);
98
+ Coordinate getVirtualCoverage(Node * node, Category category);
99
+ Coordinate getTotalCoverage(Node * node);
100
+
101
+ // Original virtual coverage
102
+ void setOriginalVirtualCoverage(Node * node, Category category,
103
+ Coordinate coverage);
104
+ void incrementOriginalVirtualCoverage(Node * node, Category category,
105
+ Coordinate coverage);
106
+ Coordinate getOriginalVirtualCoverage(Node * node, Category category);
107
+
108
+ #else
109
+ // Virtual coverage
110
+ void setVirtualCoverage(Node * node,
111
+ Coordinate coverage);
112
+ void incrementVirtualCoverage(Node * node,
113
+ Coordinate coverage);
114
+ Coordinate getVirtualCoverage(Node * node);
115
+ Coordinate getTotalCoverage(Node * node);
116
+ #endif
117
+
118
+ // Status
119
+ void setNodeStatus(Node * node, boolean status);
120
+ void setSingleNodeStatus(Node * node, boolean status);
121
+ boolean getNodeStatus(Node * node);
122
+
123
+ // Uniqueness
124
+ void setUniqueness(Node * node, boolean value);
125
+ boolean getUniqueness(Node * node);
126
+
127
+ // Gap markers
128
+ void appendGap(Node * node, Coordinate length, Graph * graph);
129
+ void appendNodeGaps(Node * destination, Node * source, Graph * graph);
130
+
131
+ // IO
132
+ TightString *expandNode(Node * node, int WORDLENGTH);
133
+ void appendNodeSequence(Node * node, TightString * sequence,
134
+ Coordinate writeIndex);
135
+ char *expandNodeFragment(Node * node, Coordinate contigStart,
136
+ Coordinate contigFinish, int WORDLENGTH);
137
+
138
+ ////////////////////////////////////////////////////////////
139
+ // Arc functions
140
+ ////////////////////////////////////////////////////////////
141
+
142
+ // Creators/destructor
143
+ Arc *createArc(Node * origin, Node * destination, Graph * graph);
144
+ void createAnalogousArc(Node * origin, Node * destination, Arc * refArc,
145
+ Graph * graph);
146
+ void destroyArc(Arc * arc, Graph * graph);
147
+
148
+ // Multiplicity
149
+ void setMultiplicity(Arc * arc, IDnum mult);
150
+ IDnum getMultiplicity(Arc * arc);
151
+
152
+ // Extremities
153
+ Node *getOrigin(Arc * arc);
154
+ Node *getDestination(Arc * arc);
155
+
156
+ // Finding arcs
157
+ Arc *getArcBetweenNodes(Node * origin, Node * destination, Graph * graph);
158
+ Arc *getNextArc(Arc * arc);
159
+
160
+ // Lookup table option
161
+ void activateArcLookupTable(Graph * graph);
162
+ void deactivateArcLookupTable(Graph * graph);
163
+
164
+ ////////////////////////////////////////////////////////////
165
+ // Short read marker functions
166
+ ////////////////////////////////////////////////////////////
167
+
168
+ ShortReadMarker *getShortReadMarkerAtIndex(ShortReadMarker * array,
169
+ IDnum index);
170
+
171
+ IDnum getShortReadMarkerID(ShortReadMarker * marker);
172
+
173
+ extern inline Coordinate getShortReadMarkerPosition(ShortReadMarker * marker);
174
+ extern inline void setShortReadMarkerPosition(ShortReadMarker * marker,
175
+ Coordinate position);
176
+
177
+ extern inline ShortLength getShortReadMarkerOffset(ShortReadMarker * marker);
178
+ extern inline void setShortReadMarkerOffset(ShortReadMarker * marker,
179
+ ShortLength offset);
180
+
181
+ ////////////////////////////////////////////////////////////
182
+ // Gap marker functions
183
+ ////////////////////////////////////////////////////////////
184
+
185
+ GapMarker *getGap(Node * node, Graph * graph);
186
+ GapMarker *getNextGap(GapMarker * marker);
187
+ Coordinate getGapStart(GapMarker * marker);
188
+ Coordinate getGapFinish(GapMarker * marker);
189
+
190
+ ////////////////////////////////////////////////////////////
191
+ // Graph functions
192
+ ////////////////////////////////////////////////////////////
193
+
194
+ // Memory allocation
195
+ Graph *emptyGraph(IDnum sequenceCount, int wordLength);
196
+ void allocateNodeSpace(Graph * graph, IDnum nodeCount);
197
+ Node *addEmptyNodeToGraph(Graph * graph, IDnum nodeID);
198
+ void destroyGraph(Graph * graph);
199
+
200
+ // Dimensions
201
+ IDnum nodeCount(Graph * graph);
202
+ IDnum sequenceCount(Graph * graph);
203
+ void renumberNodes(Graph * graph);
204
+ int getWordLength(Graph * graph);
205
+ boolean doubleStrandedGraph(Graph * graph);
206
+
207
+ // Element status
208
+ void resetNodeStatus(Graph * graph);
209
+
210
+ // File IO
211
+ Graph *importGraph(char *filename);
212
+ void exportGraph(char *filename, Graph * graph, TightString * sequences);
213
+ Graph *readPreGraphFile(char *preGraphFilename, boolean * double_strand);
214
+ Graph *readConnectedGraphFile(char *connectedGraphFilename, boolean * double_strand);
215
+
216
+ // Read starts
217
+ void activateReadStarts(Graph * graph);
218
+ boolean readStartsAreActivated(Graph * graph);
219
+ void createNodeReadStartArrays(Graph * graph);
220
+ void orderNodeReadStartArrays(Graph * graph);
221
+
222
+ // Insert lengths
223
+ void setInsertLengths(Graph * graph, Category cat, Coordinate insertLength,
224
+ Coordinate insertLength_std_dev);
225
+ Coordinate getInsertLength(Graph * graph, Category cat);
226
+ double getInsertLength_var(Graph * graph, Category cat);
227
+
228
+ // Gaps markers
229
+ void activateGapMarkers(Graph * graph);
230
+ void deactivateGapMarkers(Graph * graph);
231
+ void sortGapMarkers(Graph * graph);
232
+
233
+ #endif
@@ -0,0 +1,1472 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+
22
+ #include <stdlib.h>
23
+ #include <stdio.h>
24
+ #include <string.h>
25
+ #include <limits.h>
26
+ #include <sys/time.h>
27
+
28
+ #ifdef _OPENMP
29
+ #include <omp.h>
30
+ #endif
31
+
32
+ #include "globals.h"
33
+ #include "graph.h"
34
+ #include "passageMarker.h"
35
+ #include "readSet.h"
36
+ #include "tightString.h"
37
+ #include "recycleBin.h"
38
+ #include "utility.h"
39
+ #include "kmer.h"
40
+ #include "kmerOccurenceTable.h"
41
+ #include "roadMap.h"
42
+
43
+ #define ADENINE 0
44
+ #define CYTOSINE 1
45
+ #define GUANINE 2
46
+ #define THYMINE 3
47
+
48
+
49
+ //////////////////////////////////////////////////////////
50
+ // Node Locking
51
+ //////////////////////////////////////////////////////////
52
+
53
+ #ifdef _OPENMP
54
+
55
+ /* Array of per-node locks */
56
+
57
+ static omp_lock_t *nodeLocks = NULL;
58
+
59
+ static void
60
+ createNodeLocks(Graph *graph)
61
+ {
62
+ IDnum nbNodes;
63
+ IDnum nodeIndex;
64
+
65
+ nbNodes = nodeCount(graph) + 1;
66
+ if (nodeLocks)
67
+ free (nodeLocks);
68
+ nodeLocks = mallocOrExit(nbNodes, omp_lock_t);
69
+
70
+ #pragma omp parallel for
71
+ for (nodeIndex = 0; nodeIndex < nbNodes; nodeIndex++)
72
+ omp_init_lock(nodeLocks + nodeIndex);
73
+ }
74
+
75
+ static inline void lockNode(Node *node)
76
+ {
77
+ IDnum nodeID = getNodeID(node);
78
+
79
+ if (nodeID < 0)
80
+ nodeID = -nodeID;
81
+ omp_set_lock (nodeLocks + nodeID);
82
+ }
83
+
84
+ /* Assumes node is already locked */
85
+ static inline void lockTwoNodes(Node *node, Node *node2)
86
+ {
87
+ IDnum nodeID = getNodeID(node);
88
+ IDnum node2ID = getNodeID(node2);
89
+
90
+ if (nodeID < 0)
91
+ nodeID = -nodeID;
92
+ if (node2ID < 0)
93
+ node2ID = -node2ID;
94
+
95
+ if (nodeID == node2ID)
96
+ return;
97
+
98
+ /* Lock lowest ID first to avoid deadlocks */
99
+ if (nodeID < node2ID)
100
+ {
101
+ omp_set_lock (nodeLocks + node2ID);
102
+ }
103
+ else if (!omp_test_lock (nodeLocks + node2ID))
104
+ {
105
+ omp_unset_lock (nodeLocks + nodeID);
106
+ omp_set_lock (nodeLocks + node2ID);
107
+ omp_set_lock (nodeLocks + nodeID);
108
+ }
109
+ }
110
+
111
+ static inline void unLockTwoNodes(Node *node, Node *node2)
112
+ {
113
+ IDnum nodeID = getNodeID(node);
114
+ IDnum node2ID = getNodeID(node2);
115
+
116
+ if (nodeID < 0)
117
+ nodeID = -nodeID;
118
+ if (node2ID < 0)
119
+ node2ID = -node2ID;
120
+
121
+ omp_unset_lock (nodeLocks + nodeID);
122
+ if (nodeID != node2ID)
123
+ omp_unset_lock (nodeLocks + node2ID);
124
+ }
125
+
126
+ static inline void unLockNode(Node *node)
127
+ {
128
+ IDnum nodeID = getNodeID(node);
129
+
130
+ if (nodeID < 0)
131
+ nodeID = -nodeID;
132
+ omp_unset_lock (nodeLocks + nodeID);
133
+ }
134
+
135
+ #endif
136
+
137
+ //////////////////////////////////////////////////////////
138
+ // Node Lists
139
+ //////////////////////////////////////////////////////////
140
+ typedef struct smallNodeList_st SmallNodeList;
141
+
142
+ struct smallNodeList_st {
143
+ Node *node;
144
+ SmallNodeList *next;
145
+ } ATTRIBUTE_PACKED;
146
+
147
+ static RecycleBin *smallNodeListMemory = NULL;
148
+
149
+ #define BLOCKSIZE 1000
150
+
151
+ #ifdef _OPENMP
152
+ static void initSmallNodeListMemory(void)
153
+ {
154
+ int n = omp_get_max_threads();
155
+
156
+ #pragma omp critical
157
+ {
158
+ if (smallNodeListMemory == NULL)
159
+ smallNodeListMemory = newRecycleBinArray(n, sizeof(SmallNodeList), BLOCKSIZE);
160
+ }
161
+ }
162
+ #endif
163
+
164
+ static SmallNodeList *allocateSmallNodeList()
165
+ {
166
+ #ifdef _OPENMP
167
+ #ifdef DEBUG
168
+ if (smallNodeListMemory == NULL)
169
+ {
170
+ velvetLog("The memory for small nodes seems uninitialised, "
171
+ "this is probably a bug, aborting.\n");
172
+ abort();
173
+ }
174
+ #endif
175
+ return allocatePointer(getRecycleBinInArray(smallNodeListMemory,
176
+ omp_get_thread_num()));
177
+ #else
178
+ if (smallNodeListMemory == NULL)
179
+ smallNodeListMemory = newRecycleBin(sizeof(SmallNodeList), BLOCKSIZE);
180
+
181
+ return allocatePointer(smallNodeListMemory);
182
+ #endif
183
+ }
184
+
185
+ static void deallocateSmallNodeList(SmallNodeList * smallNodeList)
186
+ {
187
+ #ifdef _OPENMP
188
+ deallocatePointer(getRecycleBinInArray(smallNodeListMemory,
189
+ omp_get_thread_num()),
190
+ smallNodeList);
191
+ #else
192
+ deallocatePointer(smallNodeListMemory, smallNodeList);
193
+ #endif
194
+ }
195
+
196
+ static void destroySmallNodeListMemmory(void)
197
+ {
198
+ if (smallNodeListMemory != NULL)
199
+ {
200
+ #ifdef _OPENMP
201
+ destroyRecycleBinArray(smallNodeListMemory);
202
+ #else
203
+ destroyRecycleBin(smallNodeListMemory);
204
+ #endif
205
+ smallNodeListMemory = NULL;
206
+ }
207
+ }
208
+
209
+ static inline void memorizeNode(Node * node, SmallNodeList ** nodePile)
210
+ {
211
+ SmallNodeList *list = allocateSmallNodeList();
212
+ list->node = node;
213
+ list->next = *nodePile;
214
+ *nodePile = list;
215
+ #ifndef _OPENMP
216
+ setSingleNodeStatus(node, true);
217
+ #endif
218
+ }
219
+
220
+ static inline boolean isNodeMemorized(Node * node, SmallNodeList * nodePile)
221
+ {
222
+ #ifdef _OPENMP
223
+ /* SF TODO There must be a faster way to do this: bit mask, hash table, tree, ... ? */
224
+ SmallNodeList * list;
225
+
226
+ for (list = nodePile; list; list = list->next)
227
+ if (list->node == node)
228
+ return true;
229
+
230
+ return false;
231
+ #else
232
+ return getNodeStatus(node);
233
+ #endif
234
+ }
235
+
236
+ static void unMemorizeNodes(SmallNodeList ** nodePile)
237
+ {
238
+ SmallNodeList * list;
239
+
240
+ while (*nodePile) {
241
+ list = *nodePile;
242
+ *nodePile = list->next;
243
+ #ifndef _OPENMP
244
+ setSingleNodeStatus(list->node, false);
245
+ #endif
246
+ deallocateSmallNodeList(list);
247
+ }
248
+ }
249
+
250
+ ///////////////////////////////////////////////////////////
251
+ // Reference Mappings
252
+ ///////////////////////////////////////////////////////////
253
+ typedef struct referenceMapping_st ReferenceMapping;
254
+
255
+ struct referenceMapping_st {
256
+ IDnum referenceStart;
257
+ IDnum nodeStart;
258
+ IDnum length;
259
+ IDnum referenceID;
260
+ IDnum nodeID;
261
+ } ATTRIBUTE_PACKED;
262
+
263
+ static IDnum countMappings(char * preGraphFilename) {
264
+ FILE *file = fopen(preGraphFilename, "r");
265
+ const int maxline = MAXLINE;
266
+ char line[MAXLINE];
267
+ IDnum count = 0;
268
+
269
+ // Go past NODE blocks
270
+ while(fgets(line, maxline, file))
271
+ if (line[0] == 'S')
272
+ break;
273
+
274
+ // Count relevant lines
275
+ while(fgets(line, maxline, file))
276
+ if (line[0] != 'S')
277
+ count++;
278
+
279
+ fclose(file);
280
+ return count;
281
+ }
282
+
283
+ static ReferenceMapping * recordReferenceMappings(char * preGraphFilename, IDnum arrayLength) {
284
+ ReferenceMapping * mappings = callocOrExit(arrayLength, ReferenceMapping);
285
+ FILE *file = fopen(preGraphFilename, "r");
286
+ const int maxline = MAXLINE;
287
+ char line[MAXLINE];
288
+ ReferenceMapping * current = mappings;
289
+ IDnum referenceID;
290
+ long long_var;
291
+ long long coord1, coord2, coord3;
292
+
293
+ // Go past NODE blocks
294
+ while(fgets(line, maxline, file))
295
+ if (line[0] == 'S')
296
+ break;
297
+
298
+ sscanf(line, "SEQ\t%li\n", &long_var);
299
+ referenceID = long_var;
300
+
301
+ // Go relevant lines
302
+ while(fgets(line, maxline, file)) {
303
+ if (line[0] != 'S') {
304
+ sscanf(line, "%li\t%lli\t%lli\t%lli\n", &long_var, &coord1, &coord2, &coord3);
305
+ current->referenceID = referenceID;
306
+ current->nodeID = long_var;
307
+ current->nodeStart = coord1;
308
+ current->referenceStart = coord2;
309
+ current->length = coord3;
310
+ current++;
311
+ } else {
312
+ sscanf(line, "SEQ\t%li\n", &long_var);
313
+ referenceID = long_var;
314
+ }
315
+ }
316
+
317
+ fclose(file);
318
+ return mappings;
319
+ }
320
+
321
+ static int compareRefMaps(const void * ptrA, const void * ptrB) {
322
+ ReferenceMapping * A = (ReferenceMapping *) ptrA;
323
+ ReferenceMapping * B = (ReferenceMapping *) ptrB;
324
+
325
+ if (A->referenceID > B->referenceID)
326
+ return 1;
327
+ else if (A->referenceID < B->referenceID)
328
+ return -1;
329
+ else {
330
+ if (A->referenceStart >= B->referenceStart + B->length)
331
+ return 1;
332
+ else if (A->referenceStart + A->length <= B->referenceStart)
333
+ return -1;
334
+ else
335
+ return 0;
336
+ }
337
+ }
338
+
339
+ static ReferenceMapping * computeReferenceMappings(char * preGraphFilename, ReadSet * reads, Coordinate * referenceMappingLength, IDnum * referenceCount) {
340
+ IDnum index;
341
+ ReferenceMapping * referenceMappings;
342
+
343
+ for(index = 0; index < reads->readCount && reads->categories[index] == 2 * CATEGORIES + 2; index++)
344
+ (*referenceCount)++;
345
+
346
+ if (*referenceCount == 0) {
347
+ *referenceMappingLength = 0;
348
+ return NULL;
349
+ }
350
+
351
+ *referenceMappingLength = countMappings(preGraphFilename);
352
+
353
+ if (*referenceMappingLength == 0)
354
+ return NULL;
355
+
356
+ referenceMappings = recordReferenceMappings(preGraphFilename, *referenceMappingLength);
357
+ qsort(referenceMappings, *referenceMappingLength, sizeof(ReferenceMapping), compareRefMaps);
358
+
359
+ return referenceMappings;
360
+ }
361
+
362
+ static ReferenceMapping * findReferenceMapping(IDnum seqID, Coordinate refCoord, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount) {
363
+ IDnum positive_seqID;
364
+ Coordinate leftIndex = 0;
365
+ Coordinate rightIndex = referenceMappingCount - 1;
366
+ Coordinate middleIndex;
367
+ ReferenceMapping refMap;
368
+ int comparison;
369
+
370
+ if (seqID > 0)
371
+ positive_seqID = seqID;
372
+ else
373
+ positive_seqID = -seqID;
374
+
375
+ refMap.referenceID = positive_seqID;
376
+ refMap.referenceStart = refCoord;
377
+ refMap.length = 1;
378
+ refMap.nodeStart = 0;
379
+ refMap.nodeID = 0;
380
+
381
+ if (compareRefMaps(&(referenceMappings[leftIndex]), &refMap) == 0)
382
+ return &(referenceMappings[leftIndex]);
383
+ if (compareRefMaps(&(referenceMappings[rightIndex]), &refMap) == 0)
384
+ return &(referenceMappings[rightIndex]);
385
+
386
+ while (true) {
387
+ middleIndex = (rightIndex + leftIndex) / 2;
388
+ comparison = compareRefMaps(&(referenceMappings[middleIndex]), &refMap);
389
+
390
+ if (leftIndex >= rightIndex)
391
+ return NULL;
392
+ else if (comparison == 0)
393
+ return &(referenceMappings[middleIndex]);
394
+ else if (leftIndex == middleIndex)
395
+ return NULL;
396
+ else if (comparison > 0)
397
+ rightIndex = middleIndex;
398
+ else
399
+ leftIndex = middleIndex;
400
+ }
401
+ }
402
+
403
+ ///////////////////////////////////////////////////////////
404
+ // Node Mask
405
+ ///////////////////////////////////////////////////////////
406
+
407
+ typedef struct nodeMask_st NodeMask;
408
+
409
+ struct nodeMask_st {
410
+ IDnum nodeID;
411
+ IDnum start;
412
+ IDnum finish;
413
+ } ATTRIBUTE_PACKED;
414
+
415
+ static int compareNodeMasks(const void * ptrA, const void * ptrB) {
416
+ NodeMask * A = (NodeMask *) ptrA;
417
+ NodeMask * B = (NodeMask *) ptrB;
418
+
419
+ if (A->nodeID < B->nodeID)
420
+ return -1;
421
+ else if (A->nodeID > B->nodeID)
422
+ return 1;
423
+ else {
424
+ if (A->start < B->start)
425
+ return -1;
426
+ else if (A->start > B->start)
427
+ return 1;
428
+ else
429
+ return 0;
430
+ }
431
+ }
432
+
433
+ static NodeMask * computeNodeMasks(ReferenceMapping * referenceMappings, Coordinate arrayLength, Graph * graph) {
434
+ NodeMask * nodeMasks;
435
+ NodeMask * currentMask;
436
+ ReferenceMapping * currentMapping = referenceMappings;
437
+ Coordinate index;
438
+
439
+ if (referenceMappings == NULL)
440
+ return NULL;
441
+
442
+ nodeMasks = callocOrExit(arrayLength, NodeMask);
443
+ currentMask = nodeMasks;
444
+
445
+ for (index = 0; index < arrayLength; index++) {
446
+ if (currentMapping->nodeID > 0) {
447
+ currentMask->nodeID = currentMapping->nodeID;
448
+ } else {
449
+ currentMask->nodeID = -currentMapping->nodeID;
450
+ }
451
+ currentMask->start = currentMapping->nodeStart;
452
+ currentMask->finish = currentMapping->nodeStart + currentMapping->length;
453
+ currentMask++;
454
+ currentMapping++;
455
+ }
456
+
457
+ qsort(nodeMasks, arrayLength, sizeof(NodeMask), compareNodeMasks);
458
+
459
+ return nodeMasks;
460
+ }
461
+
462
+ ///////////////////////////////////////////////////////////
463
+ // Process
464
+ ///////////////////////////////////////////////////////////
465
+
466
+ static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
467
+ short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount)
468
+ {
469
+ FILE *file = fopen(preGraphFilename, "r");
470
+ const int maxline = MAXLINE;
471
+ char line[MAXLINE];
472
+ char c;
473
+ int wordLength;
474
+ Coordinate lineLength, kmerCount;
475
+ Kmer word;
476
+ Kmer antiWord;
477
+ KmerOccurenceTable *kmerTable;
478
+ IDnum index;
479
+ IDnum nodeID = 0;
480
+ Nucleotide nucleotide;
481
+ NodeMask * nodeMask = nodeMasks;
482
+ Coordinate nodeMaskIndex = 0;
483
+
484
+ if (file == NULL)
485
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);
486
+
487
+ // Count kmers
488
+ velvetLog("Scanning pre-graph file %s for k-mers\n",
489
+ preGraphFilename);
490
+
491
+ // First line
492
+ if (!fgets(line, maxline, file))
493
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
494
+ sscanf(line, "%*i\t%*i\t%i\n", &wordLength);
495
+
496
+ kmerTable = newKmerOccurenceTable(accelerationBits, wordLength);
497
+
498
+ // Read nodes
499
+ if (!fgets(line, maxline, file))
500
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
501
+ kmerCount = 0;
502
+ while (line[0] == 'N') {
503
+ lineLength = 0;
504
+ while ((c = getc(file)) != EOF && c != '\n')
505
+ lineLength++;
506
+ kmerCount += lineLength - wordLength + 1;
507
+ if (fgets(line, maxline, file) == NULL)
508
+ break;
509
+ }
510
+
511
+ velvetLog("%li kmers found\n", (long) kmerCount);
512
+
513
+ for(nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) {
514
+ kmerCount -= nodeMasks[nodeMaskIndex].finish -
515
+ nodeMasks[nodeMaskIndex].start;
516
+ }
517
+
518
+ nodeMaskIndex = 0;
519
+
520
+ fclose(file);
521
+
522
+ // Create table
523
+ allocateKmerOccurences(kmerCount, kmerTable);
524
+
525
+ // Fill table
526
+ file = fopen(preGraphFilename, "r");
527
+ if (file == NULL)
528
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);
529
+
530
+ if (!fgets(line, maxline, file))
531
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
532
+
533
+ // Read nodes
534
+ if (!fgets(line, maxline, file))
535
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
536
+ while (line[0] == 'N') {
537
+ nodeID++;
538
+
539
+ // Fill in the initial word :
540
+ clearKmer(&word);
541
+ clearKmer(&antiWord);
542
+
543
+ for (index = 0; index < wordLength - 1; index++) {
544
+ c = getc(file);
545
+ if (c == 'A')
546
+ nucleotide = ADENINE;
547
+ else if (c == 'C')
548
+ nucleotide = CYTOSINE;
549
+ else if (c == 'G')
550
+ nucleotide = GUANINE;
551
+ else if (c == 'T')
552
+ nucleotide = THYMINE;
553
+ else if (c == '\n')
554
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
555
+ else
556
+ nucleotide = ADENINE;
557
+
558
+
559
+ pushNucleotide(&word, nucleotide);
560
+ if (double_strand) {
561
+ #ifdef COLOR
562
+ reversePushNucleotide(&antiWord, nucleotide);
563
+ #else
564
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
565
+ #endif
566
+ }
567
+ }
568
+
569
+ // Scan through node
570
+ index = 0;
571
+ while((c = getc(file)) != '\n' && c != EOF) {
572
+ if (c == 'A')
573
+ nucleotide = ADENINE;
574
+ else if (c == 'C')
575
+ nucleotide = CYTOSINE;
576
+ else if (c == 'G')
577
+ nucleotide = GUANINE;
578
+ else if (c == 'T')
579
+ nucleotide = THYMINE;
580
+ else
581
+ nucleotide = ADENINE;
582
+
583
+ pushNucleotide(&word, nucleotide);
584
+ if (double_strand) {
585
+ #ifdef COLOR
586
+ reversePushNucleotide(&antiWord, nucleotide);
587
+ #else
588
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
589
+ #endif
590
+ }
591
+
592
+ // Update mask if necessary
593
+ if (nodeMask) {
594
+ if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) {
595
+ if (++nodeMaskIndex == nodeMaskCount)
596
+ nodeMask = NULL;
597
+ else
598
+ nodeMask++;
599
+ }
600
+ }
601
+
602
+ // Check if not masked!
603
+ if (nodeMask) {
604
+ if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) {
605
+ index++;
606
+ continue;
607
+ }
608
+ }
609
+
610
+ if (!double_strand || compareKmers(&word, &antiWord) <= 0)
611
+ recordKmerOccurence(&word, nodeID, index, kmerTable);
612
+ else
613
+ recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable);
614
+
615
+ index++;
616
+ }
617
+
618
+ if (fgets(line, maxline, file) == NULL)
619
+ break;
620
+ }
621
+
622
+ fclose(file);
623
+
624
+ // Sort table
625
+ sortKmerOccurenceTable(kmerTable);
626
+
627
+ return kmerTable;
628
+ }
629
+
630
+ static void ghostThreadSequenceThroughGraph(TightString * tString,
631
+ KmerOccurenceTable *
632
+ kmerTable, Graph * graph,
633
+ IDnum seqID, Category category,
634
+ boolean readTracking,
635
+ boolean double_strand,
636
+ ReferenceMapping * referenceMappings,
637
+ Coordinate referenceMappingCount,
638
+ IDnum refCount,
639
+ Annotation * annotations,
640
+ IDnum annotationCount,
641
+ boolean second_in_pair)
642
+ {
643
+ Kmer word;
644
+ Kmer antiWord;
645
+ Coordinate readNucleotideIndex;
646
+ KmerOccurence *kmerOccurence;
647
+ int wordLength = getWordLength(graph);
648
+ Nucleotide nucleotide;
649
+ IDnum refID;
650
+ Coordinate refCoord;
651
+ ReferenceMapping * refMap = NULL;
652
+ Coordinate uniqueIndex = 0;
653
+ Coordinate annotIndex = 0;
654
+ IDnum annotCount = 0;
655
+ boolean reversed;
656
+ SmallNodeList * nodePile = NULL;
657
+ Annotation * annotation = annotations;
658
+
659
+ Node *node = NULL;
660
+ Node *previousNode = NULL;
661
+
662
+ // Neglect any read which will not be short paired
663
+ if ((!readTracking && category % 2 == 0)
664
+ || category / 2 >= CATEGORIES)
665
+ return;
666
+
667
+ // Neglect any string shorter than WORDLENGTH :
668
+ if (getLength(tString) < wordLength)
669
+ return;
670
+
671
+ // Verify that all short reads are reasonnably short
672
+ if (getLength(tString) > USHRT_MAX) {
673
+ velvetLog("Short read of length %lli, longer than limit %i\n",
674
+ (long long) getLength(tString), SHRT_MAX);
675
+ velvetLog("You should better declare this sequence as long, because it genuinely is!\n");
676
+ exit(1);
677
+ }
678
+
679
+ clearKmer(&word);
680
+ clearKmer(&antiWord);
681
+
682
+ // Fill in the initial word :
683
+ for (readNucleotideIndex = 0;
684
+ readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
685
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
686
+ pushNucleotide(&word, nucleotide);
687
+ if (double_strand || second_in_pair) {
688
+ #ifdef COLOR
689
+ reversePushNucleotide(&antiWord, nucleotide);
690
+ #else
691
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
692
+ #endif
693
+ }
694
+ }
695
+
696
+ // Go through sequence
697
+ while (readNucleotideIndex < getLength(tString)) {
698
+ // Shift word:
699
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
700
+ pushNucleotide(&word, nucleotide);
701
+ if (double_strand || second_in_pair) {
702
+ #ifdef COLOR
703
+ reversePushNucleotide(&antiWord, nucleotide);
704
+ #else
705
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
706
+ #endif
707
+ }
708
+
709
+ // Update annotation if necessary
710
+ if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
711
+ annotation = getNextAnnotation(annotation);
712
+ annotCount++;
713
+ annotIndex = 0;
714
+ }
715
+
716
+ // Search for reference mapping
717
+ if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
718
+ refID = getAnnotSequenceID(annotation);
719
+ if (refID > 0)
720
+ refCoord = getStart(annotation) + annotIndex;
721
+ else
722
+ refCoord = getStart(annotation) - annotIndex;
723
+
724
+ refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
725
+ // If success
726
+ if (refMap) {
727
+ if (refID > 0)
728
+ node = getNodeInGraph(graph, refMap->nodeID);
729
+ else
730
+ node = getNodeInGraph(graph, -refMap->nodeID);
731
+ } else {
732
+ node = NULL;
733
+ if (previousNode)
734
+ break;
735
+ }
736
+ }
737
+ // if not.. look in table
738
+ else {
739
+ reversed = false;
740
+ if (double_strand) {
741
+ if (compareKmers(&word, &antiWord) <= 0) {
742
+ kmerOccurence =
743
+ findKmerInKmerOccurenceTable(&word,
744
+ kmerTable);
745
+ } else {
746
+ kmerOccurence =
747
+ findKmerInKmerOccurenceTable(&antiWord,
748
+ kmerTable);
749
+ reversed = true;
750
+ }
751
+ } else {
752
+ if (!second_in_pair) {
753
+ kmerOccurence =
754
+ findKmerInKmerOccurenceTable(&word,
755
+ kmerTable);
756
+ } else {
757
+ kmerOccurence =
758
+ findKmerInKmerOccurenceTable(&antiWord,
759
+ kmerTable);
760
+ reversed = true;
761
+ }
762
+ }
763
+
764
+ if (kmerOccurence) {
765
+ if (!reversed)
766
+ node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
767
+ else
768
+ node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
769
+ } else {
770
+ node = NULL;
771
+ if (previousNode)
772
+ break;
773
+ }
774
+
775
+ }
776
+
777
+ if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
778
+ annotIndex++;
779
+ else
780
+ uniqueIndex++;
781
+
782
+ previousNode = node;
783
+
784
+ // Fill in graph
785
+ if (node && !isNodeMemorized(node, nodePile))
786
+ {
787
+ #ifdef _OPENMP
788
+ lockNode(node);
789
+ #endif
790
+ incrementReadStartCount(node, graph);
791
+ #ifdef _OPENMP
792
+ unLockNode(node);
793
+ #endif
794
+ memorizeNode(node, &nodePile);
795
+ }
796
+ }
797
+
798
+ unMemorizeNodes(&nodePile);
799
+ }
800
+
801
+ static void threadSequenceThroughGraph(TightString * tString,
802
+ KmerOccurenceTable * kmerTable,
803
+ Graph * graph,
804
+ IDnum seqID, Category category,
805
+ boolean readTracking,
806
+ boolean double_strand,
807
+ ReferenceMapping * referenceMappings,
808
+ Coordinate referenceMappingCount,
809
+ IDnum refCount,
810
+ Annotation * annotations,
811
+ IDnum annotationCount,
812
+ boolean second_in_pair)
813
+ {
814
+ Kmer word;
815
+ Kmer antiWord;
816
+ Coordinate readNucleotideIndex;
817
+ Coordinate kmerIndex;
818
+ KmerOccurence *kmerOccurence;
819
+ int wordLength = getWordLength(graph);
820
+
821
+ PassageMarkerI marker = NULL_IDX;
822
+ PassageMarkerI previousMarker = NULL_IDX;
823
+ Node *node = NULL;
824
+ Node *previousNode = NULL;
825
+ Coordinate coord = 0;
826
+ Coordinate previousCoord = 0;
827
+ Nucleotide nucleotide;
828
+ boolean reversed;
829
+
830
+ IDnum refID;
831
+ Coordinate refCoord = 0;
832
+ ReferenceMapping * refMap;
833
+ Annotation * annotation = annotations;
834
+ Coordinate index = 0;
835
+ Coordinate uniqueIndex = 0;
836
+ Coordinate annotIndex = 0;
837
+ IDnum annotCount = 0;
838
+ SmallNodeList * nodePile = NULL;
839
+
840
+ // Neglect any string shorter than WORDLENGTH :
841
+ if (getLength(tString) < wordLength)
842
+ return;
843
+
844
+ clearKmer(&word);
845
+ clearKmer(&antiWord);
846
+
847
+ // Fill in the initial word :
848
+ for (readNucleotideIndex = 0;
849
+ readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
850
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
851
+ pushNucleotide(&word, nucleotide);
852
+ if (double_strand || second_in_pair) {
853
+ #ifdef COLOR
854
+ reversePushNucleotide(&antiWord, nucleotide);
855
+ #else
856
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
857
+ #endif
858
+ }
859
+ }
860
+
861
+ // Go through sequence
862
+ // printf("len %d\n", getLength(tString));
863
+ while (readNucleotideIndex < getLength(tString)) {
864
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
865
+ pushNucleotide(&word, nucleotide);
866
+ if (double_strand || second_in_pair) {
867
+ #ifdef COLOR
868
+ reversePushNucleotide(&antiWord, nucleotide);
869
+ #else
870
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
871
+ #endif
872
+ }
873
+
874
+ // Update annotation if necessary
875
+ if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
876
+ annotation = getNextAnnotation(annotation);
877
+ annotCount++;
878
+ annotIndex = 0;
879
+ }
880
+
881
+ // Search for reference mapping
882
+ if (category == REFERENCE) {
883
+ if (referenceMappings)
884
+ refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
885
+ else
886
+ refMap = NULL;
887
+
888
+ if (refMap) {
889
+ node = getNodeInGraph(graph, refMap->nodeID);
890
+ if (refMap->nodeID > 0) {
891
+ coord = refMap->nodeStart + (index - refMap->referenceStart);
892
+ } else {
893
+ coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
894
+ }
895
+ } else {
896
+ node = NULL;
897
+ }
898
+ }
899
+ // Search for reference-based mapping
900
+ else if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
901
+ refID = getAnnotSequenceID(annotation);
902
+ if (refID > 0)
903
+ refCoord = getStart(annotation) + annotIndex;
904
+ else
905
+ refCoord = getStart(annotation) - annotIndex;
906
+
907
+ refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
908
+ // If success
909
+ if (refMap) {
910
+ if (refID > 0) {
911
+ node = getNodeInGraph(graph, refMap->nodeID);
912
+ if (refMap->nodeID > 0) {
913
+ coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
914
+ } else {
915
+ coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
916
+ }
917
+ } else {
918
+ node = getNodeInGraph(graph, -refMap->nodeID);
919
+ if (refMap->nodeID > 0) {
920
+ coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
921
+ } else {
922
+ coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
923
+ }
924
+ }
925
+ } else {
926
+ node = NULL;
927
+ if (previousNode)
928
+ break;
929
+ }
930
+ }
931
+ // Search in table
932
+ else {
933
+ reversed = false;
934
+ if (double_strand) {
935
+ if (compareKmers(&word, &antiWord) <= 0) {
936
+ kmerOccurence =
937
+ findKmerInKmerOccurenceTable(&word,
938
+ kmerTable);
939
+ } else {
940
+ kmerOccurence =
941
+ findKmerInKmerOccurenceTable(&antiWord,
942
+ kmerTable);
943
+ reversed = true;
944
+ }
945
+ } else {
946
+ if (!second_in_pair) {
947
+ kmerOccurence =
948
+ findKmerInKmerOccurenceTable(&word,
949
+ kmerTable);
950
+ } else {
951
+ kmerOccurence =
952
+ findKmerInKmerOccurenceTable(&antiWord,
953
+ kmerTable);
954
+ reversed = true;
955
+ }
956
+ }
957
+
958
+ if (kmerOccurence) {
959
+ if (!reversed) {
960
+ node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
961
+ coord = getKmerOccurencePosition(kmerOccurence);
962
+ } else {
963
+ node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
964
+ coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
965
+ }
966
+ } else {
967
+ node = NULL;
968
+ if (previousNode)
969
+ break;
970
+ }
971
+ }
972
+
973
+ // Increment positions
974
+ if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
975
+ annotIndex++;
976
+ else
977
+ uniqueIndex++;
978
+
979
+ // Fill in graph
980
+ if (node)
981
+ {
982
+ #ifdef _OPENMP
983
+ lockNode(node);
984
+ #endif
985
+ kmerIndex = readNucleotideIndex - wordLength;
986
+
987
+ if (previousNode == node
988
+ && previousCoord == coord - 1) {
989
+ if (category / 2 >= CATEGORIES) {
990
+ setPassageMarkerFinish(marker,
991
+ kmerIndex +
992
+ 1);
993
+ setFinishOffset(marker,
994
+ getNodeLength(node)
995
+ - coord - 1);
996
+ } else {
997
+ #ifndef SINGLE_COV_CAT
998
+ incrementVirtualCoverage(node, category / 2, 1);
999
+ incrementOriginalVirtualCoverage(node, category / 2, 1);
1000
+ #else
1001
+ incrementVirtualCoverage(node, 1);
1002
+ #endif
1003
+ }
1004
+ #ifdef _OPENMP
1005
+ unLockNode(node);
1006
+ #endif
1007
+ } else {
1008
+ if (category / 2 >= CATEGORIES) {
1009
+ marker =
1010
+ newPassageMarker(seqID,
1011
+ kmerIndex,
1012
+ kmerIndex + 1,
1013
+ coord,
1014
+ getNodeLength
1015
+ (node) -
1016
+ coord - 1);
1017
+ transposePassageMarker(marker,
1018
+ node);
1019
+ connectPassageMarkers
1020
+ (previousMarker, marker,
1021
+ graph);
1022
+ previousMarker = marker;
1023
+ } else {
1024
+ if (readTracking) {
1025
+ if (!isNodeMemorized(node, nodePile)) {
1026
+ addReadStart(node,
1027
+ seqID,
1028
+ coord,
1029
+ graph,
1030
+ kmerIndex);
1031
+ memorizeNode(node, &nodePile);
1032
+ } else {
1033
+ blurLastShortReadMarker
1034
+ (node, graph);
1035
+ }
1036
+ }
1037
+
1038
+ #ifndef SINGLE_COV_CAT
1039
+ incrementVirtualCoverage(node, category / 2, 1);
1040
+ incrementOriginalVirtualCoverage(node, category / 2, 1);
1041
+ #else
1042
+ incrementVirtualCoverage(node, 1);
1043
+ #endif
1044
+ }
1045
+ #ifdef _OPENMP
1046
+ lockTwoNodes(node, previousNode);
1047
+ #endif
1048
+ if (category != REFERENCE)
1049
+ createArc(previousNode, node, graph);
1050
+ #ifdef _OPENMP
1051
+ unLockTwoNodes(node, previousNode);
1052
+ #endif
1053
+ }
1054
+
1055
+ previousNode = node;
1056
+ previousCoord = coord;
1057
+ }
1058
+ index++;
1059
+ }
1060
+ // printKmer(&word);
1061
+
1062
+ if (readTracking && category / 2 < CATEGORIES)
1063
+ unMemorizeNodes(&nodePile);
1064
+ }
1065
+
1066
+ static void fillUpGraph(ReadSet * reads,
1067
+ KmerOccurenceTable * kmerTable,
1068
+ Graph * graph,
1069
+ boolean readTracking,
1070
+ boolean double_strand,
1071
+ ReferenceMapping * referenceMappings,
1072
+ Coordinate referenceMappingCount,
1073
+ IDnum refCount,
1074
+ char * roadmapFilename)
1075
+ {
1076
+ IDnum readIndex;
1077
+ RoadMapArray *roadmap = NULL;
1078
+ Coordinate *annotationOffset = NULL;
1079
+ struct timeval start, end, diff;
1080
+
1081
+ if (referenceMappings)
1082
+ {
1083
+ roadmap = importRoadMapArray(roadmapFilename);
1084
+ annotationOffset = callocOrExit(reads->readCount, Coordinate);
1085
+ for (readIndex = 1; readIndex < reads->readCount; readIndex++)
1086
+ annotationOffset[readIndex] = annotationOffset[readIndex - 1]
1087
+ + getAnnotationCount(getRoadMapInArray(roadmap, readIndex - 1));
1088
+ }
1089
+
1090
+ resetNodeStatus(graph);
1091
+ // Allocate memory for the read pairs
1092
+ if (!readStartsAreActivated(graph))
1093
+ activateReadStarts(graph);
1094
+
1095
+ gettimeofday(&start, NULL);
1096
+ #ifdef _OPENMP
1097
+ initSmallNodeListMemory();
1098
+ createNodeLocks(graph);
1099
+ #pragma omp parallel for
1100
+ #endif
1101
+ for (readIndex = refCount; readIndex < reads->readCount; readIndex++)
1102
+ {
1103
+ Annotation * annotations = NULL;
1104
+ IDnum annotationCount = 0;
1105
+ Category category;
1106
+ boolean second_in_pair;
1107
+
1108
+ if (readIndex % 1000000 == 0)
1109
+ velvetLog("Ghost Threading through reads %ld / %ld\n",
1110
+ (long) readIndex, (long) reads->readCount);
1111
+
1112
+ category = reads->categories[readIndex];
1113
+ second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex);
1114
+
1115
+ if (referenceMappings)
1116
+ {
1117
+ annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex));
1118
+ annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]);
1119
+ }
1120
+
1121
+ ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1122
+ kmerTable,
1123
+ graph, readIndex + 1,
1124
+ category,
1125
+ readTracking, double_strand,
1126
+ referenceMappings, referenceMappingCount,
1127
+ refCount, annotations, annotationCount,
1128
+ second_in_pair);
1129
+ }
1130
+ createNodeReadStartArrays(graph);
1131
+ gettimeofday(&end, NULL);
1132
+ timersub(&end, &start, &diff);
1133
+ velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1134
+
1135
+ gettimeofday(&start, NULL);
1136
+ #ifdef _OPENMP
1137
+ int threads = omp_get_max_threads();
1138
+ if (threads > 32)
1139
+ threads = 32;
1140
+
1141
+ #pragma omp parallel for num_threads(threads)
1142
+ #endif
1143
+ for (readIndex = 0; readIndex < reads->readCount; readIndex++)
1144
+ {
1145
+ Annotation * annotations = NULL;
1146
+ IDnum annotationCount = 0;
1147
+ Category category;
1148
+ boolean second_in_pair;
1149
+
1150
+ if (readIndex % 1000000 == 0)
1151
+ velvetLog("Threading through reads %li / %li\n",
1152
+ (long) readIndex, (long) reads->readCount);
1153
+
1154
+ category = reads->categories[readIndex];
1155
+ second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex);
1156
+
1157
+ if (referenceMappings)
1158
+ {
1159
+ annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex));
1160
+ annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]);
1161
+ }
1162
+
1163
+ threadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1164
+ kmerTable,
1165
+ graph, readIndex + 1, category,
1166
+ readTracking, double_strand,
1167
+ referenceMappings, referenceMappingCount,
1168
+ refCount, annotations, annotationCount, second_in_pair);
1169
+ }
1170
+ gettimeofday(&end, NULL);
1171
+ timersub(&end, &start, &diff);
1172
+ velvetLog(" === Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1173
+
1174
+ #ifdef _OPENMP
1175
+ free(nodeLocks);
1176
+ nodeLocks = NULL;
1177
+ #endif
1178
+
1179
+ if (referenceMappings)
1180
+ {
1181
+ destroyRoadMapArray(roadmap);
1182
+ free (annotationOffset);
1183
+ }
1184
+
1185
+ orderNodeReadStartArrays(graph);
1186
+
1187
+ destroySmallNodeListMemmory();
1188
+
1189
+ destroyKmerOccurenceTable(kmerTable);
1190
+ }
1191
+
1192
+ Graph *importPreGraph(char *preGraphFilename, ReadSet * reads, char * roadmapFilename,
1193
+ boolean readTracking, short int accelerationBits)
1194
+ {
1195
+ boolean double_strand = false;
1196
+ Graph *graph = readPreGraphFile(preGraphFilename, &double_strand);
1197
+ Coordinate referenceMappingCount = 0;
1198
+ IDnum referenceCount = 0;
1199
+
1200
+ if (nodeCount(graph) == 0)
1201
+ return graph;
1202
+
1203
+ // If necessary compile reference -> node
1204
+ ReferenceMapping * referenceMappings = computeReferenceMappings(preGraphFilename, reads, &referenceMappingCount, &referenceCount);
1205
+ // Node -> reference maps
1206
+ NodeMask * nodeMasks = computeNodeMasks(referenceMappings, referenceMappingCount, graph);
1207
+
1208
+ // Map k-mers to nodes
1209
+ KmerOccurenceTable *kmerTable =
1210
+ referenceGraphKmers(preGraphFilename, accelerationBits, graph, double_strand, nodeMasks, referenceMappingCount);
1211
+
1212
+ free(nodeMasks);
1213
+
1214
+ // Map sequences -> kmers -> nodes
1215
+ fillUpGraph(reads, kmerTable, graph, readTracking, double_strand, referenceMappings, referenceMappingCount, referenceCount, roadmapFilename);
1216
+
1217
+ free(referenceMappings);
1218
+
1219
+ return graph;
1220
+ }
1221
+
1222
+ static void addReadsToGraph(TightString * tString,
1223
+ KmerOccurenceTable * kmerTable,
1224
+ Graph * graph,
1225
+ IDnum seqID, Category category,
1226
+ boolean readTracking,
1227
+ boolean double_strand,
1228
+ boolean second_in_pair)
1229
+ {
1230
+ Kmer word;
1231
+ Kmer antiWord;
1232
+ Coordinate readNucleotideIndex;
1233
+ Coordinate kmerIndex;
1234
+ KmerOccurence *kmerOccurence;
1235
+ int wordLength = getWordLength(graph);
1236
+
1237
+ Node *node = NULL;
1238
+ Node *previousNode = NULL;
1239
+ Coordinate coord = 0;
1240
+ Coordinate previousCoord = 0;
1241
+ Nucleotide nucleotide;
1242
+ boolean reversed;
1243
+
1244
+ Coordinate index = 0;
1245
+ SmallNodeList * nodePile = NULL;
1246
+
1247
+ // Neglect any read which will not be short paired
1248
+ if (category / 2 >= CATEGORIES)
1249
+ return;
1250
+
1251
+ // Neglect any string shorter than WORDLENGTH :
1252
+ if (getLength(tString) < wordLength)
1253
+ return;
1254
+
1255
+ clearKmer(&word);
1256
+ clearKmer(&antiWord);
1257
+
1258
+ // Fill in the initial word :
1259
+ for (readNucleotideIndex = 0;
1260
+ readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
1261
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
1262
+ pushNucleotide(&word, nucleotide);
1263
+ if (double_strand || second_in_pair) {
1264
+ #ifdef COLOR
1265
+ reversePushNucleotide(&antiWord, nucleotide);
1266
+ #else
1267
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
1268
+ #endif
1269
+ }
1270
+ }
1271
+
1272
+ // Go through sequence
1273
+ // printf("len %d\n", getLength(tString));
1274
+ while (readNucleotideIndex < getLength(tString)) {
1275
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
1276
+ pushNucleotide(&word, nucleotide);
1277
+ if (double_strand || second_in_pair) {
1278
+ #ifdef COLOR
1279
+ reversePushNucleotide(&antiWord, nucleotide);
1280
+ #else
1281
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
1282
+ #endif
1283
+ }
1284
+
1285
+ // Search in table
1286
+ reversed = false;
1287
+ if (double_strand) {
1288
+ if (compareKmers(&word, &antiWord) <= 0) {
1289
+ kmerOccurence =
1290
+ findKmerInKmerOccurenceTable(&word,
1291
+ kmerTable);
1292
+ } else {
1293
+ kmerOccurence =
1294
+ findKmerInKmerOccurenceTable(&antiWord,
1295
+ kmerTable);
1296
+ reversed = true;
1297
+ }
1298
+ } else {
1299
+ if (!second_in_pair) {
1300
+ kmerOccurence =
1301
+ findKmerInKmerOccurenceTable(&word,
1302
+ kmerTable);
1303
+ } else {
1304
+ kmerOccurence =
1305
+ findKmerInKmerOccurenceTable(&antiWord,
1306
+ kmerTable);
1307
+ reversed = true;
1308
+ }
1309
+ }
1310
+
1311
+ if (kmerOccurence) {
1312
+ if (!reversed) {
1313
+ node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
1314
+ coord = getKmerOccurencePosition(kmerOccurence);
1315
+ } else {
1316
+ node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
1317
+ coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
1318
+ }
1319
+ } else {
1320
+ node = NULL;
1321
+ if (previousNode)
1322
+ break;
1323
+ }
1324
+
1325
+ // Fill in graph
1326
+ if (node)
1327
+ {
1328
+ #ifdef _OPENMP
1329
+ lockNode(node);
1330
+ #endif
1331
+ kmerIndex = readNucleotideIndex - wordLength;
1332
+
1333
+ if (previousNode != node || previousCoord != coord -1) {
1334
+ if (!isNodeMemorized(node, nodePile)) {
1335
+ addReadStart(node,
1336
+ seqID,
1337
+ coord,
1338
+ graph,
1339
+ kmerIndex);
1340
+ memorizeNode(node, &nodePile);
1341
+ } else {
1342
+ blurLastShortReadMarker
1343
+ (node, graph);
1344
+ }
1345
+ }
1346
+ #ifdef _OPENMP
1347
+ unLockNode(node);
1348
+ #endif
1349
+ previousNode = node;
1350
+ previousCoord = coord;
1351
+ }
1352
+ index++;
1353
+ }
1354
+ // printKmer(&word);
1355
+
1356
+ if (category / 2 < CATEGORIES)
1357
+ unMemorizeNodes(&nodePile);
1358
+ }
1359
+
1360
+ static void fillUpConnectedGraph(ReadSet * reads,
1361
+ KmerOccurenceTable * kmerTable,
1362
+ Graph * graph,
1363
+ boolean readTracking,
1364
+ boolean double_strand)
1365
+ {
1366
+ IDnum refCount = 0; // refs not present in connected graphs
1367
+ IDnum readIndex;
1368
+ struct timeval start, end, diff;
1369
+
1370
+ resetNodeStatus(graph);
1371
+ // Allocate memory for the read pairs
1372
+ if (!readStartsAreActivated(graph))
1373
+ activateReadStarts(graph);
1374
+
1375
+ gettimeofday(&start, NULL);
1376
+ #ifdef _OPENMP
1377
+ initSmallNodeListMemory();
1378
+ createNodeLocks(graph);
1379
+ #pragma omp parallel for
1380
+ #endif
1381
+ for (readIndex = refCount; readIndex < reads->readCount; readIndex++)
1382
+ {
1383
+ Category category;
1384
+ boolean second_in_pair;
1385
+
1386
+ if (readIndex % 1000000 == 0)
1387
+ velvetLog("Ghost Threading through reads %ld / %ld\n",
1388
+ (long) readIndex, (long) reads->readCount);
1389
+
1390
+ category = reads->categories[readIndex];
1391
+ second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex);
1392
+
1393
+ // referenceMappings = NULL, referenceMappingCount = 0
1394
+ // refCount = 0, annotations = NULL, annotationCount = 0
1395
+ ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1396
+ kmerTable,
1397
+ graph, readIndex + 1,
1398
+ category,
1399
+ readTracking, double_strand,
1400
+ NULL, 0,
1401
+ 0, NULL, 0,
1402
+ second_in_pair);
1403
+ }
1404
+ createNodeReadStartArrays(graph);
1405
+ gettimeofday(&end, NULL);
1406
+ timersub(&end, &start, &diff);
1407
+ velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec);
1408
+
1409
+ gettimeofday(&start, NULL);
1410
+ #ifdef _OPENMP
1411
+ int threads = omp_get_max_threads();
1412
+ if (threads > 32)
1413
+ threads = 32;
1414
+
1415
+ #pragma omp parallel for num_threads(threads)
1416
+ #endif
1417
+ for (readIndex = 0; readIndex < reads->readCount; readIndex++)
1418
+ {
1419
+ Category category;
1420
+ boolean second_in_pair;
1421
+
1422
+ if (readIndex % 1000000 == 0)
1423
+ velvetLog("Adding reads %li / %li\n",
1424
+ (long) readIndex, (long) reads->readCount);
1425
+
1426
+ category = reads->categories[readIndex];
1427
+ second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex);
1428
+
1429
+ addReadsToGraph(getTightStringInArray(reads->tSequences, readIndex),
1430
+ kmerTable,
1431
+ graph, readIndex + 1, category,
1432
+ readTracking, double_strand, second_in_pair);
1433
+ }
1434
+ gettimeofday(&end, NULL);
1435
+ timersub(&end, &start, &diff);
1436
+ velvetLog(" === Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec);
1437
+
1438
+ #ifdef _OPENMP
1439
+ free(nodeLocks);
1440
+ nodeLocks = NULL;
1441
+ #endif
1442
+
1443
+ orderNodeReadStartArrays(graph);
1444
+
1445
+ destroySmallNodeListMemmory();
1446
+
1447
+ destroyKmerOccurenceTable(kmerTable);
1448
+ }
1449
+
1450
+ Graph *importConnectedGraph(char *connectedGraphFilename, ReadSet * reads, char * roadmapFilename,
1451
+ boolean readTracking, short int accelerationBits)
1452
+ {
1453
+ boolean double_strand = false;
1454
+ Graph *graph = readConnectedGraphFile(connectedGraphFilename, &double_strand);
1455
+
1456
+ if (nodeCount(graph) == 0)
1457
+ return graph;
1458
+
1459
+ if (readTracking) {
1460
+ Coordinate referenceMappingCount = 0;
1461
+ NodeMask * nodeMasks = NULL;
1462
+
1463
+ // Map k-mers to nodes
1464
+ KmerOccurenceTable *kmerTable =
1465
+ referenceGraphKmers(connectedGraphFilename, accelerationBits, graph, doubleStrandedGraph(graph), nodeMasks, referenceMappingCount);
1466
+
1467
+ // Map sequences -> kmers -> nodes
1468
+ fillUpConnectedGraph(reads, kmerTable, graph, readTracking, double_strand);
1469
+ }
1470
+
1471
+ return graph;
1472
+ }