finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,233 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _GRAPH_H_
22
+ #define _GRAPH_H_
23
+
24
+ ////////////////////////////////////////////////////////////
25
+ // Node functions
26
+ ////////////////////////////////////////////////////////////
27
+
28
+ //Creators/destructor
29
+ Node *newNode(IDnum sequenceID, Coordinate start, Coordinate finish,
30
+ Coordinate offset, IDnum ID, TightString * sequences,
31
+ int WORDLENGTH);
32
+ Node *emptyNode();
33
+ void destroyNode(Node * node, Graph * graph);
34
+
35
+ // Locator
36
+ IDnum getNodeID(Node * node);
37
+ Node *getNodeInGraph(Graph * graph, IDnum nodeID);
38
+ Node *getTwinNode(Node * node);
39
+
40
+ // Arc info
41
+ int arcCount(Node * node);
42
+ int simpleArcCount(Node * node);
43
+ Arc *getArc(Node * node);
44
+ boolean hasSingleArc(Node * node);
45
+
46
+ // Descriptor
47
+ Coordinate getNodeLength(Node * node);
48
+ void appendDescriptors(Node * target, Node * source);
49
+ void directlyAppendDescriptors(Node * target, Node * sourcei, Coordinate totalLength);
50
+ void appendSequence(Node * node, TightString * reads,
51
+ PassageMarkerI guide, Graph * graph);
52
+ void splitNodeDescriptor(Node * source, Node * target, Coordinate offset);
53
+ void reduceNode(Node * node);
54
+ void reallocateNodeDescriptor(Node * node, Coordinate length);
55
+ Nucleotide getNucleotideInNode(Node * node, Coordinate index);
56
+
57
+ // Passage markers
58
+ void insertPassageMarker(PassageMarkerI marker, Node * destination);
59
+ PassageMarkerI getMarker(Node * node);
60
+ void setMarker(Node * node, PassageMarkerI marker);
61
+ IDnum markerCount(Node * node);
62
+
63
+ // Short read marker creation
64
+ void incrementReadStartCount(Node * node, Graph * graph);
65
+ void addReadStart(Node * node, IDnum seqID, Coordinate position,
66
+ Graph * graph, Coordinate offset);
67
+ void blurLastShortReadMarker(Node * node, Graph * graph);
68
+
69
+ // Short read marker handling
70
+ ShortReadMarker *getNodeReads(Node * node, Graph * graph);
71
+ IDnum getNodeReadCount(Node * node, Graph * graph);
72
+ ShortReadMarker *commonNodeReads(Node * nodeA, Node * nodeB, Graph * graph,
73
+ IDnum * length);
74
+ ShortReadMarker *extractBackOfNodeReads(Node * node, Coordinate breakpoint,
75
+ Graph * graph, IDnum * length,
76
+ PassageMarkerI sourceMarker,
77
+ ShortLength * sequenceLengths);
78
+ ShortReadMarker *extractFrontOfNodeReads(Node * node,
79
+ Coordinate breakpoint,
80
+ Graph * graph, IDnum * length,
81
+ PassageMarkerI sourceMarker,
82
+ ShortLength * sequenceLengths);
83
+
84
+ // Short read marker moving around
85
+ void foldSymmetricalNodeReads(Node * node, Graph * graph);
86
+ void spreadReadIDs(ShortReadMarker * reads, IDnum readCount, Node * node,
87
+ Graph * graph);
88
+ void injectShortReads(ShortReadMarker * sourceArray, IDnum sourceLength,
89
+ Node * target, Graph * graph);
90
+ void mergeNodeReads(Node * target, Node * source, Graph * graph);
91
+
92
+ #ifndef SINGLE_COV_CAT
93
+ // Virtual coverage
94
+ void setVirtualCoverage(Node * node, Category category,
95
+ Coordinate coverage);
96
+ void incrementVirtualCoverage(Node * node, Category category,
97
+ Coordinate coverage);
98
+ Coordinate getVirtualCoverage(Node * node, Category category);
99
+ Coordinate getTotalCoverage(Node * node);
100
+
101
+ // Original virtual coverage
102
+ void setOriginalVirtualCoverage(Node * node, Category category,
103
+ Coordinate coverage);
104
+ void incrementOriginalVirtualCoverage(Node * node, Category category,
105
+ Coordinate coverage);
106
+ Coordinate getOriginalVirtualCoverage(Node * node, Category category);
107
+
108
+ #else
109
+ // Virtual coverage
110
+ void setVirtualCoverage(Node * node,
111
+ Coordinate coverage);
112
+ void incrementVirtualCoverage(Node * node,
113
+ Coordinate coverage);
114
+ Coordinate getVirtualCoverage(Node * node);
115
+ Coordinate getTotalCoverage(Node * node);
116
+ #endif
117
+
118
+ // Status
119
+ void setNodeStatus(Node * node, boolean status);
120
+ void setSingleNodeStatus(Node * node, boolean status);
121
+ boolean getNodeStatus(Node * node);
122
+
123
+ // Uniqueness
124
+ void setUniqueness(Node * node, boolean value);
125
+ boolean getUniqueness(Node * node);
126
+
127
+ // Gap markers
128
+ void appendGap(Node * node, Coordinate length, Graph * graph);
129
+ void appendNodeGaps(Node * destination, Node * source, Graph * graph);
130
+
131
+ // IO
132
+ TightString *expandNode(Node * node, int WORDLENGTH);
133
+ void appendNodeSequence(Node * node, TightString * sequence,
134
+ Coordinate writeIndex);
135
+ char *expandNodeFragment(Node * node, Coordinate contigStart,
136
+ Coordinate contigFinish, int WORDLENGTH);
137
+
138
+ ////////////////////////////////////////////////////////////
139
+ // Arc functions
140
+ ////////////////////////////////////////////////////////////
141
+
142
+ // Creators/destructor
143
+ Arc *createArc(Node * origin, Node * destination, Graph * graph);
144
+ void createAnalogousArc(Node * origin, Node * destination, Arc * refArc,
145
+ Graph * graph);
146
+ void destroyArc(Arc * arc, Graph * graph);
147
+
148
+ // Multiplicity
149
+ void setMultiplicity(Arc * arc, IDnum mult);
150
+ IDnum getMultiplicity(Arc * arc);
151
+
152
+ // Extremities
153
+ Node *getOrigin(Arc * arc);
154
+ Node *getDestination(Arc * arc);
155
+
156
+ // Finding arcs
157
+ Arc *getArcBetweenNodes(Node * origin, Node * destination, Graph * graph);
158
+ Arc *getNextArc(Arc * arc);
159
+
160
+ // Lookup table option
161
+ void activateArcLookupTable(Graph * graph);
162
+ void deactivateArcLookupTable(Graph * graph);
163
+
164
+ ////////////////////////////////////////////////////////////
165
+ // Short read marker functions
166
+ ////////////////////////////////////////////////////////////
167
+
168
+ ShortReadMarker *getShortReadMarkerAtIndex(ShortReadMarker * array,
169
+ IDnum index);
170
+
171
+ IDnum getShortReadMarkerID(ShortReadMarker * marker);
172
+
173
+ extern inline Coordinate getShortReadMarkerPosition(ShortReadMarker * marker);
174
+ extern inline void setShortReadMarkerPosition(ShortReadMarker * marker,
175
+ Coordinate position);
176
+
177
+ extern inline ShortLength getShortReadMarkerOffset(ShortReadMarker * marker);
178
+ extern inline void setShortReadMarkerOffset(ShortReadMarker * marker,
179
+ ShortLength offset);
180
+
181
+ ////////////////////////////////////////////////////////////
182
+ // Gap marker functions
183
+ ////////////////////////////////////////////////////////////
184
+
185
+ GapMarker *getGap(Node * node, Graph * graph);
186
+ GapMarker *getNextGap(GapMarker * marker);
187
+ Coordinate getGapStart(GapMarker * marker);
188
+ Coordinate getGapFinish(GapMarker * marker);
189
+
190
+ ////////////////////////////////////////////////////////////
191
+ // Graph functions
192
+ ////////////////////////////////////////////////////////////
193
+
194
+ // Memory allocation
195
+ Graph *emptyGraph(IDnum sequenceCount, int wordLength);
196
+ void allocateNodeSpace(Graph * graph, IDnum nodeCount);
197
+ Node *addEmptyNodeToGraph(Graph * graph, IDnum nodeID);
198
+ void destroyGraph(Graph * graph);
199
+
200
+ // Dimensions
201
+ IDnum nodeCount(Graph * graph);
202
+ IDnum sequenceCount(Graph * graph);
203
+ void renumberNodes(Graph * graph);
204
+ int getWordLength(Graph * graph);
205
+ boolean doubleStrandedGraph(Graph * graph);
206
+
207
+ // Element status
208
+ void resetNodeStatus(Graph * graph);
209
+
210
+ // File IO
211
+ Graph *importGraph(char *filename);
212
+ void exportGraph(char *filename, Graph * graph, TightString * sequences);
213
+ Graph *readPreGraphFile(char *preGraphFilename, boolean * double_strand);
214
+ Graph *readConnectedGraphFile(char *connectedGraphFilename, boolean * double_strand);
215
+
216
+ // Read starts
217
+ void activateReadStarts(Graph * graph);
218
+ boolean readStartsAreActivated(Graph * graph);
219
+ void createNodeReadStartArrays(Graph * graph);
220
+ void orderNodeReadStartArrays(Graph * graph);
221
+
222
+ // Insert lengths
223
+ void setInsertLengths(Graph * graph, Category cat, Coordinate insertLength,
224
+ Coordinate insertLength_std_dev);
225
+ Coordinate getInsertLength(Graph * graph, Category cat);
226
+ double getInsertLength_var(Graph * graph, Category cat);
227
+
228
+ // Gaps markers
229
+ void activateGapMarkers(Graph * graph);
230
+ void deactivateGapMarkers(Graph * graph);
231
+ void sortGapMarkers(Graph * graph);
232
+
233
+ #endif
@@ -0,0 +1,1472 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+
22
+ #include <stdlib.h>
23
+ #include <stdio.h>
24
+ #include <string.h>
25
+ #include <limits.h>
26
+ #include <sys/time.h>
27
+
28
+ #ifdef _OPENMP
29
+ #include <omp.h>
30
+ #endif
31
+
32
+ #include "globals.h"
33
+ #include "graph.h"
34
+ #include "passageMarker.h"
35
+ #include "readSet.h"
36
+ #include "tightString.h"
37
+ #include "recycleBin.h"
38
+ #include "utility.h"
39
+ #include "kmer.h"
40
+ #include "kmerOccurenceTable.h"
41
+ #include "roadMap.h"
42
+
43
+ #define ADENINE 0
44
+ #define CYTOSINE 1
45
+ #define GUANINE 2
46
+ #define THYMINE 3
47
+
48
+
49
+ //////////////////////////////////////////////////////////
50
+ // Node Locking
51
+ //////////////////////////////////////////////////////////
52
+
53
+ #ifdef _OPENMP
54
+
55
+ /* Array of per-node locks */
56
+
57
+ static omp_lock_t *nodeLocks = NULL;
58
+
59
+ static void
60
+ createNodeLocks(Graph *graph)
61
+ {
62
+ IDnum nbNodes;
63
+ IDnum nodeIndex;
64
+
65
+ nbNodes = nodeCount(graph) + 1;
66
+ if (nodeLocks)
67
+ free (nodeLocks);
68
+ nodeLocks = mallocOrExit(nbNodes, omp_lock_t);
69
+
70
+ #pragma omp parallel for
71
+ for (nodeIndex = 0; nodeIndex < nbNodes; nodeIndex++)
72
+ omp_init_lock(nodeLocks + nodeIndex);
73
+ }
74
+
75
+ static inline void lockNode(Node *node)
76
+ {
77
+ IDnum nodeID = getNodeID(node);
78
+
79
+ if (nodeID < 0)
80
+ nodeID = -nodeID;
81
+ omp_set_lock (nodeLocks + nodeID);
82
+ }
83
+
84
+ /* Assumes node is already locked */
85
+ static inline void lockTwoNodes(Node *node, Node *node2)
86
+ {
87
+ IDnum nodeID = getNodeID(node);
88
+ IDnum node2ID = getNodeID(node2);
89
+
90
+ if (nodeID < 0)
91
+ nodeID = -nodeID;
92
+ if (node2ID < 0)
93
+ node2ID = -node2ID;
94
+
95
+ if (nodeID == node2ID)
96
+ return;
97
+
98
+ /* Lock lowest ID first to avoid deadlocks */
99
+ if (nodeID < node2ID)
100
+ {
101
+ omp_set_lock (nodeLocks + node2ID);
102
+ }
103
+ else if (!omp_test_lock (nodeLocks + node2ID))
104
+ {
105
+ omp_unset_lock (nodeLocks + nodeID);
106
+ omp_set_lock (nodeLocks + node2ID);
107
+ omp_set_lock (nodeLocks + nodeID);
108
+ }
109
+ }
110
+
111
+ static inline void unLockTwoNodes(Node *node, Node *node2)
112
+ {
113
+ IDnum nodeID = getNodeID(node);
114
+ IDnum node2ID = getNodeID(node2);
115
+
116
+ if (nodeID < 0)
117
+ nodeID = -nodeID;
118
+ if (node2ID < 0)
119
+ node2ID = -node2ID;
120
+
121
+ omp_unset_lock (nodeLocks + nodeID);
122
+ if (nodeID != node2ID)
123
+ omp_unset_lock (nodeLocks + node2ID);
124
+ }
125
+
126
+ static inline void unLockNode(Node *node)
127
+ {
128
+ IDnum nodeID = getNodeID(node);
129
+
130
+ if (nodeID < 0)
131
+ nodeID = -nodeID;
132
+ omp_unset_lock (nodeLocks + nodeID);
133
+ }
134
+
135
+ #endif
136
+
137
+ //////////////////////////////////////////////////////////
138
+ // Node Lists
139
+ //////////////////////////////////////////////////////////
140
+ typedef struct smallNodeList_st SmallNodeList;
141
+
142
+ struct smallNodeList_st {
143
+ Node *node;
144
+ SmallNodeList *next;
145
+ } ATTRIBUTE_PACKED;
146
+
147
+ static RecycleBin *smallNodeListMemory = NULL;
148
+
149
+ #define BLOCKSIZE 1000
150
+
151
#ifdef _OPENMP
/*
 * Creates the per-thread array of SmallNodeList recycle bins, one bin per
 * potential OpenMP thread, unless it already exists.  The check and the
 * creation are done inside a critical section.
 */
static void initSmallNodeListMemory(void)
{
	const int threadCount = omp_get_max_threads();

	#pragma omp critical
	{
		if (!smallNodeListMemory)
			smallNodeListMemory = newRecycleBinArray(threadCount, sizeof(SmallNodeList), BLOCKSIZE);
	}
}
#endif
163
+
164
+ static SmallNodeList *allocateSmallNodeList()
165
+ {
166
+ #ifdef _OPENMP
167
+ #ifdef DEBUG
168
+ if (smallNodeListMemory == NULL)
169
+ {
170
+ velvetLog("The memory for small nodes seems uninitialised, "
171
+ "this is probably a bug, aborting.\n");
172
+ abort();
173
+ }
174
+ #endif
175
+ return allocatePointer(getRecycleBinInArray(smallNodeListMemory,
176
+ omp_get_thread_num()));
177
+ #else
178
+ if (smallNodeListMemory == NULL)
179
+ smallNodeListMemory = newRecycleBin(sizeof(SmallNodeList), BLOCKSIZE);
180
+
181
+ return allocatePointer(smallNodeListMemory);
182
+ #endif
183
+ }
184
+
185
+ static void deallocateSmallNodeList(SmallNodeList * smallNodeList)
186
+ {
187
+ #ifdef _OPENMP
188
+ deallocatePointer(getRecycleBinInArray(smallNodeListMemory,
189
+ omp_get_thread_num()),
190
+ smallNodeList);
191
+ #else
192
+ deallocatePointer(smallNodeListMemory, smallNodeList);
193
+ #endif
194
+ }
195
+
196
+ static void destroySmallNodeListMemmory(void)
197
+ {
198
+ if (smallNodeListMemory != NULL)
199
+ {
200
+ #ifdef _OPENMP
201
+ destroyRecycleBinArray(smallNodeListMemory);
202
+ #else
203
+ destroyRecycleBin(smallNodeListMemory);
204
+ #endif
205
+ smallNodeListMemory = NULL;
206
+ }
207
+ }
208
+
209
+ static inline void memorizeNode(Node * node, SmallNodeList ** nodePile)
210
+ {
211
+ SmallNodeList *list = allocateSmallNodeList();
212
+ list->node = node;
213
+ list->next = *nodePile;
214
+ *nodePile = list;
215
+ #ifndef _OPENMP
216
+ setSingleNodeStatus(node, true);
217
+ #endif
218
+ }
219
+
220
+ static inline boolean isNodeMemorized(Node * node, SmallNodeList * nodePile)
221
+ {
222
+ #ifdef _OPENMP
223
+ /* SF TODO There must be a faster way to do this: bit mask, hash table, tree, ... ? */
224
+ SmallNodeList * list;
225
+
226
+ for (list = nodePile; list; list = list->next)
227
+ if (list->node == node)
228
+ return true;
229
+
230
+ return false;
231
+ #else
232
+ return getNodeStatus(node);
233
+ #endif
234
+ }
235
+
236
+ static void unMemorizeNodes(SmallNodeList ** nodePile)
237
+ {
238
+ SmallNodeList * list;
239
+
240
+ while (*nodePile) {
241
+ list = *nodePile;
242
+ *nodePile = list->next;
243
+ #ifndef _OPENMP
244
+ setSingleNodeStatus(list->node, false);
245
+ #endif
246
+ deallocateSmallNodeList(list);
247
+ }
248
+ }
249
+
250
+ ///////////////////////////////////////////////////////////
251
+ // Reference Mappings
252
+ ///////////////////////////////////////////////////////////
253
+ typedef struct referenceMapping_st ReferenceMapping;
254
+
255
+ struct referenceMapping_st {
256
+ IDnum referenceStart;
257
+ IDnum nodeStart;
258
+ IDnum length;
259
+ IDnum referenceID;
260
+ IDnum nodeID;
261
+ } ATTRIBUTE_PACKED;
262
+
263
+ static IDnum countMappings(char * preGraphFilename) {
264
+ FILE *file = fopen(preGraphFilename, "r");
265
+ const int maxline = MAXLINE;
266
+ char line[MAXLINE];
267
+ IDnum count = 0;
268
+
269
+ // Go past NODE blocks
270
+ while(fgets(line, maxline, file))
271
+ if (line[0] == 'S')
272
+ break;
273
+
274
+ // Count relevant lines
275
+ while(fgets(line, maxline, file))
276
+ if (line[0] != 'S')
277
+ count++;
278
+
279
+ fclose(file);
280
+ return count;
281
+ }
282
+
283
+ static ReferenceMapping * recordReferenceMappings(char * preGraphFilename, IDnum arrayLength) {
284
+ ReferenceMapping * mappings = callocOrExit(arrayLength, ReferenceMapping);
285
+ FILE *file = fopen(preGraphFilename, "r");
286
+ const int maxline = MAXLINE;
287
+ char line[MAXLINE];
288
+ ReferenceMapping * current = mappings;
289
+ IDnum referenceID;
290
+ long long_var;
291
+ long long coord1, coord2, coord3;
292
+
293
+ // Go past NODE blocks
294
+ while(fgets(line, maxline, file))
295
+ if (line[0] == 'S')
296
+ break;
297
+
298
+ sscanf(line, "SEQ\t%li\n", &long_var);
299
+ referenceID = long_var;
300
+
301
+ // Go relevant lines
302
+ while(fgets(line, maxline, file)) {
303
+ if (line[0] != 'S') {
304
+ sscanf(line, "%li\t%lli\t%lli\t%lli\n", &long_var, &coord1, &coord2, &coord3);
305
+ current->referenceID = referenceID;
306
+ current->nodeID = long_var;
307
+ current->nodeStart = coord1;
308
+ current->referenceStart = coord2;
309
+ current->length = coord3;
310
+ current++;
311
+ } else {
312
+ sscanf(line, "SEQ\t%li\n", &long_var);
313
+ referenceID = long_var;
314
+ }
315
+ }
316
+
317
+ fclose(file);
318
+ return mappings;
319
+ }
320
+
321
+ static int compareRefMaps(const void * ptrA, const void * ptrB) {
322
+ ReferenceMapping * A = (ReferenceMapping *) ptrA;
323
+ ReferenceMapping * B = (ReferenceMapping *) ptrB;
324
+
325
+ if (A->referenceID > B->referenceID)
326
+ return 1;
327
+ else if (A->referenceID < B->referenceID)
328
+ return -1;
329
+ else {
330
+ if (A->referenceStart >= B->referenceStart + B->length)
331
+ return 1;
332
+ else if (A->referenceStart + A->length <= B->referenceStart)
333
+ return -1;
334
+ else
335
+ return 0;
336
+ }
337
+ }
338
+
339
+ static ReferenceMapping * computeReferenceMappings(char * preGraphFilename, ReadSet * reads, Coordinate * referenceMappingLength, IDnum * referenceCount) {
340
+ IDnum index;
341
+ ReferenceMapping * referenceMappings;
342
+
343
+ for(index = 0; index < reads->readCount && reads->categories[index] == 2 * CATEGORIES + 2; index++)
344
+ (*referenceCount)++;
345
+
346
+ if (*referenceCount == 0) {
347
+ *referenceMappingLength = 0;
348
+ return NULL;
349
+ }
350
+
351
+ *referenceMappingLength = countMappings(preGraphFilename);
352
+
353
+ if (*referenceMappingLength == 0)
354
+ return NULL;
355
+
356
+ referenceMappings = recordReferenceMappings(preGraphFilename, *referenceMappingLength);
357
+ qsort(referenceMappings, *referenceMappingLength, sizeof(ReferenceMapping), compareRefMaps);
358
+
359
+ return referenceMappings;
360
+ }
361
+
362
+ static ReferenceMapping * findReferenceMapping(IDnum seqID, Coordinate refCoord, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount) {
363
+ IDnum positive_seqID;
364
+ Coordinate leftIndex = 0;
365
+ Coordinate rightIndex = referenceMappingCount - 1;
366
+ Coordinate middleIndex;
367
+ ReferenceMapping refMap;
368
+ int comparison;
369
+
370
+ if (seqID > 0)
371
+ positive_seqID = seqID;
372
+ else
373
+ positive_seqID = -seqID;
374
+
375
+ refMap.referenceID = positive_seqID;
376
+ refMap.referenceStart = refCoord;
377
+ refMap.length = 1;
378
+ refMap.nodeStart = 0;
379
+ refMap.nodeID = 0;
380
+
381
+ if (compareRefMaps(&(referenceMappings[leftIndex]), &refMap) == 0)
382
+ return &(referenceMappings[leftIndex]);
383
+ if (compareRefMaps(&(referenceMappings[rightIndex]), &refMap) == 0)
384
+ return &(referenceMappings[rightIndex]);
385
+
386
+ while (true) {
387
+ middleIndex = (rightIndex + leftIndex) / 2;
388
+ comparison = compareRefMaps(&(referenceMappings[middleIndex]), &refMap);
389
+
390
+ if (leftIndex >= rightIndex)
391
+ return NULL;
392
+ else if (comparison == 0)
393
+ return &(referenceMappings[middleIndex]);
394
+ else if (leftIndex == middleIndex)
395
+ return NULL;
396
+ else if (comparison > 0)
397
+ rightIndex = middleIndex;
398
+ else
399
+ leftIndex = middleIndex;
400
+ }
401
+ }
402
+
403
+ ///////////////////////////////////////////////////////////
404
+ // Node Mask
405
+ ///////////////////////////////////////////////////////////
406
+
407
+ typedef struct nodeMask_st NodeMask;
408
+
409
+ struct nodeMask_st {
410
+ IDnum nodeID;
411
+ IDnum start;
412
+ IDnum finish;
413
+ } ATTRIBUTE_PACKED;
414
+
415
+ static int compareNodeMasks(const void * ptrA, const void * ptrB) {
416
+ NodeMask * A = (NodeMask *) ptrA;
417
+ NodeMask * B = (NodeMask *) ptrB;
418
+
419
+ if (A->nodeID < B->nodeID)
420
+ return -1;
421
+ else if (A->nodeID > B->nodeID)
422
+ return 1;
423
+ else {
424
+ if (A->start < B->start)
425
+ return -1;
426
+ else if (A->start > B->start)
427
+ return 1;
428
+ else
429
+ return 0;
430
+ }
431
+ }
432
+
433
+ static NodeMask * computeNodeMasks(ReferenceMapping * referenceMappings, Coordinate arrayLength, Graph * graph) {
434
+ NodeMask * nodeMasks;
435
+ NodeMask * currentMask;
436
+ ReferenceMapping * currentMapping = referenceMappings;
437
+ Coordinate index;
438
+
439
+ if (referenceMappings == NULL)
440
+ return NULL;
441
+
442
+ nodeMasks = callocOrExit(arrayLength, NodeMask);
443
+ currentMask = nodeMasks;
444
+
445
+ for (index = 0; index < arrayLength; index++) {
446
+ if (currentMapping->nodeID > 0) {
447
+ currentMask->nodeID = currentMapping->nodeID;
448
+ } else {
449
+ currentMask->nodeID = -currentMapping->nodeID;
450
+ }
451
+ currentMask->start = currentMapping->nodeStart;
452
+ currentMask->finish = currentMapping->nodeStart + currentMapping->length;
453
+ currentMask++;
454
+ currentMapping++;
455
+ }
456
+
457
+ qsort(nodeMasks, arrayLength, sizeof(NodeMask), compareNodeMasks);
458
+
459
+ return nodeMasks;
460
+ }
461
+
462
+ ///////////////////////////////////////////////////////////
463
+ // Process
464
+ ///////////////////////////////////////////////////////////
465
+
466
+ static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
467
+ short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount)
468
+ {
469
+ FILE *file = fopen(preGraphFilename, "r");
470
+ const int maxline = MAXLINE;
471
+ char line[MAXLINE];
472
+ char c;
473
+ int wordLength;
474
+ Coordinate lineLength, kmerCount;
475
+ Kmer word;
476
+ Kmer antiWord;
477
+ KmerOccurenceTable *kmerTable;
478
+ IDnum index;
479
+ IDnum nodeID = 0;
480
+ Nucleotide nucleotide;
481
+ NodeMask * nodeMask = nodeMasks;
482
+ Coordinate nodeMaskIndex = 0;
483
+
484
+ if (file == NULL)
485
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);
486
+
487
+ // Count kmers
488
+ velvetLog("Scanning pre-graph file %s for k-mers\n",
489
+ preGraphFilename);
490
+
491
+ // First line
492
+ if (!fgets(line, maxline, file))
493
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
494
+ sscanf(line, "%*i\t%*i\t%i\n", &wordLength);
495
+
496
+ kmerTable = newKmerOccurenceTable(accelerationBits, wordLength);
497
+
498
+ // Read nodes
499
+ if (!fgets(line, maxline, file))
500
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
501
+ kmerCount = 0;
502
+ while (line[0] == 'N') {
503
+ lineLength = 0;
504
+ while ((c = getc(file)) != EOF && c != '\n')
505
+ lineLength++;
506
+ kmerCount += lineLength - wordLength + 1;
507
+ if (fgets(line, maxline, file) == NULL)
508
+ break;
509
+ }
510
+
511
+ velvetLog("%li kmers found\n", (long) kmerCount);
512
+
513
+ for(nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) {
514
+ kmerCount -= nodeMasks[nodeMaskIndex].finish -
515
+ nodeMasks[nodeMaskIndex].start;
516
+ }
517
+
518
+ nodeMaskIndex = 0;
519
+
520
+ fclose(file);
521
+
522
+ // Create table
523
+ allocateKmerOccurences(kmerCount, kmerTable);
524
+
525
+ // Fill table
526
+ file = fopen(preGraphFilename, "r");
527
+ if (file == NULL)
528
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);
529
+
530
+ if (!fgets(line, maxline, file))
531
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
532
+
533
+ // Read nodes
534
+ if (!fgets(line, maxline, file))
535
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
536
+ while (line[0] == 'N') {
537
+ nodeID++;
538
+
539
+ // Fill in the initial word :
540
+ clearKmer(&word);
541
+ clearKmer(&antiWord);
542
+
543
+ for (index = 0; index < wordLength - 1; index++) {
544
+ c = getc(file);
545
+ if (c == 'A')
546
+ nucleotide = ADENINE;
547
+ else if (c == 'C')
548
+ nucleotide = CYTOSINE;
549
+ else if (c == 'G')
550
+ nucleotide = GUANINE;
551
+ else if (c == 'T')
552
+ nucleotide = THYMINE;
553
+ else if (c == '\n')
554
+ exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
555
+ else
556
+ nucleotide = ADENINE;
557
+
558
+
559
+ pushNucleotide(&word, nucleotide);
560
+ if (double_strand) {
561
+ #ifdef COLOR
562
+ reversePushNucleotide(&antiWord, nucleotide);
563
+ #else
564
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
565
+ #endif
566
+ }
567
+ }
568
+
569
+ // Scan through node
570
+ index = 0;
571
+ while((c = getc(file)) != '\n' && c != EOF) {
572
+ if (c == 'A')
573
+ nucleotide = ADENINE;
574
+ else if (c == 'C')
575
+ nucleotide = CYTOSINE;
576
+ else if (c == 'G')
577
+ nucleotide = GUANINE;
578
+ else if (c == 'T')
579
+ nucleotide = THYMINE;
580
+ else
581
+ nucleotide = ADENINE;
582
+
583
+ pushNucleotide(&word, nucleotide);
584
+ if (double_strand) {
585
+ #ifdef COLOR
586
+ reversePushNucleotide(&antiWord, nucleotide);
587
+ #else
588
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
589
+ #endif
590
+ }
591
+
592
+ // Update mask if necessary
593
+ if (nodeMask) {
594
+ if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) {
595
+ if (++nodeMaskIndex == nodeMaskCount)
596
+ nodeMask = NULL;
597
+ else
598
+ nodeMask++;
599
+ }
600
+ }
601
+
602
+ // Check if not masked!
603
+ if (nodeMask) {
604
+ if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) {
605
+ index++;
606
+ continue;
607
+ }
608
+ }
609
+
610
+ if (!double_strand || compareKmers(&word, &antiWord) <= 0)
611
+ recordKmerOccurence(&word, nodeID, index, kmerTable);
612
+ else
613
+ recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable);
614
+
615
+ index++;
616
+ }
617
+
618
+ if (fgets(line, maxline, file) == NULL)
619
+ break;
620
+ }
621
+
622
+ fclose(file);
623
+
624
+ // Sort table
625
+ sortKmerOccurenceTable(kmerTable);
626
+
627
+ return kmerTable;
628
+ }
629
+
630
+ static void ghostThreadSequenceThroughGraph(TightString * tString,
631
+ KmerOccurenceTable *
632
+ kmerTable, Graph * graph,
633
+ IDnum seqID, Category category,
634
+ boolean readTracking,
635
+ boolean double_strand,
636
+ ReferenceMapping * referenceMappings,
637
+ Coordinate referenceMappingCount,
638
+ IDnum refCount,
639
+ Annotation * annotations,
640
+ IDnum annotationCount,
641
+ boolean second_in_pair)
642
+ {
643
+ Kmer word;
644
+ Kmer antiWord;
645
+ Coordinate readNucleotideIndex;
646
+ KmerOccurence *kmerOccurence;
647
+ int wordLength = getWordLength(graph);
648
+ Nucleotide nucleotide;
649
+ IDnum refID;
650
+ Coordinate refCoord;
651
+ ReferenceMapping * refMap = NULL;
652
+ Coordinate uniqueIndex = 0;
653
+ Coordinate annotIndex = 0;
654
+ IDnum annotCount = 0;
655
+ boolean reversed;
656
+ SmallNodeList * nodePile = NULL;
657
+ Annotation * annotation = annotations;
658
+
659
+ Node *node = NULL;
660
+ Node *previousNode = NULL;
661
+
662
+ // Neglect any read which will not be short paired
663
+ if ((!readTracking && category % 2 == 0)
664
+ || category / 2 >= CATEGORIES)
665
+ return;
666
+
667
+ // Neglect any string shorter than WORDLENGTH :
668
+ if (getLength(tString) < wordLength)
669
+ return;
670
+
671
+ // Verify that all short reads are reasonnably short
672
+ if (getLength(tString) > USHRT_MAX) {
673
+ velvetLog("Short read of length %lli, longer than limit %i\n",
674
+ (long long) getLength(tString), SHRT_MAX);
675
+ velvetLog("You should better declare this sequence as long, because it genuinely is!\n");
676
+ exit(1);
677
+ }
678
+
679
+ clearKmer(&word);
680
+ clearKmer(&antiWord);
681
+
682
+ // Fill in the initial word :
683
+ for (readNucleotideIndex = 0;
684
+ readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
685
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
686
+ pushNucleotide(&word, nucleotide);
687
+ if (double_strand || second_in_pair) {
688
+ #ifdef COLOR
689
+ reversePushNucleotide(&antiWord, nucleotide);
690
+ #else
691
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
692
+ #endif
693
+ }
694
+ }
695
+
696
+ // Go through sequence
697
+ while (readNucleotideIndex < getLength(tString)) {
698
+ // Shift word:
699
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
700
+ pushNucleotide(&word, nucleotide);
701
+ if (double_strand || second_in_pair) {
702
+ #ifdef COLOR
703
+ reversePushNucleotide(&antiWord, nucleotide);
704
+ #else
705
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
706
+ #endif
707
+ }
708
+
709
+ // Update annotation if necessary
710
+ if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
711
+ annotation = getNextAnnotation(annotation);
712
+ annotCount++;
713
+ annotIndex = 0;
714
+ }
715
+
716
+ // Search for reference mapping
717
+ if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
718
+ refID = getAnnotSequenceID(annotation);
719
+ if (refID > 0)
720
+ refCoord = getStart(annotation) + annotIndex;
721
+ else
722
+ refCoord = getStart(annotation) - annotIndex;
723
+
724
+ refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
725
+ // If success
726
+ if (refMap) {
727
+ if (refID > 0)
728
+ node = getNodeInGraph(graph, refMap->nodeID);
729
+ else
730
+ node = getNodeInGraph(graph, -refMap->nodeID);
731
+ } else {
732
+ node = NULL;
733
+ if (previousNode)
734
+ break;
735
+ }
736
+ }
737
+ // if not.. look in table
738
+ else {
739
+ reversed = false;
740
+ if (double_strand) {
741
+ if (compareKmers(&word, &antiWord) <= 0) {
742
+ kmerOccurence =
743
+ findKmerInKmerOccurenceTable(&word,
744
+ kmerTable);
745
+ } else {
746
+ kmerOccurence =
747
+ findKmerInKmerOccurenceTable(&antiWord,
748
+ kmerTable);
749
+ reversed = true;
750
+ }
751
+ } else {
752
+ if (!second_in_pair) {
753
+ kmerOccurence =
754
+ findKmerInKmerOccurenceTable(&word,
755
+ kmerTable);
756
+ } else {
757
+ kmerOccurence =
758
+ findKmerInKmerOccurenceTable(&antiWord,
759
+ kmerTable);
760
+ reversed = true;
761
+ }
762
+ }
763
+
764
+ if (kmerOccurence) {
765
+ if (!reversed)
766
+ node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
767
+ else
768
+ node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
769
+ } else {
770
+ node = NULL;
771
+ if (previousNode)
772
+ break;
773
+ }
774
+
775
+ }
776
+
777
+ if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
778
+ annotIndex++;
779
+ else
780
+ uniqueIndex++;
781
+
782
+ previousNode = node;
783
+
784
+ // Fill in graph
785
+ if (node && !isNodeMemorized(node, nodePile))
786
+ {
787
+ #ifdef _OPENMP
788
+ lockNode(node);
789
+ #endif
790
+ incrementReadStartCount(node, graph);
791
+ #ifdef _OPENMP
792
+ unLockNode(node);
793
+ #endif
794
+ memorizeNode(node, &nodePile);
795
+ }
796
+ }
797
+
798
+ unMemorizeNodes(&nodePile);
799
+ }
800
+ // Threads one sequence through the graph k-mer by k-mer: resolves each k-mer to a node and coordinate, then updates passage markers, read starts, coverage and arcs.
801
+ static void threadSequenceThroughGraph(TightString * tString, // sequence to thread
802
+ KmerOccurenceTable * kmerTable, // k-mer lookup table
803
+ Graph * graph, // graph being filled in
804
+ IDnum seqID, Category category, // seqID is stored in markers / read starts; category selects the bookkeeping path
805
+ boolean readTracking, // when true, read starts are recorded on nodes
806
+ boolean double_strand, // when true, compareKmers decides whether word or antiWord is looked up
807
+ ReferenceMapping * referenceMappings, // reference -> node mappings (checked against NULL for REFERENCE sequences)
808
+ Coordinate referenceMappingCount, // number of entries in referenceMappings
809
+ IDnum refCount, // annotations whose sequence ID lies in [-refCount, refCount] are resolved through referenceMappings
810
+ Annotation * annotations, // first annotation of this sequence
811
+ IDnum annotationCount, // number of annotations for this sequence
812
+ boolean second_in_pair) // when true and not double_strand, antiWord is looked up instead of word
813
+ {
814
+ Kmer word;
815
+ Kmer antiWord;
816
+ Coordinate readNucleotideIndex;
817
+ Coordinate kmerIndex;
818
+ KmerOccurence *kmerOccurence;
819
+ int wordLength = getWordLength(graph);
820
+ // Marker and node/coordinate state carried over from the previous k-mer
821
+ PassageMarkerI marker = NULL_IDX;
822
+ PassageMarkerI previousMarker = NULL_IDX;
823
+ Node *node = NULL;
824
+ Node *previousNode = NULL;
825
+ Coordinate coord = 0;
826
+ Coordinate previousCoord = 0;
827
+ Nucleotide nucleotide;
828
+ boolean reversed;
829
+ // Cursors used for reference lookups and for walking the annotations
830
+ IDnum refID;
831
+ Coordinate refCoord = 0;
832
+ ReferenceMapping * refMap;
833
+ Annotation * annotation = annotations;
834
+ Coordinate index = 0; // incremented at the end of every loop iteration
835
+ Coordinate uniqueIndex = 0; // advanced for k-mers not consumed by the current annotation
836
+ Coordinate annotIndex = 0; // offset inside the current annotation
837
+ IDnum annotCount = 0; // number of annotations already passed
838
+ SmallNodeList * nodePile = NULL; // nodes that already received a read start from this sequence
839
+
840
+ // Neglect any string shorter than WORDLENGTH :
841
+ if (getLength(tString) < wordLength)
842
+ return;
843
+
844
+ clearKmer(&word);
845
+ clearKmer(&antiWord);
846
+
847
+ // Fill in the initial word with the first wordLength - 1 nucleotides :
848
+ for (readNucleotideIndex = 0;
849
+ readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
850
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
851
+ pushNucleotide(&word, nucleotide);
852
+ if (double_strand || second_in_pair) { // antiWord is only maintained when it may be looked up
853
+ #ifdef COLOR
854
+ reversePushNucleotide(&antiWord, nucleotide);
855
+ #else
856
+ reversePushNucleotide(&antiWord, 3 - nucleotide); // presumably the complementary base - confirm against the Nucleotide encoding
857
+ #endif
858
+ }
859
+ }
860
+
861
+ // Go through sequence, completing one new k-mer per nucleotide
862
+ // printf("len %d\n", getLength(tString));
863
+ while (readNucleotideIndex < getLength(tString)) {
864
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
865
+ pushNucleotide(&word, nucleotide);
866
+ if (double_strand || second_in_pair) {
867
+ #ifdef COLOR
868
+ reversePushNucleotide(&antiWord, nucleotide);
869
+ #else
870
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
871
+ #endif
872
+ }
873
+
874
+ // Move on to the next annotation once the current one is used up
875
+ if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
876
+ annotation = getNextAnnotation(annotation);
877
+ annotCount++;
878
+ annotIndex = 0;
879
+ }
880
+
881
+ // Reference sequences: look up their own mapping by (seqID, index)
882
+ if (category == REFERENCE) {
883
+ if (referenceMappings)
884
+ refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
885
+ else
886
+ refMap = NULL;
887
+
888
+ if (refMap) {
889
+ node = getNodeInGraph(graph, refMap->nodeID);
890
+ if (refMap->nodeID > 0) {
891
+ coord = refMap->nodeStart + (index - refMap->referenceStart);
892
+ } else {
893
+ coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
894
+ }
895
+ } else {
896
+ node = NULL; // no break here: a reference keeps going after an unmapped position
897
+ }
898
+ }
899
+ // Sequences whose current annotation points at a reference: map through that reference
900
+ else if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
901
+ refID = getAnnotSequenceID(annotation);
902
+ if (refID > 0)
903
+ refCoord = getStart(annotation) + annotIndex;
904
+ else
905
+ refCoord = getStart(annotation) - annotIndex; // non-positive refID: step backwards from the annotation start
906
+
907
+ refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
908
+ // If success
909
+ if (refMap) {
910
+ if (refID > 0) {
911
+ node = getNodeInGraph(graph, refMap->nodeID);
912
+ if (refMap->nodeID > 0) {
913
+ coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
914
+ } else {
915
+ coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
916
+ }
917
+ } else {
918
+ node = getNodeInGraph(graph, -refMap->nodeID); // opposite orientation of the mapped node
919
+ if (refMap->nodeID > 0) {
920
+ coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
921
+ } else {
922
+ coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
923
+ }
924
+ }
925
+ } else {
926
+ node = NULL;
927
+ if (previousNode)
928
+ break; // lookup failed after an earlier hit: stop threading this sequence
929
+ }
930
+ }
931
+ // Otherwise look the k-mer up in the occurrence table
932
+ else {
933
+ reversed = false;
934
+ if (double_strand) {
935
+ if (compareKmers(&word, &antiWord) <= 0) {
936
+ kmerOccurence =
937
+ findKmerInKmerOccurenceTable(&word,
938
+ kmerTable);
939
+ } else {
940
+ kmerOccurence =
941
+ findKmerInKmerOccurenceTable(&antiWord,
942
+ kmerTable);
943
+ reversed = true;
944
+ }
945
+ } else {
946
+ if (!second_in_pair) {
947
+ kmerOccurence =
948
+ findKmerInKmerOccurenceTable(&word,
949
+ kmerTable);
950
+ } else {
951
+ kmerOccurence =
952
+ findKmerInKmerOccurenceTable(&antiWord,
953
+ kmerTable);
954
+ reversed = true;
955
+ }
956
+ }
957
+
958
+ if (kmerOccurence) {
959
+ if (!reversed) {
960
+ node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
961
+ coord = getKmerOccurencePosition(kmerOccurence);
962
+ } else {
963
+ node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); // hit found via antiWord: use the negated node ID
964
+ coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1; // position counted from the other end of the node
965
+ }
966
+ } else {
967
+ node = NULL;
968
+ if (previousNode)
969
+ break; // lookup failed after an earlier hit: stop threading this sequence
970
+ }
971
+ }
972
+
973
+ // Increment positions: inside an annotation advance annotIndex, otherwise uniqueIndex
974
+ if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
975
+ annotIndex++;
976
+ else
977
+ uniqueIndex++;
978
+
979
+ // Fill in graph
980
+ if (node)
981
+ {
982
+ #ifdef _OPENMP
983
+ lockNode(node);
984
+ #endif
985
+ kmerIndex = readNucleotideIndex - wordLength; // start of the current k-mer (readNucleotideIndex was already advanced)
986
+ // Same node at the next coordinate: extend what the previous k-mer started
987
+ if (previousNode == node
988
+ && previousCoord == coord - 1) {
989
+ if (category / 2 >= CATEGORIES) { // these categories are tracked with passage markers
990
+ setPassageMarkerFinish(marker,
991
+ kmerIndex +
992
+ 1);
993
+ setFinishOffset(marker,
994
+ getNodeLength(node)
995
+ - coord - 1);
996
+ } else {
997
+ #ifndef SINGLE_COV_CAT
998
+ incrementVirtualCoverage(node, category / 2, 1);
999
+ incrementOriginalVirtualCoverage(node, category / 2, 1);
1000
+ #else
1001
+ incrementVirtualCoverage(node, 1);
1002
+ #endif
1003
+ }
1004
+ #ifdef _OPENMP
1005
+ unLockNode(node);
1006
+ #endif
1007
+ } else { // different node, or a non-consecutive coordinate on the same node
1008
+ if (category / 2 >= CATEGORIES) {
1009
+ marker =
1010
+ newPassageMarker(seqID,
1011
+ kmerIndex,
1012
+ kmerIndex + 1,
1013
+ coord,
1014
+ getNodeLength
1015
+ (node) -
1016
+ coord - 1);
1017
+ transposePassageMarker(marker,
1018
+ node);
1019
+ connectPassageMarkers
1020
+ (previousMarker, marker,
1021
+ graph);
1022
+ previousMarker = marker;
1023
+ } else {
1024
+ if (readTracking) {
1025
+ if (!isNodeMemorized(node, nodePile)) { // first visit of this node by this sequence
1026
+ addReadStart(node,
1027
+ seqID,
1028
+ coord,
1029
+ graph,
1030
+ kmerIndex);
1031
+ memorizeNode(node, &nodePile);
1032
+ } else {
1033
+ blurLastShortReadMarker
1034
+ (node, graph);
1035
+ }
1036
+ }
1037
+
1038
+ #ifndef SINGLE_COV_CAT
1039
+ incrementVirtualCoverage(node, category / 2, 1);
1040
+ incrementOriginalVirtualCoverage(node, category / 2, 1);
1041
+ #else
1042
+ incrementVirtualCoverage(node, 1);
1043
+ #endif
1044
+ }
1045
+ #ifdef _OPENMP
1046
+ lockTwoNodes(node, previousNode);
1047
+ #endif
1048
+ if (category != REFERENCE) // reference sequences do not create arcs
1049
+ createArc(previousNode, node, graph);
1050
+ #ifdef _OPENMP
1051
+ unLockTwoNodes(node, previousNode);
1052
+ #endif
1053
+ }
1054
+ // Only successful lookups update the "previous" state
1055
+ previousNode = node;
1056
+ previousCoord = coord;
1057
+ }
1058
+ index++;
1059
+ }
1060
+ // printKmer(&word);
1061
+ // nodePile is only filled on the read-tracking path for categories below CATEGORIES
1062
+ if (readTracking && category / 2 < CATEGORIES)
1063
+ unMemorizeNodes(&nodePile);
1064
+ }
1065
+ // Threads every read through the graph in two passes (ghost threading, then real threading) and afterwards releases the roadmap, node locks, node-list memory and the k-mer table.
1066
+ static void fillUpGraph(ReadSet * reads, // sequences and their categories
1067
+ KmerOccurenceTable * kmerTable, // k-mer lookup table; destroyed before returning
1068
+ Graph * graph, // graph being filled in
1069
+ boolean readTracking, // forwarded to both threading functions
1070
+ boolean double_strand, // forwarded to both threading functions
1071
+ ReferenceMapping * referenceMappings, // when non-NULL, the roadmap file is loaded for annotations
1072
+ Coordinate referenceMappingCount, // number of entries in referenceMappings
1073
+ IDnum refCount, // the ghost pass starts at this read index
1074
+ char * roadmapFilename) // only read when referenceMappings is non-NULL
1075
+ {
1076
+ IDnum readIndex;
1077
+ RoadMapArray *roadmap = NULL;
1078
+ Coordinate *annotationOffset = NULL;
1079
+ struct timeval start, end, diff;
1080
+ // With reference mappings, load the roadmap and precompute where each read's annotations start
1081
+ if (referenceMappings)
1082
+ {
1083
+ roadmap = importRoadMapArray(roadmapFilename);
1084
+ annotationOffset = callocOrExit(reads->readCount, Coordinate);
1085
+ for (readIndex = 1; readIndex < reads->readCount; readIndex++)
1086
+ annotationOffset[readIndex] = annotationOffset[readIndex - 1] // running sum of the annotation counts of preceding reads
1087
+ + getAnnotationCount(getRoadMapInArray(roadmap, readIndex - 1));
1088
+ }
1089
+
1090
+ resetNodeStatus(graph);
1091
+ // Make sure read starts are activated on the graph
1092
+ if (!readStartsAreActivated(graph))
1093
+ activateReadStarts(graph);
1094
+ // Pass 1: ghost threading, skipping the first refCount sequences
1095
+ gettimeofday(&start, NULL);
1096
+ #ifdef _OPENMP
1097
+ initSmallNodeListMemory();
1098
+ createNodeLocks(graph);
1099
+ #pragma omp parallel for
1100
+ #endif
1101
+ for (readIndex = refCount; readIndex < reads->readCount; readIndex++)
1102
+ {
1103
+ Annotation * annotations = NULL;
1104
+ IDnum annotationCount = 0;
1105
+ Category category;
1106
+ boolean second_in_pair;
1107
+
1108
+ if (readIndex % 1000000 == 0)
1109
+ velvetLog("Ghost Threading through reads %ld / %ld\n",
1110
+ (long) readIndex, (long) reads->readCount);
1111
+
1112
+ category = reads->categories[readIndex];
1113
+ second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex); // only odd categories are checked for pairing
1114
+
1115
+ if (referenceMappings)
1116
+ {
1117
+ annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex));
1118
+ annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]);
1119
+ }
1120
+
1121
+ ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1122
+ kmerTable,
1123
+ graph, readIndex + 1, // sequence IDs are the read index plus one
1124
+ category,
1125
+ readTracking, double_strand,
1126
+ referenceMappings, referenceMappingCount,
1127
+ refCount, annotations, annotationCount,
1128
+ second_in_pair);
1129
+ }
1130
+ createNodeReadStartArrays(graph);
1131
+ gettimeofday(&end, NULL);
1132
+ timersub(&end, &start, &diff);
1133
+ velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1134
+ // Pass 2: real threading over all sequences, with at most 32 OpenMP threads
1135
+ gettimeofday(&start, NULL);
1136
+ #ifdef _OPENMP
1137
+ int threads = omp_get_max_threads();
1138
+ if (threads > 32)
1139
+ threads = 32;
1140
+
1141
+ #pragma omp parallel for num_threads(threads)
1142
+ #endif
1143
+ for (readIndex = 0; readIndex < reads->readCount; readIndex++)
1144
+ {
1145
+ Annotation * annotations = NULL;
1146
+ IDnum annotationCount = 0;
1147
+ Category category;
1148
+ boolean second_in_pair;
1149
+
1150
+ if (readIndex % 1000000 == 0)
1151
+ velvetLog("Threading through reads %li / %li\n",
1152
+ (long) readIndex, (long) reads->readCount);
1153
+
1154
+ category = reads->categories[readIndex];
1155
+ second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex);
1156
+
1157
+ if (referenceMappings)
1158
+ {
1159
+ annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex));
1160
+ annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]);
1161
+ }
1162
+
1163
+ threadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1164
+ kmerTable,
1165
+ graph, readIndex + 1, category,
1166
+ readTracking, double_strand,
1167
+ referenceMappings, referenceMappingCount,
1168
+ refCount, annotations, annotationCount, second_in_pair);
1169
+ }
1170
+ gettimeofday(&end, NULL);
1171
+ timersub(&end, &start, &diff);
1172
+ velvetLog(" === Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1173
+ // Clean-up
1174
+ #ifdef _OPENMP
1175
+ free(nodeLocks);
1176
+ nodeLocks = NULL;
1177
+ #endif
1178
+
1179
+ if (referenceMappings)
1180
+ {
1181
+ destroyRoadMapArray(roadmap);
1182
+ free (annotationOffset);
1183
+ }
1184
+
1185
+ orderNodeReadStartArrays(graph);
1186
+ // NOTE(review): called unconditionally although initSmallNodeListMemory() above is OpenMP-only - confirm this is intended
1187
+ destroySmallNodeListMemmory();
1188
+
1189
+ destroyKmerOccurenceTable(kmerTable); // the table is consumed here; callers must not reuse it
1190
+ }
1191
+ // Reads a pregraph file, builds reference mappings, node masks and a k-mer table, threads the reads through the graph and returns it.
1192
+ Graph *importPreGraph(char *preGraphFilename, ReadSet * reads, char * roadmapFilename, // roadmapFilename is forwarded to fillUpGraph
1193
+ boolean readTracking, short int accelerationBits) // accelerationBits is forwarded to referenceGraphKmers
1194
+ {
1195
+ boolean double_strand = false; // filled in by readPreGraphFile
1196
+ Graph *graph = readPreGraphFile(preGraphFilename, &double_strand);
1197
+ Coordinate referenceMappingCount = 0;
1198
+ IDnum referenceCount = 0;
1199
+
1200
+ if (nodeCount(graph) == 0)
1201
+ return graph; // empty graph: returned as is, nothing is threaded
1202
+
1203
+ // If necessary compile reference -> node
1204
+ ReferenceMapping * referenceMappings = computeReferenceMappings(preGraphFilename, reads, &referenceMappingCount, &referenceCount);
1205
+ // Node -> reference maps
1206
+ NodeMask * nodeMasks = computeNodeMasks(referenceMappings, referenceMappingCount, graph);
1207
+
1208
+ // Map k-mers to nodes
1209
+ KmerOccurenceTable *kmerTable =
1210
+ referenceGraphKmers(preGraphFilename, accelerationBits, graph, double_strand, nodeMasks, referenceMappingCount);
1211
+ // The node masks are not used after the k-mer table has been built
1212
+ free(nodeMasks);
1213
+
1214
+ // Map sequences -> kmers -> nodes (fillUpGraph also destroys kmerTable)
1215
+ fillUpGraph(reads, kmerTable, graph, readTracking, double_strand, referenceMappings, referenceMappingCount, referenceCount, roadmapFilename);
1216
+
1217
+ free(referenceMappings);
1218
+
1219
+ return graph;
1220
+ }
1221
+ // Records read starts for one read by looking up each of its k-mers in the table; this function does not touch coverage, passage markers or arcs.
1222
+ static void addReadsToGraph(TightString * tString, // sequence to process
1223
+ KmerOccurenceTable * kmerTable, // k-mer lookup table
1224
+ Graph * graph, // graph receiving the read starts
1225
+ IDnum seqID, Category category, // seqID is stored in the read starts; categories at or beyond CATEGORIES are skipped
1226
+ boolean readTracking, // not used in this function
1227
+ boolean double_strand, // when true, compareKmers decides whether word or antiWord is looked up
1228
+ boolean second_in_pair) // when true and not double_strand, antiWord is looked up instead of word
1229
+ {
1230
+ Kmer word;
1231
+ Kmer antiWord;
1232
+ Coordinate readNucleotideIndex;
1233
+ Coordinate kmerIndex;
1234
+ KmerOccurence *kmerOccurence;
1235
+ int wordLength = getWordLength(graph);
1236
+
1237
+ Node *node = NULL;
1238
+ Node *previousNode = NULL;
1239
+ Coordinate coord = 0;
1240
+ Coordinate previousCoord = 0;
1241
+ Nucleotide nucleotide;
1242
+ boolean reversed;
1243
+
1244
+ Coordinate index = 0; // incremented every iteration but never read
1245
+ SmallNodeList * nodePile = NULL; // nodes that already received a read start from this sequence
1246
+
1247
+ // Neglect any read which will not be short paired
1248
+ if (category / 2 >= CATEGORIES)
1249
+ return;
1250
+
1251
+ // Neglect any string shorter than WORDLENGTH :
1252
+ if (getLength(tString) < wordLength)
1253
+ return;
1254
+
1255
+ clearKmer(&word);
1256
+ clearKmer(&antiWord);
1257
+
1258
+ // Fill in the initial word with the first wordLength - 1 nucleotides :
1259
+ for (readNucleotideIndex = 0;
1260
+ readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
1261
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
1262
+ pushNucleotide(&word, nucleotide);
1263
+ if (double_strand || second_in_pair) { // antiWord is only maintained when it may be looked up
1264
+ #ifdef COLOR
1265
+ reversePushNucleotide(&antiWord, nucleotide);
1266
+ #else
1267
+ reversePushNucleotide(&antiWord, 3 - nucleotide); // presumably the complementary base - confirm against the Nucleotide encoding
1268
+ #endif
1269
+ }
1270
+ }
1271
+
1272
+ // Go through sequence, completing one new k-mer per nucleotide
1273
+ // printf("len %d\n", getLength(tString));
1274
+ while (readNucleotideIndex < getLength(tString)) {
1275
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
1276
+ pushNucleotide(&word, nucleotide);
1277
+ if (double_strand || second_in_pair) {
1278
+ #ifdef COLOR
1279
+ reversePushNucleotide(&antiWord, nucleotide);
1280
+ #else
1281
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
1282
+ #endif
1283
+ }
1284
+
1285
+ // Search in table
1286
+ reversed = false;
1287
+ if (double_strand) {
1288
+ if (compareKmers(&word, &antiWord) <= 0) {
1289
+ kmerOccurence =
1290
+ findKmerInKmerOccurenceTable(&word,
1291
+ kmerTable);
1292
+ } else {
1293
+ kmerOccurence =
1294
+ findKmerInKmerOccurenceTable(&antiWord,
1295
+ kmerTable);
1296
+ reversed = true;
1297
+ }
1298
+ } else {
1299
+ if (!second_in_pair) {
1300
+ kmerOccurence =
1301
+ findKmerInKmerOccurenceTable(&word,
1302
+ kmerTable);
1303
+ } else {
1304
+ kmerOccurence =
1305
+ findKmerInKmerOccurenceTable(&antiWord,
1306
+ kmerTable);
1307
+ reversed = true;
1308
+ }
1309
+ }
1310
+ // Turn the table hit into a node and a coordinate on that node
1311
+ if (kmerOccurence) {
1312
+ if (!reversed) {
1313
+ node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
1314
+ coord = getKmerOccurencePosition(kmerOccurence);
1315
+ } else {
1316
+ node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); // hit found via antiWord: use the negated node ID
1317
+ coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1; // position counted from the other end of the node
1318
+ }
1319
+ } else {
1320
+ node = NULL;
1321
+ if (previousNode)
1322
+ break; // lookup failed after an earlier hit: stop processing this sequence
1323
+ }
1324
+
1325
+ // Fill in graph
1326
+ if (node)
1327
+ {
1328
+ #ifdef _OPENMP
1329
+ lockNode(node);
1330
+ #endif
1331
+ kmerIndex = readNucleotideIndex - wordLength; // start of the current k-mer (readNucleotideIndex was already advanced)
1332
+ // Only act when this k-mer does not directly continue the previous one on the same node
1333
+ if (previousNode != node || previousCoord != coord -1) {
1334
+ if (!isNodeMemorized(node, nodePile)) { // first visit of this node by this sequence
1335
+ addReadStart(node,
1336
+ seqID,
1337
+ coord,
1338
+ graph,
1339
+ kmerIndex);
1340
+ memorizeNode(node, &nodePile);
1341
+ } else {
1342
+ blurLastShortReadMarker
1343
+ (node, graph);
1344
+ }
1345
+ }
1346
+ #ifdef _OPENMP
1347
+ unLockNode(node);
1348
+ #endif
1349
+ previousNode = node;
1350
+ previousCoord = coord;
1351
+ }
1352
+ index++;
1353
+ }
1354
+ // printKmer(&word);
1355
+ // Release the per-read list of memorized nodes
1356
+ if (category / 2 < CATEGORIES) // always true here: other categories returned early above
1357
+ unMemorizeNodes(&nodePile);
1358
+ }
1359
+ // Maps every read onto a connected graph in two passes (ghost threading, then addReadsToGraph) and afterwards releases the node locks, node-list memory and the k-mer table.
1360
+ static void fillUpConnectedGraph(ReadSet * reads, // sequences and their categories
1361
+ KmerOccurenceTable * kmerTable, // k-mer lookup table; destroyed before returning
1362
+ Graph * graph, // graph receiving the read starts
1363
+ boolean readTracking, // forwarded to both per-read functions
1364
+ boolean double_strand) // forwarded to both per-read functions
1365
+ {
1366
+ IDnum refCount = 0; // refs not present in connected graphs
1367
+ IDnum readIndex;
1368
+ struct timeval start, end, diff;
1369
+
1370
+ resetNodeStatus(graph);
1371
+ // Make sure read starts are activated on the graph
1372
+ if (!readStartsAreActivated(graph))
1373
+ activateReadStarts(graph);
1374
+ // Pass 1: ghost threading over all reads (refCount is 0 here)
1375
+ gettimeofday(&start, NULL);
1376
+ #ifdef _OPENMP
1377
+ initSmallNodeListMemory();
1378
+ createNodeLocks(graph);
1379
+ #pragma omp parallel for
1380
+ #endif
1381
+ for (readIndex = refCount; readIndex < reads->readCount; readIndex++)
1382
+ {
1383
+ Category category;
1384
+ boolean second_in_pair;
1385
+
1386
+ if (readIndex % 1000000 == 0)
1387
+ velvetLog("Ghost Threading through reads %ld / %ld\n",
1388
+ (long) readIndex, (long) reads->readCount);
1389
+
1390
+ category = reads->categories[readIndex];
1391
+ second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex); // only odd categories are checked for pairing
1392
+
1393
+ // referenceMappings = NULL, referenceMappingCount = 0
1394
+ // refCount = 0, annotations = NULL, annotationCount = 0
1395
+ ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1396
+ kmerTable,
1397
+ graph, readIndex + 1, // sequence IDs are the read index plus one
1398
+ category,
1399
+ readTracking, double_strand,
1400
+ NULL, 0,
1401
+ 0, NULL, 0,
1402
+ second_in_pair);
1403
+ }
1404
+ createNodeReadStartArrays(graph);
1405
+ gettimeofday(&end, NULL);
1406
+ timersub(&end, &start, &diff);
1407
+ velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec); // casts keep the arguments in step with %ld on every platform
1408
+ // Pass 2: record read starts for all reads, with at most 32 OpenMP threads
1409
+ gettimeofday(&start, NULL);
1410
+ #ifdef _OPENMP
1411
+ int threads = omp_get_max_threads();
1412
+ if (threads > 32)
1413
+ threads = 32;
1414
+
1415
+ #pragma omp parallel for num_threads(threads)
1416
+ #endif
1417
+ for (readIndex = 0; readIndex < reads->readCount; readIndex++)
1418
+ {
1419
+ Category category;
1420
+ boolean second_in_pair;
1421
+
1422
+ if (readIndex % 1000000 == 0)
1423
+ velvetLog("Adding reads %li / %li\n",
1424
+ (long) readIndex, (long) reads->readCount);
1425
+
1426
+ category = reads->categories[readIndex];
1427
+ second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex);
1428
+
1429
+ addReadsToGraph(getTightStringInArray(reads->tSequences, readIndex),
1430
+ kmerTable,
1431
+ graph, readIndex + 1, category,
1432
+ readTracking, double_strand, second_in_pair);
1433
+ }
1434
+ gettimeofday(&end, NULL);
1435
+ timersub(&end, &start, &diff);
1436
+ velvetLog(" === Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec); // casts keep the arguments in step with %ld on every platform
1437
+ // Clean-up
1438
+ #ifdef _OPENMP
1439
+ free(nodeLocks);
1440
+ nodeLocks = NULL;
1441
+ #endif
1442
+
1443
+ orderNodeReadStartArrays(graph);
1444
+
1445
+ destroySmallNodeListMemmory();
1446
+
1447
+ destroyKmerOccurenceTable(kmerTable); // the table is consumed here; callers must not reuse it
1448
+ }
1449
+ // Reads a connected graph file and, when read tracking is requested, builds a k-mer table and maps the reads onto the graph before returning it.
1450
+ Graph *importConnectedGraph(char *connectedGraphFilename, ReadSet * reads, char * roadmapFilename, // roadmapFilename is not used in this function
1451
+ boolean readTracking, short int accelerationBits) // accelerationBits is forwarded to referenceGraphKmers
1452
+ {
1453
+ boolean double_strand = false; // filled in by readConnectedGraphFile
1454
+ Graph *graph = readConnectedGraphFile(connectedGraphFilename, &double_strand);
1455
+
1456
+ if (nodeCount(graph) == 0)
1457
+ return graph; // empty graph: returned as is
1458
+
1459
+ if (readTracking) { // reads are only mapped when read tracking is requested
1460
+ Coordinate referenceMappingCount = 0;
1461
+ NodeMask * nodeMasks = NULL;
1462
+
1463
+ // Map k-mers to nodes (no node masks and no reference mappings are passed)
1464
+ KmerOccurenceTable *kmerTable =
1465
+ referenceGraphKmers(connectedGraphFilename, accelerationBits, graph, doubleStrandedGraph(graph), nodeMasks, referenceMappingCount);
1466
+ // NOTE(review): strandedness comes from doubleStrandedGraph(graph) above but from double_strand below - confirm the two always agree
1467
+ // Map sequences -> kmers -> nodes (fillUpConnectedGraph also destroys kmerTable)
1468
+ fillUpConnectedGraph(reads, kmerTable, graph, readTracking, double_strand);
1469
+ }
1470
+
1471
+ return graph;
1472
+ }