finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,29 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+
5
+ #include "globals.h"
6
+ #include "utility.h"
7
+ #include "readToNode.h"
8
+ #include "graphStructures.h"
9
+ #include "graph.h"
10
+
11
+ int main(int argc, char **argv){
12
+ if (argc != 2){
13
+ printf("Usage: readToNode <lastgraph_file>\n");
14
+ exit(1);
15
+ }
16
+
17
+ Graph* graph = importGraph(argv[1]);
18
+ ReadIdToNodeIdLookupTable* readToNode = createReadToNode(graph);
19
+ printf("Parsed in graph with %i reads and %i entries\n", readToNode->num_reads, readToNode->num_contents);
20
+
21
+ char* fileName = "ReadToNode.bin";
22
+ writeReadIdToNodeIdLookupTable(fileName, readToNode);
23
+ printf("Finished writing\n");
24
+
25
+ destroyReadIdToNodeIdLookupTable(readToNode);
26
+ destroyGraph(graph);
27
+
28
+ return 0;
29
+ }
@@ -0,0 +1,1876 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <time.h>
24
+ #include <math.h>
25
+ #include <sys/time.h>
26
+
27
+ #ifdef _OPENMP
28
+ #include <omp.h>
29
+ #endif
30
+
31
+ #include "globals.h"
32
+ #include "graph.h"
33
+ #include "concatenatedGraph.h"
34
+ #include "recycleBin.h"
35
+ #include "locallyCorrectedGraph.h"
36
+ #include "passageMarker.h"
37
+ #include "readSet.h"
38
+ #include "utility.h"
39
+ #include "scaffold.h"
40
+
41
+ #define BLOCK_SIZE 100000
42
+ #define LN2 1.4
43
+
44
+ static int PEBBLE_ROUND_NUM = 0;
45
+
46
+ typedef struct readOccurence_st ReadOccurence;
47
+ static double paired_exp_fraction = 0.1;
48
+
49
+ struct connection_st { /* One directed scaffold link from a node towards `destination`. */
50
+ Node *destination; /* Node this connection points to; NULL right after allocateConnection(). */
51
+ Connection *right; /* Right child while the connections form a splay tree; next element when used as a list. */
52
+ Connection *left; /* Left child while the connections form a splay tree; previous element when used as a list. */
53
+ Connection *twin; /* Reciprocal connection stored on the destination node, or NULL when there is none. */
54
+ float distance; /* Estimated distance to the destination. */
55
+ float variance; /* Variance attached to `distance`; used as the weight when estimates are merged. */
56
+ IDnum direct_count; /* Counter; presumably observations from reads touching both nodes -- TODO confirm. */
57
+ IDnum paired_count; /* Counter; presumably observations from read pairs -- TODO confirm. */
58
+ unsigned char clean; /* Flag reset to false by allocateConnection(); its consumers are outside this section. */
59
+ } ATTRIBUTE_PACKED;
60
+
61
+ struct readOccurence_st { /* One occurrence of a read on a node. */
62
+ IDnum position; /* Short reads: marker position; long reads: start offset of the passage marker; -1 once a long read is met again on the same node. */
63
+ IDnum offset; /* Short reads: marker offset; long reads: passage marker start; -1 under the same condition as `position`. */
64
+ IDnum nodeID; /* Signed ID of the node carrying the read. */
65
+ } ATTRIBUTE_PACKED;
66
+
67
+ // Global params
68
+ static IDnum UNRELIABLE_CONNECTION_CUTOFF = 5;
69
+
70
+ // Global pointers
71
+ static Graph *graph;
72
+ static Connection **scaffold = NULL;
73
+ static RecycleBin *connectionMemory = NULL;
74
+ static boolean estimated[CATEGORIES + 1];
75
+
76
+ #ifdef _OPENMP
77
+
78
+ #define READS_PER_LOCK 32
79
+
80
+
81
+ /* Array of reads locks */
82
+ static omp_lock_t *readsLocks = NULL;
83
+ /* Array of per-node locks */
84
+ static omp_lock_t *nodeLocks = NULL;
85
+
86
+ static void
87
+ createReadsLocks()
88
+ {
89
+ Coordinate nbLocks;
90
+ Coordinate lockIndex;
91
+
92
+ if (readsLocks)
93
+ free (readsLocks);
94
+ nbLocks = 1 + sequenceCount(graph) / READS_PER_LOCK;
95
+ readsLocks = mallocOrExit(nbLocks, omp_lock_t);
96
+
97
+ #pragma omp parallel for
98
+ for (lockIndex = 0; lockIndex < nbLocks; lockIndex++)
99
+ omp_init_lock(readsLocks + lockIndex);
100
+ }
101
+
102
+ static inline void lockRead(IDnum readID)
103
+ {
104
+ omp_set_lock (readsLocks + readID / READS_PER_LOCK);
105
+ }
106
+
107
+ static inline void unLockRead(IDnum readID)
108
+ {
109
+ omp_unset_lock (readsLocks + readID / READS_PER_LOCK);
110
+ }
111
+
112
+ static void
113
+ createNodeLocks(Graph *graph)
114
+ {
115
+ IDnum nbNodes;
116
+ IDnum nodeIndex;
117
+
118
+ nbNodes = nodeCount(graph) + 1;
119
+ if (nodeLocks)
120
+ free (nodeLocks);
121
+ nodeLocks = mallocOrExit(nbNodes, omp_lock_t);
122
+
123
+ #pragma omp parallel for
124
+ for (nodeIndex = 0; nodeIndex < nbNodes; nodeIndex++)
125
+ omp_init_lock(nodeLocks + nodeIndex);
126
+ }
127
+
128
+ /* Tries to avoid deadlocking */
129
+ static inline void lockTwoNodes(IDnum nodeID, IDnum node2ID)
130
+ {
131
+ if (nodeID < 0)
132
+ nodeID = -nodeID;
133
+ if (node2ID < 0)
134
+ node2ID = -node2ID;
135
+
136
+ /* Lock lowest ID first to avoid deadlocks */
137
+ if (nodeID < node2ID)
138
+ {
139
+ omp_set_lock (nodeLocks + nodeID);
140
+ omp_set_lock (nodeLocks + node2ID);
141
+ }
142
+ else
143
+ {
144
+ omp_set_lock (nodeLocks + node2ID);
145
+ omp_set_lock (nodeLocks + nodeID);
146
+ }
147
+ }
148
+
149
+ static inline void unLockTwoNodes(IDnum nodeID, IDnum node2ID)
150
+ {
151
+ if (nodeID < 0)
152
+ nodeID = -nodeID;
153
+ if (node2ID < 0)
154
+ node2ID = -node2ID;
155
+
156
+ omp_unset_lock (nodeLocks + nodeID);
157
+ omp_unset_lock (nodeLocks + node2ID);
158
+ }
159
+ #endif
160
+
161
+ static Connection *allocateConnection()
162
+ {
163
+ Connection *connect;
164
+ #ifdef _OPENMP
165
+ #pragma omp critical
166
+ {
167
+ #endif
168
+ if (connectionMemory == NULL)
169
+ connectionMemory =
170
+ newRecycleBin(sizeof(Connection), BLOCK_SIZE);
171
+
172
+ connect = allocatePointer(connectionMemory);
173
+ #ifdef _OPENMP
174
+ }
175
+ #endif
176
+ connect->destination = NULL;
177
+ connect->clean = false;
178
+ return connect;
179
+ }
180
+
181
+ static void deallocateConnection(Connection * connect)
182
+ {
183
+ deallocatePointer(connectionMemory, connect);
184
+ }
185
+
186
+ Node * getConnectionDestination(Connection * connect) {
187
+ return connect->destination;
188
+ }
189
+
190
+ Connection * getNextConnection(Connection * connect) {
191
+ return connect->right;
192
+ }
193
+
194
+ Connection * getTwinConnection(Connection * connect) {
195
+ return connect->twin;
196
+ }
197
+
198
+ Coordinate getConnectionDistance(Connection * connect) {
199
+ return (Coordinate) connect->distance;
200
+ }
201
+
202
+ double getConnectionVariance(Connection * connect) {
203
+ return connect->variance;
204
+ }
205
+
206
+ IDnum getConnectionDirectCount(Connection * connect) {
207
+ return connect->direct_count;
208
+ }
209
+
210
+ IDnum getConnectionPairedCount(Connection * connect) {
211
+ return connect->paired_count;
212
+ }
213
+
214
+ Connection * getConnection(Node * node) {
215
+ return scaffold[getNodeID(node) + nodeCount(graph)];
216
+ }
217
+
218
+ void incrementConnectionDistance(Connection * connect, Coordinate increment) {
219
+ connect->distance += increment;
220
+ }
221
+
222
+ static double norm(double X)
223
+ {
224
+ return 0.4 * exp(-X * X / 2);
225
+ }
226
+
227
+ static double normInt(double X, double Y)
228
+ {
229
+ return (erf(0.7 * Y) - erf(0.7 * X)) / 2;
230
+ }
231
+
232
+ static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
233
+ IDnum ** counts, Category cat)
234
+ {
235
+ Node *A = getNodeInGraph(graph, IDA);
236
+ Node *B = connect->destination;
237
+ double left, middle, right;
238
+ Coordinate longLength, shortLength, D;
239
+ IDnum longCount;
240
+ double M, N, O, P;
241
+ Coordinate mu = getInsertLength(graph, cat);
242
+ double sigma = sqrt(getInsertLength_var(graph, cat));
243
+ double result;
244
+
245
+ if (mu <= 0)
246
+ return 0;
247
+
248
+ if (getNodeLength(A) < getNodeLength(B)) {
249
+ longLength = getNodeLength(B);
250
+ shortLength = getNodeLength(A);
251
+ longCount = counts[cat][getNodeID(B) + nodeCount(graph)];
252
+ } else {
253
+ longLength = getNodeLength(A);
254
+ shortLength = getNodeLength(B);
255
+ longCount = counts[cat][IDA + nodeCount(graph)];
256
+ }
257
+
258
+ D = getConnectionDistance(connect) - (longLength + shortLength) / 2;
259
+
260
+ M = (D - mu) / sigma;
261
+ N = (D + shortLength - mu) / sigma;
262
+ O = (D + longLength - mu) / sigma;
263
+ P = (D + shortLength + longLength - mu) / sigma;
264
+
265
+ left = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
266
+ middle = shortLength * normInt(N, O);
267
+ right = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);
268
+
269
+ result = (longCount * (left + middle + right)) / longLength;
270
+
271
+ if (result > 0)
272
+ return (IDnum) result;
273
+ else
274
+ return 0;
275
+ }
276
+
277
+ void destroyConnection(Connection * connect, IDnum nodeID)
278
+ {
279
+ Connection *previous, *next;
280
+
281
+ //velvetLog("Destroying connection from %li to %li\n", nodeID, getNodeID(connect->destination));
282
+
283
+ if (connect == NULL)
284
+ return;
285
+
286
+ previous = connect->left;
287
+ next = connect->right;
288
+
289
+ if (previous != NULL)
290
+ previous->right = next;
291
+ if (next != NULL)
292
+ next->left = previous;
293
+
294
+ if (scaffold[nodeID + nodeCount(graph)] == connect)
295
+ scaffold[nodeID + nodeCount(graph)] = next;
296
+
297
+ if (connect->twin != NULL) {
298
+ connect->twin->twin = NULL;
299
+ destroyConnection(connect->twin,
300
+ getNodeID(connect->destination));
301
+ }
302
+
303
+ deallocateConnection(connect);
304
+ }
305
+
306
+ static boolean testConnection(IDnum IDA,
307
+ Connection *connect,
308
+ IDnum **counts,
309
+ boolean *shadows)
310
+ {
311
+ IDnum total = 0;
312
+ Category cat;
313
+
314
+ // Spare unique -> undetermined node connections
315
+ if (!getUniqueness(connect->destination))
316
+ return true;
317
+
318
+ // Destroy tenuous connections
319
+ if (connect->paired_count + connect->direct_count <
320
+ UNRELIABLE_CONNECTION_CUTOFF)
321
+ return false;
322
+
323
+ for (cat = 0; cat < CATEGORIES; cat++)
324
+ if (!shadows[cat] || cat <= PEBBLE_ROUND_NUM)
325
+ total += expectedNumberOfConnections(IDA, connect, counts, cat);
326
+
327
+ // Remove inconsistent connections
328
+ return connect->paired_count >= total * paired_exp_fraction;
329
+ }
330
+
331
+ static IDnum *computeReadToNodeCounts(Coordinate *totalCount)
332
+ {
333
+ IDnum nodeIndex;
334
+ IDnum maxNodeIndex = 2 * nodeCount(graph) + 1;
335
+ IDnum maxReadIndex = sequenceCount(graph) + 1;
336
+ IDnum *readNodeCounts = callocOrExit(maxReadIndex, IDnum);
337
+ unsigned char *readMarker = callocOrExit(1 + maxReadIndex / 8, unsigned char);
338
+ Coordinate total = 0;
339
+
340
+ velvetLog("Computing read to node mapping array sizes\n");
341
+
342
+ #ifdef _OPENMP
343
+ #pragma omp parallel for reduction(+:total)
344
+ #endif
345
+ for (nodeIndex = 0; nodeIndex < maxNodeIndex; nodeIndex++) {
346
+ Node *node;
347
+ ShortReadMarker *nodeArray;
348
+ IDnum nodeReadCount;
349
+ IDnum readIndex;
350
+
351
+ node = getNodeInGraph(graph, nodeIndex - nodeCount(graph));
352
+ if (node == NULL)
353
+ continue;
354
+ nodeArray = getNodeReads(node, graph);
355
+ nodeReadCount = getNodeReadCount(node, graph);
356
+
357
+ // Short reads
358
+ for (readIndex = 0; readIndex < nodeReadCount; readIndex++) {
359
+ ShortReadMarker *shortMarker;
360
+ IDnum readID;
361
+
362
+ shortMarker = getShortReadMarkerAtIndex(nodeArray,
363
+ readIndex);
364
+ readID = getShortReadMarkerID(shortMarker);
365
+ #ifdef _OPENMP
366
+ #pragma omp atomic
367
+ #endif
368
+ readNodeCounts[readID]++;
369
+ total++;
370
+ }
371
+ }
372
+
373
+ for (nodeIndex = 0; nodeIndex < maxNodeIndex; nodeIndex++) {
374
+ Node *node;
375
+ PassageMarkerI marker;
376
+
377
+ node = getNodeInGraph(graph, nodeIndex - nodeCount(graph));
378
+ if (node == NULL)
379
+ continue;
380
+ // Long reads
381
+ for (marker = getMarker(node); marker != NULL_IDX;
382
+ marker = getNextInNode(marker)) {
383
+ IDnum readIndex = getPassageMarkerSequenceID(marker);;
384
+
385
+ if (readIndex < 0)
386
+ continue;
387
+
388
+ const unsigned int idx = readIndex / 8;
389
+ const unsigned int mask = 1 << (readIndex & 7);
390
+ if (readMarker[idx] & mask)
391
+ continue;
392
+
393
+ readNodeCounts[readIndex]++;
394
+ total++;
395
+ readMarker[idx] |= mask;
396
+ }
397
+
398
+ // Clean up marker array
399
+ for (marker = getMarker(node); marker != NULL_IDX;
400
+ marker = getNextInNode(marker)) {
401
+ IDnum readIndex = getPassageMarkerSequenceID(marker);
402
+ if (readIndex > 0)
403
+ // No need to go bit-wise
404
+ readMarker[readIndex / 8] = 0;
405
+ }
406
+ }
407
+
408
+ *totalCount = total;
409
+ free(readMarker);
410
+ return readNodeCounts;
411
+ }
412
+
413
+ static ReadOccurence **allocateReadToNodeTables(IDnum * readNodeCounts,
414
+ Coordinate totalCount,
415
+ ReadOccurence **readNodesArray)
416
+ {
417
+ Coordinate offset = 0;
418
+ IDnum readIndex;
419
+ IDnum maxReadIndex = sequenceCount(graph) + 1;
420
+ ReadOccurence **readNodes = callocOrExit(maxReadIndex, ReadOccurence *);
421
+ *readNodesArray = callocOrExit(totalCount, ReadOccurence);
422
+
423
+ for (readIndex = 1; readIndex < maxReadIndex; readIndex++) {
424
+ if (readNodeCounts[readIndex] != 0) {
425
+ readNodes[readIndex] = *readNodesArray + offset;
426
+ offset += readNodeCounts[readIndex];
427
+ readNodeCounts[readIndex] = 0;
428
+ }
429
+ }
430
+
431
+ return readNodes;
432
+ }
433
+
434
+ static void computePartialReadToNodeMappingShort(IDnum nodeID,
435
+ ReadOccurence ** readNodes,
436
+ IDnum * readNodeCounts)
437
+ {
438
+ ShortReadMarker *shortMarker;
439
+ IDnum index, readIndex;
440
+ ReadOccurence *readArray, *readOccurence;
441
+ Node *node = getNodeInGraph(graph, nodeID);
442
+ ShortReadMarker *nodeArray = getNodeReads(node, graph);
443
+ IDnum nodeReadCount = getNodeReadCount(node, graph);
444
+
445
+ for (index = 0; index < nodeReadCount; index++) {
446
+ shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
447
+ readIndex = getShortReadMarkerID(shortMarker);
448
+ readArray = readNodes[readIndex];
449
+ #ifdef _OPENMP
450
+ lockRead(readIndex);
451
+ #endif
452
+ readOccurence = &readArray[readNodeCounts[readIndex]];
453
+ readOccurence->nodeID = nodeID;
454
+ readOccurence->position =
455
+ getShortReadMarkerPosition(shortMarker);
456
+ readOccurence->offset =
457
+ getShortReadMarkerOffset(shortMarker);
458
+ readNodeCounts[readIndex]++;
459
+ #ifdef _OPENMP
460
+ unLockRead(readIndex);
461
+ #endif
462
+ }
463
+ }
464
+
465
+ static void computePartialReadToNodeMappingLong(IDnum nodeID,
466
+ ReadOccurence ** readNodes,
467
+ IDnum * readNodeCounts,
468
+ unsigned char *readMarker,
469
+ ReadSet * reads)
470
+ {
471
+ IDnum readIndex;
472
+ ReadOccurence *readArray, *readOccurence;
473
+ Node *node = getNodeInGraph(graph, nodeID);
474
+ PassageMarkerI marker;
475
+
476
+ for (marker = getMarker(node); marker != NULL_IDX;
477
+ marker = getNextInNode(marker)) {
478
+ readIndex = getPassageMarkerSequenceID(marker);
479
+ if (readIndex <= 0 || reads->categories[readIndex - 1] == REFERENCE)
480
+ continue;
481
+
482
+ const unsigned int idx = readIndex / 8;
483
+ const unsigned int mask = 1 << (readIndex & 7);
484
+ if (readMarker[idx] & mask) {
485
+ readArray = readNodes[readIndex];
486
+ readOccurence =
487
+ &readArray[readNodeCounts[readIndex] - 1];
488
+ readOccurence->position = -1;
489
+ readOccurence->offset = -1;
490
+ } else {
491
+ readArray = readNodes[readIndex];
492
+ readOccurence =
493
+ &readArray[readNodeCounts[readIndex]];
494
+ readOccurence->nodeID = nodeID;
495
+ readOccurence->position = getStartOffset(marker);
496
+ readOccurence->offset =
497
+ getPassageMarkerStart(marker);
498
+ readNodeCounts[readIndex]++;
499
+ readMarker[idx] |= mask;
500
+ }
501
+ }
502
+
503
+ for (marker = getMarker(node); marker != NULL_IDX;
504
+ marker = getNextInNode(marker)) {
505
+ readIndex = getPassageMarkerSequenceID(marker);
506
+ if (readIndex > 0)
507
+ // No need to go bit-wise
508
+ readMarker[readIndex / 8] = 0;
509
+ }
510
+ }
511
+
512
+ static ReadOccurence **computeReadToNodeMappings(IDnum * readNodeCounts,
513
+ ReadSet * reads,
514
+ Coordinate totalCount,
515
+ ReadOccurence **readNodesArray)
516
+ {
517
+ unsigned char *readMarker;
518
+ IDnum nodeID;
519
+ IDnum nodes = nodeCount(graph);
520
+ ReadOccurence **readNodes = allocateReadToNodeTables(readNodeCounts,
521
+ totalCount,
522
+ readNodesArray);
523
+
524
+ velvetLog("Computing read to node mappings\n");
525
+
526
+ #ifdef _OPENMP
527
+ createReadsLocks();
528
+ #pragma omp parallel for
529
+ #endif
530
+ for (nodeID = -nodes; nodeID <= nodes; nodeID++)
531
+ if (nodeID != 0 && getNodeInGraph(graph, nodeID))
532
+ computePartialReadToNodeMappingShort(nodeID, readNodes,
533
+ readNodeCounts);
534
+
535
+ #ifdef _OPENMP
536
+ free(readsLocks);
537
+ readsLocks = NULL;
538
+ #endif
539
+
540
+ readMarker = callocOrExit(1 + sequenceCount(graph) / 8, unsigned char);
541
+ for (nodeID = -nodes; nodeID <= nodes; nodeID++)
542
+ if (nodeID != 0 && getNodeInGraph(graph, nodeID))
543
+ computePartialReadToNodeMappingLong(nodeID, readNodes,
544
+ readNodeCounts,
545
+ readMarker,
546
+ reads);
547
+
548
+ free(readMarker);
549
+ return readNodes;
550
+ }
551
+
552
+ static unsigned char * countCoOccurences(IDnum * coOccurencesCount,
553
+ ReadOccurence ** readNodes,
554
+ IDnum * readNodeCounts,
555
+ IDnum * readPairs,
556
+ Category * cats)
557
+ {
558
+ IDnum readIndex, readPairIndex;
559
+ IDnum readNodeCount;
560
+ IDnum readOccurenceIndex, readPairOccurenceIndex;
561
+ ReadOccurence * readOccurence, *readPairOccurence;
562
+ unsigned char *interestingReads = callocOrExit(1 + sequenceCount(graph) / 8, unsigned char);
563
+ Category libID;
564
+
565
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
566
+ coOccurencesCount[libID] = 0;
567
+
568
+ for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
569
+ // Eliminating dodgy, unpaired, already counted or user-specified reads
570
+ if ( readPairs[readIndex] < readIndex
571
+ || getInsertLength(graph, cats[readIndex]) > -1)
572
+ continue;
573
+
574
+ // Check for co-occurence
575
+ // We know that for each read the read occurences are ordered by increasing node ID
576
+ // Therefore one list is followed by increasing index, whereas the other is followed
577
+ // by decreasing index
578
+ libID = cats[readIndex] / 2;
579
+ readPairIndex = readPairs[readIndex];
580
+
581
+ readOccurenceIndex = 0;
582
+ readOccurence = readNodes[readIndex + 1];
583
+ readNodeCount = readNodeCounts[readIndex + 1];
584
+
585
+ readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1;
586
+ readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]);
587
+
588
+ while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) {
589
+ if (readOccurence->nodeID == -readPairOccurence->nodeID) {
590
+ if (readOccurence->position > 0 && readPairOccurence->position > 0) {
591
+ coOccurencesCount[libID]++;
592
+ interestingReads[readIndex / 8] |= 1 << (readIndex & 7);
593
+ break;
594
+ } else {
595
+ readOccurence++;
596
+ readOccurenceIndex++;
597
+ readPairOccurence--;
598
+ readPairOccurenceIndex--;
599
+ }
600
+ } else if (readOccurence->nodeID < -readPairOccurence->nodeID) {
601
+ readOccurence++;
602
+ readOccurenceIndex++;
603
+ } else {
604
+ readPairOccurence--;
605
+ readPairOccurenceIndex--;
606
+ }
607
+ }
608
+ }
609
+
610
+ return interestingReads;
611
+ }
612
+
613
+ static void measureCoOccurences(IDnum ** coOccurences,
614
+ unsigned char * interestingReads,
615
+ ReadOccurence ** readNodes,
616
+ IDnum * readNodeCounts,
617
+ IDnum * readPairs,
618
+ Category * cats)
619
+ {
620
+ IDnum coOccurencesIndex[CATEGORIES + 1];
621
+ IDnum observationIndex;
622
+ IDnum readIndex, readPairIndex;
623
+ IDnum readNodeCount;
624
+ IDnum readOccurenceIndex, readPairOccurenceIndex;
625
+ ReadOccurence * readOccurence, *readPairOccurence;
626
+ Category libID;
627
+
628
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
629
+ coOccurencesIndex[libID] = 0;
630
+
631
+ for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
632
+ // Eliminating dodgy, unpaired, already counted or user-specified reads
633
+ if (!(interestingReads[readIndex / 8] & (1 << (readIndex & 7))))
634
+ continue;
635
+
636
+ // Find co-occurence
637
+ // We know that for each read the read occurences are ordered by increasing node ID
638
+ libID = cats[readIndex]/2;
639
+ readPairIndex = readPairs[readIndex];
640
+ observationIndex = coOccurencesIndex[libID];
641
+
642
+ readOccurence = readNodes[readIndex + 1];
643
+ readOccurenceIndex = 0;
644
+ readNodeCount = readNodeCounts[readIndex + 1];
645
+
646
+ readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1;
647
+ readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]);
648
+
649
+ while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) {
650
+ if (readOccurence->nodeID == -readPairOccurence->nodeID) {
651
+ if (readOccurence->position > 0 && readPairOccurence->position > 0) {
652
+ coOccurences[libID][observationIndex] =
653
+ getNodeLength(getNodeInGraph(graph, readOccurence->nodeID))
654
+ + getWordLength(graph) - 1
655
+ - (readOccurence->position - readOccurence->offset)
656
+ - (readPairOccurence->position - readPairOccurence->offset);
657
+ coOccurencesIndex[libID]++;
658
+ break;
659
+ } else {
660
+ readOccurence++;
661
+ readOccurenceIndex++;
662
+ readPairOccurence--;
663
+ readPairOccurenceIndex--;
664
+ }
665
+ } else if (readOccurence->nodeID < -readPairOccurence->nodeID) {
666
+ readOccurence++;
667
+ readOccurenceIndex++;
668
+ } else {
669
+ readPairOccurence--;
670
+ readPairOccurenceIndex--;
671
+ }
672
+ }
673
+ }
674
+ }
675
+
676
+ int compareReadOccurences(const void *A, const void * B) {
677
+ IDnum * cA = (IDnum *) A;
678
+ IDnum * cB = (IDnum *) B;
679
+
680
+ if (*cA > *cB)
681
+ return 1;
682
+ if (*cA == *cB)
683
+ return 0;
684
+ return -1;
685
+ }
686
+
687
+ static void estimateLibraryInsertLength(IDnum * coOccurences, IDnum coOccurencesCount, Category libID) {
688
+ Coordinate median, variance;
689
+ IDnum index;
690
+ int counter = 0;
691
+ qsort(coOccurences, coOccurencesCount, sizeof(IDnum), compareReadOccurences);
692
+
693
+ median = coOccurences[coOccurencesCount / 2];
694
+
695
+ // Modified variance around the median (proxy for expected value)
696
+ // interval censoring
697
+ variance = 0;
698
+ for (index = 0; index < coOccurencesCount; index++) {
699
+ if (coOccurences[index] > 0 && coOccurences[index] < 5 * median) {
700
+ variance += (coOccurences[index] - median) * (coOccurences[index] - median);
701
+ counter++;
702
+ }
703
+ }
704
+ if (counter)
705
+ variance /= counter;
706
+ else {
707
+ variance = 0;
708
+ for (index = 0; index < coOccurencesCount; index++)
709
+ variance += (coOccurences[index] - median) * (coOccurences[index] - median);
710
+ variance /= coOccurencesCount;
711
+ }
712
+
713
+ // To avoid subsequent divisions by zero
714
+ if (variance == 0)
715
+ variance = 1;
716
+
717
+ velvetLog("Paired-end library %i has length: %lli, sample standard deviation: %lli\n", libID + 1, (long long) median, (long long) sqrt(variance));
718
+ setInsertLengths(graph, libID, median, sqrt(variance));
719
+ estimated[libID] = true;
720
+ }
721
+
722
+ static void estimateLibraryInsertLengths(IDnum ** coOccurences, IDnum * coOccurencesCounts) {
723
+ Category libID;
724
+
725
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
726
+ estimated[libID] = false;
727
+
728
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
729
+ if (coOccurencesCounts[libID] > 0)
730
+ estimateLibraryInsertLength(coOccurences[libID], coOccurencesCounts[libID], libID);
731
+ }
732
+
733
+ static void estimateMissingInsertLengths(ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
734
+ IDnum * coOccurences[CATEGORIES + 1];
735
+ IDnum coOccurencesCounts[CATEGORIES + 1];
736
+ Category libID;
737
+
738
+ velvetLog("Estimating library insert lengths...\n");
739
+
740
+ unsigned char * interestingReads = countCoOccurences(coOccurencesCounts, readNodes, readNodeCounts, readPairs, cats);
741
+
742
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
743
+ coOccurences[libID] = callocOrExit(coOccurencesCounts[libID], IDnum);
744
+
745
+ measureCoOccurences(coOccurences, interestingReads, readNodes, readNodeCounts, readPairs, cats);
746
+ estimateLibraryInsertLengths(coOccurences, coOccurencesCounts);
747
+
748
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
749
+ free(coOccurences[libID]);
750
+
751
+ free(interestingReads);
752
+
753
+ velvetLog("Done\n");
754
+ }
755
+
756
+ static void createTwinConnection(IDnum nodeID, IDnum node2ID,
757
+ Connection * connect)
758
+ {
759
+ Connection *newConnection = allocateConnection();
760
+ IDnum nodeIndex = nodeID + nodeCount(graph);
761
+
762
+ // Fill in
763
+ newConnection->distance = connect->distance;
764
+ newConnection->variance = connect->variance;
765
+ newConnection->direct_count = connect->direct_count;
766
+ newConnection->paired_count = connect->paired_count;
767
+ newConnection->destination = getNodeInGraph(graph, node2ID);
768
+
769
+ // Batch to twin
770
+ newConnection->twin = connect;
771
+ connect->twin = newConnection;
772
+
773
+ // Insert in scaffold
774
+ newConnection->left = NULL;
775
+ newConnection->right = scaffold[nodeIndex];
776
+ if (scaffold[nodeIndex] != NULL)
777
+ scaffold[nodeIndex]->left = newConnection;
778
+ scaffold[nodeIndex] = newConnection;
779
+ }
780
+
781
+ Connection *createNewConnection(IDnum nodeID, IDnum node2ID,
782
+ IDnum direct_count,
783
+ IDnum paired_count,
784
+ Coordinate distance,
785
+ double variance)
786
+ {
787
+ Node *destination = getNodeInGraph(graph, node2ID);
788
+ IDnum nodeIndex = nodeID + nodeCount(graph);
789
+ Connection *connect = allocateConnection();
790
+
791
+ // Fill in
792
+ connect->destination = destination;
793
+ connect->direct_count = direct_count;
794
+ connect->paired_count = paired_count;
795
+ connect->distance = (double) distance;
796
+ connect->variance = variance;
797
+
798
+ // Insert in scaffold
799
+ connect->left = NULL;
800
+ connect->right = scaffold[nodeIndex];
801
+ if (scaffold[nodeIndex] != NULL)
802
+ scaffold[nodeIndex]->left = connect;
803
+ scaffold[nodeIndex] = connect;
804
+
805
+ // Event. pair up to twin
806
+ if (getUniqueness(destination))
807
+ createTwinConnection(node2ID, nodeID, connect);
808
+ else
809
+ connect->twin = NULL;
810
+
811
+ return connect;
812
+ }
813
+
814
+ void readjustConnection(Connection * connect, Coordinate distance,
815
+ double variance, IDnum direct_count,
816
+ IDnum paired_count)
817
+ {
818
+ connect->direct_count += direct_count;
819
+ connect->paired_count += paired_count;
820
+
821
+ connect->distance =
822
+ (variance * connect->distance +
823
+ distance * connect->variance) / (variance +
824
+ connect->variance);
825
+ connect->variance =
826
+ (variance *
827
+ connect->variance) / (variance + connect->variance);
828
+
829
+ if (connect->twin != NULL) {
830
+ connect->twin->distance = connect->distance;
831
+ connect->twin->variance = connect->variance;
832
+ connect->twin->direct_count = connect->direct_count;
833
+ connect->twin->paired_count = connect->paired_count;
834
+ }
835
+ }
836
+
837
+ //////////////////////////////////////
838
+ // Splay tree function for Connections
839
+ //////////////////////////////////////
840
+
841
+ /* This function can be called only if K2 has a left child */
842
+ /* Perform a rotate between a node (K2) and its left child */
843
+ /* Update heights, then return new root */
844
+
845
+ static Connection *connectionSingleRotateWithLeft(Connection * K2)
846
+ {
847
+ Connection *K1;
848
+
849
+ K1 = K2->left;
850
+ K2->left = K1->right;
851
+ K1->right = K2;
852
+
853
+ return K1; /* New root */
854
+ }
855
+
856
+ /* This function can be called only if K1 has a right child */
857
+ /* Perform a rotate between a node (K1) and its right child */
858
+ /* Update heights, then return new root */
859
+
860
+ static Connection *connectionSingleRotateWithRight(Connection * K1)
861
+ {
862
+ Connection *K2;
863
+
864
+ K2 = K1->right;
865
+ K1->right = K2->left;
866
+ K2->left = K1;
867
+
868
+ return K2; /* New root */
869
+ }
870
+
871
+ /* Top-down splay procedure, */
872
+ /* not requiring destination to be in tree */
873
+
874
+ static Connection *splayConnection(Connection * T, IDnum nodeID)
875
+ {
876
+ Connection Header;
877
+ Connection *LeftTreeMax, *RightTreeMin;
878
+
879
+ if (T == NULL)
880
+ return NULL;
881
+
882
+ Header.left = Header.right = NULL;
883
+ LeftTreeMax = RightTreeMin = &Header;
884
+
885
+ while (nodeID != getNodeID(T->destination))
886
+ {
887
+ if (nodeID < getNodeID(T->destination))
888
+ {
889
+ if (T->left == NULL)
890
+ break;
891
+ if (nodeID < getNodeID(T->left->destination))
892
+ T = connectionSingleRotateWithLeft(T);
893
+ if (T->left == NULL)
894
+ break;
895
+ /* Link right */
896
+ RightTreeMin->left = T;
897
+ RightTreeMin = T;
898
+ T = T->left;
899
+ }
900
+ else
901
+ {
902
+ if (T->right == NULL)
903
+ break;
904
+ if (nodeID > getNodeID(T->right->destination))
905
+ T = connectionSingleRotateWithRight(T);
906
+ if (T->right == NULL)
907
+ break;
908
+ /* Link left */
909
+ LeftTreeMax->right = T;
910
+ LeftTreeMax = T;
911
+ T = T->right;
912
+ }
913
+ } /* while nodeID != T->destination */
914
+
915
+ /* Reassemble */
916
+ LeftTreeMax->right = T->left;
917
+ RightTreeMin->left = T->right;
918
+ T->left = Header.right;
919
+ T->right = Header.left;
920
+
921
+ return T;
922
+ }
923
+
924
+ static Connection* findOrCreateConnection(IDnum nodeID,
925
+ IDnum node2ID)
926
+ {
927
+ Connection **T;
928
+ Connection *newConnection;
929
+ IDnum nodeIndex;
930
+
931
+ nodeIndex = nodeID + nodeCount(graph);
932
+ T = scaffold + nodeIndex;
933
+
934
+ if (*T == NULL)
935
+ {
936
+ newConnection = allocateConnection();
937
+
938
+ newConnection->left = NULL;
939
+ newConnection->right = NULL;
940
+ *T = newConnection;
941
+ }
942
+ else
943
+ {
944
+ IDnum destID;
945
+
946
+ *T = splayConnection(*T, node2ID);
947
+ destID = getNodeID((*T)->destination);
948
+ if (destID == node2ID)
949
+ newConnection = *T;
950
+ else
951
+ {
952
+ newConnection = allocateConnection();
953
+ if (node2ID < destID)
954
+ {
955
+ newConnection->left = (*T)->left;
956
+ newConnection->right = *T;
957
+ (*T)->left = NULL;
958
+ }
959
+ else if (node2ID > destID)
960
+ {
961
+ newConnection->right = (*T)->right;
962
+ newConnection->left = *T;
963
+ (*T)->right = NULL;
964
+ }
965
+ *T = newConnection;
966
+ }
967
+ }
968
+
969
+ return newConnection;
970
+ }
971
+
972
+ static Connection* findConnection(IDnum nodeID,
973
+ IDnum node2ID)
974
+ {
975
+ Connection **T;
976
+ IDnum nodeIndex;
977
+
978
+ nodeIndex = nodeID + nodeCount(graph);
979
+ T = scaffold + nodeIndex;
980
+
981
+ if (*T == NULL)
982
+ return NULL;
983
+ else
984
+ {
985
+ IDnum destID;
986
+
987
+ *T = splayConnection(*T, node2ID);
988
+ destID = getNodeID((*T)->destination);
989
+ if (destID == node2ID)
990
+ return *T;
991
+ }
992
+ return NULL;
993
+ }
994
+
995
+ RecycleBin *connectionStackMemory = NULL;
996
+
997
+ typedef struct ConnectionStack_st ConnectionStack;
998
+
999
+ struct ConnectionStack_st /* Singly linked stack cell holding one Connection pointer. */
1000
+ {
1001
+ Connection *connection; /* Payload stored by pushConnectionStack(). */
1002
+ ConnectionStack *next; /* Cell that was on top before this one was pushed, or NULL. */
1003
+ };
1004
+
1005
+ #ifdef _OPENMP
1006
+ static void initConnectionStackMemory(void)
1007
+ {
1008
+ int n = omp_get_max_threads();
1009
+
1010
+ #pragma omp critical
1011
+ {
1012
+ if (connectionStackMemory == NULL)
1013
+ connectionStackMemory = newRecycleBinArray(n, sizeof(ConnectionStack), BLOCK_SIZE);
1014
+ }
1015
+ }
1016
+ #endif
1017
+
1018
+ static ConnectionStack *allocateConnectionStack(void)
1019
+ {
1020
+ #ifdef _OPENMP
1021
+ #ifdef DEBUG
1022
+ if (connectionStackMemory == NULL)
1023
+ {
1024
+ velvetLog("The memory for connection stack seems uninitialised, "
1025
+ "this is probably a bug, aborting.\n");
1026
+ abort();
1027
+ }
1028
+ #endif
1029
+ return allocatePointer(getRecycleBinInArray(connectionStackMemory,
1030
+ omp_get_thread_num()));
1031
+ #else
1032
+ if (connectionStackMemory == NULL)
1033
+ connectionStackMemory =
1034
+ newRecycleBin(sizeof(ConnectionStack), BLOCK_SIZE);
1035
+
1036
+ return allocatePointer(connectionStackMemory);
1037
+ #endif
1038
+ }
1039
+
1040
+ static void deallocateConnectionStack(ConnectionStack *stack)
1041
+ {
1042
+ #ifdef _OPENMP
1043
+ deallocatePointer(getRecycleBinInArray(connectionStackMemory,
1044
+ omp_get_thread_num()),
1045
+ stack);
1046
+ #else
1047
+ deallocatePointer(connectionStackMemory, stack);
1048
+ #endif
1049
+ }
1050
+
1051
+ static void destroyConnectionStackMemory(void)
1052
+ {
1053
+ #ifdef _OPENMP
1054
+ destroyRecycleBinArray(connectionStackMemory);
1055
+ #else
1056
+ destroyRecycleBin(connectionStackMemory);
1057
+ #endif
1058
+ connectionStackMemory = NULL;
1059
+ }
1060
+
1061
+ static void pushConnectionStack(ConnectionStack **stack, Connection *connection)
1062
+ {
1063
+ ConnectionStack *newElement;
1064
+
1065
+ newElement = allocateConnectionStack();
1066
+ newElement->connection = connection;
1067
+ newElement->next = *stack;
1068
+ *stack = newElement;
1069
+ }
1070
+
1071
+ static Connection *popConnectionStack(ConnectionStack **stack)
1072
+ {
1073
+ ConnectionStack *nextElement;
1074
+ Connection *connection;
1075
+
1076
+ if (*stack == NULL)
1077
+ return NULL;
1078
+
1079
+ nextElement = (*stack)->next;
1080
+ connection = (*stack)->connection;
1081
+ deallocateConnectionStack(*stack);
1082
+ *stack = nextElement;
1083
+
1084
+ return connection;
1085
+ }
1086
+
1087
+ static void splayToList(Connection **connection)
1088
+ {
1089
+ ConnectionStack *stack = NULL;
1090
+ Connection *current;
1091
+ Connection *list = NULL;
1092
+
1093
+ if (*connection == NULL)
1094
+ return;
1095
+
1096
+ for (current = *connection; current != NULL; current = popConnectionStack(&stack))
1097
+ {
1098
+ Connection *right;
1099
+ Connection *left;
1100
+
1101
+ right = current->right;
1102
+ if (right != NULL)
1103
+ pushConnectionStack(&stack, right);
1104
+ left = current->left;
1105
+ if (left != NULL)
1106
+ pushConnectionStack(&stack, left);
1107
+ if (list != NULL)
1108
+ list->left = current;
1109
+ current->right = list;
1110
+ list = current;
1111
+ }
1112
+ list->left = NULL;
1113
+ *connection = list;
1114
+ }
1115
+
1116
+ static void setAllConnectionsClean(void)
1117
+ {
1118
+ IDnum nodeID;
1119
+ IDnum nodes = nodeCount(graph);
1120
+
1121
+ #ifdef _OPENMP
1122
+ #pragma omp parallel for
1123
+ #endif
1124
+ for (nodeID = 2 * nodes; nodeID >= 0; nodeID--)
1125
+ {
1126
+ ConnectionStack *stack = NULL;
1127
+ Connection **connect;
1128
+ Connection *current;
1129
+
1130
+ connect = scaffold + nodeID;
1131
+ if (*connect == NULL)
1132
+ continue;
1133
+
1134
+ for (current = *connect; current != NULL; current = popConnectionStack(&stack))
1135
+ {
1136
+ Connection *right;
1137
+ Connection *left;
1138
+
1139
+ current->clean = true;
1140
+ right = current->right;
1141
+ if (right != NULL)
1142
+ pushConnectionStack(&stack, right);
1143
+ left = current->left;
1144
+ if (left != NULL)
1145
+ pushConnectionStack(&stack, left);
1146
+ }
1147
+ }
1148
+ }
1149
+
1150
+ static void fillNewConnectionInTree(Connection *connect,
1151
+ Node *destination,
1152
+ IDnum direct_count,
1153
+ IDnum paired_count,
1154
+ Coordinate distance,
1155
+ double variance)
1156
+ {
1157
+ connect->destination = destination;
1158
+ connect->direct_count = direct_count;
1159
+ connect->paired_count = paired_count;
1160
+ connect->distance = (double)distance;
1161
+ connect->variance = variance;
1162
+ }
1163
+
1164
+ static void readjustConnectionInTree(Connection *connect,
1165
+ IDnum direct_count,
1166
+ IDnum paired_count,
1167
+ Coordinate distance,
1168
+ double variance)
1169
+ {
1170
+ connect->direct_count += direct_count;
1171
+ connect->paired_count += paired_count;
1172
+ connect->distance = (variance * connect->distance + distance * connect->variance) /
1173
+ (variance + connect->variance);
1174
+ connect->variance = (variance * connect->variance) / (variance + connect->variance);
1175
+
1176
+ if (connect->twin != NULL)
1177
+ {
1178
+ connect->twin->direct_count = connect->direct_count;
1179
+ connect->twin->paired_count = connect->paired_count;
1180
+ connect->twin->distance = connect->distance;
1181
+ connect->twin->variance = connect->variance;
1182
+ }
1183
+ }
1184
+
1185
+ static void createTwinConnectionInTree(IDnum nodeID,
1186
+ IDnum node2ID,
1187
+ Connection *connect)
1188
+ {
1189
+ Connection *newConnection;
1190
+
1191
+ newConnection = findOrCreateConnection(nodeID, node2ID);
1192
+ if (newConnection->destination == NULL)
1193
+ {
1194
+ fillNewConnectionInTree(newConnection,
1195
+ getNodeInGraph(graph, node2ID),
1196
+ connect->direct_count,
1197
+ connect->paired_count,
1198
+ (Coordinate)connect->distance,
1199
+ connect->variance);
1200
+ // Batch to twin
1201
+ newConnection->twin = connect;
1202
+ connect->twin = newConnection;
1203
+ }
1204
+ else
1205
+ readjustConnectionInTree(newConnection,
1206
+ connect->direct_count,
1207
+ connect->paired_count,
1208
+ (Coordinate)connect->distance,
1209
+ connect->variance);
1210
+ }
1211
+
1212
+ static void createConnection(IDnum nodeID,
1213
+ IDnum node2ID,
1214
+ IDnum direct_count,
1215
+ IDnum paired_count,
1216
+ Coordinate distance,
1217
+ double variance)
1218
+ {
1219
+ Connection *connect;
1220
+
1221
+ if (getUniqueness(getNodeInGraph(graph, node2ID)) && node2ID < nodeID) {
1222
+ return;
1223
+ }
1224
+
1225
+ #ifdef _OPENMP
1226
+ lockTwoNodes(nodeID, node2ID);
1227
+ #endif
1228
+ connect = findOrCreateConnection(nodeID, node2ID);
1229
+ if (connect->destination == NULL)
1230
+ {
1231
+ Node *destination = getNodeInGraph(graph, node2ID);
1232
+ fillNewConnectionInTree(connect,
1233
+ destination,
1234
+ direct_count,
1235
+ paired_count,
1236
+ distance,
1237
+ variance);
1238
+
1239
+ if (getUniqueness(destination))
1240
+ createTwinConnectionInTree(node2ID, nodeID, connect);
1241
+ else
1242
+ connect->twin = NULL;
1243
+ }
1244
+ else
1245
+ readjustConnectionInTree(connect,
1246
+ direct_count,
1247
+ paired_count,
1248
+ distance,
1249
+ variance);
1250
+
1251
+ #ifdef _OPENMP
1252
+ unLockTwoNodes(nodeID, node2ID);
1253
+ #endif
1254
+ }
1255
+
1256
+ static void projectFromSingleRead(Node * node,
1257
+ ReadOccurence * readOccurence,
1258
+ Coordinate position,
1259
+ Coordinate offset, Coordinate length)
1260
+ {
1261
+ Coordinate distance = 0;
1262
+ Node *target = getNodeInGraph(graph, -readOccurence->nodeID);
1263
+ double variance = 1;
1264
+
1265
+ if (target == getTwinNode(node) || target == node)
1266
+ return;
1267
+
1268
+ if (position < 0) {
1269
+ variance += getNodeLength(node) * getNodeLength(node) / 16;
1270
+ // distance += 0;
1271
+ } else {
1272
+ // variance += 0;
1273
+ distance += position - getNodeLength(node) / 2;
1274
+ }
1275
+
1276
+ if (readOccurence->position < 0) {
1277
+ variance +=
1278
+ getNodeLength(target) * getNodeLength(target) / 16;
1279
+ //distance += 0;
1280
+ } else {
1281
+ // variance += 0;
1282
+ distance +=
1283
+ -readOccurence->position + getNodeLength(target) / 2;
1284
+ }
1285
+
1286
+ if (readOccurence->offset < 0 || offset < 0) {
1287
+ variance += length * length / 16;
1288
+ //distance += 0;
1289
+ } else {
1290
+ // variance += 0;
1291
+ distance += readOccurence->offset - offset;
1292
+ }
1293
+
1294
+ // Relative ordering
1295
+ if (offset > 0 && readOccurence->offset > 0) {
1296
+ if (offset < readOccurence->offset) {
1297
+ if (distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -10)
1298
+ ;
1299
+ else if (distance < getNodeLength(node)/2 + getNodeLength(target)/2)
1300
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1301
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1302
+ else
1303
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1304
+ distance, variance);
1305
+ } else if (offset > readOccurence->offset) {
1306
+ if (-distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -10)
1307
+ ;
1308
+ else if (-distance < getNodeLength(node)/2 + getNodeLength(target)/2)
1309
+ createConnection(-getNodeID(node), -getNodeID(target), 1,
1310
+ 0, getNodeLength(node)/2 + getNodeLength(target)/2 , variance);
1311
+ else
1312
+ createConnection(-getNodeID(node), -getNodeID(target), 1,
1313
+ 0, -distance, variance);
1314
+ }
1315
+ } else if (offset > 0 && position > 0) {
1316
+ if (distance - offset > -getNodeLength(node)/2 && distance - offset + length > getNodeLength(node)/2)
1317
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1318
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1319
+ else if (distance - offset < -getNodeLength(node)/2 && distance - offset + length < getNodeLength(node)/2)
1320
+ createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
1321
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1322
+ else {
1323
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1324
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1325
+ createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
1326
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1327
+ }
1328
+ } else if (readOccurence->offset > 0 && readOccurence->position > 0) {
1329
+ if (-distance - readOccurence->offset > -getNodeLength(target)/2 && -distance - readOccurence->offset + length > getNodeLength(target)/2)
1330
+ createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
1331
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1332
+ if (-distance - readOccurence->offset < -getNodeLength(target)/2 && -distance - readOccurence->offset + length < getNodeLength(target)/2)
1333
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1334
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1335
+ else {
1336
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1337
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1338
+ createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
1339
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1340
+ }
1341
+ } else {
1342
+ createConnection(getNodeID(node), getNodeID(target), 1, 0,
1343
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1344
+ createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
1345
+ getNodeLength(node)/2 + getNodeLength(target)/2, variance);
1346
+ }
1347
+ }
1348
+
1349
+ static void projectFromReadPair(Node * node, ReadOccurence * readOccurence,
1350
+ Coordinate position, Coordinate offset,
1351
+ Coordinate insertLength,
1352
+ double insertVariance,
1353
+ boolean doMatePairs)
1354
+ {
1355
+ Coordinate distance = insertLength;
1356
+ Coordinate variance = insertVariance;
1357
+ Node *target = getNodeInGraph(graph, readOccurence->nodeID);
1358
+ IDnum nodeID;
1359
+ IDnum node2ID;
1360
+
1361
+ if (target == getTwinNode(node) || target == node)
1362
+ return;
1363
+
1364
+ nodeID = getNodeID(node);
1365
+ node2ID = getNodeID(target);
1366
+
1367
+ if (getUniqueness(target) && node2ID < nodeID)
1368
+ return;
1369
+
1370
+ // Check if a conflicting PE (or MP from a smaller size lib) connection
1371
+ // already exists
1372
+ if (doMatePairs) {
1373
+ Connection *reverseConnect;
1374
+
1375
+ #ifdef _OPENMP
1376
+ lockTwoNodes(nodeID, node2ID);
1377
+ #endif
1378
+ reverseConnect = findConnection(-nodeID, -node2ID);
1379
+ #ifdef _OPENMP
1380
+ unLockTwoNodes(nodeID, node2ID);
1381
+ #endif
1382
+
1383
+ if (reverseConnect != NULL &&
1384
+ reverseConnect->clean &&
1385
+ reverseConnect->paired_count +
1386
+ reverseConnect->direct_count >= UNRELIABLE_CONNECTION_CUTOFF)
1387
+ return;
1388
+ }
1389
+
1390
+ if (position < 0) {
1391
+ variance += getNodeLength(node) * getNodeLength(node) / 16;
1392
+ // distance += 0;
1393
+ } else {
1394
+ // variance += 0;
1395
+ distance += position - offset - getNodeLength(node) / 2;
1396
+ }
1397
+
1398
+ if (readOccurence->position < 0) {
1399
+ variance +=
1400
+ getNodeLength(target) * getNodeLength(target) / 16;
1401
+ //distance += 0;
1402
+ } else {
1403
+ // variance += 0;
1404
+ distance +=
1405
+ readOccurence->position - readOccurence->offset -
1406
+ getNodeLength(target) / 2;
1407
+ }
1408
+
1409
+ if (distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -6 * sqrt(insertVariance))
1410
+ return;
1411
+ else if (distance < getNodeLength(node)/2 + getNodeLength(target)/2)
1412
+ distance = getNodeLength(node)/2 + getNodeLength(target)/2;
1413
+
1414
+ createConnection(nodeID, node2ID, 0, 1,
1415
+ distance, variance);
1416
+ }
1417
+
1418
+ static void projectFromShortRead(Node * node,
1419
+ ShortReadMarker * shortMarker,
1420
+ IDnum * readPairs, Category * cats,
1421
+ ReadOccurence ** readNodes,
1422
+ IDnum * readNodeCounts,
1423
+ ShortLength * lengths,
1424
+ boolean * shadows,
1425
+ boolean doMatePairs,
1426
+ Category thisCat)
1427
+ {
1428
+ IDnum index;
1429
+ IDnum readIndex = getShortReadMarkerID(shortMarker);
1430
+ ReadOccurence *readArray;
1431
+ IDnum readPairIndex;
1432
+ Category cat;
1433
+ Coordinate position = getShortReadMarkerPosition(shortMarker);
1434
+ Coordinate offset = getShortReadMarkerOffset(shortMarker);
1435
+ Coordinate length = lengths[getShortReadMarkerID(shortMarker) - 1];
1436
+ Coordinate insertLength;
1437
+ double insertVariance;
1438
+
1439
+ // Going through single-read information
1440
+ if (!doMatePairs && readNodeCounts[readIndex] > 1) {
1441
+ readArray = readNodes[readIndex];
1442
+ for (index = 0; index < readNodeCounts[readIndex]; index++)
1443
+ projectFromSingleRead(node, &readArray[index],
1444
+ position, offset, length);
1445
+ }
1446
+ // Going through paired read information
1447
+ if (readPairs == NULL)
1448
+ return;
1449
+
1450
+ readPairIndex = readPairs[readIndex - 1] + 1;
1451
+
1452
+ if (readPairIndex == 0)
1453
+ return;
1454
+
1455
+ cat = cats[readIndex - 1];
1456
+ insertLength = getInsertLength(graph, cat);
1457
+ insertVariance = getInsertLength_var(graph, cat);
1458
+ cat /= 2;
1459
+ if (shadows[cat] && cat > PEBBLE_ROUND_NUM)
1460
+ return;
1461
+
1462
+ if (!shadows[cat] && !doMatePairs) {
1463
+ readArray = readNodes[readPairIndex];
1464
+ for (index = 0; index < readNodeCounts[readPairIndex]; index++)
1465
+ projectFromReadPair(node, &readArray[index], position,
1466
+ offset, insertLength, insertVariance, false);
1467
+ }
1468
+ else if (shadows[cat] && doMatePairs && cat == thisCat) {
1469
+ readArray = readNodes[readPairIndex];
1470
+ for (index = 0; index < readNodeCounts[readPairIndex]; index++)
1471
+ projectFromReadPair(node, &readArray[index], position,
1472
+ offset, insertLength, insertVariance, true);
1473
+ }
1474
+
1475
+ }
1476
+
1477
+ static void projectFromLongRead(Node * node, PassageMarkerI marker,
1478
+ IDnum * readPairs, Category * cats,
1479
+ ReadOccurence ** readNodes,
1480
+ IDnum * readNodeCounts,
1481
+ ShortLength * lengths)
1482
+ {
1483
+ IDnum index;
1484
+ IDnum readIndex = getPassageMarkerSequenceID(marker);
1485
+ ReadOccurence *readArray;
1486
+ IDnum readPairIndex;
1487
+ Category cat;
1488
+ Coordinate position = getStartOffset(marker);
1489
+ Coordinate offset = getPassageMarkerStart(marker);
1490
+ Coordinate length =
1491
+ lengths[getPassageMarkerSequenceID(marker) - 1];
1492
+ Coordinate insertLength;
1493
+ double insertVariance;
1494
+
1495
+ // Going through single-read information
1496
+ if (readNodeCounts[readIndex] > 1 && position > 0) {
1497
+ readArray = readNodes[readIndex];
1498
+ for (index = 0; index < readNodeCounts[readIndex]; index++)
1499
+ projectFromSingleRead(node, &readArray[index],
1500
+ position, offset, length);
1501
+ }
1502
+ // Going through paired read information
1503
+ if (readPairs == NULL)
1504
+ return;
1505
+
1506
+ readPairIndex = readPairs[readIndex - 1] + 1;
1507
+
1508
+ if (readPairIndex == 0)
1509
+ return;
1510
+
1511
+ cat = cats[readIndex - 1];
1512
+ insertLength = getInsertLength(graph, cat);
1513
+ insertVariance = getInsertLength_var(graph, cat);
1514
+
1515
+ readArray = readNodes[readPairIndex];
1516
+ for (index = 0; index < readNodeCounts[readPairIndex]; index++)
1517
+ projectFromReadPair(node, &readArray[index], position,
1518
+ offset, insertLength, insertVariance, false);
1519
+
1520
+ }
1521
+
1522
+ static void projectFromNode(IDnum nodeID,
1523
+ ReadOccurence ** readNodes,
1524
+ IDnum * readNodeCounts,
1525
+ IDnum * readPairs, Category * cats,
1526
+ boolean * dubious, ShortLength * lengths,
1527
+ boolean * shadows,
1528
+ boolean doMatePairs,
1529
+ Category thisCat)
1530
+ {
1531
+ IDnum index;
1532
+ ShortReadMarker *nodeArray, *shortMarker;
1533
+ PassageMarkerI marker;
1534
+ Node *node;
1535
+ IDnum nodeReadCount;
1536
+
1537
+ node = getNodeInGraph(graph, nodeID);
1538
+
1539
+ if (node == NULL || !getUniqueness(node))
1540
+ return;
1541
+
1542
+ nodeArray = getNodeReads(node, graph);
1543
+ nodeReadCount = getNodeReadCount(node, graph);
1544
+ for (index = 0; index < nodeReadCount; index++) {
1545
+ shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
1546
+ if (dubious[getShortReadMarkerID(shortMarker) - 1])
1547
+ continue;
1548
+ projectFromShortRead(node, shortMarker, readPairs, cats,
1549
+ readNodes, readNodeCounts, lengths,
1550
+ shadows,
1551
+ doMatePairs,
1552
+ thisCat);
1553
+ }
1554
+
1555
+ if (!doMatePairs)
1556
+ for (marker = getMarker(node); marker != NULL_IDX;
1557
+ marker = getNextInNode(marker)) {
1558
+ if (getPassageMarkerSequenceID(marker) > 0)
1559
+ projectFromLongRead(node, marker, readPairs, cats,
1560
+ readNodes, readNodeCounts,
1561
+ lengths);
1562
+ }
1563
+ }
1564
+
1565
+ static Connection **computeNodeToNodeMappings(ReadOccurence ** readNodes,
1566
+ IDnum * readNodeCounts,
1567
+ IDnum * readPairs,
1568
+ Category * cats,
1569
+ boolean * dubious,
1570
+ boolean * shadows,
1571
+ ShortLength * lengths)
1572
+ {
1573
+ IDnum nodeID;
1574
+ IDnum nodes = nodeCount(graph);
1575
+ struct timeval start, end, diff;
1576
+ Category cat;
1577
+ boolean hasShadow;
1578
+
1579
+ scaffold = callocOrExit(2 * nodes + 1, Connection *);
1580
+
1581
+ velvetLog("Computing direct node to node mappings\n");
1582
+
1583
+ gettimeofday(&start, NULL);
1584
+ #ifdef _OPENMP
1585
+ createNodeLocks(graph);
1586
+
1587
+ int threads = omp_get_max_threads();
1588
+ if (threads > 32)
1589
+ threads = 32;
1590
+
1591
+ #pragma omp parallel for num_threads(threads)
1592
+ #endif
1593
+ for (nodeID = -nodes; nodeID <= nodes; nodeID++)
1594
+ {
1595
+ if (nodeID % 10000 == 0)
1596
+ velvetLog("Scaffolding node %li\n", (long) nodeID);
1597
+
1598
+ projectFromNode(nodeID, readNodes, readNodeCounts,
1599
+ readPairs, cats, dubious, lengths, shadows, false, 0);
1600
+ }
1601
+
1602
+ #ifdef _OPENMP
1603
+ initConnectionStackMemory();
1604
+ #endif
1605
+
1606
+ hasShadow = false;
1607
+ for (cat = 0; cat < CATEGORIES; cat++)
1608
+ if (shadows[cat])
1609
+ {
1610
+ hasShadow = true;
1611
+ break;
1612
+ }
1613
+
1614
+ if (hasShadow)
1615
+ {
1616
+ for (cat = 0; cat < CATEGORIES; cat++)
1617
+ {
1618
+ setAllConnectionsClean();
1619
+ if (!shadows[cat])
1620
+ continue;
1621
+ velvetLog("Scaffolding MP library %i\n", cat);
1622
+ #ifdef _OPENMP
1623
+ #pragma omp parallel for
1624
+ #endif
1625
+ for (nodeID = -nodes; nodeID <= nodes; nodeID++)
1626
+ projectFromNode(nodeID, readNodes, readNodeCounts,
1627
+ readPairs, cats, dubious, lengths,
1628
+ shadows, true, cat);
1629
+ }
1630
+ }
1631
+ #ifdef _OPENMP
1632
+ #pragma omp parallel for
1633
+ #endif
1634
+ for (nodeID = 2 * nodes; nodeID >= 0; nodeID--)
1635
+ splayToList(scaffold + nodeID);
1636
+
1637
+ destroyConnectionStackMemory();
1638
+
1639
+ #ifdef _OPENMP
1640
+ free(nodeLocks);
1641
+ nodeLocks = NULL;
1642
+ #endif
1643
+ gettimeofday(&end, NULL);
1644
+ timersub(&end, &start, &diff);
1645
+ velvetLog(" === Nodes Scaffolded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1646
+
1647
+ PEBBLE_ROUND_NUM++;
1648
+
1649
+ return scaffold;
1650
+ }
1651
+
1652
+ static IDnum **countShortReads(Graph * graph, ReadSet * reads)
1653
+ {
1654
+ IDnum **counts = callocOrExit(CATEGORIES + 1, IDnum *);
1655
+ Category cat;
1656
+ IDnum nodeIndex;
1657
+ IDnum nodes = nodeCount(graph);
1658
+ Node *node;
1659
+ ShortReadMarker *array, *marker;
1660
+ IDnum readCount, readIndex, readID;
1661
+
1662
+ // Allocate memory where needed
1663
+ for (cat = 0; cat <= CATEGORIES; cat++)
1664
+ if (getInsertLength(graph, cat) > 0)
1665
+ counts[cat] =
1666
+ callocOrExit(2 * nodeCount(graph) + 1,
1667
+ IDnum);
1668
+
1669
+ // Start fillin'
1670
+ for (nodeIndex = 0; nodeIndex < 2 * nodes + 1; nodeIndex++) {
1671
+ node = getNodeInGraph(graph, nodeIndex - nodes);
1672
+
1673
+ if (node == NULL || !getUniqueness(node))
1674
+ continue;
1675
+
1676
+ array = getNodeReads(node, graph);
1677
+ readCount = getNodeReadCount(node, graph);
1678
+ for (readIndex = 0; readIndex < readCount; readIndex++) {
1679
+ marker =
1680
+ getShortReadMarkerAtIndex(array, readIndex);
1681
+ readID = getShortReadMarkerID(marker);
1682
+ cat = reads->categories[readID - 1];
1683
+ if (cat % 2 == 1 && counts[cat / 2] != NULL)
1684
+ counts[cat / 2][nodeIndex]++;
1685
+ }
1686
+ }
1687
+
1688
+ return counts;
1689
+ }
1690
+
1691
+ static void removeUnreliableConnections(ReadSet * reads, boolean *shadows)
1692
+ {
1693
+ IDnum maxNodeIndex = nodeCount(graph) * 2 + 1;
1694
+ IDnum index;
1695
+ Connection *connect, *next;
1696
+ Category cat;
1697
+ IDnum **counts = countShortReads(graph, reads);
1698
+ IDnum nodes = nodeCount(graph);
1699
+
1700
+ for (index = 0; index < maxNodeIndex; index++) {
1701
+ for (connect = scaffold[index]; connect != NULL;
1702
+ connect = next) {
1703
+ next = connect->right;
1704
+ if (!testConnection(index - nodes, connect, counts, shadows))
1705
+ destroyConnection(connect, index - nodes);
1706
+ }
1707
+ }
1708
+
1709
+ // Free memory
1710
+ for (cat = 0; cat <= CATEGORIES; cat++)
1711
+ if (counts[cat])
1712
+ free(counts[cat]);
1713
+ free(counts);
1714
+ }
1715
+
1716
+ void printConnections(ReadSet * reads, boolean * shadows)
1717
+ {
1718
+ IDnum maxNodeIndex = nodeCount(graph) * 2 + 1;
1719
+ IDnum index;
1720
+ Connection *connect, *next;
1721
+ Node *node;
1722
+ IDnum **counts = countShortReads(graph, reads);
1723
+ IDnum nodes = nodeCount(graph);
1724
+ Category cat;
1725
+
1726
+ puts("CONNECT IDA IDB dcount pcount dist lengthA lengthB var countA countB coordA coordB real exp distance test");
1727
+
1728
+ for (index = 0; index < maxNodeIndex; index++) {
1729
+ node = getNodeInGraph(graph, index - nodeCount(graph));
1730
+ for (connect = scaffold[index]; connect != NULL;
1731
+ connect = next) {
1732
+ next = getNextConnection(connect);
1733
+ printf
1734
+ ("CONNECT %ld %ld %ld %ld %lld %lld %lld %f %ld %ld",
1735
+ (long) index - nodeCount(graph),
1736
+ (long) getNodeID(connect->destination),
1737
+ (long) connect->direct_count,
1738
+ (long) connect->paired_count,
1739
+ (long long) getConnectionDistance(connect),
1740
+ (long long) getNodeLength(node), (long long)
1741
+ getNodeLength(connect->destination),
1742
+ connect->variance,
1743
+ (long) getNodeReadCount(node, graph),
1744
+ (long) getNodeReadCount(connect->destination,
1745
+ graph));
1746
+ if (markerCount(node) == 1
1747
+ && markerCount(connect->destination) == 1)
1748
+ printf(" %lld %lld %lld", (long long)
1749
+ getPassageMarkerFinish(getMarker
1750
+ (node)),
1751
+ (long long)
1752
+ getPassageMarkerFinish(getMarker
1753
+ (connect->
1754
+ destination)),
1755
+ (long
1756
+ long) (getPassageMarkerFinish
1757
+ (getMarker(node)) -
1758
+ getPassageMarkerFinish
1759
+ (getMarker
1760
+ (connect->destination))));
1761
+ else
1762
+ printf(" ? ? ?");
1763
+ printf(" %ld",
1764
+ (long) expectedNumberOfConnections(index -
1765
+ nodeCount
1766
+ (graph),
1767
+ connect,
1768
+ counts,
1769
+ 0));
1770
+ printf(" %lld",
1771
+ (long long) (getConnectionDistance(connect)
1772
+ - (getNodeLength(node) +
1773
+ getNodeLength
1774
+ (connect->destination)) /
1775
+ 2));
1776
+ if (testConnection(index - nodes, connect, counts, shadows))
1777
+ puts(" OK");
1778
+ else
1779
+ puts(" NG");
1780
+ }
1781
+ }
1782
+
1783
+ for (cat = 0; cat <= CATEGORIES; cat++)
1784
+ if (counts[cat])
1785
+ free(counts[cat]);
1786
+ free(counts);
1787
+ }
1788
+
1789
+ void buildScaffold(Graph * argGraph,
1790
+ ReadSet * reads,
1791
+ boolean * dubious,
1792
+ boolean * shadows)
1793
+ {
1794
+ IDnum *readPairs;
1795
+ Category *cats;
1796
+ IDnum *readNodeCounts;
1797
+ ReadOccurence **readNodes;
1798
+ ReadOccurence *readNodesArray = NULL;
1799
+ ShortLength *lengths = getSequenceLengths(reads, getWordLength(argGraph));
1800
+ Coordinate totalCount = 0;
1801
+
1802
+ graph = argGraph;
1803
+ readPairs = reads->mateReads;
1804
+ cats = reads->categories;
1805
+
1806
+ // Prepare primary scaffold
1807
+ readNodeCounts = computeReadToNodeCounts(&totalCount);
1808
+ readNodes = computeReadToNodeMappings(readNodeCounts, reads, totalCount, &readNodesArray);
1809
+
1810
+ estimateMissingInsertLengths(readNodes, readNodeCounts, readPairs, cats);
1811
+
1812
+ scaffold = computeNodeToNodeMappings(readNodes, readNodeCounts,
1813
+ readPairs, cats, dubious, shadows, lengths);
1814
+ removeUnreliableConnections(reads, shadows);
1815
+
1816
+ free(readNodesArray);
1817
+ free(readNodes);
1818
+ free(readNodeCounts);
1819
+ free(lengths);
1820
+ }
1821
+
1822
+ //DEBUG
1823
+ void printScaffold(Graph * argGraph,
1824
+ ReadSet * reads,
1825
+ boolean * dubious,
1826
+ boolean * shadows)
1827
+ {
1828
+ IDnum *readPairs;
1829
+ Category *cats;
1830
+ IDnum *readNodeCounts;
1831
+ ReadOccurence **readNodes;
1832
+ ReadOccurence *readNodesArray = NULL;
1833
+ ShortLength *lengths = getSequenceLengths(reads, getWordLength(argGraph));
1834
+ Coordinate totalCount = 0;
1835
+
1836
+ graph = argGraph;
1837
+ readPairs = reads->mateReads;
1838
+ cats = reads->categories;
1839
+
1840
+ // Prepare primary scaffold
1841
+ readNodeCounts = computeReadToNodeCounts(&totalCount);
1842
+ readNodes = computeReadToNodeMappings(readNodeCounts, reads, totalCount, &readNodesArray);
1843
+
1844
+ estimateMissingInsertLengths(readNodes, readNodeCounts, readPairs, cats);
1845
+
1846
+ scaffold = computeNodeToNodeMappings(readNodes, readNodeCounts,
1847
+ readPairs, cats, dubious, shadows, lengths);
1848
+ printConnections(reads, shadows);
1849
+
1850
+ free(readNodesArray);
1851
+ free(readNodes);
1852
+ free(readNodeCounts);
1853
+ free(lengths);
1854
+ cleanScaffoldMemory();
1855
+ }
1856
+
1857
+ void setUnreliableConnectionCutoff(int val)
1858
+ {
1859
+ UNRELIABLE_CONNECTION_CUTOFF = (IDnum) val;
1860
+ }
1861
+
1862
+ void cleanScaffoldMemory() {
1863
+ Category libID;
1864
+
1865
+ for (libID = 0; libID < CATEGORIES + 1; libID++)
1866
+ if (estimated[libID])
1867
+ setInsertLengths(graph, libID, -1, -1);
1868
+
1869
+ destroyRecycleBin(connectionMemory);
1870
+ free(scaffold);
1871
+ connectionMemory = NULL;
1872
+ }
1873
+
1874
+ void setPairedExpFraction(double x) {
1875
+ paired_exp_fraction = x;
1876
+ }