finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554):
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,30 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _GRAPHRECONSTRUCTION_H_
22
+ #define _GRAPHRECONSTRUCTION_H_
23
+
24
+ Graph *importPreGraph(char *preGraphFilename, ReadSet * reads, char * roadmapFilename,
25
+ boolean readTracking, short int accelerationBits);
26
+
27
+ Graph *importConnectedGraph(char *connectedGraphFilename, ReadSet * reads, char * roadmapFilename,
28
+ boolean readTracking, short int accelerationBits);
29
+
30
+ #endif
@@ -0,0 +1,2167 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <math.h>
24
+ #include <string.h>
25
+ #include <ctype.h>
26
+
27
+ #include "globals.h"
28
+ #include "graph.h"
29
+ #include "graphStats.h"
30
+ #include "readSet.h"
31
+ #include "tightString.h"
32
+ #include "passageMarker.h"
33
+ #include "concatenatedGraph.h"
34
+ #include "readCoherentGraph.h"
35
+ #include "fibHeap.h"
36
+ #include "utility.h"
37
+ #include "recycleBin.h"
38
+ #include "passageMarker.h"
39
+ #include "binarySequences.h"
40
+
41
+ static PassageMarkerList *copyMarkers(Node * node)
42
+ {
43
+ PassageMarkerList *list = NULL;
44
+ PassageMarkerList *new;
45
+ PassageMarkerI currentMarker;
46
+
47
+ for (currentMarker = getMarker(node); currentMarker != NULL_IDX;
48
+ currentMarker = getNextInNode(currentMarker)) {
49
+ new = newPassageMarkerList(currentMarker, list);
50
+ list = new;
51
+ }
52
+
53
+ return list;
54
+ }
55
+
56
+ static boolean removeDead(PassageMarkerList ** list)
57
+ {
58
+ PassageMarkerList *current, *next;
59
+ boolean removed = false;
60
+
61
+ if (*list == NULL)
62
+ return false;
63
+
64
+ current = *list;
65
+
66
+ while (current->next != NULL) {
67
+ next = current->next;
68
+
69
+ if (isTerminal(next->marker)) {
70
+ removed = true;
71
+ current->next = next->next;
72
+ deallocatePassageMarkerList(next);
73
+ } else
74
+ current = current->next;
75
+ }
76
+
77
+ current = *list;
78
+ if (isTerminal(current->marker)) {
79
+ removed = true;
80
+ *list = current->next;
81
+ deallocatePassageMarkerList(current);
82
+ }
83
+
84
+ return removed;
85
+ }
86
+
87
+ static Node *chooseDestination(PassageMarkerList * list)
88
+ {
89
+ PassageMarkerList *current = list;
90
+ Node *destination;
91
+
92
+ destination = getNode(getNextInSequence(current->marker));
93
+ while (current != NULL) {
94
+ if (getNode(getNextInSequence(current->marker)) !=
95
+ destination)
96
+ return NULL;
97
+ current = current->next;
98
+ }
99
+
100
+ return destination;
101
+ }
102
+
103
+ static void destroyPassageMarkerList(PassageMarkerList ** list)
104
+ {
105
+ PassageMarkerList *ptr;
106
+
107
+ while (*list != NULL) {
108
+ ptr = *list;
109
+ *list = ptr->next;
110
+ deallocatePassageMarkerList(ptr);
111
+ }
112
+ }
113
+
114
+ static void updateMarkers(PassageMarkerList * list)
115
+ {
116
+ PassageMarkerList *current;
117
+
118
+ for (current = list; current != NULL; current = current->next)
119
+ current->marker = getNextInSequence(current->marker);
120
+ }
121
+
122
+ Coordinate computeSubsequentNodesLength(Node * node)
123
+ {
124
+ PassageMarkerList *list;
125
+ Node *nextNode;
126
+ Coordinate totalLength = 0;
127
+ boolean uncertain = false;
128
+
129
+ list = copyMarkers(node);
130
+
131
+ while (true) {
132
+ if (removeDead(&list))
133
+ uncertain = true;
134
+
135
+ if (uncertain && simpleArcCount(node) > 1) {
136
+ destroyPassageMarkerList(&list);
137
+ return totalLength;
138
+ }
139
+
140
+ if (list == NULL)
141
+ return totalLength;
142
+
143
+ nextNode = chooseDestination(list);
144
+ if (nextNode == NULL) {
145
+ destroyPassageMarkerList(&list);
146
+ return totalLength;
147
+ }
148
+
149
+ totalLength += getNodeLength(nextNode);
150
+
151
+ updateMarkers(list);
152
+ }
153
+
154
+ // Impossible instruction
155
+ return -1;
156
+ }
157
+
158
+ Coordinate computeVirtualNodeLength(Node * node)
159
+ {
160
+ Coordinate virtualLength;
161
+
162
+ if (node == NULL)
163
+ return 0;
164
+
165
+ virtualLength = getNodeLength(node);
166
+
167
+ virtualLength += computeSubsequentNodesLength(node);
168
+ virtualLength += computeSubsequentNodesLength(getTwinNode(node));
169
+
170
+ return virtualLength;
171
+ }
172
+
173
+ // Counts the number of markers for one node
174
+ int nodeGenomicMultiplicity(Node * node, IDnum firstStrain)
175
+ {
176
+ int counter = 0;
177
+ PassageMarkerI marker;
178
+
179
+ if (node == NULL)
180
+ return 0;
181
+
182
+ for (marker = getMarker(node); marker != NULL_IDX;
183
+ marker = getNextInNode(marker))
184
+ if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
185
+ counter++;
186
+
187
+ return counter;
188
+ }
189
+
190
+ boolean isOnlyGenome(Node * node, IDnum firstStrain)
191
+ {
192
+ PassageMarkerI marker;
193
+
194
+ for (marker = getMarker(node); marker != NULL_IDX;
195
+ marker = getNextInNode(marker))
196
+ if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
197
+ return false;
198
+
199
+ return true;
200
+ }
201
+
202
+ boolean isOnlyStrain(Node * node, IDnum firstStrain)
203
+ {
204
+ PassageMarkerI marker;
205
+
206
+ for (marker = getMarker(node); marker != NULL_IDX;
207
+ marker = getNextInNode(marker))
208
+ if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
209
+ return false;
210
+
211
+ return true;
212
+ }
213
+
214
+ boolean isSNP(Node * node, IDnum firstStrain, int WORDLENGTH)
215
+ {
216
+ IDnum sequence;
217
+ Coordinate position;
218
+
219
+ if (getNodeLength(node) != WORDLENGTH)
220
+ return false;
221
+
222
+ if (getMarker(node) == NULL_IDX)
223
+ return false;
224
+
225
+ if (getAbsolutePassMarkerSeqID(getMarker(node)) >= firstStrain)
226
+ return false;
227
+
228
+ if (getNextInNode(getMarker(node)) != NULL_IDX)
229
+ return false;
230
+
231
+ if (arcCount(node) != 1)
232
+ return false;
233
+
234
+ if (arcCount(getTwinNode(node)) != 1)
235
+ return false;
236
+
237
+ if (isOnlyGenome(getDestination(getArc(node)), firstStrain))
238
+ return false;
239
+
240
+ if (isOnlyGenome
241
+ (getDestination(getArc(getTwinNode(node))), firstStrain))
242
+ return false;
243
+
244
+ sequence = getPassageMarkerSequenceID(getMarker(node));
245
+
246
+ if (sequence >= 0)
247
+ position = getPassageMarkerStart(getMarker(node));
248
+ else {
249
+ sequence = -sequence;
250
+ position = getPassageMarkerFinish(getMarker(node));
251
+ }
252
+
253
+ velvetLog("SNP\t%lld\t%ld\n", (long long) position, (long) sequence);
254
+
255
+ return true;
256
+ }
257
+
258
+ void removeStrainMarkers(Node * node, IDnum firstStrain)
259
+ {
260
+ PassageMarkerI marker;
261
+ PassageMarkerI tmp = NULL_IDX;
262
+
263
+ marker = getMarker(node);
264
+ while (marker != NULL_IDX) {
265
+ tmp = getNextInNode(marker);
266
+
267
+ if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
268
+ destroyPassageMarker(marker);
269
+ marker = tmp;
270
+ }
271
+
272
+ }
273
+
274
+ Coordinate commonLength(Node * node, IDnum firstStrain)
275
+ {
276
+ PassageMarkerI marker = getMarker(node);
277
+ int orig = 0;
278
+ int strain = 0;
279
+
280
+ while (marker != NULL_IDX) {
281
+ if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
282
+ orig++;
283
+ else
284
+ strain++;
285
+ marker = getNextInNode(marker);
286
+ }
287
+
288
+ if (orig == 0 || strain == 0)
289
+ return 0;
290
+
291
+ return (Coordinate) orig *getNodeLength(node);
292
+ }
293
+
294
+ boolean isMixed(Node * node, IDnum firstStrain)
295
+ {
296
+ return !isOnlyStrain(node, firstStrain)
297
+ && !isOnlyGenome(node, firstStrain);
298
+ }
299
+
300
+ int countLocalBreakpoints(PassageMarkerI marker, IDnum firstStrain)
301
+ {
302
+ PassageMarkerI localMarker;
303
+ IDnum sequenceID = getAbsolutePassMarkerSeqID(marker);
304
+ IDnum localSeqID;
305
+ Coordinate start = getPassageMarkerStart(marker);
306
+ Node *localNode = getNode(marker);
307
+ Node *destination;
308
+ Arc *arc;
309
+ int arcCount = 0;
310
+ int arcIndex;
311
+ boolean *arcStatus;
312
+ int counter = 0;
313
+
314
+ if (!isMixed(localNode, firstStrain))
315
+ return 0;
316
+
317
+ // Count arcs
318
+ for (arc = getArc(localNode); arc != NULL; arc = getNextArc(arc))
319
+ arcCount++;
320
+ arcStatus = callocOrExit(arcCount, boolean);
321
+ // Check for other genomic markers in node
322
+ for (localMarker = getMarker(localNode); localMarker != NULL_IDX;
323
+ localMarker = getNextInNode(localMarker)) {
324
+ localSeqID = getAbsolutePassMarkerSeqID(localMarker);
325
+ if (localSeqID >= firstStrain)
326
+ continue;
327
+
328
+ if (localSeqID < sequenceID)
329
+ return 0;
330
+
331
+ if (localSeqID == sequenceID
332
+ && getPassageMarkerStart(localMarker) < start)
333
+ return 0;
334
+
335
+ destination = getNode(getNextInSequence(localMarker));
336
+
337
+ // Enter into table:
338
+ arcIndex = 0;
339
+ for (arc = getArc(localNode);
340
+ getDestination(arc) != destination;
341
+ arc = getNextArc(arc))
342
+ arcIndex++;
343
+ arcStatus[arcIndex] = true;
344
+ }
345
+
346
+ // Check other nodes
347
+ arcIndex = 0;
348
+ for (arc = getArc(localNode); arc != NULL; arc = getNextArc(arc)) {
349
+ if (!arcStatus[arcIndex]
350
+ && isMixed(getDestination(arc), firstStrain))
351
+ counter++;
352
+ arcIndex++;
353
+ }
354
+
355
+ free(arcStatus);
356
+ return counter;
357
+ }
358
+
359
+ IDnum genomeMarkerCount(Node * node, IDnum firstStrain)
360
+ {
361
+ PassageMarkerI marker;
362
+ IDnum counter = 0;
363
+
364
+ for (marker = getMarker(node); marker != NULL_IDX;
365
+ marker = getNextInNode(marker))
366
+ if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
367
+ counter++;
368
+
369
+ return counter;
370
+ }
371
+
372
+ Coordinate readCoverage(Node * node)
373
+ {
374
+ PassageMarkerI marker;
375
+ Coordinate sum = 0;
376
+
377
+ for (marker = getMarker(node); marker != NULL_IDX;
378
+ marker = getNextInNode(marker)) {
379
+ if (getTwinMarker(marker) == NULL_IDX) {
380
+ velvetLog("Node %li screwed up\n", (long) getNodeID(node));
381
+ velvetLog("Sequence %li\n",
382
+ (long) getPassageMarkerSequenceID(marker));
383
+ abort();
384
+ }
385
+ sum += getPassageMarkerLength(marker);
386
+ }
387
+
388
+ return sum;
389
+ }
390
+
391
+ Coordinate refReadCoverage(Node * node, IDnum firstStrain)
392
+ {
393
+ PassageMarkerI marker;
394
+ Coordinate sum = 0;
395
+
396
+ for (marker = getMarker(node); marker != NULL_IDX;
397
+ marker = getNextInNode(marker))
398
+ if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
399
+ sum += getPassageMarkerLength(marker);
400
+
401
+ return sum;
402
+ }
403
+
404
+ Coordinate newReadCoverage(Node * node, IDnum firstStrain)
405
+ {
406
+ PassageMarkerI marker;
407
+ Coordinate sum = 0;
408
+
409
+ for (marker = getMarker(node); marker != NULL_IDX;
410
+ marker = getNextInNode(marker))
411
+ if (getAbsolutePassMarkerSeqID(marker) >= firstStrain) {
412
+ sum += getPassageMarkerLength(marker);
413
+ if (getPassageMarkerLength(marker) < 0)
414
+ velvetLog("Bizarre marker %li at node %li\n",
415
+ (long) getPassageMarkerSequenceID(marker),
416
+ (long) getNodeID(node));
417
+ }
418
+
419
+ return sum;
420
+ }
421
+
422
+ static void printShortCounts(FILE * outfile, Node * node, Graph * graph, ReadSet * reads)
423
+ {
424
+ IDnum counts[CATEGORIES];
425
+ Category cat;
426
+ IDnum shortReadIndex;
427
+ IDnum readID;
428
+ IDnum shortReadCount;
429
+ ShortReadMarker *array;
430
+ ShortReadMarker *marker;
431
+
432
+ if (!readStartsAreActivated(graph)) {
433
+ for (cat = 0; cat < CATEGORIES; cat++)
434
+ velvetFprintf(outfile, "\tN/A");
435
+ return;
436
+ }
437
+
438
+ shortReadCount = getNodeReadCount(node, graph);
439
+ array = getNodeReads(node, graph);
440
+
441
+ for (cat = 0; cat < CATEGORIES; cat++)
442
+ counts[cat] = 0;
443
+
444
+ for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
445
+ marker = getShortReadMarkerAtIndex(array, shortReadIndex);
446
+ readID = getShortReadMarkerID(marker);
447
+ cat = reads->categories[readID - 1] / 2;
448
+ counts[cat]++;
449
+ }
450
+
451
+ for (cat = 0; cat < CATEGORIES; cat++)
452
+ velvetFprintf(outfile, "\t%li", (long) counts[cat]);
453
+ }
454
+
455
+ void displayGeneralStatistics(Graph * graph, char *filename, ReadSet * reads)
456
+ {
457
+ IDnum nodeIndex;
458
+ Node *node;
459
+ Category cat;
460
+ FILE *outfile;
461
+
462
+ outfile = fopen(filename, "w");
463
+ if (outfile == NULL) {
464
+ velvetLog("Couldn't open file %s, sorry\n", filename);
465
+ return;
466
+ } else
467
+ velvetLog("Writing into stats file %s...\n", filename);
468
+
469
+ velvetFprintf(outfile, "ID\tlgth\tout\tin\tlong_cov");
470
+
471
+ #ifndef SINGLE_COV_CAT
472
+ for (cat = 0; cat < CATEGORIES; cat++) {
473
+ velvetFprintf(outfile, "\tshort%i_cov", (int) (cat + 1));
474
+ velvetFprintf(outfile, "\tshort%i_Ocov", (int) (cat + 1));
475
+ }
476
+ #else
477
+ velvetFprintf(outfile, "\tshort_cov");
478
+ #endif
479
+
480
+ velvetFprintf(outfile, "\tlong_nb");
481
+ for (cat = 0; cat < CATEGORIES; cat++) {
482
+ velvetFprintf(outfile, "\tshort%i_nb", (int) (cat + 1));
483
+ }
484
+
485
+ velvetFprintf(outfile, "\n");
486
+
487
+ for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
488
+ node = getNodeInGraph(graph, nodeIndex);
489
+ if (node == NULL)
490
+ continue;
491
+ velvetFprintf
492
+ (outfile, "%ld\t%lld\t%i\t%i",
493
+ (long) getNodeID(node), (long long) getNodeLength(node), arcCount(node),
494
+ arcCount(getTwinNode(node)));
495
+
496
+ if (getNodeLength(node) > 0) {
497
+ velvetFprintf(outfile, "\t%f",
498
+ readCoverage(node) /
499
+ (double) getNodeLength(node));
500
+ #ifndef SINGLE_COV_CAT
501
+ for (cat = 0; cat < CATEGORIES; cat++) {
502
+ velvetFprintf(outfile, "\t%f",
503
+ getVirtualCoverage(node, cat) /
504
+ (double) getNodeLength(node));
505
+ velvetFprintf(outfile, "\t%f",
506
+ getOriginalVirtualCoverage(node, cat) /
507
+ (double) getNodeLength(node));
508
+ }
509
+ #else
510
+ velvetFprintf(outfile, "\t%f",
511
+ getVirtualCoverage(node) /
512
+ (double) getNodeLength(node));
513
+ #endif
514
+ } else {
515
+ velvetFprintf(outfile, "\tInf");
516
+ #ifndef SINGLE_COV_CAT
517
+ for (cat = 0; cat < CATEGORIES; cat++)
518
+ velvetFprintf(outfile, "\tInf\tInf");
519
+ #else
520
+ velvetFprintf(outfile, "\tInf");
521
+ #endif
522
+ }
523
+
524
+ velvetFprintf(outfile, "\t%li", (long) markerCount(node));
525
+ printShortCounts(outfile, node, graph, reads);
526
+
527
+ velvetFprintf(outfile, "\n");
528
+ }
529
+
530
+ fclose(outfile);
531
+ }
532
+
533
+ void displayLocalBreakpoint(PassageMarkerI strainMarker,
534
+ IDnum firstStrain,
535
+ PassageMarkerI genomeMarker,
536
+ Node ** genomeDestination,
537
+ Node ** strainDestination, IDnum * counter,
538
+ IDnum nodeCount)
539
+ {
540
+ boolean isTranslocation;
541
+ PassageMarkerI marker;
542
+ Node *destination, *destinationA;
543
+ Node *destination2, *destination2A;
544
+ Node *node1, *node2;
545
+ IDnum localID = getNodeID(getNode(strainMarker));
546
+
547
+ // Eliminate genomic markers
548
+ if (strainMarker == genomeMarker)
549
+ return;
550
+
551
+ destinationA = getNode(getNextInSequence(strainMarker));
552
+
553
+ if (destinationA == NULL)
554
+ return;
555
+
556
+ // Eliminate those that follow some local strain
557
+ if (isDestinationToMarker(genomeMarker, destinationA)) {
558
+ // velvetLog("Parallel paths\n");
559
+ return;
560
+ }
561
+
562
+ destination2A = getNode(getNextInSequence(genomeMarker));
563
+
564
+ if (destination2A == NULL)
565
+ return;
566
+
567
+ velvetLog("Lengths %lld %lld\n", (long long) getNodeLength(destinationA),
568
+ (long long) getNodeLength(destination2A));
569
+
570
+ // Hop to another genomic node
571
+ // if (getNodeLength(destinationA) > 24) {
572
+ //velvetLog("wrong length %d %d\n", getNodeLength(destination) , getNodeID(destination));
573
+ // return;
574
+ // }
575
+
576
+ destination =
577
+ getNode(getNextInSequence(getNextInSequence(strainMarker)));
578
+
579
+ if (destination == NULL)
580
+ return;
581
+
582
+ // Eliminate those that point to uniquely strain sequences
583
+ if (nodeGenomicMultiplicity(destination, firstStrain) != 1) {
584
+ // velvetLog("Multiple genome reads\n");
585
+ return;
586
+ }
587
+ // Hop to another genomic node
588
+ // if (getNodeLength(destination2A) != 24) {
589
+ //velvetLog("wrong length 2\n");
590
+ // return;
591
+ // }
592
+
593
+ destination2 =
594
+ getNode(getNextInSequence(getNextInSequence(genomeMarker)));
595
+
596
+ if (destination2 == NULL)
597
+ return;
598
+
599
+
600
+ if (destination == destination2)
601
+ return;
602
+
603
+ // Eliminate those that point to uniquely strain sequences
604
+ if (isOnlyGenome(destination2, firstStrain))
605
+ return;
606
+
607
+ setSingleNodeStatus(getNode(strainMarker), true);
608
+ strainDestination[localID + nodeCount] = destination;
609
+ genomeDestination[localID + nodeCount] = destination2;
610
+
611
+ // velvetLog("Assigning %p and %p to %d\n", destination, destination2, localID);
612
+ velvetLog("lengths %lld\t%lld\n", (long long) getNodeLength(destinationA),
613
+ (long long) getNodeLength(destination2A));
614
+
615
+ // Detect translocation
616
+ isTranslocation = true;
617
+ for (marker = getMarker(destination); marker != NULL_IDX;
618
+ marker = getNextInNode(marker))
619
+ if (getAbsolutePassMarkerSeqID(marker) ==
620
+ getAbsolutePassMarkerSeqID(genomeMarker)) {
621
+ isTranslocation = false;
622
+ break;
623
+ }
624
+
625
+ if (isTranslocation) {
626
+ velvetLog("BREAK TRANS\t%ld\t%lld\t%lld\t%lld\n",
627
+ (long) getAbsolutePassMarkerSeqID(genomeMarker),
628
+ (long long) getPassageMarkerStart(genomeMarker),
629
+ (long long) getNodeLength(destinationA),
630
+ (long long) getNodeLength(destination2A));
631
+ counter[2]++;
632
+ return;
633
+ }
634
+ // Detect breakpoint
635
+ velvetLog("BREAK INTRA\t%ld\t%lld\t%lld\t%lld\n",
636
+ (long) getAbsolutePassMarkerSeqID(genomeMarker),
637
+ (long long) getPassageMarkerStart(genomeMarker),
638
+ (long long) getNodeLength(destinationA), (long long) getNodeLength(destination2A));
639
+ counter[1]++;
640
+
641
+ // Check for inversion
642
+ if (getPassageMarkerSequenceID(marker) !=
643
+ -getPassageMarkerSequenceID(genomeMarker))
644
+ return;
645
+
646
+ // velvetLog("potential!!\n");
647
+
648
+ node1 = getTwinNode(destination);
649
+
650
+ if (getNodeStatus(node1)) {
651
+ node2 =
652
+ getTwinNode(genomeDestination
653
+ [getNodeID(node1) + nodeCount]);
654
+ if (getNodeStatus(node2))
655
+ if (strainDestination[getNodeID(node2) + nodeCount]
656
+ == destination2) {
657
+ // velvetLog("Safe\n");
658
+ counter[1] -= 4;
659
+ counter[0]++;
660
+ } else;
661
+ // velvetLog("stopped 3\n");
662
+ else;
663
+ // velvetLog("stopped 2\n");
664
+ } else;
665
+ // velvetLog("stopped 1\n");
666
+ }
667
+
668
+ PassageMarkerI genomeMarker(Node * node, IDnum firstStrain)
669
+ {
670
+ PassageMarkerI marker;
671
+
672
+ if (genomeMarkerCount(node, firstStrain) != 1)
673
+ return NULL_IDX;
674
+
675
+ for (marker = getMarker(node); marker != NULL_IDX;
676
+ marker = getNextInNode(marker))
677
+ if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
678
+ return marker;
679
+
680
+ return NULL_IDX;
681
+ }
682
+
683
+ void exportArcSequence(Arc * arc, FILE * outfile, int WORDLENGTH,
684
+ TightString ** sequences)
685
+ {
686
+ char *str;
687
+ TightString *output =
688
+ newTightString(getNodeLength(getOrigin(arc)) +
689
+ getNodeLength(getDestination(arc)));
690
+ appendNodeSequence(getOrigin(arc), output, 0);
691
+ appendNodeSequence(getDestination(arc), output,
692
+ getNodeLength(getOrigin(arc)));
693
+ str = readTightString(output);
694
+ velvetFprintf(outfile, "> ARC from NODE %li", (long) getNodeID(getOrigin(arc)));
695
+ velvetFprintf(outfile, "%s\n", str);
696
+ destroyTightString(output);
697
+ free(str);
698
+ }
699
+
700
+ // Produce sequences necessary to recreate graph elsewhere...
701
+ void projectGraphToFile(Graph * graph, char *filename, int WORDLENGTH,
702
+ TightString ** sequences)
703
+ {
704
+ FILE *outfile = fopen(filename, "w");
705
+ IDnum index;
706
+ Node *currentNode;
707
+ Arc *arc;
708
+
709
+ if (outfile == NULL) {
710
+ velvetLog("Could not open %s, sorry\n", filename);
711
+ return;
712
+ }
713
+
714
+ for (index = 1; index < nodeCount(graph); index++) {
715
+ currentNode = getNodeInGraph(graph, index);
716
+ for (arc = getArc(currentNode); arc != NULL;
717
+ arc = getNextArc(arc))
718
+ exportArcSequence(arc, outfile, WORDLENGTH,
719
+ sequences);
720
+
721
+ for (arc = getArc(getTwinNode(currentNode)); arc != NULL;
722
+ arc = getNextArc(arc))
723
+ exportArcSequence(arc, outfile, WORDLENGTH,
724
+ sequences);
725
+ }
726
+
727
+ fclose(outfile);
728
+ }
729
+
730
+ static RecycleBin * maskMemory = NULL;
731
+
732
+ static Mask *allocateMask()
733
+ {
734
+ if (maskMemory == NULL)
735
+ maskMemory = newRecycleBin(sizeof(Mask), 10000);
736
+
737
+ return (Mask *) allocatePointer(maskMemory);
738
+ }
739
+
740
+ static void deallocateMask(Mask * mask)
741
+ {
742
+ deallocatePointer(maskMemory, mask);
743
+ }
744
+
745
+
746
+ static Mask * newMask(Coordinate position)
747
+ {
748
+ Mask * mask = allocateMask();
749
+ mask->start = position;
750
+ mask->finish = position;
751
+ mask->next = NULL;
752
+ return mask;
753
+ }
754
+
755
+ static Mask * lowCoverageRegions(Coordinate * starts, Coordinate * stops, size_t length, IDnum cutoff, Coordinate nodeLength) {
756
+ size_t indexStart = 0;
757
+ size_t indexStop = 0;
758
+ int currentValue = 0;
759
+ Mask * regions = NULL;
760
+ Mask * lastRegion = NULL;
761
+ boolean openMask = false;
762
+
763
+ while (indexStart < length && indexStop < length) {
764
+ if (starts[indexStart] == stops[indexStop]) {
765
+ indexStart++;
766
+ indexStop++;
767
+ } else if (starts[indexStart] < stops[indexStop]) {
768
+ if (currentValue == cutoff - 1 && lastRegion) {
769
+ lastRegion->finish = starts[indexStart] - 1;
770
+ openMask = false;
771
+ }
772
+ currentValue++;
773
+ indexStart++;
774
+ } else {
775
+ if (currentValue == cutoff && stops[indexStop] != nodeLength) {
776
+ if (regions) {
777
+ lastRegion->next = newMask(stops[indexStop]);
778
+ lastRegion = lastRegion->next;
779
+ } else {
780
+ regions = newMask(stops[indexStop]);
781
+ lastRegion = regions;
782
+ }
783
+ openMask = true;
784
+ }
785
+ currentValue--;
786
+ indexStop++;
787
+ }
788
+ }
789
+
790
+ while (indexStart < length) {
791
+ if (currentValue == cutoff - 1 && lastRegion) {
792
+ lastRegion->finish = starts[indexStart] - 1;
793
+ openMask = false;
794
+ } else if (currentValue >= cutoff) {
795
+ break;
796
+ }
797
+ currentValue++;
798
+ indexStart++;
799
+ }
800
+
801
+ while (indexStop < length) {
802
+ if (currentValue == cutoff + 1) {
803
+ if (regions) {
804
+ lastRegion->next = newMask(stops[indexStop]);
805
+ lastRegion = lastRegion->next;
806
+ } else {
807
+ regions = newMask(stops[indexStop]);
808
+ lastRegion = regions;
809
+ }
810
+ openMask = true;
811
+ } else if (currentValue < cutoff)
812
+ break;
813
+ currentValue--;
814
+ indexStop++;
815
+ }
816
+
817
+ if (openMask)
818
+ lastRegion->finish = nodeLength;
819
+
820
+ free(starts);
821
+ free(stops);
822
+ return regions;
823
+ }
824
+
825
+ static int compareCoords(const void * A, const void * B) {
826
+ Coordinate * a_p = (Coordinate *) A;
827
+ Coordinate * b_p = (Coordinate *) B;
828
+ Coordinate a = * a_p;
829
+ Coordinate b = * b_p;
830
+ if (a < b)
831
+ return -1;
832
+ else if (a > b)
833
+ return 1;
834
+ else
835
+ return 0;
836
+ }
837
+
838
+ static void sortCoords(Coordinate * array, IDnum length) {
839
+ qsort(array, (size_t) length, sizeof(Coordinate), compareCoords);
840
+ }
841
+
842
+ static void getShortReadCoords(Coordinate * starts, Coordinate * stops, Node * node, Graph * graph, ShortLength * readLengths) {
843
+ ShortReadMarker * markers = getNodeReads(node, graph);
844
+ ShortReadMarker* marker;
845
+ IDnum index;
846
+ for (index = 0; index < getNodeReadCount(node, graph); index++) {
847
+ marker = getShortReadMarkerAtIndex(markers, index);
848
+ starts[index] = getShortReadMarkerPosition(marker);
849
+ stops[index] = starts[index] - getShortReadMarkerOffset(marker) + readLengths[getShortReadMarkerID(marker) - 1] - 1;
850
+ }
851
+ }
852
+
853
+ static void getShortReadTwinCoords(Coordinate * starts, Coordinate * stops, Node * node, Graph * graph, ShortLength * readLengths, IDnum offset) {
854
+ ShortReadMarker * markers = getNodeReads(getTwinNode(node), graph);
855
+ ShortReadMarker* marker;
856
+ IDnum index;
857
+ for (index = 0; index < getNodeReadCount(getTwinNode(node), graph); index++) {
858
+ marker = getShortReadMarkerAtIndex(markers, index);
859
+ stops[index + offset] = getNodeLength(node) - 1 - getShortReadMarkerPosition(marker);
860
+ starts[index + offset] = stops[index + offset] + getShortReadMarkerOffset(marker) - readLengths[getShortReadMarkerID(marker) - 1] + 1;
861
+ }
862
+ }
863
+
864
+ static void getLongReadCoords(Coordinate * starts, Coordinate * stops, Node * node, Graph * graph, ReadSet * reads, IDnum offset) {
865
+ PassageMarkerI marker;
866
+ IDnum index = offset;
867
+
868
+ for (marker = getMarker(node); marker; marker = getNextInNode(marker)) {
869
+ if (reads->categories[getAbsolutePassMarkerSeqID(marker) - 1] != REFERENCE) {
870
+ starts[index] = getStartOffset(marker);
871
+ stops[index++] = getNodeLength(node) - 1 - getFinishOffset(marker);
872
+ } else {
873
+ starts[index] = -5;
874
+ stops[index++] = -10;
875
+ }
876
+ }
877
+ }
878
+
879
+ static Mask * findLowCoverageRegions(Node * node, Graph * graph, IDnum cutoff, ReadSet * reads, ShortLength * readLengths) {
880
+ // Fill arrays
881
+ IDnum nodeReads = getNodeReadCount(node, graph);
882
+ IDnum twinReads = getNodeReadCount(getTwinNode(node), graph);
883
+ IDnum longReads = markerCount(node);
884
+ IDnum length = nodeReads + twinReads + longReads;
885
+ Coordinate * starts = callocOrExit(length, Coordinate);
886
+ Coordinate * stops = callocOrExit(length, Coordinate);
887
+ getShortReadCoords(starts, stops, node, graph, readLengths);
888
+ getShortReadTwinCoords(starts, stops, node, graph, readLengths, nodeReads);
889
+ getLongReadCoords(starts, stops, node, graph, reads, nodeReads + twinReads);
890
+
891
+ // Sort arrays
892
+ sortCoords(starts, length);
893
+ sortCoords(stops, length);
894
+
895
+ // Go through array
896
+ return lowCoverageRegions(starts, stops, length, cutoff, getNodeLength(node));
897
+ }
898
+
899
+ static void exportLongNodeSequence(FILE * outfile, Node * node, Graph * graph, ReadSet * reads, ShortLength * readLengths, IDnum cutoff) {
900
+ TightString *tString;
901
+ Coordinate position;
902
+ char nucleotide;
903
+ int WORDLENGTH = getWordLength(graph);
904
+ GapMarker *gap;
905
+ IDnum nodeIndex = getNodeID(node);
906
+ Mask * mask = NULL;
907
+ Mask * next;
908
+
909
+ if (readStartsAreActivated(graph) && cutoff > 0)
910
+ mask = findLowCoverageRegions(node, graph, cutoff, reads, readLengths);
911
+
912
+ tString = expandNode(node, WORDLENGTH);
913
+ velvetFprintf(outfile, ">NODE_%ld_length_%lld_cov_%f\n",
914
+ (long) nodeIndex, (long long) getNodeLength(node),
915
+ (getTotalCoverage(node) + readCoverage(node)) /
916
+ (float) getNodeLength(node));
917
+
918
+ gap = getGap(node, graph);
919
+ for (position = 0; position < WORDLENGTH; position++) {
920
+ if (position % 60 == 0 && position > 0)
921
+ velvetFprintf(outfile, "\n");
922
+ nucleotide = getNucleotideChar(position, tString);
923
+ velvetFprintf(outfile, "%c", nucleotide);
924
+ }
925
+
926
+ gap = getGap(node, graph);
927
+ for (; position < getLength(tString); position++) {
928
+ if (position % 60 == 0)
929
+ velvetFprintf(outfile, "\n");
930
+
931
+ while (gap
932
+ && position - WORDLENGTH + 1 >=
933
+ getGapFinish(gap))
934
+ gap = getNextGap(gap);
935
+
936
+ while (mask
937
+ && position - WORDLENGTH + 1 >=
938
+ mask->finish) {
939
+ next = mask->next;
940
+ deallocateMask(mask);
941
+ mask = next;
942
+ }
943
+
944
+ if (gap
945
+ && position - WORDLENGTH + 1 >=
946
+ getGapStart(gap)) {
947
+ velvetFprintf(outfile, "N");
948
+ } else if (mask &&
949
+ position - WORDLENGTH + 1 >=
950
+ mask->start) {
951
+ nucleotide =
952
+ getNucleotideChar(position, tString);
953
+ velvetFprintf(outfile, "%c", tolower(nucleotide));
954
+
955
+ } else {
956
+ nucleotide =
957
+ getNucleotideChar(position, tString);
958
+ velvetFprintf(outfile, "%c", nucleotide);
959
+ }
960
+ }
961
+ velvetFprintf(outfile, "\n");
962
+ destroyTightString (tString);
963
+ }
964
+
965
+ void exportLongNodeSequences(char *filename, Graph * graph,
966
+ Coordinate minLength, ReadSet * reads, ShortLength * readLengths, IDnum minCov)
967
+ {
968
+ FILE *outfile = fopen(filename, "w");
969
+ IDnum nodeIndex;
970
+ Node *node;
971
+ //double sensitivity, specificity;
972
+
973
+ if (outfile == NULL) {
974
+ velvetLog("Could not write into %s, sorry\n", filename);
975
+ return;
976
+ } else {
977
+ velvetLog("Writing contigs into %s...\n", filename);
978
+ }
979
+
980
+ for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
981
+ node = getNodeInGraph(graph, nodeIndex);
982
+
983
+ if (node == NULL || getNodeLength(node) < minLength)
984
+ continue;
985
+
986
+ exportLongNodeSequence(outfile, node, graph, reads, readLengths, minCov);
987
+ }
988
+
989
+ if (maskMemory)
990
+ destroyRecycleBin(maskMemory);
991
+ maskMemory = NULL;
992
+
993
+ fclose(outfile);
994
+ }
995
+
996
+ Coordinate maxLength(Graph * graph)
997
+ {
998
+ IDnum index;
999
+ Node *node;
1000
+ Coordinate max = 0;
1001
+
1002
+ for (index = 1; index <= nodeCount(graph); index++) {
1003
+ node = getNodeInGraph(graph, index);
1004
+ if (node != NULL && getNodeLength(node) > max)
1005
+ max = getNodeLength(node);
1006
+ }
1007
+
1008
+ return max;
1009
+ }
1010
+
1011
+ Coordinate n50(Graph * graph)
1012
+ {
1013
+ FibHeap *heap = newFibHeap();
1014
+ IDnum index;
1015
+ Coordinate totalLength = 0;
1016
+ Coordinate sumLength = 0;
1017
+ Node *node;
1018
+
1019
+ if (nodeCount(graph) == 0) {
1020
+ velvetLog("EMPTY GRAPH\n");
1021
+ return 0;
1022
+ }
1023
+
1024
+ for (index = 1; index <= nodeCount(graph); index++) {
1025
+ node = getNodeInGraph(graph, index);
1026
+ if (node == NULL)
1027
+ continue;
1028
+ insertNodeIntoHeap(heap, getNodeLength(node), node);
1029
+ totalLength += getNodeLength(node);
1030
+ }
1031
+ totalLength /= 2;
1032
+
1033
+ node = removeNextNodeFromHeap(heap);
1034
+ while (node != NULL) {
1035
+ sumLength += getNodeLength(node);
1036
+ if (sumLength >= totalLength)
1037
+ break;
1038
+ node = removeNextNodeFromHeap(heap);
1039
+ }
1040
+
1041
+ destroyHeap(heap);
1042
+ return getNodeLength(node);
1043
+ }
1044
+
1045
+ int compareNodeCovs(const void * A, const void * B) {
1046
+ Node * nodeA = *((Node **) A);
1047
+ Node * nodeB = *((Node **) B);
1048
+ double covA;
1049
+ double covB;
1050
+
1051
+ if (getNodeLength(nodeA) == 0)
1052
+ nodeA = NULL;
1053
+
1054
+ if (getNodeLength(nodeB) == 0)
1055
+ nodeB = NULL;
1056
+
1057
+ // Null nodes considered to have infinite coverage
1058
+ if (nodeA == NULL && nodeB == NULL)
1059
+ return 0;
1060
+ if (nodeA == NULL)
1061
+ return 1;
1062
+ if (nodeB == NULL)
1063
+ return -1;
1064
+
1065
+ // Deal with real coverage numbers:
1066
+ covA = getTotalCoverage(nodeA) / (double) getNodeLength(nodeA);
1067
+ covB = getTotalCoverage(nodeB) / (double) getNodeLength(nodeB);
1068
+
1069
+ if (covA > covB)
1070
+ return 1;
1071
+ if (covA == covB)
1072
+ return 0;
1073
+ return -1;
1074
+ }
1075
+
1076
+ double estimated_cov(Graph * graph, char * directory)
1077
+ {
1078
+ Node ** nodeArray = callocOrExit(nodeCount(graph), Node*);
1079
+ IDnum index;
1080
+ Coordinate halfTotalLength = 0;
1081
+ Coordinate sumLength = 0;
1082
+ Node *node;
1083
+ char *logFilename =
1084
+ mallocOrExit(strlen(directory) + 100, char);
1085
+ char *statsLine =
1086
+ mallocOrExit(5000, char);
1087
+ FILE *logFile;
1088
+
1089
+ strcpy(logFilename, directory);
1090
+ strcat(logFilename, "/Log");
1091
+ logFile = fopen(logFilename, "a");
1092
+
1093
+ if (logFile == NULL)
1094
+ exitErrorf(EXIT_FAILURE, true, "Could not write to %s",
1095
+ logFilename);
1096
+
1097
+ velvetLog("Measuring median coverage depth...\n");
1098
+
1099
+ if (nodeCount(graph) == 0) {
1100
+ velvetLog("EMPTY GRAPH\n");
1101
+ return 0;
1102
+ }
1103
+
1104
+ // Write nodes into array and compute total assembly length
1105
+ for (index = 1; index <= nodeCount(graph); index++) {
1106
+ node = getNodeInGraph(graph, index);
1107
+ nodeArray[index - 1] = node;
1108
+ if (node == NULL)
1109
+ continue;
1110
+ halfTotalLength += getNodeLength(node);
1111
+ }
1112
+ halfTotalLength /= 2;
1113
+
1114
+ // Sort nodes
1115
+ qsort(nodeArray, nodeCount(graph), sizeof(Node *), compareNodeCovs);
1116
+
1117
+ // Compute the length weighted median node coverage
1118
+ for (index = 0; index < nodeCount(graph); index++) {
1119
+ node = nodeArray[index];
1120
+ sumLength += getNodeLength(node);
1121
+ if (sumLength >= halfTotalLength) {
1122
+ velvetLog("Median coverage depth = %f\n", getTotalCoverage(node) / (double) getNodeLength(node));
1123
+ velvetFprintf(logFile, "Median coverage depth = %f\n", getTotalCoverage(node) / (double) getNodeLength(node));
1124
+ free(nodeArray);
1125
+ fclose(logFile);
1126
+ free(logFilename);
1127
+ free(statsLine);
1128
+ return getTotalCoverage(node) / (double) getNodeLength(node);
1129
+ }
1130
+ }
1131
+
1132
+ // In case something went wrong...
1133
+ free(nodeArray);
1134
+ fclose(logFile);
1135
+ free(logFilename);
1136
+ free(statsLine);
1137
+
1138
+ return -1;
1139
+ }
1140
+
1141
+ static boolean terminalReferenceMarker(Node * node, ReadSet * reads) {
1142
+ PassageMarkerI marker;
1143
+
1144
+ for (marker = getMarker(node); marker; marker = getNextInNode(marker))
1145
+ if (reads->categories[getAbsolutePassMarkerSeqID(marker) - 1] == REFERENCE
1146
+ && (!getNextInSequence(marker)
1147
+ || !getPreviousInSequence(marker)))
1148
+ return true;
1149
+
1150
+ return false;
1151
+ }
1152
+
1153
+ static boolean hasReferenceMarker(Node * node, ReadSet * reads) {
1154
+ PassageMarkerI marker;
1155
+
1156
+ for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker))
1157
+ if (reads->categories[getAbsolutePassMarkerSeqID(marker) - 1] == REFERENCE)
1158
+ return true;
1159
+
1160
+ return false;
1161
+ }
1162
+
1163
+ inline static void
1164
+ destroyNodePassageMarkers(Graph *graph,
1165
+ Node* node)
1166
+ {
1167
+ PassageMarkerI marker;
1168
+
1169
+ while ((marker = getMarker(node)) != NULL_IDX) {
1170
+ if (!isInitial(marker) && !isTerminal(marker))
1171
+ deleteNextPassageMarker(getPreviousInSequence(marker), graph);
1172
+ destroyPassageMarker(marker);
1173
+ }
1174
+ }
1175
+
1176
+ inline static void
1177
+ removeNodeAndDenounceDubiousReads(Graph *graph,
1178
+ Node *node,
1179
+ boolean denounceReads,
1180
+ boolean *res,
1181
+ Coordinate minLength,
1182
+ FILE *outfile)
1183
+ {
1184
+ if (denounceReads) {
1185
+ ShortReadMarker *nodeArray;
1186
+ ShortReadMarker *shortMarker;
1187
+ IDnum maxIndex;
1188
+ IDnum index;
1189
+ IDnum readID;
1190
+
1191
+ nodeArray = getNodeReads(node, graph);
1192
+ maxIndex = getNodeReadCount(node, graph);
1193
+ for (index = 0; index < maxIndex; index++) {
1194
+ shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
1195
+ readID = getShortReadMarkerID(shortMarker);
1196
+ if (readID > 0)
1197
+ res[readID - 1] = true;
1198
+ else
1199
+ res[-readID - 1] = true;
1200
+ }
1201
+
1202
+ nodeArray = getNodeReads(getTwinNode(node), graph);
1203
+ maxIndex = getNodeReadCount(getTwinNode(node), graph);
1204
+ for (index = 0; index < maxIndex; index++) {
1205
+ shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
1206
+ readID = getShortReadMarkerID(shortMarker);
1207
+ if (readID > 0)
1208
+ res[readID - 1] = true;
1209
+ else
1210
+ res[-readID - 1] = true;
1211
+ }
1212
+ }
1213
+
1214
+ destroyNodePassageMarkers(graph, node);
1215
+
1216
+ if (outfile != NULL && getNodeLength(node) > minLength)
1217
+ exportLongNodeSequence(outfile, node, graph, NULL, NULL, -1);
1218
+
1219
+ destroyNode(node, graph);
1220
+ }
1221
+
1222
+ boolean *removeLowCoverageNodesAndDenounceDubiousReads(Graph * graph,
1223
+ double minCov,
1224
+ ReadSet * reads,
1225
+ boolean export,
1226
+ Coordinate minLength,
1227
+ char *filename)
1228
+ {
1229
+ IDnum index;
1230
+ Node *node;
1231
+ boolean denounceReads = readStartsAreActivated(graph);
1232
+ boolean *res = NULL;
1233
+ FILE * outfile = NULL;
1234
+
1235
+ velvetLog("Removing contigs with coverage < %f...\n", minCov);
1236
+
1237
+ if (denounceReads)
1238
+ res = callocOrExit(sequenceCount(graph), boolean);
1239
+
1240
+ if (export) {
1241
+ outfile = fopen(filename, "w");
1242
+
1243
+ if (outfile == NULL) {
1244
+ velvetLog("Could not write into %s, sorry\n", filename);
1245
+ return res;
1246
+ } else {
1247
+ velvetLog("Writing contigs into %s...\n", filename);
1248
+ }
1249
+ }
1250
+
1251
+
1252
+ for (index = 1; index <= nodeCount(graph); index++) {
1253
+ node = getNodeInGraph(graph, index);
1254
+
1255
+ if (getNodeLength(node) == 0)
1256
+ continue;
1257
+
1258
+ if (getTotalCoverage(node) / getNodeLength(node) < minCov
1259
+ && !hasReferenceMarker(node, reads))
1260
+ removeNodeAndDenounceDubiousReads(graph,
1261
+ node,
1262
+ denounceReads,
1263
+ res,
1264
+ minLength,
1265
+ outfile);
1266
+ }
1267
+
1268
+ concatenateGraph(graph);
1269
+
1270
+ for (index = 1; index <= nodeCount(graph); index++) {
1271
+ node = getNodeInGraph(graph, index);
1272
+
1273
+ if (getNodeLength(node) == 0)
1274
+ continue;
1275
+
1276
+ if (getTotalCoverage(node) / getNodeLength(node) < minCov
1277
+ && !terminalReferenceMarker(node, reads))
1278
+ removeNodeAndDenounceDubiousReads(graph,
1279
+ node,
1280
+ denounceReads,
1281
+ res,
1282
+ minLength,
1283
+ outfile);
1284
+ }
1285
+
1286
+ if (export)
1287
+ fclose(outfile);
1288
+
1289
+ concatenateGraph(graph);
1290
+ return res;
1291
+ }
1292
+
1293
+ static Coordinate getLongCoverage(Node * node) {
1294
+ PassageMarkerI marker;
1295
+ Coordinate total = 0;
1296
+
1297
+ for (marker = getMarker(node); marker; marker = getNextInNode(marker))
1298
+ total += getPassageMarkerLength(marker);
1299
+
1300
+ return total;
1301
+ }
1302
+
1303
+ void removeLowCoverageReferenceNodes(Graph * graph, double minCov, double minLongCov, ReadSet * reads)
1304
+ {
1305
+ IDnum index;
1306
+ Node *node;
1307
+
1308
+ velvetLog("Removing reference contigs with coverage < %f...\n", minCov);
1309
+
1310
+ for (index = 1; index <= nodeCount(graph); index++) {
1311
+ node = getNodeInGraph(graph, index);
1312
+
1313
+ if (getNodeLength(node) == 0)
1314
+ continue;
1315
+
1316
+ if ((getTotalCoverage(node) / getNodeLength(node) < minCov
1317
+ || getLongCoverage(node) / getNodeLength(node) < minLongCov)
1318
+ && hasReferenceMarker(node, reads)) {
1319
+ destroyNodePassageMarkers(graph, node);
1320
+
1321
+ destroyNode(node, graph);
1322
+ }
1323
+ }
1324
+
1325
+ concatenateGraph(graph);
1326
+ }
1327
+
1328
+ void removeLowLongCoverageNodesAndDenounceDubiousReads(Graph * graph,
1329
+ double minCov,
1330
+ ReadSet * reads,
1331
+ boolean * res,
1332
+ boolean export,
1333
+ Coordinate minLength,
1334
+ char *filename)
1335
+ {
1336
+ IDnum index;
1337
+ Node *node;
1338
+ boolean denounceReads = readStartsAreActivated(graph);
1339
+ FILE * outfile = NULL;
1340
+
1341
+ if (minCov < 0)
1342
+ return;
1343
+
1344
+ velvetLog("Removing contigs with coverage < %f...\n", minCov);
1345
+
1346
+ if (export) {
1347
+ outfile = fopen(filename, "a");
1348
+
1349
+ if (outfile == NULL) {
1350
+ velvetLog("Could not write into %s, sorry\n", filename);
1351
+ return;
1352
+ } else {
1353
+ velvetLog("Writing contigs into %s...\n", filename);
1354
+ }
1355
+ }
1356
+
1357
+ for (index = 1; index <= nodeCount(graph); index++) {
1358
+ node = getNodeInGraph(graph, index);
1359
+
1360
+ if (getNodeLength(node) == 0)
1361
+ continue;
1362
+
1363
+ if (getLongCoverage(node) / getNodeLength(node) < minCov
1364
+ && !hasReferenceMarker(node, reads))
1365
+ removeNodeAndDenounceDubiousReads(graph,
1366
+ node,
1367
+ denounceReads,
1368
+ res,
1369
+ minLength,
1370
+ outfile);
1371
+ }
1372
+
1373
+ concatenateGraph(graph);
1374
+
1375
+ for (index = 1; index <= nodeCount(graph); index++) {
1376
+ node = getNodeInGraph(graph, index);
1377
+
1378
+ if (getNodeLength(node) == 0)
1379
+ continue;
1380
+
1381
+ if (getLongCoverage(node) / getNodeLength(node) < minCov
1382
+ && !terminalReferenceMarker(node, reads))
1383
+ removeNodeAndDenounceDubiousReads(graph,
1384
+ node,
1385
+ denounceReads,
1386
+ res,
1387
+ minLength,
1388
+ outfile);
1389
+ }
1390
+
1391
+ if (export)
1392
+ fclose(outfile);
1393
+
1394
+ concatenateGraph(graph);
1395
+ }
1396
+
1397
+ void removeHighCoverageNodes(Graph * graph, double maxCov, boolean export, Coordinate minLength, char *filename)
1398
+ {
1399
+ IDnum index;
1400
+ Node *node;
1401
+ FILE * outfile = NULL;
1402
+
1403
+ if (maxCov < 0)
1404
+ return;
1405
+
1406
+ velvetLog("Applying an upper coverage cutoff of %f...\n", maxCov);
1407
+
1408
+ if (export) {
1409
+ outfile = fopen(filename, "w");
1410
+
1411
+ if (outfile == NULL) {
1412
+ velvetLog("Could not write into %s, sorry\n", filename);
1413
+ return;
1414
+ } else {
1415
+ velvetLog("Writing contigs into %s...\n", filename);
1416
+ }
1417
+ }
1418
+
1419
+ for (index = 1; index <= nodeCount(graph); index++) {
1420
+ node = getNodeInGraph(graph, index);
1421
+
1422
+ if (getNodeLength(node) > 0
1423
+ && getTotalCoverage(node) / getNodeLength(node) > maxCov) {
1424
+ destroyNodePassageMarkers(graph, node);
1425
+
1426
+ if (export && getNodeLength(node) > minLength)
1427
+ exportLongNodeSequence(outfile, node, graph, NULL, NULL, -1);
1428
+
1429
+ destroyNode(node, graph);
1430
+ }
1431
+ }
1432
+
1433
+ if (export)
1434
+ fclose(outfile);
1435
+
1436
+ concatenateGraph(graph);
1437
+ }
1438
+
1439
+ static void exportAMOSLib(FILE * outfile, Graph * graph, Category cat)
1440
+ {
1441
+ Coordinate distance = getInsertLength(graph, cat * 2);
1442
+ double variance = getInsertLength_var(graph, cat * 2);
1443
+
1444
+ if (distance == -1)
1445
+ return;
1446
+
1447
+ velvetFprintf(outfile, "{LIB\n");
1448
+ velvetFprintf(outfile, "iid:%d\n", (int) (cat + 1));
1449
+ velvetFprintf(outfile, "{DST\n");
1450
+ velvetFprintf(outfile, "mea:%lld\n", (long long) distance);
1451
+ velvetFprintf(outfile, "std:%lld\n", (long long) sqrt(variance));
1452
+ velvetFprintf(outfile, "}\n");
1453
+ velvetFprintf(outfile, "}\n");
1454
+ }
1455
+
1456
+ static void exportAMOSMarker(FILE * outfile, PassageMarkerI marker,
1457
+ Coordinate nodeLength, Coordinate start,
1458
+ Coordinate finish, int wordShift)
1459
+ {
1460
+ Coordinate sequenceStart, sequenceFinish;
1461
+
1462
+ if (getStartOffset(marker) >= finish
1463
+ || getFinishOffset(marker) > nodeLength - start)
1464
+ return;
1465
+
1466
+ sequenceStart = getPassageMarkerStart(marker);
1467
+ if (start > getStartOffset(marker)) {
1468
+ if (getPassageMarkerSequenceID(marker) > 0)
1469
+ sequenceStart += start - getStartOffset(marker);
1470
+ else
1471
+ sequenceStart -= start - getStartOffset(marker);
1472
+ }
1473
+
1474
+ sequenceFinish = getPassageMarkerFinish(marker);
1475
+ if (nodeLength - finish > getFinishOffset(marker)) {
1476
+ if (getPassageMarkerSequenceID(marker) > 0)
1477
+ sequenceFinish -=
1478
+ nodeLength - finish - getFinishOffset(marker);
1479
+ else
1480
+ sequenceFinish +=
1481
+ nodeLength - finish - getFinishOffset(marker);
1482
+ }
1483
+
1484
+ if (getPassageMarkerSequenceID(marker) > 0)
1485
+ sequenceFinish += wordShift;
1486
+ else
1487
+ sequenceStart += wordShift;
1488
+
1489
+ velvetFprintf(outfile, "{TLE\n");
1490
+ velvetFprintf(outfile, "src:%li\n", (long) getAbsolutePassMarkerSeqID(marker));
1491
+ if (getStartOffset(marker) > start)
1492
+ velvetFprintf(outfile, "off:%lld\n",
1493
+ (long long) (getStartOffset(marker) - start));
1494
+ else
1495
+ velvetFprintf(outfile, "off:0\n");
1496
+ velvetFprintf(outfile, "clr:%lld,%lld\n", (long long) sequenceStart, (long long) sequenceFinish);
1497
+ velvetFprintf(outfile, "}\n");
1498
+ }
1499
+
1500
+ static void exportAMOSShortMarker(FILE * outfile, ShortReadMarker * marker,
1501
+ ReadSet * reads, Coordinate start,
1502
+ Coordinate finish)
1503
+ {
1504
+ Coordinate offset =
1505
+ getShortReadMarkerPosition(marker) -
1506
+ getShortReadMarkerOffset(marker);
1507
+ TightString *sequence =
1508
+ getTightStringInArray (reads->tSequences, getShortReadMarkerID(marker) - 1);
1509
+
1510
+ if (getShortReadMarkerPosition(marker) == -1)
1511
+ return;
1512
+
1513
+ if (offset >= finish || offset + getLength(sequence) < start)
1514
+ return;
1515
+
1516
+ velvetFprintf(outfile, "{TLE\n");
1517
+ velvetFprintf(outfile, "src:%li\n", (long) getShortReadMarkerID(marker));
1518
+ velvetFprintf(outfile, "off:%lld\n", (long long) (offset - start));
1519
+ velvetFprintf(outfile, "clr:0,%lld\n", (long long) getLength(sequence));
1520
+ velvetFprintf(outfile, "}\n");
1521
+ }
1522
+
1523
+ static void exportAMOSReverseShortMarker(FILE * outfile,
1524
+ ShortReadMarker * marker,
1525
+ Coordinate nodeLength,
1526
+ int wordShift, ReadSet * reads,
1527
+ Coordinate start,
1528
+ Coordinate finish)
1529
+ {
1530
+ TightString *sequence =
1531
+ getTightStringInArray (reads->tSequences, getShortReadMarkerID(marker) - 1);
1532
+
1533
+ Coordinate offset =
1534
+ nodeLength - getShortReadMarkerPosition(marker) +
1535
+ getShortReadMarkerOffset(marker) - getLength(sequence) +
1536
+ wordShift;
1537
+
1538
+ if (getShortReadMarkerPosition(marker) == -1)
1539
+ return;
1540
+
1541
+ if (offset >= finish || offset + getLength(sequence) < start)
1542
+ return;
1543
+
1544
+ velvetFprintf(outfile, "{TLE\n");
1545
+ velvetFprintf(outfile, "src:%li\n", (long) getShortReadMarkerID(marker));
1546
+ velvetFprintf(outfile, "off:%lld\n", (long long) (offset - start));
1547
+ velvetFprintf(outfile, "clr:%lld,0\n", (long long) getLength(sequence));
1548
+ velvetFprintf(outfile, "}\n");
1549
+ }
1550
+
1551
+ static void exportAMOSContig(FILE * outfile, ReadSet * reads, Node * node,
1552
+ Graph * graph, Coordinate contigStart,
1553
+ Coordinate contigFinish, IDnum iid,
1554
+ IDnum internalIndex)
1555
+ {
1556
+ Coordinate start;
1557
+ char str[100];
1558
+ PassageMarkerI marker;
1559
+ ShortReadMarker *shortMarkerArray, *shortMarker;
1560
+ Coordinate index, maxIndex;
1561
+ int wordShift = getWordLength(graph) - 1;
1562
+ char *string = expandNodeFragment(node, contigStart, contigFinish,
1563
+ getWordLength(graph));
1564
+ Coordinate length = contigFinish - contigStart + wordShift;
1565
+
1566
+ velvetFprintf(outfile, "{CTG\n");
1567
+ velvetFprintf(outfile, "iid:%li\n", (long) iid);
1568
+ velvetFprintf(outfile, "eid:%li-%li\n", (long) getNodeID(node), (long) internalIndex);
1569
+
1570
+ velvetFprintf(outfile, "seq:\n");
1571
+ for (start = 0; start <= length; start += 60) {
1572
+ strncpy(str, &(string[start]), 60);
1573
+ str[60] = '\0';
1574
+ velvetFprintf(outfile, "%s\n", str);
1575
+ }
1576
+ velvetFprintf(outfile, ".\n");
1577
+
1578
+ velvetFprintf(outfile, "qlt:\n");
1579
+ for (start = 0; start <= length; start += 60) {
1580
+ strncpy(str, &(string[start]), 60);
1581
+ str[60] = '\0';
1582
+ velvetFprintf(outfile, "%s\n", str);
1583
+ }
1584
+ velvetFprintf(outfile, ".\n");
1585
+
1586
+ free(string);
1587
+
1588
+ for (marker = getMarker(node); marker != NULL_IDX;
1589
+ marker = getNextInNode(marker))
1590
+ exportAMOSMarker(outfile, marker, getNodeLength(node),
1591
+ contigStart, contigFinish, wordShift);
1592
+
1593
+ if (readStartsAreActivated(graph)) {
1594
+ shortMarkerArray = getNodeReads(node, graph);
1595
+ maxIndex = getNodeReadCount(node, graph);
1596
+ for (index = 0; index < maxIndex; index++) {
1597
+ shortMarker =
1598
+ getShortReadMarkerAtIndex(shortMarkerArray,
1599
+ index);
1600
+ exportAMOSShortMarker(outfile, shortMarker, reads,
1601
+ contigStart, contigFinish);
1602
+ }
1603
+
1604
+ shortMarkerArray = getNodeReads(getTwinNode(node), graph);
1605
+ maxIndex = getNodeReadCount(getTwinNode(node), graph);
1606
+ for (index = 0; index < maxIndex; index++) {
1607
+ shortMarker =
1608
+ getShortReadMarkerAtIndex(shortMarkerArray,
1609
+ index);
1610
+ exportAMOSReverseShortMarker(outfile, shortMarker,
1611
+ getNodeLength(node),
1612
+ wordShift, reads,
1613
+ contigStart,
1614
+ contigFinish);
1615
+ }
1616
+ }
1617
+
1618
+ velvetFprintf(outfile, "}\n");
1619
+ }
1620
+
1621
+ static void exportAMOSNode(FILE * outfile, ReadSet * reads, Node * node,
1622
+ Graph * graph)
1623
+ {
1624
+ Coordinate start = 0;
1625
+ Coordinate finish;
1626
+ GapMarker *gap;
1627
+ IDnum smallIndex = 0;
1628
+ static IDnum iid = 1;
1629
+ IDnum contigIndex = iid;
1630
+ int wordShift = getWordLength(graph) - 1;
1631
+
1632
+ for (gap = getGap(node, graph); gap; gap = getNextGap(gap)) {
1633
+ finish = getGapStart(gap);
1634
+ exportAMOSContig(outfile, reads, node, graph, start,
1635
+ finish, iid++, smallIndex++);
1636
+ start = getGapFinish(gap);
1637
+ }
1638
+
1639
+ finish = getNodeLength(node);
1640
+ exportAMOSContig(outfile, reads, node, graph, start, finish, iid++,
1641
+ smallIndex);
1642
+
1643
+ if (!getGap(node, graph))
1644
+ return;
1645
+
1646
+ start = 0;
1647
+
1648
+ velvetFprintf(outfile, "{SCF\n");
1649
+ velvetFprintf(outfile, "eid:%li\n", (long) getNodeID(node));
1650
+ for (gap = getGap(node, graph); gap; gap = getNextGap(gap)) {
1651
+ finish = getGapStart(gap);
1652
+ velvetFprintf(outfile, "{TLE\n");
1653
+ velvetFprintf(outfile, "off:%lld\n", (long long) start);
1654
+ velvetFprintf(outfile, "clr:0,%lld\n",
1655
+ (long long) (finish - start + (long long) wordShift));
1656
+ velvetFprintf(outfile, "src:%li\n", (long) contigIndex++);
1657
+ velvetFprintf(outfile, "}\n");
1658
+ start = getGapFinish(gap);
1659
+ }
1660
+ finish = getNodeLength(node);
1661
+ velvetFprintf(outfile, "{TLE\n");
1662
+ velvetFprintf(outfile, "off:%lld\n", (long long) start);
1663
+ velvetFprintf(outfile, "clr:0,%lld\n", (long long) (finish - start));
1664
+ velvetFprintf(outfile, "src:%li\n", (long) contigIndex++);
1665
+ velvetFprintf(outfile, "}\n");
1666
+
1667
+ velvetFprintf(outfile, "}\n");
1668
+ }
1669
+
1670
+ static void exportAMOSRead(FILE * outfile, TightString * tString,
1671
+ IDnum index, IDnum frg_index)
1672
+ {
1673
+ Coordinate start, finish;
1674
+ char str[100];
1675
+
1676
+ velvetFprintf(outfile, "{RED\n");
1677
+ velvetFprintf(outfile, "iid:%li\n", (long) index);
1678
+ velvetFprintf(outfile, "eid:%li\n", (long) index);
1679
+ if (frg_index > 0)
1680
+ velvetFprintf(outfile, "frg:%li\n", (long) frg_index);
1681
+
1682
+ velvetFprintf(outfile, "seq:\n");
1683
+ start = 0;
1684
+ while (start <= getLength(tString)) {
1685
+ finish = start + 60;
1686
+ readTightStringFragment(tString, start, finish, str);
1687
+ velvetFprintf(outfile, "%s\n", str);
1688
+ start = finish;
1689
+ }
1690
+ velvetFprintf(outfile, ".\n");
1691
+
1692
+ velvetFprintf(outfile, "qlt:\n");
1693
+ start = 0;
1694
+ while (start <= getLength(tString)) {
1695
+ finish = start + 60;
1696
+ readTightStringFragment(tString, start, finish, str);
1697
+ velvetFprintf(outfile, "%s\n", str);
1698
+ start = finish;
1699
+ }
1700
+ velvetFprintf(outfile, ".\n");
1701
+
1702
+ velvetFprintf(outfile, "}\n");
1703
+ }
1704
+
1705
+ void exportAMOSContigs(char *filename, Graph * graph,
1706
+ Coordinate cutoff_length, ReadSet * reads)
1707
+ {
1708
+ IDnum index;
1709
+ Category cat;
1710
+ Node *node;
1711
+ FILE *outfile;
1712
+
1713
+ velvetLog("Writing into AMOS file %s...\n", filename);
1714
+ outfile = fopen(filename, "w");
1715
+
1716
+ if (outfile == NULL)
1717
+ exitErrorf(EXIT_FAILURE, true, "Could not write to AMOS file %s",
1718
+ filename);
1719
+
1720
+ for (cat = 0; cat <= CATEGORIES; cat++)
1721
+ exportAMOSLib(outfile, graph, cat);
1722
+
1723
+ for (index = 1; index <= reads->readCount; index++) {
1724
+ if (reads->categories[index - 1] % 2 != 0 &&
1725
+ getInsertLength(graph,
1726
+ reads->categories[index - 1]) >= 0) {
1727
+ velvetFprintf(outfile, "{FRG\n");
1728
+ velvetFprintf(outfile, "lib:%d\n",
1729
+ (int) ((reads->categories[index - 1] / 2) + 1));
1730
+ velvetFprintf(outfile, "rds:%li,%li\n", (long) index,
1731
+ (long) index + 1);
1732
+ velvetFprintf(outfile, "eid:%li\n", (long) index);
1733
+ velvetFprintf(outfile, "iid:%li\n", (long) index);
1734
+ velvetFprintf(outfile, "typ:I\n");
1735
+ velvetFprintf(outfile, "}\n");
1736
+ index++;
1737
+ }
1738
+ }
1739
+
1740
+ for (index = 1; index <= reads->readCount; index++) {
1741
+ if (reads->categories[index - 1] % 2 != 0 &&
1742
+ getInsertLength(graph,
1743
+ reads->categories[index - 1]) >= 0) {
1744
+ exportAMOSRead(outfile,
1745
+ getTightStringInArray(reads->tSequences, index - 1), index,
1746
+ index);
1747
+ index++;
1748
+ exportAMOSRead(outfile,
1749
+ getTightStringInArray(reads->tSequences, index - 1), index,
1750
+ index - 1);
1751
+ } else {
1752
+ exportAMOSRead(outfile,
1753
+ getTightStringInArray(reads->tSequences, index - 1), index,
1754
+ -1);
1755
+ }
1756
+ }
1757
+
1758
+ for (index = 1; index <= nodeCount(graph); index++) {
1759
+ node = getNodeInGraph(graph, index);
1760
+
1761
+ if (node == NULL)
1762
+ continue;
1763
+
1764
+ if (getNodeLength(node) >= cutoff_length)
1765
+ exportAMOSNode(outfile, reads, node, graph);
1766
+ }
1767
+
1768
+ fclose(outfile);
1769
+
1770
+ }
1771
+
1772
+ Coordinate totalAssemblyLength(Graph * graph)
1773
+ {
1774
+ IDnum index;
1775
+ Node *node;
1776
+ Coordinate total = 0;
1777
+
1778
+ for (index = 1; index <= nodeCount(graph); index++) {
1779
+ node = getNodeInGraph(graph, index);
1780
+ if (node)
1781
+ total += getNodeLength(node);
1782
+ }
1783
+
1784
+ return total;
1785
+ }
1786
+
1787
+ IDnum usedReads(Graph * graph, Coordinate minContigLength)
1788
+ {
1789
+ IDnum res = 0;
1790
+ boolean * used = callocOrExit(sequenceCount(graph) + 1, boolean);
1791
+ IDnum nodeID, readID;
1792
+ Node * node;
1793
+ PassageMarkerI marker;
1794
+ ShortReadMarker * shortReadArray, * shortReadMarker;
1795
+ IDnum shortReadCount, shortReadIndex;
1796
+
1797
+ for(nodeID = 1; nodeID <= nodeCount(graph); nodeID++) {
1798
+ node = getNodeInGraph(graph, nodeID);
1799
+ if (node == NULL || getNodeLength(node) < minContigLength)
1800
+ continue;
1801
+
1802
+ // Long reads
1803
+ for(marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker)) {
1804
+ readID = getPassageMarkerSequenceID(marker);
1805
+ if (readID < 0)
1806
+ readID = -readID;
1807
+ used[readID] = true;
1808
+ }
1809
+
1810
+ // Short reads
1811
+ if (!readStartsAreActivated(graph))
1812
+ continue;
1813
+
1814
+ shortReadArray = getNodeReads(node, graph);
1815
+ shortReadCount = getNodeReadCount(node, graph);
1816
+ for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
1817
+ shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
1818
+ readID = getShortReadMarkerID(shortReadMarker);
1819
+ used[readID] = true;
1820
+ }
1821
+
1822
+ shortReadArray = getNodeReads(getTwinNode(node), graph);
1823
+ shortReadCount = getNodeReadCount(getTwinNode(node), graph);
1824
+ for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
1825
+ shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
1826
+ readID = getShortReadMarkerID(shortReadMarker);
1827
+ used[readID] = true;
1828
+ }
1829
+ }
1830
+
1831
+ for (readID = 1; readID <= sequenceCount(graph); readID++)
1832
+ if (used[readID])
1833
+ res++;
1834
+
1835
+ free(used);
1836
+
1837
+ return res;
1838
+ }
1839
+
1840
+ void logFinalStats(Graph * graph, Coordinate minContigKmerLength, char *directory)
1841
+ {
1842
+ char *logFilename =
1843
+ mallocOrExit(strlen(directory) + 100, char);
1844
+ char *statsLine =
1845
+ mallocOrExit(5000, char);
1846
+ FILE *logFile;
1847
+
1848
+ strcpy(logFilename, directory);
1849
+ strcat(logFilename, "/Log");
1850
+ logFile = fopen(logFilename, "a");
1851
+
1852
+ if (logFile == NULL)
1853
+ exitErrorf(EXIT_FAILURE, true, "Could not write to %s",
1854
+ logFilename);
1855
+
1856
+ sprintf
1857
+ (statsLine, "Final graph has %ld nodes and n50 of %lld, max %lld, total %lld, using %ld/%ld reads\n",
1858
+ (long) nodeCount(graph), (long long) n50(graph), (long long) maxLength(graph),
1859
+ (long long) totalAssemblyLength(graph), (long) usedReads(graph, minContigKmerLength),
1860
+ (long) sequenceCount(graph));
1861
+
1862
+ velvetFprintf(logFile, "%s", statsLine);
1863
+ velvetFprintf(stdout, "%s", statsLine);
1864
+
1865
+ fclose(logFile);
1866
+ free(logFilename);
1867
+ free(statsLine);
1868
+ }
1869
+
1870
+ void exportUnusedReads(Graph* graph, ReadSet * reads, Coordinate minContigKmerLength, char* directory) {
1871
+ char *outFilename =
1872
+ mallocOrExit(strlen(directory) + 100, char);
1873
+ FILE * outfile;
1874
+ boolean * used = callocOrExit(sequenceCount(graph) + 1, boolean);
1875
+ IDnum nodeID, readID;
1876
+ Node * node;
1877
+ PassageMarkerI marker;
1878
+ ShortReadMarker * shortReadArray, * shortReadMarker;
1879
+ IDnum shortReadCount, shortReadIndex;
1880
+
1881
+ strcpy(outFilename, directory);
1882
+ strcat(outFilename, "/UnusedReads.fa");
1883
+ outfile = fopen(outFilename, "w");
1884
+
1885
+ velvetLog("Printing unused reads into %s\n", outFilename);
1886
+
1887
+ for(nodeID = 1; nodeID <= nodeCount(graph); nodeID++) {
1888
+ node = getNodeInGraph(graph, nodeID);
1889
+ if (node == NULL || getNodeLength(node) < minContigKmerLength)
1890
+ continue;
1891
+
1892
+ // Long reads
1893
+ for(marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker)) {
1894
+ readID = getPassageMarkerSequenceID(marker);
1895
+ if (readID < 0)
1896
+ readID = -readID;
1897
+ used[readID] = true;
1898
+ }
1899
+
1900
+ // Short reads
1901
+ if (!readStartsAreActivated(graph))
1902
+ continue;
1903
+
1904
+ shortReadArray = getNodeReads(node, graph);
1905
+ shortReadCount = getNodeReadCount(node, graph);
1906
+ for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
1907
+ shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
1908
+ readID = getShortReadMarkerID(shortReadMarker);
1909
+ used[readID] = true;
1910
+ }
1911
+
1912
+ shortReadArray = getNodeReads(getTwinNode(node), graph);
1913
+ shortReadCount = getNodeReadCount(getTwinNode(node), graph);
1914
+ for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
1915
+ shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
1916
+ readID = getShortReadMarkerID(shortReadMarker);
1917
+ used[readID] = true;
1918
+ }
1919
+ }
1920
+
1921
+ for (readID = 1; readID <= sequenceCount(graph); readID++)
1922
+ if (!used[readID])
1923
+ exportTightString(outfile, getTightStringInArray(reads->tSequences, readID - 1), readID);
1924
+
1925
+ free(outFilename);
1926
+ free(used);
1927
+ fclose(outfile);
1928
+ }
1929
+
1930
+ static IDnum getReferenceCount(ReadSet * reads) {
1931
+ IDnum index;
1932
+
1933
+ for (index = 0; index < reads->readCount; index++)
1934
+ if (reads->categories[index] != REFERENCE)
1935
+ break;
1936
+
1937
+ return index;
1938
+ }
1939
+
1940
+ //////////////////////////////////////////////////////////////////////////
1941
+ // Reference identifiers
1942
+ //////////////////////////////////////////////////////////////////////////
1943
+
1944
+ typedef struct referenceCoord_st ReferenceCoord;
1945
+
1946
+ struct referenceCoord_st {
1947
+ IDnum start;
1948
+ IDnum finish;
1949
+ char * name;
1950
+ boolean positive_strand;
1951
+ } ATTRIBUTE_PACKED;
1952
+
1953
+ static ReferenceCoord * collectReferenceCoords(SequencesReader *seqReadInfo, IDnum referenceCount) {
1954
+ ReferenceCoord * refCoords = callocOrExit(referenceCount, ReferenceCoord);
1955
+ IDnum refIndex = 0;
1956
+ if (seqReadInfo->m_bIsBinary) {
1957
+ velvetLog("Creating placeholder reference headers\n");
1958
+ // binary seqs does not have reference header so a placeholder is created
1959
+ do {
1960
+ refCoords[refIndex].name = callocOrExit(sizeof("PLACEHLDR.%ld PLACEHOLDER000") + 20, char);
1961
+ sprintf(refCoords[refIndex].name, "PLACEHLDR.%ld PLACEHOLDER000", (int64_t) refIndex + 1);
1962
+ refCoords[refIndex].start = 1;
1963
+ refCoords[refIndex].finish = -1;
1964
+ refCoords[refIndex].positive_strand = true;
1965
+ } while (++refIndex < referenceCount);
1966
+ } else {
1967
+ FILE * file = fopen(seqReadInfo->m_seqFilename, "r");
1968
+ char line[MAXLINE];
1969
+ char name[5000];
1970
+ Coordinate start, finish;
1971
+ long long longlongvar;
1972
+ int i;
1973
+
1974
+ while (fgets(line, MAXLINE, file)) {
1975
+ if (line[0] == '>') {
1976
+ if (strchr(line, ':')) {
1977
+ sscanf(strtok(line, ":-\r\n"), ">%s", name);
1978
+ sscanf(strtok(NULL, ":-\r\n"), "%lli", &longlongvar);
1979
+ start = longlongvar;
1980
+ sscanf(strtok(NULL, ":-\r\n"), "%lli", &longlongvar);
1981
+ finish = longlongvar;
1982
+ refCoords[refIndex].name = callocOrExit(strlen(name) + 1, char);
1983
+ if (start <= finish) {
1984
+ strcpy(refCoords[refIndex].name, name);
1985
+ refCoords[refIndex].start = start;
1986
+ refCoords[refIndex].finish = finish;
1987
+ refCoords[refIndex].positive_strand = true;
1988
+ } else {
1989
+ strcpy(refCoords[refIndex].name, name);
1990
+ refCoords[refIndex].start = finish;
1991
+ refCoords[refIndex].finish = start;
1992
+ refCoords[refIndex].positive_strand = false;
1993
+ }
1994
+ } else {
1995
+ for (i = strlen(line) - 1;
1996
+ i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
1997
+ line[i] = '\0';
1998
+ }
1999
+
2000
+ strcpy(name, line + 1);
2001
+ refCoords[refIndex].name = callocOrExit(strlen(name) + 1, char);
2002
+ strcpy(refCoords[refIndex].name, name);
2003
+ refCoords[refIndex].start = 1;
2004
+ refCoords[refIndex].finish = -1;
2005
+ refCoords[refIndex].positive_strand = true;
2006
+ }
2007
+ if (++refIndex == referenceCount)
2008
+ break;
2009
+ }
2010
+ }
2011
+
2012
+ fclose(file);
2013
+ }
2014
+ return refCoords;
2015
+ }
2016
+
2017
+ typedef struct refMap_st {
2018
+ IDnum start;
2019
+ IDnum finish;
2020
+ IDnum refID;
2021
+ IDnum refStart;
2022
+ IDnum refFinish;
2023
+ } ATTRIBUTE_PACKED ReferenceMapping;
2024
+
2025
+ static int compareReferenceMappings(const void * A, const void * B) {
2026
+ ReferenceMapping * refMapA = (ReferenceMapping *) A;
2027
+ ReferenceMapping * refMapB = (ReferenceMapping *) B;
2028
+
2029
+ if (refMapA->start < refMapB->start)
2030
+ return -1;
2031
+ else if (refMapA->start == refMapB->start)
2032
+ return 0;
2033
+ else
2034
+ return 1;
2035
+ }
2036
+
2037
+ static void initializeReferenceMapping(ReferenceMapping * refMap, PassageMarkerI marker, Node * node) {
2038
+ refMap->start = getStartOffset(marker);
2039
+ refMap->finish = getNodeLength(node) - getFinishOffset(marker);
2040
+ refMap->refID = getPassageMarkerSequenceID(marker);
2041
+ refMap->refStart = getPassageMarkerStart(marker);
2042
+ refMap->refFinish = getPassageMarkerFinish(marker);
2043
+ }
2044
+
2045
+ static void velvetFprintfReferenceMapping(FILE * file, ReferenceMapping * mapping, ReferenceCoord * refCoords, int wordLength) {
2046
+ ReferenceCoord * refCoord;
2047
+ Coordinate start, finish;
2048
+
2049
+ if (mapping->refID > 0)
2050
+ refCoord = &refCoords[mapping->refID - 1];
2051
+ else
2052
+ refCoord = &refCoords[-mapping->refID - 1];
2053
+
2054
+ if (mapping->refID > 0) {
2055
+ if (refCoord->positive_strand) {
2056
+ start = refCoord->start + mapping->refStart;
2057
+ finish = refCoord->start + mapping->refFinish + wordLength - 2;
2058
+ } else {
2059
+ start = refCoord->finish - mapping->refStart + wordLength - 1;
2060
+ finish = refCoord->finish - mapping->refFinish + 1;
2061
+ }
2062
+ } else {
2063
+ if (refCoord->positive_strand) {
2064
+ start = refCoord->start + mapping->refStart + wordLength - 1;
2065
+ finish = refCoord->start + mapping->refFinish + 1;
2066
+ } else {
2067
+ start = refCoord->finish - mapping->refStart;
2068
+ finish = refCoord->finish - mapping->refFinish + wordLength;
2069
+ }
2070
+ }
2071
+
2072
+ velvetFprintf(file, "%lli\t%lli\t%s\t%lli\t%lli\n",
2073
+ (long long) mapping->start + 1, (long long) mapping->finish + wordLength - 1,
2074
+ refCoord->name, (long long) start, (long long) finish);
2075
+ }
2076
+
2077
+ static void exportLongNodeMapping(FILE * outfile, Node * node, ReadSet * reads, ReferenceCoord * refCoords, int wordLength) {
2078
+ PassageMarkerI marker;
2079
+ ReferenceMapping * referenceMappings;
2080
+ IDnum index;
2081
+ IDnum referenceCount = 0;
2082
+
2083
+ // Count reference sequences
2084
+ for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker))
2085
+ if (reads->categories[getAbsolutePassMarkerSeqID(marker) - 1] == REFERENCE)
2086
+ referenceCount++;
2087
+
2088
+ // Header
2089
+ velvetFprintf(outfile, ">contig_%li\n", (long) getNodeID(node));
2090
+
2091
+ // Create table
2092
+ referenceMappings = callocOrExit(referenceCount, ReferenceMapping);
2093
+
2094
+ // Initialize table
2095
+ referenceCount = 0;
2096
+ for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker))
2097
+ if (reads->categories[getAbsolutePassMarkerSeqID(marker) - 1] == REFERENCE)
2098
+ initializeReferenceMapping(&referenceMappings[referenceCount++], marker, node);
2099
+
2100
+ // Sort table
2101
+ qsort(referenceMappings, referenceCount, sizeof(ReferenceMapping), compareReferenceMappings);
2102
+
2103
+ // Print table
2104
+ for (index = 0; index < referenceCount; index++)
2105
+ velvetFprintfReferenceMapping(outfile, &referenceMappings[index], refCoords, wordLength);
2106
+
2107
+ // Clean table
2108
+ free(referenceMappings);
2109
+ }
2110
+
2111
+ void exportLongNodeMappings(char *filename, Graph * graph, ReadSet * reads,
2112
+ Coordinate minLength, SequencesReader *seqReadInfo)
2113
+ {
2114
+ FILE * outfile;
2115
+ IDnum nodeIndex, refIndex;
2116
+ Node *node;
2117
+ ReferenceCoord * refCoords;
2118
+ IDnum referenceCount = getReferenceCount(reads);
2119
+
2120
+ if (referenceCount == 0)
2121
+ return;
2122
+
2123
+ refCoords = collectReferenceCoords(seqReadInfo, referenceCount);
2124
+
2125
+ outfile = fopen(filename, "w");
2126
+ if (outfile == NULL) {
2127
+ velvetLog("Could not write into %s, sorry\n", filename);
2128
+ return;
2129
+ } else {
2130
+ velvetLog("Writing contigs into %s...\n", filename);
2131
+ }
2132
+
2133
+ for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
2134
+ node = getNodeInGraph(graph, nodeIndex);
2135
+
2136
+ if (node == NULL || getNodeLength(node) < minLength)
2137
+ continue;
2138
+
2139
+ exportLongNodeMapping(outfile, node, reads, refCoords, getWordLength(graph));
2140
+ }
2141
+
2142
+ for (refIndex = 0; refIndex < referenceCount; refIndex++)
2143
+ free(refCoords[refIndex].name);
2144
+ free(refCoords);
2145
+ fclose(outfile);
2146
+ }
2147
+
2148
+ static void removeLowArcsFromNode(Node* node, Graph * graph, double cutoff) {
2149
+ Arc * arc, * next;
2150
+ if (node == NULL)
2151
+ return;
2152
+
2153
+ for (arc = getArc(node); arc; arc = next) {
2154
+ next = getNextArc(arc);
2155
+ if (getMultiplicity(arc) <= cutoff)
2156
+ destroyArc(arc, graph);
2157
+ }
2158
+ }
2159
+
2160
+ void removeLowArcs(Graph * graph, double cutoff) {
2161
+ velvetLog("Removing single arcs\n");
2162
+ IDnum index;
2163
+ for (index = -nodeCount(graph); index <= nodeCount(graph); index++)
2164
+ removeLowArcsFromNode(getNodeInGraph(graph, index), graph, cutoff);
2165
+
2166
+ concatenateGraph(graph);
2167
+ }