finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,32 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _SHORTREADPAIRS_H_
22
+ #define _SHORTREADPAIRS_H_
23
+
24
+ void exploitShortReadPairs(Graph * graph,
25
+ ReadSet * reads,
26
+ boolean * dubious,
27
+ boolean * shadows,
28
+ boolean force_jumps);
29
+ void handicapNode(Node * node);
30
+ NodeList *getMarkedNodeList();
31
+
32
+ #endif
@@ -0,0 +1,259 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+
24
+ #ifdef _OPENMP
25
+ #include <omp.h>
26
+ #endif
27
+
28
+ #include "globals.h"
29
+ #include "recycleBin.h"
30
+ #include "kmer.h"
31
+ #include "utility.h"
32
+
33
+ #define CHUNKSIZE 10000
34
+
35
+ static RecycleBin *treeMemory = NULL;
36
+
37
+ struct splayNode_st { /* One node of a binary search tree keyed by k-mer */
38
+ Kmer kmer; /* search key; nodes are ordered with compareKmers() */
39
+ Coordinate position; /* payload stored by findOrInsertOccurenceInSplayTree() */
40
+ struct splayNode_st *left; /* subtree holding the smaller keys, or NULL */
41
+ struct splayNode_st *right; /* subtree holding the larger keys, or NULL */
42
+ IDnum seqID; /* payload stored by findOrInsertOccurenceInSplayTree() */
43
+ } ATTRIBUTE_PACKED; /* macro defined outside this file; presumably requests a packed layout - TODO confirm */
44
+
45
+ typedef struct splayNode_st SplayNode; /* alias used when handling one node */
46
+ typedef struct splayNode_st SplayTree; /* same type: a tree is referred to through its root node */
47
+
48
+
#ifdef _OPENMP
/*
 * Creates the recycle bin array that splay nodes are allocated from, sized
 * with the maximum OpenMP thread count.  Does nothing when the array already
 * exists.
 */
void initSplayTreeMemory(void)
{
	const int threadCount = omp_get_max_threads();

	/* Test-and-create is done inside a critical section */
#pragma omp critical
	{
		if (treeMemory == NULL)
			treeMemory = newRecycleBinArray(threadCount,
							sizeof(SplayNode),
							CHUNKSIZE);
	}
}
#endif
60
+
61
+ static SplayNode *allocateSplayNode()
62
+ {
63
+ #ifdef _OPENMP
64
+ #ifdef DEBUG
65
+ if (treeMemory == NULL)
66
+ {
67
+ velvetLog("The memory for splay trees seems uninitialised, "
68
+ "this is probably a bug, aborting.\n");
69
+ abort();
70
+ }
71
+ #endif
72
+ return allocatePointer(getRecycleBinInArray(treeMemory,
73
+ omp_get_thread_num()));
74
+ #else
75
+ if (treeMemory == NULL)
76
+ treeMemory = newRecycleBin(sizeof(SplayNode), CHUNKSIZE);
77
+
78
+ return allocatePointer(treeMemory);
79
+ #endif
80
+ }
81
+
82
+ void destroyAllSplayTrees()
83
+ {
84
+ #ifdef _OPENMP
85
+ destroyRecycleBinArray(treeMemory);
86
+ #else
87
+ destroyRecycleBin(treeMemory);
88
+ #endif
89
+ treeMemory = NULL;
90
+ }
91
+
92
+ /* This function can be called only if K2 has a left child */
93
+ /* Perform a rotate between a node (K2) and its left child */
94
+ /* Update heights, then return new root */
95
+
96
+ static SplayNode *SingleRotateWithLeft(SplayNode * K2)
97
+ {
98
+ SplayNode *K1;
99
+
100
+ K1 = K2->left;
101
+ K2->left = K1->right;
102
+ K1->right = K2;
103
+
104
+ return K1; /* New root */
105
+ }
106
+
107
+ /* This function can be called only if K1 has a right child */
108
+ /* Perform a rotate between a node (K1) and its right child */
109
+ /* Update heights, then return new root */
110
+
111
+ static SplayNode *SingleRotateWithRight(SplayNode * K1)
112
+ {
113
+ SplayNode *K2;
114
+
115
+ K2 = K1->right;
116
+ K1->right = K2->left;
117
+ K2->left = K1;
118
+
119
+ return K2; /* New root */
120
+ }
121
+
122
+ /* Top-down splay procedure, */
123
+ /* not requiring kmer to be in tree */
124
+
125
+ static SplayTree *Splay(Kmer * kmer, SplayTree * T)
126
+ {
127
+ SplayNode Header;
128
+ SplayNode *LeftTreeMax, *RightTreeMin;
129
+
130
+ if (T == NULL)
131
+ return NULL;
132
+
133
+ Header.left = Header.right = NULL;
134
+ LeftTreeMax = RightTreeMin = &Header;
135
+
136
+ while (compareKmers(kmer, &(T->kmer))) {
137
+ if (compareKmers(kmer, &(T->kmer)) < 0) {
138
+ if (T->left == NULL)
139
+ break;
140
+ if (compareKmers(kmer, &(T->left->kmer)) < 0)
141
+ T = SingleRotateWithLeft(T);
142
+ if (T->left == NULL)
143
+ break;
144
+ /* Link right */
145
+ RightTreeMin->left = T;
146
+ RightTreeMin = T;
147
+ T = T->left;
148
+ } else {
149
+ if (T->right == NULL)
150
+ break;
151
+ if (compareKmers(kmer, &(T->right->kmer)) > 0)
152
+ T = SingleRotateWithRight(T);
153
+ if (T->right == NULL)
154
+ break;
155
+ /* Link left */
156
+ LeftTreeMax->right = T;
157
+ LeftTreeMax = T;
158
+ T = T->right;
159
+ }
160
+ } /* while kmer != T->kmer */
161
+
162
+ /* Reassemble */
163
+ LeftTreeMax->right = T->left;
164
+ RightTreeMin->left = T->right;
165
+ T->left = Header.right;
166
+ T->right = Header.left;
167
+
168
+ return T;
169
+ }
170
+
171
+ Kmer * findInTree(Kmer * X, SplayTree ** T)
172
+ {
173
+ *T = Splay(X, *T);
174
+ return &((*T)->kmer);
175
+ }
176
+
177
+ void insertIntoTree(Kmer * kmer, SplayTree ** T)
178
+ {
179
+ SplayNode *newNode;
180
+
181
+ if (*T == NULL) {
182
+ newNode = allocateSplayNode();
183
+ copyKmers(&(newNode->kmer), kmer);
184
+ newNode->left = newNode->right = NULL;
185
+ *T = newNode;
186
+ return;
187
+ }
188
+
189
+ *T = Splay(kmer, *T);
190
+ if (compareKmers(kmer, &((*T)->kmer)) < 0) {
191
+ newNode = allocateSplayNode();
192
+ copyKmers(&(newNode->kmer), kmer);
193
+ newNode->left = (*T)->left;
194
+ newNode->right = *T;
195
+ (*T)->left = NULL;
196
+ *T = newNode;
197
+ } else if (compareKmers(&((*T)->kmer), kmer) < 0) {
198
+ newNode = allocateSplayNode();
199
+ copyKmers(&(newNode->kmer), kmer);
200
+ newNode->right = (*T)->right;
201
+ newNode->left = *T;
202
+ (*T)->right = NULL;
203
+ *T = newNode;
204
+ }
205
+ }
206
+
207
+ boolean
208
+ findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
209
+ Coordinate * position, SplayTree ** T)
210
+ {
211
+ SplayNode *newNode;
212
+
213
+ if (*T == NULL) {
214
+ newNode = allocateSplayNode();
215
+ copyKmers(&(newNode->kmer), kmer);
216
+ newNode->seqID = *seqID;
217
+ newNode->position = *position;
218
+
219
+ newNode->left = newNode->right = NULL;
220
+
221
+ *T = newNode;
222
+
223
+ return false;
224
+ }
225
+
226
+ *T = Splay(kmer, *T);
227
+ if (compareKmers(kmer, &((*T)->kmer)) < 0) {
228
+ newNode = allocateSplayNode();
229
+ copyKmers(&(newNode->kmer), kmer);
230
+ newNode->seqID = *seqID;
231
+ newNode->position = *position;
232
+
233
+ newNode->left = (*T)->left;
234
+ newNode->right = *T;
235
+ (*T)->left = NULL;
236
+
237
+ *T = newNode;
238
+
239
+ return false;
240
+ } else if (compareKmers(kmer, &((*T)->kmer)) > 0) {
241
+ newNode = allocateSplayNode();
242
+ copyKmers(&(newNode->kmer), kmer);
243
+ newNode->seqID = *seqID;
244
+ newNode->position = *position;
245
+
246
+ newNode->right = (*T)->right;
247
+ newNode->left = *T;
248
+ (*T)->right = NULL;
249
+
250
+ *T = newNode;
251
+
252
+ return false;
253
+ } else {
254
+ *seqID = (*T)->seqID;
255
+ *position = (*T)->position;
256
+
257
+ return true;
258
+ }
259
+ }
@@ -0,0 +1,43 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _Splay_H
22
+ #define _Splay_H
23
+
24
+ #include <stdio.h>
25
+
26
+ typedef struct splayNode_st SplayTree;
27
+
28
+ // Deallocates tree memory
29
+ void destroyAllSplayTrees();
30
+
31
+ // Looks up kmer in the tree *T
32
+ // If found, returns true and overwrites *seqID and *position with the stored values
33
+ // If not, returns false after inserting a node holding kmer, *seqID and *position as the new root of *T
34
+ boolean findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
35
+ Coordinate * position,
36
+ SplayTree ** T);
37
+
38
+ #ifdef _OPENMP
39
+ /* Initialises the per-thread RecycleBin array */
40
+ void initSplayTreeMemory(void);
41
+ #endif
42
+
43
+ #endif
@@ -0,0 +1,1315 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <time.h>
25
+ #include <sys/time.h>
26
+
27
+ #ifdef _OPENMP
28
+ #include <omp.h>
29
+ #endif
30
+
31
+ #include "globals.h"
32
+ #include "readSet.h"
33
+ #include "splay.h"
34
+ #include "tightString.h"
35
+ #include "utility.h"
36
+ #include "kmer.h"
37
+ #include "kmerOccurenceTable.h"
38
+ #include "recycleBin.h"
39
+ #include "binarySequences.h"
40
+
41
+ static RecycleBin * maskMemory = NULL;
42
+
43
+ static Mask *allocateMask()
44
+ {
45
+ if (maskMemory == NULL)
46
+ maskMemory = newRecycleBin(sizeof(Mask), 10000);
47
+
48
+ return (Mask *) allocatePointer(maskMemory);
49
+ }
50
+
51
+ static Mask * newMask(Coordinate position)
52
+ {
53
+ Mask * mask = allocateMask();
54
+ mask->start = position;
55
+ mask->finish = position;
56
+ mask->next = NULL;
57
+ return mask;
58
+ }
59
+
60
+ // DEBUG
61
+ boolean debug = false;
62
+
63
+ #define HASH_BUCKETS_NB 16777216
64
+
65
+ #ifdef _OPENMP
66
+
67
+ #define NB_PUSH 32
68
+ #define BUFFER_SIZE 4096
69
+
70
+ static StringBuffer **annotationBuffer = NULL;
71
+ static StringBuffer **annotationBufferW = NULL;
72
+ static int *nbPush = NULL;
73
+ static boolean producing = 1;
74
+
75
+ static void initAnnotationBuffers(void)
76
+ {
77
+ int n;
78
+ int i;
79
+
80
+ n = omp_get_max_threads();
81
+ annotationBuffer = callocOrExit(n, StringBuffer*);
82
+ annotationBufferW = callocOrExit(n, StringBuffer*);
83
+ nbPush = callocOrExit(n, int);
84
+
85
+ for (i = 0; i < n; i++)
86
+ {
87
+ annotationBuffer[i] = newStringBuffer(BUFFER_SIZE);
88
+ annotationBufferW[i] = newStringBuffer(BUFFER_SIZE);
89
+ }
90
+ }
91
+
92
+ static void destroyAnnotationBuffers(void)
93
+ {
94
+ int n;
95
+ int i;
96
+
97
+ n = omp_get_max_threads();
98
+
99
+ for (i = 0; i < n; i++)
100
+ {
101
+ destroyStringBuffer(annotationBuffer[i], 1);
102
+ destroyStringBuffer(annotationBufferW[i], 1);
103
+ }
104
+
105
+ free(annotationBuffer);
106
+ free(annotationBufferW);
107
+ free(nbPush);
108
+ annotationBuffer = NULL;
109
+ annotationBufferW = NULL;
110
+ nbPush = NULL;
111
+ }
112
+
113
+ static void pushBufferCommit(int thread) /* Swaps this thread's fill buffer with its write-side buffer once the latter is empty */
114
+ {
115
+ StringBuffer *tmp;
116
+ char *s;
117
+
118
+ s = annotationBufferW[thread]->str;
119
+ do /* busy-wait until the write-side text starts with '\0'; writeBuffers() resets that buffer after printing it */
120
+ {
121
+ #pragma omp flush(s)
122
+ }
123
+ while (*s);
124
+ tmp = annotationBufferW[thread]; /* exchange the two buffer pointers of this thread */
125
+ annotationBufferW[thread] = annotationBuffer[thread];
126
+ annotationBuffer[thread] = tmp;
127
+ tmp = annotationBufferW[thread]; /* re-read the new write-side pointer before the final flush */
128
+ #pragma omp flush(tmp)
129
+ }
130
+
131
+ static void pushBuffer(int thread)
132
+ {
133
+ if (++nbPush[thread] == NB_PUSH)
134
+ {
135
+ nbPush[thread] = 0;
136
+ pushBufferCommit(thread);
137
+ }
138
+ }
139
+
140
+ static void writeBuffers(FILE *outFile, int nbThreads) /* One pass over the write-side buffers of slots 0..nbThreads-1, printing every non-empty one to outFile */
141
+ {
142
+ int i;
143
+
144
+ for (i = 0; i < nbThreads; i++)
145
+ {
146
+ StringBuffer *b;
147
+ char *s;
148
+
149
+ b = annotationBufferW[i]; /* pointer may have been swapped by pushBufferCommit() */
150
+ #pragma omp flush(b)
151
+ s = b->str;
152
+ #pragma omp flush(s)
153
+ if (*s) /* non-empty text: there is something to write */
154
+ {
155
+ velvetFprintf(outFile, "%s", annotationBufferW[i]->str);
156
+ resetStringBuffer(annotationBufferW[i]); /* pushBufferCommit() waits for the text to start with '\0' again */
157
+ }
158
+ }
159
+ }
160
+
161
+ static void bufferWritter(FILE *outFile)
162
+ {
163
+ int n;
164
+
165
+ n = omp_get_max_threads();
166
+ #pragma omp flush(producing)
167
+ while (producing)
168
+ {
169
+ writeBuffers(outFile, n);
170
+ #pragma omp flush(producing)
171
+ }
172
+ writeBuffers(outFile, n);
173
+ }
174
+
175
+ static void appendLine(char *line, int thread)
176
+ {
177
+ appendStringBuffer(annotationBuffer[thread], line);
178
+ }
179
+ #else
180
+
181
+ #define BUFFER_SIZE 1024
182
+
183
+ StringBuffer *annotationBuffer = NULL;
184
+
185
+ static void appendLine(char *line, int thread)
186
+ {
187
+ appendStringBuffer(annotationBuffer, line);
188
+ }
189
+ #endif
190
+
191
+ struct splayTable_st { /* Hash table of splay trees holding the k-mers seen so far */
192
+ SplayTree **table; /* HASH_BUCKETS_NB tree roots, indexed by hash_kmer() */
193
+ #ifdef _OPENMP
194
+ omp_lock_t *tableLocks; /* one lock per bucket, taken around each tree access */
195
+ #endif
196
+ KmerOccurenceTable *kmerOccurenceTable; /* NULL after newSplayTable(); consulted before the trees when set */
197
+ int WORDLENGTH; /* k-mer length in nucleotides */
198
+ boolean double_strand; /* when set, a k-mer and its reverse word are compared and only one of them is looked up */
199
+ };
200
+
201
+ SplayTable *newSplayTable(int WORDLENGTH, boolean double_strand)
202
+ {
203
+ SplayTable *splayTable = mallocOrExit(1, SplayTable);
204
+ splayTable->WORDLENGTH = WORDLENGTH;
205
+ splayTable->table = callocOrExit(HASH_BUCKETS_NB, SplayTree *);
206
+ splayTable->kmerOccurenceTable = NULL;
207
+ splayTable->double_strand = double_strand;
208
+ #ifdef _OPENMP
209
+ splayTable->tableLocks = mallocOrExit(HASH_BUCKETS_NB, omp_lock_t);
210
+ int i;
211
+ #pragma omp parallel for
212
+ for (i = 0; i < HASH_BUCKETS_NB; i++)
213
+ omp_init_lock(splayTable->tableLocks + i);
214
+ initSplayTreeMemory();
215
+ #endif
216
+ return splayTable;
217
+ }
218
+
219
+ void destroySplayTable(SplayTable * splayTable)
220
+ {
221
+ velvetLog("Destroying splay table\n");
222
+
223
+ destroyAllSplayTrees();
224
+ free(splayTable->table);
225
+ destroyKmerOccurenceTable(splayTable->kmerOccurenceTable);
226
+ free(splayTable);
227
+
228
+ velvetLog("Splay table destroyed\n");
229
+ }
230
+
231
+ static KmerKey hash_kmer(Kmer * kmer) /* Maps a k-mer to a bucket index in [0, HASH_BUCKETS_NB); the variant is chosen at compile time from the KMER_* macros */
232
+ {
233
+ #if KMER_LONGLONGS
234
+ KmerKey key = kmer->longlongs[0];
235
+
236
+ #if KMER_LONGLONGS > 1
237
+ key ^= kmer->longlongs[1]; /* fold the second word into the key */
238
+ #endif
239
+ #if KMER_LONGLONGS > 2
240
+ key ^= kmer->longlongs[2]; /* NOTE(review): words beyond index 2 are not folded in */
241
+ #endif
242
+
243
+ key = (~key) + (key << 21); /* shift/add/xor mixing steps; order and constants must stay as they are */
244
+ key = key ^ (key >> 24);
245
+ key = (key + (key << 3)) + (key << 8);
246
+ key = key ^ (key >> 14);
247
+ key = (key + (key << 2)) + (key << 4);
248
+ key = key ^ (key >> 28);
249
+ key = key + (key << 31);
250
+
251
+ return key % HASH_BUCKETS_NB; /* reduce to a bucket index */
252
+ #elif KMER_LONGS
253
+ KmerKey key = kmer->longs;
254
+
255
+ key += ~(key << 15); /* shorter mixing sequence for the single-word case */
256
+ key ^= (key >> 10);
257
+ key += (key << 3);
258
+ key ^= (key >> 6);
259
+ key += ~(key << 11);
260
+ key ^= (key >> 16);
261
+
262
+ return key % HASH_BUCKETS_NB;
263
+
264
+ #elif KMER_INTS
265
+ return kmer->ints % HASH_BUCKETS_NB; /* no mixing: the value itself is reduced */
266
+ #elif KMER_CHARS
267
+ return kmer->chars % HASH_BUCKETS_NB; /* no mixing: the value itself is reduced */
268
+ #endif
269
+ } /* NOTE(review): if none of the KMER_* macros is non-zero the function has no return statement */
270
+
271
+ static Coordinate getNearestHSPIndex(Coordinate position, IDnum * sequenceIDs, Coordinate sequenceLength) {
272
+ Coordinate back_offset = -1;
273
+ Coordinate front_offset = -1;
274
+
275
+ for (back_offset = 1; position - back_offset > 0; back_offset++)
276
+ if (sequenceIDs[position - back_offset])
277
+ break;
278
+
279
+ for (front_offset = 1; position + front_offset < sequenceLength; front_offset++)
280
+ if (sequenceIDs[position + front_offset])
281
+ break;
282
+
283
+ if (back_offset == position && position + front_offset == sequenceLength)
284
+ return -1;
285
+ else if (back_offset == position)
286
+ return position + front_offset;
287
+ else if (front_offset + position == sequenceLength)
288
+ return position - back_offset;
289
+ else
290
+ return back_offset < front_offset? position - back_offset : position + front_offset;
291
+ }
292
+
293
+ static KmerOccurence * getMostAppropriateHit(Coordinate readCoord, Coordinate readLength, boolean direct, KmerOccurence * kmerOccurence, IDnum mapCount, IDnum * mapSequenceID, Coordinate * mapCoord, int wordLength) {
294
+ KmerOccurence * current;
295
+ KmerOccurence * best = NULL;
296
+ Coordinate expectedPosition;
297
+ Coordinate positionError;
298
+ IDnum mapIndex;
299
+
300
+ // If only one hit
301
+ if (!getNextKmerOccurence(kmerOccurence))
302
+ return kmerOccurence;
303
+
304
+ // If multiple hits by unmapped read
305
+ if (mapCount == 0)
306
+ return NULL;
307
+
308
+ // Compare cases
309
+ for (current = kmerOccurence; current; current = getNextKmerOccurence(current)) {
310
+ for (mapIndex = 0; mapIndex < mapCount; mapIndex++) {
311
+
312
+ // If wrong sequence or unconsistent orientation
313
+ if ((direct && getKmerOccurenceNodeID(current) != mapSequenceID[mapIndex])
314
+ || (!direct && getKmerOccurenceNodeID(current) != -mapSequenceID[mapIndex]))
315
+ continue;
316
+
317
+ // Compute where it is supposed to land on reference
318
+ if (mapSequenceID[mapIndex] < 0)
319
+ expectedPosition = mapCoord[mapIndex] + readLength - readCoord - 1;
320
+ else
321
+ expectedPosition = mapCoord[mapIndex] + readCoord - wordLength + 1;
322
+
323
+ // Compute positional error
324
+ positionError = getKmerOccurencePosition(current) - expectedPosition;
325
+
326
+ // If potential hit record
327
+ if (positionError < 1 && positionError > -1) {
328
+ if (best)
329
+ // If competing hit, give up
330
+ return NULL;
331
+ else
332
+ // Record current hit
333
+ best = current;
334
+ }
335
+ }
336
+ }
337
+
338
+ return best;
339
+ }
340
+
341
+ static inline boolean
342
+ doFindOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
343
+ Coordinate * position, SplayTable *table)
344
+ {
345
+ #ifdef _OPENMP
346
+ const KmerKey kmerHash = hash_kmer(kmer);
347
+ boolean ret;
348
+
349
+ omp_set_lock(table->tableLocks + kmerHash);
350
+ ret = findOrInsertOccurenceInSplayTree(kmer, seqID, position,
351
+ table->table + kmerHash);
352
+ omp_unset_lock(table->tableLocks + kmerHash);
353
+
354
+ return ret;
355
+ #else
356
+ return findOrInsertOccurenceInSplayTree(kmer, seqID, position,
357
+ &table->table[hash_kmer(kmer)]);
358
+ #endif
359
+ }
360
+
361
+
362
+ static boolean findOrInsertOccurenceInSplayTable(Kmer * kmer, IDnum * seqID,
363
+ Coordinate * position,
364
+ SplayTable * table, IDnum * sequenceIDs,
365
+ Coordinate * coords, Coordinate readIndex, Coordinate readLength, boolean direct)
366
+ {
367
+ KmerOccurence * hit;
368
+ Coordinate HSPIndex;
369
+
370
+ // Check if previous anchor
371
+ if (sequenceIDs && sequenceIDs[readIndex]) {
372
+ if (direct)
373
+ *seqID = sequenceIDs[readIndex];
374
+ else
375
+ *seqID = -sequenceIDs[readIndex];
376
+ if (sequenceIDs[readIndex] > 0)
377
+ *position = coords[readIndex] + readIndex;
378
+ else
379
+ *position = coords[readIndex] - readIndex + readLength - 1;
380
+
381
+ return true;
382
+ }
383
+ else if (coords && coords[readIndex])
384
+ // If in buffer zone:
385
+ return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
386
+
387
+
388
+ if (debug)
389
+ abort();
390
+ // Look up first in reference sequence k-mers
391
+ if (table->kmerOccurenceTable
392
+ && (hit = findKmerInKmerOccurenceTable(kmer, table->kmerOccurenceTable))) {
393
+ if (!getNextKmerOccurence(hit)) {
394
+ *seqID = getKmerOccurenceNodeID(hit);
395
+ *position = getKmerOccurencePosition(hit);
396
+ return true;
397
+ } else if ((HSPIndex = getNearestHSPIndex(*position, sequenceIDs, readLength)) > 0) {
398
+ hit = getMostAppropriateHit(readIndex, readLength, direct, hit, 1, &(sequenceIDs[HSPIndex]), &(coords[HSPIndex]), table->WORDLENGTH);
399
+ if (hit) {
400
+ *seqID = getKmerOccurenceNodeID(hit);
401
+ *position = getKmerOccurencePosition(hit);
402
+ return true;
403
+ }
404
+
405
+ }
406
+ }
407
+
408
+ // If not, go through the novel k-mers
409
+ return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
410
+ }
411
+
412
+ static void printAnnotations(IDnum *sequenceIDs, Coordinate * coords,
413
+ TightString * array, SplayTable * table,
414
+ FILE * file, boolean second_in_pair, IDnum seqID)
415
+ {
416
+ Coordinate readNucleotideIndex = 0;
417
+ Coordinate writeNucleotideIndex = 0;
418
+ Kmer word;
419
+ Kmer antiWord;
420
+ boolean annotationClosed = true;
421
+ IDnum sequenceID;
422
+ Coordinate coord;
423
+ boolean found;
424
+ Coordinate position = 0;
425
+ Coordinate start = 0;
426
+ Coordinate finish = 0;
427
+ IDnum referenceSequenceID = 0;
428
+ Nucleotide nucleotide;
429
+ char lineBuffer[MAXLINE];
430
+ TightString * tString = getTightStringInArray(array, seqID - 1);
431
+ int thread = 0;
432
+
433
+ clearKmer(&word);
434
+ clearKmer(&antiWord);
435
+
436
+ #ifdef _OPENMP
437
+ thread = omp_get_thread_num();
438
+ #endif
439
+
440
+ if (debug)
441
+ abort();
442
+
443
+ sprintf(lineBuffer, "ROADMAP %li\n", (long)seqID);
444
+ appendLine(lineBuffer, thread);
445
+
446
+ // Neglect any string shorter than WORDLENGTH :
447
+ if (getLength(tString) < table->WORDLENGTH) {
448
+ #ifdef _OPENMP
449
+ pushBuffer(thread);
450
+ #else
451
+ velvetFprintf(file, "%s", annotationBuffer->str);
452
+ resetStringBuffer(annotationBuffer);
453
+ #endif
454
+ return;
455
+ }
456
+
457
+ // Fill in the initial word :
458
+ for (readNucleotideIndex = 0;
459
+ readNucleotideIndex < table->WORDLENGTH - 1;
460
+ readNucleotideIndex++) {
461
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
462
+ pushNucleotide(&word, nucleotide);
463
+ #ifdef COLOR
464
+ reversePushNucleotide(&antiWord, nucleotide);
465
+ #else
466
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
467
+ #endif
468
+ }
469
+
470
+ while (readNucleotideIndex < getLength(tString)) {
471
+ // Shift word:
472
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
473
+ pushNucleotide(&word, nucleotide);
474
+ #ifdef COLOR
475
+ reversePushNucleotide(&antiWord, nucleotide);
476
+ #else
477
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
478
+ #endif
479
+
480
+ sequenceID = seqID;
481
+ coord = writeNucleotideIndex;
482
+
483
+ if (table->double_strand) {
484
+ if (compareKmers(&word, &antiWord) <= 0) {
485
+ found =
486
+ findOrInsertOccurenceInSplayTable(&word,
487
+ &sequenceID,
488
+ &coord,
489
+ table,
490
+ sequenceIDs,
491
+ coords,
492
+ readNucleotideIndex,
493
+ getLength(tString),
494
+ true);
495
+ } else {
496
+ sequenceID = -sequenceID;
497
+ found =
498
+ findOrInsertOccurenceInSplayTable(&antiWord,
499
+ &sequenceID,
500
+ &coord,
501
+ table,
502
+ sequenceIDs,
503
+ coords,
504
+ readNucleotideIndex,
505
+ getLength(tString),
506
+ false);
507
+ sequenceID = -sequenceID;
508
+ }
509
+ } else {
510
+ if (!second_in_pair) {
511
+ found =
512
+ findOrInsertOccurenceInSplayTable(&word,
513
+ &sequenceID,
514
+ &coord,
515
+ table,
516
+ sequenceIDs,
517
+ coords,
518
+ readNucleotideIndex,
519
+ getLength(tString),
520
+ true);
521
+ } else {
522
+ sequenceID = -sequenceID;
523
+ found =
524
+ findOrInsertOccurenceInSplayTable(&antiWord,
525
+ &sequenceID,
526
+ &coord,
527
+ table,
528
+ sequenceIDs,
529
+ coords,
530
+ readNucleotideIndex,
531
+ getLength(tString),
532
+ false);
533
+ sequenceID = -sequenceID;
534
+ }
535
+ }
536
+
537
+ if (!found) {
538
+ writeNucleotideIndex++;
539
+ if (!annotationClosed) {
540
+ sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
541
+ (long) referenceSequenceID, (long long) position,
542
+ (long long) start, (long long) finish);
543
+ appendLine(lineBuffer, thread);
544
+ }
545
+ annotationClosed = true;
546
+ }
547
+ // Otherwise create/complete annotation:
548
+ else {
549
+ // Forbidden k-mer
550
+ if (sequenceID == 0) {
551
+ break;
552
+ }
553
+ // Closed/inexistant annotation
554
+ else if (annotationClosed) {
555
+ referenceSequenceID = sequenceID;
556
+ position = writeNucleotideIndex;
557
+ start = finish = coord;
558
+
559
+ if (referenceSequenceID > 0)
560
+ finish++;
561
+ else
562
+ finish--;
563
+
564
+ annotationClosed = false;
565
+ }
566
+ // Open annotation
567
+ else if (sequenceID == referenceSequenceID
568
+ && coord == finish) {
569
+ if (referenceSequenceID > 0)
570
+ finish++;
571
+ else
572
+ finish--;
573
+ }
574
+ // Previous non corresponding annotation
575
+ else {
576
+ sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
577
+ (long) referenceSequenceID, (long long) position,
578
+ (long long) start, (long long) finish);
579
+ appendLine(lineBuffer, thread);
580
+
581
+ referenceSequenceID = sequenceID;
582
+ position = writeNucleotideIndex;
583
+ start = finish = coord;
584
+
585
+ if (referenceSequenceID > 0)
586
+ finish++;
587
+ else
588
+ finish--;
589
+ }
590
+ }
591
+
592
+ readNucleotideIndex++;
593
+ }
594
+
595
+ if (!annotationClosed) {
596
+ sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
597
+ (long) referenceSequenceID, (long long) position,
598
+ (long long) start, (long long) finish);
599
+ appendLine(lineBuffer, thread);
600
+ }
601
+ #ifdef _OPENMP
602
+ pushBuffer(thread);
603
+ #else
604
+ velvetFprintf(file, "%s", annotationBuffer->str);
605
+ resetStringBuffer(annotationBuffer);
606
+ #endif
607
+
608
+ return;
609
+ }
610
+
611
+ static void computeClearHSPs(TightString * array, boolean second_in_pair, SplayTable * table, IDnum ** sequenceIDs, Coordinate ** coords, IDnum * mapReferenceIDs, Coordinate * mapCoords, Coordinate mapCount, IDnum seqID) {
612
+ Coordinate readNucleotideIndex = 0;
613
+ Kmer word;
614
+ Kmer antiWord;
615
+ Kmer polyA;
616
+ Nucleotide nucleotide;
617
+ KmerOccurence * hit;
618
+
619
+ int penalty;
620
+ TightString * tString;
621
+ Coordinate length;
622
+
623
+ clearKmer(&polyA);
624
+ tString = getTightStringInArray(array, seqID - 1);
625
+ length = getLength(tString);
626
+ *sequenceIDs = callocOrExit(length, IDnum);
627
+ *coords = callocOrExit(length, Coordinate);
628
+
629
+ // First pass for unambiguous hits
630
+ // Fill in the initial word :
631
+ clearKmer(&word);
632
+ clearKmer(&antiWord);
633
+ for (readNucleotideIndex = 0;
634
+ readNucleotideIndex < table->WORDLENGTH - 1;
635
+ readNucleotideIndex++) {
636
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
637
+ pushNucleotide(&word, nucleotide);
638
+ #ifdef COLOR
639
+ reversePushNucleotide(&antiWord, nucleotide);
640
+ #else
641
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
642
+ #endif
643
+ }
644
+
645
+ // Kill silly poly-T beginnings
646
+ while (readNucleotideIndex < getLength(tString) && (compareKmers(&antiWord, &polyA) == 0 || compareKmers(&word, &polyA) == 0)) {
647
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
648
+ pushNucleotide(&word, nucleotide);
649
+ #ifdef COLOR
650
+ reversePushNucleotide(&antiWord, nucleotide);
651
+ #else
652
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
653
+ #endif
654
+ }
655
+
656
+ while (readNucleotideIndex < getLength(tString)) {
657
+ // Shift word:
658
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
659
+ pushNucleotide(&word, nucleotide);
660
+
661
+ #ifdef COLOR
662
+ reversePushNucleotide(&antiWord, nucleotide);
663
+ #else
664
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
665
+ #endif
666
+
667
+ if (table->double_strand) {
668
+ if (compareKmers(&word, &antiWord) <= 0) {
669
+ hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
670
+
671
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
672
+ (*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
673
+ } else {
674
+ hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
675
+
676
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
677
+ (*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
678
+ }
679
+ } else {
680
+ if (!second_in_pair) {
681
+ hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
682
+
683
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
684
+ (*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
685
+ } else {
686
+ hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
687
+
688
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
689
+ (*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
690
+ }
691
+ }
692
+
693
+ if ((*sequenceIDs)[readNucleotideIndex]) {
694
+ if ((*sequenceIDs)[readNucleotideIndex] > 0)
695
+ (*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) - readNucleotideIndex;
696
+ else
697
+ (*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) + readNucleotideIndex - getLength(tString) + 1;
698
+ }
699
+
700
+ // Barrier to flip-flopping
701
+ if ((*sequenceIDs)[readNucleotideIndex - 1] != 0
702
+ && ((*sequenceIDs)[readNucleotideIndex] != (*sequenceIDs)[readNucleotideIndex - 1]
703
+ || (*coords)[readNucleotideIndex] != (*coords)[readNucleotideIndex - 1])) {
704
+ // Break in continuity... skip k positions
705
+ (*sequenceIDs)[readNucleotideIndex] = 0;
706
+ (*coords)[readNucleotideIndex] = -1;
707
+ readNucleotideIndex++;
708
+
709
+ for (penalty = 0; penalty < table->WORDLENGTH - 1 && readNucleotideIndex < getLength(tString); penalty++) {
710
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
711
+ pushNucleotide(&word, nucleotide);
712
+
713
+ #ifdef COLOR
714
+ reversePushNucleotide(&antiWord, nucleotide);
715
+ #else
716
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
717
+ #endif
718
+ (*sequenceIDs)[readNucleotideIndex] = 0;
719
+ (*coords)[readNucleotideIndex] = -1;
720
+ readNucleotideIndex++;
721
+ }
722
+ } else
723
+ readNucleotideIndex++;
724
+
725
+ }
726
+
727
+ free(mapReferenceIDs);
728
+ free(mapCoords);
729
+ }
730
+
731
+ void inputSequenceIntoSplayTable(TightString * array,
732
+ SplayTable * table,
733
+ FILE * file,
734
+ boolean second_in_pair,
735
+ IDnum * mapReferenceIDs, Coordinate * mapCoords, Coordinate mapCount,
736
+ IDnum seqID)
737
+ {
738
+ IDnum * sequenceIDs = NULL;
739
+ Coordinate * coords = NULL;
740
+
741
+ //debug = (seqID == 29405);
742
+
743
+ // If appropriate, get the HSPs on reference sequences
744
+ if (table->kmerOccurenceTable)
745
+ computeClearHSPs(array, second_in_pair, table, &sequenceIDs, &coords, mapReferenceIDs, mapCoords, mapCount, seqID);
746
+
747
+ // Go through read, eventually with annotations
748
+ printAnnotations(sequenceIDs, coords, array, table, file, second_in_pair, seqID);
749
+
750
+ // Clean up
751
+ if (sequenceIDs) {
752
+ free(sequenceIDs);
753
+ free(coords);
754
+ }
755
+ }
756
+
757
+ void inputReferenceIntoSplayTable(TightString * tString,
758
+ SplayTable * table, FILE * file, IDnum seqID, Mask * mask)
759
+ {
760
+ IDnum currentIndex;
761
+ Coordinate readNucleotideIndex = 0;
762
+ Coordinate kmerIndex = 0;
763
+ Kmer word;
764
+ Kmer antiWord;
765
+ Nucleotide nucleotide;
766
+ Mask * currentMask = mask;
767
+ #ifdef _OPENMP
768
+ char lineBuffer[MAXLINE];
769
+ #endif
770
+
771
+ clearKmer(&word);
772
+ clearKmer(&antiWord);
773
+
774
+ currentIndex = seqID;
775
+ #ifdef _OPENMP
776
+ sprintf(lineBuffer, "ROADMAP %li\n", (long)currentIndex);
777
+ appendLine(lineBuffer, omp_get_thread_num());
778
+ #else
779
+ velvetFprintf(file, "ROADMAP %li\n", (long)currentIndex);
780
+ #endif
781
+
782
+ // Neglect any string shorter than WORDLENGTH :
783
+ if (getLength(tString) < table->WORDLENGTH) {
784
+ return;
785
+ }
786
+
787
+ // Fill in the initial word :
788
+ for (readNucleotideIndex = 0;
789
+ readNucleotideIndex < table->WORDLENGTH - 1;
790
+ readNucleotideIndex++) {
791
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
792
+ pushNucleotide(&word, nucleotide);
793
+ if (table->double_strand) {
794
+ #ifdef COLOR
795
+ reversePushNucleotide(&antiWord, nucleotide);
796
+ #else
797
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
798
+ #endif
799
+ }
800
+ }
801
+
802
+ while (readNucleotideIndex < getLength(tString)) {
803
+ // Shift word:
804
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
805
+ pushNucleotide(&word, nucleotide);
806
+
807
+ if (table->double_strand) {
808
+ #ifdef COLOR
809
+ reversePushNucleotide(&antiWord, nucleotide);
810
+ #else
811
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
812
+ #endif
813
+ }
814
+
815
+ // Check for gap masks:
816
+ if (currentMask && currentMask->start - table->WORDLENGTH + 1 <= readNucleotideIndex) {
817
+ while(currentMask && currentMask->finish + table->WORDLENGTH - 1 < readNucleotideIndex)
818
+ currentMask = currentMask->next;
819
+
820
+ if (currentMask && currentMask->finish + table->WORDLENGTH - 1 >= readNucleotideIndex) {
821
+ readNucleotideIndex++;
822
+ kmerIndex++;
823
+ continue;
824
+ }
825
+ }
826
+
827
+ // Record k-mer
828
+ if (table->double_strand) {
829
+ if (compareKmers(&word, &antiWord) <= 0)
830
+ recordKmerOccurence(&word, currentIndex,
831
+ kmerIndex,
832
+ table->kmerOccurenceTable);
833
+ else
834
+ recordKmerOccurence(&antiWord, -currentIndex,
835
+ kmerIndex,
836
+ table->kmerOccurenceTable);
837
+ } else {
838
+ recordKmerOccurence(&word, currentIndex,
839
+ kmerIndex,
840
+ table->kmerOccurenceTable);
841
+ }
842
+ readNucleotideIndex++;
843
+ kmerIndex++;
844
+ }
845
+
846
+ return;
847
+ }
848
+
849
+ static Coordinate countReferenceKmers(ReadSet * reads, int wordLength) {
850
+ IDnum readIndex;
851
+ Coordinate length = 0;
852
+
853
+
854
+ for (readIndex = 0; readIndex < reads->readCount && reads->categories[readIndex] == REFERENCE; readIndex++)
855
+ {
856
+ Coordinate tmpLength = getLength(getTightStringInArray(reads->tSequences, readIndex));
857
+ if (tmpLength >= wordLength)
858
+ length += tmpLength - wordLength + 1;
859
+ }
860
+
861
+ return length;
862
+ }
863
+
864
+ Mask ** scanReferenceSequences(FILE * file, IDnum referenceSequenceCount) {
865
+ Mask ** referenceMasks = callocOrExit(referenceSequenceCount, Mask*);
866
+ IDnum index;
867
+ char line[MAXLINE];
868
+ char c = '\0';
869
+
870
+ // Search sequences for masks
871
+ for (index = 0; index < referenceSequenceCount; index++) {
872
+ Mask * current = NULL;
873
+ Coordinate position = 0;
874
+ boolean openMask = false;
875
+
876
+ // Read through header
877
+ fgets(line, MAXLINE, file);
878
+
879
+ // Read through sequence
880
+ while ((c = getc(file))) {
881
+ if (c == EOF || c == '>')
882
+ break;
883
+ else if (c == '\r' || c == '\n')
884
+ continue;
885
+ else if (c == 'n' || c == 'N') {
886
+ if (openMask)
887
+ current->finish++;
888
+ else if (referenceMasks[index] == NULL) {
889
+ referenceMasks[index] = newMask(position);
890
+ current = referenceMasks[index];
891
+ } else {
892
+ current->next = newMask(position);
893
+ current = current->next;
894
+ }
895
+ openMask = true;
896
+ position++;
897
+ } else {
898
+ openMask = false;
899
+ position++;
900
+ }
901
+ }
902
+ }
903
+
904
+ if (c != '\0')
905
+ ungetc(c, file);
906
+ return referenceMasks;
907
+ }
908
+
909
+ Mask ** scanBinaryReferenceSequences(SequencesReader *seqReadInfo, IDnum referenceSequenceCount) {
910
+ Mask ** referenceMasks = callocOrExit(referenceSequenceCount, Mask*);
911
+ IDnum index;
912
+ char line[MAXLINE];
913
+ char c = '\0';
914
+ FILE *file = fopen(seqReadInfo->m_namesFilename, "r");
915
+ if (file == NULL) {
916
+ exitErrorf(EXIT_FAILURE, true, "Couldn't read file %s", seqReadInfo->m_namesFilename);
917
+ } else {
918
+ velvetLog("Reading mapping info from %s\n", seqReadInfo->m_namesFilename);
919
+ }
920
+
921
+ // Search sequences for masks
922
+ for (index = 0; index < referenceSequenceCount; index++) {
923
+ Mask * current = NULL;
924
+ long start = 0;
925
+ long finish = 0;
926
+ long number;
927
+ long cat;
928
+
929
+ // Read through header
930
+ if ((c = getc(file)) != '>') {
931
+ exitErrorf(EXIT_FAILURE, false, "names line did not start with >");
932
+ }
933
+ fgets(line, MAXLINE, file);
934
+ sscanf(line, "%*[^\t]\t%li\t%li\n", &number, &cat);
935
+ // ensure is is a ref cat
936
+ if ((IDnum) number != index + 1) {
937
+ exitErrorf(EXIT_FAILURE, false, "sequence %ld != expected %ld", number, (long) index);
938
+ }
939
+ if ((Category) cat != REFERENCE) {
940
+ exitErrorf(EXIT_FAILURE, false, "unexpected category %ld", cat);
941
+ }
942
+
943
+ // Read through the reference maps
944
+ while ((c = getc(file))) {
945
+ if (c == EOF || c == '>') {
946
+ break;
947
+ }
948
+ ungetc(c, file);
949
+ fgets(line, MAXLINE, file);
950
+ sscanf(line, "%li\t%li\n", &start, &finish);
951
+ if (referenceMasks[index] == NULL) {
952
+ referenceMasks[index] = newMask(start);
953
+ referenceMasks[index]->finish = finish;
954
+ current = referenceMasks[index];
955
+ } else {
956
+ current->next = newMask(start);
957
+ current->next->finish = finish;
958
+ current = current->next;
959
+ }
960
+ }
961
+ ungetc(c, file);
962
+ }
963
+
964
+ fclose(file);
965
+ return referenceMasks;
966
+ }
967
+
968
+ void inputSequenceArrayIntoSplayTableAndArchive(ReadSet * reads,
969
+ SplayTable * table,
970
+ char *filename, char* seqFilename)
971
+ {
972
+ IDnum index;
973
+ IDnum sequenceCount = reads->readCount;
974
+ TightString *array;
975
+ FILE *outfile = fopen(filename, "w");
976
+ FILE *seqFile = NULL;
977
+ IDnum kmerCount;
978
+ IDnum referenceSequenceCount = 0;
979
+ struct timeval start, end, diff;
980
+ SequencesReader seqReadInfo;
981
+ memset(&seqReadInfo, 0, sizeof(seqReadInfo));
982
+ if (isCreateBinary()) {
983
+ seqReadInfo.m_bIsBinary = true;
984
+ seqReadInfo.m_pFile = openCnySeqForRead(seqFilename, &seqReadInfo.m_unifiedSeqFileHeader);
985
+ if (!seqReadInfo.m_pFile) {
986
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", seqFilename);
987
+ }
988
+ seqReadInfo.m_namesFilename = mallocOrExit(strlen(seqFilename) + sizeof(".names"), char);
989
+ sprintf(seqReadInfo.m_namesFilename, "%s.names", seqFilename);
990
+ seqReadInfo.m_numCategories = seqReadInfo.m_unifiedSeqFileHeader.m_numCategories;
991
+ seqReadInfo.m_minSeqLen = seqReadInfo.m_unifiedSeqFileHeader.m_minSeqLen;
992
+ seqReadInfo.m_maxSeqLen = seqReadInfo.m_unifiedSeqFileHeader.m_maxSeqLen;
993
+ seqReadInfo.m_bIsRef = false;
994
+ seqReadInfo.m_pReadBuffer = mallocOrExit(USF_READ_BUF_SIZE, uint8_t );
995
+ seqReadInfo.m_pCurrentReadPtr = seqReadInfo.m_pReadBufEnd = 0;
996
+
997
+ resetCnySeqCurrentRead(&seqReadInfo);
998
+ } else {
999
+ seqReadInfo.m_bIsBinary = false;
1000
+ }
1001
+ IDnum ** mapReferenceIDs = NULL;
1002
+ Coordinate ** mapCoords = NULL;
1003
+ Coordinate * mapCount = NULL;
1004
+
1005
+ char line[MAXLINE];
1006
+ char c;
1007
+ IDnum seqID = 0;
1008
+ long long_var;
1009
+ long long longlong_var;
1010
+ Coordinate maxCount = 20;
1011
+ Coordinate counter = 0;
1012
+ // DEBUG
1013
+ Mask ** referenceMasks = NULL;
1014
+
1015
+ if (outfile == NULL)
1016
+ exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", filename);
1017
+ else
1018
+ velvetLog("Writing into roadmap file %s...\n", filename);
1019
+
1020
+ // Count reference sequences
1021
+ for (index = 0; index < reads->readCount && reads->categories[index] == REFERENCE; index++)
1022
+ referenceSequenceCount++;
1023
+
1024
+ velvetFprintf(outfile, "%ld\t%ld\t%i\t%hi\n", (long) sequenceCount, (long) referenceSequenceCount, table->WORDLENGTH, (short) table->double_strand);
1025
+
1026
+ if (reads->tSequences == NULL)
1027
+ convertSequences(reads);
1028
+
1029
+ gettimeofday(&start, NULL);
1030
+ array = reads->tSequences;
1031
+
1032
+ #ifdef _OPENMP
1033
+ if (omp_get_max_threads() == 1)
1034
+ {
1035
+ omp_set_num_threads(2);
1036
+ omp_set_nested(0);
1037
+ }
1038
+ else
1039
+ omp_set_nested(1);
1040
+ initAnnotationBuffers();
1041
+ #else
1042
+ annotationBuffer = newStringBuffer(BUFFER_SIZE);
1043
+ #endif
1044
+
1045
+ if (referenceSequenceCount && (kmerCount = countReferenceKmers(reads, table->WORDLENGTH)) > 0) {
1046
+ table->kmerOccurenceTable = newKmerOccurenceTable(24 , table->WORDLENGTH);
1047
+ allocateKmerOccurences(kmerCount, table->kmerOccurenceTable);
1048
+ if (seqReadInfo.m_bIsBinary) {
1049
+ referenceMasks = scanBinaryReferenceSequences(&seqReadInfo, referenceSequenceCount);
1050
+ // binary seqs have no Ns so just advance past the references
1051
+ for (index = 0; index < referenceSequenceCount; index++) {
1052
+ TightString cmpString;
1053
+ cmpString.length = seqReadInfo.m_currentReadLength;
1054
+ cmpString.sequence = mallocOrExit((seqReadInfo.m_currentReadLength + 3) / 4, uint8_t );
1055
+ getCnySeqNucl(&seqReadInfo, cmpString.sequence);
1056
+ if (seqReadInfo.m_bIsRef) {
1057
+ seqReadInfo.m_refCnt = readCnySeqUint32(&seqReadInfo);
1058
+ // now the next ptr is advanced
1059
+ seqReadInfo.m_pNextReadPtr += (sizeof(RefInfo) * seqReadInfo.m_refCnt);
1060
+ RefInfo refElem;
1061
+ uint32_t refIdx;
1062
+ for (refIdx = 0; refIdx < seqReadInfo.m_refCnt; refIdx++) {
1063
+ // not actually used so just read past refs
1064
+ refElem.m_referenceID = readCnySeqUint32(&seqReadInfo);
1065
+ refElem.m_pos = readCnySeqUint32(&seqReadInfo);
1066
+ }
1067
+ }
1068
+ // optional test to ensure reference mapping seqIDs are in sync
1069
+ #if 0
1070
+ TightString *tString;
1071
+ tString = getTightStringInArray(array, index);
1072
+ if (getLength(tString) != seqReadInfo.m_currentReadLength) {
1073
+ velvetLog("Error: TightString len mismatch, %d != %ld\n", getLength(tString), seqReadInfo.m_currentReadLength);
1074
+ exit(1);
1075
+ }
1076
+ char *str = readTightString(tString);
1077
+ char *cmpStr = readTightString(&cmpString);
1078
+ if (strcmp(str, cmpStr) != 0) {
1079
+ printf("seq %s != cmp %s\n", str, cmpStr);
1080
+ exit(1);
1081
+ }
1082
+ free(str);
1083
+ free(cmpStr);
1084
+ #endif
1085
+ advanceCnySeqCurrentRead(&seqReadInfo);
1086
+ free(cmpString.sequence);
1087
+ }
1088
+ } else {
1089
+ seqFile = fopen(seqFilename, "r");
1090
+
1091
+ if (seqFile == NULL)
1092
+ exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", seqFilename);
1093
+ else
1094
+ velvetLog("Reading mapping info from file %s\n", seqFilename);
1095
+
1096
+ seqReadInfo.m_pFile = seqFile;
1097
+
1098
+ // Skip through reference headers quickly
1099
+ referenceMasks = scanReferenceSequences(seqFile, referenceSequenceCount);
1100
+ }
1101
+
1102
+ #ifdef _OPENMP
1103
+ producing = 1;
1104
+ #pragma omp parallel sections
1105
+ {
1106
+ #pragma omp section
1107
+ {
1108
+ bufferWritter(outfile);
1109
+ }
1110
+ #pragma omp section
1111
+ {
1112
+ #pragma omp parallel for
1113
+ #endif
1114
+ for (index = 0; index < referenceSequenceCount; index++)
1115
+ inputReferenceIntoSplayTable(getTightStringInArray(array, index),
1116
+ table, outfile, index + 1, referenceMasks[index]);
1117
+
1118
+ #ifdef _OPENMP
1119
+ for (index = omp_get_max_threads() - 1; index >= 0; index--)
1120
+ pushBufferCommit(index);
1121
+ producing = 0;
1122
+ #pragma omp flush(producing)
1123
+ }
1124
+ }
1125
+ #endif
1126
+
1127
+ if (maskMemory)
1128
+ destroyRecycleBin(maskMemory);
1129
+ maskMemory = NULL;
1130
+ sortKmerOccurenceTable(table->kmerOccurenceTable);
1131
+ }
1132
+
1133
+ velvetLog("Inputting sequences...\n");
1134
+
1135
+ if (table->kmerOccurenceTable) {
1136
+ mapReferenceIDs = callocOrExit(sequenceCount + 1, IDnum*);
1137
+ mapCoords = callocOrExit(sequenceCount + 1, Coordinate *);
1138
+ mapCount = callocOrExit(sequenceCount + 1, Coordinate);
1139
+
1140
+ RefInfo *refArray = NULL;
1141
+ if (seqReadInfo.m_bIsBinary) {
1142
+ TightString cmpString;
1143
+ for (seqID = referenceSequenceCount + 1; seqID < sequenceCount + 1; seqID++) {
1144
+ cmpString.length = seqReadInfo.m_currentReadLength;
1145
+ cmpString.sequence = mallocOrExit((seqReadInfo.m_currentReadLength + 3) / 4, uint8_t );
1146
+ getCnySeqNucl(&seqReadInfo, cmpString.sequence);
1147
+ if (seqReadInfo.m_bIsRef) {
1148
+ seqReadInfo.m_refCnt = readCnySeqUint32(&seqReadInfo);
1149
+ // now the next ptr is advanced
1150
+ seqReadInfo.m_pNextReadPtr += (sizeof(RefInfo) * seqReadInfo.m_refCnt);
1151
+ refArray = callocOrExit(seqReadInfo.m_refCnt, RefInfo);
1152
+ uint32_t refIdx;
1153
+ for (refIdx = 0; refIdx < seqReadInfo.m_refCnt; refIdx++) {
1154
+ refArray[refIdx].m_referenceID = readCnySeqUint32(&seqReadInfo);
1155
+ refArray[refIdx].m_pos = readCnySeqUint32(&seqReadInfo);
1156
+ }
1157
+ }
1158
+ // optional test to ensure reference mapping seqIDs are in sync
1159
+ #if 0
1160
+ TightString *tString;
1161
+ tString = getTightStringInArray(array, seqID - 1);
1162
+ if (getLength(tString) != seqReadInfo.m_currentReadLength) {
1163
+ velvetLog("Error: TightString len mismatch, %d != %ld\n", getLength(tString), seqReadInfo.m_currentReadLength);
1164
+ exit(1);
1165
+ }
1166
+ char *str = readTightString(tString);
1167
+ char *cmpStr = readTightString(&cmpString);
1168
+ if (strcmp(str, cmpStr) != 0) {
1169
+ printf("seq %s != cmp %s\n", str, cmpStr);
1170
+ exit(1);
1171
+ }
1172
+ free(str);
1173
+ free(cmpStr);
1174
+ #endif
1175
+ free(cmpString.sequence);
1176
+
1177
+ // set prior count
1178
+ mapCount[seqID - 1] = counter;
1179
+ counter = 0;
1180
+ maxCount = 20;
1181
+ mapReferenceIDs[seqID] = callocOrExit(maxCount, IDnum);
1182
+ mapCoords[seqID] = callocOrExit(maxCount, Coordinate);
1183
+
1184
+ if (seqReadInfo.m_bIsRef) {
1185
+ while (counter < seqReadInfo.m_refCnt) {
1186
+ mapReferenceIDs[seqID][counter] = (IDnum) refArray[counter].m_referenceID;
1187
+ mapCoords[seqID][counter] = (Coordinate) refArray[counter].m_pos;
1188
+
1189
+ if (++counter == maxCount) {
1190
+ maxCount *= 2;
1191
+ mapReferenceIDs[seqID] = reallocOrExit(mapReferenceIDs[seqID], maxCount, IDnum);
1192
+ mapCoords[seqID] = reallocOrExit(mapCoords[seqID], maxCount, Coordinate);
1193
+ }
1194
+ }
1195
+ free(refArray);
1196
+ }
1197
+ advanceCnySeqCurrentRead(&seqReadInfo);
1198
+ }
1199
+ } else {
1200
+ // Parse file for mapping info
1201
+ while (seqFile && (c = getc(seqFile)) != EOF) {
1202
+
1203
+ if (c == '>') {
1204
+ mapCount[seqID] = counter;
1205
+ counter = 0;
1206
+ maxCount = 20;
1207
+ fgets(line, MAXLINE, seqFile);
1208
+ sscanf(line,"%*[^\t]\t%li\t", &long_var);
1209
+ seqID = (IDnum) long_var;
1210
+ mapReferenceIDs[seqID] = callocOrExit(maxCount, IDnum);
1211
+ mapCoords[seqID] = callocOrExit(maxCount, Coordinate);
1212
+ } else if (c == 'M') {
1213
+ fgets(line, MAXLINE, seqFile);
1214
+ sscanf(line,"\t%li\t%lli\n", &long_var, &longlong_var);
1215
+ mapReferenceIDs[seqID][counter] = (IDnum) long_var;
1216
+ mapCoords[seqID][counter] = (Coordinate) longlong_var;
1217
+
1218
+ if (++counter == maxCount) {
1219
+ maxCount *= 2;
1220
+ mapReferenceIDs[seqID] = reallocOrExit(mapReferenceIDs[seqID], maxCount, IDnum);
1221
+ mapCoords[seqID] = reallocOrExit(mapCoords[seqID], maxCount, Coordinate);
1222
+ }
1223
+ }
1224
+ }
1225
+ }
1226
+ }
1227
+ if (seqFile)
1228
+ fclose(seqFile);
1229
+
1230
+ if (seqReadInfo.m_bIsBinary) {
1231
+ if (seqReadInfo.m_pReadBuffer) {
1232
+ free(seqReadInfo.m_pReadBuffer);
1233
+ }
1234
+ fclose(seqReadInfo.m_pFile);
1235
+ }
1236
+
1237
+ #ifdef _OPENMP
1238
+ producing = 1;
1239
+ #pragma omp parallel sections
1240
+ {
1241
+ #pragma omp section
1242
+ {
1243
+ bufferWritter(outfile);
1244
+ }
1245
+ #pragma omp section
1246
+ {
1247
+ #pragma omp parallel for
1248
+ #endif
1249
+ for (index = referenceSequenceCount; index < sequenceCount; index++)
1250
+ {
1251
+ boolean second_in_pair;
1252
+
1253
+ // Progress report on screen
1254
+ if (index % 1000000 == 0) {
1255
+ velvetLog("Inputting sequence %li / %li\n",
1256
+ (long)index, (long)sequenceCount);
1257
+ fflush(stdout);
1258
+ }
1259
+
1260
+ // Test to make sure that all the reference reads are before all the other reads
1261
+ if (reads->categories[index] == REFERENCE) {
1262
+ velvetLog("Reference sequence placed after a non-reference read!\n");
1263
+ velvetLog(">> Please re-order the filenames in your command line so as "
1264
+ "to have the reference sequence files before all the others\n");
1265
+ #ifdef DEBUG
1266
+ abort();
1267
+ #endif
1268
+ exit(0);
1269
+ }
1270
+ second_in_pair = reads->categories[index] % 2 && isSecondInPair(reads, index);
1271
+
1272
+ // Hashing the reads
1273
+ if (table->kmerOccurenceTable)
1274
+ inputSequenceIntoSplayTable(array, table,
1275
+ outfile,
1276
+ second_in_pair, mapReferenceIDs[index + 1], mapCoords[index+1], mapCount[index+1], index + 1);
1277
+ else
1278
+ inputSequenceIntoSplayTable(array, table,
1279
+ outfile,
1280
+ second_in_pair, NULL, NULL, 0, index + 1);
1281
+ }
1282
+ #ifdef _OPENMP
1283
+ for (index = omp_get_max_threads() - 1; index >= 0; index--)
1284
+ pushBufferCommit(index);
1285
+ producing = 0;
1286
+ #pragma omp flush(producing)
1287
+ }
1288
+ }
1289
+ destroyAnnotationBuffers();
1290
+ #else
1291
+ destroyStringBuffer(annotationBuffer, 1);
1292
+ #endif
1293
+
1294
+ gettimeofday(&end, NULL);
1295
+ timersub(&end, &start, &diff);
1296
+ velvetLog(" === Sequences loaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1297
+
1298
+ fclose(outfile);
1299
+
1300
+ if (mapReferenceIDs) {
1301
+ free(mapReferenceIDs);
1302
+ free(mapCoords);
1303
+ free(mapCount);
1304
+ }
1305
+ if (referenceMasks) {
1306
+ free(referenceMasks);
1307
+ }
1308
+ if (seqReadInfo.m_namesFilename) {
1309
+ free(seqReadInfo.m_namesFilename);
1310
+ }
1311
+ //free(reads->tSequences);
1312
+ //reads->tSequences = NULL;
1313
+ //destroyReadSet(reads);
1314
+ velvetLog("Done inputting sequences\n");
1315
+ }