finishm 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (554) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,32 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _SHORTREADPAIRS_H_
22
+ #define _SHORTREADPAIRS_H_
23
+
24
+ void exploitShortReadPairs(Graph * graph,
25
+ ReadSet * reads,
26
+ boolean * dubious,
27
+ boolean * shadows,
28
+ boolean force_jumps);
29
+ void handicapNode(Node * node);
30
+ NodeList *getMarkedNodeList();
31
+
32
+ #endif
@@ -0,0 +1,259 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+
24
+ #ifdef _OPENMP
25
+ #include <omp.h>
26
+ #endif
27
+
28
+ #include "globals.h"
29
+ #include "recycleBin.h"
30
+ #include "kmer.h"
31
+ #include "utility.h"
32
+
33
+ #define CHUNKSIZE 10000
34
+
35
+ static RecycleBin *treeMemory = NULL;
36
+
37
/*
 * One node of a splay tree keyed by k-mer.
 * Besides the key and the two child links, each node stores the
 * (seqID, position) pair recorded for the k-mer when it was inserted by
 * findOrInsertOccurenceInSplayTree().
 * The structure is declared with ATTRIBUTE_PACKED, so the field order
 * below is part of its layout.
 */
+ struct splayNode_st {
38
+ Kmer kmer;
39
+ Coordinate position;
40
+ struct splayNode_st *left;
41
+ struct splayNode_st *right;
42
+ IDnum seqID;
43
+ } ATTRIBUTE_PACKED;
44
+
45
/* A tree is referred to through a pointer to its root node, hence the two aliases. */
+ typedef struct splayNode_st SplayNode;
46
+ typedef struct splayNode_st SplayTree;
47
+
48
+
49
+ #ifdef _OPENMP
50
+ void initSplayTreeMemory(void)
51
+ {
52
+ int n;
53
+
54
+ n = omp_get_max_threads();
55
+ #pragma omp critical
56
+ if (treeMemory == NULL)
57
+ treeMemory = newRecycleBinArray(n, sizeof(SplayNode), CHUNKSIZE);
58
+ }
59
+ #endif
60
+
61
+ static SplayNode *allocateSplayNode()
62
+ {
63
+ #ifdef _OPENMP
64
+ #ifdef DEBUG
65
+ if (treeMemory == NULL)
66
+ {
67
+ velvetLog("The memory for splay trees seems uninitialised, "
68
+ "this is probably a bug, aborting.\n");
69
+ abort();
70
+ }
71
+ #endif
72
+ return allocatePointer(getRecycleBinInArray(treeMemory,
73
+ omp_get_thread_num()));
74
+ #else
75
+ if (treeMemory == NULL)
76
+ treeMemory = newRecycleBin(sizeof(SplayNode), CHUNKSIZE);
77
+
78
+ return allocatePointer(treeMemory);
79
+ #endif
80
+ }
81
+
82
+ void destroyAllSplayTrees()
83
+ {
84
+ #ifdef _OPENMP
85
+ destroyRecycleBinArray(treeMemory);
86
+ #else
87
+ destroyRecycleBin(treeMemory);
88
+ #endif
89
+ treeMemory = NULL;
90
+ }
91
+
92
+ /* This function can be called only if K2 has a left child */
93
+ /* Perform a rotate between a node (K2) and its left child */
94
+ /* Update heights, then return new root */
95
+
96
+ static SplayNode *SingleRotateWithLeft(SplayNode * K2)
97
+ {
98
+ SplayNode *K1;
99
+
100
+ K1 = K2->left;
101
+ K2->left = K1->right;
102
+ K1->right = K2;
103
+
104
+ return K1; /* New root */
105
+ }
106
+
107
+ /* This function can be called only if K1 has a right child */
108
+ /* Perform a rotate between a node (K1) and its right child */
109
+ /* Update heights, then return new root */
110
+
111
+ static SplayNode *SingleRotateWithRight(SplayNode * K1)
112
+ {
113
+ SplayNode *K2;
114
+
115
+ K2 = K1->right;
116
+ K1->right = K2->left;
117
+ K2->left = K1;
118
+
119
+ return K2; /* New root */
120
+ }
121
+
122
+ /* Top-down splay procedure, */
123
+ /* not requiring kmer to be in tree */
124
+
125
+ static SplayTree *Splay(Kmer * kmer, SplayTree * T)
126
+ {
127
+ SplayNode Header;
128
+ SplayNode *LeftTreeMax, *RightTreeMin;
129
+
130
+ if (T == NULL)
131
+ return NULL;
132
+
133
+ Header.left = Header.right = NULL;
134
+ LeftTreeMax = RightTreeMin = &Header;
135
+
136
+ while (compareKmers(kmer, &(T->kmer))) {
137
+ if (compareKmers(kmer, &(T->kmer)) < 0) {
138
+ if (T->left == NULL)
139
+ break;
140
+ if (compareKmers(kmer, &(T->left->kmer)) < 0)
141
+ T = SingleRotateWithLeft(T);
142
+ if (T->left == NULL)
143
+ break;
144
+ /* Link right */
145
+ RightTreeMin->left = T;
146
+ RightTreeMin = T;
147
+ T = T->left;
148
+ } else {
149
+ if (T->right == NULL)
150
+ break;
151
+ if (compareKmers(kmer, &(T->right->kmer)) > 0)
152
+ T = SingleRotateWithRight(T);
153
+ if (T->right == NULL)
154
+ break;
155
+ /* Link left */
156
+ LeftTreeMax->right = T;
157
+ LeftTreeMax = T;
158
+ T = T->right;
159
+ }
160
+ } /* while kmer != T->kmer */
161
+
162
+ /* Reassemble */
163
+ LeftTreeMax->right = T->left;
164
+ RightTreeMin->left = T->right;
165
+ T->left = Header.right;
166
+ T->right = Header.left;
167
+
168
+ return T;
169
+ }
170
+
171
+ Kmer * findInTree(Kmer * X, SplayTree ** T)
172
+ {
173
+ *T = Splay(X, *T);
174
+ return &((*T)->kmer);
175
+ }
176
+
177
+ void insertIntoTree(Kmer * kmer, SplayTree ** T)
178
+ {
179
+ SplayNode *newNode;
180
+
181
+ if (*T == NULL) {
182
+ newNode = allocateSplayNode();
183
+ copyKmers(&(newNode->kmer), kmer);
184
+ newNode->left = newNode->right = NULL;
185
+ *T = newNode;
186
+ return;
187
+ }
188
+
189
+ *T = Splay(kmer, *T);
190
+ if (compareKmers(kmer, &((*T)->kmer)) < 0) {
191
+ newNode = allocateSplayNode();
192
+ copyKmers(&(newNode->kmer), kmer);
193
+ newNode->left = (*T)->left;
194
+ newNode->right = *T;
195
+ (*T)->left = NULL;
196
+ *T = newNode;
197
+ } else if (compareKmers(&((*T)->kmer), kmer) < 0) {
198
+ newNode = allocateSplayNode();
199
+ copyKmers(&(newNode->kmer), kmer);
200
+ newNode->right = (*T)->right;
201
+ newNode->left = *T;
202
+ (*T)->right = NULL;
203
+ *T = newNode;
204
+ }
205
+ }
206
+
207
+ boolean
208
+ findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
209
+ Coordinate * position, SplayTree ** T)
210
+ {
211
+ SplayNode *newNode;
212
+
213
+ if (*T == NULL) {
214
+ newNode = allocateSplayNode();
215
+ copyKmers(&(newNode->kmer), kmer);
216
+ newNode->seqID = *seqID;
217
+ newNode->position = *position;
218
+
219
+ newNode->left = newNode->right = NULL;
220
+
221
+ *T = newNode;
222
+
223
+ return false;
224
+ }
225
+
226
+ *T = Splay(kmer, *T);
227
+ if (compareKmers(kmer, &((*T)->kmer)) < 0) {
228
+ newNode = allocateSplayNode();
229
+ copyKmers(&(newNode->kmer), kmer);
230
+ newNode->seqID = *seqID;
231
+ newNode->position = *position;
232
+
233
+ newNode->left = (*T)->left;
234
+ newNode->right = *T;
235
+ (*T)->left = NULL;
236
+
237
+ *T = newNode;
238
+
239
+ return false;
240
+ } else if (compareKmers(kmer, &((*T)->kmer)) > 0) {
241
+ newNode = allocateSplayNode();
242
+ copyKmers(&(newNode->kmer), kmer);
243
+ newNode->seqID = *seqID;
244
+ newNode->position = *position;
245
+
246
+ newNode->right = (*T)->right;
247
+ newNode->left = *T;
248
+ (*T)->right = NULL;
249
+
250
+ *T = newNode;
251
+
252
+ return false;
253
+ } else {
254
+ *seqID = (*T)->seqID;
255
+ *position = (*T)->position;
256
+
257
+ return true;
258
+ }
259
+ }
@@ -0,0 +1,43 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _Splay_H
22
+ #define _Splay_H
23
+
24
+ #include <stdio.h>
25
+
26
+ typedef struct splayNode_st SplayTree;
27
+
28
+ // Deallocates tree memory
29
+ void destroyAllSplayTrees();
30
+
31
+ // Finds occurrence of kmer in the tree
32
+ // If found, returns TRUE, and seqID and coordinate are accordingly modified
33
+ // If not, a new leaf is added to the tree, with the seqID and position data
34
+ boolean findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
35
+ Coordinate * position,
36
+ SplayTree ** T);
37
+
38
+ #ifdef _OPENMP
39
+ /* Initialises the per-thread RecycleBin array */
40
+ void initSplayTreeMemory(void);
41
+ #endif
42
+
43
+ #endif
@@ -0,0 +1,1315 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <time.h>
25
+ #include <sys/time.h>
26
+
27
+ #ifdef _OPENMP
28
+ #include <omp.h>
29
+ #endif
30
+
31
+ #include "globals.h"
32
+ #include "readSet.h"
33
+ #include "splay.h"
34
+ #include "tightString.h"
35
+ #include "utility.h"
36
+ #include "kmer.h"
37
+ #include "kmerOccurenceTable.h"
38
+ #include "recycleBin.h"
39
+ #include "binarySequences.h"
40
+
41
+ static RecycleBin * maskMemory = NULL;
42
+
43
+ static Mask *allocateMask()
44
+ {
45
+ if (maskMemory == NULL)
46
+ maskMemory = newRecycleBin(sizeof(Mask), 10000);
47
+
48
+ return (Mask *) allocatePointer(maskMemory);
49
+ }
50
+
51
+ static Mask * newMask(Coordinate position)
52
+ {
53
+ Mask * mask = allocateMask();
54
+ mask->start = position;
55
+ mask->finish = position;
56
+ mask->next = NULL;
57
+ return mask;
58
+ }
59
+
60
+ // DEBUG
61
+ boolean debug = false;
62
+
63
+ #define HASH_BUCKETS_NB 16777216
64
+
65
+ #ifdef _OPENMP
66
+
67
+ #define NB_PUSH 32
68
+ #define BUFFER_SIZE 4096
69
+
70
+ static StringBuffer **annotationBuffer = NULL;
71
+ static StringBuffer **annotationBufferW = NULL;
72
+ static int *nbPush = NULL;
73
+ static boolean producing = 1;
74
+
75
+ static void initAnnotationBuffers(void)
76
+ {
77
+ int n;
78
+ int i;
79
+
80
+ n = omp_get_max_threads();
81
+ annotationBuffer = callocOrExit(n, StringBuffer*);
82
+ annotationBufferW = callocOrExit(n, StringBuffer*);
83
+ nbPush = callocOrExit(n, int);
84
+
85
+ for (i = 0; i < n; i++)
86
+ {
87
+ annotationBuffer[i] = newStringBuffer(BUFFER_SIZE);
88
+ annotationBufferW[i] = newStringBuffer(BUFFER_SIZE);
89
+ }
90
+ }
91
+
92
+ static void destroyAnnotationBuffers(void)
93
+ {
94
+ int n;
95
+ int i;
96
+
97
+ n = omp_get_max_threads();
98
+
99
+ for (i = 0; i < n; i++)
100
+ {
101
+ destroyStringBuffer(annotationBuffer[i], 1);
102
+ destroyStringBuffer(annotationBufferW[i], 1);
103
+ }
104
+
105
+ free(annotationBuffer);
106
+ free(annotationBufferW);
107
+ free(nbPush);
108
+ annotationBuffer = NULL;
109
+ annotationBufferW = NULL;
110
+ nbPush = NULL;
111
+ }
112
+
113
/*
 * Hands the text accumulated by `thread` over to the writer side.
 *
 * Busy-waits until the first character of this thread's write-side buffer
 * (annotationBufferW[thread]) reads as NUL, then swaps the fill-side and
 * write-side buffer pointers for that thread.
 * NOTE(review): the wait presumably ends when writeBuffers() has emitted and
 * reset the previous content - this relies on resetStringBuffer() leaving an
 * empty string; confirm against its implementation.
 */
+ static void pushBufferCommit(int thread)
114
+ {
115
+ StringBuffer *tmp;
116
+ char *s;
117
+
118
+ s = annotationBufferW[thread]->str;
119
/* Spin, re-reading memory each time, while the write-side buffer is non-empty */
+ do
120
+ {
121
+ #pragma omp flush(s)
122
+ }
123
+ while (*s);
124
/* Swap the two buffers of this thread */
+ tmp = annotationBufferW[thread];
125
+ annotationBufferW[thread] = annotationBuffer[thread];
126
+ annotationBuffer[thread] = tmp;
127
+ tmp = annotationBufferW[thread];
128
+ #pragma omp flush(tmp)
129
+ }
130
+
131
+ static void pushBuffer(int thread)
132
+ {
133
+ if (++nbPush[thread] == NB_PUSH)
134
+ {
135
+ nbPush[thread] = 0;
136
+ pushBufferCommit(thread);
137
+ }
138
+ }
139
+
140
/*
 * Makes one pass over the write-side buffers of threads 0..nbThreads-1:
 * each buffer whose first character is not NUL is printed to outFile and
 * then reset with resetStringBuffer().
 * Buffers that read as empty are skipped.
 */
+ static void writeBuffers(FILE *outFile, int nbThreads)
141
+ {
142
+ int i;
143
+
144
+ for (i = 0; i < nbThreads; i++)
145
+ {
146
+ StringBuffer *b;
147
+ char *s;
148
+
149
/* Flush before each read so the values come from memory */
+ b = annotationBufferW[i];
150
+ #pragma omp flush(b)
151
+ s = b->str;
152
+ #pragma omp flush(s)
153
+ if (*s)
154
+ {
155
+ velvetFprintf(outFile, "%s", annotationBufferW[i]->str);
156
+ resetStringBuffer(annotationBufferW[i]);
157
+ }
158
+ }
159
+ }
160
+
161
/*
 * Writer loop: keeps draining the write-side buffers of all
 * omp_get_max_threads() threads into outFile for as long as the shared flag
 * `producing` is non-zero, then performs one last pass after the flag drops.
 * The loop polls continuously (no sleep or blocking wait).
 */
+ static void bufferWritter(FILE *outFile)
162
+ {
163
+ int n;
164
+
165
+ n = omp_get_max_threads();
166
+ #pragma omp flush(producing)
167
+ while (producing)
168
+ {
169
+ writeBuffers(outFile, n);
170
+ #pragma omp flush(producing)
171
+ }
172
/* Final pass for anything committed before `producing` was cleared */
+ writeBuffers(outFile, n);
173
+ }
174
+
175
/* OpenMP build: appends `line` to the fill-side buffer owned by `thread`. */
+ static void appendLine(char *line, int thread)
176
+ {
177
+ appendStringBuffer(annotationBuffer[thread], line);
178
+ }
179
+ #else
180
+
181
+ #define BUFFER_SIZE 1024
182
+
183
+ StringBuffer *annotationBuffer = NULL;
184
+
185
/*
 * Serial build: appends `line` to the single annotationBuffer.
 * `thread` is ignored; it is kept so both builds share one call signature.
 */
+ static void appendLine(char *line, int thread)
186
+ {
187
+ appendStringBuffer(annotationBuffer, line);
188
+ }
189
+ #endif
190
+
191
/*
 * Hash table of splay trees indexed by hash_kmer().
 * newSplayTable() allocates HASH_BUCKETS_NB entries for `table` (and, with
 * OpenMP, one lock per entry in `tableLocks`).
 */
+ struct splayTable_st {
192
/* One splay tree root per hash bucket (NULL when the bucket is empty) */
+ SplayTree **table;
193
+ #ifdef _OPENMP
194
/* tableLocks[i] is taken around every access to table[i] */
+ omp_lock_t *tableLocks;
195
+ #endif
196
/* Optional table of reference k-mers, consulted before the splay trees; may be NULL */
+ KmerOccurenceTable *kmerOccurenceTable;
197
/* k-mer length used when scanning sequences */
+ int WORDLENGTH;
198
/* When set, each k-mer is compared with its reverse and the smaller of the two is looked up */
+ boolean double_strand;
199
+ };
200
+
201
+ SplayTable *newSplayTable(int WORDLENGTH, boolean double_strand)
202
+ {
203
+ SplayTable *splayTable = mallocOrExit(1, SplayTable);
204
+ splayTable->WORDLENGTH = WORDLENGTH;
205
+ splayTable->table = callocOrExit(HASH_BUCKETS_NB, SplayTree *);
206
+ splayTable->kmerOccurenceTable = NULL;
207
+ splayTable->double_strand = double_strand;
208
+ #ifdef _OPENMP
209
+ splayTable->tableLocks = mallocOrExit(HASH_BUCKETS_NB, omp_lock_t);
210
+ int i;
211
+ #pragma omp parallel for
212
+ for (i = 0; i < HASH_BUCKETS_NB; i++)
213
+ omp_init_lock(splayTable->tableLocks + i);
214
+ initSplayTreeMemory();
215
+ #endif
216
+ return splayTable;
217
+ }
218
+
219
+ void destroySplayTable(SplayTable * splayTable)
220
+ {
221
+ velvetLog("Destroying splay table\n");
222
+
223
+ destroyAllSplayTrees();
224
+ free(splayTable->table);
225
+ destroyKmerOccurenceTable(splayTable->kmerOccurenceTable);
226
+ free(splayTable);
227
+
228
+ velvetLog("Splay table destroyed\n");
229
+ }
230
+
231
+ static KmerKey hash_kmer(Kmer * kmer)
232
+ {
233
+ #if KMER_LONGLONGS
234
+ KmerKey key = kmer->longlongs[0];
235
+
236
+ #if KMER_LONGLONGS > 1
237
+ key ^= kmer->longlongs[1];
238
+ #endif
239
+ #if KMER_LONGLONGS > 2
240
+ key ^= kmer->longlongs[2];
241
+ #endif
242
+
243
+ key = (~key) + (key << 21);
244
+ key = key ^ (key >> 24);
245
+ key = (key + (key << 3)) + (key << 8);
246
+ key = key ^ (key >> 14);
247
+ key = (key + (key << 2)) + (key << 4);
248
+ key = key ^ (key >> 28);
249
+ key = key + (key << 31);
250
+
251
+ return key % HASH_BUCKETS_NB;
252
+ #elif KMER_LONGS
253
+ KmerKey key = kmer->longs;
254
+
255
+ key += ~(key << 15);
256
+ key ^= (key >> 10);
257
+ key += (key << 3);
258
+ key ^= (key >> 6);
259
+ key += ~(key << 11);
260
+ key ^= (key >> 16);
261
+
262
+ return key % HASH_BUCKETS_NB;
263
+
264
+ #elif KMER_INTS
265
+ return kmer->ints % HASH_BUCKETS_NB;
266
+ #elif KMER_CHARS
267
+ return kmer->chars % HASH_BUCKETS_NB;
268
+ #endif
269
+ }
270
+
271
+ static Coordinate getNearestHSPIndex(Coordinate position, IDnum * sequenceIDs, Coordinate sequenceLength) {
272
+ Coordinate back_offset = -1;
273
+ Coordinate front_offset = -1;
274
+
275
+ for (back_offset = 1; position - back_offset > 0; back_offset++)
276
+ if (sequenceIDs[position - back_offset])
277
+ break;
278
+
279
+ for (front_offset = 1; position + front_offset < sequenceLength; front_offset++)
280
+ if (sequenceIDs[position + front_offset])
281
+ break;
282
+
283
+ if (back_offset == position && position + front_offset == sequenceLength)
284
+ return -1;
285
+ else if (back_offset == position)
286
+ return position + front_offset;
287
+ else if (front_offset + position == sequenceLength)
288
+ return position - back_offset;
289
+ else
290
+ return back_offset < front_offset? position - back_offset : position + front_offset;
291
+ }
292
+
293
+ static KmerOccurence * getMostAppropriateHit(Coordinate readCoord, Coordinate readLength, boolean direct, KmerOccurence * kmerOccurence, IDnum mapCount, IDnum * mapSequenceID, Coordinate * mapCoord, int wordLength) {
294
+ KmerOccurence * current;
295
+ KmerOccurence * best = NULL;
296
+ Coordinate expectedPosition;
297
+ Coordinate positionError;
298
+ IDnum mapIndex;
299
+
300
+ // If only one hit
301
+ if (!getNextKmerOccurence(kmerOccurence))
302
+ return kmerOccurence;
303
+
304
+ // If multiple hits by unmapped read
305
+ if (mapCount == 0)
306
+ return NULL;
307
+
308
+ // Compare cases
309
+ for (current = kmerOccurence; current; current = getNextKmerOccurence(current)) {
310
+ for (mapIndex = 0; mapIndex < mapCount; mapIndex++) {
311
+
312
+ // If wrong sequence or unconsistent orientation
313
+ if ((direct && getKmerOccurenceNodeID(current) != mapSequenceID[mapIndex])
314
+ || (!direct && getKmerOccurenceNodeID(current) != -mapSequenceID[mapIndex]))
315
+ continue;
316
+
317
+ // Compute where it is supposed to land on reference
318
+ if (mapSequenceID[mapIndex] < 0)
319
+ expectedPosition = mapCoord[mapIndex] + readLength - readCoord - 1;
320
+ else
321
+ expectedPosition = mapCoord[mapIndex] + readCoord - wordLength + 1;
322
+
323
+ // Compute positional error
324
+ positionError = getKmerOccurencePosition(current) - expectedPosition;
325
+
326
+ // If potential hit record
327
+ if (positionError < 1 && positionError > -1) {
328
+ if (best)
329
+ // If competing hit, give up
330
+ return NULL;
331
+ else
332
+ // Record current hit
333
+ best = current;
334
+ }
335
+ }
336
+ }
337
+
338
+ return best;
339
+ }
340
+
341
+ static inline boolean
342
+ doFindOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
343
+ Coordinate * position, SplayTable *table)
344
+ {
345
+ #ifdef _OPENMP
346
+ const KmerKey kmerHash = hash_kmer(kmer);
347
+ boolean ret;
348
+
349
+ omp_set_lock(table->tableLocks + kmerHash);
350
+ ret = findOrInsertOccurenceInSplayTree(kmer, seqID, position,
351
+ table->table + kmerHash);
352
+ omp_unset_lock(table->tableLocks + kmerHash);
353
+
354
+ return ret;
355
+ #else
356
+ return findOrInsertOccurenceInSplayTree(kmer, seqID, position,
357
+ &table->table[hash_kmer(kmer)]);
358
+ #endif
359
+ }
360
+
361
+
362
+ static boolean findOrInsertOccurenceInSplayTable(Kmer * kmer, IDnum * seqID,
363
+ Coordinate * position,
364
+ SplayTable * table, IDnum * sequenceIDs,
365
+ Coordinate * coords, Coordinate readIndex, Coordinate readLength, boolean direct)
366
+ {
367
+ KmerOccurence * hit;
368
+ Coordinate HSPIndex;
369
+
370
+ // Check if previous anchor
371
+ if (sequenceIDs && sequenceIDs[readIndex]) {
372
+ if (direct)
373
+ *seqID = sequenceIDs[readIndex];
374
+ else
375
+ *seqID = -sequenceIDs[readIndex];
376
+ if (sequenceIDs[readIndex] > 0)
377
+ *position = coords[readIndex] + readIndex;
378
+ else
379
+ *position = coords[readIndex] - readIndex + readLength - 1;
380
+
381
+ return true;
382
+ }
383
+ else if (coords && coords[readIndex])
384
+ // If in buffer zone:
385
+ return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
386
+
387
+
388
+ if (debug)
389
+ abort();
390
+ // Look up first in reference sequence k-mers
391
+ if (table->kmerOccurenceTable
392
+ && (hit = findKmerInKmerOccurenceTable(kmer, table->kmerOccurenceTable))) {
393
+ if (!getNextKmerOccurence(hit)) {
394
+ *seqID = getKmerOccurenceNodeID(hit);
395
+ *position = getKmerOccurencePosition(hit);
396
+ return true;
397
+ } else if ((HSPIndex = getNearestHSPIndex(*position, sequenceIDs, readLength)) > 0) {
398
+ hit = getMostAppropriateHit(readIndex, readLength, direct, hit, 1, &(sequenceIDs[HSPIndex]), &(coords[HSPIndex]), table->WORDLENGTH);
399
+ if (hit) {
400
+ *seqID = getKmerOccurenceNodeID(hit);
401
+ *position = getKmerOccurencePosition(hit);
402
+ return true;
403
+ }
404
+
405
+ }
406
+ }
407
+
408
+ // If not, go through the novel k-mers
409
+ return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
410
+ }
411
+
412
+ static void printAnnotations(IDnum *sequenceIDs, Coordinate * coords,
413
+ TightString * array, SplayTable * table,
414
+ FILE * file, boolean second_in_pair, IDnum seqID)
415
+ {
416
+ Coordinate readNucleotideIndex = 0;
417
+ Coordinate writeNucleotideIndex = 0;
418
+ Kmer word;
419
+ Kmer antiWord;
420
+ boolean annotationClosed = true;
421
+ IDnum sequenceID;
422
+ Coordinate coord;
423
+ boolean found;
424
+ Coordinate position = 0;
425
+ Coordinate start = 0;
426
+ Coordinate finish = 0;
427
+ IDnum referenceSequenceID = 0;
428
+ Nucleotide nucleotide;
429
+ char lineBuffer[MAXLINE];
430
+ TightString * tString = getTightStringInArray(array, seqID - 1);
431
+ int thread = 0;
432
+
433
+ clearKmer(&word);
434
+ clearKmer(&antiWord);
435
+
436
+ #ifdef _OPENMP
437
+ thread = omp_get_thread_num();
438
+ #endif
439
+
440
+ if (debug)
441
+ abort();
442
+
443
+ sprintf(lineBuffer, "ROADMAP %li\n", (long)seqID);
444
+ appendLine(lineBuffer, thread);
445
+
446
+ // Neglect any string shorter than WORDLENGTH :
447
+ if (getLength(tString) < table->WORDLENGTH) {
448
+ #ifdef _OPENMP
449
+ pushBuffer(thread);
450
+ #else
451
+ velvetFprintf(file, "%s", annotationBuffer->str);
452
+ resetStringBuffer(annotationBuffer);
453
+ #endif
454
+ return;
455
+ }
456
+
457
+ // Fill in the initial word :
458
+ for (readNucleotideIndex = 0;
459
+ readNucleotideIndex < table->WORDLENGTH - 1;
460
+ readNucleotideIndex++) {
461
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
462
+ pushNucleotide(&word, nucleotide);
463
+ #ifdef COLOR
464
+ reversePushNucleotide(&antiWord, nucleotide);
465
+ #else
466
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
467
+ #endif
468
+ }
469
+
470
+ while (readNucleotideIndex < getLength(tString)) {
471
+ // Shift word:
472
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
473
+ pushNucleotide(&word, nucleotide);
474
+ #ifdef COLOR
475
+ reversePushNucleotide(&antiWord, nucleotide);
476
+ #else
477
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
478
+ #endif
479
+
480
+ sequenceID = seqID;
481
+ coord = writeNucleotideIndex;
482
+
483
+ if (table->double_strand) {
484
+ if (compareKmers(&word, &antiWord) <= 0) {
485
+ found =
486
+ findOrInsertOccurenceInSplayTable(&word,
487
+ &sequenceID,
488
+ &coord,
489
+ table,
490
+ sequenceIDs,
491
+ coords,
492
+ readNucleotideIndex,
493
+ getLength(tString),
494
+ true);
495
+ } else {
496
+ sequenceID = -sequenceID;
497
+ found =
498
+ findOrInsertOccurenceInSplayTable(&antiWord,
499
+ &sequenceID,
500
+ &coord,
501
+ table,
502
+ sequenceIDs,
503
+ coords,
504
+ readNucleotideIndex,
505
+ getLength(tString),
506
+ false);
507
+ sequenceID = -sequenceID;
508
+ }
509
+ } else {
510
+ if (!second_in_pair) {
511
+ found =
512
+ findOrInsertOccurenceInSplayTable(&word,
513
+ &sequenceID,
514
+ &coord,
515
+ table,
516
+ sequenceIDs,
517
+ coords,
518
+ readNucleotideIndex,
519
+ getLength(tString),
520
+ true);
521
+ } else {
522
+ sequenceID = -sequenceID;
523
+ found =
524
+ findOrInsertOccurenceInSplayTable(&antiWord,
525
+ &sequenceID,
526
+ &coord,
527
+ table,
528
+ sequenceIDs,
529
+ coords,
530
+ readNucleotideIndex,
531
+ getLength(tString),
532
+ false);
533
+ sequenceID = -sequenceID;
534
+ }
535
+ }
536
+
537
+ if (!found) {
538
+ writeNucleotideIndex++;
539
+ if (!annotationClosed) {
540
+ sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
541
+ (long) referenceSequenceID, (long long) position,
542
+ (long long) start, (long long) finish);
543
+ appendLine(lineBuffer, thread);
544
+ }
545
+ annotationClosed = true;
546
+ }
547
+ // Otherwise create/complete annotation:
548
+ else {
549
+ // Forbidden k-mer
550
+ if (sequenceID == 0) {
551
+ break;
552
+ }
553
+ // Closed/inexistant annotation
554
+ else if (annotationClosed) {
555
+ referenceSequenceID = sequenceID;
556
+ position = writeNucleotideIndex;
557
+ start = finish = coord;
558
+
559
+ if (referenceSequenceID > 0)
560
+ finish++;
561
+ else
562
+ finish--;
563
+
564
+ annotationClosed = false;
565
+ }
566
+ // Open annotation
567
+ else if (sequenceID == referenceSequenceID
568
+ && coord == finish) {
569
+ if (referenceSequenceID > 0)
570
+ finish++;
571
+ else
572
+ finish--;
573
+ }
574
+ // Previous non corresponding annotation
575
+ else {
576
+ sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
577
+ (long) referenceSequenceID, (long long) position,
578
+ (long long) start, (long long) finish);
579
+ appendLine(lineBuffer, thread);
580
+
581
+ referenceSequenceID = sequenceID;
582
+ position = writeNucleotideIndex;
583
+ start = finish = coord;
584
+
585
+ if (referenceSequenceID > 0)
586
+ finish++;
587
+ else
588
+ finish--;
589
+ }
590
+ }
591
+
592
+ readNucleotideIndex++;
593
+ }
594
+
595
+ if (!annotationClosed) {
596
+ sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
597
+ (long) referenceSequenceID, (long long) position,
598
+ (long long) start, (long long) finish);
599
+ appendLine(lineBuffer, thread);
600
+ }
601
+ #ifdef _OPENMP
602
+ pushBuffer(thread);
603
+ #else
604
+ velvetFprintf(file, "%s", annotationBuffer->str);
605
+ resetStringBuffer(annotationBuffer);
606
+ #endif
607
+
608
+ return;
609
+ }
610
+
611
+ static void computeClearHSPs(TightString * array, boolean second_in_pair, SplayTable * table, IDnum ** sequenceIDs, Coordinate ** coords, IDnum * mapReferenceIDs, Coordinate * mapCoords, Coordinate mapCount, IDnum seqID) {
612
+ Coordinate readNucleotideIndex = 0;
613
+ Kmer word;
614
+ Kmer antiWord;
615
+ Kmer polyA;
616
+ Nucleotide nucleotide;
617
+ KmerOccurence * hit;
618
+
619
+ int penalty;
620
+ TightString * tString;
621
+ Coordinate length;
622
+
623
+ clearKmer(&polyA);
624
+ tString = getTightStringInArray(array, seqID - 1);
625
+ length = getLength(tString);
626
+ *sequenceIDs = callocOrExit(length, IDnum);
627
+ *coords = callocOrExit(length, Coordinate);
628
+
629
+ // First pass for unambiguous hits
630
+ // Fill in the initial word :
631
+ clearKmer(&word);
632
+ clearKmer(&antiWord);
633
+ for (readNucleotideIndex = 0;
634
+ readNucleotideIndex < table->WORDLENGTH - 1;
635
+ readNucleotideIndex++) {
636
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
637
+ pushNucleotide(&word, nucleotide);
638
+ #ifdef COLOR
639
+ reversePushNucleotide(&antiWord, nucleotide);
640
+ #else
641
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
642
+ #endif
643
+ }
644
+
645
+ // Kill silly poly-T beginnings
646
+ while (readNucleotideIndex < getLength(tString) && (compareKmers(&antiWord, &polyA) == 0 || compareKmers(&word, &polyA) == 0)) {
647
+ nucleotide = getNucleotide(readNucleotideIndex++, tString);
648
+ pushNucleotide(&word, nucleotide);
649
+ #ifdef COLOR
650
+ reversePushNucleotide(&antiWord, nucleotide);
651
+ #else
652
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
653
+ #endif
654
+ }
655
+
656
+ while (readNucleotideIndex < getLength(tString)) {
657
+ // Shift word:
658
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
659
+ pushNucleotide(&word, nucleotide);
660
+
661
+ #ifdef COLOR
662
+ reversePushNucleotide(&antiWord, nucleotide);
663
+ #else
664
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
665
+ #endif
666
+
667
+ if (table->double_strand) {
668
+ if (compareKmers(&word, &antiWord) <= 0) {
669
+ hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
670
+
671
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
672
+ (*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
673
+ } else {
674
+ hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
675
+
676
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
677
+ (*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
678
+ }
679
+ } else {
680
+ if (!second_in_pair) {
681
+ hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
682
+
683
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
684
+ (*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
685
+ } else {
686
+ hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
687
+
688
+ if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
689
+ (*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
690
+ }
691
+ }
692
+
693
+ if ((*sequenceIDs)[readNucleotideIndex]) {
694
+ if ((*sequenceIDs)[readNucleotideIndex] > 0)
695
+ (*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) - readNucleotideIndex;
696
+ else
697
+ (*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) + readNucleotideIndex - getLength(tString) + 1;
698
+ }
699
+
700
+ // Barrier to flip-flopping
701
+ if ((*sequenceIDs)[readNucleotideIndex - 1] != 0
702
+ && ((*sequenceIDs)[readNucleotideIndex] != (*sequenceIDs)[readNucleotideIndex - 1]
703
+ || (*coords)[readNucleotideIndex] != (*coords)[readNucleotideIndex - 1])) {
704
+ // Break in continuity... skip k positions
705
+ (*sequenceIDs)[readNucleotideIndex] = 0;
706
+ (*coords)[readNucleotideIndex] = -1;
707
+ readNucleotideIndex++;
708
+
709
+ for (penalty = 0; penalty < table->WORDLENGTH - 1 && readNucleotideIndex < getLength(tString); penalty++) {
710
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
711
+ pushNucleotide(&word, nucleotide);
712
+
713
+ #ifdef COLOR
714
+ reversePushNucleotide(&antiWord, nucleotide);
715
+ #else
716
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
717
+ #endif
718
+ (*sequenceIDs)[readNucleotideIndex] = 0;
719
+ (*coords)[readNucleotideIndex] = -1;
720
+ readNucleotideIndex++;
721
+ }
722
+ } else
723
+ readNucleotideIndex++;
724
+
725
+ }
726
+
727
+ free(mapReferenceIDs);
728
+ free(mapCoords);
729
+ }
730
+
731
+ void inputSequenceIntoSplayTable(TightString * array,
732
+ SplayTable * table,
733
+ FILE * file,
734
+ boolean second_in_pair,
735
+ IDnum * mapReferenceIDs, Coordinate * mapCoords, Coordinate mapCount,
736
+ IDnum seqID)
737
+ {
738
+ IDnum * sequenceIDs = NULL;
739
+ Coordinate * coords = NULL;
740
+
741
+ //debug = (seqID == 29405);
742
+
743
+ // If appropriate, get the HSPs on reference sequences
744
+ if (table->kmerOccurenceTable)
745
+ computeClearHSPs(array, second_in_pair, table, &sequenceIDs, &coords, mapReferenceIDs, mapCoords, mapCount, seqID);
746
+
747
+ // Go through read, eventually with annotations
748
+ printAnnotations(sequenceIDs, coords, array, table, file, second_in_pair, seqID);
749
+
750
+ // Clean up
751
+ if (sequenceIDs) {
752
+ free(sequenceIDs);
753
+ free(coords);
754
+ }
755
+ }
756
+
757
+ void inputReferenceIntoSplayTable(TightString * tString,
758
+ SplayTable * table, FILE * file, IDnum seqID, Mask * mask)
759
+ {
760
+ IDnum currentIndex;
761
+ Coordinate readNucleotideIndex = 0;
762
+ Coordinate kmerIndex = 0;
763
+ Kmer word;
764
+ Kmer antiWord;
765
+ Nucleotide nucleotide;
766
+ Mask * currentMask = mask;
767
+ #ifdef _OPENMP
768
+ char lineBuffer[MAXLINE];
769
+ #endif
770
+
771
+ clearKmer(&word);
772
+ clearKmer(&antiWord);
773
+
774
+ currentIndex = seqID;
775
+ #ifdef _OPENMP
776
+ sprintf(lineBuffer, "ROADMAP %li\n", (long)currentIndex);
777
+ appendLine(lineBuffer, omp_get_thread_num());
778
+ #else
779
+ velvetFprintf(file, "ROADMAP %li\n", (long)currentIndex);
780
+ #endif
781
+
782
+ // Neglect any string shorter than WORDLENGTH :
783
+ if (getLength(tString) < table->WORDLENGTH) {
784
+ return;
785
+ }
786
+
787
+ // Fill in the initial word :
788
+ for (readNucleotideIndex = 0;
789
+ readNucleotideIndex < table->WORDLENGTH - 1;
790
+ readNucleotideIndex++) {
791
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
792
+ pushNucleotide(&word, nucleotide);
793
+ if (table->double_strand) {
794
+ #ifdef COLOR
795
+ reversePushNucleotide(&antiWord, nucleotide);
796
+ #else
797
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
798
+ #endif
799
+ }
800
+ }
801
+
802
+ while (readNucleotideIndex < getLength(tString)) {
803
+ // Shift word:
804
+ nucleotide = getNucleotide(readNucleotideIndex, tString);
805
+ pushNucleotide(&word, nucleotide);
806
+
807
+ if (table->double_strand) {
808
+ #ifdef COLOR
809
+ reversePushNucleotide(&antiWord, nucleotide);
810
+ #else
811
+ reversePushNucleotide(&antiWord, 3 - nucleotide);
812
+ #endif
813
+ }
814
+
815
+ // Check for gap masks:
816
+ if (currentMask && currentMask->start - table->WORDLENGTH + 1 <= readNucleotideIndex) {
817
+ while(currentMask && currentMask->finish + table->WORDLENGTH - 1 < readNucleotideIndex)
818
+ currentMask = currentMask->next;
819
+
820
+ if (currentMask && currentMask->finish + table->WORDLENGTH - 1 >= readNucleotideIndex) {
821
+ readNucleotideIndex++;
822
+ kmerIndex++;
823
+ continue;
824
+ }
825
+ }
826
+
827
+ // Record k-mer
828
+ if (table->double_strand) {
829
+ if (compareKmers(&word, &antiWord) <= 0)
830
+ recordKmerOccurence(&word, currentIndex,
831
+ kmerIndex,
832
+ table->kmerOccurenceTable);
833
+ else
834
+ recordKmerOccurence(&antiWord, -currentIndex,
835
+ kmerIndex,
836
+ table->kmerOccurenceTable);
837
+ } else {
838
+ recordKmerOccurence(&word, currentIndex,
839
+ kmerIndex,
840
+ table->kmerOccurenceTable);
841
+ }
842
+ readNucleotideIndex++;
843
+ kmerIndex++;
844
+ }
845
+
846
+ return;
847
+ }
848
+
849
+ static Coordinate countReferenceKmers(ReadSet * reads, int wordLength) {
850
+ IDnum readIndex;
851
+ Coordinate length = 0;
852
+
853
+
854
+ for (readIndex = 0; readIndex < reads->readCount && reads->categories[readIndex] == REFERENCE; readIndex++)
855
+ {
856
+ Coordinate tmpLength = getLength(getTightStringInArray(reads->tSequences, readIndex));
857
+ if (tmpLength >= wordLength)
858
+ length += tmpLength - wordLength + 1;
859
+ }
860
+
861
+ return length;
862
+ }
863
+
864
+ Mask ** scanReferenceSequences(FILE * file, IDnum referenceSequenceCount) {
865
+ Mask ** referenceMasks = callocOrExit(referenceSequenceCount, Mask*);
866
+ IDnum index;
867
+ char line[MAXLINE];
868
+ char c = '\0';
869
+
870
+ // Search sequences for masks
871
+ for (index = 0; index < referenceSequenceCount; index++) {
872
+ Mask * current = NULL;
873
+ Coordinate position = 0;
874
+ boolean openMask = false;
875
+
876
+ // Read through header
877
+ fgets(line, MAXLINE, file);
878
+
879
+ // Read through sequence
880
+ while ((c = getc(file))) {
881
+ if (c == EOF || c == '>')
882
+ break;
883
+ else if (c == '\r' || c == '\n')
884
+ continue;
885
+ else if (c == 'n' || c == 'N') {
886
+ if (openMask)
887
+ current->finish++;
888
+ else if (referenceMasks[index] == NULL) {
889
+ referenceMasks[index] = newMask(position);
890
+ current = referenceMasks[index];
891
+ } else {
892
+ current->next = newMask(position);
893
+ current = current->next;
894
+ }
895
+ openMask = true;
896
+ position++;
897
+ } else {
898
+ openMask = false;
899
+ position++;
900
+ }
901
+ }
902
+ }
903
+
904
+ if (c != '\0')
905
+ ungetc(c, file);
906
+ return referenceMasks;
907
+ }
908
+
909
+ Mask ** scanBinaryReferenceSequences(SequencesReader *seqReadInfo, IDnum referenceSequenceCount) {
910
+ Mask ** referenceMasks = callocOrExit(referenceSequenceCount, Mask*);
911
+ IDnum index;
912
+ char line[MAXLINE];
913
+ char c = '\0';
914
+ FILE *file = fopen(seqReadInfo->m_namesFilename, "r");
915
+ if (file == NULL) {
916
+ exitErrorf(EXIT_FAILURE, true, "Couldn't read file %s", seqReadInfo->m_namesFilename);
917
+ } else {
918
+ velvetLog("Reading mapping info from %s\n", seqReadInfo->m_namesFilename);
919
+ }
920
+
921
+ // Search sequences for masks
922
+ for (index = 0; index < referenceSequenceCount; index++) {
923
+ Mask * current = NULL;
924
+ long start = 0;
925
+ long finish = 0;
926
+ long number;
927
+ long cat;
928
+
929
+ // Read through header
930
+ if ((c = getc(file)) != '>') {
931
+ exitErrorf(EXIT_FAILURE, false, "names line did not start with >");
932
+ }
933
+ fgets(line, MAXLINE, file);
934
+ sscanf(line, "%*[^\t]\t%li\t%li\n", &number, &cat);
935
+ // ensure is is a ref cat
936
+ if ((IDnum) number != index + 1) {
937
+ exitErrorf(EXIT_FAILURE, false, "sequence %ld != expected %ld", number, (long) index);
938
+ }
939
+ if ((Category) cat != REFERENCE) {
940
+ exitErrorf(EXIT_FAILURE, false, "unexpected category %ld", cat);
941
+ }
942
+
943
+ // Read through the reference maps
944
+ while ((c = getc(file))) {
945
+ if (c == EOF || c == '>') {
946
+ break;
947
+ }
948
+ ungetc(c, file);
949
+ fgets(line, MAXLINE, file);
950
+ sscanf(line, "%li\t%li\n", &start, &finish);
951
+ if (referenceMasks[index] == NULL) {
952
+ referenceMasks[index] = newMask(start);
953
+ referenceMasks[index]->finish = finish;
954
+ current = referenceMasks[index];
955
+ } else {
956
+ current->next = newMask(start);
957
+ current->next->finish = finish;
958
+ current = current->next;
959
+ }
960
+ }
961
+ ungetc(c, file);
962
+ }
963
+
964
+ fclose(file);
965
+ return referenceMasks;
966
+ }
967
+
968
+ void inputSequenceArrayIntoSplayTableAndArchive(ReadSet * reads,
969
+ SplayTable * table,
970
+ char *filename, char* seqFilename)
971
+ {
972
+ IDnum index;
973
+ IDnum sequenceCount = reads->readCount;
974
+ TightString *array;
975
+ FILE *outfile = fopen(filename, "w");
976
+ FILE *seqFile = NULL;
977
+ IDnum kmerCount;
978
+ IDnum referenceSequenceCount = 0;
979
+ struct timeval start, end, diff;
980
+ SequencesReader seqReadInfo;
981
+ memset(&seqReadInfo, 0, sizeof(seqReadInfo));
982
+ if (isCreateBinary()) {
983
+ seqReadInfo.m_bIsBinary = true;
984
+ seqReadInfo.m_pFile = openCnySeqForRead(seqFilename, &seqReadInfo.m_unifiedSeqFileHeader);
985
+ if (!seqReadInfo.m_pFile) {
986
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", seqFilename);
987
+ }
988
+ seqReadInfo.m_namesFilename = mallocOrExit(strlen(seqFilename) + sizeof(".names"), char);
989
+ sprintf(seqReadInfo.m_namesFilename, "%s.names", seqFilename);
990
+ seqReadInfo.m_numCategories = seqReadInfo.m_unifiedSeqFileHeader.m_numCategories;
991
+ seqReadInfo.m_minSeqLen = seqReadInfo.m_unifiedSeqFileHeader.m_minSeqLen;
992
+ seqReadInfo.m_maxSeqLen = seqReadInfo.m_unifiedSeqFileHeader.m_maxSeqLen;
993
+ seqReadInfo.m_bIsRef = false;
994
+ seqReadInfo.m_pReadBuffer = mallocOrExit(USF_READ_BUF_SIZE, uint8_t );
995
+ seqReadInfo.m_pCurrentReadPtr = seqReadInfo.m_pReadBufEnd = 0;
996
+
997
+ resetCnySeqCurrentRead(&seqReadInfo);
998
+ } else {
999
+ seqReadInfo.m_bIsBinary = false;
1000
+ }
1001
+ IDnum ** mapReferenceIDs = NULL;
1002
+ Coordinate ** mapCoords = NULL;
1003
+ Coordinate * mapCount = NULL;
1004
+
1005
+ char line[MAXLINE];
1006
+ char c;
1007
+ IDnum seqID = 0;
1008
+ long long_var;
1009
+ long long longlong_var;
1010
+ Coordinate maxCount = 20;
1011
+ Coordinate counter = 0;
1012
+ // DEBUG
1013
+ Mask ** referenceMasks = NULL;
1014
+
1015
+ if (outfile == NULL)
1016
+ exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", filename);
1017
+ else
1018
+ velvetLog("Writing into roadmap file %s...\n", filename);
1019
+
1020
+ // Count reference sequences
1021
+ for (index = 0; index < reads->readCount && reads->categories[index] == REFERENCE; index++)
1022
+ referenceSequenceCount++;
1023
+
1024
+ velvetFprintf(outfile, "%ld\t%ld\t%i\t%hi\n", (long) sequenceCount, (long) referenceSequenceCount, table->WORDLENGTH, (short) table->double_strand);
1025
+
1026
+ if (reads->tSequences == NULL)
1027
+ convertSequences(reads);
1028
+
1029
+ gettimeofday(&start, NULL);
1030
+ array = reads->tSequences;
1031
+
1032
+ #ifdef _OPENMP
1033
+ if (omp_get_max_threads() == 1)
1034
+ {
1035
+ omp_set_num_threads(2);
1036
+ omp_set_nested(0);
1037
+ }
1038
+ else
1039
+ omp_set_nested(1);
1040
+ initAnnotationBuffers();
1041
+ #else
1042
+ annotationBuffer = newStringBuffer(BUFFER_SIZE);
1043
+ #endif
1044
+
1045
+ if (referenceSequenceCount && (kmerCount = countReferenceKmers(reads, table->WORDLENGTH)) > 0) {
1046
+ table->kmerOccurenceTable = newKmerOccurenceTable(24 , table->WORDLENGTH);
1047
+ allocateKmerOccurences(kmerCount, table->kmerOccurenceTable);
1048
+ if (seqReadInfo.m_bIsBinary) {
1049
+ referenceMasks = scanBinaryReferenceSequences(&seqReadInfo, referenceSequenceCount);
1050
+ // binary seqs have no Ns so just advance past the references
1051
+ for (index = 0; index < referenceSequenceCount; index++) {
1052
+ TightString cmpString;
1053
+ cmpString.length = seqReadInfo.m_currentReadLength;
1054
+ cmpString.sequence = mallocOrExit((seqReadInfo.m_currentReadLength + 3) / 4, uint8_t );
1055
+ getCnySeqNucl(&seqReadInfo, cmpString.sequence);
1056
+ if (seqReadInfo.m_bIsRef) {
1057
+ seqReadInfo.m_refCnt = readCnySeqUint32(&seqReadInfo);
1058
+ // now the next ptr is advanced
1059
+ seqReadInfo.m_pNextReadPtr += (sizeof(RefInfo) * seqReadInfo.m_refCnt);
1060
+ RefInfo refElem;
1061
+ uint32_t refIdx;
1062
+ for (refIdx = 0; refIdx < seqReadInfo.m_refCnt; refIdx++) {
1063
+ // not actually used so just read past refs
1064
+ refElem.m_referenceID = readCnySeqUint32(&seqReadInfo);
1065
+ refElem.m_pos = readCnySeqUint32(&seqReadInfo);
1066
+ }
1067
+ }
1068
+ // optional test to ensure reference mapping seqIDs are in sync
1069
+ #if 0
1070
+ TightString *tString;
1071
+ tString = getTightStringInArray(array, index);
1072
+ if (getLength(tString) != seqReadInfo.m_currentReadLength) {
1073
+ velvetLog("Error: TightString len mismatch, %d != %ld\n", getLength(tString), seqReadInfo.m_currentReadLength);
1074
+ exit(1);
1075
+ }
1076
+ char *str = readTightString(tString);
1077
+ char *cmpStr = readTightString(&cmpString);
1078
+ if (strcmp(str, cmpStr) != 0) {
1079
+ printf("seq %s != cmp %s\n", str, cmpStr);
1080
+ exit(1);
1081
+ }
1082
+ free(str);
1083
+ free(cmpStr);
1084
+ #endif
1085
+ advanceCnySeqCurrentRead(&seqReadInfo);
1086
+ free(cmpString.sequence);
1087
+ }
1088
+ } else {
1089
+ seqFile = fopen(seqFilename, "r");
1090
+
1091
+ if (seqFile == NULL)
1092
+ exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", seqFilename);
1093
+ else
1094
+ velvetLog("Reading mapping info from file %s\n", seqFilename);
1095
+
1096
+ seqReadInfo.m_pFile = seqFile;
1097
+
1098
+ // Skip through reference headers quickly
1099
+ referenceMasks = scanReferenceSequences(seqFile, referenceSequenceCount);
1100
+ }
1101
+
1102
+ #ifdef _OPENMP
1103
+ producing = 1;
1104
+ #pragma omp parallel sections
1105
+ {
1106
+ #pragma omp section
1107
+ {
1108
+ bufferWritter(outfile);
1109
+ }
1110
+ #pragma omp section
1111
+ {
1112
+ #pragma omp parallel for
1113
+ #endif
1114
+ for (index = 0; index < referenceSequenceCount; index++)
1115
+ inputReferenceIntoSplayTable(getTightStringInArray(array, index),
1116
+ table, outfile, index + 1, referenceMasks[index]);
1117
+
1118
+ #ifdef _OPENMP
1119
+ for (index = omp_get_max_threads() - 1; index >= 0; index--)
1120
+ pushBufferCommit(index);
1121
+ producing = 0;
1122
+ #pragma omp flush(producing)
1123
+ }
1124
+ }
1125
+ #endif
1126
+
1127
+ if (maskMemory)
1128
+ destroyRecycleBin(maskMemory);
1129
+ maskMemory = NULL;
1130
+ sortKmerOccurenceTable(table->kmerOccurenceTable);
1131
+ }
1132
+
1133
+ velvetLog("Inputting sequences...\n");
1134
+
1135
+ if (table->kmerOccurenceTable) {
1136
+ mapReferenceIDs = callocOrExit(sequenceCount + 1, IDnum*);
1137
+ mapCoords = callocOrExit(sequenceCount + 1, Coordinate *);
1138
+ mapCount = callocOrExit(sequenceCount + 1, Coordinate);
1139
+
1140
+ RefInfo *refArray = NULL;
1141
+ if (seqReadInfo.m_bIsBinary) {
1142
+ TightString cmpString;
1143
+ for (seqID = referenceSequenceCount + 1; seqID < sequenceCount + 1; seqID++) {
1144
+ cmpString.length = seqReadInfo.m_currentReadLength;
1145
+ cmpString.sequence = mallocOrExit((seqReadInfo.m_currentReadLength + 3) / 4, uint8_t );
1146
+ getCnySeqNucl(&seqReadInfo, cmpString.sequence);
1147
+ if (seqReadInfo.m_bIsRef) {
1148
+ seqReadInfo.m_refCnt = readCnySeqUint32(&seqReadInfo);
1149
+ // now the next ptr is advanced
1150
+ seqReadInfo.m_pNextReadPtr += (sizeof(RefInfo) * seqReadInfo.m_refCnt);
1151
+ refArray = callocOrExit(seqReadInfo.m_refCnt, RefInfo);
1152
+ uint32_t refIdx;
1153
+ for (refIdx = 0; refIdx < seqReadInfo.m_refCnt; refIdx++) {
1154
+ refArray[refIdx].m_referenceID = readCnySeqUint32(&seqReadInfo);
1155
+ refArray[refIdx].m_pos = readCnySeqUint32(&seqReadInfo);
1156
+ }
1157
+ }
1158
+ // optional test to ensure reference mapping seqIDs are in sync
1159
+ #if 0
1160
+ TightString *tString;
1161
+ tString = getTightStringInArray(array, seqID - 1);
1162
+ if (getLength(tString) != seqReadInfo.m_currentReadLength) {
1163
+ velvetLog("Error: TightString len mismatch, %d != %ld\n", getLength(tString), seqReadInfo.m_currentReadLength);
1164
+ exit(1);
1165
+ }
1166
+ char *str = readTightString(tString);
1167
+ char *cmpStr = readTightString(&cmpString);
1168
+ if (strcmp(str, cmpStr) != 0) {
1169
+ printf("seq %s != cmp %s\n", str, cmpStr);
1170
+ exit(1);
1171
+ }
1172
+ free(str);
1173
+ free(cmpStr);
1174
+ #endif
1175
+ free(cmpString.sequence);
1176
+
1177
+ // set prior count
1178
+ mapCount[seqID - 1] = counter;
1179
+ counter = 0;
1180
+ maxCount = 20;
1181
+ mapReferenceIDs[seqID] = callocOrExit(maxCount, IDnum);
1182
+ mapCoords[seqID] = callocOrExit(maxCount, Coordinate);
1183
+
1184
+ if (seqReadInfo.m_bIsRef) {
1185
+ while (counter < seqReadInfo.m_refCnt) {
1186
+ mapReferenceIDs[seqID][counter] = (IDnum) refArray[counter].m_referenceID;
1187
+ mapCoords[seqID][counter] = (Coordinate) refArray[counter].m_pos;
1188
+
1189
+ if (++counter == maxCount) {
1190
+ maxCount *= 2;
1191
+ mapReferenceIDs[seqID] = reallocOrExit(mapReferenceIDs[seqID], maxCount, IDnum);
1192
+ mapCoords[seqID] = reallocOrExit(mapCoords[seqID], maxCount, Coordinate);
1193
+ }
1194
+ }
1195
+ free(refArray);
1196
+ }
1197
+ advanceCnySeqCurrentRead(&seqReadInfo);
1198
+ }
1199
+ } else {
1200
+ // Parse file for mapping info
1201
+ while (seqFile && (c = getc(seqFile)) != EOF) {
1202
+
1203
+ if (c == '>') {
1204
+ mapCount[seqID] = counter;
1205
+ counter = 0;
1206
+ maxCount = 20;
1207
+ fgets(line, MAXLINE, seqFile);
1208
+ sscanf(line,"%*[^\t]\t%li\t", &long_var);
1209
+ seqID = (IDnum) long_var;
1210
+ mapReferenceIDs[seqID] = callocOrExit(maxCount, IDnum);
1211
+ mapCoords[seqID] = callocOrExit(maxCount, Coordinate);
1212
+ } else if (c == 'M') {
1213
+ fgets(line, MAXLINE, seqFile);
1214
+ sscanf(line,"\t%li\t%lli\n", &long_var, &longlong_var);
1215
+ mapReferenceIDs[seqID][counter] = (IDnum) long_var;
1216
+ mapCoords[seqID][counter] = (Coordinate) longlong_var;
1217
+
1218
+ if (++counter == maxCount) {
1219
+ maxCount *= 2;
1220
+ mapReferenceIDs[seqID] = reallocOrExit(mapReferenceIDs[seqID], maxCount, IDnum);
1221
+ mapCoords[seqID] = reallocOrExit(mapCoords[seqID], maxCount, Coordinate);
1222
+ }
1223
+ }
1224
+ }
1225
+ }
1226
+ }
1227
+ if (seqFile)
1228
+ fclose(seqFile);
1229
+
1230
+ if (seqReadInfo.m_bIsBinary) {
1231
+ if (seqReadInfo.m_pReadBuffer) {
1232
+ free(seqReadInfo.m_pReadBuffer);
1233
+ }
1234
+ fclose(seqReadInfo.m_pFile);
1235
+ }
1236
+
1237
+ #ifdef _OPENMP
1238
+ producing = 1;
1239
+ #pragma omp parallel sections
1240
+ {
1241
+ #pragma omp section
1242
+ {
1243
+ bufferWritter(outfile);
1244
+ }
1245
+ #pragma omp section
1246
+ {
1247
+ #pragma omp parallel for
1248
+ #endif
1249
+ for (index = referenceSequenceCount; index < sequenceCount; index++)
1250
+ {
1251
+ boolean second_in_pair;
1252
+
1253
+ // Progress report on screen
1254
+ if (index % 1000000 == 0) {
1255
+ velvetLog("Inputting sequence %li / %li\n",
1256
+ (long)index, (long)sequenceCount);
1257
+ fflush(stdout);
1258
+ }
1259
+
1260
+ // Test to make sure that all the reference reads are before all the other reads
1261
+ if (reads->categories[index] == REFERENCE) {
1262
+ velvetLog("Reference sequence placed after a non-reference read!\n");
1263
+ velvetLog(">> Please re-order the filenames in your command line so as "
1264
+ "to have the reference sequence files before all the others\n");
1265
+ #ifdef DEBUG
1266
+ abort();
1267
+ #endif
1268
+ exit(0);
1269
+ }
1270
+ second_in_pair = reads->categories[index] % 2 && isSecondInPair(reads, index);
1271
+
1272
+ // Hashing the reads
1273
+ if (table->kmerOccurenceTable)
1274
+ inputSequenceIntoSplayTable(array, table,
1275
+ outfile,
1276
+ second_in_pair, mapReferenceIDs[index + 1], mapCoords[index+1], mapCount[index+1], index + 1);
1277
+ else
1278
+ inputSequenceIntoSplayTable(array, table,
1279
+ outfile,
1280
+ second_in_pair, NULL, NULL, 0, index + 1);
1281
+ }
1282
+ #ifdef _OPENMP
1283
+ for (index = omp_get_max_threads() - 1; index >= 0; index--)
1284
+ pushBufferCommit(index);
1285
+ producing = 0;
1286
+ #pragma omp flush(producing)
1287
+ }
1288
+ }
1289
+ destroyAnnotationBuffers();
1290
+ #else
1291
+ destroyStringBuffer(annotationBuffer, 1);
1292
+ #endif
1293
+
1294
+ gettimeofday(&end, NULL);
1295
+ timersub(&end, &start, &diff);
1296
+ velvetLog(" === Sequences loaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1297
+
1298
+ fclose(outfile);
1299
+
1300
+ if (mapReferenceIDs) {
1301
+ free(mapReferenceIDs);
1302
+ free(mapCoords);
1303
+ free(mapCount);
1304
+ }
1305
+ if (referenceMasks) {
1306
+ free(referenceMasks);
1307
+ }
1308
+ if (seqReadInfo.m_namesFilename) {
1309
+ free(seqReadInfo.m_namesFilename);
1310
+ }
1311
+ //free(reads->tSequences);
1312
+ //reads->tSequences = NULL;
1313
+ //destroyReadSet(reads);
1314
+ velvetLog("Done inputting sequences\n");
1315
+ }