finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,30 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _READCOHERENTGRAPH_H_
22
+ #define _READCOHERENTGRAPH_H_
23
+
24
+ void readCoherentGraph(Graph * graph, boolean(*isUnique) (Node * node),
25
+ double coverage, ReadSet * reads);
26
+
27
+ boolean isUniqueSolexa(Node * node);
28
+
29
+ void setMultiplicityCutoff(int value);
30
+ #endif
@@ -0,0 +1,1734 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <math.h>
25
+ #include <time.h>
26
+ #include <limits.h>
27
+ #include <ctype.h>
28
+
29
+ #include "globals.h"
30
+ #include "tightString.h"
31
+ #include "readSet.h"
32
+ #include "utility.h"
33
+ #include "binarySequences.h"
34
+ #include "autoOpen.h"
35
+ #include "kseq.h"
36
+
37
+ #if !defined(BUNDLEDZLIB)
38
+ #include <zlib.h>
39
+ #elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
40
+ #include "../third-party/zlib-1.2.3/Win32/include/zlib.h"
41
+ #else
42
+ #include "../third-party/zlib-1.2.3/zlib.h"
43
+ #endif
44
+
45
+ #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
46
+ # include <fcntl.h>
47
+ # include <io.h>
48
+ # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
49
+ #else
50
+ # define SET_BINARY_MODE(file)
51
+ #endif
52
+
53
+ static Mask *allocateMask(SequencesWriter *seqWriteInfo)
54
+ {
55
+ if (seqWriteInfo->m_maskMemory == NULL)
56
+ seqWriteInfo->m_maskMemory = newRecycleBin(sizeof(Mask), 10000);
57
+
58
+ return (Mask *) allocatePointer(seqWriteInfo->m_maskMemory);
59
+ }
60
+
61
+ static Mask * newMask(SequencesWriter *seqWriteInfo, Coordinate position)
62
+ {
63
+ Mask * mask = allocateMask(seqWriteInfo);
64
+ mask->start = position;
65
+ mask->finish = position;
66
+ mask->next = NULL;
67
+ return mask;
68
+ }
69
+
70
+ //
71
+ // cmd line args can override the createBinary flag
72
+ // note that createBinary is only used by velveth
73
+ //
74
+ boolean createBinary = false;
75
+ boolean isCreateBinary()
76
+ {
77
+ return createBinary;
78
+ }
79
+
80
+ void setCreateBinary(boolean val)
81
+ {
82
+ createBinary = val;
83
+ }
84
+
85
+ ReadSet *newReadSet()
86
+ {
87
+ ReadSet *rs = callocOrExit(1, ReadSet);
88
+ return rs;
89
+ }
90
+
91
+ //////////////////////////////////////////////////////////////////////////
92
+ // Reference identifiers
93
+ //////////////////////////////////////////////////////////////////////////
94
+
95
+ typedef struct referenceCoordinate_st ReferenceCoordinate;
96
+ static Coordinate reference_coordinate_double_strand = true;
97
+
98
+ struct referenceCoordinate_st {
99
+ char * name;
100
+ Coordinate start;
101
+ Coordinate finish;
102
+ IDnum referenceID;
103
+ IDnum counter;
104
+ boolean positive_strand;
105
+ } ATTRIBUTE_PACKED;
106
+
107
+ static int compareRefCoords(const void * ptrA, const void * ptrB) {
108
+ ReferenceCoordinate * A = (ReferenceCoordinate *) ptrA;
109
+ ReferenceCoordinate * B = (ReferenceCoordinate *) ptrB;
110
+ int comp = strcmp(A->name, B->name);
111
+
112
+ if (comp != 0)
113
+ return comp;
114
+ else if (!reference_coordinate_double_strand && A->positive_strand != B->positive_strand)
115
+ return A->positive_strand > B->positive_strand;
116
+ else {
117
+ if (A->finish > -1 && A->finish < B->start)
118
+ return -1;
119
+ else if (B->finish > -1 && A->start > B->finish)
120
+ return 1;
121
+ else return 0;
122
+ }
123
+ }
124
+
125
+ typedef struct referenceCoordinateTable_st ReferenceCoordinateTable;
126
+
127
+ struct referenceCoordinateTable_st {
128
+ ReferenceCoordinate * array;
129
+ IDnum arrayLength;
130
+ } ATTRIBUTE_PACKED;
131
+
132
+ static ReferenceCoordinateTable * newReferenceCoordinateTable() {
133
+ ReferenceCoordinateTable * table = callocOrExit(1, ReferenceCoordinateTable);
134
+ table->array = NULL;
135
+ table->arrayLength = 0;
136
+ return table;
137
+ }
138
+
139
+ static void printReferenceCoordinateTableStats(ReferenceCoordinateTable * table) {
140
+ IDnum index;
141
+ IDnum counter = 0;
142
+
143
+ velvetLog("Reference mapping counters\n");
144
+ velvetLog("Name\tRead mappings\n");
145
+
146
+ for (index = 0; index < table->arrayLength; index++) {
147
+ velvetLog("%s\t%li\n", table->array[index].name, (long) table->array[index].counter);
148
+ counter += table->array[index].counter;
149
+ }
150
+
151
+ if (counter == 0) {
152
+ velvetLog("WARNING: None of your read mappings recognized the reference sequence!\n");
153
+ velvetLog("Double check that the names are identical between reference fasta headers and SAM/BAM sequences.\n");
154
+ }
155
+ }
156
+
157
+ static void destroyReferenceCoordinateTable(ReferenceCoordinateTable * table) {
158
+ IDnum index;
159
+
160
+ if (table->array) {
161
+ printReferenceCoordinateTableStats(table);
162
+ for (index = 0; index < table->arrayLength; index++)
163
+ free(table->array[index].name);
164
+ free(table->array);
165
+ }
166
+ free(table);
167
+ }
168
+
169
+ static void resizeReferenceCoordinateTable(ReferenceCoordinateTable * table, IDnum extraLength) {
170
+ if (table->array == NULL)
171
+ table->array = callocOrExit(extraLength, ReferenceCoordinate);
172
+ else
173
+ table->array = reallocOrExit(table->array, table->arrayLength + extraLength, ReferenceCoordinate);
174
+ }
175
+
176
+ static ReferenceCoordinate * findReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
177
+ ReferenceCoordinate * array = table->array;
178
+ ReferenceCoordinate refCoord;
179
+ Coordinate leftIndex = 0;
180
+ Coordinate rightIndex = table->arrayLength - 1;
181
+ Coordinate middleIndex;
182
+
183
+ refCoord.name = name;
184
+ refCoord.start = start;
185
+ refCoord.finish = finish;
186
+ refCoord.referenceID = 0;
187
+ refCoord.positive_strand = positive_strand;
188
+
189
+ while (true) {
190
+ middleIndex = (rightIndex + leftIndex) / 2;
191
+
192
+ if (leftIndex > rightIndex)
193
+ return NULL;
194
+ else if (compareRefCoords(&(array[middleIndex]), &refCoord) == 0)
195
+ return &(array[middleIndex]);
196
+ else if (leftIndex == middleIndex)
197
+ return NULL;
198
+ else if (compareRefCoords(&(array[middleIndex]), &refCoord) > 0)
199
+ rightIndex = middleIndex;
200
+ else
201
+ leftIndex = middleIndex;
202
+ }
203
+ }
204
+
205
+ static void addReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
206
+ ReferenceCoordinate * refCoord;
207
+
208
+ if ((refCoord = findReferenceCoordinate(table, name, start, finish, positive_strand))) {
209
+ velvetLog("Overlapping reference coordinates:\n");
210
+ velvetLog("%s:%lli-%lli\n", name, (long long) start, (long long) finish);
211
+ velvetLog("%s:%lli-%lli\n", refCoord->name, (long long) refCoord->start, (long long) refCoord->finish);
212
+ velvetLog("Exiting...\n");
213
+ #ifdef DEBUG
214
+ abort();
215
+ #endif
216
+ exit(1);
217
+ }
218
+
219
+ refCoord = &(table->array[table->arrayLength++]);
220
+
221
+ refCoord->name = name;
222
+ refCoord->start = start;
223
+ refCoord->finish = finish;
224
+ refCoord->referenceID = table->arrayLength;
225
+ refCoord->positive_strand = positive_strand;
226
+ refCoord->counter = 0;
227
+ }
228
+
229
+ static void sortReferenceCoordinateTable(ReferenceCoordinateTable * table) {
230
+ qsort(table->array, table->arrayLength, sizeof(ReferenceCoordinate), compareRefCoords);
231
+ }
232
+
233
+ //////////////////////////////////////////////////////////////////////////
234
+ // File reading
235
+ //////////////////////////////////////////////////////////////////////////
236
+
237
+ static void velvetifySequence(char * str, SequencesWriter *seqWriteInfo) {
238
+ int i;
239
+ char c;
240
+ size_t length = strlen(str);
241
+
242
+ for (i = 0; i < length; i++) {
243
+ c = str[i];
244
+ switch (c) {
245
+ case '\n':
246
+ case '\r':
247
+ case EOF:
248
+ str[i] = '\0';
249
+ break;
250
+ case 'A':
251
+ case 'a':
252
+ str[i] = 'A';
253
+ break;
254
+ case 'C':
255
+ case 'c':
256
+ str[i] = 'C';
257
+ break;
258
+ case 'G':
259
+ case 'g':
260
+ str[i] = 'G';
261
+ break;
262
+ case 'T':
263
+ case 't':
264
+ str[i] = 'T';
265
+ break;
266
+ default:
267
+ str[i] = 'N';
268
+ }
269
+ // non NULL indicates ref masks are being created
270
+ if (seqWriteInfo->m_referenceMask != NULL) {
271
+ if (str[i] == 'N') {
272
+ if (seqWriteInfo->m_openMask) {
273
+ seqWriteInfo->m_current->finish++;
274
+ } else if (*(seqWriteInfo->m_referenceMask) == NULL) {
275
+ *(seqWriteInfo->m_referenceMask) = newMask(seqWriteInfo, seqWriteInfo->m_position);
276
+ seqWriteInfo->m_current = *(seqWriteInfo->m_referenceMask);
277
+ } else {
278
+ seqWriteInfo->m_current->next = newMask(seqWriteInfo, seqWriteInfo->m_position);
279
+ seqWriteInfo->m_current = seqWriteInfo->m_current->next;
280
+ }
281
+ seqWriteInfo->m_openMask = true;
282
+ seqWriteInfo->m_position += 1;
283
+ } else if (str[i] != '\0') {
284
+ seqWriteInfo->m_openMask = false;
285
+ seqWriteInfo->m_position += 1;
286
+ }
287
+ }
288
+ }
289
+ }
290
+
291
/*
 * Reverses `str` in place and, unless built with COLOR, complements every
 * base: A->T, C->G, G->C (either case), and anything else (including T and
 * N) becomes 'A'.
 *
 * An empty string is returned untouched; previously `length-1` wrapped
 * around for length 0 and the swap loop accessed memory outside the string.
 */
static void reverseComplementSequence(char * str)
{
	size_t length = strlen(str);
	size_t left, right;

	if (length == 0)
		return;

	for (left = 0, right = length - 1; left < right; left++, right--) {
		char c = str[left];
		str[left] = str[right];
		str[right] = c;
	}

#ifndef COLOR
	for (left = 0; left < length; left++) {
		switch (str[left]) {
		case 'A':
		case 'a':
			str[left] = 'T';
			break;
		case 'C':
		case 'c':
			str[left] = 'G';
			break;
		case 'G':
		case 'g':
			str[left] = 'C';
			break;
		// 'T'/'t' and any unexpected character (including 'N') end up as 'A'
		default:
			str[left] = 'A';
			break;
		}
	}
#endif
}
325
+
326
/*
 * Prints `str` to `outfile` in lines of at most 60 characters.
 * Nothing is printed for an empty string.
 */
static void writeFastaSequence(FILE * outfile, const char * str)
{
	size_t remaining = strlen(str);
	const char *cursor = str;

	while (remaining > 0) {
		velvetFprintf(outfile, "%.60s\n", cursor);
		if (remaining <= 60)
			break;
		cursor += 60;
		remaining -= 60;
	}
}
333
+
334
+ void convertSequences(ReadSet * rs)
335
+ {
336
+ rs->tSequences = newTightStringArrayFromStringArray(rs->sequences,
337
+ rs->readCount,
338
+ &rs->tSeqMem);
339
+ rs->sequences = NULL;
340
+ }
341
+
342
// Returns the value of a 32-bit little-endian-stored integer.
// The bytes are combined in unsigned arithmetic: shifting a signed int so
// that a bit lands in the sign position (ptr[3] >= 0x80) is undefined
// behaviour. The final conversion to int assumes a 32-bit two's-complement
// int, as the original code did.
static int int32(const unsigned char * ptr)
{
	unsigned int value = ((unsigned int) ptr[3] << 24)
	    | ((unsigned int) ptr[2] << 16)
	    | ((unsigned int) ptr[1] << 8)
	    | (unsigned int) ptr[0];

	return (int) value;
}
351
+
352
/*
 * Given the buffer `line` just read from `file`, consumes the rest of the
 * current input line if `line` does not already end with a newline.
 *
 * Stops at end of file as well as at '\n': fgetc()'s result is kept in an
 * int, because a char cannot represent EOF and the old loop never ended on
 * a final line without a newline. An empty `line` is treated as "no
 * newline seen" instead of reading line[-1].
 */
void goToEndOfLine(char *line, FILE * file)
{
	size_t length = strlen(line);
	int c;

	if (length > 0 && line[length - 1] == '\n')
		return;

	do {
		c = fgetc(file);
	} while (c != '\n' && c != EOF);
}
360
+
361
+ static void writeSeqName(char*seq_name, SequencesWriter *seqWriteInfo, Category cat, IDnum *sequenceIndex)
362
+ {
363
+ char name[5001];
364
+ if (isCreateBinary()) {
365
+ cnySeqInsertStart(seqWriteInfo);
366
+ sprintf(name, ">%s", seq_name);
367
+ cnySeqInsertSequenceName(name, (long) ((*sequenceIndex)++), seqWriteInfo, cat);
368
+ } else {
369
+ velvetFprintf(seqWriteInfo->m_pFile,">%s\t%ld\t%d\n", seq_name, (long) ((*sequenceIndex)++), (int) cat);
370
+ }
371
+ }
372
+
373
+ static void writeSequence(char*seq, SequencesWriter *seqWriteInfo)
374
+ {
375
+ char str[100];
376
+ velvetifySequence(seq, seqWriteInfo);
377
+ if (isCreateBinary()) {
378
+ cnySeqInsertNucleotideString(seq, seqWriteInfo);
379
+ cnySeqInsertEnd(seqWriteInfo);
380
+ } else {
381
+ Coordinate start = 0;
382
+ while (start <= strlen(seq)) {
383
+ strncpy(str, seq + start, 60);
384
+ str[60] = '\0';
385
+ velvetFprintf(seqWriteInfo->m_pFile, "%s\n", str);
386
+ start += 60;
387
+ }
388
+ }
389
+ }
390
+
391
+ static void initFastX(SequencesWriter *seqWriteInfo, Category cat)
392
+ {
393
+ seqWriteInfo->m_referenceMask = NULL;
394
+ seqWriteInfo->m_position = 0;
395
+ seqWriteInfo->m_openMask = false;
396
+
397
+ // Binary file stuff
398
+ if (isCreateBinary() && (cat == REFERENCE)) {
399
+ seqWriteInfo->m_referenceMask = callocOrExit(1, Mask*);
400
+ }
401
+ if (isCreateBinary()) {
402
+ inputCnySeqFileStart(cat, seqWriteInfo);
403
+ }
404
+ }
405
+
406
+ static void cleanupFastX(SequencesWriter *seqWriteInfo, Category cat)
407
+ {
408
+ if (seqWriteInfo->m_referenceMask) {
409
+ free(seqWriteInfo->m_referenceMask);
410
+ seqWriteInfo->m_referenceMask = NULL;
411
+ }
412
+ }
413
+
414
+
415
+ // Imports sequences from a raw sequence file
416
+ // Memory space allocated within this function.
417
+ static void readRawFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum * sequenceIndex)
418
+ {
419
+ FILE *file;
420
+ const int maxline = 5000;
421
+ char line[5000];
422
+ IDnum counter = 0;
423
+
424
+ initFastX(seqWriteInfo, cat);
425
+
426
+ if (strcmp(filename, "-"))
427
+ file = fopen(filename, "r");
428
+ else
429
+ file = stdin;
430
+
431
+ if (file != NULL)
432
+ velvetLog("Reading raw file %s\n", filename);
433
+ else
434
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
435
+
436
+ while(fgets(line, maxline, file)) {
437
+ if (strlen(line) >= maxline - 1) {
438
+ velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
439
+ #ifdef DEBUG
440
+ abort();
441
+ #endif
442
+ exit(1);
443
+ }
444
+
445
+ writeSeqName("RAW", seqWriteInfo, cat, sequenceIndex);
446
+ writeSequence(line, seqWriteInfo);
447
+ counter++;
448
+ }
449
+ fclose(file);
450
+ cleanupFastX(seqWriteInfo, cat);
451
+ velvetLog("%li reads found.\n", (long) counter);
452
+ velvetLog("Done\n");
453
+ }
454
+
455
+ // Imports sequences from a zipped raw file
456
+ // Memory space allocated within this function.
457
+ static void readRawGZFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex)
458
+ {
459
+ gzFile file;
460
+ const int maxline = 5000;
461
+ char line[5000];
462
+ IDnum counter = 0;
463
+
464
+ initFastX(seqWriteInfo, cat);
465
+ if (strcmp(filename, "-"))
466
+ file = gzopen(filename, "rb");
467
+ else {
468
+ file = gzdopen(fileno(stdin), "rb");
469
+ SET_BINARY_MODE(stdin);
470
+ }
471
+
472
+ if (file != NULL)
473
+ velvetLog("Reading zipped raw sequence file %s\n", filename);
474
+ else
475
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
476
+
477
+ while(gzgets(file, line, maxline)) {
478
+ if (strlen(line) >= maxline - 1) {
479
+ velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
480
+ #ifdef DEBUG
481
+ abort();
482
+ #endif
483
+ exit(1);
484
+ }
485
+
486
+ writeSeqName("RAW", seqWriteInfo, cat, sequenceIndex);
487
+ writeSequence(line, seqWriteInfo);
488
+ counter++;
489
+ }
490
+ gzclose(file);
491
+ cleanupFastX(seqWriteInfo, cat);
492
+ velvetLog("%li reads found.\n", (long) counter);
493
+ velvetLog("Done\n");
494
+ }
495
+
496
+ static void fillReferenceCoordinateTable(char *filename, ReferenceCoordinateTable * refCoords, IDnum counter)
497
+ {
498
+ FILE *file;
499
+ const int maxline = 5000;
500
+ char line[5000];
501
+ char * name;
502
+ long long start, finish;
503
+ Coordinate i;
504
+ IDnum index = 0;
505
+
506
+ if (strcmp(filename, "-") == 0)
507
+ exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence from stdin");
508
+ else
509
+ file = fopen(filename, "r");
510
+
511
+ if (counter == 0)
512
+ return;
513
+
514
+ resizeReferenceCoordinateTable(refCoords,counter);
515
+
516
+ while (fgets(line, maxline, file) && index < counter) {
517
+ if (line[0] == '>') {
518
+ name = callocOrExit(strlen(line), char);
519
+
520
+ if (strchr(line, ':')) {
521
+ sscanf(strtok(line, ":-\r\n\t "), ">%s", name);
522
+ sscanf(strtok(NULL, ":-\r\n\t "), "%lli", &start);
523
+ sscanf(strtok(NULL, ":-\r\n\t "), "%lli", &finish);
524
+ if (start <= finish)
525
+ addReferenceCoordinate(refCoords, name, start, finish, true);
526
+ else
527
+ addReferenceCoordinate(refCoords, name, finish, start, false);
528
+ } else {
529
+ // Chomping EOL characters and comments
530
+ for (i=strlen(line) - 1; i >= 0; i--)
531
+ if (line[i] == '\n' || line[i] == '\r' || line[i] == ' ' || line[i] == '\t')
532
+ line[i] = '\0';
533
+
534
+ strcpy(name, line + 1);
535
+ addReferenceCoordinate(refCoords, name, 1, -1, true);
536
+ }
537
+
538
+ index++;
539
+ }
540
+ }
541
+
542
+ sortReferenceCoordinateTable(refCoords);
543
+ }
544
+
545
+ #define FASTQ 1
546
+ #define FASTA 2
547
+ #define FASTA_GZ 5
548
+ #define FASTQ_GZ 6
549
+ #define SAM 8
550
+ #define BAM 9
551
+ #define RAW 10
552
+ #define RAW_GZ 11
553
+ #define AUTO 12
554
+
555
+ static gzFile openFastXFile(int fileType, char*filename)
556
+ {
557
+ gzFile file;
558
+ char c;
559
+
560
+ // Choose file or stdin
561
+ if (strcmp(filename, "-")==0) {
562
+ file = gzdopen(fileno(stdin), "rb");
563
+ SET_BINARY_MODE(stdin);
564
+ } else {
565
+ file = gzopen(filename, "rb");
566
+ }
567
+
568
+ // Verify filetype
569
+ c = gzgetc(file);
570
+ switch (fileType) {
571
+ case FASTA:
572
+ case FASTA_GZ:
573
+ if (c != EOF && c!='>')
574
+ exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
575
+ break;
576
+ case FASTQ:
577
+ case FASTQ_GZ:
578
+ if (c != EOF && c!='@')
579
+ exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
580
+ break;
581
+ }
582
+ gzungetc(c, file);
583
+
584
+
585
+ if (file != NULL) {
586
+ char *type;
587
+ switch (fileType) {
588
+ case FASTA:
589
+ case FASTA_GZ: type = "FastA"; break;
590
+ case FASTQ:
591
+ case FASTQ_GZ: type = "FastQ"; break;
592
+ default: type = ""; break;
593
+ }
594
+ velvetLog("Reading %s file %s;\n", type, filename);
595
+ } else
596
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
597
+
598
+ return file;
599
+ }
600
+
601
+ typedef struct {
602
+ gzFile gzFile;
603
+ AutoFile *autoFile;
604
+ } FileGZOrAuto;
605
+
606
+ size_t fileGZOrAuto_read(FileGZOrAuto kseq_file, void *ptr, size_t size)
607
+ {
608
+ if (kseq_file.gzFile)
609
+ return gzread(kseq_file.gzFile, ptr, size);
610
+ else
611
+ return fread(ptr, 1, size, kseq_file.autoFile->file);
612
+ }
613
+
614
+ void fileGZOrAuto_close(FileGZOrAuto kseq_file)
615
+ {
616
+ if (kseq_file.gzFile)
617
+ gzclose(kseq_file.gzFile);
618
+ else
619
+ closeFileAuto(kseq_file.autoFile);
620
+ }
621
+
622
/*
 * Maps the first character of a sequence file to a format name:
 * '>' is "FastA", '@' is "FastQ", anything else is "Unknown".
 */
char const* charToType(char c)
{
	if (c == '>')
		return "FastA";

	if (c == '@')
		return "FastQ";

	return "Unknown";
}
630
+
631
+ // Define mode to use kseq in
632
+ KSEQ_INIT(FileGZOrAuto, fileGZOrAuto_read)
633
+
634
+ // Read in FastA or FastQ files in compressed or gz format
635
+ static void readFastXFile(int fileType, SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum * sequenceIndex, ReferenceCoordinateTable * refCoords)
636
+ {
637
+ kseq_t *seq;
638
+ FileGZOrAuto file;
639
+ IDnum counter = 0;
640
+
641
+ file.gzFile = file.autoFile = NULL;
642
+ if (fileType == AUTO) {
643
+ file.autoFile = openFileAuto(filename);
644
+ if (!file.autoFile)
645
+ exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename);
646
+ velvetLog("Reading file '%s' using '%s' as %s\n", filename, file.autoFile->decompressor, charToType(file.autoFile->first_char));
647
+ } else
648
+ file.gzFile = openFastXFile(fileType, filename);
649
+
650
+ initFastX(seqWriteInfo, cat);
651
+ // Read a sequence at a time
652
+ seq = kseq_init(file);
653
+ while (kseq_read(seq) >= 0) {
654
+ counter++;
655
+ writeSeqName(seq->name.s, seqWriteInfo, cat, sequenceIndex);
656
+ writeSequence(seq->seq.s, seqWriteInfo);
657
+ }
658
+
659
+ kseq_destroy(seq);
660
+ fileGZOrAuto_close(file);
661
+
662
+ if (cat == REFERENCE) {
663
+ fillReferenceCoordinateTable(filename, refCoords, counter);
664
+ }
665
+ cleanupFastX(seqWriteInfo, cat);
666
+
667
+ velvetLog("%li sequences found\n", (long) counter);
668
+ velvetLog("Done\n");
669
+ }
670
+
671
+ static void readFastXPair(int fileType, SequencesWriter *seqWriteInfo, char *filename1, char *filename2, Category cat, IDnum * sequenceIndex)
672
+ {
673
+ kseq_t *seq1, *seq2;
674
+ FileGZOrAuto file1, file2;
675
+ IDnum counter = 0;
676
+
677
+ if (cat==REFERENCE)
678
+ exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence in 'separate' read mode");
679
+
680
+ file1.gzFile = file1.autoFile = NULL;
681
+ file2.gzFile = file2.autoFile = NULL;
682
+ if (fileType == AUTO) {
683
+ file1.autoFile = openFileAuto(filename1);
684
+ if (!file1.autoFile)
685
+ exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename1);
686
+ velvetLog("Reading file '%s' using '%s' as %s\n", filename1, file1.autoFile->decompressor, charToType(file1.autoFile->first_char));
687
+ file2.autoFile = openFileAuto(filename2);
688
+ if (!file2.autoFile)
689
+ exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename2);
690
+ velvetLog("Reading file '%s' using '%s' as %s\n", filename2, file2.autoFile->decompressor, charToType(file2.autoFile->first_char));
691
+ } else {
692
+ file1.gzFile = openFastXFile(fileType, filename1);
693
+ file2.gzFile = openFastXFile(fileType, filename2);
694
+ }
695
+ initFastX(seqWriteInfo, cat);
696
+
697
+ // Read a sequence at a time
698
+ seq1 = kseq_init(file1);
699
+ seq2 = kseq_init(file2);
700
+ while (kseq_read(seq1) >= 0) {
701
+ counter++;
702
+ writeSeqName(seq1->name.s, seqWriteInfo, cat, sequenceIndex);
703
+ writeSequence(seq1->seq.s, seqWriteInfo);
704
+
705
+ if (kseq_read(seq2) < 0)
706
+ exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too few sequences", filename2);
707
+
708
+ counter++;
709
+ writeSeqName(seq2->name.s, seqWriteInfo, cat, sequenceIndex);
710
+ writeSequence(seq2->seq.s, seqWriteInfo);
711
+ }
712
+ if (kseq_read(seq2) >= 0)
713
+ exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too many sequences", filename2);
714
+
715
+ kseq_destroy(seq1);
716
+ kseq_destroy(seq2);
717
+
718
+ fileGZOrAuto_close(file1);
719
+ fileGZOrAuto_close(file2);
720
+
721
+ cleanupFastX(seqWriteInfo, cat);
722
+
723
+ velvetLog("%li sequences found in total in the paired sequence files\n", (long) counter);
724
+ velvetLog("Done\n");
725
+ }
726
+
727
+ static void addMapping(boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
728
+ if (isCreateBinary()) {
729
+ seqWriteInfo->m_bIsRef = true;
730
+ RefInfoList *refElem = callocOrExit(1, RefInfoList);
731
+ if (refCoord->positive_strand) {
732
+ refElem->m_elem.m_referenceID = (long) orientation * refCoord->referenceID;
733
+ refElem->m_elem.m_pos = (long long) (pos - refCoord->start);
734
+ } else {
735
+ refElem->m_elem.m_referenceID = (long) -orientation * refCoord->referenceID;
736
+ refElem->m_elem.m_pos = (long long) (refCoord->finish - pos - strlen(seq));
737
+ }
738
+ refElem->next = NULL;
739
+ if (seqWriteInfo->m_refInfoHead == NULL) {
740
+ seqWriteInfo->m_refInfoHead = refElem;
741
+ } else {
742
+ (*refTail)->next = refElem;
743
+ }
744
+ *refTail = refElem;
745
+ seqWriteInfo->m_refCnt++;
746
+ } else {
747
+ if (refCoord->positive_strand) {
748
+ snprintf(buffer, *buffer_size, "%sM\t%li\t%lli\n", buffer, (long) orientation * refCoord->referenceID, (long long) (pos - refCoord->start));
749
+ } else
750
+ snprintf(buffer, *buffer_size, "%sM\t%li\t%lli\n", buffer, (long) - orientation * refCoord->referenceID, (long long) (refCoord->finish - pos - strlen(seq)));
751
+
752
+ if (*buffer_size - strlen(buffer) < 100) {
753
+ *buffer_size += 1000;
754
+ buffer = reallocOrExit(buffer, *buffer_size, char);
755
+ }
756
+ }
757
+
758
+ // Increment counter
759
+ refCoord->counter++;
760
+ }
761
+
762
+ static void writeMappedSequence(IDnum * sequenceIndex, Category cat, Category prev_cat, char * previous_seq, char * previous_qname, char * previous_qname_pairing, char * buffer, SequencesWriter * seqWriteInfo) {
763
+ char print_qname[5000];
764
+ if (isCreateBinary()) {
765
+ if (prev_cat != cat) {
766
+ inputCnySeqFileStart(cat, seqWriteInfo);
767
+ prev_cat = cat;
768
+ }
769
+ cnySeqInsertStart(seqWriteInfo);
770
+ cnySeqInsertNucleotideString(previous_seq, seqWriteInfo);
771
+ sprintf(print_qname, ">%s%s", previous_qname, previous_qname_pairing);
772
+ cnySeqInsertSequenceName(print_qname, (long) ((*sequenceIndex)++), seqWriteInfo, cat);
773
+ cnySeqInsertEnd(seqWriteInfo);
774
+ } else {
775
+ velvetFprintf(seqWriteInfo->m_pFile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
776
+ (long) ((*sequenceIndex)++), (int) cat);
777
+ writeFastaSequence(seqWriteInfo->m_pFile, previous_seq);
778
+ velvetFprintf(seqWriteInfo->m_pFile, "%s", buffer);
779
+ strcpy(buffer, "");
780
+ }
781
+ }
782
+
783
+ static void readCigar(char * cigar, boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
784
+ long long cigar_num;
785
+ int cigar_index;
786
+ char c;
787
+
788
+ if (strlen(cigar) == 1 && cigar[0] == '*')
789
+ ;
790
+ else {
791
+ cigar_num = 0;
792
+ for (cigar_index = 0; cigar_index < strlen(cigar); cigar_index++) {
793
+ c = cigar[cigar_index];
794
+ if (c == 'M' || c == '=' || c == 'X') {
795
+ if (refCoord->finish < 0 || pos < refCoord->finish)
796
+ addMapping(orientation, pos, seq, refCoord, buffer, seqWriteInfo, refTail, buffer_size);
797
+ cigar_num = 0;
798
+ } else if (c == 'S' || c == 'I') {
799
+ pos -= cigar_num;
800
+ cigar_num = 0;
801
+ } else if (c == 'D' || c == 'N') {
802
+ pos += cigar_num;
803
+ cigar_num = 0;
804
+ } else if (c == 'H' || c == 'P') {
805
+ cigar_num = 0;
806
+ } else if (isdigit(c)) {
807
+ cigar_num = 10 * cigar_num + (c - 48);
808
+ } else {
809
+ abort();
810
+ }
811
+ }
812
+ }
813
+ }
814
+
815
+ static void readSAMFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
816
+ {
817
+ char line[5000];
818
+ unsigned long lineno;
819
+ IDnum readCount = 0;
820
+ char previous_qname_pairing[10];
821
+ char previous_qname[5000];
822
+ char previous_seq[5000];
823
+ boolean previous_paired = false;
824
+ Category prev_cat = cat;
825
+ Category apparentCat;
826
+ ReferenceCoordinate * refCoord;
827
+ RefInfoList *refTail = NULL;
828
+ seqWriteInfo->m_referenceMask = NULL; // no ref masks for SAM/BAM
829
+ seqWriteInfo->m_position = 0;
830
+ seqWriteInfo->m_openMask = false;
831
+
832
+ size_t buffer_size = 5000;
833
+ char * buffer = callocOrExit(buffer_size, char);
834
+
835
+ if (cat == REFERENCE) {
836
+ velvetLog("SAM file %s cannot contain reference sequences.\n", filename);
837
+ velvetLog("Please check the command line.\n");
838
+ #ifdef DEBUG
839
+ abort();
840
+ #endif
841
+ exit(1);
842
+ }
843
+
844
+ FILE *file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
845
+ if (file)
846
+ velvetLog("Reading SAM file %s\n", filename);
847
+ else
848
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
849
+ if (isCreateBinary()) {
850
+ inputCnySeqFileStart(cat, seqWriteInfo);
851
+ }
852
+ strcpy(previous_qname, "");
853
+ for (lineno = 1; fgets(line, sizeof(line), file); lineno++) {
854
+ if (line[0] != '@') {
855
+ char *qname, *flag, *seq, *rname, *cigar;
856
+ long long pos;
857
+ int orientation;
858
+ int i;
859
+
860
+ qname = strtok(line, "\t");
861
+ flag = strtok(NULL, "\t");
862
+ rname = strtok(NULL, "\t");
863
+ sscanf(strtok(NULL, "\t"), "%lli", &pos);
864
+ orientation = 1;
865
+
866
+ // Mapping scor
867
+ (void) strtok(NULL, "\t");
868
+ cigar = strtok(NULL, "\t");
869
+
870
+ // Columns 7,8,9 are paired name, position and score
871
+ for (i = 7; i < 10; i++)
872
+ (void) strtok(NULL, "\t");
873
+ seq = strtok(NULL, "\t");
874
+
875
+ if (seq == NULL) {
876
+ velvetFprintf(stderr,
877
+ "Line #%lu: ignoring SAM record with too few fields\n",
878
+ lineno);
879
+ }
880
+ else if (strcmp(seq, "*") == 0) {
881
+ velvetFprintf(stderr,
882
+ "Line #%lu: ignoring SAM record with omitted SEQ field\n",
883
+ lineno);
884
+ }
885
+ else {
886
+ // Accept flags represented in either decimal or hex:
887
+ int flagbits = strtol(flag, NULL, 0);
888
+
889
+ if (flagbits & 0x4)
890
+ strcpy(rname, "");
891
+
892
+ const char *qname_pairing = "";
893
+ if (flagbits & 0x40)
894
+ qname_pairing = "/1";
895
+ else if (flagbits & 0x80)
896
+ qname_pairing = "/2";
897
+
898
+ if (flagbits & 0x10) {
899
+ orientation = -1;
900
+ reverseComplementSequence(seq);
901
+ }
902
+
903
+ // Determine if paired to previous read
904
+ boolean same_name = (strcmp(qname, previous_qname) == 0);
905
+ if (readCount && (!same_name || strcmp(qname_pairing, previous_qname_pairing) != 0)) {
906
+ if (cat % 2 && !same_name && !previous_paired)
907
+ apparentCat = cat - 1;
908
+ else
909
+ apparentCat = cat;
910
+
911
+ previous_paired = (cat % 2 && same_name);
912
+
913
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, buffer, seqWriteInfo);
914
+ prev_cat = apparentCat;
915
+ }
916
+
917
+ if (!(flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, rname, (Coordinate) pos, (Coordinate) pos + strlen(seq) - 1, orientation))) {
918
+ readCigar(cigar, orientation, pos, seq, refCoord, buffer, seqWriteInfo, &refTail, &buffer_size);
919
+ }
920
+
921
+ strcpy(previous_qname, qname);
922
+ strcpy(previous_qname_pairing, qname_pairing);
923
+ strcpy(previous_seq, seq);
924
+ velvetifySequence(previous_seq, seqWriteInfo);
925
+
926
+ readCount++;
927
+ }
928
+ }
929
+ }
930
+
931
+ if (readCount) {
932
+ if (cat % 2 && !previous_paired)
933
+ apparentCat = cat - 1;
934
+ else
935
+ apparentCat = cat;
936
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, buffer, seqWriteInfo);
937
+ }
938
+
939
+ free(buffer);
940
+ fclose(file);
941
+ velvetLog("%lu reads found.\n", (long) readCount);
942
+ velvetLog("Done\n");
943
+ }
944
+
945
+ static int readBAMint32(gzFile file)
946
+ {
947
+ unsigned char buffer[4];
948
+ if (gzread(file, buffer, 4) != 4)
949
+ exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");
950
+
951
+ return int32(buffer);
952
+ }
953
+
954
+ static void readBAMFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
955
+ {
956
+ size_t seqCapacity = 0;
957
+ char *seq = NULL;
958
+ char cigar[5000];
959
+ char cigar_buffer[5000];
960
+ size_t bufferCapacity = 4;
961
+ unsigned char *buffer = mallocOrExit(bufferCapacity, unsigned char);
962
+ unsigned long recno, readCount;
963
+ int i, refCount;
964
+ gzFile file;
965
+ char previous_qname_pairing[10];
966
+ char previous_qname[5000];
967
+ char previous_seq[5000];
968
+ boolean previous_paired = false;
969
+ Category prev_cat = cat;
970
+ Category apparentCat;
971
+ char ** refNames;
972
+ ReferenceCoordinate * refCoord;
973
+ seqWriteInfo->m_referenceMask = NULL; // no ref masks for SAM/BAM
974
+ seqWriteInfo->m_position = 0;
975
+ seqWriteInfo->m_openMask = false;
976
+
977
+ RefInfoList *refTail = NULL;
978
+ size_t mapBuffer_size = 1000;
979
+ char * mapBuffer = callocOrExit(mapBuffer_size, char);
980
+
981
+ if (cat == REFERENCE) {
982
+ velvetLog("BAM file %s cannot contain reference sequences.\n", filename);
983
+ velvetLog("Please check the command line.\n");
984
+ #ifdef DEBUG
985
+ abort();
986
+ #endif
987
+ exit(1);
988
+ }
989
+
990
+ if (strcmp(filename, "-") != 0)
991
+ file = gzopen(filename, "rb");
992
+ else {
993
+ file = gzdopen(fileno(stdin), "rb");
994
+ SET_BINARY_MODE(stdin);
995
+ }
996
+
997
+ if (file != NULL)
998
+ velvetLog("Reading BAM file %s\n", filename);
999
+ else
1000
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1001
+
1002
+ if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
1003
+ exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);
1004
+
1005
+ // Skip header text
1006
+ if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
1007
+ exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1008
+
1009
+ // Skip header reference list
1010
+ refCount = readBAMint32(file);
1011
+ refNames = callocOrExit(refCount, char *);
1012
+ for (i = 0; i < refCount; i++) {
1013
+ int strLength;
1014
+
1015
+ if (gzread(file, buffer, 4) != 4)
1016
+ exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1017
+
1018
+ strLength = int32(buffer);
1019
+ refNames[i] = callocOrExit(strLength, char);
1020
+
1021
+ if (bufferCapacity < 4 + strLength) {
1022
+ bufferCapacity = 4 + strLength + 4096;
1023
+ buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1024
+ }
1025
+
1026
+ if (gzread(file, buffer, 4 + strLength) != 4 + strLength)
1027
+ exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1028
+
1029
+ strcpy(refNames[i], (char *) buffer);
1030
+ }
1031
+ if (isCreateBinary()) {
1032
+ inputCnySeqFileStart(cat, seqWriteInfo);
1033
+ }
1034
+ strcpy(previous_qname, "");
1035
+ readCount = 0;
1036
+ for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
1037
+ int blockSize = int32(buffer);
1038
+ int readLength;
1039
+
1040
+ if (bufferCapacity < 4 + blockSize) {
1041
+ bufferCapacity = 4 + blockSize + 4096;
1042
+ buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1043
+ }
1044
+
1045
+ if (gzread(file, &buffer[4], blockSize) != blockSize)
1046
+ exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1047
+
1048
+ readLength = int32(&buffer[20]);
1049
+ if (readLength == 0) {
1050
+ velvetFprintf(stderr,
1051
+ "Record #%lu: ignoring BAM record with omitted SEQ field\n",
1052
+ recno);
1053
+ }
1054
+ else {
1055
+ int readNameLength = buffer[12];
1056
+ int flag_nc = int32(&buffer[16]);
1057
+ int flagbits = flag_nc >> 16;
1058
+ int cigarLength = flag_nc & 0xffff;
1059
+ char *qname = (char *)&buffer[36];
1060
+ uint32_t *rawcigar = (uint32_t *) &buffer[36 + readNameLength];
1061
+ unsigned char *rawseq =
1062
+ &buffer[36 + readNameLength + 4 * cigarLength];
1063
+ int rID = int32(&buffer[4]);
1064
+ // NOTE: BAM file coords are 0-based, not 1-based like SAM files
1065
+ // No comment
1066
+ long long pos = int32(&buffer[8]) + 1;
1067
+ int orientation = 1;
1068
+
1069
+ const char *qname_pairing = "";
1070
+ if (flagbits & 0x40)
1071
+ qname_pairing = "/1";
1072
+ else if (flagbits & 0x80)
1073
+ qname_pairing = "/2";
1074
+
1075
+ strcpy(cigar, "");
1076
+ for (i = 0; i < cigarLength; i++) {
1077
+ static const char decode_ops[] = "MIDNSHP=X";
1078
+ uint32_t packed = *(rawcigar++);
1079
+ sprintf(cigar_buffer, "%i%c", packed >> 4, decode_ops[packed & 0xf]);
1080
+ strcat(cigar, cigar_buffer);
1081
+ }
1082
+
1083
+ if (seqCapacity < readLength + 1) {
1084
+ seqCapacity = readLength * 2 + 1;
1085
+ seq = reallocOrExit(seq, seqCapacity, char);
1086
+ }
1087
+
1088
+ for (i = 0; i < readLength; i += 2) {
1089
+ static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
1090
+ unsigned int packed = *(rawseq++);
1091
+ seq[i] = decode_bases[packed >> 4];
1092
+ seq[i+1] = decode_bases[packed & 0xf];
1093
+ }
1094
+ seq[readLength] = '\0';
1095
+
1096
+ if (flagbits & 0x10) {
1097
+ orientation = -1;
1098
+ reverseComplementSequence(seq);
1099
+ }
1100
+
1101
+ // Determine if paired to previous read
1102
+ boolean same_name = (strcmp(qname, previous_qname) == 0);
1103
+ if (readCount > 0 && (!same_name || strcmp(qname_pairing, previous_qname_pairing) != 0)) {
1104
+ if (cat % 2 && !same_name && !previous_paired)
1105
+ apparentCat = cat - 1;
1106
+ else
1107
+ apparentCat = cat;
1108
+
1109
+ previous_paired = (cat % 2 && same_name);
1110
+
1111
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, mapBuffer, seqWriteInfo);
1112
+ prev_cat = apparentCat;
1113
+ }
1114
+
1115
+ if (!(flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, refNames[rID], (Coordinate) pos, (Coordinate) pos + strlen(seq) - 1, orientation)))
1116
+ readCigar(cigar, orientation, pos, seq, refCoord, mapBuffer, seqWriteInfo, &refTail, &mapBuffer_size);
1117
+
1118
+ strcpy(previous_qname, qname);
1119
+ strcpy(previous_qname_pairing, qname_pairing);
1120
+ strcpy(previous_seq, seq);
1121
+ velvetifySequence(previous_seq, seqWriteInfo);
1122
+
1123
+ readCount++;
1124
+ }
1125
+ }
1126
+
1127
+ if (readCount) {
1128
+ if (cat % 2 && !previous_paired)
1129
+ apparentCat = cat - 1;
1130
+ else
1131
+ apparentCat = cat;
1132
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, mapBuffer, seqWriteInfo);
1133
+ }
1134
+
1135
+ free(seq);
1136
+ free(buffer);
1137
+ free(mapBuffer);
1138
+
1139
+ gzclose(file);
1140
+ velvetLog("%lu reads found.\n", readCount);
1141
+ velvetLog("Done\n");
1142
+ }
1143
+
1144
+
1145
+ static void printUsage()
1146
+ {
1147
+ puts("Usage:");
1148
+ puts("./velveth directory hash_length {[-file_format][-read_type][-separate|-interleaved] filename} [options]");
1149
+ puts("");
1150
+ puts("\tdirectory\t\t: directory name for output files");
1151
+ printf("\thash_length\t\t: odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
1152
+ puts("\tfilename\t\t: path to sequence file or - for standard input");
1153
+ puts("");
1154
+ puts("File format options:");
1155
+ puts("\t-fasta");
1156
+ puts("\t-fastq");
1157
+ puts("\t-raw");
1158
+ puts("\t-fasta.gz");
1159
+ puts("\t-fastq.gz");
1160
+ puts("\t-raw.gz");
1161
+ puts("\t-sam");
1162
+ puts("\t-bam");
1163
+ puts("\t-fmtAuto");
1164
+ puts("");
1165
+ puts("Read type options:");
1166
+ puts("\t-short");
1167
+ puts("\t-shortPaired");
1168
+ puts("\t-short2");
1169
+ puts("\t-shortPaired2");
1170
+ puts("\t-long");
1171
+ puts("\t-longPaired");
1172
+ puts("\t-reference");
1173
+ puts("");
1174
+ puts("Options:");
1175
+ puts("\t-strand_specific\t: for strand specific transcriptome sequencing data (default: off)");
1176
+ puts("");
1177
+ puts("Output:");
1178
+ puts("\tdirectory/Roadmaps");
1179
+ puts("\tdirectory/Sequences");
1180
+ puts("\t\t[Both files are picked up by graph, so please leave them there]");
1181
+ }
1182
+
1183
+ // General argument parser for most functions
1184
+ // Basically a reused portion of toplevel code dumped into here
1185
+ void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand, boolean * noHash)
1186
+ {
1187
+ int argIndex = 1;
1188
+ int filetype = FASTA;
1189
+ Category cat = 0;
1190
+ IDnum sequenceIndex = 1;
1191
+ short short_var;
1192
+ ReferenceCoordinateTable * refCoords = newReferenceCoordinateTable();
1193
+ boolean reuseSequences = false;
1194
+ boolean separate_pair_files = false;
1195
+
1196
+ if (argc < 2) {
1197
+ printUsage();
1198
+ #ifdef DEBUG
1199
+ abort();
1200
+ #endif
1201
+ exit(1);
1202
+ }
1203
+
1204
+ for (argIndex = 1; argIndex < argc; argIndex++) {
1205
+ if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1206
+ *double_strand = false;
1207
+ reference_coordinate_double_strand = false;
1208
+ } else if (strcmp(argv[argIndex], "-reuse_Sequences") == 0) {
1209
+ reuseSequences = true;
1210
+ } else if (strcmp(argv[argIndex], "-reuse_binary") == 0) {
1211
+ reuseSequences = true;
1212
+ } else if (strcmp(argv[argIndex], "-noHash") == 0) {
1213
+ *noHash = true;
1214
+ }
1215
+ }
1216
+
1217
+ if (reuseSequences)
1218
+ return;
1219
+
1220
+ SequencesWriter * seqWriteInfo = NULL;
1221
+ if (isCreateBinary()) {
1222
+ seqWriteInfo = openCnySeqForWrite(filename);
1223
+ seqWriteInfo->m_unifiedSeqFileHeader.m_bDoubleStrand = *double_strand;
1224
+ // file is already open
1225
+ } else {
1226
+ seqWriteInfo = callocOrExit(1, SequencesWriter);
1227
+ seqWriteInfo->m_pFile = fopen(filename, "w");
1228
+ }
1229
+
1230
+ for (argIndex = 1; argIndex < argc; argIndex++) {
1231
+ if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
1232
+
1233
+ if (strcmp(argv[argIndex], "-fastq") == 0)
1234
+ filetype = FASTQ;
1235
+ else if (strcmp(argv[argIndex], "-fasta") == 0)
1236
+ filetype = FASTA;
1237
+ else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
1238
+ filetype = FASTQ_GZ;
1239
+ else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
1240
+ filetype = FASTA_GZ;
1241
+ else if (strcmp(argv[argIndex], "-sam") == 0)
1242
+ filetype = SAM;
1243
+ else if (strcmp(argv[argIndex], "-bam") == 0)
1244
+ filetype = BAM;
1245
+ else if (strcmp(argv[argIndex], "-raw") == 0)
1246
+ filetype = RAW;
1247
+ else if (strcmp(argv[argIndex], "-raw.gz") == 0)
1248
+ filetype = RAW_GZ;
1249
+ else if (strcmp(argv[argIndex], "-fmtAuto") == 0)
1250
+ filetype = AUTO;
1251
+ else if (strcmp(argv[argIndex], "-short") == 0)
1252
+ cat = 0;
1253
+ else if (strcmp(argv[argIndex], "-shortPaired") ==
1254
+ 0)
1255
+ cat = 1;
1256
+ else if (strncmp
1257
+ (argv[argIndex], "-shortPaired",
1258
+ 12) == 0) {
1259
+ sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
1260
+ cat = (Category) short_var;
1261
+ if (cat < 1 || cat > CATEGORIES) {
1262
+ velvetLog("Unknown option: %s\n",
1263
+ argv[argIndex]);
1264
+ #ifdef DEBUG
1265
+ abort();
1266
+ #endif
1267
+ exit(1);
1268
+ }
1269
+ cat--;
1270
+ cat *= 2;
1271
+ cat++;
1272
+ } else if (strncmp(argv[argIndex], "-short", 6) ==
1273
+ 0) {
1274
+ sscanf(argv[argIndex], "-short%hd", &short_var);
1275
+ cat = (Category) short_var;
1276
+ if (cat < 1 || cat > CATEGORIES) {
1277
+ velvetLog("Unknown option: %s\n",
1278
+ argv[argIndex]);
1279
+ #ifdef DEBUG
1280
+ abort();
1281
+ #endif
1282
+ exit(1);
1283
+ }
1284
+ cat--;
1285
+ cat *= 2;
1286
+ } else if (strcmp(argv[argIndex], "-long") == 0)
1287
+ cat = LONG; // CATEGORIES * 2;
1288
+ else if (strcmp(argv[argIndex], "-longPaired") == 0)
1289
+ cat = LONG_PAIRED; // CATEGORIES * 2 + 1;
1290
+ else if (strcmp(argv[argIndex], "-reference") == 0)
1291
+ cat = REFERENCE; // CATEGORIES * 2 + 2
1292
+ else if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1293
+ *double_strand = false;
1294
+ reference_coordinate_double_strand = false;
1295
+ } else if (strcmp(argv[argIndex], "-noHash") == 0) {
1296
+ ;
1297
+ } else if (strcmp(argv[argIndex], "-create_binary") == 0) {
1298
+ ;
1299
+ } else if (strcmp(argv[argIndex], "-interleaved") == 0) {
1300
+ separate_pair_files = false;
1301
+ } else if (strcmp(argv[argIndex], "-separate") == 0) {
1302
+ separate_pair_files = true;
1303
+ }
1304
+ else {
1305
+ velvetLog("Unknown option: %s\n",
1306
+ argv[argIndex]);
1307
+ #ifdef DEBUG
1308
+ abort();
1309
+ #endif
1310
+ exit(1);
1311
+ }
1312
+
1313
+ continue;
1314
+ }
1315
+
1316
+ if (cat == -1)
1317
+ continue;
1318
+
1319
+ switch (filetype) {
1320
+ case FASTA:
1321
+ case FASTQ:
1322
+ case FASTA_GZ:
1323
+ case FASTQ_GZ:
1324
+ case AUTO:
1325
+ // Separate files for paired reads? Note odd categories used for paired read type
1326
+ if (separate_pair_files && cat%2==1) {
1327
+ argIndex++;
1328
+ if (argIndex>=argc)
1329
+ exitErrorf(EXIT_FAILURE, false, "Require left & right filename for -separate mode");
1330
+ readFastXPair(filetype, seqWriteInfo, argv[argIndex-1], argv[argIndex], cat, &sequenceIndex);
1331
+ } else {
1332
+ readFastXFile(filetype, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
1333
+ }
1334
+ break;
1335
+ case RAW:
1336
+ if (separate_pair_files && cat%2==1) {
1337
+ exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
1338
+ }
1339
+ readRawFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
1340
+ break;
1341
+ case RAW_GZ:
1342
+ if (separate_pair_files && cat%2==1) {
1343
+ exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
1344
+ }
1345
+ readRawGZFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
1346
+ break;
1347
+ case SAM:
1348
+ readSAMFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
1349
+ break;
1350
+ case BAM:
1351
+ readBAMFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
1352
+ break;
1353
+ default:
1354
+ velvetLog("Screw up in parser... exiting\n");
1355
+ #ifdef DEBUG
1356
+ abort();
1357
+ #endif
1358
+ exit(1);
1359
+ }
1360
+ }
1361
+
1362
+ destroyReferenceCoordinateTable(refCoords);
1363
+ if (isCreateBinary()) {
1364
+ closeCnySeqForWrite(seqWriteInfo);
1365
+ } else {
1366
+ fclose(seqWriteInfo->m_pFile);
1367
+ }
1368
+ if (seqWriteInfo) {
1369
+ free(seqWriteInfo);
1370
+ }
1371
+ }
1372
+
1373
+ void createReadPairingArray(ReadSet* reads)
1374
+ {
1375
+ IDnum index;
1376
+ IDnum *mateReads = mallocOrExit(reads->readCount, IDnum);
1377
+ Category cat = 0;
1378
+ int phase = 0;
1379
+
1380
+ for (index = 0; index < reads->readCount; index++)
1381
+ mateReads[index] = -1;
1382
+
1383
+ reads->mateReads = mateReads;
1384
+
1385
+ for (index = 0; index < reads->readCount; index++)
1386
+ {
1387
+ // Paired category
1388
+ if (cat & 1)
1389
+ {
1390
+ // Leaving the paired category
1391
+ if (reads->categories[index] != cat)
1392
+ {
1393
+ if (phase == 1)
1394
+ {
1395
+ reads->mateReads[index - 1] = -1;
1396
+ reads->categories[index - 1]--;
1397
+ phase = 0;
1398
+ }
1399
+ cat = reads->categories[index];
1400
+ // Into another paired category
1401
+ if (cat & 1)
1402
+ {
1403
+ reads->mateReads[index] = index + 1;
1404
+ phase = 1;
1405
+ }
1406
+ }
1407
+ else if (phase == 0)
1408
+ {
1409
+ reads->mateReads[index] = index + 1;
1410
+ phase = 1;
1411
+ }
1412
+ else
1413
+ {
1414
+ reads->mateReads[index] = index - 1;
1415
+ phase = 0;
1416
+ }
1417
+ }
1418
+ // Leaving an unpaired category
1419
+ else if (reads->categories[index] != cat)
1420
+ {
1421
+ cat = reads->categories[index];
1422
+ // Into a paired category
1423
+ if (cat & 1)
1424
+ {
1425
+ reads->mateReads[index] = index + 1;
1426
+ phase = 1;
1427
+ }
1428
+ }
1429
+ }
1430
+ }
1431
+
1432
+ int pairedCategories(ReadSet * reads)
1433
+ {
1434
+ boolean pairedCat[CATEGORIES + 1];
1435
+ int pairedCatCount = 0;
1436
+ IDnum index;
1437
+
1438
+ for (index = 0; index <= CATEGORIES; index++)
1439
+ pairedCat[index] = 0;
1440
+
1441
+ for (index = 0; index < reads->readCount; index++) {
1442
+ if (reads->categories[index] & 1 && !pairedCat[reads->categories[index] / 2]) {
1443
+ pairedCat[reads->categories[index] / 2] = true;
1444
+ if (pairedCatCount++ == CATEGORIES)
1445
+ break;
1446
+ }
1447
+ }
1448
+
1449
+ return pairedCatCount;
1450
+ }
1451
+
1452
+ boolean isSecondInPair(ReadSet * reads, IDnum index)
1453
+ {
1454
+ return reads->secondInPair[index / 8] & (1 << (index & 7));
1455
+ }
1456
+
1457
+ void computeSecondInPair(ReadSet * reads)
1458
+ {
1459
+ IDnum index;
1460
+ Category currentCat = 0;
1461
+ Category previousCat = 0;
1462
+ int phase = 0;
1463
+
1464
+ if (reads->secondInPair)
1465
+ free (reads->secondInPair);
1466
+ reads->secondInPair = callocOrExit((reads->readCount + 7) / 8, unsigned char);
1467
+
1468
+ for (index = 0; index < reads->readCount; index++)
1469
+ {
1470
+ currentCat = reads->categories[index];
1471
+ if (currentCat & 1)
1472
+ {
1473
+ if (previousCat == currentCat)
1474
+ {
1475
+ if (phase == 0)
1476
+ {
1477
+ phase = 1;
1478
+ }
1479
+ else
1480
+ {
1481
+ reads->secondInPair[index / 8] |= (1 << (index & 7));
1482
+ phase = 0;
1483
+ }
1484
+ }
1485
+ else {
1486
+ phase = 1;
1487
+ if (index > 0 && previousCat & 1 && !isSecondInPair(reads, index - 1))
1488
+ reads->categories[index - 1] = (reads->categories[index - 1] / 2) * 2;
1489
+ }
1490
+ }
1491
+ previousCat = currentCat;
1492
+ }
1493
+
1494
+ // Safeguard against odd sets of reads
1495
+ if (!isSecondInPair(reads, reads->readCount - 1)) {
1496
+ reads->categories[reads->readCount - 1] = (reads->categories[reads->readCount - 1] / 2) * 2;
1497
+ }
1498
+ }
1499
+
1500
+ void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
1501
+ {
1502
+ IDnum index;
1503
+ IDnum pairID;
1504
+ IDnum sequenceCount = reads->readCount;
1505
+ IDnum *mateReads = reads->mateReads;
1506
+
1507
+ if (dubiousReads == NULL || mateReads == NULL)
1508
+ return;
1509
+
1510
+ for (index = 0; index < sequenceCount; index++) {
1511
+ if (!dubiousReads[index] || reads->categories[index] % 2 == 0 )
1512
+ continue;
1513
+
1514
+ if (isSecondInPair(reads, index))
1515
+ pairID = index - 1;
1516
+ else
1517
+ pairID = index + 1;
1518
+
1519
+ reads->categories[index] = (reads->categories[index] / 2) * 2;
1520
+ reads->categories[pairID] = (reads->categories[pairID] / 2) * 2;
1521
+ }
1522
+ }
1523
+
1524
+ ReadSet *importReadSet(char *filename)
1525
+ {
1526
+ FILE *file = fopen(filename, "r");
1527
+ char *sequence = NULL;
1528
+ Coordinate bpCount = 0;
1529
+ const int maxline = 5000;
1530
+ char line[5000];
1531
+ IDnum sequenceCount, sequenceIndex;
1532
+ ReadSet *reads;
1533
+ short int temp_short;
1534
+ int lineLength;
1535
+
1536
+ if (file != NULL)
1537
+ velvetLog("Reading read set file %s;\n", filename);
1538
+ else
1539
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1540
+
1541
+ reads = newReadSet();
1542
+
1543
+ // Count number of separate sequences
1544
+ sequenceCount = 0;
1545
+ while (fgets(line, maxline, file) != NULL)
1546
+ if (line[0] == '>')
1547
+ sequenceCount++;
1548
+ fclose(file);
1549
+ velvetLog("%li sequences found\n", (long) sequenceCount);
1550
+
1551
+ reads->readCount = sequenceCount;
1552
+
1553
+ if (reads->readCount == 0) {
1554
+ reads->sequences = NULL;
1555
+ reads->categories = NULL;
1556
+ return reads;
1557
+ }
1558
+
1559
+ reads->sequences = callocOrExit(sequenceCount, char *);
1560
+ reads->categories = callocOrExit(sequenceCount, Category);
1561
+ // Counting base pair length of each sequence:
1562
+ file = fopen(filename, "r");
1563
+ sequenceIndex = -1;
1564
+ while (fgets(line, maxline, file) != NULL) {
1565
+ if (line[0] == '>') {
1566
+
1567
+ // Reading category info
1568
+ sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
1569
+ &temp_short);
1570
+ reads->categories[sequenceIndex + 1] = (Category) temp_short;
1571
+
1572
+ if (sequenceIndex != -1)
1573
+ reads->sequences[sequenceIndex] =
1574
+ mallocOrExit(bpCount + 1, char);
1575
+ sequenceIndex++;
1576
+ bpCount = 0;
1577
+ } if (line[0] == 'M') {;
1578
+ // Map line
1579
+ } else {
1580
+ bpCount += (Coordinate) strlen(line) - 1;
1581
+
1582
+ if (sizeof(ShortLength) == sizeof(int16_t) && (bpCount > SHRT_MAX || bpCount < 0)) {
1583
+ velvetLog("Read %li of length %lli, longer than limit %i\n",
1584
+ (long) sequenceIndex + 1, (long long) bpCount, SHRT_MAX);
1585
+ velvetLog("You should modify recompile with the LONGSEQUENCES option (cf. manual)\n");
1586
+ exit(1);
1587
+ }
1588
+ }
1589
+ }
1590
+
1591
+ //velvetLog("Sequence %d has length %d\n", sequenceIndex, bpCount);
1592
+ reads->sequences[sequenceIndex] =
1593
+ mallocOrExit(bpCount + 1, char);
1594
+ fclose(file);
1595
+
1596
+ // Reopen file and memorize line:
1597
+ file = fopen(filename, "r");
1598
+ sequenceIndex = -1;
1599
+ while (fgets(line, maxline, file)) {
1600
+ if (line[0] == '>') {
1601
+ if (sequenceIndex != -1) {
1602
+ sequence[bpCount] = '\0';
1603
+ }
1604
+ sequenceIndex++;
1605
+ bpCount = 0;
1606
+ //velvetLog("Starting to read sequence %d\n",
1607
+ // sequenceIndex);
1608
+ sequence = reads->sequences[sequenceIndex];
1609
+ } else if (line[0] == 'M') {;
1610
+ // Map line
1611
+ } else {
1612
+ lineLength = strlen(line) - 1;
1613
+ strncpy(sequence + bpCount, line, lineLength);
1614
+ bpCount += (Coordinate) lineLength;
1615
+ }
1616
+ }
1617
+
1618
+ sequence[bpCount] = '\0';
1619
+ fclose(file);
1620
+ computeSecondInPair(reads);
1621
+
1622
+ velvetLog("Done\n");
1623
+ return reads;
1624
+
1625
+ }
1626
+
1627
+ void logInstructions(int argc, char **argv, char *directory)
1628
+ {
1629
+ int index;
1630
+ char *logFilename =
1631
+ mallocOrExit(strlen(directory) + 100, char);
1632
+ FILE *logFile;
1633
+ time_t date;
1634
+ char *string;
1635
+
1636
+ time(&date);
1637
+ string = ctime(&date);
1638
+
1639
+ strcpy(logFilename, directory);
1640
+ strcat(logFilename, "/Log");
1641
+ logFile = fopen(logFilename, "a");
1642
+
1643
+ if (logFile == NULL)
1644
+ exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
1645
+
1646
+ velvetFprintf(logFile, "%s", string);
1647
+
1648
+ for (index = 0; index < argc; index++)
1649
+ velvetFprintf(logFile, " %s", argv[index]);
1650
+
1651
+ velvetFprintf(logFile, "\n");
1652
+
1653
+ velvetFprintf(logFile, "Version %i.%i.%2.2i%s\n", VERSION_NUMBER,
1654
+ RELEASE_NUMBER, UPDATE_NUMBER, VERSION_BRANCH);
1655
+ velvetFprintf(logFile, "Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)\n");
1656
+ velvetFprintf(logFile, "This is free software; see the source for copying conditions. There is NO\n");
1657
+ velvetFprintf(logFile, "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
1658
+ velvetFprintf(logFile, "Compilation settings:\n");
1659
+ velvetFprintf(logFile, "CATEGORIES = %i\n", CATEGORIES);
1660
+ velvetFprintf(logFile, "MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
1661
+ #ifdef _OPENMP
1662
+ velvetFprintf(logFile, "OPENMP\n");
1663
+ #endif
1664
+ #ifdef LONGSEQUENCES
1665
+ velvetFprintf(logFile, "LONGSEQUENCES\n");
1666
+ #endif
1667
+ #ifdef BIGASSEMBLY
1668
+ velvetFprintf(logFile, "BIGASSEMBLY\n");
1669
+ #endif
1670
+ #ifdef COLOR
1671
+ velvetFprintf(logFile, "COLOR\n");
1672
+ #endif
1673
+ #ifdef DEBUG
1674
+ velvetFprintf(logFile, "DEBUG\n");
1675
+ #endif
1676
+ velvetFprintf(logFile, "\n");
1677
+
1678
+ fclose(logFile);
1679
+ free(logFilename);
1680
+ }
1681
+
1682
+ void destroyReadSet(ReadSet * reads)
1683
+ {
1684
+ IDnum index;
1685
+
1686
+ if (reads == NULL)
1687
+ return;
1688
+
1689
+ if (reads->sequences != NULL)
1690
+ {
1691
+ for (index = 0; index < reads->readCount; index++)
1692
+ free(reads->sequences[index]);
1693
+ free(reads->sequences);
1694
+ }
1695
+
1696
+ if (reads->tSequences != NULL)
1697
+ free (reads->tSequences);
1698
+
1699
+ if (reads->tSeqMem != NULL)
1700
+ free (reads->tSeqMem);
1701
+
1702
+ if (reads->labels != NULL)
1703
+ for (index = 0; index < reads->readCount; index++)
1704
+ free(reads->labels[index]);
1705
+
1706
+ if (reads->confidenceScores != NULL)
1707
+ for (index = 0; index < reads->readCount; index++)
1708
+ free(reads->confidenceScores[index]);
1709
+
1710
+ if (reads->kmerProbabilities != NULL)
1711
+ for (index = 0; index < reads->readCount; index++)
1712
+ free(reads->kmerProbabilities[index]);
1713
+
1714
+ free(reads->labels);
1715
+ free(reads->confidenceScores);
1716
+ free(reads->kmerProbabilities);
1717
+ free(reads->mateReads);
1718
+ free(reads->categories);
1719
+ free(reads->secondInPair);
1720
+ free(reads);
1721
+ }
1722
+
1723
+ ShortLength *getSequenceLengths(ReadSet * reads, int wordLength)
1724
+ {
1725
+ ShortLength *lengths = callocOrExit(reads->readCount, ShortLength);
1726
+ IDnum index;
1727
+ int lengthOffset = wordLength - 1;
1728
+
1729
+ for (index = 0; index < reads->readCount; index++)
1730
+ lengths[index] =
1731
+ getLength(getTightStringInArray(reads->tSequences, index)) - lengthOffset;
1732
+
1733
+ return lengths;
1734
+ }