finishm 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (554) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,30 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #ifndef _READCOHERENTGRAPH_H_
22
+ #define _READCOHERENTGRAPH_H_
23
+
24
+ void readCoherentGraph(Graph * graph, boolean(*isUnique) (Node * node),
25
+ double coverage, ReadSet * reads);
26
+
27
+ boolean isUniqueSolexa(Node * node);
28
+
29
+ void setMultiplicityCutoff(int value);
30
+ #endif
@@ -0,0 +1,1734 @@
1
+ /*
2
+ Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
+
4
+ This file is part of Velvet.
5
+
6
+ Velvet is free software; you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation; either version 2 of the License, or
9
+ (at your option) any later version.
10
+
11
+ Velvet is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with Velvet; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
+
20
+ */
21
+ #include <stdlib.h>
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <math.h>
25
+ #include <time.h>
26
+ #include <limits.h>
27
+ #include <ctype.h>
28
+
29
+ #include "globals.h"
30
+ #include "tightString.h"
31
+ #include "readSet.h"
32
+ #include "utility.h"
33
+ #include "binarySequences.h"
34
+ #include "autoOpen.h"
35
+ #include "kseq.h"
36
+
37
+ #if !defined(BUNDLEDZLIB)
38
+ #include <zlib.h>
39
+ #elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
40
+ #include "../third-party/zlib-1.2.3/Win32/include/zlib.h"
41
+ #else
42
+ #include "../third-party/zlib-1.2.3/zlib.h"
43
+ #endif
44
+
45
+ #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
46
+ # include <fcntl.h>
47
+ # include <io.h>
48
+ # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
49
+ #else
50
+ # define SET_BINARY_MODE(file)
51
+ #endif
52
+
53
+ static Mask *allocateMask(SequencesWriter *seqWriteInfo)
54
+ {
55
+ if (seqWriteInfo->m_maskMemory == NULL)
56
+ seqWriteInfo->m_maskMemory = newRecycleBin(sizeof(Mask), 10000);
57
+
58
+ return (Mask *) allocatePointer(seqWriteInfo->m_maskMemory);
59
+ }
60
+
61
+ static Mask * newMask(SequencesWriter *seqWriteInfo, Coordinate position)
62
+ {
63
+ Mask * mask = allocateMask(seqWriteInfo);
64
+ mask->start = position;
65
+ mask->finish = position;
66
+ mask->next = NULL;
67
+ return mask;
68
+ }
69
+
70
+ //
71
+ // cmd line args can override the createBinary flag
72
+ // note that createBinary is only used by velveth
73
+ //
74
+ boolean createBinary = false;
75
+ boolean isCreateBinary()
76
+ {
77
+ return createBinary;
78
+ }
79
+
80
+ void setCreateBinary(boolean val)
81
+ {
82
+ createBinary = val;
83
+ }
84
+
85
+ ReadSet *newReadSet()
86
+ {
87
+ ReadSet *rs = callocOrExit(1, ReadSet);
88
+ return rs;
89
+ }
90
+
91
+ //////////////////////////////////////////////////////////////////////////
92
+ // Reference identifiers
93
+ //////////////////////////////////////////////////////////////////////////
94
+
95
+ typedef struct referenceCoordinate_st ReferenceCoordinate;
96
+ static Coordinate reference_coordinate_double_strand = true;
97
+
98
+ struct referenceCoordinate_st {
99
+ char * name;
100
+ Coordinate start;
101
+ Coordinate finish;
102
+ IDnum referenceID;
103
+ IDnum counter;
104
+ boolean positive_strand;
105
+ } ATTRIBUTE_PACKED;
106
+
107
+ static int compareRefCoords(const void * ptrA, const void * ptrB) {
108
+ ReferenceCoordinate * A = (ReferenceCoordinate *) ptrA;
109
+ ReferenceCoordinate * B = (ReferenceCoordinate *) ptrB;
110
+ int comp = strcmp(A->name, B->name);
111
+
112
+ if (comp != 0)
113
+ return comp;
114
+ else if (!reference_coordinate_double_strand && A->positive_strand != B->positive_strand)
115
+ return A->positive_strand > B->positive_strand;
116
+ else {
117
+ if (A->finish > -1 && A->finish < B->start)
118
+ return -1;
119
+ else if (B->finish > -1 && A->start > B->finish)
120
+ return 1;
121
+ else return 0;
122
+ }
123
+ }
124
+
125
+ typedef struct referenceCoordinateTable_st ReferenceCoordinateTable;
126
+
127
+ struct referenceCoordinateTable_st {
128
+ ReferenceCoordinate * array;
129
+ IDnum arrayLength;
130
+ } ATTRIBUTE_PACKED;
131
+
132
+ static ReferenceCoordinateTable * newReferenceCoordinateTable() {
133
+ ReferenceCoordinateTable * table = callocOrExit(1, ReferenceCoordinateTable);
134
+ table->array = NULL;
135
+ table->arrayLength = 0;
136
+ return table;
137
+ }
138
+
139
+ static void printReferenceCoordinateTableStats(ReferenceCoordinateTable * table) {
140
+ IDnum index;
141
+ IDnum counter = 0;
142
+
143
+ velvetLog("Reference mapping counters\n");
144
+ velvetLog("Name\tRead mappings\n");
145
+
146
+ for (index = 0; index < table->arrayLength; index++) {
147
+ velvetLog("%s\t%li\n", table->array[index].name, (long) table->array[index].counter);
148
+ counter += table->array[index].counter;
149
+ }
150
+
151
+ if (counter == 0) {
152
+ velvetLog("WARNING: None of your read mappings recognized the reference sequence!\n");
153
+ velvetLog("Double check that the names are identical between reference fasta headers and SAM/BAM sequences.\n");
154
+ }
155
+ }
156
+
157
+ static void destroyReferenceCoordinateTable(ReferenceCoordinateTable * table) {
158
+ IDnum index;
159
+
160
+ if (table->array) {
161
+ printReferenceCoordinateTableStats(table);
162
+ for (index = 0; index < table->arrayLength; index++)
163
+ free(table->array[index].name);
164
+ free(table->array);
165
+ }
166
+ free(table);
167
+ }
168
+
169
+ static void resizeReferenceCoordinateTable(ReferenceCoordinateTable * table, IDnum extraLength) {
170
+ if (table->array == NULL)
171
+ table->array = callocOrExit(extraLength, ReferenceCoordinate);
172
+ else
173
+ table->array = reallocOrExit(table->array, table->arrayLength + extraLength, ReferenceCoordinate);
174
+ }
175
+
176
+ static ReferenceCoordinate * findReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
177
+ ReferenceCoordinate * array = table->array;
178
+ ReferenceCoordinate refCoord;
179
+ Coordinate leftIndex = 0;
180
+ Coordinate rightIndex = table->arrayLength - 1;
181
+ Coordinate middleIndex;
182
+
183
+ refCoord.name = name;
184
+ refCoord.start = start;
185
+ refCoord.finish = finish;
186
+ refCoord.referenceID = 0;
187
+ refCoord.positive_strand = positive_strand;
188
+
189
+ while (true) {
190
+ middleIndex = (rightIndex + leftIndex) / 2;
191
+
192
+ if (leftIndex > rightIndex)
193
+ return NULL;
194
+ else if (compareRefCoords(&(array[middleIndex]), &refCoord) == 0)
195
+ return &(array[middleIndex]);
196
+ else if (leftIndex == middleIndex)
197
+ return NULL;
198
+ else if (compareRefCoords(&(array[middleIndex]), &refCoord) > 0)
199
+ rightIndex = middleIndex;
200
+ else
201
+ leftIndex = middleIndex;
202
+ }
203
+ }
204
+
205
+ static void addReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
206
+ ReferenceCoordinate * refCoord;
207
+
208
+ if ((refCoord = findReferenceCoordinate(table, name, start, finish, positive_strand))) {
209
+ velvetLog("Overlapping reference coordinates:\n");
210
+ velvetLog("%s:%lli-%lli\n", name, (long long) start, (long long) finish);
211
+ velvetLog("%s:%lli-%lli\n", refCoord->name, (long long) refCoord->start, (long long) refCoord->finish);
212
+ velvetLog("Exiting...\n");
213
+ #ifdef DEBUG
214
+ abort();
215
+ #endif
216
+ exit(1);
217
+ }
218
+
219
+ refCoord = &(table->array[table->arrayLength++]);
220
+
221
+ refCoord->name = name;
222
+ refCoord->start = start;
223
+ refCoord->finish = finish;
224
+ refCoord->referenceID = table->arrayLength;
225
+ refCoord->positive_strand = positive_strand;
226
+ refCoord->counter = 0;
227
+ }
228
+
229
+ static void sortReferenceCoordinateTable(ReferenceCoordinateTable * table) {
230
+ qsort(table->array, table->arrayLength, sizeof(ReferenceCoordinate), compareRefCoords);
231
+ }
232
+
233
+ //////////////////////////////////////////////////////////////////////////
234
+ // File reading
235
+ //////////////////////////////////////////////////////////////////////////
236
+
237
+ static void velvetifySequence(char * str, SequencesWriter *seqWriteInfo) {
238
+ int i;
239
+ char c;
240
+ size_t length = strlen(str);
241
+
242
+ for (i = 0; i < length; i++) {
243
+ c = str[i];
244
+ switch (c) {
245
+ case '\n':
246
+ case '\r':
247
+ case EOF:
248
+ str[i] = '\0';
249
+ break;
250
+ case 'A':
251
+ case 'a':
252
+ str[i] = 'A';
253
+ break;
254
+ case 'C':
255
+ case 'c':
256
+ str[i] = 'C';
257
+ break;
258
+ case 'G':
259
+ case 'g':
260
+ str[i] = 'G';
261
+ break;
262
+ case 'T':
263
+ case 't':
264
+ str[i] = 'T';
265
+ break;
266
+ default:
267
+ str[i] = 'N';
268
+ }
269
+ // non NULL indicates ref masks are being created
270
+ if (seqWriteInfo->m_referenceMask != NULL) {
271
+ if (str[i] == 'N') {
272
+ if (seqWriteInfo->m_openMask) {
273
+ seqWriteInfo->m_current->finish++;
274
+ } else if (*(seqWriteInfo->m_referenceMask) == NULL) {
275
+ *(seqWriteInfo->m_referenceMask) = newMask(seqWriteInfo, seqWriteInfo->m_position);
276
+ seqWriteInfo->m_current = *(seqWriteInfo->m_referenceMask);
277
+ } else {
278
+ seqWriteInfo->m_current->next = newMask(seqWriteInfo, seqWriteInfo->m_position);
279
+ seqWriteInfo->m_current = seqWriteInfo->m_current->next;
280
+ }
281
+ seqWriteInfo->m_openMask = true;
282
+ seqWriteInfo->m_position += 1;
283
+ } else if (str[i] != '\0') {
284
+ seqWriteInfo->m_openMask = false;
285
+ seqWriteInfo->m_position += 1;
286
+ }
287
+ }
288
+ }
289
+ }
290
+
291
// Reverse-complements a nucleotide string in place.
//
// The string is first reversed; then, unless COLOR is defined, every
// character is complemented: A<->T and C<->G (either case on input, upper
// case on output). Any other character becomes 'N', which is the same
// placeholder velvetifySequence() uses for unknown bases, so ambiguous
// positions are not turned into a concrete nucleotide.
//
// Empty and single-character strings are handled: the reversal step is
// skipped for them, which avoids computing length - 1 on a zero length.
static void reverseComplementSequence(char * str)
{
	size_t length = strlen(str);

	if (length > 1) {
		size_t left = 0;
		size_t right = length - 1;

		while (left < right) {
			char c = str[left];
			str[left] = str[right];
			str[right] = c;
			left++;
			right--;
		}
	}

#ifndef COLOR
	for (size_t i = 0; i < length; i++) {
		switch (str[i]) {
		case 'A':
		case 'a':
			str[i] = 'T';
			break;
		case 'C':
		case 'c':
			str[i] = 'G';
			break;
		case 'G':
		case 'g':
			str[i] = 'C';
			break;
		case 'T':
		case 't':
			str[i] = 'A';
			break;
		// As in velvetifySequence(), anything unusual ends up as 'N'
		default:
			str[i] = 'N';
			break;
		}
	}
#endif
}
325
+
326
// Writes `str` to `outfile` wrapped at 60 characters per line. Nothing is
// written for an empty string.
static void writeFastaSequence(FILE * outfile, const char * str)
{
	const char *cursor = str;
	size_t remaining = strlen(str);

	while (remaining > 0) {
		size_t step = (remaining < 60) ? remaining : 60;

		velvetFprintf(outfile, "%.60s\n", cursor);
		cursor += step;
		remaining -= step;
	}
}
333
+
334
+ void convertSequences(ReadSet * rs)
335
+ {
336
+ rs->tSequences = newTightStringArrayFromStringArray(rs->sequences,
337
+ rs->readCount,
338
+ &rs->tSeqMem);
339
+ rs->sequences = NULL;
340
+ }
341
+
342
// Returns the value of a 32-bit little-endian-stored integer.
//
// `ptr` must point to at least 4 readable bytes; ptr[0] is the least
// significant byte. The bytes are interpreted as a two's-complement signed
// value. The assembly is done in unsigned arithmetic because shifting a
// signed int into its sign bit is undefined behaviour.
static int int32(const unsigned char * ptr)
{
	unsigned long bits = ((unsigned long) ptr[3] << 24)
			   | ((unsigned long) ptr[2] << 16)
			   | ((unsigned long) ptr[1] << 8)
			   | (unsigned long) ptr[0];

	// Values with the top bit set are negative: map them explicitly
	// rather than relying on an implementation-defined conversion
	if (bits > 0x7fffffffUL)
		return (int) (-(long) (0xffffffffUL - bits) - 1);

	return (int) bits;
}
351
+
352
// Skips the remainder of the current input line.
//
// `line` is the (possibly truncated) text just read from `file`. If it
// already ends with a newline, nothing is consumed. Otherwise characters are
// read from `file` up to and including the next newline, or until end of
// file / read error, whichever comes first.
//
// The result of fgetc() is kept in an int so that EOF can be told apart from
// data; an empty `line` is treated as "no newline seen yet".
void goToEndOfLine(char *line, FILE * file)
{
	size_t length = strlen(line);
	int c;

	if (length > 0 && line[length - 1] == '\n')
		return;

	do {
		c = fgetc(file);
	} while (c != '\n' && c != EOF);
}
360
+
361
+ static void writeSeqName(char*seq_name, SequencesWriter *seqWriteInfo, Category cat, IDnum *sequenceIndex)
362
+ {
363
+ char name[5001];
364
+ if (isCreateBinary()) {
365
+ cnySeqInsertStart(seqWriteInfo);
366
+ sprintf(name, ">%s", seq_name);
367
+ cnySeqInsertSequenceName(name, (long) ((*sequenceIndex)++), seqWriteInfo, cat);
368
+ } else {
369
+ velvetFprintf(seqWriteInfo->m_pFile,">%s\t%ld\t%d\n", seq_name, (long) ((*sequenceIndex)++), (int) cat);
370
+ }
371
+ }
372
+
373
+ static void writeSequence(char*seq, SequencesWriter *seqWriteInfo)
374
+ {
375
+ char str[100];
376
+ velvetifySequence(seq, seqWriteInfo);
377
+ if (isCreateBinary()) {
378
+ cnySeqInsertNucleotideString(seq, seqWriteInfo);
379
+ cnySeqInsertEnd(seqWriteInfo);
380
+ } else {
381
+ Coordinate start = 0;
382
+ while (start <= strlen(seq)) {
383
+ strncpy(str, seq + start, 60);
384
+ str[60] = '\0';
385
+ velvetFprintf(seqWriteInfo->m_pFile, "%s\n", str);
386
+ start += 60;
387
+ }
388
+ }
389
+ }
390
+
391
+ static void initFastX(SequencesWriter *seqWriteInfo, Category cat)
392
+ {
393
+ seqWriteInfo->m_referenceMask = NULL;
394
+ seqWriteInfo->m_position = 0;
395
+ seqWriteInfo->m_openMask = false;
396
+
397
+ // Binary file stuff
398
+ if (isCreateBinary() && (cat == REFERENCE)) {
399
+ seqWriteInfo->m_referenceMask = callocOrExit(1, Mask*);
400
+ }
401
+ if (isCreateBinary()) {
402
+ inputCnySeqFileStart(cat, seqWriteInfo);
403
+ }
404
+ }
405
+
406
+ static void cleanupFastX(SequencesWriter *seqWriteInfo, Category cat)
407
+ {
408
+ if (seqWriteInfo->m_referenceMask) {
409
+ free(seqWriteInfo->m_referenceMask);
410
+ seqWriteInfo->m_referenceMask = NULL;
411
+ }
412
+ }
413
+
414
+
415
+ // Imports sequences from a raw sequence file
416
+ // Memory space allocated within this function.
417
+ static void readRawFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum * sequenceIndex)
418
+ {
419
+ FILE *file;
420
+ const int maxline = 5000;
421
+ char line[5000];
422
+ IDnum counter = 0;
423
+
424
+ initFastX(seqWriteInfo, cat);
425
+
426
+ if (strcmp(filename, "-"))
427
+ file = fopen(filename, "r");
428
+ else
429
+ file = stdin;
430
+
431
+ if (file != NULL)
432
+ velvetLog("Reading raw file %s\n", filename);
433
+ else
434
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
435
+
436
+ while(fgets(line, maxline, file)) {
437
+ if (strlen(line) >= maxline - 1) {
438
+ velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
439
+ #ifdef DEBUG
440
+ abort();
441
+ #endif
442
+ exit(1);
443
+ }
444
+
445
+ writeSeqName("RAW", seqWriteInfo, cat, sequenceIndex);
446
+ writeSequence(line, seqWriteInfo);
447
+ counter++;
448
+ }
449
+ fclose(file);
450
+ cleanupFastX(seqWriteInfo, cat);
451
+ velvetLog("%li reads found.\n", (long) counter);
452
+ velvetLog("Done\n");
453
+ }
454
+
455
+ // Imports sequences from a zipped raw file
456
+ // Memory space allocated within this function.
457
+ static void readRawGZFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex)
458
+ {
459
+ gzFile file;
460
+ const int maxline = 5000;
461
+ char line[5000];
462
+ IDnum counter = 0;
463
+
464
+ initFastX(seqWriteInfo, cat);
465
+ if (strcmp(filename, "-"))
466
+ file = gzopen(filename, "rb");
467
+ else {
468
+ file = gzdopen(fileno(stdin), "rb");
469
+ SET_BINARY_MODE(stdin);
470
+ }
471
+
472
+ if (file != NULL)
473
+ velvetLog("Reading zipped raw sequence file %s\n", filename);
474
+ else
475
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
476
+
477
+ while(gzgets(file, line, maxline)) {
478
+ if (strlen(line) >= maxline - 1) {
479
+ velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
480
+ #ifdef DEBUG
481
+ abort();
482
+ #endif
483
+ exit(1);
484
+ }
485
+
486
+ writeSeqName("RAW", seqWriteInfo, cat, sequenceIndex);
487
+ writeSequence(line, seqWriteInfo);
488
+ counter++;
489
+ }
490
+ gzclose(file);
491
+ cleanupFastX(seqWriteInfo, cat);
492
+ velvetLog("%li reads found.\n", (long) counter);
493
+ velvetLog("Done\n");
494
+ }
495
+
496
+ static void fillReferenceCoordinateTable(char *filename, ReferenceCoordinateTable * refCoords, IDnum counter)
497
+ {
498
+ FILE *file;
499
+ const int maxline = 5000;
500
+ char line[5000];
501
+ char * name;
502
+ long long start, finish;
503
+ Coordinate i;
504
+ IDnum index = 0;
505
+
506
+ if (strcmp(filename, "-") == 0)
507
+ exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence from stdin");
508
+ else
509
+ file = fopen(filename, "r");
510
+
511
+ if (counter == 0)
512
+ return;
513
+
514
+ resizeReferenceCoordinateTable(refCoords,counter);
515
+
516
+ while (fgets(line, maxline, file) && index < counter) {
517
+ if (line[0] == '>') {
518
+ name = callocOrExit(strlen(line), char);
519
+
520
+ if (strchr(line, ':')) {
521
+ sscanf(strtok(line, ":-\r\n\t "), ">%s", name);
522
+ sscanf(strtok(NULL, ":-\r\n\t "), "%lli", &start);
523
+ sscanf(strtok(NULL, ":-\r\n\t "), "%lli", &finish);
524
+ if (start <= finish)
525
+ addReferenceCoordinate(refCoords, name, start, finish, true);
526
+ else
527
+ addReferenceCoordinate(refCoords, name, finish, start, false);
528
+ } else {
529
+ // Chomping EOL characters and comments
530
+ for (i=strlen(line) - 1; i >= 0; i--)
531
+ if (line[i] == '\n' || line[i] == '\r' || line[i] == ' ' || line[i] == '\t')
532
+ line[i] = '\0';
533
+
534
+ strcpy(name, line + 1);
535
+ addReferenceCoordinate(refCoords, name, 1, -1, true);
536
+ }
537
+
538
+ index++;
539
+ }
540
+ }
541
+
542
+ sortReferenceCoordinateTable(refCoords);
543
+ }
544
+
545
// Input file format codes. The FastA/FastQ codes and AUTO are the values
// accepted as `fileType` by openFastXFile() / readFastXFile() /
// readFastXPair() in this file.
#define FASTQ 1
#define FASTA 2
#define FASTA_GZ 5
#define FASTQ_GZ 6
#define SAM 8
#define BAM 9
#define RAW 10
#define RAW_GZ 11
#define AUTO 12
554
+
555
+ static gzFile openFastXFile(int fileType, char*filename)
556
+ {
557
+ gzFile file;
558
+ char c;
559
+
560
+ // Choose file or stdin
561
+ if (strcmp(filename, "-")==0) {
562
+ file = gzdopen(fileno(stdin), "rb");
563
+ SET_BINARY_MODE(stdin);
564
+ } else {
565
+ file = gzopen(filename, "rb");
566
+ }
567
+
568
+ // Verify filetype
569
+ c = gzgetc(file);
570
+ switch (fileType) {
571
+ case FASTA:
572
+ case FASTA_GZ:
573
+ if (c != EOF && c!='>')
574
+ exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
575
+ break;
576
+ case FASTQ:
577
+ case FASTQ_GZ:
578
+ if (c != EOF && c!='@')
579
+ exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
580
+ break;
581
+ }
582
+ gzungetc(c, file);
583
+
584
+
585
+ if (file != NULL) {
586
+ char *type;
587
+ switch (fileType) {
588
+ case FASTA:
589
+ case FASTA_GZ: type = "FastA"; break;
590
+ case FASTQ:
591
+ case FASTQ_GZ: type = "FastQ"; break;
592
+ default: type = ""; break;
593
+ }
594
+ velvetLog("Reading %s file %s;\n", type, filename);
595
+ } else
596
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
597
+
598
+ return file;
599
+ }
600
+
601
+ typedef struct {
602
+ gzFile gzFile;
603
+ AutoFile *autoFile;
604
+ } FileGZOrAuto;
605
+
606
+ size_t fileGZOrAuto_read(FileGZOrAuto kseq_file, void *ptr, size_t size)
607
+ {
608
+ if (kseq_file.gzFile)
609
+ return gzread(kseq_file.gzFile, ptr, size);
610
+ else
611
+ return fread(ptr, 1, size, kseq_file.autoFile->file);
612
+ }
613
+
614
+ void fileGZOrAuto_close(FileGZOrAuto kseq_file)
615
+ {
616
+ if (kseq_file.gzFile)
617
+ gzclose(kseq_file.gzFile);
618
+ else
619
+ closeFileAuto(kseq_file.autoFile);
620
+ }
621
+
622
// Maps the first character of a sequence file to a format name for log
// messages: '>' gives "FastA", '@' gives "FastQ", anything else "Unknown".
char const* charToType(char c)
{
	if (c == '>')
		return "FastA";

	if (c == '@')
		return "FastQ";

	return "Unknown";
}
630
+
631
+ // Instantiate the kseq parser for FileGZOrAuto handles, reading through fileGZOrAuto_read
632
+ KSEQ_INIT(FileGZOrAuto, fileGZOrAuto_read)
633
+
634
+ // Read in FastA or FastQ files in compressed or gz format
635
+ static void readFastXFile(int fileType, SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum * sequenceIndex, ReferenceCoordinateTable * refCoords)
636
+ {
637
+ kseq_t *seq;
638
+ FileGZOrAuto file;
639
+ IDnum counter = 0;
640
+
641
+ file.gzFile = file.autoFile = NULL;
642
+ if (fileType == AUTO) {
643
+ file.autoFile = openFileAuto(filename);
644
+ if (!file.autoFile)
645
+ exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename);
646
+ velvetLog("Reading file '%s' using '%s' as %s\n", filename, file.autoFile->decompressor, charToType(file.autoFile->first_char));
647
+ } else
648
+ file.gzFile = openFastXFile(fileType, filename);
649
+
650
+ initFastX(seqWriteInfo, cat);
651
+ // Read a sequence at a time
652
+ seq = kseq_init(file);
653
+ while (kseq_read(seq) >= 0) {
654
+ counter++;
655
+ writeSeqName(seq->name.s, seqWriteInfo, cat, sequenceIndex);
656
+ writeSequence(seq->seq.s, seqWriteInfo);
657
+ }
658
+
659
+ kseq_destroy(seq);
660
+ fileGZOrAuto_close(file);
661
+
662
+ if (cat == REFERENCE) {
663
+ fillReferenceCoordinateTable(filename, refCoords, counter);
664
+ }
665
+ cleanupFastX(seqWriteInfo, cat);
666
+
667
+ velvetLog("%li sequences found\n", (long) counter);
668
+ velvetLog("Done\n");
669
+ }
670
+
671
+ static void readFastXPair(int fileType, SequencesWriter *seqWriteInfo, char *filename1, char *filename2, Category cat, IDnum * sequenceIndex)
672
+ {
673
+ kseq_t *seq1, *seq2;
674
+ FileGZOrAuto file1, file2;
675
+ IDnum counter = 0;
676
+
677
+ if (cat==REFERENCE)
678
+ exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence in 'separate' read mode");
679
+
680
+ file1.gzFile = file1.autoFile = NULL;
681
+ file2.gzFile = file2.autoFile = NULL;
682
+ if (fileType == AUTO) {
683
+ file1.autoFile = openFileAuto(filename1);
684
+ if (!file1.autoFile)
685
+ exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename1);
686
+ velvetLog("Reading file '%s' using '%s' as %s\n", filename1, file1.autoFile->decompressor, charToType(file1.autoFile->first_char));
687
+ file2.autoFile = openFileAuto(filename2);
688
+ if (!file2.autoFile)
689
+ exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename2);
690
+ velvetLog("Reading file '%s' using '%s' as %s\n", filename2, file2.autoFile->decompressor, charToType(file2.autoFile->first_char));
691
+ } else {
692
+ file1.gzFile = openFastXFile(fileType, filename1);
693
+ file2.gzFile = openFastXFile(fileType, filename2);
694
+ }
695
+ initFastX(seqWriteInfo, cat);
696
+
697
+ // Read a sequence at a time
698
+ seq1 = kseq_init(file1);
699
+ seq2 = kseq_init(file2);
700
+ while (kseq_read(seq1) >= 0) {
701
+ counter++;
702
+ writeSeqName(seq1->name.s, seqWriteInfo, cat, sequenceIndex);
703
+ writeSequence(seq1->seq.s, seqWriteInfo);
704
+
705
+ if (kseq_read(seq2) < 0)
706
+ exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too few sequences", filename2);
707
+
708
+ counter++;
709
+ writeSeqName(seq2->name.s, seqWriteInfo, cat, sequenceIndex);
710
+ writeSequence(seq2->seq.s, seqWriteInfo);
711
+ }
712
+ if (kseq_read(seq2) >= 0)
713
+ exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too many sequences", filename2);
714
+
715
+ kseq_destroy(seq1);
716
+ kseq_destroy(seq2);
717
+
718
+ fileGZOrAuto_close(file1);
719
+ fileGZOrAuto_close(file2);
720
+
721
+ cleanupFastX(seqWriteInfo, cat);
722
+
723
+ velvetLog("%li sequences found in total in the paired sequence files\n", (long) counter);
724
+ velvetLog("Done\n");
725
+ }
726
+
727
+ static void addMapping(boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
728
+ if (isCreateBinary()) {
729
+ seqWriteInfo->m_bIsRef = true;
730
+ RefInfoList *refElem = callocOrExit(1, RefInfoList);
731
+ if (refCoord->positive_strand) {
732
+ refElem->m_elem.m_referenceID = (long) orientation * refCoord->referenceID;
733
+ refElem->m_elem.m_pos = (long long) (pos - refCoord->start);
734
+ } else {
735
+ refElem->m_elem.m_referenceID = (long) -orientation * refCoord->referenceID;
736
+ refElem->m_elem.m_pos = (long long) (refCoord->finish - pos - strlen(seq));
737
+ }
738
+ refElem->next = NULL;
739
+ if (seqWriteInfo->m_refInfoHead == NULL) {
740
+ seqWriteInfo->m_refInfoHead = refElem;
741
+ } else {
742
+ (*refTail)->next = refElem;
743
+ }
744
+ *refTail = refElem;
745
+ seqWriteInfo->m_refCnt++;
746
+ } else {
747
+ if (refCoord->positive_strand) {
748
+ snprintf(buffer, *buffer_size, "%sM\t%li\t%lli\n", buffer, (long) orientation * refCoord->referenceID, (long long) (pos - refCoord->start));
749
+ } else
750
+ snprintf(buffer, *buffer_size, "%sM\t%li\t%lli\n", buffer, (long) - orientation * refCoord->referenceID, (long long) (refCoord->finish - pos - strlen(seq)));
751
+
752
+ if (*buffer_size - strlen(buffer) < 100) {
753
+ *buffer_size += 1000;
754
+ buffer = reallocOrExit(buffer, *buffer_size, char);
755
+ }
756
+ }
757
+
758
+ // Increment counter
759
+ refCoord->counter++;
760
+ }
761
+
762
+ static void writeMappedSequence(IDnum * sequenceIndex, Category cat, Category prev_cat, char * previous_seq, char * previous_qname, char * previous_qname_pairing, char * buffer, SequencesWriter * seqWriteInfo) {
763
+ char print_qname[5000];
764
+ if (isCreateBinary()) {
765
+ if (prev_cat != cat) {
766
+ inputCnySeqFileStart(cat, seqWriteInfo);
767
+ prev_cat = cat;
768
+ }
769
+ cnySeqInsertStart(seqWriteInfo);
770
+ cnySeqInsertNucleotideString(previous_seq, seqWriteInfo);
771
+ sprintf(print_qname, ">%s%s", previous_qname, previous_qname_pairing);
772
+ cnySeqInsertSequenceName(print_qname, (long) ((*sequenceIndex)++), seqWriteInfo, cat);
773
+ cnySeqInsertEnd(seqWriteInfo);
774
+ } else {
775
+ velvetFprintf(seqWriteInfo->m_pFile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
776
+ (long) ((*sequenceIndex)++), (int) cat);
777
+ writeFastaSequence(seqWriteInfo->m_pFile, previous_seq);
778
+ velvetFprintf(seqWriteInfo->m_pFile, "%s", buffer);
779
+ strcpy(buffer, "");
780
+ }
781
+ }
782
+
783
+ static void readCigar(char * cigar, boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
784
+ long long cigar_num;
785
+ int cigar_index;
786
+ char c;
787
+
788
+ if (strlen(cigar) == 1 && cigar[0] == '*')
789
+ ;
790
+ else {
791
+ cigar_num = 0;
792
+ for (cigar_index = 0; cigar_index < strlen(cigar); cigar_index++) {
793
+ c = cigar[cigar_index];
794
+ if (c == 'M' || c == '=' || c == 'X') {
795
+ if (refCoord->finish < 0 || pos < refCoord->finish)
796
+ addMapping(orientation, pos, seq, refCoord, buffer, seqWriteInfo, refTail, buffer_size);
797
+ cigar_num = 0;
798
+ } else if (c == 'S' || c == 'I') {
799
+ pos -= cigar_num;
800
+ cigar_num = 0;
801
+ } else if (c == 'D' || c == 'N') {
802
+ pos += cigar_num;
803
+ cigar_num = 0;
804
+ } else if (c == 'H' || c == 'P') {
805
+ cigar_num = 0;
806
+ } else if (isdigit(c)) {
807
+ cigar_num = 10 * cigar_num + (c - 48);
808
+ } else {
809
+ abort();
810
+ }
811
+ }
812
+ }
813
+ }
814
+
815
+ static void readSAMFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
816
+ {
817
+ char line[5000];
818
+ unsigned long lineno;
819
+ IDnum readCount = 0;
820
+ char previous_qname_pairing[10];
821
+ char previous_qname[5000];
822
+ char previous_seq[5000];
823
+ boolean previous_paired = false;
824
+ Category prev_cat = cat;
825
+ Category apparentCat;
826
+ ReferenceCoordinate * refCoord;
827
+ RefInfoList *refTail = NULL;
828
+ seqWriteInfo->m_referenceMask = NULL; // no ref masks for SAM/BAM
829
+ seqWriteInfo->m_position = 0;
830
+ seqWriteInfo->m_openMask = false;
831
+
832
+ size_t buffer_size = 5000;
833
+ char * buffer = callocOrExit(buffer_size, char);
834
+
835
+ if (cat == REFERENCE) {
836
+ velvetLog("SAM file %s cannot contain reference sequences.\n", filename);
837
+ velvetLog("Please check the command line.\n");
838
+ #ifdef DEBUG
839
+ abort();
840
+ #endif
841
+ exit(1);
842
+ }
843
+
844
+ FILE *file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
845
+ if (file)
846
+ velvetLog("Reading SAM file %s\n", filename);
847
+ else
848
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
849
+ if (isCreateBinary()) {
850
+ inputCnySeqFileStart(cat, seqWriteInfo);
851
+ }
852
+ strcpy(previous_qname, "");
853
+ for (lineno = 1; fgets(line, sizeof(line), file); lineno++) {
854
+ if (line[0] != '@') {
855
+ char *qname, *flag, *seq, *rname, *cigar;
856
+ long long pos;
857
+ int orientation;
858
+ int i;
859
+
860
+ qname = strtok(line, "\t");
861
+ flag = strtok(NULL, "\t");
862
+ rname = strtok(NULL, "\t");
863
+ sscanf(strtok(NULL, "\t"), "%lli", &pos);
864
+ orientation = 1;
865
+
866
+ // Mapping scor
867
+ (void) strtok(NULL, "\t");
868
+ cigar = strtok(NULL, "\t");
869
+
870
+ // Columns 7,8,9 are paired name, position and score
871
+ for (i = 7; i < 10; i++)
872
+ (void) strtok(NULL, "\t");
873
+ seq = strtok(NULL, "\t");
874
+
875
+ if (seq == NULL) {
876
+ velvetFprintf(stderr,
877
+ "Line #%lu: ignoring SAM record with too few fields\n",
878
+ lineno);
879
+ }
880
+ else if (strcmp(seq, "*") == 0) {
881
+ velvetFprintf(stderr,
882
+ "Line #%lu: ignoring SAM record with omitted SEQ field\n",
883
+ lineno);
884
+ }
885
+ else {
886
+ // Accept flags represented in either decimal or hex:
887
+ int flagbits = strtol(flag, NULL, 0);
888
+
889
+ if (flagbits & 0x4)
890
+ strcpy(rname, "");
891
+
892
+ const char *qname_pairing = "";
893
+ if (flagbits & 0x40)
894
+ qname_pairing = "/1";
895
+ else if (flagbits & 0x80)
896
+ qname_pairing = "/2";
897
+
898
+ if (flagbits & 0x10) {
899
+ orientation = -1;
900
+ reverseComplementSequence(seq);
901
+ }
902
+
903
+ // Determine if paired to previous read
904
+ boolean same_name = (strcmp(qname, previous_qname) == 0);
905
+ if (readCount && (!same_name || strcmp(qname_pairing, previous_qname_pairing) != 0)) {
906
+ if (cat % 2 && !same_name && !previous_paired)
907
+ apparentCat = cat - 1;
908
+ else
909
+ apparentCat = cat;
910
+
911
+ previous_paired = (cat % 2 && same_name);
912
+
913
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, buffer, seqWriteInfo);
914
+ prev_cat = apparentCat;
915
+ }
916
+
917
+ if (!(flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, rname, (Coordinate) pos, (Coordinate) pos + strlen(seq) - 1, orientation))) {
918
+ readCigar(cigar, orientation, pos, seq, refCoord, buffer, seqWriteInfo, &refTail, &buffer_size);
919
+ }
920
+
921
+ strcpy(previous_qname, qname);
922
+ strcpy(previous_qname_pairing, qname_pairing);
923
+ strcpy(previous_seq, seq);
924
+ velvetifySequence(previous_seq, seqWriteInfo);
925
+
926
+ readCount++;
927
+ }
928
+ }
929
+ }
930
+
931
+ if (readCount) {
932
+ if (cat % 2 && !previous_paired)
933
+ apparentCat = cat - 1;
934
+ else
935
+ apparentCat = cat;
936
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, buffer, seqWriteInfo);
937
+ }
938
+
939
+ free(buffer);
940
+ fclose(file);
941
+ velvetLog("%lu reads found.\n", (long) readCount);
942
+ velvetLog("Done\n");
943
+ }
944
+
945
+ static int readBAMint32(gzFile file)
946
+ {
947
+ unsigned char buffer[4];
948
+ if (gzread(file, buffer, 4) != 4)
949
+ exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");
950
+
951
+ return int32(buffer);
952
+ }
953
+
954
+ static void readBAMFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
955
+ {
956
+ size_t seqCapacity = 0;
957
+ char *seq = NULL;
958
+ char cigar[5000];
959
+ char cigar_buffer[5000];
960
+ size_t bufferCapacity = 4;
961
+ unsigned char *buffer = mallocOrExit(bufferCapacity, unsigned char);
962
+ unsigned long recno, readCount;
963
+ int i, refCount;
964
+ gzFile file;
965
+ char previous_qname_pairing[10];
966
+ char previous_qname[5000];
967
+ char previous_seq[5000];
968
+ boolean previous_paired = false;
969
+ Category prev_cat = cat;
970
+ Category apparentCat;
971
+ char ** refNames;
972
+ ReferenceCoordinate * refCoord;
973
+ seqWriteInfo->m_referenceMask = NULL; // no ref masks for SAM/BAM
974
+ seqWriteInfo->m_position = 0;
975
+ seqWriteInfo->m_openMask = false;
976
+
977
+ RefInfoList *refTail = NULL;
978
+ size_t mapBuffer_size = 1000;
979
+ char * mapBuffer = callocOrExit(mapBuffer_size, char);
980
+
981
+ if (cat == REFERENCE) {
982
+ velvetLog("BAM file %s cannot contain reference sequences.\n", filename);
983
+ velvetLog("Please check the command line.\n");
984
+ #ifdef DEBUG
985
+ abort();
986
+ #endif
987
+ exit(1);
988
+ }
989
+
990
+ if (strcmp(filename, "-") != 0)
991
+ file = gzopen(filename, "rb");
992
+ else {
993
+ file = gzdopen(fileno(stdin), "rb");
994
+ SET_BINARY_MODE(stdin);
995
+ }
996
+
997
+ if (file != NULL)
998
+ velvetLog("Reading BAM file %s\n", filename);
999
+ else
1000
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1001
+
1002
+ if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
1003
+ exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);
1004
+
1005
+ // Skip header text
1006
+ if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
1007
+ exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1008
+
1009
+ // Skip header reference list
1010
+ refCount = readBAMint32(file);
1011
+ refNames = callocOrExit(refCount, char *);
1012
+ for (i = 0; i < refCount; i++) {
1013
+ int strLength;
1014
+
1015
+ if (gzread(file, buffer, 4) != 4)
1016
+ exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1017
+
1018
+ strLength = int32(buffer);
1019
+ refNames[i] = callocOrExit(strLength, char);
1020
+
1021
+ if (bufferCapacity < 4 + strLength) {
1022
+ bufferCapacity = 4 + strLength + 4096;
1023
+ buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1024
+ }
1025
+
1026
+ if (gzread(file, buffer, 4 + strLength) != 4 + strLength)
1027
+ exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1028
+
1029
+ strcpy(refNames[i], (char *) buffer);
1030
+ }
1031
+ if (isCreateBinary()) {
1032
+ inputCnySeqFileStart(cat, seqWriteInfo);
1033
+ }
1034
+ strcpy(previous_qname, "");
1035
+ readCount = 0;
1036
+ for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
1037
+ int blockSize = int32(buffer);
1038
+ int readLength;
1039
+
1040
+ if (bufferCapacity < 4 + blockSize) {
1041
+ bufferCapacity = 4 + blockSize + 4096;
1042
+ buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1043
+ }
1044
+
1045
+ if (gzread(file, &buffer[4], blockSize) != blockSize)
1046
+ exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1047
+
1048
+ readLength = int32(&buffer[20]);
1049
+ if (readLength == 0) {
1050
+ velvetFprintf(stderr,
1051
+ "Record #%lu: ignoring BAM record with omitted SEQ field\n",
1052
+ recno);
1053
+ }
1054
+ else {
1055
+ int readNameLength = buffer[12];
1056
+ int flag_nc = int32(&buffer[16]);
1057
+ int flagbits = flag_nc >> 16;
1058
+ int cigarLength = flag_nc & 0xffff;
1059
+ char *qname = (char *)&buffer[36];
1060
+ uint32_t *rawcigar = (uint32_t *) &buffer[36 + readNameLength];
1061
+ unsigned char *rawseq =
1062
+ &buffer[36 + readNameLength + 4 * cigarLength];
1063
+ int rID = int32(&buffer[4]);
1064
+ // NOTE: BAM file coords are 0-based, not 1-based like SAM files
1065
+ // No comment
1066
+ long long pos = int32(&buffer[8]) + 1;
1067
+ int orientation = 1;
1068
+
1069
+ const char *qname_pairing = "";
1070
+ if (flagbits & 0x40)
1071
+ qname_pairing = "/1";
1072
+ else if (flagbits & 0x80)
1073
+ qname_pairing = "/2";
1074
+
1075
+ strcpy(cigar, "");
1076
+ for (i = 0; i < cigarLength; i++) {
1077
+ static const char decode_ops[] = "MIDNSHP=X";
1078
+ uint32_t packed = *(rawcigar++);
1079
+ sprintf(cigar_buffer, "%i%c", packed >> 4, decode_ops[packed & 0xf]);
1080
+ strcat(cigar, cigar_buffer);
1081
+ }
1082
+
1083
+ if (seqCapacity < readLength + 1) {
1084
+ seqCapacity = readLength * 2 + 1;
1085
+ seq = reallocOrExit(seq, seqCapacity, char);
1086
+ }
1087
+
1088
+ for (i = 0; i < readLength; i += 2) {
1089
+ static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
1090
+ unsigned int packed = *(rawseq++);
1091
+ seq[i] = decode_bases[packed >> 4];
1092
+ seq[i+1] = decode_bases[packed & 0xf];
1093
+ }
1094
+ seq[readLength] = '\0';
1095
+
1096
+ if (flagbits & 0x10) {
1097
+ orientation = -1;
1098
+ reverseComplementSequence(seq);
1099
+ }
1100
+
1101
+ // Determine if paired to previous read
1102
+ boolean same_name = (strcmp(qname, previous_qname) == 0);
1103
+ if (readCount > 0 && (!same_name || strcmp(qname_pairing, previous_qname_pairing) != 0)) {
1104
+ if (cat % 2 && !same_name && !previous_paired)
1105
+ apparentCat = cat - 1;
1106
+ else
1107
+ apparentCat = cat;
1108
+
1109
+ previous_paired = (cat % 2 && same_name);
1110
+
1111
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, mapBuffer, seqWriteInfo);
1112
+ prev_cat = apparentCat;
1113
+ }
1114
+
1115
+ if (!(flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, refNames[rID], (Coordinate) pos, (Coordinate) pos + strlen(seq) - 1, orientation)))
1116
+ readCigar(cigar, orientation, pos, seq, refCoord, mapBuffer, seqWriteInfo, &refTail, &mapBuffer_size);
1117
+
1118
+ strcpy(previous_qname, qname);
1119
+ strcpy(previous_qname_pairing, qname_pairing);
1120
+ strcpy(previous_seq, seq);
1121
+ velvetifySequence(previous_seq, seqWriteInfo);
1122
+
1123
+ readCount++;
1124
+ }
1125
+ }
1126
+
1127
+ if (readCount) {
1128
+ if (cat % 2 && !previous_paired)
1129
+ apparentCat = cat - 1;
1130
+ else
1131
+ apparentCat = cat;
1132
+ writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, mapBuffer, seqWriteInfo);
1133
+ }
1134
+
1135
+ free(seq);
1136
+ free(buffer);
1137
+ free(mapBuffer);
1138
+
1139
+ gzclose(file);
1140
+ velvetLog("%lu reads found.\n", readCount);
1141
+ velvetLog("Done\n");
1142
+ }
1143
+
1144
+
1145
+ static void printUsage()
1146
+ {
1147
+ puts("Usage:");
1148
+ puts("./velveth directory hash_length {[-file_format][-read_type][-separate|-interleaved] filename} [options]");
1149
+ puts("");
1150
+ puts("\tdirectory\t\t: directory name for output files");
1151
+ printf("\thash_length\t\t: odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
1152
+ puts("\tfilename\t\t: path to sequence file or - for standard input");
1153
+ puts("");
1154
+ puts("File format options:");
1155
+ puts("\t-fasta");
1156
+ puts("\t-fastq");
1157
+ puts("\t-raw");
1158
+ puts("\t-fasta.gz");
1159
+ puts("\t-fastq.gz");
1160
+ puts("\t-raw.gz");
1161
+ puts("\t-sam");
1162
+ puts("\t-bam");
1163
+ puts("\t-fmtAuto");
1164
+ puts("");
1165
+ puts("Read type options:");
1166
+ puts("\t-short");
1167
+ puts("\t-shortPaired");
1168
+ puts("\t-short2");
1169
+ puts("\t-shortPaired2");
1170
+ puts("\t-long");
1171
+ puts("\t-longPaired");
1172
+ puts("\t-reference");
1173
+ puts("");
1174
+ puts("Options:");
1175
+ puts("\t-strand_specific\t: for strand specific transcriptome sequencing data (default: off)");
1176
+ puts("");
1177
+ puts("Output:");
1178
+ puts("\tdirectory/Roadmaps");
1179
+ puts("\tdirectory/Sequences");
1180
+ puts("\t\t[Both files are picked up by graph, so please leave them there]");
1181
+ }
1182
+
1183
+ // General argument parser for most functions
1184
+ // Basically a reused portion of toplevel code dumped into here
1185
+ void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand, boolean * noHash)
1186
+ {
1187
+ int argIndex = 1;
1188
+ int filetype = FASTA;
1189
+ Category cat = 0;
1190
+ IDnum sequenceIndex = 1;
1191
+ short short_var;
1192
+ ReferenceCoordinateTable * refCoords = newReferenceCoordinateTable();
1193
+ boolean reuseSequences = false;
1194
+ boolean separate_pair_files = false;
1195
+
1196
+ if (argc < 2) {
1197
+ printUsage();
1198
+ #ifdef DEBUG
1199
+ abort();
1200
+ #endif
1201
+ exit(1);
1202
+ }
1203
+
1204
+ for (argIndex = 1; argIndex < argc; argIndex++) {
1205
+ if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1206
+ *double_strand = false;
1207
+ reference_coordinate_double_strand = false;
1208
+ } else if (strcmp(argv[argIndex], "-reuse_Sequences") == 0) {
1209
+ reuseSequences = true;
1210
+ } else if (strcmp(argv[argIndex], "-reuse_binary") == 0) {
1211
+ reuseSequences = true;
1212
+ } else if (strcmp(argv[argIndex], "-noHash") == 0) {
1213
+ *noHash = true;
1214
+ }
1215
+ }
1216
+
1217
+ if (reuseSequences)
1218
+ return;
1219
+
1220
+ SequencesWriter * seqWriteInfo = NULL;
1221
+ if (isCreateBinary()) {
1222
+ seqWriteInfo = openCnySeqForWrite(filename);
1223
+ seqWriteInfo->m_unifiedSeqFileHeader.m_bDoubleStrand = *double_strand;
1224
+ // file is already open
1225
+ } else {
1226
+ seqWriteInfo = callocOrExit(1, SequencesWriter);
1227
+ seqWriteInfo->m_pFile = fopen(filename, "w");
1228
+ }
1229
+
1230
+ for (argIndex = 1; argIndex < argc; argIndex++) {
1231
+ if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
1232
+
1233
+ if (strcmp(argv[argIndex], "-fastq") == 0)
1234
+ filetype = FASTQ;
1235
+ else if (strcmp(argv[argIndex], "-fasta") == 0)
1236
+ filetype = FASTA;
1237
+ else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
1238
+ filetype = FASTQ_GZ;
1239
+ else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
1240
+ filetype = FASTA_GZ;
1241
+ else if (strcmp(argv[argIndex], "-sam") == 0)
1242
+ filetype = SAM;
1243
+ else if (strcmp(argv[argIndex], "-bam") == 0)
1244
+ filetype = BAM;
1245
+ else if (strcmp(argv[argIndex], "-raw") == 0)
1246
+ filetype = RAW;
1247
+ else if (strcmp(argv[argIndex], "-raw.gz") == 0)
1248
+ filetype = RAW_GZ;
1249
+ else if (strcmp(argv[argIndex], "-fmtAuto") == 0)
1250
+ filetype = AUTO;
1251
+ else if (strcmp(argv[argIndex], "-short") == 0)
1252
+ cat = 0;
1253
+ else if (strcmp(argv[argIndex], "-shortPaired") ==
1254
+ 0)
1255
+ cat = 1;
1256
+ else if (strncmp
1257
+ (argv[argIndex], "-shortPaired",
1258
+ 12) == 0) {
1259
+ sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
1260
+ cat = (Category) short_var;
1261
+ if (cat < 1 || cat > CATEGORIES) {
1262
+ velvetLog("Unknown option: %s\n",
1263
+ argv[argIndex]);
1264
+ #ifdef DEBUG
1265
+ abort();
1266
+ #endif
1267
+ exit(1);
1268
+ }
1269
+ cat--;
1270
+ cat *= 2;
1271
+ cat++;
1272
+ } else if (strncmp(argv[argIndex], "-short", 6) ==
1273
+ 0) {
1274
+ sscanf(argv[argIndex], "-short%hd", &short_var);
1275
+ cat = (Category) short_var;
1276
+ if (cat < 1 || cat > CATEGORIES) {
1277
+ velvetLog("Unknown option: %s\n",
1278
+ argv[argIndex]);
1279
+ #ifdef DEBUG
1280
+ abort();
1281
+ #endif
1282
+ exit(1);
1283
+ }
1284
+ cat--;
1285
+ cat *= 2;
1286
+ } else if (strcmp(argv[argIndex], "-long") == 0)
1287
+ cat = LONG; // CATEGORIES * 2;
1288
+ else if (strcmp(argv[argIndex], "-longPaired") == 0)
1289
+ cat = LONG_PAIRED; // CATEGORIES * 2 + 1;
1290
+ else if (strcmp(argv[argIndex], "-reference") == 0)
1291
+ cat = REFERENCE; // CATEGORIES * 2 + 2
1292
+ else if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1293
+ *double_strand = false;
1294
+ reference_coordinate_double_strand = false;
1295
+ } else if (strcmp(argv[argIndex], "-noHash") == 0) {
1296
+ ;
1297
+ } else if (strcmp(argv[argIndex], "-create_binary") == 0) {
1298
+ ;
1299
+ } else if (strcmp(argv[argIndex], "-interleaved") == 0) {
1300
+ separate_pair_files = false;
1301
+ } else if (strcmp(argv[argIndex], "-separate") == 0) {
1302
+ separate_pair_files = true;
1303
+ }
1304
+ else {
1305
+ velvetLog("Unknown option: %s\n",
1306
+ argv[argIndex]);
1307
+ #ifdef DEBUG
1308
+ abort();
1309
+ #endif
1310
+ exit(1);
1311
+ }
1312
+
1313
+ continue;
1314
+ }
1315
+
1316
+ if (cat == -1)
1317
+ continue;
1318
+
1319
+ switch (filetype) {
1320
+ case FASTA:
1321
+ case FASTQ:
1322
+ case FASTA_GZ:
1323
+ case FASTQ_GZ:
1324
+ case AUTO:
1325
+ // Separate files for paired reads? Note odd categories used for paired read type
1326
+ if (separate_pair_files && cat%2==1) {
1327
+ argIndex++;
1328
+ if (argIndex>=argc)
1329
+ exitErrorf(EXIT_FAILURE, false, "Require left & right filename for -separate mode");
1330
+ readFastXPair(filetype, seqWriteInfo, argv[argIndex-1], argv[argIndex], cat, &sequenceIndex);
1331
+ } else {
1332
+ readFastXFile(filetype, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
1333
+ }
1334
+ break;
1335
+ case RAW:
1336
+ if (separate_pair_files && cat%2==1) {
1337
+ exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
1338
+ }
1339
+ readRawFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
1340
+ break;
1341
+ case RAW_GZ:
1342
+ if (separate_pair_files && cat%2==1) {
1343
+ exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
1344
+ }
1345
+ readRawGZFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
1346
+ break;
1347
+ case SAM:
1348
+ readSAMFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
1349
+ break;
1350
+ case BAM:
1351
+ readBAMFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
1352
+ break;
1353
+ default:
1354
+ velvetLog("Screw up in parser... exiting\n");
1355
+ #ifdef DEBUG
1356
+ abort();
1357
+ #endif
1358
+ exit(1);
1359
+ }
1360
+ }
1361
+
1362
+ destroyReferenceCoordinateTable(refCoords);
1363
+ if (isCreateBinary()) {
1364
+ closeCnySeqForWrite(seqWriteInfo);
1365
+ } else {
1366
+ fclose(seqWriteInfo->m_pFile);
1367
+ }
1368
+ if (seqWriteInfo) {
1369
+ free(seqWriteInfo);
1370
+ }
1371
+ }
1372
+
1373
+ void createReadPairingArray(ReadSet* reads)
1374
+ {
1375
+ IDnum index;
1376
+ IDnum *mateReads = mallocOrExit(reads->readCount, IDnum);
1377
+ Category cat = 0;
1378
+ int phase = 0;
1379
+
1380
+ for (index = 0; index < reads->readCount; index++)
1381
+ mateReads[index] = -1;
1382
+
1383
+ reads->mateReads = mateReads;
1384
+
1385
+ for (index = 0; index < reads->readCount; index++)
1386
+ {
1387
+ // Paired category
1388
+ if (cat & 1)
1389
+ {
1390
+ // Leaving the paired category
1391
+ if (reads->categories[index] != cat)
1392
+ {
1393
+ if (phase == 1)
1394
+ {
1395
+ reads->mateReads[index - 1] = -1;
1396
+ reads->categories[index - 1]--;
1397
+ phase = 0;
1398
+ }
1399
+ cat = reads->categories[index];
1400
+ // Into another paired category
1401
+ if (cat & 1)
1402
+ {
1403
+ reads->mateReads[index] = index + 1;
1404
+ phase = 1;
1405
+ }
1406
+ }
1407
+ else if (phase == 0)
1408
+ {
1409
+ reads->mateReads[index] = index + 1;
1410
+ phase = 1;
1411
+ }
1412
+ else
1413
+ {
1414
+ reads->mateReads[index] = index - 1;
1415
+ phase = 0;
1416
+ }
1417
+ }
1418
+ // Leaving an unpaired category
1419
+ else if (reads->categories[index] != cat)
1420
+ {
1421
+ cat = reads->categories[index];
1422
+ // Into a paired category
1423
+ if (cat & 1)
1424
+ {
1425
+ reads->mateReads[index] = index + 1;
1426
+ phase = 1;
1427
+ }
1428
+ }
1429
+ }
1430
+ }
1431
+
1432
+ int pairedCategories(ReadSet * reads)
1433
+ {
1434
+ boolean pairedCat[CATEGORIES + 1];
1435
+ int pairedCatCount = 0;
1436
+ IDnum index;
1437
+
1438
+ for (index = 0; index <= CATEGORIES; index++)
1439
+ pairedCat[index] = 0;
1440
+
1441
+ for (index = 0; index < reads->readCount; index++) {
1442
+ if (reads->categories[index] & 1 && !pairedCat[reads->categories[index] / 2]) {
1443
+ pairedCat[reads->categories[index] / 2] = true;
1444
+ if (pairedCatCount++ == CATEGORIES)
1445
+ break;
1446
+ }
1447
+ }
1448
+
1449
+ return pairedCatCount;
1450
+ }
1451
+
1452
+ boolean isSecondInPair(ReadSet * reads, IDnum index)
1453
+ {
1454
+ return reads->secondInPair[index / 8] & (1 << (index & 7));
1455
+ }
1456
+
1457
+ void computeSecondInPair(ReadSet * reads)
1458
+ {
1459
+ IDnum index;
1460
+ Category currentCat = 0;
1461
+ Category previousCat = 0;
1462
+ int phase = 0;
1463
+
1464
+ if (reads->secondInPair)
1465
+ free (reads->secondInPair);
1466
+ reads->secondInPair = callocOrExit((reads->readCount + 7) / 8, unsigned char);
1467
+
1468
+ for (index = 0; index < reads->readCount; index++)
1469
+ {
1470
+ currentCat = reads->categories[index];
1471
+ if (currentCat & 1)
1472
+ {
1473
+ if (previousCat == currentCat)
1474
+ {
1475
+ if (phase == 0)
1476
+ {
1477
+ phase = 1;
1478
+ }
1479
+ else
1480
+ {
1481
+ reads->secondInPair[index / 8] |= (1 << (index & 7));
1482
+ phase = 0;
1483
+ }
1484
+ }
1485
+ else {
1486
+ phase = 1;
1487
+ if (index > 0 && previousCat & 1 && !isSecondInPair(reads, index - 1))
1488
+ reads->categories[index - 1] = (reads->categories[index - 1] / 2) * 2;
1489
+ }
1490
+ }
1491
+ previousCat = currentCat;
1492
+ }
1493
+
1494
+ // Safeguard against odd sets of reads
1495
+ if (!isSecondInPair(reads, reads->readCount - 1)) {
1496
+ reads->categories[reads->readCount - 1] = (reads->categories[reads->readCount - 1] / 2) * 2;
1497
+ }
1498
+ }
1499
+
1500
+ void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
1501
+ {
1502
+ IDnum index;
1503
+ IDnum pairID;
1504
+ IDnum sequenceCount = reads->readCount;
1505
+ IDnum *mateReads = reads->mateReads;
1506
+
1507
+ if (dubiousReads == NULL || mateReads == NULL)
1508
+ return;
1509
+
1510
+ for (index = 0; index < sequenceCount; index++) {
1511
+ if (!dubiousReads[index] || reads->categories[index] % 2 == 0 )
1512
+ continue;
1513
+
1514
+ if (isSecondInPair(reads, index))
1515
+ pairID = index - 1;
1516
+ else
1517
+ pairID = index + 1;
1518
+
1519
+ reads->categories[index] = (reads->categories[index] / 2) * 2;
1520
+ reads->categories[pairID] = (reads->categories[pairID] / 2) * 2;
1521
+ }
1522
+ }
1523
+
1524
+ ReadSet *importReadSet(char *filename)
1525
+ {
1526
+ FILE *file = fopen(filename, "r");
1527
+ char *sequence = NULL;
1528
+ Coordinate bpCount = 0;
1529
+ const int maxline = 5000;
1530
+ char line[5000];
1531
+ IDnum sequenceCount, sequenceIndex;
1532
+ ReadSet *reads;
1533
+ short int temp_short;
1534
+ int lineLength;
1535
+
1536
+ if (file != NULL)
1537
+ velvetLog("Reading read set file %s;\n", filename);
1538
+ else
1539
+ exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1540
+
1541
+ reads = newReadSet();
1542
+
1543
+ // Count number of separate sequences
1544
+ sequenceCount = 0;
1545
+ while (fgets(line, maxline, file) != NULL)
1546
+ if (line[0] == '>')
1547
+ sequenceCount++;
1548
+ fclose(file);
1549
+ velvetLog("%li sequences found\n", (long) sequenceCount);
1550
+
1551
+ reads->readCount = sequenceCount;
1552
+
1553
+ if (reads->readCount == 0) {
1554
+ reads->sequences = NULL;
1555
+ reads->categories = NULL;
1556
+ return reads;
1557
+ }
1558
+
1559
+ reads->sequences = callocOrExit(sequenceCount, char *);
1560
+ reads->categories = callocOrExit(sequenceCount, Category);
1561
+ // Counting base pair length of each sequence:
1562
+ file = fopen(filename, "r");
1563
+ sequenceIndex = -1;
1564
+ while (fgets(line, maxline, file) != NULL) {
1565
+ if (line[0] == '>') {
1566
+
1567
+ // Reading category info
1568
+ sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
1569
+ &temp_short);
1570
+ reads->categories[sequenceIndex + 1] = (Category) temp_short;
1571
+
1572
+ if (sequenceIndex != -1)
1573
+ reads->sequences[sequenceIndex] =
1574
+ mallocOrExit(bpCount + 1, char);
1575
+ sequenceIndex++;
1576
+ bpCount = 0;
1577
+ } if (line[0] == 'M') {;
1578
+ // Map line
1579
+ } else {
1580
+ bpCount += (Coordinate) strlen(line) - 1;
1581
+
1582
+ if (sizeof(ShortLength) == sizeof(int16_t) && (bpCount > SHRT_MAX || bpCount < 0)) {
1583
+ velvetLog("Read %li of length %lli, longer than limit %i\n",
1584
+ (long) sequenceIndex + 1, (long long) bpCount, SHRT_MAX);
1585
+ velvetLog("You should modify recompile with the LONGSEQUENCES option (cf. manual)\n");
1586
+ exit(1);
1587
+ }
1588
+ }
1589
+ }
1590
+
1591
+ //velvetLog("Sequence %d has length %d\n", sequenceIndex, bpCount);
1592
+ reads->sequences[sequenceIndex] =
1593
+ mallocOrExit(bpCount + 1, char);
1594
+ fclose(file);
1595
+
1596
+ // Reopen file and memorize line:
1597
+ file = fopen(filename, "r");
1598
+ sequenceIndex = -1;
1599
+ while (fgets(line, maxline, file)) {
1600
+ if (line[0] == '>') {
1601
+ if (sequenceIndex != -1) {
1602
+ sequence[bpCount] = '\0';
1603
+ }
1604
+ sequenceIndex++;
1605
+ bpCount = 0;
1606
+ //velvetLog("Starting to read sequence %d\n",
1607
+ // sequenceIndex);
1608
+ sequence = reads->sequences[sequenceIndex];
1609
+ } else if (line[0] == 'M') {;
1610
+ // Map line
1611
+ } else {
1612
+ lineLength = strlen(line) - 1;
1613
+ strncpy(sequence + bpCount, line, lineLength);
1614
+ bpCount += (Coordinate) lineLength;
1615
+ }
1616
+ }
1617
+
1618
+ sequence[bpCount] = '\0';
1619
+ fclose(file);
1620
+ computeSecondInPair(reads);
1621
+
1622
+ velvetLog("Done\n");
1623
+ return reads;
1624
+
1625
+ }
1626
+
1627
+ void logInstructions(int argc, char **argv, char *directory)
1628
+ {
1629
+ int index;
1630
+ char *logFilename =
1631
+ mallocOrExit(strlen(directory) + 100, char);
1632
+ FILE *logFile;
1633
+ time_t date;
1634
+ char *string;
1635
+
1636
+ time(&date);
1637
+ string = ctime(&date);
1638
+
1639
+ strcpy(logFilename, directory);
1640
+ strcat(logFilename, "/Log");
1641
+ logFile = fopen(logFilename, "a");
1642
+
1643
+ if (logFile == NULL)
1644
+ exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
1645
+
1646
+ velvetFprintf(logFile, "%s", string);
1647
+
1648
+ for (index = 0; index < argc; index++)
1649
+ velvetFprintf(logFile, " %s", argv[index]);
1650
+
1651
+ velvetFprintf(logFile, "\n");
1652
+
1653
+ velvetFprintf(logFile, "Version %i.%i.%2.2i%s\n", VERSION_NUMBER,
1654
+ RELEASE_NUMBER, UPDATE_NUMBER, VERSION_BRANCH);
1655
+ velvetFprintf(logFile, "Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)\n");
1656
+ velvetFprintf(logFile, "This is free software; see the source for copying conditions. There is NO\n");
1657
+ velvetFprintf(logFile, "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
1658
+ velvetFprintf(logFile, "Compilation settings:\n");
1659
+ velvetFprintf(logFile, "CATEGORIES = %i\n", CATEGORIES);
1660
+ velvetFprintf(logFile, "MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
1661
+ #ifdef _OPENMP
1662
+ velvetFprintf(logFile, "OPENMP\n");
1663
+ #endif
1664
+ #ifdef LONGSEQUENCES
1665
+ velvetFprintf(logFile, "LONGSEQUENCES\n");
1666
+ #endif
1667
+ #ifdef BIGASSEMBLY
1668
+ velvetFprintf(logFile, "BIGASSEMBLY\n");
1669
+ #endif
1670
+ #ifdef COLOR
1671
+ velvetFprintf(logFile, "COLOR\n");
1672
+ #endif
1673
+ #ifdef DEBUG
1674
+ velvetFprintf(logFile, "DEBUG\n");
1675
+ #endif
1676
+ velvetFprintf(logFile, "\n");
1677
+
1678
+ fclose(logFile);
1679
+ free(logFilename);
1680
+ }
1681
+
1682
+ void destroyReadSet(ReadSet * reads)
1683
+ {
1684
+ IDnum index;
1685
+
1686
+ if (reads == NULL)
1687
+ return;
1688
+
1689
+ if (reads->sequences != NULL)
1690
+ {
1691
+ for (index = 0; index < reads->readCount; index++)
1692
+ free(reads->sequences[index]);
1693
+ free(reads->sequences);
1694
+ }
1695
+
1696
+ if (reads->tSequences != NULL)
1697
+ free (reads->tSequences);
1698
+
1699
+ if (reads->tSeqMem != NULL)
1700
+ free (reads->tSeqMem);
1701
+
1702
+ if (reads->labels != NULL)
1703
+ for (index = 0; index < reads->readCount; index++)
1704
+ free(reads->labels[index]);
1705
+
1706
+ if (reads->confidenceScores != NULL)
1707
+ for (index = 0; index < reads->readCount; index++)
1708
+ free(reads->confidenceScores[index]);
1709
+
1710
+ if (reads->kmerProbabilities != NULL)
1711
+ for (index = 0; index < reads->readCount; index++)
1712
+ free(reads->kmerProbabilities[index]);
1713
+
1714
+ free(reads->labels);
1715
+ free(reads->confidenceScores);
1716
+ free(reads->kmerProbabilities);
1717
+ free(reads->mateReads);
1718
+ free(reads->categories);
1719
+ free(reads->secondInPair);
1720
+ free(reads);
1721
+ }
1722
+
1723
+ ShortLength *getSequenceLengths(ReadSet * reads, int wordLength)
1724
+ {
1725
+ ShortLength *lengths = callocOrExit(reads->readCount, ShortLength);
1726
+ IDnum index;
1727
+ int lengthOffset = wordLength - 1;
1728
+
1729
+ for (index = 0; index < reads->readCount; index++)
1730
+ lengths[index] =
1731
+ getLength(getTightStringInArray(reads->tSequences, index)) - lengthOffset;
1732
+
1733
+ return lengths;
1734
+ }