finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,303 @@
1
+ class Bio::FinishM::Finisher
2
+ include Bio::FinishM::Logging
3
+
4
# Register the command line options of `finishm finish` on an OptionParser
# and seed the options hash with their defaults.
#
# opts::    the OptionParser the options are defined on
# options:: Hash that receives the defaults immediately, and the parsed
#           values when the command line is parsed later
#
# Numeric options are declared with OptionParser coercions (as the other
# finishm sub-commands do), so a non-numeric value is reported as an invalid
# argument instead of being silently read as 0 by String#to_i.
def add_options(opts, options)
  opts.banner = "\nUsage: finishm finish <options>\n\n"

  options.merge!({
    :min_leftover_length => false,
    :kmer_coverage_target => 1,
    :contig_end_length => 300,
    :graph_search_leash_length => 20000,
    :reads_to_assemble => nil,
  })

  # TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
  # Reads a fasta file that must hold exactly one sequence and returns it as
  # a Bio::Sequence::NA. Used for both the start and the end contig, so the
  # error messages name the offending file rather than assuming which one.
  extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
    contig = nil
    Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |e|
      unless contig.nil?
        raise "Multiple sequences found in contig file #{fasta_file_path}! I need exactly one"
      end
      contig = e.seq
    end
    raise "No sequences found in contig file #{fasta_file_path}, I need exactly one" if contig.nil?
    Bio::Sequence::NA.new(contig.to_s)
  end

  opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
    options[:pattern] = arg
  end
  opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
    options[:kmer_multiple_abundance_file] = arg
  end
  opts.on("--upper-threshold NUM", OptionParser::DecimalInteger, "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg
  end
  opts.on("--lower-threshold NUM", OptionParser::DecimalInteger, "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg
  end
  opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
    options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
  end
  opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
    options[:start_contig] = extract_exactly_one_contig_from_file.call arg
  end
  opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
    options[:end_contig] = extract_exactly_one_contig_from_file.call arg
  end

  opts.separator "\nOptional arguments:\n\n"
  opts.on("--min-leftover-read-length NUMBER", OptionParser::DecimalInteger, "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
    options[:min_leftover_length] = arg
  end
  opts.on("--kmer-coverage-target NUMBER", OptionParser::DecimalInteger, "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
    options[:kmer_coverage_target] = arg
  end
  opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
    options[:already_patterned_reads] = arg
  end
  opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
    options[:output_graph_png] = arg
  end
  opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
    options[:output_graph_svg] = arg
  end
  opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
    options[:output_graph_dot] = arg
  end
  # No default is set for the cutoff in this method, so say so explicitly
  # rather than printing an empty "[default: ]".
  opts.on("--assembly-coverage-cutoff NUMBER", Float, "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff] || 'none'}]") do |arg|
    options[:assembly_coverage_cutoff] = arg
  end
  opts.on("--contig-end-length LENGTH", OptionParser::DecimalInteger, "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
    options[:contig_end_length] = arg
  end

  Bio::FinishM::GraphGenerator.new.add_options opts, options
end
80
+
81
# Check the parsed options of `finishm finish`.
#
# options:: Hash of parsed options
# argv::    arguments left over after option parsing; must be empty
#
# Returns nil when the options are usable, otherwise a String describing the
# first problem found.
#
# The kmer pattern options are only needed when reads still have to be
# selected, i.e. when --already-patterned-reads was not given. The start and
# end contigs are needed in either case, because #run anchors the graph
# search on both of them unconditionally.
def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  return "Dangling argument(s) found e.g. #{argv[0]}" unless argv.empty?

  required = []
  unless options[:already_patterned_reads]
    required += [
      :upper_threshold,
      :lower_threshold,
      :pattern,
      :kmer_multiple_abundance_file,
      :reads_files,
    ]
  end
  required += [:start_contig, :end_contig]

  missing = required.find { |sym| options[sym].nil? }
  return "No option found to specify #{missing}" if missing

  nil #if here, options all were parsed successfully
end
99
+
100
# Run `finishm finish`: try to join the end of the start contig to the
# beginning of the end contig through an assembly of pattern-selected reads.
#
# Steps:
# 1. Unless --already-patterned-reads was given, pick the kmers whose
#    abundance profile is consistent with the requested pattern, extract the
#    reads containing them and pool those reads into one fasta file.
# 2. Build the assembly graph from the pooled reads, anchored by a probe from
#    each contig end.
# 3. Drop graph nodes unconnected to the anchors, optionally draw the graph.
# 4. Search for trails between the two anchors and print each one to stdout
#    in fasta format, attached to the probe sequences.
#
# options:: Hash of options, as filled in by #add_options
# argv::    unused
#
# Exits the process with status 1 when no kmer satisfies the pattern or a
# contig is shorter than :contig_end_length.
def run(options, argv)
  pooled_reads_filename = 'pooled_sampled_reads.fasta' #TODO: remove this constant into a tempfile or something.
  if options[:already_patterned_reads] #If skipping read extraction
    pooled_reads_filename = options[:already_patterned_reads]

  else
    # Parse pattern from cmdline
    desired_pattern = KmerAbundancePattern.new
    desired_pattern.parse_from_human(options[:pattern])
    if options[:reads_files].length != desired_pattern.length
      raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
    end

    whitelist_kmers, blacklist_kmers = partition_kmers_by_pattern(options, desired_pattern)
    log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
    unless whitelist_kmers.length > 0
      log.error "No kmers found that satisfy the given pattern, exiting.."
      exit 1
    end

    pool_reads_containing_kmers(options, desired_pattern, whitelist_kmers, blacklist_kmers, pooled_reads_filename)
  end

  log.info "Extracting dummy reads from the ends of contigs to use as anchors"
  start_contig = options[:start_contig]
  end_contig = options[:end_contig]
  if [start_contig.length, end_contig.length].min < 2*options[:contig_end_length]
    log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
  end
  if [start_contig.length, end_contig.length].min < options[:contig_end_length]
    log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
    exit 1
  end

  # First probe: the tail of the start contig. Second probe: the head of the
  # end contig, reverse complemented.
  probe_sequences = [
    start_contig[start_contig.length-options[:contig_end_length]...start_contig.length],
    Bio::Sequence::NA.new(end_contig[0...options[:contig_end_length]]).reverse_complement.to_s
  ]
  read_input = Bio::FinishM::ReadInput.new
  read_input.fasta_singles = [pooled_reads_filename]
  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)
  graph = finishm_graph.graph
  start_node = finishm_graph.probe_nodes[0]
  start_node_forward = finishm_graph.probe_node_directions[0]
  end_node = finishm_graph.probe_nodes[1]
  end_node_forward = finishm_graph.probe_node_directions[1]

  log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"

  log.info "Removing nodes unconnected to either the start or the end from the graph.."
  original_num_nodes = graph.nodes.length
  original_num_arcs = graph.arcs.length
  filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
  filter.remove_unconnected_nodes(graph, [start_node, end_node])
  log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"

  if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
    viser = Bio::Assembly::ABVisualiser.new
    log.info "Preparing GraphViz object for output"
    gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})

    if options[:output_graph_png]
      log.info "Converting assembly to a graphviz PNG #{options[:output_graph_png] }"
      gv.output :png => options[:output_graph_png], :use => :neato
    end
    if options[:output_graph_svg]
      log.info "Converting assembly to a graphviz SVG #{options[:output_graph_svg] }"
      gv.output :svg => options[:output_graph_svg], :use => :neato
    end
    if options[:output_graph_dot]
      log.info "Converting assembly to a graphviz DOT #{options[:output_graph_dot] }"
      gv.output :dot => options[:output_graph_dot]
    end
  end

  log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
  cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
  #raise "Untested connection finder below"
  #trails = cartographer.find_all_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
  trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
  log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"

  # log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
  # kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
  # log.info "Finished reading the kmer abundances"

  # if options[:trail_kmer_coverage_file]
  #   log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
  #   writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
  #   io = File.open(options[:trail_kmer_coverage_file],'w')
  #   writer.write(io, trails, kmer_hash)
  #   log.info "Finished writing"
  # end

  # log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"
  # kmer_path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
  # thresholds = desired_pattern.collect{|c| c == true ? 1 : 0}
  # log.info "Using thresholds for filtering: #{thresholds}"
  # trails = kmer_path_filter.filter(trails, kmer_hash, thresholds, :exclude_ending_length => options[:kmer_path_end_exclusion_length])
  # log.info "After filtering remained #{trails.length} trails"

  printer = Bio::AssemblyGraphAlgorithms::ContigPrinter.new
  trails.each_with_index do |trail, i|
    log.debug "Before attachment to the contig, sequence of the trail was #{trail.sequence}" if log.debug?
    acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
    acon.start_probe_read_id = 1
    acon.end_probe_read_id = 2
    acon.start_probe_node = start_node
    acon.end_probe_node = end_node
    acon.start_probe_contig_offset = options[:contig_end_length]
    acon.end_probe_contig_offset = options[:contig_end_length]
    acon.paths = [trail]
    log.debug "AnchoredConnection object to print for this trail: #{acon.inspect}" if log.debug?

    puts ">trail#{i+1}"
    puts printer.one_connection_between_two_contigs(
      finishm_graph.graph,
      probe_sequences[0],
      acon,
      probe_sequences[1])
  end
end

# Read the kmer multiple abundance file (space separated: the kmer, then one
# count per sample) and split its kmers into those whose presence/absence
# profile is consistent with desired_pattern and those where it is not.
#
# A count above :upper_threshold is recorded as true, one below
# :lower_threshold as false, and anything in between as '-'.
#
# Returns [whitelist_kmers, blacklist_kmers], two Arrays of kmer Strings.
def partition_kmers_by_pattern(options, desired_pattern)
  whitelist_kmers = []
  blacklist_kmers = []
  # Block form so the file handle is closed once parsing is finished.
  File.open(options[:kmer_multiple_abundance_file]) do |input_file|
    CSV.new(input_file, :col_sep => ' ').each do |row|
      kmer = row[0]
      this_pattern = row[1...row.length].collect do |s|
        count = s.to_i
        if count > options[:upper_threshold]
          true
        elsif count < options[:lower_threshold]
          false
        else
          # coverage was in no man's land between thresholds.
          # Ignore this kmer as noise.
          '-'
        end
      end
      #log.debug "Found pattern #{this_pattern} from kmer #{kmer}, which has abundances #{counts}" if log.debug?

      if desired_pattern.consistent_with? this_pattern
        whitelist_kmers.push kmer
      else
        # kmer is not present when it should be
        blacklist_kmers.push kmer
      end
    end
  end
  [whitelist_kmers, blacklist_kmers]
end

# Extract, from each reads file whose pattern entry is truthy, the reads that
# contain whitelisted kmers (one external read_selection_by_kmer process per
# reads file, run in parallel threads), then concatenate the per-file results
# into pooled_reads_filename.
#
# Writes the files 'whitelist' and 'black' and one '<basename>.sampled_reads.fasta'
# per selected reads file into the current directory.
#
# Raises RuntimeError when an external command fails.
def pool_reads_containing_kmers(options, desired_pattern, whitelist_kmers, blacklist_kmers, pooled_reads_filename)
  # grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler
  whitelist_path = 'whitelist' #TODO: remove 'whitelist' file as a constant
  blacklist_path = 'black'
  File.open(whitelist_path, 'w') { |white| white.puts whitelist_kmers.join("\n") }
  File.open(blacklist_path, 'w') { |black| black.puts blacklist_kmers.join("\n") }

  threadpool = []
  sampled_read_files = []
  log.info "Extracting reads that contain suitable kmers"
  options[:reads_files].each_with_index do |file, i|
    next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified

    sampled = File.basename(file)+'.sampled_reads.fasta'
    sampled_read_files.push sampled

    grep_path = "#{ ENV['HOME'] }/git/priner/bin/read_selection_by_kmer " #TODO: this won't work on other people's systems.
    if options[:min_leftover_length]
      grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
    end
    threadpool.push(Thread.new do
      grep_cmd = "#{grep_path} --whitelist #{whitelist_path} --blacklist #{blacklist_path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
      log.debug "Running cmd: #{grep_cmd}"
      status, stdout, stderr = systemu grep_cmd
      log.debug stderr

      unless status.exitstatus == 0
        raise "Extracting reads from #{file} failed with exit status #{status.exitstatus}"
      end
      log.debug "Finished extracting reads from #{file}"
    end)
  end
  threadpool.each { |thread| thread.join } #wait until everything is finito

  log.info "Finished extracting reads for sampling. Now pooling sampled reads"
  pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
  log.debug "Running cmd: #{pool_cmd}"
  status, stdout, stderr = systemu pool_cmd
  raise stderr if stderr != ''
  unless status.exitstatus == 0
    raise "Pooling the sampled reads failed with exit status #{status.exitstatus}"
  end
end

private :partition_kmers_by_pattern, :pool_reads_containing_kmers
303
+ end
@@ -0,0 +1,122 @@
1
+ class Bio::FinishM::Fluff
2
+ include Bio::FinishM::Logging
3
+
4
# Describe the `finishm fluff` command line on the given OptionParser and
# load the default values into the options hash.
#
# optparse_object:: the OptionParser to define the options on
# options::         Hash receiving defaults now and parsed values later
#
# The read input options and the graph generation options are contributed by
# Bio::FinishM::ReadInput and Bio::FinishM::GraphGenerator respectively.
def add_options(optparse_object, options)
  optparse_object.banner = "\nUsage: finishm fluff --contigs <contig_file> --fastq-gz <reads..> --output-fluff-file <output.fa>

Takes a set of contigs, and places probes across them (e.g. every 2kb), and then explores the
graph from each of these probes, taking all paths within some leash length, including the 'fluff'
which is not the same path as along the contig. Prints out all of these paths to a fasta file.\n\n"

  defaults = {
    :probe_spacing => 2000,
    :probe_length => 100,
    :graph_search_leash_length => 20000,
  }
  options.merge!(defaults)

  optparse_object.separator "\nRequired arguments:\n\n"
  optparse_object.on("--contigs FILE", "fasta file containing contigs to find the fluff on [required]") { |path| options[:contigs_file] = path }
  optparse_object.on("--output-fluff-file PATH", "Output found paths to this file in fasta format [required]") { |path| options[:output_fluff_file] = path }
  optparse_object.separator "\nThere must be some definition of reads too:\n\n" #TODO improve this help
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)

  optparse_object.separator "\nOptional arguments:\n\n"
  # Integer-valued options: [switch, options key, help text without the default]
  [
    ["--probe-spacing NUM", :probe_spacing, "Distance between probe points in the contig"],
    ["--probe-size NUM", :probe_length, "Length of the probe to be inserted into the velvet graph. Must be greater than graph kmer length."],
    ["--leash-length NUM", :graph_search_leash_length, "Don't explore too far in the graph, only this far and not much more"],
  ].each do |switch, key, description|
    optparse_object.on(switch, Integer, "#{description} [default: #{options[key]}]") do |number|
      options[key] = number
    end
  end

  # Graph drawing outputs: [file extension, article used in the help, options key]
  [
    ['png', 'a', :output_graph_png],
    ['svg', 'an', :output_graph_svg],
    ['dot', 'an', :output_graph_dot],
  ].each do |extension, article, key|
    optparse_object.on("--assembly-#{extension} PATH", "Output assembly as #{article} #{extension.upcase} file [default: off]") do |path|
      options[key] = path
    end
  end

  Bio::FinishM::GraphGenerator.new.add_options optparse_object, options
end
49
+
50
# Check the parsed options of `finishm fluff`.
#
# options:: Hash of parsed options
# argv::    arguments left over after option parsing; must be empty
#
# Returns a String describing the first problem found, otherwise whatever
# Bio::FinishM::ReadInput#validate_options returns for the read options.
def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  #TODO: require reads options
  return "Dangling argument(s) found e.g. #{argv[0]}" unless argv.length == 0

  unspecified = [:contigs_file, :output_fluff_file].find { |sym| options[sym].nil? }
  return "No option found to specify #{unspecified}." if unspecified

  # A probe no longer than the kmer cannot become part of the kmer graph
  kmer_shorter_than_probe = options[:velvet_kmer_size] < options[:probe_length]
  unless kmer_shorter_than_probe
    return "The probe length must be greater than the kmer length, otherwise it will not be incorporated into the kmer graph"
  end

  # The read options have the final say; nil from here means all is well
  Bio::FinishM::ReadInput.new.validate_options(options, [])
end
73
+
74
# Run `finishm fluff`: place probes along every contig, explore the assembly
# graph from each probe, and write every path found to a fasta file.
#
# options:: Hash of options, as filled in by #add_options
# argv::    unused
#
# Output records are named ">probe<P>_path<N> <fate>", numbered from 1, where
# the fate is taken from the path set the path belongs to.
def run(options, argv)
  # Read in all the contigs sequences
  probe_sequences = []
  sequence_names = []
  Bio::FlatFile.foreach(options[:contigs_file]) do |seq|
    sequence_names.push seq.definition

    sequence = seq.seq
    # NOTE(review): the last offset tried is length-1-probe_length, so a probe
    # ending exactly on the last base of the contig is never made - confirm
    # whether that is intended.
    0.step(sequence.length-1-options[:probe_length], options[:probe_spacing]) do |offset|
      # Only probe in the forward direction
      probe_sequence = sequence[offset...offset+options[:probe_length]]
      probe_sequences.push probe_sequence
    end
  end
  log.info "Searching from #{probe_sequences.length} different probes from #{sequence_names.length} contigs"

  # Generate the graph with the probe sequences in it.
  read_input = Bio::FinishM::ReadInput.new
  read_input.parse_options options
  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)

  # Loop over the ends, trying to make connections from each one
  fluffer = Bio::AssemblyGraphAlgorithms::Fluffer.new
  fluffings = fluffer.fluff(finishm_graph, options[:graph_search_leash_length])
  log.debug "Found these fluffings: #{fluffings}" if log.debug?
  # Seed the sum with 0 so that "0 paths" is reported when there are no probes
  log.info "Found #{fluffings.collect{|sets| sets.length}.reduce(0, :+)} paths in total" if log.info?

  if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
    viser = Bio::Assembly::ABVisualiser.new
    gv = viser.graphviz(finishm_graph.graph, {:start_node_ids => finishm_graph.probe_nodes.collect{|node| node.node_id}})

    if options[:output_graph_png]
      log.info "Converting assembly to a graphviz PNG"
      gv.output :png => options[:output_graph_png], :use => :neato
    end
    if options[:output_graph_svg]
      log.info "Converting assembly to a graphviz SVG"
      gv.output :svg => options[:output_graph_svg], :use => :neato
    end
    if options[:output_graph_dot]
      log.info "Converting assembly to a graphviz DOT"
      gv.output :dot => options[:output_graph_dot]
    end
  end

  # Print out the sequences
  File.open(options[:output_fluff_file], 'w') do |output|
    fluffings.each_with_index do |path_set, probe_number|
      path_set.each_with_index do |path, path_number|
        fate = path_set.fates[path_number]
        output.puts ">probe#{probe_number+1}_path#{path_number+1} #{fate}"
        output.puts path.sequence
      end
    end
  end
end
122
+ end
@@ -0,0 +1,325 @@
1
+ require 'tmpdir'
2
+
3
+ class Bio::FinishM::GapFiller
4
+ include Bio::FinishM::Logging
5
+
6
# Describe the `finishm gapfill` command line on the given OptionParser and
# load the default values into the options hash.
#
# optparse_object:: the OptionParser to define the options on
# options::         Hash receiving defaults now and parsed values later
#
# The read input options and the graph generation options are contributed by
# Bio::FinishM::ReadInput and Bio::FinishM::GraphGenerator respectively.
def add_options(optparse_object, options)
  optparse_object.banner = "\nUsage: finishm gapfill --contigs <contigs_file> --fastq-gz <reads..> --output-fasta <output.fa>

Takes a set of reads and a contig that contains gap characters. Then it tries to fill in
these N characters. It is possible that there is multiple ways to close the gap - in that case
each can be reported.

example: finishm gapfill --contigs to_gapfill.fasta --fastq-gz reads.1.fq.gz,reads.2.fq.gz --output-fasta output.fasta
\n"

  defaults = {
    :contig_end_length => 200,
    :graph_search_leash_length => 20000,
  }
  options.merge!(defaults)

  optparse_object.separator "\nRequired arguments:\n\n"
  optparse_object.on("--contigs FILE", "fasta file of single contig containing Ns that are to be closed [required]") { |path| options[:contigs_file] = path }
  optparse_object.on("--output-fasta PATH", "Output the gap-filled sequence to this file [required]") { |path| options[:overall_fasta_file] = path }

  optparse_object.separator "\nThere must be some definition of of how to do the assembly, or else a path to a previous assembly directory:\n\n"
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)
  Bio::FinishM::GraphGenerator.new.add_options optparse_object, options

  optparse_object.separator "\nGraph search options:\n\n"
  # Integer-valued options: [switch, options key, help text including the default]
  [
    ["--overhang NUM", :contig_end_length, "Start assembling this many base pairs back from the gap [default: #{options[:contig_end_length] }]"],
    ["--leash-length NUM", :graph_search_leash_length, "Don't explore too far in the graph, only this many base pairs and not (much) more [default: #{options[:graph_search_leash_length] }]"],
    ["--recoherence-kmer NUM", :recoherence_kmer, "Use a kmer longer than the original velvet one, to help remove bubbles and circular paths [default: none]"],
  ].each do |switch, key, description|
    optparse_object.on(switch, Integer, description) { |number| options[key] = number }
  end

  optparse_object.separator "\nVisualisation options (of all joins):\n\n"
  # Graph drawing outputs: [file extension, article used in the help, options key]
  [
    ['png', 'a', :output_graph_png],
    ['svg', 'an', :output_graph_svg],
    ['dot', 'an', :output_graph_dot],
  ].each do |extension, article, key|
    optparse_object.on("--assembly-#{extension} PATH", "Output assembly as #{article} #{extension.upcase} file [default: off]") do |path|
      options[key] = path
    end
  end
end
55
+
56
# Check the parsed options of `finishm gapfill`.
#
# options:: Hash of parsed options
# argv::    arguments left over after option parsing; must be empty
#
# Returns a String describing the first problem found, otherwise whatever
# Bio::FinishM::ReadInput#validate_options returns for the read options.
def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  #TODO: require reads options
  return "Dangling argument(s) found e.g. #{argv[0] }" unless argv.length == 0

  unspecified = [:contigs_file, :overall_fasta_file].find { |sym| options[sym].nil? }
  return "No option found to specify #{unspecified}" if unspecified

  # The read options have the final say; nil from here means all is well
  Bio::FinishM::ReadInput.new.validate_options(options, [])
end
75
+
76
# Run `finishm gapfill`: for every gap in the input scaffolds, search the
# assembly graph for trails bridging it and write the scaffolds, gap-filled
# where possible, to the output fasta file.
#
# options:: Hash of options, as filled in by #add_options
# argv::    unused
#
# A gap is replaced by sequence only when exactly one trail bridges it. Gaps
# with zero or several trails, and gaps for which no probes could be made
# (too close to a contig end or to another gap), are written back as Ns.
# When options[:overall_trail_output_fasta_file] is set, the sequence of
# every trail found is also written to that file.
def run(options, argv)
  # Read in all the contigs sequences and work out where the gaps are.
  # The object is not used further in this method; it is still constructed in
  # case its construction checks the input - presumably; verify.
  Bio::FinishM::InputGenome.new(
    options[:contigs_file],
    options[:contig_end_length],
    options
  )

  scaffolds = Bio::FinishM::ScaffoldBreaker.new.break_scaffolds(options[:contigs_file])
  gaps = []
  output_fasta_file = File.open(options[:overall_fasta_file],'w')
  num_without_gaps = 0
  scaffolds.each do |scaffold|
    sgaps = scaffold.gaps
    if sgaps.empty?
      # Nothing to fill, so the scaffold is written out unchanged
      num_without_gaps += 1
      output_fasta_file.puts ">#{scaffold.name }"
      output_fasta_file.puts scaffold.sequence
    else
      gaps.push sgaps
    end
  end
  gaps.flatten!
  log.info "Detected #{gaps.length} gap(s) from #{scaffolds.length} different contig(s). #{num_without_gaps } contig(s) were gap-free."

  # Create probe sequences
  probe_sequences = []
  # Parallel to gaps: index into probe_sequences of the first of the two
  # probes of each gap, or nil when the gap was skipped. Needed because a
  # skipped gap contributes no probes, so probe positions cannot be derived
  # from gap positions.
  gap_probe_indices = []
  gaps.each_with_index do |gap, gap_index|
    sequence = gap.scaffold.sequence

    # <= rather than < on the left side: the first probe starts at
    # gap.start-contig_end_length-1, which must not be negative.
    if gap.start <= options[:contig_end_length] or gap.stop > sequence.length - options[:contig_end_length]
      log.warn "Found a gap that was too close to the end of a contig, skipping it: #{gap.coords}"
      next
    end

    log.debug "Processing gap number #{gap.number}, #{gap.coords}"
    first_coords = [
      gap.start-options[:contig_end_length]-1,
      gap.start-1,
    ]
    second_coords = [
      gap.stop,
      (gap.stop+options[:contig_end_length]),
    ]
    log.debug "Coordinates of the probes are #{first_coords} and #{second_coords}"
    second = sequence[second_coords[0]..second_coords[1]]
    probes = [
      sequence[first_coords[0]...first_coords[1]],
      Bio::Sequence::NA.new(second).reverse_complement.to_s,
    ]
    #TODO: this could probably be handled better.. e.g. if the amount of sequence is too small, just throw it out and make one big gap
    if probes[0].match(/N/i) or probes[1].match(/N/i)
      log.warn "Noticed gap that was too close together, skipping: #{gap.coords}"
      next
    end
    gap_probe_indices[gap_index] = probe_sequences.length
    probe_sequences.push probes[0]
    probe_sequences.push probes[1]
  end
  log.debug "Generated #{probe_sequences.length} probes e.g. #{probe_sequences[0] }"


  # Generate the graph with the probe sequences in it.
  read_input = Bio::FinishM::ReadInput.new
  read_input.parse_options options
  # Own the tmpdir, if one is to be used - need to re-read the LastGraph later on see..
  assembly_directory = options[:output_assembly_path]
  assembly_directory ||= options[:previous_assembly]
  using_tmp_assembly_directory = false
  if assembly_directory.nil?
    using_tmp_assembly_directory = true
    assembly_directory = Dir.mktmpdir
    options[:output_assembly_path] = assembly_directory
  end

  # Do the actual graph building and/or initial reading
  options[:parse_sequences] = true
  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)

  # Output optional graphics.
  if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
    viser = Bio::Assembly::ABVisualiser.new
    # TODO: make these visualise more than one join somehow
    gv = viser.graphviz(finishm_graph.graph, {
      :start_node_id => finishm_graph.probe_nodes[0].node_id,
      :end_node_id => finishm_graph.probe_nodes[1].node_id})

    if options[:output_graph_png]
      log.info "Converting assembly to a graphviz PNG"
      gv.output :png => options[:output_graph_png], :use => :neato
    end
    if options[:output_graph_svg]
      log.info "Converting assembly to a graphviz SVG"
      gv.output :svg => options[:output_graph_svg], :use => :neato
    end
    if options[:output_graph_dot]
      log.info "Converting assembly to a graphviz DOT"
      gv.output :dot => options[:output_graph_dot]
    end
  end

  # Clean up the tmdir, if one was used.
  if using_tmp_assembly_directory
    log.debug "Removing tmpdir that held the assembly `#{assembly_directory}'.."
    FileUtils.remove_entry assembly_directory
  end

  # Do the gap-filling and print out the results
  printer = Bio::AssemblyGraphAlgorithms::ContigPrinter.new
  num_total_trails = 0
  num_singly_filled = 0
  num_unbridgable = 0

  output_trails_file = nil
  output_trails_file = File.open(options[:overall_trail_output_fasta_file],'w') unless options[:overall_trail_output_fasta_file].nil?

  # Print the fasta output for the scaffold
  print_scaffold = lambda do |last_scaffold, gapfilled_sequence|
    output_fasta_file.puts ">#{last_scaffold.name }"
    #gapfilled_sequence += last_scaffold.contigs[last_scaffold.contigs.length-1].sequence #add last contig
    output_fasta_file.puts gapfilled_sequence
  end
  # Lambda to add a gap the the String representing the scaffold.
  # anchored_connection is nil for a gap that no search was run for.
  #TODO: if the trail is not filled then the wrong sequence is currently printed. BUG???
  filler = lambda do |anchored_connection, following_contig, gapfilled_sequence, gap|
    gapfilled = nil
    if anchored_connection and anchored_connection.paths.length == 1
      # If there is only 1 trail, then output scaffolding information
      num_singly_filled += 1

      gapfilled = printer.one_connection_between_two_contigs(
        finishm_graph.graph,
        gapfilled_sequence,
        anchored_connection,
        following_contig.sequence
      )
    else
      # Otherwise don't make any assumptions
      num_unbridgable += 1 if anchored_connection and anchored_connection.paths.empty?
      # TODO: even the there is multiple trails, better info can still be output here
      gapfilled = gapfilled_sequence + 'N'*gap.length + following_contig.sequence
    end
    gapfilled #return this string
  end

  log.info "Searching for trails between the nodes within the assembly graph"
  log.info "Using contig overhang length #{options[:contig_end_length] } and leash length #{options[:graph_search_leash_length] }"
  gapfilled_sequence = ''
  last_scaffold = nil

  # Walk over every gap in order, including those without probes, so that
  # each scaffold is rebuilt from all of its contigs.
  gaps.each_with_index do |gap, gap_index|
    start_probe_index = gap_probe_indices[gap_index]
    connection = nil
    if start_probe_index.nil?
      log.info "Leaving gap number #{gap_index+1} unfilled as no probes were made for it: #{gap.coords}"
    else
      log.info "Now working through gap number #{gap_index+1}: #{gap.coords}"

      probe_index1 = start_probe_index
      probe_index2 = start_probe_index+1

      connection = gapfill(finishm_graph, probe_index1, probe_index2, options)
      log.info "Found #{connection.paths.length} trails for #{gap.coords}"

      unless output_trails_file.nil?
        # print the sequences of the trails if asked for:
        connection.paths.each_with_index do |trail, i|
          #TODO: need to output this as something more sensible e.g. VCF format
          output_trails_file.puts ">#{gap.coords}_trail#{i+1}"
          output_trails_file.puts trail.sequence
        end
      end
      num_total_trails += connection.paths.length
    end

    # Output the updated sequence. Fill in the sequence if there is only 1 trail
    if gap.scaffold == last_scaffold
      # We are still building the current scaffold
      #gapfilled_sequence += gap.scaffold.contigs[gap.number].sequence
      log.debug "Before adding next chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" if log.debug?
      gapfilled_sequence = filler.call connection, gap.scaffold.contigs[gap.number+1], gapfilled_sequence, gap
      log.debug "After adding next chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" if log.debug?
    else
      # We are onto a new scaffold. Print the previous one (unless this the first one)
      unless last_scaffold.nil?
        # print the gapfilled (or not) scaffold.
        print_scaffold.call(last_scaffold, gapfilled_sequence)
      end
      #reset
      last_scaffold = gap.scaffold

      #add the current gap (and the contig before it)
      log.debug "Before adding first chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}"
      gapfilled_sequence = gap.scaffold.contigs[gap.number].sequence
      log.debug "After adding first chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}"
      gapfilled_sequence = filler.call connection, gap.scaffold.contigs[gap.number+1], gapfilled_sequence, gap
      log.debug "After adding first gap sequence and next contig, gapfilled sequence length is #{gapfilled_sequence.length}"
    end
  end
  # print the last scaffold, if any scaffold had gaps at all
  print_scaffold.call(last_scaffold, gapfilled_sequence) unless last_scaffold.nil?

  log.info "#{num_unbridgable } gaps had no suitable bridging paths in the graph within the leash, and found #{num_total_trails} trails in total."
  log.info "Filled #{num_singly_filled } out of #{gaps.length } gaps."

  output_trails_file.close unless output_trails_file.nil?
  output_fasta_file.close
end
279
+
280
+ # Given a finishm graph, gapfill from the first probe to the second. Return a
281
+ # Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection object
282
def gapfill(finishm_graph, probe_index1, probe_index2, options)
  # Search the graph for trails leading from the node holding the first probe
  # to the node holding the second probe, and package the result as an
  # AnchoredConnection.
  #
  # finishm_graph - object answering velvet_oriented_node, adjusted_leash_length,
  #                 graph, velvet_sequences and probe_node_reads
  # probe_index1  - index of the probe the search starts from
  # probe_index2  - index of the probe the search must reach
  # options       - Hash; the keys read here are :graph_search_leash_length,
  #                 :recoherence_kmer, :max_explore_nodes, :max_gapfill_paths
  #                 and :contig_end_length
  #
  # Returns a Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.
  # Raises RuntimeError when either probe cannot be located in the graph.
  start_onode = finishm_graph.velvet_oriented_node(probe_index1)
  end_onode_inward = finishm_graph.velvet_oriented_node(probe_index2)
  unless start_onode && end_onode_inward
    # The gap itself is not passed to this method, so the message identifies
    # the failure by the probe indices, which are the only identifiers in scope.
    raise "Unable to retrieve both probes from the graph (probe indices #{probe_index1} and #{probe_index2}), fail"
  end

  # The probe from finishm_graph points in the wrong direction for path
  # finding, so build a copy of the end node with its orientation flipped.
  end_onode = Bio::Velvet::Graph::OrientedNodeTrail::OrientedNode.new
  end_onode.node = end_onode_inward.node
  end_onode.first_side =
    if end_onode_inward.starts_at_start?
      Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST
    else
      Bio::Velvet::Graph::OrientedNodeTrail::START_IS_FIRST
    end

  adjusted_leash_length = finishm_graph.adjusted_leash_length(probe_index1, options[:graph_search_leash_length])
  log.debug "Using adjusted leash length #{adjusted_leash_length }" if log.debug?

  cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
  trails = cartographer.find_trails_between_nodes(
    finishm_graph.graph, start_onode, end_onode, adjusted_leash_length, {
      :recoherence_kmer => options[:recoherence_kmer],
      :sequences => finishm_graph.velvet_sequences,
      :max_explore_nodes => options[:max_explore_nodes],
      :max_gapfill_paths => options[:max_gapfill_paths],
    }
  )
  # NOTE(review): only a warning is emitted here; whatever the finder returned
  # is still converted below. Presumably the finder yields no trails in this
  # case - confirm against AcyclicConnectionFinder.
  if trails.circular_paths_detected
    log.warn "Circular path detected here, not attempting to gapfill"
  end

  # Convert the trails into OrientedNodeTrail objects
  paths = trails.map do |trail|
    path = Bio::Velvet::Graph::OrientedNodeTrail.new
    path.trail = trail
    path
  end

  acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
  acon.start_probe_noded_read = finishm_graph.probe_node_reads[probe_index1]
  acon.end_probe_noded_read = finishm_graph.probe_node_reads[probe_index2]
  # Both probes use the same :contig_end_length value as their contig offset.
  acon.start_probe_contig_offset = options[:contig_end_length]
  acon.end_probe_contig_offset = options[:contig_end_length]
  acon.paths = paths

  acon
end
325
+ end