finishm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +1 -0
  5. data/Gemfile +31 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +59 -0
  8. data/Rakefile +51 -0
  9. data/VERSION +1 -0
  10. data/bin/assembly_visualiser +106 -0
  11. data/bin/check_primer_combinations.rb +73 -0
  12. data/bin/contig_joiner.rb +244 -0
  13. data/bin/contigs_against_assembly.rb +153 -0
  14. data/bin/finishm +143 -0
  15. data/bin/finishm_assembler +55 -0
  16. data/bin/finishm_gap_closer.rb +241 -0
  17. data/bin/kmer_abundance_file_tool.rb +49 -0
  18. data/bin/kmer_pattern_to_assembly.rb +377 -0
  19. data/bin/kmer_profile_finder.rb +92 -0
  20. data/bin/kmers_count_parse.d +52 -0
  21. data/bin/kmers_count_tabulate.d +123 -0
  22. data/bin/kmers_count_tabulate.rb +84 -0
  23. data/bin/pcr_result_parser.rb +108 -0
  24. data/bin/primer_finder.rb +119 -0
  25. data/bin/read_selection_by_kmer.d +174 -0
  26. data/bin/scaffold_by_pattern.rb +119 -0
  27. data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
  28. data/bin/scaffold_end_coverages.rb +69 -0
  29. data/bin/trail_validator.rb +84 -0
  30. data/ext/mkrf_conf.rb +56 -0
  31. data/ext/src/Makefile +140 -0
  32. data/ext/src/src/allocArray.c +305 -0
  33. data/ext/src/src/allocArray.h +86 -0
  34. data/ext/src/src/autoOpen.c +107 -0
  35. data/ext/src/src/autoOpen.h +18 -0
  36. data/ext/src/src/binarySequences.c +813 -0
  37. data/ext/src/src/binarySequences.h +125 -0
  38. data/ext/src/src/concatenatedGraph.c +233 -0
  39. data/ext/src/src/concatenatedGraph.h +30 -0
  40. data/ext/src/src/concatenatedPreGraph.c +262 -0
  41. data/ext/src/src/concatenatedPreGraph.h +29 -0
  42. data/ext/src/src/correctedGraph.c +2643 -0
  43. data/ext/src/src/correctedGraph.h +32 -0
  44. data/ext/src/src/dfib.c +509 -0
  45. data/ext/src/src/dfib.h +69 -0
  46. data/ext/src/src/dfibHeap.c +89 -0
  47. data/ext/src/src/dfibHeap.h +39 -0
  48. data/ext/src/src/dfibpriv.h +105 -0
  49. data/ext/src/src/fib.c +628 -0
  50. data/ext/src/src/fib.h +78 -0
  51. data/ext/src/src/fibHeap.c +79 -0
  52. data/ext/src/src/fibHeap.h +41 -0
  53. data/ext/src/src/fibpriv.h +110 -0
  54. data/ext/src/src/globals.h +154 -0
  55. data/ext/src/src/graph.c +3932 -0
  56. data/ext/src/src/graph.h +233 -0
  57. data/ext/src/src/graphReConstruction.c +1472 -0
  58. data/ext/src/src/graphReConstruction.h +30 -0
  59. data/ext/src/src/graphStats.c +2167 -0
  60. data/ext/src/src/graphStats.h +72 -0
  61. data/ext/src/src/graphStructures.h +52 -0
  62. data/ext/src/src/kmer.c +652 -0
  63. data/ext/src/src/kmer.h +73 -0
  64. data/ext/src/src/kmerOccurenceTable.c +236 -0
  65. data/ext/src/src/kmerOccurenceTable.h +44 -0
  66. data/ext/src/src/kseq.h +223 -0
  67. data/ext/src/src/locallyCorrectedGraph.c +557 -0
  68. data/ext/src/src/locallyCorrectedGraph.h +40 -0
  69. data/ext/src/src/passageMarker.c +677 -0
  70. data/ext/src/src/passageMarker.h +137 -0
  71. data/ext/src/src/preGraph.c +1717 -0
  72. data/ext/src/src/preGraph.h +106 -0
  73. data/ext/src/src/preGraphConstruction.c +990 -0
  74. data/ext/src/src/preGraphConstruction.h +26 -0
  75. data/ext/src/src/probe_node_finder.c +84 -0
  76. data/ext/src/src/probe_node_finder.h +6 -0
  77. data/ext/src/src/readCoherentGraph.c +557 -0
  78. data/ext/src/src/readCoherentGraph.h +30 -0
  79. data/ext/src/src/readSet.c +1734 -0
  80. data/ext/src/src/readSet.h +67 -0
  81. data/ext/src/src/readToNode.c +218 -0
  82. data/ext/src/src/readToNode.h +35 -0
  83. data/ext/src/src/recycleBin.c +199 -0
  84. data/ext/src/src/recycleBin.h +58 -0
  85. data/ext/src/src/roadMap.c +342 -0
  86. data/ext/src/src/roadMap.h +65 -0
  87. data/ext/src/src/run.c +318 -0
  88. data/ext/src/src/run.h +52 -0
  89. data/ext/src/src/run2.c +744 -0
  90. data/ext/src/src/runReadToNode.c +29 -0
  91. data/ext/src/src/scaffold.c +1876 -0
  92. data/ext/src/src/scaffold.h +64 -0
  93. data/ext/src/src/shortReadPairs.c +1243 -0
  94. data/ext/src/src/shortReadPairs.h +32 -0
  95. data/ext/src/src/splay.c +259 -0
  96. data/ext/src/src/splay.h +43 -0
  97. data/ext/src/src/splayTable.c +1315 -0
  98. data/ext/src/src/splayTable.h +31 -0
  99. data/ext/src/src/tightString.c +362 -0
  100. data/ext/src/src/tightString.h +82 -0
  101. data/ext/src/src/utility.c +199 -0
  102. data/ext/src/src/utility.h +98 -0
  103. data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
  104. data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
  105. data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
  106. data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
  107. data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
  108. data/ext/src/third-party/zlib-1.2.3/README +125 -0
  109. data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
  110. data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
  111. data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
  112. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
  113. data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
  114. data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
  115. data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
  116. data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
  117. data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
  118. data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
  119. data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
  120. data/ext/src/third-party/zlib-1.2.3/configure +459 -0
  121. data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
  122. data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
  123. data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
  124. data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
  125. data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
  126. data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
  127. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
  128. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
  129. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
  130. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
  131. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
  132. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
  133. data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
  134. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
  135. data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
  136. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
  137. data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
  138. data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
  139. data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
  140. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
  141. data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
  142. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
  143. data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
  144. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
  145. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
  146. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
  147. data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
  148. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
  149. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
  150. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
  151. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
  152. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
  153. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
  154. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
  155. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
  156. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
  157. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
  158. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
  159. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
  160. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
  161. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
  162. data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
  163. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
  164. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
  165. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
  166. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
  167. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
  168. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
  169. data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
  170. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
  171. data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
  172. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
  173. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
  174. data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
  175. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
  176. data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
  177. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
  178. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
  179. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
  180. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
  181. data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
  182. data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
  183. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
  184. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
  185. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
  186. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
  187. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
  188. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
  189. data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
  190. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
  191. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
  192. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
  193. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
  194. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
  195. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
  196. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
  197. data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
  198. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
  199. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
  200. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
  201. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
  202. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
  203. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
  204. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
  205. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
  206. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
  207. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
  208. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
  209. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
  210. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
  211. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
  212. data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
  213. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
  214. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
  215. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
  216. data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
  217. data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
  218. data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
  219. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
  220. data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
  221. data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
  222. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
  223. data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
  224. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
  225. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
  226. data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
  227. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
  228. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
  229. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
  230. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
  231. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
  232. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
  233. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
  234. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
  235. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
  236. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
  237. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
  238. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
  239. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
  240. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
  241. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
  242. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
  243. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
  244. data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
  245. data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
  246. data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
  247. data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
  248. data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
  249. data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
  250. data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
  251. data/ext/src/third-party/zlib-1.2.3/example +0 -0
  252. data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
  253. data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
  254. data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
  255. data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
  256. data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
  257. data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
  258. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
  259. data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
  260. data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
  261. data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
  262. data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
  263. data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
  264. data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
  265. data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
  266. data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
  267. data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
  268. data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
  269. data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
  270. data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
  271. data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
  272. data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
  273. data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
  274. data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
  275. data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
  276. data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
  277. data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
  278. data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
  279. data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
  280. data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
  281. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
  282. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
  283. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
  284. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
  285. data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
  286. data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
  287. data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
  288. data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
  289. data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
  290. data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
  291. data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
  292. data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
  293. data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
  294. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
  295. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
  296. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
  297. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
  298. data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
  299. data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
  300. data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
  301. data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
  302. data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
  303. data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
  304. data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
  305. data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
  306. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
  307. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
  308. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
  309. data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
  310. data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
  311. data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
  312. data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
  313. data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
  314. data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
  315. data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
  316. data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
  317. data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
  318. data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
  319. data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
  320. data/lib/assembly/a_b_visualiser.rb +169 -0
  321. data/lib/assembly/acyclic_connection_finder.rb +81 -0
  322. data/lib/assembly/all_orfs.rb +615 -0
  323. data/lib/assembly/bad_format_writer.rb +46 -0
  324. data/lib/assembly/bam_probe_read_selector.rb +48 -0
  325. data/lib/assembly/bubbly_assembler.rb +842 -0
  326. data/lib/assembly/c_probe_node_finder.rb +38 -0
  327. data/lib/assembly/connection_interpreter.rb +350 -0
  328. data/lib/assembly/contig_printer.rb +400 -0
  329. data/lib/assembly/coverage_based_graph_filter.rb +68 -0
  330. data/lib/assembly/depth_first_search.rb +63 -0
  331. data/lib/assembly/dijkstra.rb +216 -0
  332. data/lib/assembly/fluffer.rb +253 -0
  333. data/lib/assembly/graph_explorer.rb +85 -0
  334. data/lib/assembly/graph_generator.rb +315 -0
  335. data/lib/assembly/height_finder.rb +355 -0
  336. data/lib/assembly/hybrid_velvet_graph.rb +70 -0
  337. data/lib/assembly/input_genome.rb +182 -0
  338. data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
  339. data/lib/assembly/node_finder.rb +171 -0
  340. data/lib/assembly/oriented_node_trail.rb +507 -0
  341. data/lib/assembly/paired_end_assembler.rb +53 -0
  342. data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
  343. data/lib/assembly/probed_graph.rb +105 -0
  344. data/lib/assembly/read_input.rb +79 -0
  345. data/lib/assembly/read_to_node.rb +37 -0
  346. data/lib/assembly/scaffold_breaker.rb +126 -0
  347. data/lib/assembly/sequence_hasher.rb +71 -0
  348. data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
  349. data/lib/assembly/single_coherent_wanderer.rb +261 -0
  350. data/lib/assembly/single_ended_assembler.rb +441 -0
  351. data/lib/assembly/velvet_c_binding.rb +54 -0
  352. data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
  353. data/lib/external/VERSION +1 -0
  354. data/lib/finishm/assemble.rb +224 -0
  355. data/lib/finishm/explore.rb +217 -0
  356. data/lib/finishm/finisher.rb +303 -0
  357. data/lib/finishm/fluff.rb +122 -0
  358. data/lib/finishm/gapfiller.rb +325 -0
  359. data/lib/finishm/orfs_finder.rb +88 -0
  360. data/lib/finishm/path_counter.rb +90 -0
  361. data/lib/finishm/primers.rb +425 -0
  362. data/lib/finishm/primers_check.rb +176 -0
  363. data/lib/finishm/roundup.rb +344 -0
  364. data/lib/finishm/sequence.rb +142 -0
  365. data/lib/finishm/visualise.rb +430 -0
  366. data/lib/finishm/wander.rb +270 -0
  367. data/lib/kmer_abundance_pattern.rb +79 -0
  368. data/lib/kmer_multi_abundance_file.rb +48 -0
  369. data/lib/oligo_designer.rb +88 -0
  370. data/lib/priner.rb +66 -0
  371. data/spec/acyclic_connection_finder_spec.rb +551 -0
  372. data/spec/all_orfs_spec.rb +443 -0
  373. data/spec/assemble_spec.rb +186 -0
  374. data/spec/bubbly_assembler_spec.rb +707 -0
  375. data/spec/c_node_finder_spec.rb +58 -0
  376. data/spec/connection_interpreter_spec.rb +284 -0
  377. data/spec/contig_printer_spec.rb +291 -0
  378. data/spec/coverage_based_graph_filter_spec.rb +102 -0
  379. data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
  380. data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
  381. data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
  382. data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
  383. data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
  384. data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
  385. data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
  386. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
  387. data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
  388. data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
  389. data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
  390. data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
  391. data/spec/data/assembly_visualiser/Graph +46 -0
  392. data/spec/data/assembly_visualiser/start_kmers1 +2 -0
  393. data/spec/data/bands.csv +1 -0
  394. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
  395. data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
  396. data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
  397. data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
  398. data/spec/data/c_probe_node_finder/1/Log +756 -0
  399. data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
  400. data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
  401. data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
  402. data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
  403. data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
  404. data/spec/data/contig_printer/1/contigs.fa +4 -0
  405. data/spec/data/contig_printer/1/seq.fa +2408 -0
  406. data/spec/data/contig_printer/1/seq.fa.svg +153 -0
  407. data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
  408. data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
  409. data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
  410. data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
  411. data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
  412. data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
  413. data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
  414. data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
  415. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
  416. data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
  417. data/spec/data/contig_printer/1/seq.node12.fa +4 -0
  418. data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
  419. data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
  420. data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
  421. data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
  422. data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
  423. data/spec/data/explore/1/a.fa +2 -0
  424. data/spec/data/explore/1/seq1_and_a.fa +3 -0
  425. data/spec/data/explore/1/seq2.fa +2 -0
  426. data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
  427. data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
  428. data/spec/data/fluff/1/seq1.fa +2 -0
  429. data/spec/data/fluff/1/seq2.fa +2 -0
  430. data/spec/data/gapfilling/1/reads.fa +171 -0
  431. data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
  432. data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
  433. data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
  434. data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
  435. data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
  436. data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
  437. data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
  438. data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
  439. data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
  440. data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
  441. data/spec/data/gapfilling/2/reference.fa +2 -0
  442. data/spec/data/gapfilling/2/reference_part1.fa +4 -0
  443. data/spec/data/gapfilling/2/reference_part2.fa +4 -0
  444. data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
  445. data/spec/data/gapfilling/2/with_gaps.fa +4 -0
  446. data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
  447. data/spec/data/gapfilling/3/reads.fa.gz +0 -0
  448. data/spec/data/gapfilling/3/reference_part1.fa +4 -0
  449. data/spec/data/gapfilling/3/reference_part2.fa +4 -0
  450. data/spec/data/gapfilling/3/with_gaps.fa +4 -0
  451. data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
  452. data/spec/data/gapfilling/4/reads.fa.gz +0 -0
  453. data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
  454. data/spec/data/gapfilling/5/answer.fna +2 -0
  455. data/spec/data/gapfilling/5/gappy.fna +2 -0
  456. data/spec/data/gapfilling/5/reads.fa +17961 -0
  457. data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
  458. data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
  459. data/spec/data/gapfilling/6/random1.fa +28 -0
  460. data/spec/data/gapfilling/6/random2.fa +28 -0
  461. data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
  462. data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
  463. data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
  464. data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
  465. data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
  466. data/spec/data/kmers_count1.csv +2 -0
  467. data/spec/data/kmers_count2.csv +3 -0
  468. data/spec/data/out +3 -0
  469. data/spec/data/positive_latching_pair.fa +2 -0
  470. data/spec/data/primers.csv +4 -0
  471. data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
  472. data/spec/data/read_selection_by_kmer/input.fasta +6 -0
  473. data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
  474. data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
  475. data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
  476. data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
  477. data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
  478. data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
  479. data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
  480. data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
  481. data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
  482. data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
  483. data/spec/data/t/details.txt +5 -0
  484. data/spec/data/t/details.txt.srt +5 -0
  485. data/spec/data/t/location.txt +3 -0
  486. data/spec/data/t/location.txt.srt +3 -0
  487. data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
  488. data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
  489. data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
  490. data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
  491. data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
  492. data/spec/data/tweak/3_variant/answer.fa +2 -0
  493. data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
  494. data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
  495. data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
  496. data/spec/data/velvet_test_trails/Assem/Graph +17 -0
  497. data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
  498. data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
  499. data/spec/data/velvet_test_trails/Assem/Log +35 -0
  500. data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
  501. data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
  502. data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
  503. data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
  504. data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
  505. data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
  506. data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
  507. data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
  508. data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
  509. data/spec/data/velvet_test_trails/read1.fa +2 -0
  510. data/spec/data/velvet_test_trails/reads.fa +50 -0
  511. data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
  512. data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
  513. data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
  514. data/spec/data/visualise/1/LastGraph +6695 -0
  515. data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
  516. data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
  517. data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
  518. data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
  519. data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
  520. data/spec/data/wander/1/random1.fa +2 -0
  521. data/spec/data/wander/1/random1.sammy.fa +804 -0
  522. data/spec/depth_first_search_spec.rb +190 -0
  523. data/spec/dijkstra_spec.rb +143 -0
  524. data/spec/explore_spec.rb +29 -0
  525. data/spec/fluffer_spec.rb +155 -0
  526. data/spec/gapfiller_spec.rb +107 -0
  527. data/spec/graph_explorer_spec.rb +475 -0
  528. data/spec/graph_generator_spec.rb +99 -0
  529. data/spec/height_finder_spec.rb +306 -0
  530. data/spec/kmer_abundance_pattern_spec.rb +56 -0
  531. data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
  532. data/spec/kmer_profile_finder_spec.rb +38 -0
  533. data/spec/kmers_count_tabulate_spec.rb +120 -0
  534. data/spec/oriented_node_trail_spec.rb +221 -0
  535. data/spec/paired_end_neighbours_spec.rb +126 -0
  536. data/spec/paths_between_nodes_spec.rb +349 -0
  537. data/spec/priner_spec.rb +7 -0
  538. data/spec/read_input_spec.rb +23 -0
  539. data/spec/read_selection_by_kmer_spec.rb +166 -0
  540. data/spec/read_to_node_spec.rb +35 -0
  541. data/spec/roundup_spec.rb +366 -0
  542. data/spec/scaffold_breaker_spec.rb +144 -0
  543. data/spec/sequence_spec.rb +43 -0
  544. data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
  545. data/spec/single_coherent_wanderer_spec.rb +120 -0
  546. data/spec/single_ended_assembler_spec.rb +398 -0
  547. data/spec/spec_helper.rb +310 -0
  548. data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
  549. data/spec/visualise_spec.rb +105 -0
  550. data/spec/wander_spec.rb +119 -0
  551. data/spec/watch_for_changes.sh +16 -0
  552. data/validation/fasta_compare.rb +72 -0
  553. data/validation/gapfill_simulate_perfect.rb +108 -0
  554. metadata +899 -0
@@ -0,0 +1,169 @@
1
+ require 'graphviz'
2
+ require 'set'
3
+
4
+ class Bio::Velvet::Graph::Node
5
# Test whether any of the given kmers occurs in this node.
#
# list_of_kmers: an enumerable of kmer values; each is checked with
# #include? against both ends_of_kmers_of_node and
# ends_of_kmers_of_twin_node.
#
# Returns true as soon as one kmer is found in either, false otherwise
# (including when the list is empty).
def includes_kmers?(list_of_kmers)
  list_of_kmers.any? do |candidate|
    ends_of_kmers_of_node.include?(candidate) || ends_of_kmers_of_twin_node.include?(candidate)
  end
end
11
+ end
12
+
13
+ module Bio
14
+ module Assembly
15
+ class ABVisualiser
16
+ include Bio::FinishM::Logging
17
+
18
# Visualise a (velvet) graph, as a graphviz object
#
# Possible options:
# :start_kmers: list of kmers to denote the start node(s)
# :end_kmers: list of kmers to denote the end node(s)
# :start_node_id: ID of node to mark as a start
# :end_node_id: ID of node to mark as an end
# :start_node_ids: array of node IDs to mark as a start
# :end_node_ids: array of node IDs to mark as an end
# :coverage_cutoff: ignore nodes with less coverage than this cutoff
#   (nodes whose coverage is nil are never ignored)
# :digraph: output as a digraph (default true, else output undirected graph)
# :nodes: an Enumerable of nodes to be visualised (default: every node in the graph)
# :node_id_to_nickname: add these names to the node descriptions. Hash of integer node id to String.
# :paired_nodes_hash: a hash of node_id to Enumerable of node_ids where there is paired-end connections
#
# Returns the populated GraphViz object. Start nodes are coloured red and
# end nodes green; a node that is both is left uncoloured and a warning is logged.
def graphviz(graph, options={})
  opts = {}
  opts[:type] = :digraph unless options[:digraph] == false
  opts[:overlap] = :scale
  graphviz = GraphViz.new(:G, opts)

  # Set.new never returns nil, so the fallback to the whole graph has to be
  # decided on the option itself rather than with ||= on the resulting Set.
  nodes_to_explore = options[:nodes] ? Set.new(options[:nodes].to_a) : Set.new(graph.nodes)

  # Add all the nodes
  blacklisted_node_ids = Set.new
  log.debug "Converting nodes to GraphViz format"
  nodes_to_explore.each do |node|
    cov = node.coverage
    # Check for nil before comparing, otherwise nil coverage raises NoMethodError
    if options[:coverage_cutoff] && !cov.nil? && cov < options[:coverage_cutoff]
      blacklisted_node_ids.add node.node_id
      next
    end

    cov_string = cov.nil? ? '' : cov.round
    label = "n#{node.node_id}_length#{node.ends_of_kmers_of_node.length}_coverage#{cov_string}"
    if options[:node_id_to_nickname] && options[:node_id_to_nickname].key?(node.node_id)
      label += ' ' + options[:node_id_to_nickname][node.node_id]
    end
    mods = {
      :label => label,
    }

    includes_start = false
    includes_end = false
    includes_start = node.includes_kmers?(options[:start_kmers]) if options[:start_kmers]
    includes_end = node.includes_kmers?(options[:end_kmers]) if options[:end_kmers]
    includes_start = true if options[:start_node_id] && node.node_id == options[:start_node_id]
    includes_end = true if options[:end_node_id] && node.node_id == options[:end_node_id]
    includes_start = true if options[:start_node_ids] && options[:start_node_ids].include?(node.node_id)
    includes_end = true if options[:end_node_ids] && options[:end_node_ids].include?(node.node_id)

    if includes_start && includes_end
      log.warn "Start and end kmers detected in the same node!"
    elsif includes_start
      mods[:color] = "red"
    elsif includes_end
      mods[:color] = "green"
    end

    graphviz.add_nodes node.node_id.to_s, mods
  end

  # Add all the edges. When a node subset was requested only the arcs
  # touching those nodes are considered.
  arcs_of_interest = graph.arcs
  if options[:nodes]
    arcs_of_interest = Set.new
    nodes_to_explore.each do |node|
      graph.arcs.get_arcs_by_node_id(node.node_id).each do |arc|
        arcs_of_interest << arc
      end
    end
  end

  log.info "Converting #{arcs_of_interest.length} arcs to GraphViz format"
  arcs_of_interest.each do |arc|
    begin_id = arc.begin_node_id
    end_id = arc.end_node_id

    # Skip arcs touching a blacklisted node or a node outside the explored set
    next if blacklisted_node_ids.include?(begin_id) || blacklisted_node_ids.include?(end_id)
    next unless nodes_to_explore.include?(graph.nodes[begin_id]) &&
      nodes_to_explore.include?(graph.nodes[end_id])

    # Direction of the arrows, to denote connection to beginning of node
    # (connects to start = in-arrow-head to node on output graph)
    if arc.connects_end_to_beginning?(begin_id, end_id)
      graphviz.add_edges begin_id.to_s, end_id.to_s
    elsif arc.connects_end_to_end?(begin_id, end_id)
      graphviz.add_edges begin_id.to_s, end_id.to_s, {:dir => "none"}
    elsif arc.connects_beginning_to_beginning?(begin_id, end_id)
      graphviz.add_edges begin_id.to_s, end_id.to_s, {:dir => "both"}
    elsif arc.connects_beginning_to_end?(begin_id, end_id)
      graphviz.add_edges end_id.to_s, begin_id.to_s
    end
  end

  # Add paired_nodes_hash pairs
  unless options[:paired_nodes_hash].nil?
    # Node pairs already joined by an arc do not get a paired-end edge as well
    arc_pairs = arcs_of_interest.collect do |arc|
      [arc.begin_node_id, arc.end_node_id].sort
    end
    directly_connected_node_pairs = Set.new(arc_pairs)

    # Keep track of pairs so multiple arcs are not drawn e.g. node1 => node2 and node2 => node1
    pairs_added = Set.new
    log.info "Adding paired-end linkages to GraphViz format.."
    options[:paired_nodes_hash].each do |node1_id, connected_node_ids|
      connected_node_ids.each do |node2_id|
        next if node1_id == node2_id # skip within-node connections
        sorted = [node1_id, node2_id].sort # sort so only a single connection is shown
        next if pairs_added.include?(sorted)
        next unless nodes_to_explore.include?(graph.nodes[node1_id]) &&
          nodes_to_explore.include?(graph.nodes[node2_id])
        next if directly_connected_node_pairs.include?(sorted)

        graphviz.add_edges sorted[0].to_s, sorted[1].to_s, {:color => "grey", :dir => "none", :style => 'dashed'}
        pairs_added << sorted
      end
    end
  end

  return graphviz
end
151
+
152
+
153
# Placeholder for a simplified representation of a velvet graph.
class SimplifiedGraph
  # Construction from a velvet graph is not implemented yet: any graph
  # with at least one node raises "not implemented". A graph whose node
  # collection is empty yields nil.
  def self.create_from_velvet_graph(graph)
    incorporated_so_far = 0

    # There would be more of the graph to incorporate, but no code does so
    raise "not implemented" if incorporated_so_far < graph.nodes.length
  end

  # A class representing a linear string of nodes without forks
  class Path
  end
end
167
+ end
168
+ end
169
+ end
@@ -0,0 +1,81 @@
1
+ require 'ds'
2
+ require 'set'
3
+
4
+ module Bio
5
+ module AssemblyGraphAlgorithms
6
+
7
# Represents a set of trails, together with two flags: whether circularity
# has been detected, and whether too many paths have been detected.
# Both flags start out false; trails is nil until assigned.
class TrailSet
  include Enumerable

  attr_accessor :trails, :circular_paths_detected, :max_path_limit_exceeded

  def initialize
    @max_path_limit_exceeded = false
    @circular_paths_detected = false
  end

  # Yield each trail in turn. Nothing is yielded (and nil is returned)
  # while no trails have been assigned.
  def each
    return if @trails.nil?

    @trails.each { |trail| yield trail }
  end
end
26
+
27
+ class AcyclicConnectionFinder
28
+ include Bio::FinishM::Logging
29
+
30
# Find trails between two oriented nodes, both facing the same way along the path.
#
# A single-node OrientedNodeTrail is built from initial_oriented_node and
# the search is delegated to
# SingleCoherentPathsBetweenNodesFinder#find_all_connections_between_two_nodes,
# whose result is returned unchanged.
#
# Options:
# * :recoherence_kmer: use a longer kmer to help de-bubble and de-circularise (default don't use this)
# * :sequences: Bio::Velvet::Sequence object holding sequences of nodes within leash length
# The full options hash is also forwarded to the finder.
def find_trails_between_nodes(graph, initial_oriented_node, terminal_oriented_node, leash_length, options={})
  #TODO: this is now implemented in the finishm_graph object - just get it from there
  starting_trail = Bio::Velvet::Graph::OrientedNodeTrail.new
  starting_trail.add_oriented_node initial_oriented_node

  recoherence_kmer = options[:recoherence_kmer]
  sequences = options[:sequences]

  Bio::AssemblyGraphAlgorithms::SingleCoherentPathsBetweenNodesFinder.new.find_all_connections_between_two_nodes(
    graph, starting_trail, terminal_oriented_node, leash_length, recoherence_kmer, sequences, options
  )
end
46
+
47
# Algorithms like SingleCoherentWanderer#wander give an overly short
# base pair distance between two probes, because the length of the node
# containing the probe at either end is not included in the calculation.
#
# Return the calibrated distance i.e. the true base pair distance between
# the start of each node pair. Returned is the given distance plus the
# distance between the start of each probe and the end of the containing
# node.
#
# finishm_graph must respond to probe_node_reads, probe_nodes (both indexed
# by probe index) and graph.hash_length. When both probes sit on the same
# node the given distance is ignored and the span between the two probes
# within that node is returned; in that case the probes must point in
# opposite directions, otherwise a RuntimeError is raised.
def calibrate_distance_accounting_for_probes(finishm_graph, probe1_index, probe2_index, distance)
  read1 = finishm_graph.probe_node_reads[probe1_index]
  read2 = finishm_graph.probe_node_reads[probe2_index]
  probe_node1 = finishm_graph.probe_nodes[probe1_index]
  probe_node2 = finishm_graph.probe_nodes[probe2_index]

  # If the start and end nodes are the same, that's a special case:
  if probe_node1.node_id == probe_node2.node_id
    opposite_directions =
      (read1.direction == true and read2.direction == false) ||
      (read1.direction == false and read2.direction == true)
    unless opposite_directions
      raise "Programming error: to connect within a single contig two probes must have opposite directions: found #{read1.direction} and #{read2.direction}"
    end
    return probe_node1.length - read1.offset_from_start_of_node - read2.offset_from_start_of_node - finishm_graph.graph.hash_length
  end

  # Usual case - start and end nodes are different nodes
  hash_length = finishm_graph.graph.hash_length
  to_return = distance
  # add the first probe side
  to_return += probe_node1.length - read1.offset_from_start_of_node - hash_length
  # add the second probe side, using the second probe's own offset
  to_return += probe_node2.length - read2.offset_from_start_of_node - hash_length
  return to_return
end
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,615 @@
1
+ require 'ds'
2
+ require 'set'
3
+
4
+ module Bio
5
+ module AssemblyGraphAlgorithms
6
+ class AllOrfsFinder
7
+ include Bio::FinishM::Logging
8
+
9
+ CODON_LENGTH = 3
10
+ START_CODONS = ['ATG']
11
+ STOP_CODONS = ['TAG', 'TAA', 'TGA']
12
+
13
# Search for open reading frames in a graph, in all the paths beginning at a set of
# nodes through a graph (or a subset defined by range)
#
# The problems found by #find_all_problems (restricted by range) are handed
# to #find_orfs_from_problems, whose result is returned.
#
# minimum_orf_length, max_gapfill_paths and max_cycles are forwarded as
# options; nil leaves the default of #find_orfs_from_problems in place.
def find_orfs_in_graph(graph, initial_paths, minimum_orf_length=nil,
    range=nil, max_gapfill_paths=nil, max_cycles=nil)

  problems = find_all_problems(graph,
    initial_paths,
    :range => range
  )

  find_orfs_from_problems(problems, {
    # #find_orfs_from_problems reads :minimum_orf_length; the historical
    # :min_orf_length key is still passed so nothing relying on it breaks.
    :minimum_orf_length => minimum_orf_length,
    :min_orf_length => minimum_orf_length,
    :max_gapfill_paths => max_gapfill_paths,
    :max_cycles => max_cycles,
  })
end
29
+
30
+
31
# Explore the graph from initial_paths and record one dynamic programming
# problem per distinct key returned by #path_to_settable.
#
# Paths are taken from an AllProblemTrailsFinder until it is exhausted. A
# path whose key has been seen before is appended to that problem's
# known_paths and not explored further. A path whose last node has no
# neighbours (after optionally keeping only node IDs within
# options[:range]) has its key added to terminal_node_keys.
#
# Returns the SingleCoherentPathsBetweenNodesFinder::ProblemSet.
def find_all_problems(graph, initial_paths, options={})
  problem_set = SingleCoherentPathsBetweenNodesFinder::ProblemSet.new
  trail_finder = AllProblemTrailsFinder.new(graph, initial_paths)

  while (path = trail_finder.pop)
    log.debug "considering #{path}" if log.debug?
    key = path_to_settable(path)

    if problem_set.has_key? key
      log.debug "Already seen this problem" if log.debug?
      problem_set[key].known_paths.push path
      next
    end

    log.debug "New dynamic problem being solved" if log.debug?
    # new problem being solved here
    fresh_problem = SingleCoherentPathsBetweenNodesFinder::DynamicProgrammingProblem.new
    fresh_problem.known_paths.push path.copy
    problem_set[key] = fresh_problem

    onward = path.neighbours_of_last_node(graph)
    onward.select!{|onode| options[:range].include? onode.node_id} if options[:range]

    if onward.empty?
      log.debug "last is terminal" if log.debug?

      problem_set.terminal_node_keys ||= Set.new
      problem_set.terminal_node_keys << key
    else
      # explore the forward neighbours
      trail_finder.push_next_neighbours path
      log.debug "Priority queue size: #{trail_finder.size}" if log.debug?
    end
  end

  problem_set
end
71
+
72
# Key a path for use in the problem set. Delegates to
# SingleCoherentPathsBetweenNodesFinder#path_to_settable, passing the
# length of the path's last node plus CODON_LENGTH - 1 as the second argument.
def path_to_settable(path)
  settable_length = path.last.node.length_alone + CODON_LENGTH - 1
  SingleCoherentPathsBetweenNodesFinder.new.path_to_settable(path, settable_length)
end
75
+
76
+
77
# Trace back from every terminal problem in the problem set, building up
# trails node by node (prepending to the front) and pairing start and stop
# codons found along the way in both the forward and twin directions.
#
# Options:
# * :max_gapfill_paths: give up once more than this many paths are pending/found (default 2196)
# * :max_cycles: how many repeated cycles a path may contain (default 1)
# * :minimum_orf_length: minimum length passed to #orfs_from_start_stop_markers (default 0)
#
# Returns a Bio::AssemblyGraphAlgorithms::TrailSet whose trails are
# ORFsTracingTrail objects. circular_paths_detected is set when a cycle is
# encountered; when the path limit is exceeded max_path_limit_exceeded is
# set and the trails are emptied.
def find_orfs_from_problems(problems, options={})
  max_num_paths = options[:max_gapfill_paths] || 2196
  max_cycles = options[:max_cycles] || 1
  min_orf_length = options[:minimum_orf_length] || 0

  counter = SingleCoherentPathsBetweenNodesFinder::CycleCounter.new(max_cycles)
  # Decides which of the dual stacks a pushed [first_part, second_part]
  # pair belongs on: true (secondary) when it has too many cycles.
  decide_stack = lambda do |to_push|
    part_nodes = [to_push[0].trail, to_push[1].otrail ? to_push[1].otrail.trail : []]
    over_cycle_limit = max_cycles < counter.path_cycle_count(part_nodes.flatten)
    if log.debug?
      described = part_nodes.collect{|part| part.collect{|onode| onode.node.node_id}.join(',')}.join(' and ')
      log.debug "Pushing #{described} to #{over_cycle_limit ? 'secondary' : 'main'} stack"
    end
    over_cycle_limit
  end

  stack = SingleCoherentPathsBetweenNodesFinder::DualStack.new(&decide_stack)
  to_return = Bio::AssemblyGraphAlgorithms::TrailSet.new

  # if there is no solutions to the overall problem then there is no solution at all
  terminal_keys = problems.terminal_node_keys
  if terminal_keys.nil? || terminal_keys.empty?
    to_return.trails = []
    return to_return
  end

  # push all "ending in the final node" solutions to the stack
  terminal_keys.each do |key|
    seed_first = problems[key].known_paths[0].copy
    seed_second = ORFsTracingTrail.new
    seed_second.otrail = Bio::Velvet::Graph::OrientedNodeTrail.new
    stack.push [seed_first, seed_second]
  end

  all_paths_hash = {}
  while (path_parts = stack.pop)
    first_part = path_parts[0]
    second_part = path_parts[1]
    log.debug "#{first_part.to_shorthand} and #{second_part.otrail.to_shorthand}" if log.debug?

    # Look for codons
    log.debug "Searching for codons in first node of second part" if log.debug?
    fwd_result, twin_result = search_for_codons(second_part.otrail) # search from start of second part

    # Forward direction
    unless fwd_result.stop_markers.empty? && fwd_result.start_markers.empty?
      [fwd_result.stop_markers, fwd_result.start_markers].each do |markers|
        markers.each { |marker| marker.position_in_trail = marker.position_in_node }
      end

      current_fwd_stops = []
      current_fwd_starts = []
      earlier_fwd = second_part.fwd_orfs_result
      if earlier_fwd
        current_fwd_stops.concat earlier_fwd.initial_stop_markers
        current_fwd_starts.concat earlier_fwd.initial_start_markers
        current_fwd_starts.concat earlier_fwd.final_start_markers
      end
      current_fwd_stops.concat fwd_result.stop_markers
      current_fwd_starts.concat fwd_result.start_markers
      log.debug "Attempt to pair start codons at #{current_fwd_starts.collect{|m| m.position_in_trail}.join(',')} with stop codons at #{current_fwd_stops.collect{|m| m.position_in_trail}.join(',')}" if log.debug?
      fwd_orfs_result = orfs_from_start_stop_markers(current_fwd_starts, current_fwd_stops, min_orf_length)
      log.debug "Found pairs #{fwd_orfs_result.start_stop_pairs.collect{|pair| pair.collect{|m| m.position_in_trail}.join(',')}.join('],[')}" if log.debug?

      # collect previous start-stop pairs
      fwd_orfs_result.start_stop_pairs.concat earlier_fwd.start_stop_pairs if earlier_fwd
      second_part.fwd_orfs_result = fwd_orfs_result
      log.debug "Remaining forward stops: #{second_part.fwd_orfs_result.initial_stop_markers.collect{|m| m.position_in_trail}.join(',')}" if log.debug?
    end

    # Reverse direction
    unless twin_result.stop_markers.empty? && twin_result.start_markers.empty?
      # twin stop positions are relative to start of first path twin node
      # add length of rest of path to get position relative to start of last path twin node
      length_of_rest_of_path = second_part.otrail.length_in_bp_within_path - second_part.otrail[0].node.length_alone
      [twin_result.stop_markers, twin_result.start_markers].each do |markers|
        markers.each { |marker| marker.position_in_trail = marker.position_in_node + length_of_rest_of_path }
      end

      current_twin_stops = []
      current_twin_starts = []
      earlier_twin = second_part.twin_orfs_result
      if earlier_twin
        current_twin_stops.concat earlier_twin.initial_stop_markers
        current_twin_starts.concat earlier_twin.initial_start_markers
        current_twin_starts.concat earlier_twin.final_start_markers
      end
      current_twin_stops.concat twin_result.stop_markers
      current_twin_starts.concat twin_result.start_markers
      log.debug "Attempt to pair stop codons in reverse direction at #{current_twin_stops.collect{|m| m.position_in_trail}.join(',')} with starts at #{current_twin_starts.collect{|m| m.position_in_trail}.join(',')}" if log.debug?
      twin_orfs_result = orfs_from_start_stop_markers(current_twin_starts, current_twin_stops, min_orf_length)
      log.debug "Found pairs #{twin_orfs_result.start_stop_pairs.collect{|pair| pair.collect{|m| m.position_in_trail}.join(',')}.join('],[')}" if log.debug?

      # collect previous start-stop pairs
      twin_orfs_result.start_stop_pairs.concat earlier_twin.start_stop_pairs if earlier_twin
      second_part.twin_orfs_result = twin_orfs_result
      log.debug "Remaining twin starts: #{second_part.twin_orfs_result.final_start_markers.collect{|m| m.position_in_trail}.join(',')}" if log.debug?
    end

    if first_part.length == 0
      # If we've tracked all the way to the beginning, then there's no need to track further
      key = second_part.otrail.trail.hash
      all_paths_hash[key] ||= second_part
      next
    end

    last = first_part.last
    if second_part.otrail.trail.include? last
      log.debug "Cycle at node #{last.node_id} detected in previous path #{second_part.collect{|onode| onode.node_id}.join(',')}." if log.debug?
      to_return.circular_paths_detected = true
      if max_cycles == 0 || max_cycles < counter.path_cycle_count([last, second_part.otrail.trail].flatten)
        log.debug "Not finishing cyclic path with too many repeated cycles." if log.debug?
        next
      end
    end

    paths_to_last = problems[path_to_settable(first_part)].known_paths
    paths_to_last.each do |path|
      new_second_part = ORFsTracingTrail.new
      new_second_part.otrail = second_part.otrail.copy
      new_second_part.otrail.trail.unshift last

      previous_fwd = second_part.fwd_orfs_result
      if previous_fwd
        # offset positions in forward direction, since a node has been
        # added in front of every existing marker
        offset = last.node.length_alone
        copy_and_offset_marker = lambda do |marker|
          shifted = marker.copy
          shifted.position_in_trail += offset
          shifted
        end

        new_fwd_orfs_result = ORFsResult.new
        new_fwd_orfs_result.start_stop_pairs = previous_fwd.start_stop_pairs.collect do |pairs|
          pairs.collect(&copy_and_offset_marker)
        end
        new_fwd_orfs_result.initial_start_markers = previous_fwd.initial_start_markers.collect(&copy_and_offset_marker)
        new_fwd_orfs_result.initial_stop_markers = previous_fwd.initial_stop_markers.collect(&copy_and_offset_marker)
        new_fwd_orfs_result.final_start_markers = previous_fwd.final_start_markers.collect(&copy_and_offset_marker)
        new_second_part.fwd_orfs_result = new_fwd_orfs_result
      end

      previous_twin = second_part.twin_orfs_result
      if previous_twin
        # twin markers are copied without any offset
        new_twin_orfs_result = ORFsResult.new
        new_twin_orfs_result.start_stop_pairs = previous_twin.start_stop_pairs.collect do |pairs|
          pairs.collect{|marker| marker.copy}
        end
        new_twin_orfs_result.initial_stop_markers = previous_twin.initial_stop_markers.collect{|marker| marker.copy}
        new_twin_orfs_result.final_start_markers = previous_twin.final_start_markers.collect{|marker| marker.copy}
        new_second_part.twin_orfs_result = new_twin_orfs_result
      end

      new_first_part = path.copy
      new_first_part.remove_last_node

      stack.push [new_first_part, new_second_part]
    end

    # max_num_paths parachute
    # The parachute can kill the search once the main stack exceeds max_gapfill_paths,
    # since all paths on it are valid.
    if !max_num_paths.nil? && (stack.sizes[0] + all_paths_hash.length) > max_num_paths
      log.info "Exceeded the maximum number of allowable paths in this gapfill" if log.info?
      to_return.max_path_limit_exceeded = true
      all_paths_hash = {}
      break
    end
  end

  to_return.trails = all_paths_hash.values
  return to_return
end
255
+
256
# Search for start and stop codons in the first node of a trail, and in the
# region where that node overlaps the following node(s) of the trail.
#
# Returns:
# SearchResult relative to start of first node
# SearchResult relative to start of first twin node
#
# Each marker has position_in_node (the last base of the codon) and node
# set; position_in_trail is left for the caller to fill in. For an empty
# trail two empty SearchResults are returned.
def search_for_codons(otrail)
  return SearchResult.new, SearchResult.new if otrail.trail.empty?
  onode = otrail[0]

  words = Set.new(START_CODONS).merge(STOP_CODONS)

  # search within first / last node
  fwd_nodes_sequence, twin_nodes_sequence = get_sequences onode
  fwd_within_first = word_search(fwd_nodes_sequence, words, CODON_LENGTH)
  twin_within_first = word_search(twin_nodes_sequence, words, CODON_LENGTH)

  # extend search along trail
  fwd_overlap_sequence, twin_overlap_sequence = get_overlap_sequences(otrail, CODON_LENGTH)
  fwd_in_overlap = word_search(fwd_overlap_sequence, words, CODON_LENGTH)
  twin_in_overlap = word_search(twin_overlap_sequence, words, CODON_LENGTH)

  # offset positions in overlap to be relative to start of node / twin node
  # NOTE(review): the forward offset adds the full node length although the
  # overlap sequence appears to begin up to CODON_LENGTH - 1 bases before the
  # end of the node - confirm that this is the intended coordinate.
  offset = onode.node.length_alone
  fwd_overlap_positions = {}
  fwd_in_overlap.each { |word, inds| fwd_overlap_positions[word] = inds.collect { |pos| pos + offset } }
  twin_overlap_positions = {}
  twin_in_overlap.each { |word, inds| twin_overlap_positions[word] = inds.collect { |pos| pos + 1 - CODON_LENGTH } }

  # Combine the two searches. A codon found both inside the node and in the
  # overlap must keep both sets of positions, so duplicate keys are
  # concatenated rather than letting the overlap overwrite the node hits.
  fwd_positions = fwd_within_first.merge(fwd_overlap_positions) { |_word, in_node, in_overlap| in_node + in_overlap }
  twin_positions = twin_within_first.merge(twin_overlap_positions) { |_word, in_node, in_overlap| in_node + in_overlap }

  # Build markers for every position at which one of the given codons was found
  markers_for = lambda do |codons, positions|
    found = []
    codons.each do |word|
      next unless positions.has_key? word
      positions[word].each do |position|
        marker = Marker.new
        marker.position_in_node = position
        marker.node = onode.node
        found.push marker
      end
    end
    found
  end

  # assemble result
  fwd_result = SearchResult.new
  twin_result = SearchResult.new
  fwd_result.start_markers.concat markers_for.call(START_CODONS, fwd_positions)
  twin_result.start_markers.concat markers_for.call(START_CODONS, twin_positions)
  fwd_result.stop_markers.concat markers_for.call(STOP_CODONS, fwd_positions)
  twin_result.stop_markers.concat markers_for.call(STOP_CODONS, twin_positions)

  return fwd_result, twin_result
end
335
+
336
# Return the forward and twin sequences spanning the boundary between the
# first node of a trail (or the last node, when from_end is true) and the
# nodes that follow it, wide enough to find words of the given size that
# straddle the boundary.
#
# At most size - 1 bases are kept on either side of the boundary. Nodes are
# added along the trail until size - 1 bases of extension are available or
# the trail runs out.
#
# Returns a pair: forward sequence, twin sequence. For an empty trail both
# are empty strings, so that callers can always destructure two values.
def get_overlap_sequences(otrail, size, from_end=false)
  return '', '' if otrail.trail.empty?

  trail = otrail.trail
  # reverse as new sequence is taken from front of trail
  trail = trail.reverse if from_end

  twin_nodes_sequence = ''
  fwd_nodes_sequence = ''

  index = 0
  onode = trail[index]

  start_length = onode.node.length_alone
  # Starts negative so that the first node itself does not count as extension
  extension_length = -start_length

  while extension_length < (size - 1) and index < trail.length
    extend_fwd_nodes_sequence, extend_twin_nodes_sequence = get_sequences(onode)
    if from_end
      twin_nodes_sequence += extend_twin_nodes_sequence
      fwd_nodes_sequence = extend_fwd_nodes_sequence + fwd_nodes_sequence
    else
      twin_nodes_sequence = extend_twin_nodes_sequence + twin_nodes_sequence
      fwd_nodes_sequence += extend_fwd_nodes_sequence
    end

    extension_length += onode.node.length_alone
    index += 1
    onode = trail[index]
  end

  # Trim everything further than size - 1 bases from the boundary
  trim_start = [start_length - (size - 1), 0].max
  trim_end = [extension_length - (size - 1), 0].max

  if from_end
    return fwd_nodes_sequence[trim_end..-(trim_start+1)], twin_nodes_sequence[trim_start..-(trim_end+1)]
  else
    return fwd_nodes_sequence[trim_start..-(trim_end+1)], twin_nodes_sequence[trim_end..-(trim_start+1)]
  end
end
380
+
381
# Return the pair [forward sequence, twin sequence] for an oriented node.
#
# When the oriented node starts at the start, the forward sequence is the
# node's ends_of_kmers_of_node and the twin sequence its
# ends_of_kmers_of_twin_node; otherwise the two are swapped.
def get_sequences(onode)
  underlying = onode.node
  if onode.starts_at_start?
    [underlying.ends_of_kmers_of_node, underlying.ends_of_kmers_of_twin_node]
  else
    [underlying.ends_of_kmers_of_twin_node, underlying.ends_of_kmers_of_node]
  end
end
391
+
392
# Find every occurrence of the given words in a sequence.
#
# sequence: the String to scan
# words: collection (responding to include?) of words to look for
# size: length of the window slid along the sequence
#
# Returns a Hash of word => Array of positions, where each position is the
# 1-based index of the last base of that occurrence (i.e. the exclusive end
# index of the window). Words that do not occur are absent from the Hash.
def word_search(sequence, words, size)
  hits = {}

  (size..sequence.length).each do |end_position|
    window = sequence[end_position - size...end_position]
    next unless words.include? window

    (hits[window] ||= []).push end_position
  end

  hits
end
407
+
408
+
409
# Given an Array of start markers and an Array of stop markers, pair them up
# into ORFs, working on each of the three reading frames (position_in_trail
# modulo 3) independently.
#
# The returned object is an instance of ORFsResult, holding per frame:
# * initial_stop_markers: the first stop codon of the frame
# * initial_start_markers: the first start codon, when it precedes that
#   first stop (or there is no stop at all)
# * start_stop_pairs: [start, stop] pairs where the start follows a stop and
#   the stop is the next one after it, kept only when the distance between
#   them is at least minimum_orf_length (measured in nucleotides)
# * final_start_markers: a start codon left over once the stops run out
def orfs_from_start_stop_markers(start_markers, stop_markers, minimum_orf_length)
  # Split up the start and stop positions into 3 frames
  starts_by_frame = [[], [], []]
  stops_by_frame = [[], [], []]
  start_markers.each { |marker| starts_by_frame[marker.position_in_trail % 3].push marker }
  stop_markers.each { |marker| stops_by_frame[marker.position_in_trail % 3].push marker }

  result = ORFsResult.new
  3.times do |frame|
    # Sort arrays in descending order because Array#pop removes from the end of the array
    remaining_starts = starts_by_frame[frame].sort{|a,b| b.position_in_trail<=>a.position_in_trail}
    remaining_stops = stops_by_frame[frame].sort{|a,b| b.position_in_trail<=>a.position_in_trail}

    start = remaining_starts.pop
    stop = remaining_stops.pop

    # Record first stop codon
    result.initial_stop_markers.push stop if stop

    # Record first start codon before any stop codons
    if start && (stop.nil? || start.position_in_trail < stop.position_in_trail)
      result.initial_start_markers.push start
    end

    while start && stop
      # Move to next start after current stop
      start = remaining_starts.pop while start && start.position_in_trail < stop.position_in_trail

      # Move to next stop after current start
      if start
        stop = remaining_stops.pop while stop && stop.position_in_trail < start.position_in_trail
      end

      unless start && stop
        # Ran out of starts or stops: keep a dangling start, then finish the frame
        result.final_start_markers.push start if start
        break
      end

      # This stop codon stops the current reading frame. Whether or not the
      # ORF is long enough to keep, carry on looking for the next start codon.
      if stop.position_in_trail - start.position_in_trail >= minimum_orf_length
        result.start_stop_pairs.push [start, stop]
      end
    end
  end

  return result
end
476
+
477
# Extract named ORF sequences from traced trails.
#
# trails: Enumerable of objects responding to otrail, fwd_orfs_result and
# twin_orfs_result (e.g. ORFsTracingTrail).
#
# For each trail, and for each of the forward and twin directions:
# * every start-stop pair gives the sequence from the first base of the
#   start codon up to the stop marker position
# * every initial stop marker gives the sequence from the beginning of the
#   trail up to the stop marker position
# * every final start marker gives the sequence from the first base of the
#   start codon to the end of the trail
# * when there is no result, or the result holds no markers at all, the
#   whole sequence is returned under the trail's shorthand name
#
# Returns a Hash of name => sequence. Names are built from node shorthands
# and marker positions within their node; the first sequence stored under a
# name wins.
def orf_sequences_from_trails(trails)
  to_return = {}
  trails.each do |trail|
    fwd_sequence, twin_sequence = trail.otrail.sequences_within_path
    # forward / twin directions
    [
      [fwd_sequence, trail.fwd_orfs_result],
      [twin_sequence, trail.twin_orfs_result]
    ].each do |sequence, result|
      if result
        result.start_stop_pairs.each do |pair|
          # markers sit on the last base of a codon, so step back to its first base
          start_position = pair[0].position_in_trail - 3
          end_position = pair[1].position_in_trail

          # orf name: nodes from the one holding the start up to and
          # including the one holding the stop
          last_node = nil
          onodes = trail.otrail.trail.drop_while do |onode|
            onode.node != pair[0].node
          end.take_while do |onode|
            next false if last_node == pair[1].node
            last_node = onode.node
            true
          end
          name = "(#{onodes[0].to_shorthand}:#{pair[0].position_in_node}),#{onodes[1...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{pair[1].position_in_node})"

          to_return[name] ||= sequence[start_position...end_position]
        end

        result.initial_stop_markers.each do |marker|
          end_position = marker.position_in_trail

          # orf name: nodes from the start of the trail up to and including
          # the one holding the stop
          last_node = nil
          onodes = trail.otrail.trail.take_while do |onode|
            next false if last_node == marker.node
            last_node = onode.node
            true
          end
          name = "#{onodes[0...-1].collect{|onode| onode.to_shorthand}.join(',')},(#{onodes[-1].to_shorthand}:#{marker.position_in_node})"

          to_return[name] ||= sequence[0...end_position]
        end

        result.final_start_markers.each do |marker|
          start_position = marker.position_in_trail - 3

          # orf name: nodes from the one holding the start to the end of the trail
          onodes = trail.otrail.trail.drop_while{|onode| onode.node != marker.node}
          name = "(#{onodes[0].to_shorthand}:#{marker.position_in_node}),#{onodes[1..-1].collect{|onode| onode.to_shorthand}.join(',')}"

          # Previously the name was computed but nothing was stored, so ORFs
          # running off the end of the trail were silently dropped.
          to_return[name] ||= sequence[start_position..-1]
        end
      end

      if result.nil? || (result.start_stop_pairs.empty? && result.final_start_markers.empty? && result.initial_stop_markers.empty?)
        name = "#{trail.otrail.to_shorthand}"

        to_return[name] ||= sequence
      end
    end
  end

  return to_return
end
537
+
538
# Location of a codon; positions are those of the codon's last base.
class Marker
  attr_accessor :position_in_trail, :position_in_node, :node

  # Returns a fresh Marker holding the same two positions and the same node
  # reference as the receiver.
  def copy
    Marker.new.tap do |duplicate|
      duplicate.position_in_trail = position_in_trail
      duplicate.position_in_node = position_in_node
      duplicate.node = node
    end
  end
end
550
+
551
# Container for two lists: start markers and stop markers.
class SearchResult
  attr_accessor :start_markers, :stop_markers

  # Both lists start out empty.
  def initialize
    @start_markers, @stop_markers = [], []
  end
end
559
+
560
# Pairs an oriented trail with the ORF results for its forward and twin
# directions. Enumerating it enumerates the underlying otrail.
class ORFsTracingTrail
  include Enumerable

  attr_accessor :otrail, :fwd_orfs_result, :twin_orfs_result

  # Delegates iteration to the otrail; yields nothing (and returns nil) when
  # no otrail has been assigned.
  def each(&block)
    return if @otrail.nil?

    @otrail.each(&block)
  end
end
570
+
571
# Holds four lists: start/stop pairs, final start markers, initial start
# markers and initial stop markers.
class ORFsResult
  attr_accessor :start_stop_pairs,
                :final_start_markers,
                :initial_start_markers,
                :initial_stop_markers

  # Every list starts out as its own empty Array.
  def initialize
    @start_stop_pairs, @initial_start_markers = [], []
    @final_start_markers, @initial_stop_markers = [], []
  end
end
581
+
582
# Keeps a stack of paths through a graph and grows it by extending a path
# with each neighbour of its last node.
class AllProblemTrailsFinder
  include Bio::FinishM::Logging

  # graph:: kept and later passed to +neighbours_of_last_node+
  # initial_paths:: pushed onto the internal stack in iteration order
  def initialize(graph, initial_paths)
    @stack = DS::Stack.new
    initial_paths.each { |initial| @stack.push initial }
    @graph = graph
  end

  # Remove and return the top of the stack.
  def pop
    @stack.pop
  end

  # Number of paths currently on the stack.
  def size
    @stack.size
  end

  # For every neighbour of the last node of +current_path+, push a copy of
  # the path extended by that neighbour.
  def push_next_neighbours(current_path)
    neighbours = current_path.neighbours_of_last_node(@graph)
    if log.debug?
      log.debug "Pushing #{neighbours.length} new neighbours of #{current_path.last}"
    end
    # TODO: not necessary to copy all paths, can just continue one of them
    neighbours.each do |neighbour|
      log.debug "Pushing neighbour to stack: #{neighbour}" if log.debug?
      extended = current_path.copy
      extended.add_oriented_node neighbour
      @stack.push extended
    end
  end
end
613
+ end
614
+ end
615
+ end