finishm 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitmodules +3 -0
- data/.rspec +1 -0
- data/Gemfile +31 -0
- data/LICENSE.txt +20 -0
- data/README.md +59 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/assembly_visualiser +106 -0
- data/bin/check_primer_combinations.rb +73 -0
- data/bin/contig_joiner.rb +244 -0
- data/bin/contigs_against_assembly.rb +153 -0
- data/bin/finishm +143 -0
- data/bin/finishm_assembler +55 -0
- data/bin/finishm_gap_closer.rb +241 -0
- data/bin/kmer_abundance_file_tool.rb +49 -0
- data/bin/kmer_pattern_to_assembly.rb +377 -0
- data/bin/kmer_profile_finder.rb +92 -0
- data/bin/kmers_count_parse.d +52 -0
- data/bin/kmers_count_tabulate.d +123 -0
- data/bin/kmers_count_tabulate.rb +84 -0
- data/bin/pcr_result_parser.rb +108 -0
- data/bin/primer_finder.rb +119 -0
- data/bin/read_selection_by_kmer.d +174 -0
- data/bin/scaffold_by_pattern.rb +119 -0
- data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
- data/bin/scaffold_end_coverages.rb +69 -0
- data/bin/trail_validator.rb +84 -0
- data/ext/mkrf_conf.rb +56 -0
- data/ext/src/Makefile +140 -0
- data/ext/src/src/allocArray.c +305 -0
- data/ext/src/src/allocArray.h +86 -0
- data/ext/src/src/autoOpen.c +107 -0
- data/ext/src/src/autoOpen.h +18 -0
- data/ext/src/src/binarySequences.c +813 -0
- data/ext/src/src/binarySequences.h +125 -0
- data/ext/src/src/concatenatedGraph.c +233 -0
- data/ext/src/src/concatenatedGraph.h +30 -0
- data/ext/src/src/concatenatedPreGraph.c +262 -0
- data/ext/src/src/concatenatedPreGraph.h +29 -0
- data/ext/src/src/correctedGraph.c +2643 -0
- data/ext/src/src/correctedGraph.h +32 -0
- data/ext/src/src/dfib.c +509 -0
- data/ext/src/src/dfib.h +69 -0
- data/ext/src/src/dfibHeap.c +89 -0
- data/ext/src/src/dfibHeap.h +39 -0
- data/ext/src/src/dfibpriv.h +105 -0
- data/ext/src/src/fib.c +628 -0
- data/ext/src/src/fib.h +78 -0
- data/ext/src/src/fibHeap.c +79 -0
- data/ext/src/src/fibHeap.h +41 -0
- data/ext/src/src/fibpriv.h +110 -0
- data/ext/src/src/globals.h +154 -0
- data/ext/src/src/graph.c +3932 -0
- data/ext/src/src/graph.h +233 -0
- data/ext/src/src/graphReConstruction.c +1472 -0
- data/ext/src/src/graphReConstruction.h +30 -0
- data/ext/src/src/graphStats.c +2167 -0
- data/ext/src/src/graphStats.h +72 -0
- data/ext/src/src/graphStructures.h +52 -0
- data/ext/src/src/kmer.c +652 -0
- data/ext/src/src/kmer.h +73 -0
- data/ext/src/src/kmerOccurenceTable.c +236 -0
- data/ext/src/src/kmerOccurenceTable.h +44 -0
- data/ext/src/src/kseq.h +223 -0
- data/ext/src/src/locallyCorrectedGraph.c +557 -0
- data/ext/src/src/locallyCorrectedGraph.h +40 -0
- data/ext/src/src/passageMarker.c +677 -0
- data/ext/src/src/passageMarker.h +137 -0
- data/ext/src/src/preGraph.c +1717 -0
- data/ext/src/src/preGraph.h +106 -0
- data/ext/src/src/preGraphConstruction.c +990 -0
- data/ext/src/src/preGraphConstruction.h +26 -0
- data/ext/src/src/probe_node_finder.c +84 -0
- data/ext/src/src/probe_node_finder.h +6 -0
- data/ext/src/src/readCoherentGraph.c +557 -0
- data/ext/src/src/readCoherentGraph.h +30 -0
- data/ext/src/src/readSet.c +1734 -0
- data/ext/src/src/readSet.h +67 -0
- data/ext/src/src/readToNode.c +218 -0
- data/ext/src/src/readToNode.h +35 -0
- data/ext/src/src/recycleBin.c +199 -0
- data/ext/src/src/recycleBin.h +58 -0
- data/ext/src/src/roadMap.c +342 -0
- data/ext/src/src/roadMap.h +65 -0
- data/ext/src/src/run.c +318 -0
- data/ext/src/src/run.h +52 -0
- data/ext/src/src/run2.c +744 -0
- data/ext/src/src/runReadToNode.c +29 -0
- data/ext/src/src/scaffold.c +1876 -0
- data/ext/src/src/scaffold.h +64 -0
- data/ext/src/src/shortReadPairs.c +1243 -0
- data/ext/src/src/shortReadPairs.h +32 -0
- data/ext/src/src/splay.c +259 -0
- data/ext/src/src/splay.h +43 -0
- data/ext/src/src/splayTable.c +1315 -0
- data/ext/src/src/splayTable.h +31 -0
- data/ext/src/src/tightString.c +362 -0
- data/ext/src/src/tightString.h +82 -0
- data/ext/src/src/utility.c +199 -0
- data/ext/src/src/utility.h +98 -0
- data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
- data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
- data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
- data/ext/src/third-party/zlib-1.2.3/README +125 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
- data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
- data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
- data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
- data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
- data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
- data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/configure +459 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/example +0 -0
- data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
- data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
- data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
- data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
- data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
- data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
- data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
- data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
- data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
- data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
- data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
- data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
- data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
- data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
- data/lib/assembly/a_b_visualiser.rb +169 -0
- data/lib/assembly/acyclic_connection_finder.rb +81 -0
- data/lib/assembly/all_orfs.rb +615 -0
- data/lib/assembly/bad_format_writer.rb +46 -0
- data/lib/assembly/bam_probe_read_selector.rb +48 -0
- data/lib/assembly/bubbly_assembler.rb +842 -0
- data/lib/assembly/c_probe_node_finder.rb +38 -0
- data/lib/assembly/connection_interpreter.rb +350 -0
- data/lib/assembly/contig_printer.rb +400 -0
- data/lib/assembly/coverage_based_graph_filter.rb +68 -0
- data/lib/assembly/depth_first_search.rb +63 -0
- data/lib/assembly/dijkstra.rb +216 -0
- data/lib/assembly/fluffer.rb +253 -0
- data/lib/assembly/graph_explorer.rb +85 -0
- data/lib/assembly/graph_generator.rb +315 -0
- data/lib/assembly/height_finder.rb +355 -0
- data/lib/assembly/hybrid_velvet_graph.rb +70 -0
- data/lib/assembly/input_genome.rb +182 -0
- data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
- data/lib/assembly/node_finder.rb +171 -0
- data/lib/assembly/oriented_node_trail.rb +507 -0
- data/lib/assembly/paired_end_assembler.rb +53 -0
- data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
- data/lib/assembly/probed_graph.rb +105 -0
- data/lib/assembly/read_input.rb +79 -0
- data/lib/assembly/read_to_node.rb +37 -0
- data/lib/assembly/scaffold_breaker.rb +126 -0
- data/lib/assembly/sequence_hasher.rb +71 -0
- data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
- data/lib/assembly/single_coherent_wanderer.rb +261 -0
- data/lib/assembly/single_ended_assembler.rb +441 -0
- data/lib/assembly/velvet_c_binding.rb +54 -0
- data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
- data/lib/external/VERSION +1 -0
- data/lib/finishm/assemble.rb +224 -0
- data/lib/finishm/explore.rb +217 -0
- data/lib/finishm/finisher.rb +303 -0
- data/lib/finishm/fluff.rb +122 -0
- data/lib/finishm/gapfiller.rb +325 -0
- data/lib/finishm/orfs_finder.rb +88 -0
- data/lib/finishm/path_counter.rb +90 -0
- data/lib/finishm/primers.rb +425 -0
- data/lib/finishm/primers_check.rb +176 -0
- data/lib/finishm/roundup.rb +344 -0
- data/lib/finishm/sequence.rb +142 -0
- data/lib/finishm/visualise.rb +430 -0
- data/lib/finishm/wander.rb +270 -0
- data/lib/kmer_abundance_pattern.rb +79 -0
- data/lib/kmer_multi_abundance_file.rb +48 -0
- data/lib/oligo_designer.rb +88 -0
- data/lib/priner.rb +66 -0
- data/spec/acyclic_connection_finder_spec.rb +551 -0
- data/spec/all_orfs_spec.rb +443 -0
- data/spec/assemble_spec.rb +186 -0
- data/spec/bubbly_assembler_spec.rb +707 -0
- data/spec/c_node_finder_spec.rb +58 -0
- data/spec/connection_interpreter_spec.rb +284 -0
- data/spec/contig_printer_spec.rb +291 -0
- data/spec/coverage_based_graph_filter_spec.rb +102 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
- data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
- data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
- data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
- data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Graph +46 -0
- data/spec/data/assembly_visualiser/start_kmers1 +2 -0
- data/spec/data/bands.csv +1 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
- data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
- data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
- data/spec/data/c_probe_node_finder/1/Log +756 -0
- data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
- data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
- data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
- data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
- data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
- data/spec/data/contig_printer/1/contigs.fa +4 -0
- data/spec/data/contig_printer/1/seq.fa +2408 -0
- data/spec/data/contig_printer/1/seq.fa.svg +153 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
- data/spec/data/contig_printer/1/seq.node12.fa +4 -0
- data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
- data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
- data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
- data/spec/data/explore/1/a.fa +2 -0
- data/spec/data/explore/1/seq1_and_a.fa +3 -0
- data/spec/data/explore/1/seq2.fa +2 -0
- data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
- data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
- data/spec/data/fluff/1/seq1.fa +2 -0
- data/spec/data/fluff/1/seq2.fa +2 -0
- data/spec/data/gapfilling/1/reads.fa +171 -0
- data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
- data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
- data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
- data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
- data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
- data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
- data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
- data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
- data/spec/data/gapfilling/2/reference.fa +2 -0
- data/spec/data/gapfilling/2/reference_part1.fa +4 -0
- data/spec/data/gapfilling/2/reference_part2.fa +4 -0
- data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
- data/spec/data/gapfilling/2/with_gaps.fa +4 -0
- data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
- data/spec/data/gapfilling/3/reads.fa.gz +0 -0
- data/spec/data/gapfilling/3/reference_part1.fa +4 -0
- data/spec/data/gapfilling/3/reference_part2.fa +4 -0
- data/spec/data/gapfilling/3/with_gaps.fa +4 -0
- data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
- data/spec/data/gapfilling/4/reads.fa.gz +0 -0
- data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
- data/spec/data/gapfilling/5/answer.fna +2 -0
- data/spec/data/gapfilling/5/gappy.fna +2 -0
- data/spec/data/gapfilling/5/reads.fa +17961 -0
- data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
- data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
- data/spec/data/gapfilling/6/random1.fa +28 -0
- data/spec/data/gapfilling/6/random2.fa +28 -0
- data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
- data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
- data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
- data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
- data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
- data/spec/data/kmers_count1.csv +2 -0
- data/spec/data/kmers_count2.csv +3 -0
- data/spec/data/out +3 -0
- data/spec/data/positive_latching_pair.fa +2 -0
- data/spec/data/primers.csv +4 -0
- data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/input.fasta +6 -0
- data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
- data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
- data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
- data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
- data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
- data/spec/data/t/details.txt +5 -0
- data/spec/data/t/details.txt.srt +5 -0
- data/spec/data/t/location.txt +3 -0
- data/spec/data/t/location.txt.srt +3 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
- data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
- data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/answer.fa +2 -0
- data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
- data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
- data/spec/data/velvet_test_trails/Assem/Graph +17 -0
- data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
- data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
- data/spec/data/velvet_test_trails/Assem/Log +35 -0
- data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
- data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
- data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
- data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
- data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
- data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
- data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
- data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
- data/spec/data/velvet_test_trails/read1.fa +2 -0
- data/spec/data/velvet_test_trails/reads.fa +50 -0
- data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
- data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
- data/spec/data/visualise/1/LastGraph +6695 -0
- data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
- data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
- data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
- data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
- data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
- data/spec/data/wander/1/random1.fa +2 -0
- data/spec/data/wander/1/random1.sammy.fa +804 -0
- data/spec/depth_first_search_spec.rb +190 -0
- data/spec/dijkstra_spec.rb +143 -0
- data/spec/explore_spec.rb +29 -0
- data/spec/fluffer_spec.rb +155 -0
- data/spec/gapfiller_spec.rb +107 -0
- data/spec/graph_explorer_spec.rb +475 -0
- data/spec/graph_generator_spec.rb +99 -0
- data/spec/height_finder_spec.rb +306 -0
- data/spec/kmer_abundance_pattern_spec.rb +56 -0
- data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
- data/spec/kmer_profile_finder_spec.rb +38 -0
- data/spec/kmers_count_tabulate_spec.rb +120 -0
- data/spec/oriented_node_trail_spec.rb +221 -0
- data/spec/paired_end_neighbours_spec.rb +126 -0
- data/spec/paths_between_nodes_spec.rb +349 -0
- data/spec/priner_spec.rb +7 -0
- data/spec/read_input_spec.rb +23 -0
- data/spec/read_selection_by_kmer_spec.rb +166 -0
- data/spec/read_to_node_spec.rb +35 -0
- data/spec/roundup_spec.rb +366 -0
- data/spec/scaffold_breaker_spec.rb +144 -0
- data/spec/sequence_spec.rb +43 -0
- data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
- data/spec/single_coherent_wanderer_spec.rb +120 -0
- data/spec/single_ended_assembler_spec.rb +398 -0
- data/spec/spec_helper.rb +310 -0
- data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
- data/spec/visualise_spec.rb +105 -0
- data/spec/wander_spec.rb +119 -0
- data/spec/watch_for_changes.sh +16 -0
- data/validation/fasta_compare.rb +72 -0
- data/validation/gapfill_simulate_perfect.rb +108 -0
- metadata +899 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
require 'bio'
|
|
2
|
+
|
|
3
|
+
class Bio::FinishM::ScaffoldBreaker
|
|
4
|
+
include Bio::FinishM::Logging
|
|
5
|
+
|
|
6
|
+
# One contig that has been cut out of a scaffold, remembering where in the
# scaffold it came from.
class UnscaffoldedContig
  # Start and end coordinates of this contig within its scaffold. Both ends
  # are inclusive (see #length).
  attr_accessor :scaffold_position_start, :scaffold_position_end

  # The Scaffold to which this contig once belonged
  attr_accessor :scaffold

  # The actual nucleotide sequence of this contig, from scaffold start position to
  # end (not revcom)
  attr_accessor :sequence

  # Number of scaffold positions covered by this contig, counting both the
  # start and the end position.
  def length
    @scaffold_position_end - @scaffold_position_start + 1
  end

  # Build a descriptive name of the form
  # "<scaffold name>_<n>of<total>_<start>to<end>", where n is the position
  # of this contig in scaffold.contigs counted from 1.
  #
  # Raises a RuntimeError if this contig is not present in scaffold.contigs.
  def name
    # find_index returns nil when nothing matches, so the check has to happen
    # before the +1; otherwise nil + 1 raises NoMethodError and the message
    # below can never be reached.
    contig_index = scaffold.contigs.find_index(self)
    if contig_index.nil?
      raise "A contig finds itself unexpectedly not in the scaffold it is supposed to belong to"
    end
    contig_number = contig_index + 1
    "#{scaffold.name}_#{contig_number}of#{scaffold.contigs.length}_#{scaffold_position_start}to#{scaffold_position_end}"
  end
end
|
|
28
|
+
|
|
29
|
+
# A scaffold sequence together with the contigs it was broken into.
class Scaffold
  # unscaffolded contigs from this scaffold, as an array in sorted order.
  # May be nil if never assigned; the methods below treat nil as "no contigs".
  attr_accessor :contigs

  # Name of sequence found in the fasta file
  attr_accessor :name

  # Return an array of Gap objects, one for each pair of neighbouring contigs.
  # Each gap runs from the position after the previous contig's end to the
  # position before the next contig's start, and is numbered from zero.
  def gaps
    (@contigs || []).each_cons(2).each_with_index.map do |(previous, current), i|
      gap = Bio::FinishM::ScaffoldBreaker::Gap.new
      gap.scaffold = self
      gap.start = previous.scaffold_position_end + 1
      gap.stop = current.scaffold_position_start - 1
      gap.number = i
      gap
    end
  end

  # Rebuild the scaffold sequence as a String: the contig sequences in order,
  # with each gap between neighbouring contigs rendered as a run of 'N'.
  # Returns an empty String when there are no contigs.
  def sequence
    parts = []
    previous = nil
    (@contigs || []).each do |contig|
      unless previous.nil?
        # Same span that #gaps reports: (start-1) - (end+1) + 1 positions.
        parts << 'N' * (contig.scaffold_position_start - previous.scaffold_position_end - 1)
      end
      parts << contig.sequence
      previous = contig
    end
    parts.join
  end

  # Which contig number is this, in the scaffold? Returns the zero-based
  # index of the contig within #contigs.
  #
  # Raises a RuntimeError if the contig is not part of this scaffold.
  def contig_number(contig)
    index = (@contigs || []).index { |current_contig| contig == current_contig }
    raise "Contig not found in scaffold" if index.nil?
    index
  end
end
|
|
79
|
+
|
|
80
|
+
# A stretch of a scaffold lying between two neighbouring contigs.
class Gap
  attr_accessor :scaffold, :start, :stop, :number

  # Textual coordinates in the form "<scaffold name>:<start+1>-<stop>".
  # NOTE(review): start is shifted by one here while stop is not; confirm this
  # is the intended coordinate convention for whoever consumes the string.
  def coords
    prefix = @scaffold.name + ':'
    first = (@start + 1).to_s
    last = @stop.to_s
    prefix + first + '-' + last
  end

  # i.e. the number of N characters that would represent this gap
  def length
    @stop - @start + 1
  end
end
|
|
92
|
+
|
|
93
|
+
# Given a path to a scaffold fasta file, read in the scaffolds, and break them apart
# into constituent contigs. Then return an array of Scaffold objects containing the
# contig information therein.
#
# Every run of non-N characters becomes one UnscaffoldedContig. A record with
# an empty sequence yields a Scaffold with no contigs. A record made up solely
# of N characters raises a RuntimeError.
def break_scaffolds(contigs_filename)
  scaffolds = []
  Bio::FlatFile.foreach(Bio::FastaFormat, contigs_filename) do |seq|
    scaffold = Scaffold.new
    scaffold.name = seq.definition
    # Start from an empty list so a record without any contig still reports
    # zero contigs (rather than nil) in the summary below.
    scaffold.contigs = []
    bases = seq.seq

    # Warn about (but tolerate) anything that is not A, T, G, C or N.
    unexpected = bases.match(/([^ATGCN])/i)
    if unexpected
      log.warn "Found unexpected characters in the sequence #{seq.definition} e.g. #{unexpected[1]}, continuing optimistically, but not quite sure what will happen.. good luck"
    end

    if bases.match(/^N+$/i)
      raise "Found a scaffold that contains all N characters, ignoring this (perhaps your input is mangled?): #{scaffold.name}"
    end

    # Find all runs of non-N characters in the current sequence
    bases.scan(/([^N]+)/i) do
      match = Regexp.last_match
      start_offset, end_offset = match.offset(0)
      contig = UnscaffoldedContig.new
      contig.scaffold = scaffold
      contig.scaffold_position_start = start_offset + 1 # Convert to 1-based indices in line with bioruby
      contig.scaffold_position_end = end_offset
      contig.sequence = match.to_s
      scaffold.contigs.push contig
    end
    scaffolds.push scaffold
  end
  # reduce(0, :+) so that an input without any scaffolds reports 0 contigs.
  log.info "Detected #{scaffolds.length} scaffolds, containing #{scaffolds.collect{|s| s.contigs.length}.reduce(0, :+)} different contigs"
  return scaffolds
end
|
|
126
|
+
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
require 'ds'
|
|
2
|
+
require 'set'
|
|
3
|
+
|
|
4
|
+
class Bio::AssemblyGraphAlgorithms::SequenceHasher
|
|
5
|
+
include Bio::FinishM::Logging
|
|
6
|
+
|
|
7
|
+
# Starting at oriented_onode, grow trails through the graph until each has
# accumulated at least +overlap+ of distance, and return those trails as an
# array of DistancedOrientedNodeTrail objects. Trails that run out of
# neighbours before reaching +overlap+ are dropped.
#
# Distance starts at 0 for the initial node; each added node contributes its
# node.length_alone.
#
# Options:
# :neighbour_finder: if given, neighbours are taken from
#   neighbour_finder.neighbours(onode) rather than onode.next_neighbours(graph),
#   and any non-nil #distance on a returned neighbour is added to the trail
#   distance as well.
def extend_overlap(graph, oriented_onode, overlap, options={})
  trails = []

  initial_path = DistancedOrientedNodeTrail.new
  initial_path.add_oriented_node oriented_onode
  initial_path.distance = 0

  stack = DS::Stack.new
  stack.push initial_path

  # While there is more on the stack
  while current_path = stack.pop
    current_distance = current_path.distance

    if current_distance >= overlap
      # Found all the sequence we need
      trails.push current_path
      next
    end

    # Find neighbouring nodes of the tip of the trail being extended
    tip = current_path.last
    neighbours = if options[:neighbour_finder]
      options[:neighbour_finder].neighbours(tip)
    else
      tip.next_neighbours(graph)
    end

    neighbours.each do |onode|
      new_distance = current_distance + onode.node.length_alone
      if options[:neighbour_finder] and onode.distance
        new_distance += onode.distance
      end

      new_path = current_path.copy
      new_path.add_oriented_node onode
      new_path.distance = new_distance
      stack.push new_path
    end
  end

  trails
end
|
|
55
|
+
|
|
56
|
+
# An OrientedNodeTrail that additionally carries a distance value.
class DistancedOrientedNodeTrail < Bio::Velvet::Graph::OrientedNodeTrail
  attr_accessor :distance

  # Return a new trail holding a copy of each oriented node of this trail and
  # the same distance.
  def copy
    duplicate = DistancedOrientedNodeTrail.new
    duplicate.trail = @trail.map { |onode| onode.copy }
    duplicate.distance = @distance
    duplicate
  end

  # Human-readable summary: object id, shorthand form of the trail, distance.
  def to_s
    "DistancedOrientedTrail: #{object_id}: #{to_shorthand} distance=#{@distance}"
  end
end
|
|
70
|
+
|
|
71
|
+
end
|
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
require 'ds'
|
|
2
|
+
require 'set'
|
|
3
|
+
|
|
4
|
+
class Bio::AssemblyGraphAlgorithms::SingleCoherentPathsBetweenNodesFinder
|
|
5
|
+
include Bio::FinishM::Logging
|
|
6
|
+
|
|
7
|
+
# Complement of each upper-case nucleotide. Any other character (including
# lower-case bases and N) has no entry and looks up as nil. Frozen so the
# shared lookup table cannot be modified by accident.
SINGLE_BASE_REVCOM = {
  'A'=>'T',
  'T'=>'A',
  'G'=>'C',
  'C'=>'G',
}.freeze
|
|
13
|
+
|
|
14
|
+
# Find all paths between the initial and terminal node in the graph.
# Don't search in the graph when the distance in base pairs exceeds the leash length.
# Recohere reads (single ended only) in an attempt to remove bubbles.
#
# This runs find_all_problems and hands its result to find_paths_from_problems,
# returning whatever the latter returns.
#
# Options:
# * max_gapfill_paths: the maximum number of paths to return. If this maximum is exceeded, an empty solution set is returned
def find_all_connections_between_two_nodes(graph, initial_path, terminal_oriented_node,
  leash_length, recoherence_kmer, sequence_hash, options={})

  find_paths_from_problems(
    find_all_problems(graph, initial_path, terminal_oriented_node, leash_length, recoherence_kmer, sequence_hash, options),
    recoherence_kmer,
    options
  )
end
|
|
28
|
+
|
|
29
|
+
# Explore the graph outwards from initial_path, recording for each distinct
# "set key" (see path_to_settable) the paths that reach it. Returns a
# ProblemSet; keys whose path ends at terminal_node are also collected in
# its terminal_node_keys.
#
# When recoherence_kmer is non-nil, a path is only considered if
# validate_last_node_of_path_by_recoherence accepts it. Paths longer than
# leash_length (when given) are not extended.
#
# Options:
#
# :max_explore_nodes: only explore this many nodes, not further. When the
# limit is passed an empty ProblemSet is returned.
def find_all_problems(graph, initial_path, terminal_node, leash_length, recoherence_kmer, sequence_hash, options={})
  # setup dynamic programming cache
  problems = ProblemSet.new

  # the finder hands back paths to consider, and accepts their extensions
  finder = ProblemTrailFinder.new(graph, initial_path)

  while current_path = finder.dequeue
    path_length = current_path.length_in_bp
    log.debug "considering #{current_path}, path length: #{path_length}" if log.debug?

    # Have we solved this before? If so, add this path to that solved problem.
    set_key = path_to_settable current_path, recoherence_kmer
    log.debug "Set key is #{set_key}" if log.debug?

    # Unless the path validates, forget it. With no recoherence_kmer every
    # path is assumed to validate.
    unless recoherence_kmer.nil?
      unless validate_last_node_of_path_by_recoherence(current_path, recoherence_kmer, sequence_hash)
        log.debug "Path did not validate, skipping" if log.debug?
        next
      end
      log.debug "Path validates" if log.debug?
    end

    if current_path.last == terminal_node
      log.debug "last is terminal" if log.debug?
      terminal_problem = (problems[set_key] ||= DynamicProgrammingProblem.new)
      terminal_problem.known_paths ||= []
      terminal_problem.known_paths.push current_path

      (problems.terminal_node_keys ||= Set.new) << set_key

    elsif (seen_problem = problems[set_key])
      log.debug "Already seen this problem" if log.debug?
      seen_problem.known_paths.push current_path

      # If a lesser min distance is found, then we need to start exploring from the
      # current place again
      if path_length < seen_problem.min_distance
        log.debug "Found a node with min_distance greater than path length.." if log.debug?
        seen_problem.min_distance = path_length
        finder.push_next_neighbours current_path
      end
    elsif !leash_length.nil? and path_length > leash_length
      # we are past the leash length, give up
      log.debug "Past leash length, giving up" if log.debug?
    else
      log.debug "New dynamic problem being solved" if log.debug?
      # new problem being solved here
      fresh_problem = DynamicProgrammingProblem.new
      fresh_problem.min_distance = path_length
      fresh_problem.known_paths.push current_path.copy
      problems[set_key] = fresh_problem

      num_done = problems.length
      if num_done > 0 and num_done % 512 == 0
        log.info "So far worked with #{num_done} head node sets, up to distance #{path_length}" if log.info?
      end
      if options[:max_explore_nodes] and num_done > options[:max_explore_nodes]
        log.warn "Explored too many nodes (#{num_done}), giving up.."
        # discard everything found so far
        problems = ProblemSet.new
        break
      end

      # explore the forward neighbours
      finder.push_next_neighbours current_path
    end
    log.debug "Priority queue size: #{finder.length}" if log.debug?
  end

  problems
end
|
|
110
|
+
|
|
111
|
+
# Convert a path into its "settable" key by delegating to
# array_trail_to_settable with the path's trail.
def path_to_settable(path, recoherence_kmer)
  log.debug "Making settable a path: #{path}" if log.debug?
  array_trail_to_settable(path.trail, recoherence_kmer)
end
|
|
115
|
+
|
|
116
|
+
# Build the "settable" key for an array of oriented nodes.
#
# Without a recoherence_kmer the key is just the to_settable of the last node.
# Otherwise nodes are taken from the end of the trail, walking backwards, until
# their summed node.length_alone reaches recoherence_kmer (or the trail is
# exhausted); the key is the flattened to_settable of those nodes in trail order.
def array_trail_to_settable(trail, recoherence_kmer)
  return trail.last.to_settable if recoherence_kmer.nil?

  # Index of the earliest node to include; moves towards the trail start.
  start_index = trail.length
  covered_length = 0
  while start_index > 0 and covered_length < recoherence_kmer
    start_index -= 1
    covered_length += trail[start_index].node.length_alone
  end

  # 'Return' an array made up of the settables
  settable = trail[start_index..-1].collect{|onode| onode.to_settable}.flatten
  log.debug "'Returning' settable version of path: #{settable}" if log.debug?
  settable
end
|
|
131
|
+
|
|
132
|
+
# Given an OrientedNodeTrail, and a recoherence kmer length, decide whether
# the last node of the path is supported by reads shared with the nodes
# preceding it. Returns true when at least min_concurring_reads read IDs are
# present on the last node and on every collected preceding node; otherwise
# the decision is delegated to sub_kmer_sequence_overlap?.
def validate_last_node_of_path_by_recoherence(path, recoherence_kmer, sequence_hash, min_concurring_reads=1)
  #not possible to fail on a 1 or 2 node path, by debruijn graph definition.
  #TODO: that ain't true! If one of the two nodes is sufficiently long, reads may not agree.
  return true if path.length < 3

  # Summed length_alone of the given oriented nodes plus (hash_length - 1);
  # zero for an empty list.
  length_of_nodes = lambda do |nodes|
    next 0 if nodes.empty?
    hash_offset = nodes[0].node.parent_graph.hash_length-1
    nodes.reduce(hash_offset) { |sum, onode| sum + onode.node.length_alone }
  end

  # Walk backwards along the path from the 2nd last node,
  # collecting nodes until the length in bp of the nodes is > recoherence_kmer
  collected_nodes = []
  i = path.length-2
  while i >= 0
    collected_nodes.push path.trail[i]
    i -= 1
    # break if the recoherence_kmer doesn't cover
    break if length_of_nodes.call(collected_nodes) + 1 >= recoherence_kmer
  end
  log.debug "validate: Collected nodes: #{collected_nodes}" if log.debug?
  if collected_nodes.length < 2
    log.debug "Only #{collected_nodes.length+1} nodes being tested for validation, so returning validated" if log.debug?
    return true
  end

  # There should be at least 1 read that spans the collected nodes and the last node
  # The trail validates if the above statement is true.
  #TODO: there's a possible 'bug' here in that there's no guarantee that the read overlays the
  # nodes in a consecutive and gapless manner. But I suspect that is unlikely to be a problem in practice.
  final_node = path.trail[-1].node
  possible_reads = final_node.short_reads.collect{|nr| nr.read_id}
  log.debug "validate starting from #{final_node.node_id}: Initial short reads: #{possible_reads.join(',') }" if log.debug?
  collected_nodes.each do |onode|
    log.debug "Validating node #{onode}" if log.debug?
    reads_on_node = Set.new onode.node.short_reads.collect{|nr| nr.read_id}
    # keep only the reads that are also found on this node
    possible_reads.keep_if { |read_id| reads_on_node.include? read_id }
    if possible_reads.length < min_concurring_reads
      log.debug "First line validation failed, now detecting sub-kmer sequence overlap" if log.debug?
      # i+1 is the index of the earliest node collected above
      trail_to_validate = path.trail[i+1..-1]
      return sub_kmer_sequence_overlap?(trail_to_validate, sequence_hash, min_concurring_reads)
    end
  end
  log.debug "Found #{possible_reads.length} reads that concurred with validation e.g. #{possible_reads[0]}" if log.debug?
  true
end
|
|
186
|
+
|
|
187
|
+
# Is there overlap across the given nodes, even if the overlap
# does not include an entire kmer?
# nodes: an OrientedNodeTrail. To validate, there must be at least 1 read that spans all of these nodes
# sequence_hash: Bio::Velvet::Sequence object with the sequences from the reads in the nodes
#
# Returns true once min_concurring_reads reads have been found whose flanking
# bases match both the first and the last node, false otherwise. Raises if
# fewer than 3 nodes are given, or if a candidate read has no stored sequence.
def sub_kmer_sequence_overlap?(nodes, sequence_hash, min_concurring_reads=1)
  raise if nodes.length < 3 #should not get here - this is taken care of above
  log.debug "validating by sub-kmer sequence overlap with min #{min_concurring_reads}: #{nodes}" if log.debug?

  # Only reads that are in the second last node are possible, by de-bruijn graph definition.
  second_last = nodes[-2]
  candidate_noded_reads = second_last.node.short_reads
  middle_nodes_length = nodes[1..-2].reduce(0){|sum, n| sum + n.node.length} +
    nodes[0].node.parent_graph.hash_length - 1
  log.debug "Found middle nodes length #{middle_nodes_length}" if log.debug?

  num_confirming_reads = 0

  candidate_noded_reads.each do |read|
    # Ignore reads that don't come in at the start of the node
    log.debug "Considering read #{read.inspect}" if log.debug?
    if read.offset_from_start_of_node != 0
      log.debug "Read doesn't start at beginning of node, skipping" if log.debug?
      next
    end

    seq = sequence_hash[read.read_id]
    raise "No sequence stored for #{read.read_id}, programming fail." if seq.nil?

    # The read needs at least one base on either side of the middle nodes.
    if read.start_coord == 0
      log.debug "start_coord Insufficient length of read" if log.debug?
      next
    elsif seq.length-read.start_coord-middle_nodes_length < 1
      log.debug "other_side Insufficient length of read" if log.debug?
      next
    end

    # True when read.direction and the second last node's starts_at_start?
    # agree; this decides which flank of the read is compared to which node.
    flanks_in_trail_order = !(read.direction ^ second_last.starts_at_start?)

    # Now ensure that the sequence matches correctly
    # left base, the base from the first node
    first_node = nodes[0].node
    left_base = flanks_in_trail_order ?
      SINGLE_BASE_REVCOM[seq[read.start_coord-1]] :
      seq[read.start_coord+middle_nodes_length]
    left_comparison_base = nodes[0].starts_at_start? ?
      first_node.ends_of_kmers_of_twin_node[0] :
      first_node.ends_of_kmers_of_node[0]
    if left_base != left_comparison_base
      log.debug "left comparison base mismatch, this is not a validating read" if log.debug?
      next
    end

    # right base, overlapping the last node
    last_node = nodes[-1].node
    right_base = flanks_in_trail_order ?
      seq[read.start_coord+middle_nodes_length] :
      SINGLE_BASE_REVCOM[seq[read.start_coord-1]]
    right_comparison_base = nodes[-1].starts_at_start? ?
      last_node.ends_of_kmers_of_node[0] :
      last_node.ends_of_kmers_of_twin_node[0]
    if right_base != right_comparison_base
      log.debug "right comparison base mismatch, this is not a validating read" if log.debug?
      next
    end

    log.debug "Read validates path"
    num_confirming_reads += 1
    #gauntlet passed, this is enough confirmatory reads, and so the path is validated.
    return true if num_confirming_reads >= min_concurring_reads
  end

  false #no candidate reads pass
end
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# Backtrack through a solved ProblemSet and return a TrailSet of the distinct
# paths that end at the terminal node.
#
# Separate stacks for valid paths and paths which exceed the maximum allowed
# cycle count.
# Each backtrack spawns a set of new paths, which are cycle counted. If any cycle
# is repeated more than max_cycles, the new path is pushed to the max_cycle_stack,
# otherwise the path is pushed to the main stack. Main stack paths are prioritised.
# The max_cycle_stack paths must be tracked until cycle repeats in second_part exceed
# max_cycles, as they can spawn valid paths with backtracking.
#
# Options:
# :max_gapfill_paths: give up (returning no trails and setting
#   max_path_limit_exceeded) once more than this many paths are in play.
#   Defaults to 2196.
# :max_cycles: maximum number of times a cycle may be repeated. Defaults to 1.
def find_paths_from_problems(problems, recoherence_kmer, options={})
  max_num_paths = options[:max_gapfill_paths] || 2196
  max_cycles = options[:max_cycles] || 1

  counter = CycleCounter.new(max_cycles)
  # Returns true when the candidate belongs on the secondary stack.
  decide_stack = lambda do |to_push|
    if max_cycles < counter.path_cycle_count(to_push.flatten)
      log.debug "Pushing #{to_push.collect{|part| part.collect{|onode| onode.node.node_id}.join(',')}.join(' and ') } to secondary stack" if log.debug?
      return true
    else
      log.debug "Pushing #{to_push.collect{|part| part.collect{|onode| onode.node.node_id}.join(',')}.join(' and ') } to main stack" if log.debug?
      return false
    end
  end

  stack = DualStack.new(&decide_stack)
  to_return = Bio::AssemblyGraphAlgorithms::TrailSet.new

  # if there is no solutions to the overall problem then there is no solution at all
  if problems.terminal_node_keys.nil? or problems.terminal_node_keys.empty?
    to_return.trails = []
    return to_return
  end

  # push all solutions to the "ending in the final node" solutions to the stack
  problems.terminal_node_keys.each do |key|
    overall_solution = problems[key]
    first_part = overall_solution.known_paths[0].to_a
    stack.push [first_part, []]
  end

  # Each stack entry is [first_part, second_part]: the nodes still to be
  # backtracked through, and the tail of the path settled so far.
  all_paths_hash = {}
  while path_parts = stack.pop
    log.debug path_parts.collect{|half| half.collect{|onode| onode.node.node_id}.join(',')}.join(' and ') if log.debug?
    first_part = path_parts[0]
    second_part = path_parts[1]

    if first_part.length == 0
      # If we've tracked all the way to the beginning,
      # then there's no need to track further

      # add this solution if required
      # I've had some trouble getting the Ruby Set to work here, but this is effectively the same thing.
      log.debug "Found solution: #{second_part.collect{|onode| onode.node.node_id}.join(',')}." if log.debug?
      key = second_part.hash
      all_paths_hash[key] ||= second_part
    else
      last = first_part.last

      if second_part.include? last
        # node id is read through onode.node, as in every other log message here
        log.debug "Cycle at node #{last.node.node_id} detected in previous path #{second_part.collect{|onode| onode.node.node_id}.join(',')}." if log.debug?
        to_return.circular_paths_detected = true
        if max_cycles == 0 or max_cycles < counter.path_cycle_count([last, second_part].flatten)
          log.debug "Not finishing cyclic path with too many repeated cycles." if log.debug?
          next
        end
      end

      paths_to_last = problems[array_trail_to_settable(first_part, recoherence_kmer)].known_paths
      paths_to_last.each do |path|
        stack.push [path[0...(path.length-1)], [last,second_part].flatten]
      end
    end

    # max_num_paths parachute
    # The parachute can kill the search once the main stack exceeds max_gapfill_paths,
    # since all paths on it are valid.
    if (stack.sizes[0] + all_paths_hash.length) > max_num_paths
      log.info "Exceeded the maximum number of allowable paths in this gapfill" if log.info?
      to_return.max_path_limit_exceeded = true
      all_paths_hash = {}
      break
    end
  end

  to_return.trails = all_paths_hash.values
  return to_return
end
|
|
345
|
+
|
|
346
|
+
# Two stacks behind one interface. The block given to the constructor is
# called with each pushed item: a truthy result sends the item to the
# secondary stack, otherwise it goes to the primary one. Popping drains the
# primary stack before touching the secondary.
class DualStack
  def initialize(&block)
    @checker = block
    @stack = DS::Stack.new
    @dual_stack = DS::Stack.new
  end

  # Push onto whichever stack the checker block selects.
  def push(to_push)
    target = @checker.call(to_push) ? @dual_stack : @stack
    target.push to_push
  end

  # Pop from the primary stack, falling back to the secondary one.
  def pop
    @stack.pop || @dual_stack.pop
  end

  # Sizes as [primary, secondary].
  def sizes
    [@stack.size, @dual_stack.size]
  end
end
|
|
369
|
+
|
|
370
|
+
# Count occurrences of cycles in paths through an assembly graph. Works by building a hash of paths and
# the frequency of the modal cycle in that path (up to the cut-off max_cycles). For an unknown path, looks
# for a subset of the path in hash by removing nodes from start (or end if :forward option is set), and
# then extends the subset by iteratively re-adding a single node and adding to the hash of paths the larger
# of the subset count or the frequency for the modal cycle beginning with the re-added node.
class CycleCounter
  include Bio::FinishM::Logging

  # max_cycles: cut-off after which counting for a path stops.
  # options[:forward]: by default the cache is built assuming backtracking
  # from the end of a path; this flag reverses the path direction and builds
  # the cache assuming forward movement.
  def initialize(max_cycles, options = {})
    @max_cycles = max_cycles
    @path_cache = {} # cycle counts for previously seen paths, keyed by flattened settables
    @forward = options[:forward] || false
  end

  # Iterate through unique nodes of path and find maximal cycle counts.
  # Returns the largest repeat count found, which stops growing once it
  # exceeds max_cycles.
  def path_cycle_count(path)
    log.debug "Finding cycles in path #{path.collect{|onode| onode.node.node_id}.join(',')}." if log.debug?
    stripped = []
    remainder = @forward ? path.reverse : path
    count = nil
    reached_max_cycles = false

    # Strip nodes off the front of the remainder until the remainder is found
    # in the cache. Stop if a cached count is found, else use zero.
    while count.nil? and !remainder.empty?
      key = remainder.collect{|onode| onode.to_settable}.flatten

      if @path_cache.has_key? key
        count = @path_cache[key]
        break
      else
        stripped = [stripped, remainder.first].flatten
        remainder = remainder[1..-1]
      end
    end

    count = 0 if remainder.empty?

    # The max cycle count for a path is the largest of:
    # I.  Cycle count for initial node of path in remaining path (without initial
    #     node), or
    # II. Max cycle count of remaining path.

    # Put the stripped nodes back one at a time. While count does not exceed
    # max_cycles, count cycles for the re-added node within the remainder.
    # The count is cached for every remainder on the way back.
    until stripped.empty?
      node = stripped.last
      unless reached_max_cycles
        node_count = path_cycle_count_for_node(node, remainder, @max_cycles)
        count = [count, node_count].max
        reached_max_cycles = count > @max_cycles
      end

      remainder = [node, remainder].flatten
      stripped = stripped[0...-1]

      @path_cache[remainder.collect{|onode| onode.to_settable}.flatten] = count
    end

    if log.debug?
      if reached_max_cycles
        log.debug "Most repeated cycle in path occured #{count} or more times."
      else
        log.debug "Most repeated cycle in path occured #{count} times."
      end
    end
    count
  end

  # For an initial node, find and count unique 'simple' cycles in a path that begin at the initial
  # node. Returns the count of the most repeated cycle, or, as soon as some
  # cycle has been seen more than max_cycles times, that cycle's count.
  def path_cycle_count_for_node(node, path, max_cycles=1)
    remaining = @forward ? path.reverse : path
    cycles = {}

    # Each occurrence of node closes one cycle: the stretch of the path up to
    # and including that occurrence.
    while (position = remaining.index(node))
      cycle = remaining[0..position]
      remaining = remaining[(position+1)..-1]

      set_key = cycle.collect{|onode| onode.to_settable}.flatten
      repeats = cycles.fetch(set_key, 0) + 1
      cycles[set_key] = repeats

      return repeats if repeats > max_cycles
    end

    cycles.empty? ? 0 : cycles.values.max
  end
end
|
|
486
|
+
|
|
487
|
+
# One entry of the dynamic programming cache: the paths known to reach a
# given key, and the shortest path length recorded for it.
class DynamicProgrammingProblem
  # Smallest path length recorded for this problem (nil until assigned)
  attr_accessor :min_distance
  # Paths found so far that lead to this problem; starts out empty
  attr_accessor :known_paths

  def initialize
    @known_paths = Array.new
  end
end
|
|
494
|
+
|
|
495
|
+
# Like a Hash, but additionally remembers which of its keys belong to
# problems that end in the terminal node.
class ProblemSet < Hash
  # Collection of keys to this hash that end in the terminal onode.
  # nil until something is assigned to it.
  attr_accessor :terminal_node_keys
end
|
|
501
|
+
|
|
502
|
+
# Hands out candidate paths from a priority queue, keyed by path length in
# bp, and queues the one-node extensions of a path on request.
class ProblemTrailFinder
  include Bio::FinishM::Logging

  # Seed the queue with a copy of initial_path at priority 0.
  def initialize(graph, initial_path)
    @graph = graph
    # NOTE(review): presumably this comparison makes shorter paths dequeue
    # first - confirm against DS::AnyPriorityQueue.
    @pqueue = DS::AnyPriorityQueue.new {|a,b| a < b}
    @pqueue.enqueue initial_path.copy, 0
  end

  # Next path from the queue
  def dequeue
    @pqueue.dequeue
  end

  # Number of paths currently queued
  def length
    @pqueue.length
  end

  # Queue one extended copy of current_path for each neighbour of its last
  # node, prioritised by the extended path's length in bp.
  def push_next_neighbours(current_path)
    next_nodes = current_path.neighbours_of_last_node(@graph)
    log.debug "Pushing #{next_nodes.length} new neighbours of #{current_path.last}" if log.debug?
    #TODO: not necessary to copy all paths, can just continue one of them
    next_nodes.each do |neighbour|
      log.debug "Pushing neighbour to stack: #{neighbour}" if log.debug?
      extended = current_path.copy
      extended.add_oriented_node neighbour
      @pqueue.enqueue extended, extended.length_in_bp
    end
  end
end
|
|
531
|
+
end
|
|
532
|
+
|
|
533
|
+
|