finishm 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitmodules +3 -0
- data/.rspec +1 -0
- data/Gemfile +31 -0
- data/LICENSE.txt +20 -0
- data/README.md +59 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/assembly_visualiser +106 -0
- data/bin/check_primer_combinations.rb +73 -0
- data/bin/contig_joiner.rb +244 -0
- data/bin/contigs_against_assembly.rb +153 -0
- data/bin/finishm +143 -0
- data/bin/finishm_assembler +55 -0
- data/bin/finishm_gap_closer.rb +241 -0
- data/bin/kmer_abundance_file_tool.rb +49 -0
- data/bin/kmer_pattern_to_assembly.rb +377 -0
- data/bin/kmer_profile_finder.rb +92 -0
- data/bin/kmers_count_parse.d +52 -0
- data/bin/kmers_count_tabulate.d +123 -0
- data/bin/kmers_count_tabulate.rb +84 -0
- data/bin/pcr_result_parser.rb +108 -0
- data/bin/primer_finder.rb +119 -0
- data/bin/read_selection_by_kmer.d +174 -0
- data/bin/scaffold_by_pattern.rb +119 -0
- data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
- data/bin/scaffold_end_coverages.rb +69 -0
- data/bin/trail_validator.rb +84 -0
- data/ext/mkrf_conf.rb +56 -0
- data/ext/src/Makefile +140 -0
- data/ext/src/src/allocArray.c +305 -0
- data/ext/src/src/allocArray.h +86 -0
- data/ext/src/src/autoOpen.c +107 -0
- data/ext/src/src/autoOpen.h +18 -0
- data/ext/src/src/binarySequences.c +813 -0
- data/ext/src/src/binarySequences.h +125 -0
- data/ext/src/src/concatenatedGraph.c +233 -0
- data/ext/src/src/concatenatedGraph.h +30 -0
- data/ext/src/src/concatenatedPreGraph.c +262 -0
- data/ext/src/src/concatenatedPreGraph.h +29 -0
- data/ext/src/src/correctedGraph.c +2643 -0
- data/ext/src/src/correctedGraph.h +32 -0
- data/ext/src/src/dfib.c +509 -0
- data/ext/src/src/dfib.h +69 -0
- data/ext/src/src/dfibHeap.c +89 -0
- data/ext/src/src/dfibHeap.h +39 -0
- data/ext/src/src/dfibpriv.h +105 -0
- data/ext/src/src/fib.c +628 -0
- data/ext/src/src/fib.h +78 -0
- data/ext/src/src/fibHeap.c +79 -0
- data/ext/src/src/fibHeap.h +41 -0
- data/ext/src/src/fibpriv.h +110 -0
- data/ext/src/src/globals.h +154 -0
- data/ext/src/src/graph.c +3932 -0
- data/ext/src/src/graph.h +233 -0
- data/ext/src/src/graphReConstruction.c +1472 -0
- data/ext/src/src/graphReConstruction.h +30 -0
- data/ext/src/src/graphStats.c +2167 -0
- data/ext/src/src/graphStats.h +72 -0
- data/ext/src/src/graphStructures.h +52 -0
- data/ext/src/src/kmer.c +652 -0
- data/ext/src/src/kmer.h +73 -0
- data/ext/src/src/kmerOccurenceTable.c +236 -0
- data/ext/src/src/kmerOccurenceTable.h +44 -0
- data/ext/src/src/kseq.h +223 -0
- data/ext/src/src/locallyCorrectedGraph.c +557 -0
- data/ext/src/src/locallyCorrectedGraph.h +40 -0
- data/ext/src/src/passageMarker.c +677 -0
- data/ext/src/src/passageMarker.h +137 -0
- data/ext/src/src/preGraph.c +1717 -0
- data/ext/src/src/preGraph.h +106 -0
- data/ext/src/src/preGraphConstruction.c +990 -0
- data/ext/src/src/preGraphConstruction.h +26 -0
- data/ext/src/src/probe_node_finder.c +84 -0
- data/ext/src/src/probe_node_finder.h +6 -0
- data/ext/src/src/readCoherentGraph.c +557 -0
- data/ext/src/src/readCoherentGraph.h +30 -0
- data/ext/src/src/readSet.c +1734 -0
- data/ext/src/src/readSet.h +67 -0
- data/ext/src/src/readToNode.c +218 -0
- data/ext/src/src/readToNode.h +35 -0
- data/ext/src/src/recycleBin.c +199 -0
- data/ext/src/src/recycleBin.h +58 -0
- data/ext/src/src/roadMap.c +342 -0
- data/ext/src/src/roadMap.h +65 -0
- data/ext/src/src/run.c +318 -0
- data/ext/src/src/run.h +52 -0
- data/ext/src/src/run2.c +744 -0
- data/ext/src/src/runReadToNode.c +29 -0
- data/ext/src/src/scaffold.c +1876 -0
- data/ext/src/src/scaffold.h +64 -0
- data/ext/src/src/shortReadPairs.c +1243 -0
- data/ext/src/src/shortReadPairs.h +32 -0
- data/ext/src/src/splay.c +259 -0
- data/ext/src/src/splay.h +43 -0
- data/ext/src/src/splayTable.c +1315 -0
- data/ext/src/src/splayTable.h +31 -0
- data/ext/src/src/tightString.c +362 -0
- data/ext/src/src/tightString.h +82 -0
- data/ext/src/src/utility.c +199 -0
- data/ext/src/src/utility.h +98 -0
- data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
- data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
- data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
- data/ext/src/third-party/zlib-1.2.3/README +125 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
- data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
- data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
- data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
- data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
- data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
- data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/configure +459 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/example +0 -0
- data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
- data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
- data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
- data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
- data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
- data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
- data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
- data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
- data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
- data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
- data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
- data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
- data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
- data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
- data/lib/assembly/a_b_visualiser.rb +169 -0
- data/lib/assembly/acyclic_connection_finder.rb +81 -0
- data/lib/assembly/all_orfs.rb +615 -0
- data/lib/assembly/bad_format_writer.rb +46 -0
- data/lib/assembly/bam_probe_read_selector.rb +48 -0
- data/lib/assembly/bubbly_assembler.rb +842 -0
- data/lib/assembly/c_probe_node_finder.rb +38 -0
- data/lib/assembly/connection_interpreter.rb +350 -0
- data/lib/assembly/contig_printer.rb +400 -0
- data/lib/assembly/coverage_based_graph_filter.rb +68 -0
- data/lib/assembly/depth_first_search.rb +63 -0
- data/lib/assembly/dijkstra.rb +216 -0
- data/lib/assembly/fluffer.rb +253 -0
- data/lib/assembly/graph_explorer.rb +85 -0
- data/lib/assembly/graph_generator.rb +315 -0
- data/lib/assembly/height_finder.rb +355 -0
- data/lib/assembly/hybrid_velvet_graph.rb +70 -0
- data/lib/assembly/input_genome.rb +182 -0
- data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
- data/lib/assembly/node_finder.rb +171 -0
- data/lib/assembly/oriented_node_trail.rb +507 -0
- data/lib/assembly/paired_end_assembler.rb +53 -0
- data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
- data/lib/assembly/probed_graph.rb +105 -0
- data/lib/assembly/read_input.rb +79 -0
- data/lib/assembly/read_to_node.rb +37 -0
- data/lib/assembly/scaffold_breaker.rb +126 -0
- data/lib/assembly/sequence_hasher.rb +71 -0
- data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
- data/lib/assembly/single_coherent_wanderer.rb +261 -0
- data/lib/assembly/single_ended_assembler.rb +441 -0
- data/lib/assembly/velvet_c_binding.rb +54 -0
- data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
- data/lib/external/VERSION +1 -0
- data/lib/finishm/assemble.rb +224 -0
- data/lib/finishm/explore.rb +217 -0
- data/lib/finishm/finisher.rb +303 -0
- data/lib/finishm/fluff.rb +122 -0
- data/lib/finishm/gapfiller.rb +325 -0
- data/lib/finishm/orfs_finder.rb +88 -0
- data/lib/finishm/path_counter.rb +90 -0
- data/lib/finishm/primers.rb +425 -0
- data/lib/finishm/primers_check.rb +176 -0
- data/lib/finishm/roundup.rb +344 -0
- data/lib/finishm/sequence.rb +142 -0
- data/lib/finishm/visualise.rb +430 -0
- data/lib/finishm/wander.rb +270 -0
- data/lib/kmer_abundance_pattern.rb +79 -0
- data/lib/kmer_multi_abundance_file.rb +48 -0
- data/lib/oligo_designer.rb +88 -0
- data/lib/priner.rb +66 -0
- data/spec/acyclic_connection_finder_spec.rb +551 -0
- data/spec/all_orfs_spec.rb +443 -0
- data/spec/assemble_spec.rb +186 -0
- data/spec/bubbly_assembler_spec.rb +707 -0
- data/spec/c_node_finder_spec.rb +58 -0
- data/spec/connection_interpreter_spec.rb +284 -0
- data/spec/contig_printer_spec.rb +291 -0
- data/spec/coverage_based_graph_filter_spec.rb +102 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
- data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
- data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
- data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
- data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Graph +46 -0
- data/spec/data/assembly_visualiser/start_kmers1 +2 -0
- data/spec/data/bands.csv +1 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
- data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
- data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
- data/spec/data/c_probe_node_finder/1/Log +756 -0
- data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
- data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
- data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
- data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
- data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
- data/spec/data/contig_printer/1/contigs.fa +4 -0
- data/spec/data/contig_printer/1/seq.fa +2408 -0
- data/spec/data/contig_printer/1/seq.fa.svg +153 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
- data/spec/data/contig_printer/1/seq.node12.fa +4 -0
- data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
- data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
- data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
- data/spec/data/explore/1/a.fa +2 -0
- data/spec/data/explore/1/seq1_and_a.fa +3 -0
- data/spec/data/explore/1/seq2.fa +2 -0
- data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
- data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
- data/spec/data/fluff/1/seq1.fa +2 -0
- data/spec/data/fluff/1/seq2.fa +2 -0
- data/spec/data/gapfilling/1/reads.fa +171 -0
- data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
- data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
- data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
- data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
- data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
- data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
- data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
- data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
- data/spec/data/gapfilling/2/reference.fa +2 -0
- data/spec/data/gapfilling/2/reference_part1.fa +4 -0
- data/spec/data/gapfilling/2/reference_part2.fa +4 -0
- data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
- data/spec/data/gapfilling/2/with_gaps.fa +4 -0
- data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
- data/spec/data/gapfilling/3/reads.fa.gz +0 -0
- data/spec/data/gapfilling/3/reference_part1.fa +4 -0
- data/spec/data/gapfilling/3/reference_part2.fa +4 -0
- data/spec/data/gapfilling/3/with_gaps.fa +4 -0
- data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
- data/spec/data/gapfilling/4/reads.fa.gz +0 -0
- data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
- data/spec/data/gapfilling/5/answer.fna +2 -0
- data/spec/data/gapfilling/5/gappy.fna +2 -0
- data/spec/data/gapfilling/5/reads.fa +17961 -0
- data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
- data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
- data/spec/data/gapfilling/6/random1.fa +28 -0
- data/spec/data/gapfilling/6/random2.fa +28 -0
- data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
- data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
- data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
- data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
- data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
- data/spec/data/kmers_count1.csv +2 -0
- data/spec/data/kmers_count2.csv +3 -0
- data/spec/data/out +3 -0
- data/spec/data/positive_latching_pair.fa +2 -0
- data/spec/data/primers.csv +4 -0
- data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/input.fasta +6 -0
- data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
- data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
- data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
- data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
- data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
- data/spec/data/t/details.txt +5 -0
- data/spec/data/t/details.txt.srt +5 -0
- data/spec/data/t/location.txt +3 -0
- data/spec/data/t/location.txt.srt +3 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
- data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
- data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/answer.fa +2 -0
- data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
- data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
- data/spec/data/velvet_test_trails/Assem/Graph +17 -0
- data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
- data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
- data/spec/data/velvet_test_trails/Assem/Log +35 -0
- data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
- data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
- data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
- data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
- data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
- data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
- data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
- data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
- data/spec/data/velvet_test_trails/read1.fa +2 -0
- data/spec/data/velvet_test_trails/reads.fa +50 -0
- data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
- data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
- data/spec/data/visualise/1/LastGraph +6695 -0
- data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
- data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
- data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
- data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
- data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
- data/spec/data/wander/1/random1.fa +2 -0
- data/spec/data/wander/1/random1.sammy.fa +804 -0
- data/spec/depth_first_search_spec.rb +190 -0
- data/spec/dijkstra_spec.rb +143 -0
- data/spec/explore_spec.rb +29 -0
- data/spec/fluffer_spec.rb +155 -0
- data/spec/gapfiller_spec.rb +107 -0
- data/spec/graph_explorer_spec.rb +475 -0
- data/spec/graph_generator_spec.rb +99 -0
- data/spec/height_finder_spec.rb +306 -0
- data/spec/kmer_abundance_pattern_spec.rb +56 -0
- data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
- data/spec/kmer_profile_finder_spec.rb +38 -0
- data/spec/kmers_count_tabulate_spec.rb +120 -0
- data/spec/oriented_node_trail_spec.rb +221 -0
- data/spec/paired_end_neighbours_spec.rb +126 -0
- data/spec/paths_between_nodes_spec.rb +349 -0
- data/spec/priner_spec.rb +7 -0
- data/spec/read_input_spec.rb +23 -0
- data/spec/read_selection_by_kmer_spec.rb +166 -0
- data/spec/read_to_node_spec.rb +35 -0
- data/spec/roundup_spec.rb +366 -0
- data/spec/scaffold_breaker_spec.rb +144 -0
- data/spec/sequence_spec.rb +43 -0
- data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
- data/spec/single_coherent_wanderer_spec.rb +120 -0
- data/spec/single_ended_assembler_spec.rb +398 -0
- data/spec/spec_helper.rb +310 -0
- data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
- data/spec/visualise_spec.rb +105 -0
- data/spec/wander_spec.rb +119 -0
- data/spec/watch_for_changes.sh +16 -0
- data/validation/fasta_compare.rb +72 -0
- data/validation/gapfill_simulate_perfect.rb +108 -0
- metadata +899 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
class Bio::FinishM::Finisher
|
|
2
|
+
include Bio::FinishM::Logging
|
|
3
|
+
|
|
4
|
+
# Register the command line options of the `finish` mode on +opts+ and seed
# +options+ with their default values.
#
# opts:: option parser to populate (its banner, switches and separators are set)
# options:: Hash that receives the defaults now and the parsed values later
#
# --start-contig and --end-contig read their fasta file as soon as they are
# parsed and store the single sequence found as a Bio::Sequence::NA. A file
# holding zero or several sequences raises a RuntimeError naming that file.
#
# Finishes by delegating to Bio::FinishM::GraphGenerator#add_options.
def add_options(opts, options)
  opts.banner = "\nUsage: finishm finish <options>\n\n"

  options.merge!({
    :min_leftover_length => false,
    :kmer_coverage_target => 1,
    :contig_end_length => 300,
    :graph_search_leash_length => 20000,
    :reads_to_assemble => nil,
  })

  # TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
  # Shared by --start-contig and --end-contig, so the error messages name the
  # offending file rather than assuming which of the two options was given.
  extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
    contig = nil
    Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |e|
      unless contig.nil?
        raise "Multiple sequences found in contig file #{fasta_file_path}! I need exactly one"
      end
      contig = e.seq
    end
    raise "No sequence found in contig file #{fasta_file_path}! I need exactly one" if contig.nil?
    Bio::Sequence::NA.new(contig.to_s)
  end

  opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
    options[:pattern] = arg
  end
  opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
    options[:kmer_multiple_abundance_file] = arg
  end
  opts.on("--upper-threshold NUM", "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg.to_i
  end
  opts.on("--lower-threshold NUM", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg.to_i
  end
  opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
    options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
  end
  opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
    options[:start_contig] = extract_exactly_one_contig_from_file.call arg
  end
  opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
    options[:end_contig] = extract_exactly_one_contig_from_file.call arg
  end

  opts.separator "\nOptional arguments:\n\n"
  opts.on("--min-leftover-read-length NUMBER", "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
    options[:min_leftover_length] = arg.to_i
  end
  opts.on("--kmer-coverage-target NUMBER", "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
    options[:kmer_coverage_target] = arg.to_i
  end
  opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
    options[:already_patterned_reads] = arg
  end
  opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
    options[:output_graph_png] = arg
  end
  opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
    options[:output_graph_svg] = arg
  end
  opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
    options[:output_graph_dot] = arg
  end
  opts.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff]}]") do |arg|
    options[:assembly_coverage_cutoff] = arg.to_f
  end
  opts.on("--contig-end-length LENGTH", "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
    options[:contig_end_length] = arg.to_i
  end

  Bio::FinishM::GraphGenerator.new.add_options opts, options
end
|
|
80
|
+
|
|
81
|
+
# Check the parsed command line of the `finish` mode.
#
# options:: Hash populated by the handlers registered in add_options
# argv:: arguments left over after option parsing; must be empty
#
# Returns nil when the options are usable, otherwise a String describing the
# first problem found.
def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  return "Dangling argument(s) found e.g. #{argv[0]}" unless argv.empty?

  required = []
  # The kmer pattern inputs are only needed when reads still have to be
  # extracted, i.e. when --already-patterned-reads was not given.
  unless options[:already_patterned_reads]
    required += [
      :upper_threshold,
      :lower_threshold,
      :pattern,
      :kmer_multiple_abundance_file,
      :reads_files,
    ]
  end
  # Both contigs are dereferenced unconditionally by run, so they are
  # required in either mode.
  required += [:start_contig, :end_contig]

  missing = required.find { |sym| options[sym].nil? }
  return "No option found to specify #{missing}" if missing

  nil #if here, options all were parsed successfully
end
|
|
99
|
+
|
|
100
|
+
# Run the `finish` mode.
#
# The reads to assemble are either the file given by
# options[:already_patterned_reads], or reads selected from
# options[:reads_files] by kmer abundance pattern and pooled into
# 'pooled_sampled_reads.fasta'. A graph is then generated with two probes
# (the end of the start contig and the reverse complement of the start of
# the end contig), nodes unconnected to either probe node are removed, and
# every trail found between the two probe nodes is printed to stdout in
# fasta format.
#
# options:: Hash filled in by add_options and option parsing
# argv:: not used
#
# Exits the process with status 1 when no kmer satisfies the pattern, or
# when a contig is shorter than options[:contig_end_length].
def run(options, argv)
  if options[:already_patterned_reads] #If skipping read extraction
    pooled_reads_filename = options[:already_patterned_reads]
  else
    pooled_reads_filename = 'pooled_sampled_reads.fasta' #TODO: remove this constant into a tempfile or something.
    pool_patterned_reads(options, pooled_reads_filename)
  end

  log.info "Extracting dummy reads from the ends of contigs to use as anchors"
  start_contig = options[:start_contig]
  end_contig = options[:end_contig]
  shortest_contig_length = [start_contig.length, end_contig.length].min
  if shortest_contig_length < 2*options[:contig_end_length]
    log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
  end
  if shortest_contig_length < options[:contig_end_length]
    log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
    exit 1
  end

  # Probe 0 is the tail of the start contig; probe 1 is the head of the end
  # contig, reverse complemented.
  probe_sequences = [
    start_contig[start_contig.length-options[:contig_end_length]...start_contig.length],
    Bio::Sequence::NA.new(end_contig[0...options[:contig_end_length]]).reverse_complement.to_s
  ]
  read_input = Bio::FinishM::ReadInput.new
  read_input.fasta_singles = [pooled_reads_filename]
  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)
  graph = finishm_graph.graph
  start_node = finishm_graph.probe_nodes[0]
  start_node_forward = finishm_graph.probe_node_directions[0]
  end_node = finishm_graph.probe_nodes[1]

  log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"

  log.info "Removing nodes unconnected to either the start or the end from the graph.."
  original_num_nodes = graph.nodes.length
  original_num_arcs = graph.arcs.length
  filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
  filter.remove_unconnected_nodes(graph, [start_node, end_node])
  log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs"

  if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
    viser = Bio::Assembly::ABVisualiser.new
    log.info "Preparing GraphViz object for output"
    gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})

    if options[:output_graph_png]
      log.info "Converting assembly to a graphviz PNG #{options[:output_graph_png] }"
      gv.output :png => options[:output_graph_png], :use => :neato
    end
    if options[:output_graph_svg]
      log.info "Converting assembly to a graphviz SVG #{options[:output_graph_svg] }"
      gv.output :svg => options[:output_graph_svg], :use => :neato
    end
    if options[:output_graph_dot]
      log.info "Converting assembly to a graphviz DOT #{options[:output_graph_dot] }"
      gv.output :dot => options[:output_graph_dot]
    end
  end

  log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
  cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
  #raise "Untested connection finder below"
  #trails = cartographer.find_all_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
  trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
  log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"

  # Disabled: kmer-coverage based reporting and filtering of the trails.
  # log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
  # kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
  # log.info "Finished reading the kmer abundances"

  # if options[:trail_kmer_coverage_file]
  #   log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
  #   writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
  #   io = File.open(options[:trail_kmer_coverage_file],'w')
  #   writer.write(io, trails, kmer_hash)
  #   log.info "Finished writing"
  # end

  # log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"
  # kmer_path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
  # thresholds = desired_pattern.collect{|c| c == true ? 1 : 0}
  # log.info "Using thresholds for filtering: #{thresholds}"
  # trails = kmer_path_filter.filter(trails, kmer_hash, thresholds, :exclude_ending_length => options[:kmer_path_end_exclusion_length])
  # log.info "After filtering remained #{trails.length} trails"

  printer = Bio::AssemblyGraphAlgorithms::ContigPrinter.new
  trails.each_with_index do |trail, i|
    log.debug "Before attachment to the contig, sequence of the trail was #{trail.sequence}" if log.debug?
    acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
    acon.start_probe_read_id = 1
    acon.end_probe_read_id = 2
    acon.start_probe_node = start_node
    acon.end_probe_node = end_node
    acon.start_probe_contig_offset = options[:contig_end_length]
    acon.end_probe_contig_offset = options[:contig_end_length]
    acon.paths = [trail]
    log.debug "AnchoredConnection object to print for this trail: #{acon.inspect}" if log.debug?

    puts ">trail#{i+1}"
    puts printer.one_connection_between_two_contigs(
      finishm_graph.graph,
      probe_sequences[0],
      acon,
      probe_sequences[1])
  end
end

private

# Select reads carrying kmers that fit options[:pattern] from each reads
# file whose pattern entry is truthy, then concatenate the selections into
# +pooled_reads_filename+.
#
# Writes the kmer lists to the files 'whitelist' and 'black' in the current
# directory, and one '<basename>.sampled_reads.fasta' per reads file used.
# Exits the process with status 1 when no kmer fits the pattern; raises a
# RuntimeError when an external command fails.
def pool_patterned_reads(options, pooled_reads_filename)
  # Parse pattern from cmdline
  desired_pattern = KmerAbundancePattern.new
  desired_pattern.parse_from_human(options[:pattern])
  if options[:reads_files].length != desired_pattern.length
    raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
  end

  whitelist_kmers, blacklist_kmers = classify_kmers_by_pattern(options, desired_pattern)
  log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
  unless whitelist_kmers.length > 0
    log.error "No kmers found that satisfy the given pattern, exiting.."
    exit 1
  end

  # Both lists are written and closed before any external command reads them.
  whitelist_path = 'whitelist' #TODO: remove 'whitelist' file as a constant
  blacklist_path = 'black'
  File.open(whitelist_path, 'w') { |white| white.puts whitelist_kmers.join("\n") }
  File.open(blacklist_path, 'w') { |black| black.puts blacklist_kmers.join("\n") }

  # grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler
  threadpool = []
  sampled_read_files = []
  log.info "Extracting reads that contain suitable kmers"
  options[:reads_files].each_with_index do |file, i|
    next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified

    sampled = File.basename(file)+'.sampled_reads.fasta'
    sampled_read_files.push sampled

    grep_path = "#{ ENV['HOME'] }/git/priner/bin/read_selection_by_kmer " #TODO: this won't work on other people's systems.
    if options[:min_leftover_length]
      grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
    end
    grep_cmd = "#{grep_path} --whitelist #{whitelist_path} --blacklist #{blacklist_path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"

    # One extraction thread per reads file; failures surface at join below.
    threadpool.push(Thread.new(grep_cmd, file) do |cmd, reads_file|
      log.debug "Running cmd: #{cmd}"
      status, _stdout, stderr = systemu cmd
      log.debug stderr

      raise "Read extraction failed for #{reads_file}" unless status.exitstatus == 0
      log.debug "Finished extracting reads from #{reads_file}"
    end)
  end
  threadpool.each(&:join) #wait until everything is finito

  log.info "Finished extracting reads for sampling. Now pooling sampled reads"
  pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
  log.debug "Running cmd: #{pool_cmd}"
  status, _stdout, stderr = systemu pool_cmd
  raise stderr if stderr != ''
  raise "Pooling of sampled reads failed" unless status.exitstatus == 0
end

# Read the space-separated kmer abundance file (kmer first, then one count
# per column) and sort its kmers by whether their presence/absence pattern
# is consistent with +desired_pattern+.
#
# A count above options[:upper_threshold] is recorded as true, one below
# options[:lower_threshold] as false, and anything in between as '-'.
#
# Returns [whitelist_kmers, blacklist_kmers].
def classify_kmers_by_pattern(options, desired_pattern)
  whitelist_kmers = []
  blacklist_kmers = []
  # CSV.foreach closes the file once every row has been read
  CSV.foreach(options[:kmer_multiple_abundance_file], :col_sep => ' ') do |row|
    this_pattern = row[1...row.length].collect do |s|
      count = s.to_i
      if count > options[:upper_threshold]
        true
      elsif count < options[:lower_threshold]
        false
      else
        # coverage was in no man's land between thresholds.
        '-'
      end
    end
    #log.debug "Found pattern #{this_pattern} from kmer #{row[0]}" if log.debug?

    if desired_pattern.consistent_with? this_pattern
      whitelist_kmers.push row[0]
    else
      # kmer is not present when it should be
      blacklist_kmers.push row[0]
    end
  end
  [whitelist_kmers, blacklist_kmers]
end
|
|
303
|
+
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
class Bio::FinishM::Fluff
|
|
2
|
+
include Bio::FinishM::Logging
|
|
3
|
+
|
|
4
|
+
# Register the command line options of the `fluff` mode on
# +optparse_object+ and seed +options+ with their defaults.
#
# optparse_object:: option parser to populate
# options:: Hash that receives the defaults now and the parsed values later
#
# Read input options and graph generation options are added by delegating
# to Bio::FinishM::ReadInput and Bio::FinishM::GraphGenerator.
def add_options(optparse_object, options)
  optparse_object.banner = [
    "\nUsage: finishm fluff --contigs <contig_file> --fastq-gz <reads..> --output-fluff-file <output.fa>",
    "",
    "Takes a set of contigs, and places probes across them (e.g. every 2kb), and then explores the",
    "graph from each of these probes, taking all paths within some leash length, including the 'fluff'",
    "which is not the same path as along the contig. Prints out all of these paths to a fasta file.\n\n",
  ].join("\n")

  defaults = {
    :probe_spacing => 2000,
    :probe_length => 100,
    :graph_search_leash_length => 20000,
  }
  options.merge!(defaults)

  optparse_object.separator "\nRequired arguments:\n\n"
  optparse_object.on("--contigs FILE", "fasta file containing contigs to find the fluff on [required]") { |path| options[:contigs_file] = path }
  optparse_object.on("--output-fluff-file PATH", "Output found paths to this file in fasta format [required]") { |path| options[:output_fluff_file] = path }

  optparse_object.separator "\nThere must be some definition of reads too:\n\n" #TODO improve this help
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)

  optparse_object.separator "\nOptional arguments:\n\n"
  # These three are coerced to Integer by the option parser itself
  optparse_object.on("--probe-spacing NUM", Integer, "Distance between probe points in the contig [default: #{options[:probe_spacing]}]") { |number| options[:probe_spacing] = number }
  optparse_object.on("--probe-size NUM", Integer, "Length of the probe to be inserted into the velvet graph. Must be greater than graph kmer length. [default: #{options[:probe_length]}]") { |number| options[:probe_length] = number }
  optparse_object.on("--leash-length NUM", Integer, "Don't explore too far in the graph, only this far and not much more [default: #{options[:graph_search_leash_length]}]") { |number| options[:graph_search_leash_length] = number }
  optparse_object.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") { |path| options[:output_graph_png] = path }
  optparse_object.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") { |path| options[:output_graph_svg] = path }
  optparse_object.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") { |path| options[:output_graph_dot] = path }

  Bio::FinishM::GraphGenerator.new.add_options optparse_object, options
end
|
|
49
|
+
|
|
50
|
+
# Check the parsed command line of the `fluff` mode.
#
# options:: Hash populated by the handlers registered in add_options
# argv:: arguments left over after option parsing; must be empty
#
# Returns a String describing the first problem found. When the checks made
# here all pass, returns whatever Bio::FinishM::ReadInput#validate_options
# returns for the same options.
def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  #TODO: require reads options
  return "Dangling argument(s) found e.g. #{argv[0]}" unless argv.length == 0

  unspecified = [:contigs_file, :output_fluff_file].find { |sym| options[sym].nil? }
  return "No option found to specify #{unspecified}." if unspecified

  probe_longer_than_kmer = options[:velvet_kmer_size] < options[:probe_length]
  unless probe_longer_than_kmer
    return "The probe length must be greater than the kmer length, otherwise it will not be incorporated into the kmer graph"
  end

  #if return nil from here, options all were parsed successfully
  Bio::FinishM::ReadInput.new.validate_options(options, [])
end
|
|
73
|
+
|
|
74
|
+
# Run the `fluff` mode.
#
# Cuts forward-direction probes of options[:probe_length] bases out of every
# sequence in options[:contigs_file], one every options[:probe_spacing]
# bases, generates a graph containing them, asks the Fluffer for the paths
# leading away from each probe within options[:graph_search_leash_length],
# optionally renders the graph, and writes every path found to
# options[:output_fluff_file] in fasta format.
#
# options:: Hash filled in by add_options and option parsing
# argv:: not used
def run(options, argv)
  # Read in all the contigs sequences
  probe_sequences = []
  num_contigs = 0
  Bio::FlatFile.foreach(options[:contigs_file]) do |seq|
    num_contigs += 1

    sequence = seq.seq
    # NOTE(review): the upper bound means a probe ending exactly on the last
    # base of a contig is never generated - confirm this is intended.
    0.step(sequence.length-1-options[:probe_length], options[:probe_spacing]) do |offset|
      # Only probe in the forward direction
      probe_sequences.push sequence[offset...offset+options[:probe_length]]
    end
  end
  log.info "Searching from #{probe_sequences.length} different probes from #{num_contigs} contigs"

  # Generate the graph with the probe sequences in it.
  read_input = Bio::FinishM::ReadInput.new
  read_input.parse_options options
  finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)

  # Loop over the ends, trying to make connections from each one
  fluffer = Bio::AssemblyGraphAlgorithms::Fluffer.new
  fluffings = fluffer.fluff(finishm_graph, options[:graph_search_leash_length])
  log.debug "Found these fluffings: #{fluffings}" if log.debug?
  # Seed the sum with 0 so that an empty result is reported as 0 paths
  log.info "Found #{fluffings.collect{|sets| sets.length}.reduce(0, :+)} paths in total" if log.info?

  if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
    log.info "Preparing GraphViz object for output"
    viser = Bio::Assembly::ABVisualiser.new
    gv = viser.graphviz(finishm_graph.graph, {:start_node_ids => finishm_graph.probe_nodes.collect{|node| node.node_id}})

    if options[:output_graph_png]
      log.info "Converting assembly to a graphviz PNG #{options[:output_graph_png]}"
      gv.output :png => options[:output_graph_png], :use => :neato
    end
    if options[:output_graph_svg]
      log.info "Converting assembly to a graphviz SVG #{options[:output_graph_svg]}"
      gv.output :svg => options[:output_graph_svg], :use => :neato
    end
    if options[:output_graph_dot]
      log.info "Converting assembly to a graphviz DOT #{options[:output_graph_dot]}"
      gv.output :dot => options[:output_graph_dot]
    end
  end

  # Print out the sequences
  File.open(options[:output_fluff_file], 'w') do |output|
    fluffings.each_with_index do |path_set, probe_number|
      path_set.each_with_index do |path, path_number|
        fate = path_set.fates[path_number]
        output.puts ">probe#{probe_number+1}_path#{path_number+1} #{fate}"
        output.puts path.sequence
      end
    end
  end
end
|
|
122
|
+
end
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
require 'tmpdir'
|
|
2
|
+
|
|
3
|
+
class Bio::FinishM::GapFiller
|
|
4
|
+
include Bio::FinishM::Logging
|
|
5
|
+
|
|
6
|
+
# Register the command line options of the `gapfill` mode on
# +optparse_object+ and seed +options+ with their defaults.
#
# optparse_object:: option parser to populate
# options:: Hash that receives the defaults now and the parsed values later
#
# Read input options and graph generation options are added by delegating
# to Bio::FinishM::ReadInput and Bio::FinishM::GraphGenerator.
def add_options(optparse_object, options)
  optparse_object.banner = [
    "\nUsage: finishm gapfill --contigs <contigs_file> --fastq-gz <reads..> --output-fasta <output.fa>",
    "",
    "Takes a set of reads and a contig that contains gap characters. Then it tries to fill in",
    "these N characters. It is possible that there is multiple ways to close the gap - in that case",
    "each can be reported.",
    "",
    "example: finishm gapfill --contigs to_gapfill.fasta --fastq-gz reads.1.fq.gz,reads.2.fq.gz --output-fasta output.fasta",
    "\n",
  ].join("\n")

  defaults = {
    :contig_end_length => 200,
    :graph_search_leash_length => 20000,
  }
  options.merge!(defaults)

  optparse_object.separator "\nRequired arguments:\n\n"
  optparse_object.on("--contigs FILE", "fasta file of single contig containing Ns that are to be closed [required]") { |path| options[:contigs_file] = path }
  optparse_object.on("--output-fasta PATH", "Output the gap-filled sequence to this file [required]") { |path| options[:overall_fasta_file] = path }

  optparse_object.separator "\nThere must be some definition of of how to do the assembly, or else a path to a previous assembly directory:\n\n"
  Bio::FinishM::ReadInput.new.add_options(optparse_object, options)
  Bio::FinishM::GraphGenerator.new.add_options optparse_object, options

  optparse_object.separator "\nGraph search options:\n\n"
  # These three are coerced to Integer by the option parser itself
  optparse_object.on("--overhang NUM", Integer, "Start assembling this many base pairs back from the gap [default: #{options[:contig_end_length] }]") { |number| options[:contig_end_length] = number }
  optparse_object.on("--leash-length NUM", Integer, "Don't explore too far in the graph, only this many base pairs and not (much) more [default: #{options[:graph_search_leash_length] }]") { |number| options[:graph_search_leash_length] = number }
  optparse_object.on("--recoherence-kmer NUM", Integer, "Use a kmer longer than the original velvet one, to help remove bubbles and circular paths [default: none]") { |number| options[:recoherence_kmer] = number }

  optparse_object.separator "\nVisualisation options (of all joins):\n\n"
  optparse_object.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") { |path| options[:output_graph_png] = path }
  optparse_object.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") { |path| options[:output_graph_svg] = path }
  optparse_object.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") { |path| options[:output_graph_dot] = path }
end
|
|
55
|
+
|
|
56
|
+
# Check the parsed command line of the `gapfill` mode.
#
# options:: Hash populated by the handlers registered in add_options
# argv:: arguments left over after option parsing; must be empty
#
# Returns a String describing the first problem found. When the checks made
# here all pass, returns whatever Bio::FinishM::ReadInput#validate_options
# returns for the same options.
def validate_options(options, argv)
  #TODO: give a better description of the error that has occurred
  #TODO: require reads options
  return "Dangling argument(s) found e.g. #{argv[0] }" unless argv.length == 0

  unspecified = [:contigs_file, :overall_fasta_file].find { |sym| options[sym].nil? }
  return "No option found to specify #{unspecified}" if unspecified

  #if return nil from here, options all were parsed successfully
  Bio::FinishM::ReadInput.new.validate_options(options, [])
end
|
|
75
|
+
|
|
76
|
+
def run(options, argv)
|
|
77
|
+
# Read in all the contigs sequences and work out where the gaps are
|
|
78
|
+
genome = Bio::FinishM::InputGenome.new(
|
|
79
|
+
options[:contigs_file],
|
|
80
|
+
options[:contig_end_length],
|
|
81
|
+
options
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
scaffolds = Bio::FinishM::ScaffoldBreaker.new.break_scaffolds(options[:contigs_file])
|
|
86
|
+
gaps = []
|
|
87
|
+
output_fasta_file = File.open(options[:overall_fasta_file],'w')
|
|
88
|
+
num_without_gaps = 0
|
|
89
|
+
scaffolds.each do |scaffold|
|
|
90
|
+
sgaps = scaffold.gaps
|
|
91
|
+
if sgaps.empty?
|
|
92
|
+
num_without_gaps += 1
|
|
93
|
+
output_fasta_file.puts ">#{scaffold.name }"
|
|
94
|
+
output_fasta_file.puts scaffold.sequence
|
|
95
|
+
else
|
|
96
|
+
gaps.push scaffold.gaps
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
gaps.flatten!
|
|
100
|
+
log.info "Detected #{gaps.length} gap(s) from #{scaffolds.length} different contig(s). #{num_without_gaps } contig(s) were gap-free."
|
|
101
|
+
|
|
102
|
+
# Create probe sequences
|
|
103
|
+
probe_sequences = []
|
|
104
|
+
gaps.each do |gap|
|
|
105
|
+
sequence = gap.scaffold.sequence
|
|
106
|
+
|
|
107
|
+
if gap.start < options[:contig_end_length] or gap.stop > sequence.length - options[:contig_end_length]
|
|
108
|
+
log.warn "Found a gap that was too close to the end of a contig, skipping it: #{gap.coords}"
|
|
109
|
+
next
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
log.debug "Processing gap number #{gap.number}, #{gap.coords}"
|
|
113
|
+
first_coords = [
|
|
114
|
+
gap.start-options[:contig_end_length]-1,
|
|
115
|
+
gap.start-1,
|
|
116
|
+
]
|
|
117
|
+
second_coords = [
|
|
118
|
+
gap.stop,
|
|
119
|
+
(gap.stop+options[:contig_end_length]),
|
|
120
|
+
]
|
|
121
|
+
log.debug "Coordinates of the probes are #{first_coords} and #{second_coords}"
|
|
122
|
+
second = sequence[second_coords[0]..second_coords[1]]
|
|
123
|
+
probes = [
|
|
124
|
+
sequence[first_coords[0]...first_coords[1]],
|
|
125
|
+
Bio::Sequence::NA.new(second).reverse_complement.to_s,
|
|
126
|
+
]
|
|
127
|
+
#TODO: this could probably be handled better.. e.g. if the amount of sequence is too small, just throw it out and make one big gap
|
|
128
|
+
if probes[0].match(/N/i) or probes[1].match(/N/i)
|
|
129
|
+
log.warn "Noticed gap that was too close together, skipping: #{gap.coords}"
|
|
130
|
+
next
|
|
131
|
+
end
|
|
132
|
+
probe_sequences.push probes[0]
|
|
133
|
+
probe_sequences.push probes[1]
|
|
134
|
+
end
|
|
135
|
+
log.debug "Generated #{probe_sequences.length} probes e.g. #{probe_sequences[0] }"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Generate the graph with the probe sequences in it.
|
|
139
|
+
read_input = Bio::FinishM::ReadInput.new
|
|
140
|
+
read_input.parse_options options
|
|
141
|
+
# Own the tmpdir, if one is to be used - need to re-read the LastGraph later on see..
|
|
142
|
+
assembly_directory = options[:output_assembly_path]
|
|
143
|
+
assembly_directory ||= options[:previous_assembly]
|
|
144
|
+
using_tmp_assembly_directory = false
|
|
145
|
+
if assembly_directory.nil?
|
|
146
|
+
using_tmp_assembly_directory = true
|
|
147
|
+
assembly_directory = Dir.mktmpdir
|
|
148
|
+
options[:output_assembly_path] = assembly_directory
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Do the actual graph building and/or initial reading
|
|
152
|
+
options[:parse_sequences] = true
|
|
153
|
+
finishm_graph = Bio::FinishM::GraphGenerator.new.generate_graph(probe_sequences, read_input, options)
|
|
154
|
+
|
|
155
|
+
# Output optional graphics.
|
|
156
|
+
if options[:output_graph_png] or options[:output_graph_svg] or options[:output_graph_dot]
|
|
157
|
+
viser = Bio::Assembly::ABVisualiser.new
|
|
158
|
+
# TODO: make these visualise more than one join somehow
|
|
159
|
+
gv = viser.graphviz(finishm_graph.graph, {
|
|
160
|
+
:start_node_id => finishm_graph.probe_nodes[0].node_id,
|
|
161
|
+
:end_node_id => finishm_graph.probe_nodes[1].node_id})
|
|
162
|
+
|
|
163
|
+
if options[:output_graph_png]
|
|
164
|
+
log.info "Converting assembly to a graphviz PNG"
|
|
165
|
+
gv.output :png => options[:output_graph_png], :use => :neato
|
|
166
|
+
end
|
|
167
|
+
if options[:output_graph_svg]
|
|
168
|
+
log.info "Converting assembly to a graphviz SVG"
|
|
169
|
+
gv.output :svg => options[:output_graph_svg], :use => :neato
|
|
170
|
+
end
|
|
171
|
+
if options[:output_graph_dot]
|
|
172
|
+
log.info "Converting assembly to a graphviz DOT"
|
|
173
|
+
gv.output :dot => options[:output_graph_dot]
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
# Clean up the tmdir, if one was used.
|
|
178
|
+
if using_tmp_assembly_directory
|
|
179
|
+
log.debug "Removing tmpdir that held the assembly `#{assembly_directory}'.."
|
|
180
|
+
FileUtils.remove_entry assembly_directory
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
# Do the gap-filling and print out the results
|
|
184
|
+
printer = Bio::AssemblyGraphAlgorithms::ContigPrinter.new
|
|
185
|
+
num_total_trails = 0
|
|
186
|
+
num_singly_filled = 0
|
|
187
|
+
num_unbridgable = 0
|
|
188
|
+
|
|
189
|
+
output_trails_file = nil
|
|
190
|
+
output_trails_file = File.open(options[:overall_trail_output_fasta_file],'w') unless options[:overall_trail_output_fasta_file].nil?
|
|
191
|
+
|
|
192
|
+
# Print the fasta output for the scaffold
|
|
193
|
+
print_scaffold = lambda do |last_scaffold, gapfilled_sequence|
|
|
194
|
+
output_fasta_file.puts ">#{last_scaffold.name }"
|
|
195
|
+
#gapfilled_sequence += last_scaffold.contigs[last_scaffold.contigs.length-1].sequence #add last contig
|
|
196
|
+
output_fasta_file.puts gapfilled_sequence
|
|
197
|
+
end
|
|
198
|
+
# Lambda to add a gap to the String representing the scaffold
|
|
199
|
+
#TODO: if the trail is not filled then the wrong sequence is currently printed. BUG???
|
|
200
|
+
filler = lambda do |anchored_connection, following_contig, gapfilled_sequence, gap|
|
|
201
|
+
gapfilled = nil
|
|
202
|
+
if anchored_connection.paths.length == 1
|
|
203
|
+
# If there is only 1 trail, then output scaffolding information
|
|
204
|
+
num_singly_filled += 1
|
|
205
|
+
|
|
206
|
+
gapfilled = printer.one_connection_between_two_contigs(
|
|
207
|
+
finishm_graph.graph,
|
|
208
|
+
gapfilled_sequence,
|
|
209
|
+
anchored_connection,
|
|
210
|
+
following_contig.sequence
|
|
211
|
+
)
|
|
212
|
+
else
|
|
213
|
+
# Otherwise don't make any assumptions
|
|
214
|
+
num_unbridgable += 1 if anchored_connection.paths.empty?
|
|
215
|
+
# TODO: even if there are multiple trails, better info can still be output here
|
|
216
|
+
gapfilled = gapfilled_sequence + 'N'*gap.length + following_contig.sequence
|
|
217
|
+
end
|
|
218
|
+
gapfilled #return this string
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
log.info "Searching for trails between the nodes within the assembly graph"
|
|
222
|
+
log.info "Using contig overhang length #{options[:contig_end_length] } and leash length #{options[:graph_search_leash_length] }"
|
|
223
|
+
gapfilled_sequence = ''
|
|
224
|
+
last_scaffold = nil
|
|
225
|
+
|
|
226
|
+
(0...(probe_sequences.length / 2)).collect{|i| i*2}.each do |start_probe_index|
|
|
227
|
+
gap_number = start_probe_index / 2
|
|
228
|
+
gap = gaps[gap_number]
|
|
229
|
+
log.info "Now working through gap number #{gap_number+1}: #{gap.coords}"
|
|
230
|
+
|
|
231
|
+
probe_index1 = start_probe_index
|
|
232
|
+
probe_index2 = start_probe_index+1
|
|
233
|
+
|
|
234
|
+
connection = gapfill(finishm_graph, probe_index1, probe_index2, options)
|
|
235
|
+
log.info "Found #{connection.paths.length} trails for #{gap.coords}"
|
|
236
|
+
|
|
237
|
+
unless output_trails_file.nil?
|
|
238
|
+
# print the sequences of the trails if asked for:
|
|
239
|
+
trails.each_with_index do |trail, i|
|
|
240
|
+
#TODO: need to output this as something more sensible e.g. VCF format
|
|
241
|
+
output_trails_file.puts ">#{gap.coords}_trail#{i+1}"
|
|
242
|
+
output_trails_file.puts trail.sequence
|
|
243
|
+
end
|
|
244
|
+
end
|
|
245
|
+
num_total_trails += connection.paths.length
|
|
246
|
+
|
|
247
|
+
# Output the updated sequence. Fill in the sequence if there is only 1 trail
|
|
248
|
+
if gap.scaffold == last_scaffold
|
|
249
|
+
# We are still building the current scaffold
|
|
250
|
+
#gapfilled_sequence += gap.scaffold.contigs[gap.number].sequence
|
|
251
|
+
log.debug "Before adding next chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" if log.debug?
|
|
252
|
+
gapfilled_sequence = filler.call connection, gap.scaffold.contigs[gap.number+1], gapfilled_sequence, gap
|
|
253
|
+
log.debug "After adding next chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}" if log.debug?
|
|
254
|
+
else
|
|
255
|
+
# We are onto a new scaffold. Print the previous one (unless this is the first one)
|
|
256
|
+
unless last_scaffold.nil?
|
|
257
|
+
# print the gapfilled (or not) scaffold.
|
|
258
|
+
print_scaffold.call(last_scaffold, gapfilled_sequence)
|
|
259
|
+
end
|
|
260
|
+
#reset
|
|
261
|
+
last_scaffold = gap.scaffold
|
|
262
|
+
|
|
263
|
+
#add the current gap (and the contig before it)
|
|
264
|
+
log.debug "Before adding first chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}"
|
|
265
|
+
gapfilled_sequence = gap.scaffold.contigs[gap.number].sequence
|
|
266
|
+
log.debug "After adding first chunk of contig, length of scaffold being built is #{gapfilled_sequence.length}"
|
|
267
|
+
gapfilled_sequence = filler.call connection, gap.scaffold.contigs[gap.number+1], gapfilled_sequence, gap
|
|
268
|
+
log.debug "After adding first gap sequence and next contig, gapfilled sequence length is #{gapfilled_sequence.length}"
|
|
269
|
+
end
|
|
270
|
+
end
|
|
271
|
+
print_scaffold.call(last_scaffold, gapfilled_sequence) # print the last scaffold
|
|
272
|
+
|
|
273
|
+
log.info "#{num_unbridgable } gaps had no suitable bridging paths in the graph within the leash, and found #{num_total_trails} trails in total."
|
|
274
|
+
log.info "Filled #{num_singly_filled } out of #{gaps.length } gaps."
|
|
275
|
+
|
|
276
|
+
output_trails_file.close unless output_trails_file.nil?
|
|
277
|
+
output_fasta_file.close
|
|
278
|
+
end
|
|
279
|
+
|
|
280
|
+
# Given a finishm graph, gapfill from the first probe to the second. Return a
|
|
281
|
+
# Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection object
|
|
282
|
+
def gapfill(finishm_graph, probe_index1, probe_index2, options)
|
|
283
|
+
start_onode = finishm_graph.velvet_oriented_node(probe_index1)
|
|
284
|
+
end_onode_inward = finishm_graph.velvet_oriented_node(probe_index2)
|
|
285
|
+
unless start_onode and end_onode_inward
|
|
286
|
+
raise "Unable to retrieve both probes from the graph for gap #{gap_number} (#{gap.coords}), fail"
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
# The probe from finishm_graph points in the wrong direction for path finding
|
|
290
|
+
end_onode = Bio::Velvet::Graph::OrientedNodeTrail::OrientedNode.new
|
|
291
|
+
end_onode.node = end_onode_inward.node
|
|
292
|
+
end_onode.first_side = end_onode_inward.starts_at_start? ? Bio::Velvet::Graph::OrientedNodeTrail::END_IS_FIRST : Bio::Velvet::Graph::OrientedNodeTrail::START_IS_FIRST
|
|
293
|
+
|
|
294
|
+
adjusted_leash_length = finishm_graph.adjusted_leash_length(probe_index1, options[:graph_search_leash_length])
|
|
295
|
+
log.debug "Using adjusted leash length #{adjusted_leash_length }" if log.debug?
|
|
296
|
+
|
|
297
|
+
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
|
|
298
|
+
trails = cartographer.find_trails_between_nodes(
|
|
299
|
+
finishm_graph.graph, start_onode, end_onode, adjusted_leash_length, {
|
|
300
|
+
:recoherence_kmer => options[:recoherence_kmer],
|
|
301
|
+
:sequences => finishm_graph.velvet_sequences,
|
|
302
|
+
:max_explore_nodes => options[:max_explore_nodes],
|
|
303
|
+
:max_gapfill_paths => options[:max_gapfill_paths],
|
|
304
|
+
}
|
|
305
|
+
)
|
|
306
|
+
if trails.circular_paths_detected
|
|
307
|
+
log.warn "Circular path detected here, not attempting to gapfill"
|
|
308
|
+
end
|
|
309
|
+
# Convert the trails into OrientedNodePaths
|
|
310
|
+
trails = trails.collect do |trail|
|
|
311
|
+
path = Bio::Velvet::Graph::OrientedNodeTrail.new
|
|
312
|
+
path.trail = trail
|
|
313
|
+
path
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
acon = Bio::AssemblyGraphAlgorithms::ContigPrinter::AnchoredConnection.new
|
|
317
|
+
acon.start_probe_noded_read = finishm_graph.probe_node_reads[probe_index1]
|
|
318
|
+
acon.end_probe_noded_read = finishm_graph.probe_node_reads[probe_index2]
|
|
319
|
+
acon.start_probe_contig_offset = options[:contig_end_length]
|
|
320
|
+
acon.end_probe_contig_offset = options[:contig_end_length]
|
|
321
|
+
acon.paths = trails
|
|
322
|
+
|
|
323
|
+
return acon
|
|
324
|
+
end
|
|
325
|
+
end
|