finishm 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitmodules +3 -0
- data/.rspec +1 -0
- data/Gemfile +31 -0
- data/LICENSE.txt +20 -0
- data/README.md +59 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/assembly_visualiser +106 -0
- data/bin/check_primer_combinations.rb +73 -0
- data/bin/contig_joiner.rb +244 -0
- data/bin/contigs_against_assembly.rb +153 -0
- data/bin/finishm +143 -0
- data/bin/finishm_assembler +55 -0
- data/bin/finishm_gap_closer.rb +241 -0
- data/bin/kmer_abundance_file_tool.rb +49 -0
- data/bin/kmer_pattern_to_assembly.rb +377 -0
- data/bin/kmer_profile_finder.rb +92 -0
- data/bin/kmers_count_parse.d +52 -0
- data/bin/kmers_count_tabulate.d +123 -0
- data/bin/kmers_count_tabulate.rb +84 -0
- data/bin/pcr_result_parser.rb +108 -0
- data/bin/primer_finder.rb +119 -0
- data/bin/read_selection_by_kmer.d +174 -0
- data/bin/scaffold_by_pattern.rb +119 -0
- data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
- data/bin/scaffold_end_coverages.rb +69 -0
- data/bin/trail_validator.rb +84 -0
- data/ext/mkrf_conf.rb +56 -0
- data/ext/src/Makefile +140 -0
- data/ext/src/src/allocArray.c +305 -0
- data/ext/src/src/allocArray.h +86 -0
- data/ext/src/src/autoOpen.c +107 -0
- data/ext/src/src/autoOpen.h +18 -0
- data/ext/src/src/binarySequences.c +813 -0
- data/ext/src/src/binarySequences.h +125 -0
- data/ext/src/src/concatenatedGraph.c +233 -0
- data/ext/src/src/concatenatedGraph.h +30 -0
- data/ext/src/src/concatenatedPreGraph.c +262 -0
- data/ext/src/src/concatenatedPreGraph.h +29 -0
- data/ext/src/src/correctedGraph.c +2643 -0
- data/ext/src/src/correctedGraph.h +32 -0
- data/ext/src/src/dfib.c +509 -0
- data/ext/src/src/dfib.h +69 -0
- data/ext/src/src/dfibHeap.c +89 -0
- data/ext/src/src/dfibHeap.h +39 -0
- data/ext/src/src/dfibpriv.h +105 -0
- data/ext/src/src/fib.c +628 -0
- data/ext/src/src/fib.h +78 -0
- data/ext/src/src/fibHeap.c +79 -0
- data/ext/src/src/fibHeap.h +41 -0
- data/ext/src/src/fibpriv.h +110 -0
- data/ext/src/src/globals.h +154 -0
- data/ext/src/src/graph.c +3932 -0
- data/ext/src/src/graph.h +233 -0
- data/ext/src/src/graphReConstruction.c +1472 -0
- data/ext/src/src/graphReConstruction.h +30 -0
- data/ext/src/src/graphStats.c +2167 -0
- data/ext/src/src/graphStats.h +72 -0
- data/ext/src/src/graphStructures.h +52 -0
- data/ext/src/src/kmer.c +652 -0
- data/ext/src/src/kmer.h +73 -0
- data/ext/src/src/kmerOccurenceTable.c +236 -0
- data/ext/src/src/kmerOccurenceTable.h +44 -0
- data/ext/src/src/kseq.h +223 -0
- data/ext/src/src/locallyCorrectedGraph.c +557 -0
- data/ext/src/src/locallyCorrectedGraph.h +40 -0
- data/ext/src/src/passageMarker.c +677 -0
- data/ext/src/src/passageMarker.h +137 -0
- data/ext/src/src/preGraph.c +1717 -0
- data/ext/src/src/preGraph.h +106 -0
- data/ext/src/src/preGraphConstruction.c +990 -0
- data/ext/src/src/preGraphConstruction.h +26 -0
- data/ext/src/src/probe_node_finder.c +84 -0
- data/ext/src/src/probe_node_finder.h +6 -0
- data/ext/src/src/readCoherentGraph.c +557 -0
- data/ext/src/src/readCoherentGraph.h +30 -0
- data/ext/src/src/readSet.c +1734 -0
- data/ext/src/src/readSet.h +67 -0
- data/ext/src/src/readToNode.c +218 -0
- data/ext/src/src/readToNode.h +35 -0
- data/ext/src/src/recycleBin.c +199 -0
- data/ext/src/src/recycleBin.h +58 -0
- data/ext/src/src/roadMap.c +342 -0
- data/ext/src/src/roadMap.h +65 -0
- data/ext/src/src/run.c +318 -0
- data/ext/src/src/run.h +52 -0
- data/ext/src/src/run2.c +744 -0
- data/ext/src/src/runReadToNode.c +29 -0
- data/ext/src/src/scaffold.c +1876 -0
- data/ext/src/src/scaffold.h +64 -0
- data/ext/src/src/shortReadPairs.c +1243 -0
- data/ext/src/src/shortReadPairs.h +32 -0
- data/ext/src/src/splay.c +259 -0
- data/ext/src/src/splay.h +43 -0
- data/ext/src/src/splayTable.c +1315 -0
- data/ext/src/src/splayTable.h +31 -0
- data/ext/src/src/tightString.c +362 -0
- data/ext/src/src/tightString.h +82 -0
- data/ext/src/src/utility.c +199 -0
- data/ext/src/src/utility.h +98 -0
- data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
- data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
- data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
- data/ext/src/third-party/zlib-1.2.3/README +125 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
- data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
- data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
- data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
- data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
- data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
- data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/configure +459 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/example +0 -0
- data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
- data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
- data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
- data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
- data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
- data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
- data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
- data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
- data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
- data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
- data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
- data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
- data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
- data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
- data/lib/assembly/a_b_visualiser.rb +169 -0
- data/lib/assembly/acyclic_connection_finder.rb +81 -0
- data/lib/assembly/all_orfs.rb +615 -0
- data/lib/assembly/bad_format_writer.rb +46 -0
- data/lib/assembly/bam_probe_read_selector.rb +48 -0
- data/lib/assembly/bubbly_assembler.rb +842 -0
- data/lib/assembly/c_probe_node_finder.rb +38 -0
- data/lib/assembly/connection_interpreter.rb +350 -0
- data/lib/assembly/contig_printer.rb +400 -0
- data/lib/assembly/coverage_based_graph_filter.rb +68 -0
- data/lib/assembly/depth_first_search.rb +63 -0
- data/lib/assembly/dijkstra.rb +216 -0
- data/lib/assembly/fluffer.rb +253 -0
- data/lib/assembly/graph_explorer.rb +85 -0
- data/lib/assembly/graph_generator.rb +315 -0
- data/lib/assembly/height_finder.rb +355 -0
- data/lib/assembly/hybrid_velvet_graph.rb +70 -0
- data/lib/assembly/input_genome.rb +182 -0
- data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
- data/lib/assembly/node_finder.rb +171 -0
- data/lib/assembly/oriented_node_trail.rb +507 -0
- data/lib/assembly/paired_end_assembler.rb +53 -0
- data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
- data/lib/assembly/probed_graph.rb +105 -0
- data/lib/assembly/read_input.rb +79 -0
- data/lib/assembly/read_to_node.rb +37 -0
- data/lib/assembly/scaffold_breaker.rb +126 -0
- data/lib/assembly/sequence_hasher.rb +71 -0
- data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
- data/lib/assembly/single_coherent_wanderer.rb +261 -0
- data/lib/assembly/single_ended_assembler.rb +441 -0
- data/lib/assembly/velvet_c_binding.rb +54 -0
- data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
- data/lib/external/VERSION +1 -0
- data/lib/finishm/assemble.rb +224 -0
- data/lib/finishm/explore.rb +217 -0
- data/lib/finishm/finisher.rb +303 -0
- data/lib/finishm/fluff.rb +122 -0
- data/lib/finishm/gapfiller.rb +325 -0
- data/lib/finishm/orfs_finder.rb +88 -0
- data/lib/finishm/path_counter.rb +90 -0
- data/lib/finishm/primers.rb +425 -0
- data/lib/finishm/primers_check.rb +176 -0
- data/lib/finishm/roundup.rb +344 -0
- data/lib/finishm/sequence.rb +142 -0
- data/lib/finishm/visualise.rb +430 -0
- data/lib/finishm/wander.rb +270 -0
- data/lib/kmer_abundance_pattern.rb +79 -0
- data/lib/kmer_multi_abundance_file.rb +48 -0
- data/lib/oligo_designer.rb +88 -0
- data/lib/priner.rb +66 -0
- data/spec/acyclic_connection_finder_spec.rb +551 -0
- data/spec/all_orfs_spec.rb +443 -0
- data/spec/assemble_spec.rb +186 -0
- data/spec/bubbly_assembler_spec.rb +707 -0
- data/spec/c_node_finder_spec.rb +58 -0
- data/spec/connection_interpreter_spec.rb +284 -0
- data/spec/contig_printer_spec.rb +291 -0
- data/spec/coverage_based_graph_filter_spec.rb +102 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
- data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
- data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
- data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
- data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Graph +46 -0
- data/spec/data/assembly_visualiser/start_kmers1 +2 -0
- data/spec/data/bands.csv +1 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
- data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
- data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
- data/spec/data/c_probe_node_finder/1/Log +756 -0
- data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
- data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
- data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
- data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
- data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
- data/spec/data/contig_printer/1/contigs.fa +4 -0
- data/spec/data/contig_printer/1/seq.fa +2408 -0
- data/spec/data/contig_printer/1/seq.fa.svg +153 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
- data/spec/data/contig_printer/1/seq.node12.fa +4 -0
- data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
- data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
- data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
- data/spec/data/explore/1/a.fa +2 -0
- data/spec/data/explore/1/seq1_and_a.fa +3 -0
- data/spec/data/explore/1/seq2.fa +2 -0
- data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
- data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
- data/spec/data/fluff/1/seq1.fa +2 -0
- data/spec/data/fluff/1/seq2.fa +2 -0
- data/spec/data/gapfilling/1/reads.fa +171 -0
- data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
- data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
- data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
- data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
- data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
- data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
- data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
- data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
- data/spec/data/gapfilling/2/reference.fa +2 -0
- data/spec/data/gapfilling/2/reference_part1.fa +4 -0
- data/spec/data/gapfilling/2/reference_part2.fa +4 -0
- data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
- data/spec/data/gapfilling/2/with_gaps.fa +4 -0
- data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
- data/spec/data/gapfilling/3/reads.fa.gz +0 -0
- data/spec/data/gapfilling/3/reference_part1.fa +4 -0
- data/spec/data/gapfilling/3/reference_part2.fa +4 -0
- data/spec/data/gapfilling/3/with_gaps.fa +4 -0
- data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
- data/spec/data/gapfilling/4/reads.fa.gz +0 -0
- data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
- data/spec/data/gapfilling/5/answer.fna +2 -0
- data/spec/data/gapfilling/5/gappy.fna +2 -0
- data/spec/data/gapfilling/5/reads.fa +17961 -0
- data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
- data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
- data/spec/data/gapfilling/6/random1.fa +28 -0
- data/spec/data/gapfilling/6/random2.fa +28 -0
- data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
- data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
- data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
- data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
- data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
- data/spec/data/kmers_count1.csv +2 -0
- data/spec/data/kmers_count2.csv +3 -0
- data/spec/data/out +3 -0
- data/spec/data/positive_latching_pair.fa +2 -0
- data/spec/data/primers.csv +4 -0
- data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/input.fasta +6 -0
- data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
- data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
- data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
- data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
- data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
- data/spec/data/t/details.txt +5 -0
- data/spec/data/t/details.txt.srt +5 -0
- data/spec/data/t/location.txt +3 -0
- data/spec/data/t/location.txt.srt +3 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
- data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
- data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/answer.fa +2 -0
- data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
- data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
- data/spec/data/velvet_test_trails/Assem/Graph +17 -0
- data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
- data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
- data/spec/data/velvet_test_trails/Assem/Log +35 -0
- data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
- data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
- data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
- data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
- data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
- data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
- data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
- data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
- data/spec/data/velvet_test_trails/read1.fa +2 -0
- data/spec/data/velvet_test_trails/reads.fa +50 -0
- data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
- data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
- data/spec/data/visualise/1/LastGraph +6695 -0
- data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
- data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
- data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
- data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
- data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
- data/spec/data/wander/1/random1.fa +2 -0
- data/spec/data/wander/1/random1.sammy.fa +804 -0
- data/spec/depth_first_search_spec.rb +190 -0
- data/spec/dijkstra_spec.rb +143 -0
- data/spec/explore_spec.rb +29 -0
- data/spec/fluffer_spec.rb +155 -0
- data/spec/gapfiller_spec.rb +107 -0
- data/spec/graph_explorer_spec.rb +475 -0
- data/spec/graph_generator_spec.rb +99 -0
- data/spec/height_finder_spec.rb +306 -0
- data/spec/kmer_abundance_pattern_spec.rb +56 -0
- data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
- data/spec/kmer_profile_finder_spec.rb +38 -0
- data/spec/kmers_count_tabulate_spec.rb +120 -0
- data/spec/oriented_node_trail_spec.rb +221 -0
- data/spec/paired_end_neighbours_spec.rb +126 -0
- data/spec/paths_between_nodes_spec.rb +349 -0
- data/spec/priner_spec.rb +7 -0
- data/spec/read_input_spec.rb +23 -0
- data/spec/read_selection_by_kmer_spec.rb +166 -0
- data/spec/read_to_node_spec.rb +35 -0
- data/spec/roundup_spec.rb +366 -0
- data/spec/scaffold_breaker_spec.rb +144 -0
- data/spec/sequence_spec.rb +43 -0
- data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
- data/spec/single_coherent_wanderer_spec.rb +120 -0
- data/spec/single_ended_assembler_spec.rb +398 -0
- data/spec/spec_helper.rb +310 -0
- data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
- data/spec/visualise_spec.rb +105 -0
- data/spec/wander_spec.rb +119 -0
- data/spec/watch_for_changes.sh +16 -0
- data/validation/fasta_compare.rb +72 -0
- data/validation/gapfill_simulate_perfect.rb +108 -0
- metadata +899 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'bio-logger'
|
|
5
|
+
require 'csv'
|
|
6
|
+
|
|
7
|
+
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
|
8
|
+
|
|
9
|
+
# Default values for every command line option; the parser below overwrites
# entries in place as flags are encountered.
options = {
  logger: 'stderr',
  log_level: 'info',
  min: 0
}

# Command line definition. The parser is kept in `o` so that the usage text
# can be printed when the arguments turn out to be invalid.
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} <arguments>\n\n" \
    "grep a multiple kmer abundance file according to specified criteria\n\n"

  opts.on("--min NUMBER", "At least 1 column has at least this many observations [default: #{options[:min]}]") do |arg|
    options[:min] = arg.to_f
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do
    options[:log_level] = 'error'
  end
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") do |name|
    options[:logger] = name
  end
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do |level|
    options[:log_level] = level
  end
end
o.parse!

# Exactly one positional argument must remain after option parsing; it is
# read further down as ARGV[0].
unless ARGV.length == 1
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
|
|
37
|
+
|
|
38
|
+
# Stream the space-separated abundance file named by ARGV[0] and echo every
# row in which at least one count column is strictly greater than
# options[:min]. Column 0 of each row is the kmer itself and is never compared.
CSV.foreach(ARGV[0], :col_sep => ' ') do |row|
  # CSV yields an empty array for a blank line; `drop` then returns an empty
  # list (rather than the nil a range slice would give), so such lines are
  # skipped instead of raising NoMethodError.
  counts = row.drop(1)
  puts row.join(' ') if counts.any? { |count| count.to_f > options[:min] }
end
|
|
49
|
+
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'bio-logger'
|
|
5
|
+
require 'csv'
|
|
6
|
+
require 'tempfile'
|
|
7
|
+
require 'pp'
|
|
8
|
+
require 'systemu'
|
|
9
|
+
require 'bio-velvet'
|
|
10
|
+
require 'set'
|
|
11
|
+
|
|
12
|
+
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'finishm'
|
|
13
|
+
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
|
14
|
+
require 'priner'
|
|
15
|
+
|
|
16
|
+
# Default settings for the whole pipeline. Entries are overwritten in place by
# the command line parser; keys that have no default here (thresholds, pattern,
# input files, contigs, output paths) only appear once their flag is given.
options = {
  # logging
  logger: 'stderr',
  log_level: 'info',
  # read selection
  min_leftover_length: false,
  kmer_coverage_target: 1,
  reads_to_assemble: nil,
  # assembly and graph search
  velvet_kmer_size: 155,
  contig_end_length: 300,
  graph_search_leash_length: 20000,
  assembly_coverage_cutoff: 1.5,
  # trail filtering and reporting
  kmer_path_filter_min_coverage: 1,
  kmer_path_end_exclusion_length: 50,
  trail_kmer_coverage_file: 'trail_coverages.csv'
}
|
|
31
|
+
|
|
32
|
+
# TODO: make a better interface for this. Maybe specify an entire genome, and then "Contig_1 end, Contig_3 start" or something
#
# Reads a FASTA file that must contain exactly one sequence and returns that
# sequence as a Bio::Sequence::NA. Called for both --start-contig and
# --end-contig, so the error messages name the offending file rather than
# assuming which of the two it is.
#
# fasta_file_path - path of the FASTA file to read.
#
# Raises RuntimeError when the file holds no sequence or more than one.
extract_exactly_one_contig_from_file = lambda do |fasta_file_path|
  contig = nil
  Bio::FlatFile.foreach(Bio::FastaFormat, fasta_file_path) do |entry|
    unless contig.nil?
      raise "Multiple sequences found in contig file #{fasta_file_path}! I need exactly one"
    end
    contig = entry.seq
  end
  raise "No sequences found in contig file #{fasta_file_path}! I need exactly one" if contig.nil?
  Bio::Sequence::NA.new(contig.to_s)
end
|
|
46
|
+
|
|
47
|
+
# Command line definition. The parser is kept in `o` so the usage text can be
# printed when validation fails. The banner describes what this script does
# with its inputs; all inputs are passed as flags, none positionally.
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} <arguments>\n\n" \
    "Given a kmer multiple abundance file and a kmer abundance pattern, extract reads containing kmers that fit the pattern, " \
    "assemble them with velvet together with the ends of the start and end contigs, and print the trails found " \
    "between the two contigs in the assembly graph\n\n"

  opts.on("--pattern PATTERN", "kmer abundance pattern e.g. '0111001110' [required]") do |arg|
    options[:pattern] = arg
  end
  opts.on("--kmer-abundances FILE", "kmer multiple abundance file [required]") do |arg|
    options[:kmer_multiple_abundance_file] = arg
  end
  opts.on("--upper-threshold NUM", "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg.to_i
  end
  opts.on("--lower-threshold NUM", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg.to_i
  end
  opts.on("--reads FILES", "comma-separated list of sequence reads files in the same order as the pattern was supplied [required]") do |arg|
    options[:reads_files] = arg.split(',').collect{|r| File.absolute_path r}
  end
  opts.on("--start-contig FASTA", "path to a fasta file with the starting contig in it (only). Assumes we are building off the end of this contig [required]") do |arg|
    options[:start_contig] = extract_exactly_one_contig_from_file.call arg
  end
  opts.on("--end-contig FASTA", "path to a fasta file with the ending contig in it (only). Assumes we are building onto the start of this contig [required]") do |arg|
    options[:end_contig] = extract_exactly_one_contig_from_file.call arg
  end

  opts.separator "\nOptional arguments:\n\n"
  opts.on("--min-leftover-read-length NUMBER", "when searching for reads with kmers, require the kmer to be at the beginning or end of the selected read [default: #{options[:min_leftover_length]}]") do |arg|
    options[:min_leftover_length] = arg.to_i
  end
  opts.on("--kmer-coverage-target NUMBER", "when searching for reads with kmers, require this many copies per kmer [default: #{options[:kmer_coverage_target]}]") do |arg|
    options[:kmer_coverage_target] = arg.to_i
  end
  opts.on("--already-patterned-reads FILE", "Attempt to assemble the reads in the specified file, useful for re-assembly [default: off]") do |arg|
    options[:already_patterned_reads] = arg
  end
  opts.on("--output-assembly PATH", "Output assembly intermediate files to this directory [default: off]") do |arg|
    options[:output_assembly_path] = arg
  end
  opts.on("--assembly-png PATH", "Output assembly as a PNG file [default: off]") do |arg|
    options[:output_graph_png] = arg
  end
  opts.on("--assembly-svg PATH", "Output assembly as an SVG file [default: off]") do |arg|
    options[:output_graph_svg] = arg
  end
  opts.on("--assembly-dot PATH", "Output assembly as an DOT file [default: off]") do |arg|
    options[:output_graph_dot] = arg
  end
  # opts.on("--output-begin-kmers PATH", "Output kmers found at the beginning point to this file [default: off]") do |arg|
  #   options[:output_begin_kmers] = arg
  # end
  # opts.on("--output-end-kmers PATH", "Output kmers found at the ending point to this file [default: off]") do |arg|
  #   options[:output_end_kmers] = arg
  # end
  opts.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff]}]") do |arg|
    options[:assembly_coverage_cutoff] = arg.to_f
  end
  opts.on("--contig-end-length LENGTH", "Number of base pairs to start into the ends of the contigs [default: #{options[:contig_end_length]}]") do |arg|
    options[:contig_end_length] = arg.to_i
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") {options[:log_level] = 'error'}
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") { |name| options[:logger] = name}
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG"){|s| options[:log_level] = s}
end
o.parse!

# Every flag marked [required] above, mapped to the option key it fills in.
# The two contigs are included because later stages call methods on them
# unconditionally.
required_flags = {
  '--pattern' => :pattern,
  '--kmer-abundances' => :kmer_multiple_abundance_file,
  '--upper-threshold' => :upper_threshold,
  '--lower-threshold' => :lower_threshold,
  '--reads' => :reads_files,
  '--start-contig' => :start_contig,
  '--end-contig' => :end_contig
}
missing_flags = required_flags.select { |_flag, key| options[key].nil? }.keys

# No positional arguments are accepted. Diagnostics go to stderr so that
# stdout only ever carries the trail sequences.
unless ARGV.empty? && missing_flags.empty?
  $stderr.puts "Unexpected positional argument(s): #{ARGV.join(' ')}" unless ARGV.empty?
  $stderr.puts "Missing required argument(s): #{missing_flags.join(', ')}" unless missing_flags.empty?
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
# Also register and configure a logger named 'bio-velvet'
Bio::Log::LoggerPlus.new 'bio-velvet'
Bio::Log::CLI.configure 'bio-velvet'
|
|
125
|
+
|
|
126
|
+
# FASTA file of reads that gets handed to the assembler further down.
pooled_reads_filename = 'pooled_sampled_reads.fasta'

# Parse pattern from cmdline. This happens unconditionally because the pattern
# is needed again after assembly for kmer-coverage based trail filtering, even
# when --already-patterned-reads skips the read extraction below.
desired_pattern = KmerAbundancePattern.new
desired_pattern.parse_from_human(options[:pattern])

if options[:already_patterned_reads] #If skipping read extraction
  pooled_reads_filename = options[:already_patterned_reads]
else
  if options[:reads_files].length != desired_pattern.length
    raise "Number of entries in the pattern #{desired_pattern.length} and number of reads files #{options[:reads_files].length} not equivalent!"
  end

  # Collect the kmers that will be used to find trusted reads i.e.
  # go through each line of the kmer abundance file, looking for kmers that suit the pattern
  whitelist_kmers = []
  blacklist_kmers = []
  CSV.foreach(options[:kmer_multiple_abundance_file], :col_sep => ' ') do |row|
    next if row.empty? # blank line in the abundance file

    kmer = row[0]
    # One entry per count column: true (above the upper threshold), false
    # (below the lower threshold) or '-' when the count falls in no man's
    # land between the thresholds.
    this_pattern = row.drop(1).collect do |field|
      count = field.to_i
      if count > options[:upper_threshold]
        true
      elsif count < options[:lower_threshold]
        false
      else
        '-'
      end
    end
    #log.debug "Found pattern #{this_pattern} from kmer #{kmer}" if log.debug?

    if desired_pattern.consistent_with? this_pattern
      whitelist_kmers.push kmer
    else
      # kmer is not present when it should be
      blacklist_kmers.push kmer
    end
  end
  log.info "After parsing the kmer multiple abundance file, found #{whitelist_kmers.length} kmers that matched the pattern, and #{blacklist_kmers.length} that didn't"
  unless whitelist_kmers.length > 0
    log.error "No kmers found that satisfy the given pattern, exiting.."
    exit 1
  end

  # The kmer lists are written to fixed file names in the current directory
  # and handed to the external read selection program by path.
  whitelist_path = 'whitelist'
  blacklist_path = 'black'
  File.open(whitelist_path, 'w') { |white| white.puts whitelist_kmers.join("\n") }
  File.open(blacklist_path, 'w') { |black| black.puts blacklist_kmers.join("\n") }

  # grep the pattern out from the raw reads, subsampling so as to not overwhelm the assembler.
  # One thread per reads file, each running the external program.
  threadpool = []
  sampled_read_files = []
  log.info "Extracting reads that contain suitable kmers"
  options[:reads_files].each_with_index do |file, i|
    next unless desired_pattern[i] #Don't extract reads from reads where those reads should not have been amplified

    sampled = File.basename(file)+'.sampled_reads.fasta'
    sampled_read_files.push sampled

    # NOTE(review): the location of read_selection_by_kmer is hard-coded
    # relative to $HOME -- confirm this matches the deployment layout.
    grep_path = "#{ENV['HOME']}/git/priner/bin/read_selection_by_kmer "
    if options[:min_leftover_length]
      grep_path += "--min-leftover-length #{options[:min_leftover_length]} "
    end
    threadpool.push(Thread.new do
      grep_cmd = "#{grep_path} --whitelist #{whitelist_path} --blacklist #{blacklist_path} --reads #{file} --kmer-coverage-target #{options[:kmer_coverage_target]} > #{sampled}"
      log.debug "Running cmd: #{grep_cmd}"
      status, _stdout, stderr = systemu grep_cmd
      log.debug stderr

      unless status.exitstatus == 0
        raise "Extracting reads from #{file} failed with exit status #{status.exitstatus}: #{stderr}"
      end
      log.debug "Finished extracting reads from #{file}"
    end)
  end
  # wait until everything is finito; join re-raises any failure from a worker
  threadpool.each { |thread| thread.join }

  log.info "Finished extracting reads for sampling. Now pooling sampled reads"
  pool_cmd = "cat #{sampled_read_files.join ' '} >#{pooled_reads_filename}"
  log.debug "Running cmd: #{pool_cmd}"
  status, _stdout, stderr = systemu pool_cmd
  raise stderr if stderr != ''
  unless status.exitstatus == 0
    raise "Pooling sampled reads failed with exit status #{status.exitstatus}"
  end
end
|
|
228
|
+
|
|
229
|
+
log.info "Extracting dummy reads from the ends of contigs to use as anchors"
start_contig = options[:start_contig]
end_contig = options[:end_contig]

# Both checks look at the shorter of the two contigs: a warning when it is
# less than twice the anchor length, a hard stop when it cannot supply an
# anchor of the requested length at all.
anchor_length = options[:contig_end_length]
shortest_contig_length = [start_contig.length, end_contig.length].min
if shortest_contig_length < 2*anchor_length
  log.warn "Choice of initial/terminal nodes to perform graph search with may not be optimal due to the small contig size"
end
if shortest_contig_length < anchor_length
  log.error "At least one contig too small to proceed with current code base, need to fix the code to allow such a small contig"
  exit 1
end

# Use the last bit of the first contig and the first bit of the second contig as the anchors
start_anchor = start_contig[start_contig.length-anchor_length...start_contig.length]
#Have to be in reverse, because the node finder finds the node at the start of the read, not the end
end_anchor = Bio::Sequence::NA.new(end_contig[0...anchor_length]).reverse_complement.to_s

velvet_result = nil
Tempfile.open('anchors.fa') do |tempfile|
  # Putting these same sequences in many times seems to better the
  # chances velvet won't throw them out
  50.times do
    tempfile.puts ">start_contig", start_anchor
    tempfile.puts ">end_contig", end_anchor
  end
  # Closed so the contents are on disk before velvet reads the file by path.
  tempfile.close
  #puts `cat #{tempfile.path}`

  log.info "Assembling sampled reads with velvet"
  # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1 and 2, respectively.
  read_arguments = "-short #{tempfile.path} -short2 #{pooled_reads_filename}"
  graph_arguments = "-cov_cutoff #{options[:assembly_coverage_cutoff]} -read_trkg yes"
  velvet_result = Bio::Velvet::Runner.new.velvet(
    options[:velvet_kmer_size],
    read_arguments,
    graph_arguments,
    :output_assembly_path => options[:output_assembly_path]
  )
  if log.debug?
    log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
    log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
    log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
    log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
  end
  log.info "Finished running assembly"
end
|
|
271
|
+
|
|
272
|
+
# Load the assembly graph that velvet wrote into its result directory.
log.info "Parsing the graph output from velvet"
last_graph_path = File.join(velvet_result.result_directory, 'LastGraph')
graph = Bio::Velvet::Graph.parse_from_file(last_graph_path)
log.info "Finished parsing graph: found #{graph.nodes.length} nodes and #{graph.arcs.length} arcs"

# Optionally prune low-coverage nodes. Sequence IDs 1 and 2 are passed as
# whitelisted sequences to the filter.
coverage_cutoff = options[:assembly_coverage_cutoff]
if coverage_cutoff
  log.info "Removing low-coverage nodes from the graph (less than #{coverage_cutoff})"
  cutoffer = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
  deleted_nodes, deleted_arcs = cutoffer.remove_low_coverage_nodes(
    graph,
    coverage_cutoff,
    :whitelisted_sequences => [1,2]
  )

  log.info "Removed #{deleted_nodes.length} nodes and #{deleted_arcs.length} arcs from the graph due to low coverage"
  log.info "Now there is #{graph.nodes.length} nodes and #{graph.arcs.length} arcs remaining"
end
|
|
284
|
+
|
|
285
|
+
# Locate the graph nodes for the two anchors by their sequence IDs (1 and 2).
finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
log.info "Finding node representing the end of the first contig"
start_node, start_node_forward = finder.find_unique_node_with_sequence_id(graph, 1)
log.info "Finding node representing the start of the second contig"
end_node, end_node_forward = finder.find_unique_node_with_sequence_id(graph, 2)#TODO: find the node nearest the end of this, not the start

# Without both anchors there is nothing to search between: report which one is
# missing, still write any requested graph renderings, then give up.
if start_node.nil? || end_node.nil?
  log.error "Unable to find any nodes in the graph that have kmers corresponding to the _start_ point in them, sorry. Maybe fix the node finding code?" if start_node.nil?
  log.error "Unable to find any nodes in the graph that have kmers corresponding to the _end_ point in them, sorry. Maybe fix the node finding code?" if end_node.nil?

  # [graphviz format, label used in the log message, requested output path]
  renderings = [
    [:png, 'PNG', options[:output_graph_png]],
    [:svg, 'SVG', options[:output_graph_svg]],
    [:dot, 'DOT', options[:output_graph_dot]]
  ].select { |_format, _label, path| path }

  unless renderings.empty?
    log.info "Converting assembly to a graphviz PNG/SVG/DOT, even if start/end node was not be found properly"
    viser = Bio::Assembly::ABVisualiser.new
    gv = viser.graphviz(graph)
    renderings.each do |format, label, path|
      log.info "Writing #{label} of graph to #{path}"
      gv.output format => path
    end
  end

  log.error "Unknown start or end points, giving up, sorry."
  exit 1
end
|
|
318
|
+
log.info "Node(s) found that are suitable as initial and terminal nodes in the graph search, respectively: #{start_node.node_id} and #{end_node.node_id}"

# Drop everything that is not connected to either anchor node, reporting how
# much the graph shrank.
log.info "Removing nodes unconnected to either the start or the end from the graph.."
node_count_before = graph.nodes.length
arc_count_before = graph.arcs.length
Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new.remove_unconnected_nodes(graph, [start_node, end_node])
log.info "Removed #{node_count_before-graph.nodes.length} nodes and #{arc_count_before-graph.arcs.length} arcs"

# Render the filtered graph in each requested format. Each entry is
# [graphviz format, label for the log, requested path, extra output options];
# PNG and SVG are produced with the neato layout program, DOT is written
# without that option.
[
  [:png, 'PNG', options[:output_graph_png], {:use => :neato}],
  [:svg, 'SVG', options[:output_graph_svg], {:use => :neato}],
  [:dot, 'DOT', options[:output_graph_dot], {}]
].each do |format, label, path, extra_output_options|
  next unless path

  log.info "Converting assembly to a graphviz #{label}"
  viser = Bio::Assembly::ABVisualiser.new
  gv = viser.graphviz(graph, {:start_node_id => start_node.node_id, :end_node_id => end_node.node_id})
  gv.output({format => path}.merge(extra_output_options))
end
|
|
345
|
+
|
|
346
|
+
# Search the assembly graph for trails that lead from the start anchor node to
# the end anchor node, bounded by the configured leash length.
log.info "Searching for trails between the initial and terminal nodes, within the assembly graph"
cartographer = Bio::AssemblyGraphAlgorithms::AcyclicConnectionFinder.new
#raise "Untested connection finder below"
#trails = cartographer.find_all_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
trails = cartographer.find_trails_between_nodes(graph, start_node, end_node, options[:graph_search_leash_length], start_node_forward)
log.info "Found #{trails.length} trail(s) between the initial and terminal nodes"

log.info "Reading kmer abundances from #{options[:kmer_multiple_abundance_file]}.."
kmer_hash = Bio::KmerMultipleAbundanceHash.parse_from_file options[:kmer_multiple_abundance_file]
log.info "Finished reading the kmer abundances"

if options[:trail_kmer_coverage_file]
  log.info "Writing out kmer coverages to #{options[:trail_kmer_coverage_file]}.."
  writer = Bio::AssemblyGraphAlgorithms::KmerCoverageWriter.new
  # Block form so the file is flushed and closed as soon as writing is done,
  # rather than being left open for the rest of the run.
  File.open(options[:trail_kmer_coverage_file],'w') do |io|
    writer.write(io, trails, kmer_hash)
  end
  log.info "Finished writing"
end

log.info "Filtering trail(s) based on kmer coverage, requiring each kmer in the path to have a minimum of #{options[:kmer_path_filter_min_coverage]} coverage in patterned reads, except for the #{options[:kmer_path_end_exclusion_length]}bp at the ends"
kmer_path_filter = Bio::AssemblyGraphAlgorithms::KmerCoverageBasedPathFilter.new
# One threshold per pattern entry: 1 where the pattern entry is true, otherwise 0.
thresholds = desired_pattern.collect{|c| c == true ? 1 : 0}
log.info "Using thresholds for filtering: #{thresholds}"
trails = kmer_path_filter.filter(trails, kmer_hash, thresholds, :exclude_ending_length => options[:kmer_path_end_exclusion_length])
log.info "After filtering remained #{trails.length} trails"

log.debug "Found trails: #{trails.collect{|t| t.to_s}.join("\n")}"

# Print the surviving trails to stdout in FASTA format, numbered from 1.
trails.each_with_index do |trail, i|
  puts ">trail#{i+1}"
  puts trail.sequence
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'bio-logger'
|
|
5
|
+
require 'csv'
|
|
6
|
+
|
|
7
|
+
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
|
8
|
+
|
|
9
|
+
# Default values for the logging options. The two thresholds have no default
# and must be supplied on the command line.
options = {
  logger: 'stderr',
  log_level: 'info'
}

# Command line definition. The parser is kept in `o` so that the usage text
# can be printed when the arguments turn out to be invalid.
o = OptionParser.new do |opts|
  opts.banner = "\nUsage: #{SCRIPT_NAME} <kmer_multiple_abundance_file>\n\n" \
    "Given an input kmer then abundances space separated file, and a threshold, print out how many kmers are unique to different subsets of columns\n\n"

  opts.on("--upper-threshold ARG", "kmer frequency cutoff to saying 'present' [required]") do |arg|
    options[:upper_threshold] = arg.to_i
  end
  opts.on("--lower-threshold ARG", "kmer frequency cutoff to saying 'not present' [required]") do |arg|
    options[:lower_threshold] = arg.to_i
  end

  # logger options
  opts.separator "\nVerbosity:\n\n"
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do
    options[:log_level] = 'error'
  end
  opts.on("--logger filename", String, "Log to file [default #{options[:logger]}]") do |name|
    options[:logger] = name
  end
  opts.on("--trace options", String, "Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do |level|
    options[:log_level] = level
  end
end
o.parse!

# One positional argument (the abundance file, read later as ARGV[0]) and
# both thresholds are mandatory.
thresholds_given = !options[:upper_threshold].nil? && !options[:lower_threshold].nil?
unless ARGV.length == 1 && thresholds_given
  $stderr.puts o
  exit 1
end

# Setup logging
Bio::Log::CLI.logger(options[:logger])
Bio::Log::CLI.trace(options[:log_level])
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)
|
|
39
|
+
|
|
40
|
+
# Number of kmers observed per presence/absence profile. A profile is encoded
# as an integer in which bit i is set when count column i (zero-based, the
# kmer column excluded) was above --upper-threshold.
encoded_counts = Hash.new(0)
# Largest zero-based count column index seen; determines the width to which
# the binary profile is zero-padded in the report.
max_i = 0

# '-' as the file name means read the table from standard input.
input_file = ARGV[0] == '-' ? $stdin : File.open(ARGV[0])
begin
  CSV.new(input_file, :col_sep => ' ').each do |row|
    # Column 0 is the kmer; `drop` also copes with the empty row CSV yields
    # for a blank line.
    counts = row.drop(1).collect{|s| s.to_i}
    max_i = [max_i, counts.length - 1].max

    index = 0
    noisy = false
    counts.each_with_index do |count, i|
      if count > options[:upper_threshold]
        index += (1<<i)
        log.debug "Found a passable for #{options[:upper_threshold]} in index #{i} for #{counts}, count is now #{index}" if log.debug?
      elsif count < options[:lower_threshold]
        # do nothing
      else
        # coverage was in no man's land between thresholds.
        # Ignore this kmer as noise.
        noisy = true
        break
      end
    end

    # A noisy kmer is dropped entirely rather than being counted under the
    # partial profile accumulated before the ambiguous column was reached.
    next if noisy || index == 0
    encoded_counts[index] += 1
  end
ensure
  input_file.close unless input_file.equal?($stdin)
end

# Report one tab-separated line per observed profile, in ascending order of
# the encoded value: encoded profile, number of kmers, profile in binary.
# In the binary string the rightmost character corresponds to the first
# count column. Nothing is printed when no kmer qualified.
encoded_counts.keys.sort.each do |i|
  unencoded = i.to_s(2).rjust(max_i + 1, '0')
  puts [
    i,
    encoded_counts[i],
    unencoded,
  ].join "\t"
end
|