finishm 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitmodules +3 -0
- data/.rspec +1 -0
- data/Gemfile +31 -0
- data/LICENSE.txt +20 -0
- data/README.md +59 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/assembly_visualiser +106 -0
- data/bin/check_primer_combinations.rb +73 -0
- data/bin/contig_joiner.rb +244 -0
- data/bin/contigs_against_assembly.rb +153 -0
- data/bin/finishm +143 -0
- data/bin/finishm_assembler +55 -0
- data/bin/finishm_gap_closer.rb +241 -0
- data/bin/kmer_abundance_file_tool.rb +49 -0
- data/bin/kmer_pattern_to_assembly.rb +377 -0
- data/bin/kmer_profile_finder.rb +92 -0
- data/bin/kmers_count_parse.d +52 -0
- data/bin/kmers_count_tabulate.d +123 -0
- data/bin/kmers_count_tabulate.rb +84 -0
- data/bin/pcr_result_parser.rb +108 -0
- data/bin/primer_finder.rb +119 -0
- data/bin/read_selection_by_kmer.d +174 -0
- data/bin/scaffold_by_pattern.rb +119 -0
- data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
- data/bin/scaffold_end_coverages.rb +69 -0
- data/bin/trail_validator.rb +84 -0
- data/ext/mkrf_conf.rb +56 -0
- data/ext/src/Makefile +140 -0
- data/ext/src/src/allocArray.c +305 -0
- data/ext/src/src/allocArray.h +86 -0
- data/ext/src/src/autoOpen.c +107 -0
- data/ext/src/src/autoOpen.h +18 -0
- data/ext/src/src/binarySequences.c +813 -0
- data/ext/src/src/binarySequences.h +125 -0
- data/ext/src/src/concatenatedGraph.c +233 -0
- data/ext/src/src/concatenatedGraph.h +30 -0
- data/ext/src/src/concatenatedPreGraph.c +262 -0
- data/ext/src/src/concatenatedPreGraph.h +29 -0
- data/ext/src/src/correctedGraph.c +2643 -0
- data/ext/src/src/correctedGraph.h +32 -0
- data/ext/src/src/dfib.c +509 -0
- data/ext/src/src/dfib.h +69 -0
- data/ext/src/src/dfibHeap.c +89 -0
- data/ext/src/src/dfibHeap.h +39 -0
- data/ext/src/src/dfibpriv.h +105 -0
- data/ext/src/src/fib.c +628 -0
- data/ext/src/src/fib.h +78 -0
- data/ext/src/src/fibHeap.c +79 -0
- data/ext/src/src/fibHeap.h +41 -0
- data/ext/src/src/fibpriv.h +110 -0
- data/ext/src/src/globals.h +154 -0
- data/ext/src/src/graph.c +3932 -0
- data/ext/src/src/graph.h +233 -0
- data/ext/src/src/graphReConstruction.c +1472 -0
- data/ext/src/src/graphReConstruction.h +30 -0
- data/ext/src/src/graphStats.c +2167 -0
- data/ext/src/src/graphStats.h +72 -0
- data/ext/src/src/graphStructures.h +52 -0
- data/ext/src/src/kmer.c +652 -0
- data/ext/src/src/kmer.h +73 -0
- data/ext/src/src/kmerOccurenceTable.c +236 -0
- data/ext/src/src/kmerOccurenceTable.h +44 -0
- data/ext/src/src/kseq.h +223 -0
- data/ext/src/src/locallyCorrectedGraph.c +557 -0
- data/ext/src/src/locallyCorrectedGraph.h +40 -0
- data/ext/src/src/passageMarker.c +677 -0
- data/ext/src/src/passageMarker.h +137 -0
- data/ext/src/src/preGraph.c +1717 -0
- data/ext/src/src/preGraph.h +106 -0
- data/ext/src/src/preGraphConstruction.c +990 -0
- data/ext/src/src/preGraphConstruction.h +26 -0
- data/ext/src/src/probe_node_finder.c +84 -0
- data/ext/src/src/probe_node_finder.h +6 -0
- data/ext/src/src/readCoherentGraph.c +557 -0
- data/ext/src/src/readCoherentGraph.h +30 -0
- data/ext/src/src/readSet.c +1734 -0
- data/ext/src/src/readSet.h +67 -0
- data/ext/src/src/readToNode.c +218 -0
- data/ext/src/src/readToNode.h +35 -0
- data/ext/src/src/recycleBin.c +199 -0
- data/ext/src/src/recycleBin.h +58 -0
- data/ext/src/src/roadMap.c +342 -0
- data/ext/src/src/roadMap.h +65 -0
- data/ext/src/src/run.c +318 -0
- data/ext/src/src/run.h +52 -0
- data/ext/src/src/run2.c +744 -0
- data/ext/src/src/runReadToNode.c +29 -0
- data/ext/src/src/scaffold.c +1876 -0
- data/ext/src/src/scaffold.h +64 -0
- data/ext/src/src/shortReadPairs.c +1243 -0
- data/ext/src/src/shortReadPairs.h +32 -0
- data/ext/src/src/splay.c +259 -0
- data/ext/src/src/splay.h +43 -0
- data/ext/src/src/splayTable.c +1315 -0
- data/ext/src/src/splayTable.h +31 -0
- data/ext/src/src/tightString.c +362 -0
- data/ext/src/src/tightString.h +82 -0
- data/ext/src/src/utility.c +199 -0
- data/ext/src/src/utility.h +98 -0
- data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
- data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
- data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
- data/ext/src/third-party/zlib-1.2.3/README +125 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
- data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
- data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
- data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
- data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
- data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
- data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/configure +459 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/example +0 -0
- data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
- data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
- data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
- data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
- data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
- data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
- data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
- data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
- data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
- data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
- data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
- data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
- data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
- data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
- data/lib/assembly/a_b_visualiser.rb +169 -0
- data/lib/assembly/acyclic_connection_finder.rb +81 -0
- data/lib/assembly/all_orfs.rb +615 -0
- data/lib/assembly/bad_format_writer.rb +46 -0
- data/lib/assembly/bam_probe_read_selector.rb +48 -0
- data/lib/assembly/bubbly_assembler.rb +842 -0
- data/lib/assembly/c_probe_node_finder.rb +38 -0
- data/lib/assembly/connection_interpreter.rb +350 -0
- data/lib/assembly/contig_printer.rb +400 -0
- data/lib/assembly/coverage_based_graph_filter.rb +68 -0
- data/lib/assembly/depth_first_search.rb +63 -0
- data/lib/assembly/dijkstra.rb +216 -0
- data/lib/assembly/fluffer.rb +253 -0
- data/lib/assembly/graph_explorer.rb +85 -0
- data/lib/assembly/graph_generator.rb +315 -0
- data/lib/assembly/height_finder.rb +355 -0
- data/lib/assembly/hybrid_velvet_graph.rb +70 -0
- data/lib/assembly/input_genome.rb +182 -0
- data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
- data/lib/assembly/node_finder.rb +171 -0
- data/lib/assembly/oriented_node_trail.rb +507 -0
- data/lib/assembly/paired_end_assembler.rb +53 -0
- data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
- data/lib/assembly/probed_graph.rb +105 -0
- data/lib/assembly/read_input.rb +79 -0
- data/lib/assembly/read_to_node.rb +37 -0
- data/lib/assembly/scaffold_breaker.rb +126 -0
- data/lib/assembly/sequence_hasher.rb +71 -0
- data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
- data/lib/assembly/single_coherent_wanderer.rb +261 -0
- data/lib/assembly/single_ended_assembler.rb +441 -0
- data/lib/assembly/velvet_c_binding.rb +54 -0
- data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
- data/lib/external/VERSION +1 -0
- data/lib/finishm/assemble.rb +224 -0
- data/lib/finishm/explore.rb +217 -0
- data/lib/finishm/finisher.rb +303 -0
- data/lib/finishm/fluff.rb +122 -0
- data/lib/finishm/gapfiller.rb +325 -0
- data/lib/finishm/orfs_finder.rb +88 -0
- data/lib/finishm/path_counter.rb +90 -0
- data/lib/finishm/primers.rb +425 -0
- data/lib/finishm/primers_check.rb +176 -0
- data/lib/finishm/roundup.rb +344 -0
- data/lib/finishm/sequence.rb +142 -0
- data/lib/finishm/visualise.rb +430 -0
- data/lib/finishm/wander.rb +270 -0
- data/lib/kmer_abundance_pattern.rb +79 -0
- data/lib/kmer_multi_abundance_file.rb +48 -0
- data/lib/oligo_designer.rb +88 -0
- data/lib/priner.rb +66 -0
- data/spec/acyclic_connection_finder_spec.rb +551 -0
- data/spec/all_orfs_spec.rb +443 -0
- data/spec/assemble_spec.rb +186 -0
- data/spec/bubbly_assembler_spec.rb +707 -0
- data/spec/c_node_finder_spec.rb +58 -0
- data/spec/connection_interpreter_spec.rb +284 -0
- data/spec/contig_printer_spec.rb +291 -0
- data/spec/coverage_based_graph_filter_spec.rb +102 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
- data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
- data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
- data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
- data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Graph +46 -0
- data/spec/data/assembly_visualiser/start_kmers1 +2 -0
- data/spec/data/bands.csv +1 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
- data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
- data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
- data/spec/data/c_probe_node_finder/1/Log +756 -0
- data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
- data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
- data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
- data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
- data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
- data/spec/data/contig_printer/1/contigs.fa +4 -0
- data/spec/data/contig_printer/1/seq.fa +2408 -0
- data/spec/data/contig_printer/1/seq.fa.svg +153 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
- data/spec/data/contig_printer/1/seq.node12.fa +4 -0
- data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
- data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
- data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
- data/spec/data/explore/1/a.fa +2 -0
- data/spec/data/explore/1/seq1_and_a.fa +3 -0
- data/spec/data/explore/1/seq2.fa +2 -0
- data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
- data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
- data/spec/data/fluff/1/seq1.fa +2 -0
- data/spec/data/fluff/1/seq2.fa +2 -0
- data/spec/data/gapfilling/1/reads.fa +171 -0
- data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
- data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
- data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
- data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
- data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
- data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
- data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
- data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
- data/spec/data/gapfilling/2/reference.fa +2 -0
- data/spec/data/gapfilling/2/reference_part1.fa +4 -0
- data/spec/data/gapfilling/2/reference_part2.fa +4 -0
- data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
- data/spec/data/gapfilling/2/with_gaps.fa +4 -0
- data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
- data/spec/data/gapfilling/3/reads.fa.gz +0 -0
- data/spec/data/gapfilling/3/reference_part1.fa +4 -0
- data/spec/data/gapfilling/3/reference_part2.fa +4 -0
- data/spec/data/gapfilling/3/with_gaps.fa +4 -0
- data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
- data/spec/data/gapfilling/4/reads.fa.gz +0 -0
- data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
- data/spec/data/gapfilling/5/answer.fna +2 -0
- data/spec/data/gapfilling/5/gappy.fna +2 -0
- data/spec/data/gapfilling/5/reads.fa +17961 -0
- data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
- data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
- data/spec/data/gapfilling/6/random1.fa +28 -0
- data/spec/data/gapfilling/6/random2.fa +28 -0
- data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
- data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
- data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
- data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
- data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
- data/spec/data/kmers_count1.csv +2 -0
- data/spec/data/kmers_count2.csv +3 -0
- data/spec/data/out +3 -0
- data/spec/data/positive_latching_pair.fa +2 -0
- data/spec/data/primers.csv +4 -0
- data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/input.fasta +6 -0
- data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
- data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
- data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
- data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
- data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
- data/spec/data/t/details.txt +5 -0
- data/spec/data/t/details.txt.srt +5 -0
- data/spec/data/t/location.txt +3 -0
- data/spec/data/t/location.txt.srt +3 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
- data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
- data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/answer.fa +2 -0
- data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
- data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
- data/spec/data/velvet_test_trails/Assem/Graph +17 -0
- data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
- data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
- data/spec/data/velvet_test_trails/Assem/Log +35 -0
- data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
- data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
- data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
- data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
- data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
- data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
- data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
- data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
- data/spec/data/velvet_test_trails/read1.fa +2 -0
- data/spec/data/velvet_test_trails/reads.fa +50 -0
- data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
- data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
- data/spec/data/visualise/1/LastGraph +6695 -0
- data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
- data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
- data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
- data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
- data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
- data/spec/data/wander/1/random1.fa +2 -0
- data/spec/data/wander/1/random1.sammy.fa +804 -0
- data/spec/depth_first_search_spec.rb +190 -0
- data/spec/dijkstra_spec.rb +143 -0
- data/spec/explore_spec.rb +29 -0
- data/spec/fluffer_spec.rb +155 -0
- data/spec/gapfiller_spec.rb +107 -0
- data/spec/graph_explorer_spec.rb +475 -0
- data/spec/graph_generator_spec.rb +99 -0
- data/spec/height_finder_spec.rb +306 -0
- data/spec/kmer_abundance_pattern_spec.rb +56 -0
- data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
- data/spec/kmer_profile_finder_spec.rb +38 -0
- data/spec/kmers_count_tabulate_spec.rb +120 -0
- data/spec/oriented_node_trail_spec.rb +221 -0
- data/spec/paired_end_neighbours_spec.rb +126 -0
- data/spec/paths_between_nodes_spec.rb +349 -0
- data/spec/priner_spec.rb +7 -0
- data/spec/read_input_spec.rb +23 -0
- data/spec/read_selection_by_kmer_spec.rb +166 -0
- data/spec/read_to_node_spec.rb +35 -0
- data/spec/roundup_spec.rb +366 -0
- data/spec/scaffold_breaker_spec.rb +144 -0
- data/spec/sequence_spec.rb +43 -0
- data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
- data/spec/single_coherent_wanderer_spec.rb +120 -0
- data/spec/single_ended_assembler_spec.rb +398 -0
- data/spec/spec_helper.rb +310 -0
- data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
- data/spec/visualise_spec.rb +105 -0
- data/spec/wander_spec.rb +119 -0
- data/spec/watch_for_changes.sh +16 -0
- data/validation/fasta_compare.rb +72 -0
- data/validation/gapfill_simulate_perfect.rb +108 -0
- metadata +899 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Finds the graph nodes that correspond to a set of probe reads. A C routine
# (reached through Bio::FinishM::VelvetCBinding) first narrows the graph down
# to candidate nodes, and the Ruby NodeFinder then only has to inspect those.
class Bio::FinishM::CProbeNodeFinder
  include Bio::FinishM::Logging

  # Return an array of [best_node, best_noded_read] that represent the probes in the graph
  #
  # velvet_underground_graph:: graph object; it must respond to
  #                            internal_graph_struct (used by find_probe_nodes)
  # probe_read_ids::           Array of integer read IDs acting as probes
  def find_probes(velvet_underground_graph, probe_read_ids)
    # First use the C method to extract the set of interesting nodes
    log.debug "Extracting target nodes using the C method.." if log.debug?
    target_node_ids = find_probe_nodes(velvet_underground_graph, probe_read_ids)

    # Then iterate over just those nodes we know are interesting
    log.debug "Extracting from only those #{target_node_ids.length} nodes that are interesting.." if log.debug?
    target_node_ids_set = Set.new target_node_ids
    Bio::AssemblyGraphAlgorithms::NodeFinder.new.find_unique_nodes_with_sequence_ids(
      velvet_underground_graph, probe_read_ids, :target_node_ids => target_node_ids_set
    )
  end

  # Return a minimal Array of node IDs that contain all probe read IDs
  #
  # The read IDs are copied into a native int32 buffer, handed to the C
  # function extract_best_probe_reads, and the int32 array it returns (one
  # entry per probe read) is converted to a de-duplicated Array of absolute
  # node IDs.
  def find_probe_nodes(velvet_underground_graph, probe_read_ids)
    @bindings ||= Bio::FinishM::VelvetCBinding.new

    c_probe_read_ids = FFI::MemoryPointer.new(:int32, probe_read_ids.length)
    begin
      c_probe_read_ids.write_array_of_int32(probe_read_ids)

      probe_nodes = @bindings.extract_best_probe_reads(
        velvet_underground_graph.internal_graph_struct,
        c_probe_read_ids,
        probe_read_ids.length
      )

      # Only the magnitude of each returned ID is kept.
      # NOTE(review): presumably the sign encodes node orientation - confirm
      # against the C implementation.
      # NOTE(review): the buffer returned by the C call is not released here;
      # confirm who owns it.
      probe_nodes.read_array_of_int32(probe_read_ids.length).collect { |n| n.abs }.uniq
    ensure
      # clean up, even when the native call or the reads above raise
      c_probe_read_ids.free
    end
  end
end
|
|
37
|
+
|
|
38
|
+
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
require 'yargraph'
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Bio::FinishM::ConnectionInterpreter
|
|
5
|
+
include Bio::FinishM::Logging
|
|
6
|
+
|
|
7
|
+
# connections: Enumerable of Connection objects (each responding to probe1,
#   probe2 and to_settable)
# sequence_ids: Enumerable of the IDs of all sequences under consideration,
#   whether or not they take part in any connection
#
# Raises a RuntimeError when two connections share the same settable key,
# since only 1 connection between two contig ends is assumed.
def initialize(connections, sequence_ids)
  @graph = Yargraph::UndirectedGraph.new
  @circular_probes = []
  @sequence_ids = sequence_ids

  # Setup hash of settable to original
  # Assume there is only 1 connection between two contig ends
  @connection_hash = {}
  connections.each do |conn|
    key = conn.to_settable
    raise "Duplicate connections not handled (yet?), found #{conn} => #{key}" if @connection_hash.key?(key)
    @connection_hash[key] = conn
  end

  # Add connections. A probe connected to itself cannot be an edge between
  # two distinct vertices, so it is recorded separately.
  connections.each do |conn|
    first_end = conn.probe1.to_settable
    second_end = conn.probe2.to_settable
    if first_end == second_end
      @circular_probes.push conn.probe1
    else
      @graph.add_edge first_end, second_end
    end
  end

  log.debug "Created a graph with #{@graph.vertices.to_a.length} vertices and #{@graph.edges.length} edges" if log.debug?
end
|
|
33
|
+
|
|
34
|
+
# Return an Array of the connection objects stored at initialization
def connections
  @connection_hash.each_value.to_a
end
|
|
37
|
+
|
|
38
|
+
# Return an Array of the sequence_index of each sequence whose two ends are
# connected to each other, where each of those ends has exactly one entry
# in the graph's edges (i.e. the start connects exclusively to the end).
def circular_sequences
  self_joined = connections.select do |conn|
    conn.probe1.sequence_index == conn.probe2.sequence_index &&
      conn.probe1.side != conn.probe2.side &&
      @graph.edges[conn.probe1.to_settable].length == 1 &&
      @graph.edges[conn.probe2.to_settable].length == 1
  end
  self_joined.collect { |conn| conn.probe1.sequence_index }
end
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Return an Array of Connection objects that represent edges where
# there is only a single connection from both sides, i.e. each end has the
# other as its only neighbour in the graph. Each connection is reported once,
# and the objects returned are those stored at initialization.
def doubly_single_contig_connections
  likelies = []
  already_seen_connections = Set.new

  @graph.vertices.each do |v|
    neighbours = @graph.neighbours(v)
    log.debug "Testing connection between #{v} and #{neighbours}" if log.debug?

    # If there is only 1 connection on both sides, then go with that
    next unless neighbours.length == 1
    neighbour = neighbours[0]
    next unless @graph.neighbours(neighbour).length == 1
    log.debug "Connection passed the doubly-test" if log.debug?

    conn = Connection.new
    conn.probe1 = Probe.new(v)
    conn.probe2 = Probe.new(neighbour)
    settable = conn.to_settable

    # Record the connection unless it is duplicate (the same edge is seen
    # from both of its vertices). Set#add? returns nil for a known element.
    likelies.push @connection_hash[settable] if already_seen_connections.add?(settable)
  end

  return likelies
end
|
|
84
|
+
|
|
85
|
+
# Single linkage cluster the given inter-contig connections together with
# the implicit start-to-end link within each contig.
#
# contig_connections: Enumerable of Connection objects to scaffold with
#
# Returns an Array of Scaffold objects: one for each multi-contig path that
# was found, followed by a single-contig Scaffold for each ID in the sequence
# IDs given at initialization that was not placed in a multi-contig path.
def scaffolds(contig_connections)
  # It is like an (easy)
  # assembly problem because each vertex can only be connected to
  # two others - 1 intra-contig and 1 inter-contig (unless it is circular)
  likelies_edge_set = Yargraph::UndirectedGraph::EdgeSet.new
  contig_connections.each do |conn|
    likelies_edge_set.add_edge conn.probe1.to_settable, conn.probe2.to_settable
  end

  scaffolded_paths = []
  circular_single_contigs = Set.new

  # while there is more elements in the likelies set,
  # 'pop' an arbitrary edge out of the graph
  while starting_edge = likelies_edge_set.pop
    log.debug "starting to scaffold from #{starting_edge}" if log.debug?
    left_end = starting_edge[0]
    right_end = starting_edge[1]

    # Ignore likelies that are circular (both ends on the same sequence)
    if left_end[0] == right_end[0]
      log.debug "Not scaffolding contig #{left_end[0]} since it appears to be circular" if log.debug?
      circular_single_contigs << left_end[0]
      next
    end

    circular = false

    # go 'left'. Connect the other side of the left.
    lefts = [Probe.new(left_end)]
    rights = [Probe.new(right_end)]
    log.debug "rights was #{rights[0].to_s}" if log.debug?
    # while there is another node to the left
    while next_probe = likelies_edge_set[lefts[-1].companion.to_settable].to_a[0]
      next_probe_probe = Probe.new(next_probe)
      companion = lefts[-1].companion

      likelies_edge_set.delete next_probe, companion.to_settable
      if next_probe_probe.companion.to_settable == rights[0].to_settable
        log.debug "Found multi-contig circularity between #{next_probe_probe.companion} and #{rights[0]}" if log.debug?
        circular = true
        break
      end

      lefts.push companion, next_probe_probe
      log.debug "Adding node to the left: #{next_probe} and companion #{companion}" if log.debug?
    end

    # and go right
    while next_probe = likelies_edge_set[rights[-1].companion.to_settable].to_a[0]
      companion = rights[-1].companion
      rights.push companion, Probe.new(next_probe)
      log.debug "Adding node to the right: #{next_probe} and companion #{companion}" if log.debug?
      likelies_edge_set.delete next_probe, companion.to_settable
    end

    # Add the left and the right together into one path, capped at each
    # extreme with the far side of the outermost contig
    whole_path = [lefts[-1].companion] + lefts.reverse + rights + [rights[-1].companion]
    scaffolded_paths.push PossiblyCircularArray.new(whole_path, circular)
  end

  if log.debug?
    log.debug "Found #{scaffolded_paths.length} multi-contig scaffold(s):"
    scaffolded_paths.each do |path|
      log.debug "Scaffold: #{path.collect{|e| e.to_s}.join(', ') }"
    end
  end

  # for each scaffolded set, create new scaffold object
  built_scaffolds = []
  scaffolded_contigs = Set.new
  scaffolded_paths.each do |path|
    # probes come in (entry side, exit side) pairs, one pair per contig
    raise unless path.length.even?

    scaffold = Scaffold.new
    scaffold.circular = path.circular
    previous_probe = nil
    path.each_with_index do |probe, i|
      # Odd positions are the exit side of a contig: remember the probe so
      # the gap to the following contig can be looked up
      if i.odd?
        previous_probe = probe
        next
      end

      contig = UnscaffoldedContig.new
      contig.sequence_index = probe.sequence_index
      # entering at the start means the contig is used in its given direction
      contig.direction = (probe.side == :start)

      scaffold.contigs ||= []
      unless scaffold.contigs.empty?
        dummy_conn = Connection.new
        dummy_conn.probe1 = previous_probe
        dummy_conn.probe2 = probe
        original_connection = @connection_hash[dummy_conn.to_settable]
        scaffold.gap_lengths.push original_connection.distance
      end
      scaffold.contigs.push contig
      scaffolded_contigs << probe.sequence_index
    end
    built_scaffolds.push scaffold
  end

  # for each contig that is not in a scaffold, add as singleton
  @sequence_ids.each do |i|
    next if scaffolded_contigs.include?(i)

    contig = UnscaffoldedContig.new
    contig.sequence_index = i
    contig.direction = true

    scaff = Scaffold.new
    scaff.contigs = [contig]
    scaff.circular = circular_single_contigs.include?(i)
    built_scaffolds.push scaff
  end

  return built_scaffolds
end
|
|
212
|
+
|
|
213
|
+
# Return an Array of Probe objects, one for each contig end (:start and :end
# of every sequence ID given at initialization) that does not appear in any
# connection.
#
# Assumes the sequence_ids given in the initialize
# are the same as the sequence_index of the probes.
def unconnected_probes
  observed_ends = Set.new
  connections.each do |conn|
    [conn.probe1, conn.probe2].each { |probe| observed_ends << probe.to_settable }
  end

  unobserved = []
  @sequence_ids.each do |index|
    [:start, :end].each do |side|
      candidate = Probe.new
      candidate.sequence_index = index
      candidate.side = side
      unobserved.push candidate unless observed_ends.include?(candidate.to_settable)
    end
  end
  unobserved
end
|
|
234
|
+
|
|
235
|
+
# Return an Array of sequence indices (taken from the sequence IDs given at
# initialization) that did not have any connections to any others.
def unconnected_sequences
  observed_sequences = Set.new
  connections.each do |conn|
    observed_sequences << conn.probe1.sequence_index << conn.probe2.sequence_index
  end
  @sequence_ids.to_a - observed_sequences.to_a
end
|
|
245
|
+
|
|
246
|
+
# A connection between two contig ends
class Connection
  # Probe objects at either end of the connection
  attr_accessor :probe1, :probe2

  attr_accessor :distance

  def to_s
    "#{[@probe1, @probe2].join('/')}:#{@distance}"
  end

  # Return a flat Array of [sequence_index, side, sequence_index, side] that
  # is the same whichever way around probe1 and probe2 are assigned, so it
  # can be used as a Hash key or Set member. The probe with the lower
  # sequence_index comes first; for the same sequence_index, the probe with
  # the lower side comes first.
  def to_settable
    ordered = probe1_first? ? [@probe1, @probe2] : [@probe2, @probe1]
    ordered.collect { |probe| probe.to_settable }.flatten
  end

  private

  # Whether probe1 sorts strictly before probe2
  def probe1_first?
    return true if @probe1.sequence_index < @probe2.sequence_index
    return false unless @probe1.sequence_index == @probe2.sequence_index
    @probe1.side < @probe2.side
  end
end
|
|
270
|
+
|
|
271
|
+
# One end of a sequence
class Probe
  # :start or :end
  attr_accessor :side
  # ID of the underlying sequence as an Integer
  attr_accessor :sequence_index

  # settable_representation: optional [sequence_index, side] Array, as
  # returned by #to_settable
  def initialize(settable_representation=nil)
    return if settable_representation.nil?
    @sequence_index = settable_representation[0]
    @side = settable_representation[1]
  end

  # Return [sequence_index, side], suitable as a Hash key or Set member
  def to_settable
    [@sequence_index, @side]
  end

  # e.g. "3s" for the start of sequence 3, "3e" otherwise
  def to_s
    suffix = (@side == :start) ? 's' : 'e'
    "#{@sequence_index}#{suffix}"
  end

  # Return a probe representing the other side of the contig
  def companion
    other = Probe.new
    other.sequence_index = @sequence_index
    other.side = (@side == :start) ? :end : :start
    other
  end
end
|
|
299
|
+
|
|
300
|
+
# An ordered series of contigs separated by gaps
class Scaffold
  # contigs: Array of contig objects (each responding to sequence_index and direction)
  # gap_lengths: Array of gap lengths, one between each consecutive pair of contigs
  attr_accessor :contigs, :gap_lengths

  attr_accessor :circular

  def initialize
    @contigs = []
    @gap_lengths = []
  end

  def circular?
    @circular
  end

  # Return the nucleotide sequence of the scaffold as a String, with each
  # gap rendered as a run of 'N' characters of the gap's length.
  #
  # sequence_id_to_nucleotides_hash: maps each contig's sequence_index to
  #   its nucleotide String
  #
  # Raises a RuntimeError unless there is exactly one more contig than
  # there are gaps, or when a contig's direction is neither true nor false.
  def sequence(sequence_id_to_nucleotides_hash)
    raise "Programming error" unless @contigs.length == @gap_lengths.length + 1

    parts = [oriented_sequence(@contigs[0], sequence_id_to_nucleotides_hash)]
    @gap_lengths.each_with_index do |gap_length, i|
      parts.push 'N'*gap_length
      parts.push oriented_sequence(@contigs[i+1], sequence_id_to_nucleotides_hash)
    end
    parts.join('')
  end

  private

  # Sequence of a single contig: as given when its direction is true,
  # reverse complemented (and upper-cased) when its direction is false
  def oriented_sequence(contig, sequence_id_to_nucleotides_hash)
    seq = sequence_id_to_nucleotides_hash[contig.sequence_index]
    return seq if contig.direction == true
    return Bio::Sequence::NA.new(seq).reverse_complement.to_s.upcase if contig.direction == false
    raise "Programming error"
  end
end
|
|
337
|
+
|
|
338
|
+
# A contig together with the direction in which it is used
class UnscaffoldedContig
  # ID of the underlying sequence
  attr_accessor :sequence_index
  # true or false
  attr_accessor :direction
end
|
|
341
|
+
|
|
342
|
+
# An Array that additionally records whether its contents form a circle
class PossiblyCircularArray < Array
  attr_accessor :circular

  # array: the elements to hold
  # circular: stored as-is and returned by #circular
  def initialize(array, circular)
    super(array)
    @circular = circular
  end
end
|
|
350
|
+
end
|
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
require 'bio'
|
|
2
|
+
|
|
3
|
+
class Bio::Velvet::Graph::NodedRead
  # Return the read's offset as-is when the read's direction is true, or
  # the parent node's length minus that offset when the direction is false.
  #
  # Raises a RuntimeError when the direction is neither true nor false.
  def adjusted_position(parent_node)
    return @offset_from_start_of_node if @direction == true
    return parent_node.length - @offset_from_start_of_node if @direction == false
    raise "programming error"
  end
end
|
|
14
|
+
|
|
15
|
+
module Bio
|
|
16
|
+
module AssemblyGraphAlgorithms
|
|
17
|
+
class ContigPrinter
|
|
18
|
+
include Bio::FinishM::Logging
|
|
19
|
+
|
|
20
|
+
# One or more paths through the graph between two contigs, along with
# where the probe reads that anchor those paths lie.
class AnchoredConnection
  # The noded reads of the probes in the velvet assembly graph
  attr_accessor :start_probe_noded_read
  attr_accessor :end_probe_noded_read

  # number of nucleotides between the start of the start probe read and the end of the first contig
  attr_accessor :start_probe_contig_offset

  # number of nucleotides until the end of the end probe read in the start of the second contig
  attr_accessor :end_probe_contig_offset

  # Length of the start and end probe sequences
  attr_accessor :start_probe_read_length, :end_probe_read_length

  # Enumerable of Enumerables of OrientedNode objects, each list of OrientedNode objects
  # corresponds to a path that forms the connection
  attr_accessor :paths
end
|
|
38
|
+
|
|
39
|
+
# Given two contigs, return a consensus path and variants of the path.
#
# ----------> <-------- start and end probes (ends of probe sequences may not form part of final path). Directions not variable.
# --------------------->NNNN-------------------> original sequence to be gapfilled (contig1, NNNN, contig2). Directions not variable
# ----------- -------> path across the gap. Direction not variable
#            \       /
#             -------
# ---------->|<-----|----->|---------> nodes that make up the path (directions and boundaries variable)
# stage1| stage2 |stage3 stages of sequence construction in this method
#
# Much like one_connection_between_two_contigs except can handle multiple connections
# (but cannot handle 0 connections)
#
# contig1, contig2: nucleotide Strings of the two contigs
# anchored_connection: AnchoredConnection describing the probes and path(s)
# sequences: indexable by read ID, giving the sequence of each probe read
#
# Returns [joined_sequence, variants], where the positions of the variants
# have been adjusted to be relative to the joined sequence.
def ready_two_contigs_and_connections(graph, contig1, anchored_connection, contig2, sequences)
  log.debug "Working with anchored_connection: #{anchored_connection.inspect}" if log.debug?

  # Stage1 - contig1 before the path begins
  start_trim = anchored_connection.start_probe_contig_offset
  # 0 is a special case because negative 0 doesn't make sense as an end index
  to_return = start_trim == 0 ? contig1 : contig1[0...-start_trim]
  log.debug "After first chunk of sequence added, sequence is #{to_return.length}bp long" if log.debug?

  # Stage2 - path sequence, beginning and ending with
  # beginning and ending probes
  example_path = anchored_connection.paths[0]
  path_sequence, variants = sequences_to_variants_conservative(
    anchored_connection.paths.collect { |path| path.sequence }
  )
  log.debug "Reference path has a sequence length #{path_sequence.length}" if log.debug?

  # Find start index
  begin_onode = example_path[0]
  begin_noded_read = anchored_connection.start_probe_noded_read
  raise if begin_noded_read.nil?
  extra_bit_on_start = ''
  if begin_noded_read.start_coord != 0
    log.warn "Unexpectedly the start of the start probe not did not form part of the path, which is a little suspicious"
    extra_bit_on_start = sequences[begin_noded_read.read_id][0...begin_noded_read.start_coord]
  end
  # xor read direction on node, and node direction on path
  if (begin_noded_read.direction == true) ^ begin_onode.starts_at_start?
    offset_of_begin_probe_on_path = begin_onode.node.corresponding_contig_length - begin_noded_read.offset_from_start_of_node
    # extra bit on read needs to be reverse complemented
    extra_bit_on_start = Bio::Sequence::NA.new(extra_bit_on_start).reverse_complement.to_s.upcase unless extra_bit_on_start == ''
  else
    offset_of_begin_probe_on_path = begin_noded_read.offset_from_start_of_node
  end

  # Correct variants' positions to be relative to the full contig,
  # not just the path sequence
  variants.each do |variant|
    variant.position = variant.position - offset_of_begin_probe_on_path + to_return.length + 1
  end

  # Find end index
  end_onode = example_path[-1]
  end_noded_read = anchored_connection.end_probe_noded_read
  raise if end_noded_read.nil?
  extra_bit_on_end = ''
  if end_noded_read.start_coord != 0
    log.warn "Unexpectedly the end of the end probe not did not form part of the path, which is a little suspicious"
    extra_bit_on_end = sequences[end_noded_read.read_id][0...end_noded_read.start_coord]
  end
  # TODO: Potentially the example_path has a different length than the
  # reference sequence in bp, in which case this offset may be wrong.
  offset_of_end_node_on_path = example_path[0...-1].reduce(0) { |sum, onode| sum + onode.node.length_alone }
  if (end_noded_read.direction == false) ^ end_onode.starts_at_start?
    offset_of_end_node_on_path += end_noded_read.offset_from_start_of_node
    extra_bit_on_end = Bio::Sequence::NA.new(extra_bit_on_end).reverse_complement.to_s.upcase unless extra_bit_on_end == ''
  else
    offset_of_end_node_on_path += end_onode.node.corresponding_contig_length - end_noded_read.offset_from_start_of_node
  end

  log.debug "Found start index #{offset_of_begin_probe_on_path} and end index #{offset_of_end_node_on_path}" if log.debug?
  # Non-mutating concatenation: to_return may still be the caller's contig1
  to_return += extra_bit_on_start +
    path_sequence[offset_of_begin_probe_on_path...offset_of_end_node_on_path] +
    extra_bit_on_end
  log.debug "After path chunk of sequence added, sequence is #{to_return.length}bp long" if log.debug?

  # Stage 3 - contig2 from where the path ends
  to_return += contig2[anchored_connection.end_probe_contig_offset..-1]
  log.debug "After last chunk of sequence added, sequence is #{to_return.length}bp long" if log.debug?

  return to_return, variants
end
|
|
132
|
+
|
|
133
|
+
# Like ready_two_contigs_and_connections except assumes that there is only a single
# connection between the two sides, and returns only the joined sequence.
#
# Raises a RuntimeError when the anchored connection has more than one path.
def one_connection_between_two_contigs(graph, contig1, anchored_connection, contig2, sequences)
  if anchored_connection.paths.length > 1
    raise "programming error: only one path expected here"
  end
  joined_and_variants = ready_two_contigs_and_connections(graph, contig1, anchored_connection, contig2, sequences)
  joined_and_variants[0]
end
|
|
139
|
+
|
|
140
|
+
private
|
|
141
|
+
# Given an Array of sequences (each representing a path), do a MSA and return
# [reference_sequence, variants], where the reference is a sequence that is
# definitely true and the variants describe how the paths differ from it.
#
# In the reference, a column where all aligned paths agree takes that base.
# Elsewhere it takes '-' when at least half of the paths have a gap in that
# column, and 'N' otherwise. A single input sequence is returned as-is with
# no variants (and without running an alignment).
def sequences_to_variants_conservative(sequences)
  # No variants when there is nothing to compare against
  return sequences[0], [] if sequences.length == 1

  # Run multiple sequence alignment of each sequence, with the reference sequence first
  log.debug "Running MSA with #{sequences.length} sequences.." if log.debug?
  original_alignments = clustalo(sequences)
  log.debug "Finished running MSA" if log.debug?
  if log.debug?
    log.debug "Alignment found was:"
    original_alignments.each { |align| log.debug align }
  end

  # Work out reference path, one alignment column at a time
  ref = []
  original_alignments[0].length.times do |i|
    column = original_alignments.collect { |aln| aln[i] }
    distinct_bases = column.uniq

    if distinct_bases.length == 1
      # where all paths agree, use that base
      ref.push distinct_bases[0]
    else
      # otherwise use - or N, depending on how many things have a base at each position.
      num_gaps = column.count('-')
      if num_gaps == 0 or num_gaps < column.length.to_f / 2
        ref.push 'N'
      else
        ref.push '-'
      end
    end
  end

  # return reference path, and variants
  reference_sequence = ref.join('')
  return reference_sequence, alignment_to_variants(reference_sequence, original_alignments)
end
|
|
189
|
+
|
|
190
|
+
# Given a MSA (as a single reference and an array of alternates),
# return a condensed set of variants.
#
# reference_alignment: the aligned reference String ('-' marks a gap)
# alternate_sequences_alignment: Array of aligned Strings to compare to it
#
# Each column where an alternate differs from the reference yields a
# single-column Variant: an INSERT when the reference has a gap, a DELETION
# of length 1 when the alternate has a gap, and a SWAP otherwise. Positions
# count reference bases only (gap columns do not advance the position).
def alignment_to_variants(reference_alignment, alternate_sequences_alignment)
  return [] if alternate_sequences_alignment.empty?

  # Collect the variants at each sequence at each column. Every sequence
  # gets its own (initially empty) Array so that a sequence identical to the
  # reference does not leave a nil entry behind.
  variants = Array.new(alternate_sequences_alignment.length) { [] }
  reference_position = 0
  reference_alignment.each_char.with_index do |ref_base, column|
    alternate_sequences_alignment.each_with_index do |alignment, sequence_id|
      nonref = alignment[column]
      next if nonref == ref_base

      variant =
        if ref_base == '-'
          Variant.new reference_position, nonref, Variant::INSERT
        elsif nonref == '-'
          Variant.new reference_position, 1, Variant::DELETION
        else
          Variant.new reference_position, nonref, Variant::SWAP
        end
      variants[sequence_id].push variant
    end
    reference_position += 1 unless ref_base == '-'
  end

  # Condense the single column, single species variants into a condensed set
  return condense_variants!(variants)
end
|
|
222
|
+
|
|
223
|
+
# Sometimes several paths will contain the same variant. Remove these duplications.
#
# variant_array_of_arrays: Array with one entry per sequence, each an Array
#   of single-column Variant objects in column order. A nil entry is treated
#   as a sequence without variants.
#
# Within each sequence, runs of adjacent variants of the same type are first
# merged into one (the first Variant of each run is modified in place). The
# merged variants are then de-duplicated across sequences on position,
# sequence, deletion length and type.
#
# Returns an Array of the distinct Variant objects.
def condense_variants!(variant_array_of_arrays)
  all_variants = {}

  variant_array_of_arrays.each_with_index do |variant_array, i|
    last_variant = nil
    current_variants = []
    (variant_array || []).each do |variant|
      # Combine last_variant and this one if
      # their positions are consecutive and their types are the same
      merged =
        if last_variant.nil? || last_variant.type != variant.type
          false
        elsif variant.type == Variant::INSERT && last_variant.position == variant.position
          last_variant.sequence += variant.sequence
          true
        elsif variant.type == Variant::DELETION && last_variant.position == variant.position - last_variant.deletion_length
          last_variant.deletion_length += 1
          true
        elsif variant.type == Variant::SWAP && last_variant.position + last_variant.sequence.length == variant.position
          last_variant.sequence += variant.sequence
          true
        else
          false
        end

      # Start a new variant
      unless merged
        last_variant = variant
        current_variants.push variant
      end
    end

    if log.debug?
      log.debug "Found #{current_variants.length} variants in sequence #{i}:"
      current_variants.each do |variant|
        log.debug variant.to_shorthand
      end
    end

    # Multiple paths can have the same variant. Don't duplicate
    current_variants.each do |variant|
      key = [
        variant.position,
        variant.sequence,
        variant.deletion_length,
        variant.type
      ]
      all_variants[key] ||= variant
    end
  end

  return all_variants.values
end
|
|
275
|
+
|
|
276
|
+
# # Given an Enumerable of nucleic acid sequences, align them with MAFFT,
|
|
277
|
+
# # and return an Array of the same size as the input
|
|
278
|
+
# def mafft(sequences)
|
|
279
|
+
# i = 0
|
|
280
|
+
# stdin = sequences.collect{|s| i+=1; ">#{i}\n#{s}\n"}.join('')
|
|
281
|
+
# stdout = Bio::Commandeer.run "mafft --retree 1 --quiet --nuc /dev/stdin", {:stdin => stdin, :log => log}
|
|
282
|
+
# to_return = []
|
|
283
|
+
# header = true
|
|
284
|
+
# stdout.each_line do |line|
|
|
285
|
+
# if !header
|
|
286
|
+
# to_return.push line.strip
|
|
287
|
+
# end
|
|
288
|
+
# header = !header
|
|
289
|
+
# end
|
|
290
|
+
# return to_return
|
|
291
|
+
# end
|
|
292
|
+
|
|
293
|
+
# Given an Enumerable of nucleic acid sequences, align them by running the
# clustalo command, and return an Array of the aligned sequences as Strings,
# in the order they are read from the command's output (clustalo is asked to
# keep the input order).
def clustalo(sequences)
  # FASTA input, with sequences named 1, 2, 3.. in input order
  stdin = sequences.each_with_index.collect { |s, index| ">#{index+1}\n#{s}\n" }.join('')
  log.info "Running clustalo with #{sequences.length} sequences, specifically: #{stdin}" #if log.debug?
  stdout = Bio::Commandeer.run "clustalo -t DNA -i - --output-order=input-order", {:stdin => stdin, :log => log}

  to_return = []
  Bio::FlatFile.foreach(Bio::FastaFormat, StringIO.new(stdout)) do |seq|
    to_return.push seq.seq.to_s
  end
  return to_return
end
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# A difference between a path and the reference sequence
class Variant
  #Types:
  INSERT = :insert
  DELETION = :deletion
  SWAP = :swap #n bases swapped for another n bases

  attr_accessor :reference_name

  # 0-based position on the contig
  attr_accessor :position

  # sequence (or nil if variant is a deletion)
  attr_accessor :sequence

  # length of deletion (or nil if not a deletion)
  attr_accessor :deletion_length

  # See constants in this class
  attr_accessor :type

  # The second argument is taken as the deletion length when type is
  # DELETION, and as the variant's sequence otherwise.
  def initialize(position=nil, sequence_or_deletion_length=nil, type=nil)
    @position = position
    @type = type
    if type == DELETION
      @deletion_length = sequence_or_deletion_length
    else
      @sequence = sequence_or_deletion_length
    end
  end

  # 1-based position
  def base_number
    @position+1
  end

  # Short description e.g. "3S:GG", "1D:2" or "5I:T". Raises a RuntimeError
  # when the type is not one of the constants in this class.
  def to_shorthand
    case type
    when DELETION then "#{base_number}D:#{deletion_length}"
    when SWAP then "#{base_number}S:#{sequence.upcase}"
    when INSERT then "#{base_number}I:#{sequence.upcase}"
    else raise
    end
  end

  # The reference sequence has been reverse complemented. Fix this
  # variant so it makes sense again (position aside)
  def reverse!
    @sequence = Bio::Sequence::NA.new(@sequence).reverse_complement.to_s.upcase if [SWAP, INSERT].include?(type)
  end

  # Return the variant as an Array of VCF columns:
  #CHROM POS ID REF ALT QUAL FILTER INFO
  #
  # reference_sequence: String the REF column is sliced from (for swaps
  # and deletions)
  def vcf_array(reference_sequence)
    bits = [@reference_name, @position+1, '.']

    ref_and_alt =
      case type
      when SWAP then [reference_sequence[@position...(@position+@sequence.length)], @sequence]
      when INSERT then ['.', @sequence]
      when DELETION then [reference_sequence[@position...(@position+@deletion_length)], '.']
      else raise
      end

    bits.concat ref_and_alt
    bits.push '20', 'PASS', 'finishm'
    bits
  end

  # Return the variant as a tab-separated VCF line
  def vcf(reference_sequence)
    vcf_array(reference_sequence).join("\t")
  end
end
|
|
394
|
+
|
|
395
|
+
# Holder for a reference path and its variants
class PrintableConnection
  attr_accessor :reference_path
  attr_accessor :variants
end
|
|
398
|
+
end
|
|
399
|
+
end
|
|
400
|
+
end
|