finishm 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitmodules +3 -0
- data/.rspec +1 -0
- data/Gemfile +31 -0
- data/LICENSE.txt +20 -0
- data/README.md +59 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/assembly_visualiser +106 -0
- data/bin/check_primer_combinations.rb +73 -0
- data/bin/contig_joiner.rb +244 -0
- data/bin/contigs_against_assembly.rb +153 -0
- data/bin/finishm +143 -0
- data/bin/finishm_assembler +55 -0
- data/bin/finishm_gap_closer.rb +241 -0
- data/bin/kmer_abundance_file_tool.rb +49 -0
- data/bin/kmer_pattern_to_assembly.rb +377 -0
- data/bin/kmer_profile_finder.rb +92 -0
- data/bin/kmers_count_parse.d +52 -0
- data/bin/kmers_count_tabulate.d +123 -0
- data/bin/kmers_count_tabulate.rb +84 -0
- data/bin/pcr_result_parser.rb +108 -0
- data/bin/primer_finder.rb +119 -0
- data/bin/read_selection_by_kmer.d +174 -0
- data/bin/scaffold_by_pattern.rb +119 -0
- data/bin/scaffold_connection_possibilities_to_knowns.rb +193 -0
- data/bin/scaffold_end_coverages.rb +69 -0
- data/bin/trail_validator.rb +84 -0
- data/ext/mkrf_conf.rb +56 -0
- data/ext/src/Makefile +140 -0
- data/ext/src/src/allocArray.c +305 -0
- data/ext/src/src/allocArray.h +86 -0
- data/ext/src/src/autoOpen.c +107 -0
- data/ext/src/src/autoOpen.h +18 -0
- data/ext/src/src/binarySequences.c +813 -0
- data/ext/src/src/binarySequences.h +125 -0
- data/ext/src/src/concatenatedGraph.c +233 -0
- data/ext/src/src/concatenatedGraph.h +30 -0
- data/ext/src/src/concatenatedPreGraph.c +262 -0
- data/ext/src/src/concatenatedPreGraph.h +29 -0
- data/ext/src/src/correctedGraph.c +2643 -0
- data/ext/src/src/correctedGraph.h +32 -0
- data/ext/src/src/dfib.c +509 -0
- data/ext/src/src/dfib.h +69 -0
- data/ext/src/src/dfibHeap.c +89 -0
- data/ext/src/src/dfibHeap.h +39 -0
- data/ext/src/src/dfibpriv.h +105 -0
- data/ext/src/src/fib.c +628 -0
- data/ext/src/src/fib.h +78 -0
- data/ext/src/src/fibHeap.c +79 -0
- data/ext/src/src/fibHeap.h +41 -0
- data/ext/src/src/fibpriv.h +110 -0
- data/ext/src/src/globals.h +154 -0
- data/ext/src/src/graph.c +3932 -0
- data/ext/src/src/graph.h +233 -0
- data/ext/src/src/graphReConstruction.c +1472 -0
- data/ext/src/src/graphReConstruction.h +30 -0
- data/ext/src/src/graphStats.c +2167 -0
- data/ext/src/src/graphStats.h +72 -0
- data/ext/src/src/graphStructures.h +52 -0
- data/ext/src/src/kmer.c +652 -0
- data/ext/src/src/kmer.h +73 -0
- data/ext/src/src/kmerOccurenceTable.c +236 -0
- data/ext/src/src/kmerOccurenceTable.h +44 -0
- data/ext/src/src/kseq.h +223 -0
- data/ext/src/src/locallyCorrectedGraph.c +557 -0
- data/ext/src/src/locallyCorrectedGraph.h +40 -0
- data/ext/src/src/passageMarker.c +677 -0
- data/ext/src/src/passageMarker.h +137 -0
- data/ext/src/src/preGraph.c +1717 -0
- data/ext/src/src/preGraph.h +106 -0
- data/ext/src/src/preGraphConstruction.c +990 -0
- data/ext/src/src/preGraphConstruction.h +26 -0
- data/ext/src/src/probe_node_finder.c +84 -0
- data/ext/src/src/probe_node_finder.h +6 -0
- data/ext/src/src/readCoherentGraph.c +557 -0
- data/ext/src/src/readCoherentGraph.h +30 -0
- data/ext/src/src/readSet.c +1734 -0
- data/ext/src/src/readSet.h +67 -0
- data/ext/src/src/readToNode.c +218 -0
- data/ext/src/src/readToNode.h +35 -0
- data/ext/src/src/recycleBin.c +199 -0
- data/ext/src/src/recycleBin.h +58 -0
- data/ext/src/src/roadMap.c +342 -0
- data/ext/src/src/roadMap.h +65 -0
- data/ext/src/src/run.c +318 -0
- data/ext/src/src/run.h +52 -0
- data/ext/src/src/run2.c +744 -0
- data/ext/src/src/runReadToNode.c +29 -0
- data/ext/src/src/scaffold.c +1876 -0
- data/ext/src/src/scaffold.h +64 -0
- data/ext/src/src/shortReadPairs.c +1243 -0
- data/ext/src/src/shortReadPairs.h +32 -0
- data/ext/src/src/splay.c +259 -0
- data/ext/src/src/splay.h +43 -0
- data/ext/src/src/splayTable.c +1315 -0
- data/ext/src/src/splayTable.h +31 -0
- data/ext/src/src/tightString.c +362 -0
- data/ext/src/src/tightString.h +82 -0
- data/ext/src/src/utility.c +199 -0
- data/ext/src/src/utility.h +98 -0
- data/ext/src/third-party/zlib-1.2.3/ChangeLog +855 -0
- data/ext/src/third-party/zlib-1.2.3/FAQ +339 -0
- data/ext/src/third-party/zlib-1.2.3/INDEX +51 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile +154 -0
- data/ext/src/third-party/zlib-1.2.3/Makefile.in +154 -0
- data/ext/src/third-party/zlib-1.2.3/README +125 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.c +149 -0
- data/ext/src/third-party/zlib-1.2.3/adler32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/algorithm.txt +209 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.pup +66 -0
- data/ext/src/third-party/zlib-1.2.3/amiga/Makefile.sas +65 -0
- data/ext/src/third-party/zlib-1.2.3/as400/bndsrc +132 -0
- data/ext/src/third-party/zlib-1.2.3/as400/compile.clp +123 -0
- data/ext/src/third-party/zlib-1.2.3/as400/readme.txt +111 -0
- data/ext/src/third-party/zlib-1.2.3/as400/zlib.inc +331 -0
- data/ext/src/third-party/zlib-1.2.3/compress.c +79 -0
- data/ext/src/third-party/zlib-1.2.3/compress.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/configure +459 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/README.contrib +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/buffer_demo.adb +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/mtest.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/read.adb +156 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/readme.txt +65 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/test.adb +463 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.adb +225 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-streams.ads +114 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.adb +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib-thin.ads +450 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.adb +701 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.ads +328 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/ada/zlib.gpr +20 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/README.586 +43 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm586/match.S +364 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/README.686 +34 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/asm686/match.S +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/README +4 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.c +444 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/blast.h +71 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.pk +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/blast/test.txt +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLib.pas +557 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/ZLibConst.pas +11 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/delphi/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.build +33 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.chm +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib.sln +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/AssemblyInfo.cs +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/ChecksumImpl.cs +202 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CircularBuffer.cs +83 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/CodecBase.cs +198 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Deflater.cs +106 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.cs +288 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/DotZLib.csproj +141 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/GZipStream.cs +301 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/Inflater.cs +105 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/DotZLib/UnitTests.cs +274 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/LICENSE_1_0.txt +23 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/dotzlib/readme.txt +58 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/README +1 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.c +608 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/infback9.h +37 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inffix9.h +107 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inflate9.h +47 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.c +323 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/infback9/inftree9.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffas86.c +1157 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/inflate86/inffast.S +1368 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/test.cpp +24 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.cpp +329 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream/zfstream.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream.h +307 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream2/zstream_test.cpp +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/README +35 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/TODO +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/test.cc +50 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.cc +479 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/iostream3/zfstream.h +466 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masm686/match.asm +413 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/bld_ml64.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.asm +513 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/gvmat64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffas8664.c +186 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.asm +392 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/inffasx64.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx64/readme.txt +28 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/bld_ml32.bat +2 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.asm +972 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/gvmat32c.c +62 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.asm +1083 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/inffas32.obj +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/mkasm.bat +3 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/masmx86/readme.txt +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ChangeLogUnzip +67 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/Makefile +25 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/crypt.h +132 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.c +177 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/ioapi.h +75 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.c +270 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/iowin32.h +21 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/miniunz.c +585 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/minizip.c +420 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.c +281 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/mztools.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.c +1598 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/unzip.h +354 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/minizip/zip.h +235 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/example.pas +599 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/readme.txt +76 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibd32.mak +93 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/pascal/zlibpas.pas +236 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/Makefile +8 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/README +63 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.c +837 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/puff.h +31 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/puff/zeros.raw +0 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.c +275 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/testzlib/testzlib.txt +10 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile +14 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/Makefile.msc +17 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/untgz/untgz.c +674 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/readme.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/miniunz.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/minizip.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/testzlib.vcproj +126 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibstat.vcproj +246 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.sln +78 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc7/zlibvc.vcproj +445 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/miniunz.vcproj +566 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/minizip.vcproj +563 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlib.vcproj +948 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/testzlibdll.vcproj +567 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlib.rc +32 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibstat.vcproj +870 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.def +92 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.sln +144 -0
- data/ext/src/third-party/zlib-1.2.3/contrib/vstudio/vc8/zlibvc.vcproj +1219 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.c +423 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.h +441 -0
- data/ext/src/third-party/zlib-1.2.3/crc32.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.c +1736 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.h +331 -0
- data/ext/src/third-party/zlib-1.2.3/deflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/example +0 -0
- data/ext/src/third-party/zlib-1.2.3/example.c +565 -0
- data/ext/src/third-party/zlib-1.2.3/examples/README.examples +42 -0
- data/ext/src/third-party/zlib-1.2.3/examples/fitblk.c +233 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gun.c +693 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzappend.c +500 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzjoin.c +448 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.c +413 -0
- data/ext/src/third-party/zlib-1.2.3/examples/gzlog.h +58 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zlib_how.html +523 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zpipe.c +191 -0
- data/ext/src/third-party/zlib-1.2.3/examples/zran.c +404 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.c +1026 -0
- data/ext/src/third-party/zlib-1.2.3/gzio.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/infback.c +623 -0
- data/ext/src/third-party/zlib-1.2.3/infback.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.h +11 -0
- data/ext/src/third-party/zlib-1.2.3/inffast.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inffixed.h +94 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.c +1368 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.h +115 -0
- data/ext/src/third-party/zlib-1.2.3/inflate.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.c +329 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.h +55 -0
- data/ext/src/third-party/zlib-1.2.3/inftrees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/libz.a +0 -0
- data/ext/src/third-party/zlib-1.2.3/make_vms.com +461 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip +0 -0
- data/ext/src/third-party/zlib-1.2.3/minigzip.c +322 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.bor +109 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.dj2 +104 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.msc +106 -0
- data/ext/src/third-party/zlib-1.2.3/msdos/Makefile.tc +94 -0
- data/ext/src/third-party/zlib-1.2.3/old/Makefile.riscos +151 -0
- data/ext/src/third-party/zlib-1.2.3/old/README +3 -0
- data/ext/src/third-party/zlib-1.2.3/old/descrip.mms +48 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/Makefile.os2 +136 -0
- data/ext/src/third-party/zlib-1.2.3/old/os2/zlib.def +51 -0
- data/ext/src/third-party/zlib-1.2.3/old/visual-basic.txt +160 -0
- data/ext/src/third-party/zlib-1.2.3/old/zlib.html +971 -0
- data/ext/src/third-party/zlib-1.2.3/projects/README.projects +41 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/README.txt +73 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/example.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/minigzip.dsp +278 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsp +609 -0
- data/ext/src/third-party/zlib-1.2.3/projects/visualc6/zlib.dsw +59 -0
- data/ext/src/third-party/zlib-1.2.3/qnx/package.qpg +141 -0
- data/ext/src/third-party/zlib-1.2.3/trees.c +1219 -0
- data/ext/src/third-party/zlib-1.2.3/trees.h +128 -0
- data/ext/src/third-party/zlib-1.2.3/trees.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.c +61 -0
- data/ext/src/third-party/zlib-1.2.3/uncompr.o +0 -0
- data/ext/src/third-party/zlib-1.2.3/win32/DLL_FAQ.txt +397 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.bor +107 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.emx +69 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.gcc +141 -0
- data/ext/src/third-party/zlib-1.2.3/win32/Makefile.msc +126 -0
- data/ext/src/third-party/zlib-1.2.3/win32/VisualC.txt +3 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib.def +60 -0
- data/ext/src/third-party/zlib-1.2.3/win32/zlib1.rc +39 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zconf.in.h +332 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.3 +159 -0
- data/ext/src/third-party/zlib-1.2.3/zlib.h +1357 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.c +318 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.h +269 -0
- data/ext/src/third-party/zlib-1.2.3/zutil.o +0 -0
- data/lib/assembly/a_b_visualiser.rb +169 -0
- data/lib/assembly/acyclic_connection_finder.rb +81 -0
- data/lib/assembly/all_orfs.rb +615 -0
- data/lib/assembly/bad_format_writer.rb +46 -0
- data/lib/assembly/bam_probe_read_selector.rb +48 -0
- data/lib/assembly/bubbly_assembler.rb +842 -0
- data/lib/assembly/c_probe_node_finder.rb +38 -0
- data/lib/assembly/connection_interpreter.rb +350 -0
- data/lib/assembly/contig_printer.rb +400 -0
- data/lib/assembly/coverage_based_graph_filter.rb +68 -0
- data/lib/assembly/depth_first_search.rb +63 -0
- data/lib/assembly/dijkstra.rb +216 -0
- data/lib/assembly/fluffer.rb +253 -0
- data/lib/assembly/graph_explorer.rb +85 -0
- data/lib/assembly/graph_generator.rb +315 -0
- data/lib/assembly/height_finder.rb +355 -0
- data/lib/assembly/hybrid_velvet_graph.rb +70 -0
- data/lib/assembly/input_genome.rb +182 -0
- data/lib/assembly/kmer_coverage_based_path_filter.rb +65 -0
- data/lib/assembly/node_finder.rb +171 -0
- data/lib/assembly/oriented_node_trail.rb +507 -0
- data/lib/assembly/paired_end_assembler.rb +53 -0
- data/lib/assembly/paired_end_neighbour_finder.rb +176 -0
- data/lib/assembly/probed_graph.rb +105 -0
- data/lib/assembly/read_input.rb +79 -0
- data/lib/assembly/read_to_node.rb +37 -0
- data/lib/assembly/scaffold_breaker.rb +126 -0
- data/lib/assembly/sequence_hasher.rb +71 -0
- data/lib/assembly/single_coherent_paths_between_nodes.rb +533 -0
- data/lib/assembly/single_coherent_wanderer.rb +261 -0
- data/lib/assembly/single_ended_assembler.rb +441 -0
- data/lib/assembly/velvet_c_binding.rb +54 -0
- data/lib/assembly/velvet_graph_sequence_extractor.rb +123 -0
- data/lib/external/VERSION +1 -0
- data/lib/finishm/assemble.rb +224 -0
- data/lib/finishm/explore.rb +217 -0
- data/lib/finishm/finisher.rb +303 -0
- data/lib/finishm/fluff.rb +122 -0
- data/lib/finishm/gapfiller.rb +325 -0
- data/lib/finishm/orfs_finder.rb +88 -0
- data/lib/finishm/path_counter.rb +90 -0
- data/lib/finishm/primers.rb +425 -0
- data/lib/finishm/primers_check.rb +176 -0
- data/lib/finishm/roundup.rb +344 -0
- data/lib/finishm/sequence.rb +142 -0
- data/lib/finishm/visualise.rb +430 -0
- data/lib/finishm/wander.rb +270 -0
- data/lib/kmer_abundance_pattern.rb +79 -0
- data/lib/kmer_multi_abundance_file.rb +48 -0
- data/lib/oligo_designer.rb +88 -0
- data/lib/priner.rb +66 -0
- data/spec/acyclic_connection_finder_spec.rb +551 -0
- data/spec/all_orfs_spec.rb +443 -0
- data/spec/assemble_spec.rb +186 -0
- data/spec/bubbly_assembler_spec.rb +707 -0
- data/spec/c_node_finder_spec.rb +58 -0
- data/spec/connection_interpreter_spec.rb +284 -0
- data/spec/contig_printer_spec.rb +291 -0
- data/spec/coverage_based_graph_filter_spec.rb +102 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam +0 -0
- data/spec/data/6_3e4e5e6e.1vANME.bam.bai +0 -0
- data/spec/data/acyclic_connection_finder/1/probes.fa +5 -0
- data/spec/data/acyclic_connection_finder/1/random1.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random1.sammy.fa.gz +0 -0
- data/spec/data/acyclic_connection_finder/1/random2.fa +2 -0
- data/spec/data/acyclic_connection_finder/1/random2.sammy.fa.gz +0 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/random3000.slightly_changed.fa +39 -0
- data/spec/data/assembly/1_simple_bubble_uneven_coverage/reads_combined.fa.gz +0 -0
- data/spec/data/assembly_visualiser/Contig_6_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Contig_7_1_to_250.fa.kmers31 +220 -0
- data/spec/data/assembly_visualiser/Graph +46 -0
- data/spec/data/assembly_visualiser/start_kmers1 +2 -0
- data/spec/data/bands.csv +1 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq +0 -0
- data/spec/data/c_probe_node_finder/1/CnyUnifiedSeq.names +544 -0
- data/spec/data/c_probe_node_finder/1/Graph2 +668 -0
- data/spec/data/c_probe_node_finder/1/LastGraph +668 -0
- data/spec/data/c_probe_node_finder/1/Log +756 -0
- data/spec/data/c_probe_node_finder/1/PreGraph +11 -0
- data/spec/data/c_probe_node_finder/1/Roadmaps +2009 -0
- data/spec/data/c_probe_node_finder/1/contigs.fa +29 -0
- data/spec/data/c_probe_node_finder/1/stats.txt +6 -0
- data/spec/data/contig_printer/1/HOWTO_RECREATE +17 -0
- data/spec/data/contig_printer/1/contigs.fa +4 -0
- data/spec/data/contig_printer/1/seq.fa +2408 -0
- data/spec/data/contig_printer/1/seq.fa.svg +153 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Graph2 +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/LastGraph +2953 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Log +21 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/PreGraph +27 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Roadmaps +5182 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/Sequences +3612 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/contigs.fa +36 -0
- data/spec/data/contig_printer/1/seq.fa.velvet/stats.txt +14 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam +0 -0
- data/spec/data/contig_printer/1/seq.faVseq2_1to550.fa.bam.bai +0 -0
- data/spec/data/contig_printer/1/seq.node12.fa +4 -0
- data/spec/data/contig_printer/1/seq1_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa +2 -0
- data/spec/data/contig_printer/1/seq2_1to550.fa.fai +1 -0
- data/spec/data/explore/1/2seqs.sammy.fa +12004 -0
- data/spec/data/explore/1/HOWTO_RECREATE.txt +6 -0
- data/spec/data/explore/1/a.fa +2 -0
- data/spec/data/explore/1/seq1_and_a.fa +3 -0
- data/spec/data/explore/1/seq2.fa +2 -0
- data/spec/data/fluff/1/2seqs.sammy.fa +12004 -0
- data/spec/data/fluff/1/HOWTO_RECREATE.txt +5 -0
- data/spec/data/fluff/1/seq1.fa +2 -0
- data/spec/data/fluff/1/seq2.fa +2 -0
- data/spec/data/gapfilling/1/reads.fa +171 -0
- data/spec/data/gapfilling/1/trail_with_Ns.fa +5 -0
- data/spec/data/gapfilling/1/velvetAssembly/Graph2 +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/LastGraph +130 -0
- data/spec/data/gapfilling/1/velvetAssembly/Log +199 -0
- data/spec/data/gapfilling/1/velvetAssembly/PreGraph +7 -0
- data/spec/data/gapfilling/1/velvetAssembly/Roadmaps +239 -0
- data/spec/data/gapfilling/1/velvetAssembly/Sequences +281 -0
- data/spec/data/gapfilling/1/velvetAssembly/contigs.fa +12 -0
- data/spec/data/gapfilling/1/velvetAssembly/stats.txt +4 -0
- data/spec/data/gapfilling/2/HOWTO_recreate +17 -0
- data/spec/data/gapfilling/2/reference.fa +2 -0
- data/spec/data/gapfilling/2/reference_part1.fa +4 -0
- data/spec/data/gapfilling/2/reference_part2.fa +4 -0
- data/spec/data/gapfilling/2/sammy_reads.fa.gz +0 -0
- data/spec/data/gapfilling/2/with_gaps.fa +4 -0
- data/spec/data/gapfilling/3/HOWTO_recreate +4 -0
- data/spec/data/gapfilling/3/reads.fa.gz +0 -0
- data/spec/data/gapfilling/3/reference_part1.fa +4 -0
- data/spec/data/gapfilling/3/reference_part2.fa +4 -0
- data/spec/data/gapfilling/3/with_gaps.fa +4 -0
- data/spec/data/gapfilling/4/HOWTO_recreate +1 -0
- data/spec/data/gapfilling/4/reads.fa.gz +0 -0
- data/spec/data/gapfilling/5/HOWTO_RECREATE +7 -0
- data/spec/data/gapfilling/5/answer.fna +2 -0
- data/spec/data/gapfilling/5/gappy.fna +2 -0
- data/spec/data/gapfilling/5/reads.fa +17961 -0
- data/spec/data/gapfilling/5/velvet51_3.5/LastGraph +8337 -0
- data/spec/data/gapfilling/5/velvet51_3.5/Sequences +20921 -0
- data/spec/data/gapfilling/6/random1.fa +28 -0
- data/spec/data/gapfilling/6/random2.fa +28 -0
- data/spec/data/gapfilling/6/random_sequence_length_2000 +0 -0
- data/spec/data/gapfilling/6/reads.random1.fa.gz +0 -0
- data/spec/data/gapfilling/6/reads.random2.fa.gz +0 -0
- data/spec/data/gapfilling/6/to_gapfill.fa +22 -0
- data/spec/data/kmer_profile_to_assembly/multiple_abundance_file1.csv +2 -0
- data/spec/data/kmers_count1.csv +2 -0
- data/spec/data/kmers_count2.csv +3 -0
- data/spec/data/out +3 -0
- data/spec/data/positive_latching_pair.fa +2 -0
- data/spec/data/primers.csv +4 -0
- data/spec/data/read_selection_by_kmer/blacklist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/input.fasta +6 -0
- data/spec/data/read_selection_by_kmer/whitelist1.txt +1 -0
- data/spec/data/read_selection_by_kmer/whitelist2.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/HOWTO_RECREATE.txt +2 -0
- data/spec/data/read_to_node/1_a_graph/LastGraph +6695 -0
- data/spec/data/read_to_node/1_a_graph/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/2_no_read256_or_259/HOWTO_RECREATE.txt +3 -0
- data/spec/data/read_to_node/2_no_read256_or_259/LastGraph +6693 -0
- data/spec/data/read_to_node/2_no_read256_or_259/ReadToNode.bin +0 -0
- data/spec/data/read_to_node/3_no_last_read/LastGraph +6694 -0
- data/spec/data/read_to_node/3_no_last_read/ReadToNode.bin +0 -0
- data/spec/data/t/details.txt +5 -0
- data/spec/data/t/details.txt.srt +5 -0
- data/spec/data/t/location.txt +3 -0
- data/spec/data/t/location.txt.srt +3 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/answer.fa +2 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/reads.fa.gz +0 -0
- data/spec/data/tweak/1_gap_then_unscaffolded/scaffolds.fa +6 -0
- data/spec/data/tweak/2_second_genome/answer2.fa +2 -0
- data/spec/data/tweak/2_second_genome/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/answer.fa +2 -0
- data/spec/data/tweak/3_variant/lesser_answer.fa +2 -0
- data/spec/data/tweak/3_variant/reads.fa.gz +0 -0
- data/spec/data/tweak/3_variant/with_gaps.fa +2 -0
- data/spec/data/velvet_test_trails/Assem/Graph +17 -0
- data/spec/data/velvet_test_trails/Assem/Graph2 +40 -0
- data/spec/data/velvet_test_trails/Assem/LastGraph +40 -0
- data/spec/data/velvet_test_trails/Assem/Log +35 -0
- data/spec/data/velvet_test_trails/Assem/PreGraph +9 -0
- data/spec/data/velvet_test_trails/Assem/Roadmaps +89 -0
- data/spec/data/velvet_test_trails/Assem/Sequences +50 -0
- data/spec/data/velvet_test_trails/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails/Assem/contigs.fa +15 -0
- data/spec/data/velvet_test_trails/Assem/stats.txt +5 -0
- data/spec/data/velvet_test_trails/node_fwds.fa +8 -0
- data/spec/data/velvet_test_trails/node_seqs.fa +9 -0
- data/spec/data/velvet_test_trails/nodes_fwd_rev.fa +16 -0
- data/spec/data/velvet_test_trails/read1.fa +2 -0
- data/spec/data/velvet_test_trails/reads.fa +50 -0
- data/spec/data/velvet_test_trails_reverse/Assem/LastGraph +17 -0
- data/spec/data/velvet_test_trails_reverse/Assem/a.svg +53 -0
- data/spec/data/velvet_test_trails_reverse/reads_reversed.fa +10 -0
- data/spec/data/visualise/1/LastGraph +6695 -0
- data/spec/data/visualise/2_paired_end/HOWTO_RECREATE.txt +10 -0
- data/spec/data/visualise/2_paired_end/rand1.fa +2 -0
- data/spec/data/visualise/2_paired_end/rand2.fa +2 -0
- data/spec/data/visualise/2_paired_end/with_gaps.fa +8 -0
- data/spec/data/visualise/2_paired_end/with_gaps.read_pairs.fa.gz +0 -0
- data/spec/data/wander/1/random1.fa +2 -0
- data/spec/data/wander/1/random1.sammy.fa +804 -0
- data/spec/depth_first_search_spec.rb +190 -0
- data/spec/dijkstra_spec.rb +143 -0
- data/spec/explore_spec.rb +29 -0
- data/spec/fluffer_spec.rb +155 -0
- data/spec/gapfiller_spec.rb +107 -0
- data/spec/graph_explorer_spec.rb +475 -0
- data/spec/graph_generator_spec.rb +99 -0
- data/spec/height_finder_spec.rb +306 -0
- data/spec/kmer_abundance_pattern_spec.rb +56 -0
- data/spec/kmer_coverage_based_path_filter_spec.rb +73 -0
- data/spec/kmer_profile_finder_spec.rb +38 -0
- data/spec/kmers_count_tabulate_spec.rb +120 -0
- data/spec/oriented_node_trail_spec.rb +221 -0
- data/spec/paired_end_neighbours_spec.rb +126 -0
- data/spec/paths_between_nodes_spec.rb +349 -0
- data/spec/priner_spec.rb +7 -0
- data/spec/read_input_spec.rb +23 -0
- data/spec/read_selection_by_kmer_spec.rb +166 -0
- data/spec/read_to_node_spec.rb +35 -0
- data/spec/roundup_spec.rb +366 -0
- data/spec/scaffold_breaker_spec.rb +144 -0
- data/spec/sequence_spec.rb +43 -0
- data/spec/single_coherent_paths_between_nodes_spec.rb +492 -0
- data/spec/single_coherent_wanderer_spec.rb +120 -0
- data/spec/single_ended_assembler_spec.rb +398 -0
- data/spec/spec_helper.rb +310 -0
- data/spec/velvet_graph_sequence_extractor_spec.rb +80 -0
- data/spec/visualise_spec.rb +105 -0
- data/spec/wander_spec.rb +119 -0
- data/spec/watch_for_changes.sh +16 -0
- data/validation/fasta_compare.rb +72 -0
- data/validation/gapfill_simulate_perfect.rb +108 -0
- metadata +899 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
|
|
3
|
+
|
|
4
|
+
This file is part of Velvet.
|
|
5
|
+
|
|
6
|
+
Velvet is free software; you can redistribute it and/or modify
|
|
7
|
+
it under the terms of the GNU General Public License as published by
|
|
8
|
+
the Free Software Foundation; either version 2 of the License, or
|
|
9
|
+
(at your option) any later version.
|
|
10
|
+
|
|
11
|
+
Velvet is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
GNU General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU General Public License
|
|
17
|
+
along with Velvet; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
19
|
+
|
|
20
|
+
*/
|
|
21
|
+
#ifndef _READCOHERENTGRAPH_H_
|
|
22
|
+
#define _READCOHERENTGRAPH_H_
|
|
23
|
+
|
|
24
|
+
void readCoherentGraph(Graph * graph, boolean(*isUnique) (Node * node),
|
|
25
|
+
double coverage, ReadSet * reads);
|
|
26
|
+
|
|
27
|
+
boolean isUniqueSolexa(Node * node);
|
|
28
|
+
|
|
29
|
+
void setMultiplicityCutoff(int value);
|
|
30
|
+
#endif
|
|
@@ -0,0 +1,1734 @@
|
|
|
1
|
+
/*
|
|
2
|
+
Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
|
|
3
|
+
|
|
4
|
+
This file is part of Velvet.
|
|
5
|
+
|
|
6
|
+
Velvet is free software; you can redistribute it and/or modify
|
|
7
|
+
it under the terms of the GNU General Public License as published by
|
|
8
|
+
the Free Software Foundation; either version 2 of the License, or
|
|
9
|
+
(at your option) any later version.
|
|
10
|
+
|
|
11
|
+
Velvet is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
GNU General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU General Public License
|
|
17
|
+
along with Velvet; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
19
|
+
|
|
20
|
+
*/
|
|
21
|
+
#include <stdlib.h>
|
|
22
|
+
#include <stdio.h>
|
|
23
|
+
#include <string.h>
|
|
24
|
+
#include <math.h>
|
|
25
|
+
#include <time.h>
|
|
26
|
+
#include <limits.h>
|
|
27
|
+
#include <ctype.h>
|
|
28
|
+
|
|
29
|
+
#include "globals.h"
|
|
30
|
+
#include "tightString.h"
|
|
31
|
+
#include "readSet.h"
|
|
32
|
+
#include "utility.h"
|
|
33
|
+
#include "binarySequences.h"
|
|
34
|
+
#include "autoOpen.h"
|
|
35
|
+
#include "kseq.h"
|
|
36
|
+
|
|
37
|
+
#if !defined(BUNDLEDZLIB)
|
|
38
|
+
#include <zlib.h>
|
|
39
|
+
#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
|
|
40
|
+
#include "../third-party/zlib-1.2.3/Win32/include/zlib.h"
|
|
41
|
+
#else
|
|
42
|
+
#include "../third-party/zlib-1.2.3/zlib.h"
|
|
43
|
+
#endif
|
|
44
|
+
|
|
45
|
+
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
|
|
46
|
+
# include <fcntl.h>
|
|
47
|
+
# include <io.h>
|
|
48
|
+
# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
|
|
49
|
+
#else
|
|
50
|
+
# define SET_BINARY_MODE(file)
|
|
51
|
+
#endif
|
|
52
|
+
|
|
53
|
+
// Returns storage for one Mask taken from the writer's recycle bin.
// The bin is created on first use with newRecycleBin(sizeof(Mask), 10000).
static Mask *allocateMask(SequencesWriter *seqWriteInfo)
{
	if (!seqWriteInfo->m_maskMemory) {
		seqWriteInfo->m_maskMemory = newRecycleBin(sizeof(Mask), 10000);
	}

	return (Mask *) allocatePointer(seqWriteInfo->m_maskMemory);
}
|
|
60
|
+
|
|
61
|
+
// Builds an unlinked Mask whose start and finish both equal `position`.
static Mask * newMask(SequencesWriter *seqWriteInfo, Coordinate position)
{
	Mask * fresh = allocateMask(seqWriteInfo);

	fresh->next = NULL;
	fresh->finish = position;
	fresh->start = position;

	return fresh;
}
|
|
69
|
+
|
|
70
|
+
//
|
|
71
|
+
// cmd line args can override the createBinary flag
|
|
72
|
+
// note that createBinary is only used by velveth
|
|
73
|
+
//
|
|
74
|
+
boolean createBinary = false;
|
|
75
|
+
boolean isCreateBinary()
|
|
76
|
+
{
|
|
77
|
+
return createBinary;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Overwrites the createBinary flag with `val`.
void setCreateBinary(boolean val)
{
	createBinary = val;
}
|
|
84
|
+
|
|
85
|
+
ReadSet *newReadSet()
|
|
86
|
+
{
|
|
87
|
+
ReadSet *rs = callocOrExit(1, ReadSet);
|
|
88
|
+
return rs;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
//////////////////////////////////////////////////////////////////////////
|
|
92
|
+
// Reference identifiers
|
|
93
|
+
//////////////////////////////////////////////////////////////////////////
|
|
94
|
+
|
|
95
|
+
typedef struct referenceCoordinate_st ReferenceCoordinate;
|
|
96
|
+
static Coordinate reference_coordinate_double_strand = true;
|
|
97
|
+
|
|
98
|
+
struct referenceCoordinate_st {
|
|
99
|
+
char * name;
|
|
100
|
+
Coordinate start;
|
|
101
|
+
Coordinate finish;
|
|
102
|
+
IDnum referenceID;
|
|
103
|
+
IDnum counter;
|
|
104
|
+
boolean positive_strand;
|
|
105
|
+
} ATTRIBUTE_PACKED;
|
|
106
|
+
|
|
107
|
+
static int compareRefCoords(const void * ptrA, const void * ptrB) {
|
|
108
|
+
ReferenceCoordinate * A = (ReferenceCoordinate *) ptrA;
|
|
109
|
+
ReferenceCoordinate * B = (ReferenceCoordinate *) ptrB;
|
|
110
|
+
int comp = strcmp(A->name, B->name);
|
|
111
|
+
|
|
112
|
+
if (comp != 0)
|
|
113
|
+
return comp;
|
|
114
|
+
else if (!reference_coordinate_double_strand && A->positive_strand != B->positive_strand)
|
|
115
|
+
return A->positive_strand > B->positive_strand;
|
|
116
|
+
else {
|
|
117
|
+
if (A->finish > -1 && A->finish < B->start)
|
|
118
|
+
return -1;
|
|
119
|
+
else if (B->finish > -1 && A->start > B->finish)
|
|
120
|
+
return 1;
|
|
121
|
+
else return 0;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
typedef struct referenceCoordinateTable_st ReferenceCoordinateTable;
|
|
126
|
+
|
|
127
|
+
struct referenceCoordinateTable_st {
|
|
128
|
+
ReferenceCoordinate * array;
|
|
129
|
+
IDnum arrayLength;
|
|
130
|
+
} ATTRIBUTE_PACKED;
|
|
131
|
+
|
|
132
|
+
static ReferenceCoordinateTable * newReferenceCoordinateTable() {
|
|
133
|
+
ReferenceCoordinateTable * table = callocOrExit(1, ReferenceCoordinateTable);
|
|
134
|
+
table->array = NULL;
|
|
135
|
+
table->arrayLength = 0;
|
|
136
|
+
return table;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
// Logs the mapping counter of every table entry, followed by a warning
// when the counters add up to zero.
static void printReferenceCoordinateTableStats(ReferenceCoordinateTable * table) {
	IDnum total = 0;
	IDnum position = 0;

	velvetLog("Reference mapping counters\n");
	velvetLog("Name\tRead mappings\n");

	while (position < table->arrayLength) {
		ReferenceCoordinate * entry = &(table->array[position]);

		velvetLog("%s\t%li\n", entry->name, (long) entry->counter);
		total += entry->counter;
		position++;
	}

	if (total != 0)
		return;

	velvetLog("WARNING: None of your read mappings recognized the reference sequence!\n");
	velvetLog("Double check that the names are identical between reference fasta headers and SAM/BAM sequences.\n");
}
|
|
156
|
+
|
|
157
|
+
// Releases a table. When entry storage exists, the mapping statistics are
// logged first, then every entry name and the storage itself are freed.
static void destroyReferenceCoordinateTable(ReferenceCoordinateTable * table) {
	if (table->array != NULL) {
		IDnum position = 0;

		printReferenceCoordinateTableStats(table);

		while (position < table->arrayLength) {
			free(table->array[position].name);
			position++;
		}

		free(table->array);
	}

	free(table);
}
|
|
168
|
+
|
|
169
|
+
// Makes room for `extraLength` more entries. An empty table gets fresh
// zeroed storage; otherwise the existing storage is grown to
// arrayLength + extraLength entries. arrayLength itself is left untouched.
static void resizeReferenceCoordinateTable(ReferenceCoordinateTable * table, IDnum extraLength) {
	if (table->array != NULL) {
		table->array = reallocOrExit(table->array, table->arrayLength + extraLength, ReferenceCoordinate);
		return;
	}

	table->array = callocOrExit(extraLength, ReferenceCoordinate);
}
|
|
175
|
+
|
|
176
|
+
// Binary search for a table entry that compareRefCoords() reports as equal
// to the interval described by the arguments (same name and overlapping
// coordinates, plus matching strand in strand-aware mode).
//
// The search assumes table->array is ordered according to
// compareRefCoords(). Returns a pointer into the table, or NULL when no
// entry matches (including when the table is empty).
static ReferenceCoordinate * findReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
	ReferenceCoordinate * array = table->array;
	ReferenceCoordinate refCoord;
	Coordinate leftIndex = 0;
	Coordinate rightIndex = table->arrayLength - 1;

	refCoord.name = name;
	refCoord.start = start;
	refCoord.finish = finish;
	refCoord.referenceID = 0;
	refCoord.counter = 0;
	refCoord.positive_strand = positive_strand;

	// Closed interval [leftIndex, rightIndex]; both bounds move past the
	// probed element, so every entry (the last one included) can be reached.
	while (leftIndex <= rightIndex) {
		Coordinate middleIndex = leftIndex + (rightIndex - leftIndex) / 2;
		int comp = compareRefCoords(&(array[middleIndex]), &refCoord);

		if (comp == 0)
			return &(array[middleIndex]);
		else if (comp > 0)
			rightIndex = middleIndex - 1;
		else
			leftIndex = middleIndex + 1;
	}

	return NULL;
}
|
|
204
|
+
|
|
205
|
+
// Appends a new entry to the table. If findReferenceCoordinate() reports an
// existing entry matching the same interval, both intervals are logged and
// the program terminates. The caller must have reserved room beforehand
// (no capacity check is made here). `name` is stored as-is, not copied.
static void addReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
	ReferenceCoordinate * clash = findReferenceCoordinate(table, name, start, finish, positive_strand);
	ReferenceCoordinate * slot;

	if (clash != NULL) {
		velvetLog("Overlapping reference coordinates:\n");
		velvetLog("%s:%lli-%lli\n", name, (long long) start, (long long) finish);
		velvetLog("%s:%lli-%lli\n", clash->name, (long long) clash->start, (long long) clash->finish);
		velvetLog("Exiting...\n");
#ifdef DEBUG
		abort();
#endif
		exit(1);
	}

	slot = &(table->array[table->arrayLength]);
	table->arrayLength += 1;

	// The ID is the table length after insertion.
	slot->referenceID = table->arrayLength;
	slot->counter = 0;
	slot->positive_strand = positive_strand;
	slot->finish = finish;
	slot->start = start;
	slot->name = name;
}
|
|
228
|
+
|
|
229
|
+
// Orders the table entries in place with qsort() and compareRefCoords().
static void sortReferenceCoordinateTable(ReferenceCoordinateTable * table) {
	const size_t entrySize = sizeof(ReferenceCoordinate);

	qsort(table->array, table->arrayLength, entrySize, compareRefCoords);
}
|
|
232
|
+
|
|
233
|
+
//////////////////////////////////////////////////////////////////////////
|
|
234
|
+
// File reading
|
|
235
|
+
//////////////////////////////////////////////////////////////////////////
|
|
236
|
+
|
|
237
|
+
// Normalises a sequence string in place: A/C/G/T are upper-cased, line
// terminators ('\n', '\r') and EOF become '\0', everything else becomes 'N'.
// The scan covers the length measured on entry, so characters located after
// an embedded terminator are still rewritten.
//
// When seqWriteInfo->m_referenceMask is non-NULL, runs of 'N' are also
// recorded as Mask entries (start/finish taken from m_position) and
// m_position advances once per non-terminator character.
static void velvetifySequence(char * str, SequencesWriter *seqWriteInfo) {
	size_t length = strlen(str);
	int i;

	for (i = 0; i < length; i++) {
		const char c = str[i];

		if (c == '\n' || c == '\r' || c == EOF)
			str[i] = '\0';
		else if (c == 'A' || c == 'a')
			str[i] = 'A';
		else if (c == 'C' || c == 'c')
			str[i] = 'C';
		else if (c == 'G' || c == 'g')
			str[i] = 'G';
		else if (c == 'T' || c == 't')
			str[i] = 'T';
		else
			str[i] = 'N';

		// A NULL reference mask means no masks are being built.
		if (seqWriteInfo->m_referenceMask == NULL)
			continue;

		// Terminators neither advance the position nor close a mask.
		if (str[i] == '\0')
			continue;

		if (str[i] != 'N') {
			seqWriteInfo->m_openMask = false;
			seqWriteInfo->m_position += 1;
			continue;
		}

		if (seqWriteInfo->m_openMask) {
			// Still inside the current run of N: stretch it.
			seqWriteInfo->m_current->finish++;
		} else {
			// First N of a run: start a new mask, as list head or tail.
			Mask * fresh = newMask(seqWriteInfo, seqWriteInfo->m_position);

			if (*(seqWriteInfo->m_referenceMask) == NULL)
				*(seqWriteInfo->m_referenceMask) = fresh;
			else
				seqWriteInfo->m_current->next = fresh;

			seqWriteInfo->m_current = fresh;
		}

		seqWriteInfo->m_openMask = true;
		seqWriteInfo->m_position += 1;
	}
}
|
|
290
|
+
|
|
291
|
+
// Reverse-complements a nucleotide string in place.
//
// The characters are first reversed; then, unless COLOR is defined, each
// one is complemented: A/a -> T, C/c -> G, G/g -> C, and every other
// character (which includes T/t and N) becomes 'A'.
// An empty string is left untouched.
static void reverseComplementSequence(char * str)
{
	size_t length = strlen(str);
	size_t left, right;
#ifndef COLOR
	size_t i;
#endif

	// Guard first: with length == 0, "length - 1" would wrap around as
	// size_t and the swap loop would run far outside the buffer.
	if (length == 0)
		return;

	for (left = 0, right = length - 1; left < right; left++, right--) {
		char c = str[left];
		str[left] = str[right];
		str[right] = c;
	}

#ifndef COLOR
	for (i = 0; i < length; i++) {
		switch (str[i]) {
		case 'A':
		case 'a':
			str[i] = 'T';
			break;
		case 'C':
		case 'c':
			str[i] = 'G';
			break;
		case 'G':
		case 'g':
			str[i] = 'C';
			break;
		// 'T' and anything unusual (e.g. 'N') end up as 'A'
		default:
			str[i] = 'A';
			break;
		}
	}
#endif
}
|
|
325
|
+
|
|
326
|
+
// Writes `str` to `outfile` in lines of at most 60 characters.
// Nothing is written for an empty string.
static void writeFastaSequence(FILE * outfile, const char * str)
{
	const char * cursor = str;
	size_t remaining = strlen(str);

	while (remaining > 0) {
		size_t step = (remaining < 60) ? remaining : 60;

		velvetFprintf(outfile, "%.60s\n", cursor);
		cursor += step;
		remaining -= step;
	}
}
|
|
333
|
+
|
|
334
|
+
// Builds rs->tSequences from rs->sequences (rs->readCount entries) with
// newTightStringArrayFromStringArray(), then clears the rs->sequences
// pointer. The plain-string array is not freed in this function.
void convertSequences(ReadSet * rs)
{
	rs->tSequences = newTightStringArrayFromStringArray(rs->sequences, rs->readCount, &rs->tSeqMem);
	rs->sequences = NULL;
}
|
|
341
|
+
|
|
342
|
+
// Returns the value of a 32-bit little-endian-stored integer.
// `ptr` must point to at least 4 readable bytes; ptr[0] is the least
// significant byte.
static int int32(const unsigned char * ptr)
{
	// Assemble in unsigned arithmetic: left-shifting a signed int into the
	// sign bit (any value whose top byte is >= 0x80) is undefined behaviour.
	unsigned int x = ptr[3];
	x = (x << 8) | ptr[2];
	x = (x << 8) | ptr[1];
	x = (x << 8) | ptr[0];
	return (int) x;
}
|
|
351
|
+
|
|
352
|
+
// Skips the remainder of the current input line.
//
// `line` holds what was already read from `file`. If it ends with '\n' the
// line is complete and nothing is consumed; otherwise characters are read
// from `file` up to and including the next '\n', or until end of file.
void goToEndOfLine(char *line, FILE * file)
{
	size_t length = strlen(line);
	int c;	// int, not char: fgetc() reports end of file as EOF

	// An empty buffer has no last character to inspect.
	if (length > 0 && line[length - 1] == '\n')
		return;

	do {
		c = fgetc(file);
	} while (c != '\n' && c != EOF);
}
|
|
360
|
+
|
|
361
|
+
// Emits the header of one sequence and post-increments *sequenceIndex.
//
// Binary mode: starts a new record and passes ">" + seq_name to
// cnySeqInsertSequenceName(). Text mode: writes
// ">name<TAB>index<TAB>category" to seqWriteInfo->m_pFile.
static void writeSeqName(char*seq_name, SequencesWriter *seqWriteInfo, Category cat, IDnum *sequenceIndex)
{
	char name[5001];

	if (isCreateBinary()) {
		cnySeqInsertStart(seqWriteInfo);
		// Bounded copy: names too long for the buffer are truncated
		// instead of overrunning the stack.
		snprintf(name, sizeof(name), ">%s", seq_name);
		cnySeqInsertSequenceName(name, (long) ((*sequenceIndex)++), seqWriteInfo, cat);
	} else {
		velvetFprintf(seqWriteInfo->m_pFile,">%s\t%ld\t%d\n", seq_name, (long) ((*sequenceIndex)++), (int) cat);
	}
}
|
|
372
|
+
|
|
373
|
+
// Normalises `seq` in place with velvetifySequence() and writes it out.
//
// Binary mode: the nucleotides are inserted and the record is closed.
// Text mode: the sequence is written in lines of up to 60 characters.
// NOTE(review): the loop bound is inclusive, so a sequence whose length is
// a multiple of 60 (0 included) is followed by one empty line; this mirrors
// the existing output exactly.
static void writeSequence(char*seq, SequencesWriter *seqWriteInfo)
{
	size_t length;
	size_t offset;

	velvetifySequence(seq, seqWriteInfo);

	if (isCreateBinary()) {
		cnySeqInsertNucleotideString(seq, seqWriteInfo);
		cnySeqInsertEnd(seqWriteInfo);
		return;
	}

	length = strlen(seq);
	for (offset = 0; offset <= length; offset += 60)
		velvetFprintf(seqWriteInfo->m_pFile, "%.60s\n", seq + offset);
}
|
|
390
|
+
|
|
391
|
+
// Resets the writer's mask-tracking state before a file is read.
// In binary mode it also allocates the reference-mask head pointer (for
// REFERENCE input only) and calls inputCnySeqFileStart() for `cat`.
static void initFastX(SequencesWriter *seqWriteInfo, Category cat)
{
	seqWriteInfo->m_openMask = false;
	seqWriteInfo->m_position = 0;
	seqWriteInfo->m_referenceMask = NULL;

	// Everything below only concerns the binary file format.
	if (!isCreateBinary())
		return;

	if (cat == REFERENCE)
		seqWriteInfo->m_referenceMask = callocOrExit(1, Mask*);

	inputCnySeqFileStart(cat, seqWriteInfo);
}
|
|
405
|
+
|
|
406
|
+
// Frees the reference-mask head pointer, if one was allocated, and resets
// it to NULL. `cat` is not used.
static void cleanupFastX(SequencesWriter *seqWriteInfo, Category cat)
{
	if (seqWriteInfo->m_referenceMask == NULL)
		return;

	free(seqWriteInfo->m_referenceMask);
	seqWriteInfo->m_referenceMask = NULL;
}
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
// Imports sequences from a raw sequence file
|
|
416
|
+
// Memory space allocated within this function.
|
|
417
|
+
static void readRawFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum * sequenceIndex)
|
|
418
|
+
{
|
|
419
|
+
FILE *file;
|
|
420
|
+
const int maxline = 5000;
|
|
421
|
+
char line[5000];
|
|
422
|
+
IDnum counter = 0;
|
|
423
|
+
|
|
424
|
+
initFastX(seqWriteInfo, cat);
|
|
425
|
+
|
|
426
|
+
if (strcmp(filename, "-"))
|
|
427
|
+
file = fopen(filename, "r");
|
|
428
|
+
else
|
|
429
|
+
file = stdin;
|
|
430
|
+
|
|
431
|
+
if (file != NULL)
|
|
432
|
+
velvetLog("Reading raw file %s\n", filename);
|
|
433
|
+
else
|
|
434
|
+
exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
|
|
435
|
+
|
|
436
|
+
while(fgets(line, maxline, file)) {
|
|
437
|
+
if (strlen(line) >= maxline - 1) {
|
|
438
|
+
velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
|
|
439
|
+
#ifdef DEBUG
|
|
440
|
+
abort();
|
|
441
|
+
#endif
|
|
442
|
+
exit(1);
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
writeSeqName("RAW", seqWriteInfo, cat, sequenceIndex);
|
|
446
|
+
writeSequence(line, seqWriteInfo);
|
|
447
|
+
counter++;
|
|
448
|
+
}
|
|
449
|
+
fclose(file);
|
|
450
|
+
cleanupFastX(seqWriteInfo, cat);
|
|
451
|
+
velvetLog("%li reads found.\n", (long) counter);
|
|
452
|
+
velvetLog("Done\n");
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// Imports sequences from a zipped raw file
|
|
456
|
+
// Memory space allocated within this function.
|
|
457
|
+
static void readRawGZFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex)
|
|
458
|
+
{
|
|
459
|
+
gzFile file;
|
|
460
|
+
const int maxline = 5000;
|
|
461
|
+
char line[5000];
|
|
462
|
+
IDnum counter = 0;
|
|
463
|
+
|
|
464
|
+
initFastX(seqWriteInfo, cat);
|
|
465
|
+
if (strcmp(filename, "-"))
|
|
466
|
+
file = gzopen(filename, "rb");
|
|
467
|
+
else {
|
|
468
|
+
file = gzdopen(fileno(stdin), "rb");
|
|
469
|
+
SET_BINARY_MODE(stdin);
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
if (file != NULL)
|
|
473
|
+
velvetLog("Reading zipped raw sequence file %s\n", filename);
|
|
474
|
+
else
|
|
475
|
+
exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
|
|
476
|
+
|
|
477
|
+
while(gzgets(file, line, maxline)) {
|
|
478
|
+
if (strlen(line) >= maxline - 1) {
|
|
479
|
+
velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
|
|
480
|
+
#ifdef DEBUG
|
|
481
|
+
abort();
|
|
482
|
+
#endif
|
|
483
|
+
exit(1);
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
writeSeqName("RAW", seqWriteInfo, cat, sequenceIndex);
|
|
487
|
+
writeSequence(line, seqWriteInfo);
|
|
488
|
+
counter++;
|
|
489
|
+
}
|
|
490
|
+
gzclose(file);
|
|
491
|
+
cleanupFastX(seqWriteInfo, cat);
|
|
492
|
+
velvetLog("%li reads found.\n", (long) counter);
|
|
493
|
+
velvetLog("Done\n");
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
static void fillReferenceCoordinateTable(char *filename, ReferenceCoordinateTable * refCoords, IDnum counter)
|
|
497
|
+
{
|
|
498
|
+
FILE *file;
|
|
499
|
+
const int maxline = 5000;
|
|
500
|
+
char line[5000];
|
|
501
|
+
char * name;
|
|
502
|
+
long long start, finish;
|
|
503
|
+
Coordinate i;
|
|
504
|
+
IDnum index = 0;
|
|
505
|
+
|
|
506
|
+
if (strcmp(filename, "-") == 0)
|
|
507
|
+
exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence from stdin");
|
|
508
|
+
else
|
|
509
|
+
file = fopen(filename, "r");
|
|
510
|
+
|
|
511
|
+
if (counter == 0)
|
|
512
|
+
return;
|
|
513
|
+
|
|
514
|
+
resizeReferenceCoordinateTable(refCoords,counter);
|
|
515
|
+
|
|
516
|
+
while (fgets(line, maxline, file) && index < counter) {
|
|
517
|
+
if (line[0] == '>') {
|
|
518
|
+
name = callocOrExit(strlen(line), char);
|
|
519
|
+
|
|
520
|
+
if (strchr(line, ':')) {
|
|
521
|
+
sscanf(strtok(line, ":-\r\n\t "), ">%s", name);
|
|
522
|
+
sscanf(strtok(NULL, ":-\r\n\t "), "%lli", &start);
|
|
523
|
+
sscanf(strtok(NULL, ":-\r\n\t "), "%lli", &finish);
|
|
524
|
+
if (start <= finish)
|
|
525
|
+
addReferenceCoordinate(refCoords, name, start, finish, true);
|
|
526
|
+
else
|
|
527
|
+
addReferenceCoordinate(refCoords, name, finish, start, false);
|
|
528
|
+
} else {
|
|
529
|
+
// Chomping EOL characters and comments
|
|
530
|
+
for (i=strlen(line) - 1; i >= 0; i--)
|
|
531
|
+
if (line[i] == '\n' || line[i] == '\r' || line[i] == ' ' || line[i] == '\t')
|
|
532
|
+
line[i] = '\0';
|
|
533
|
+
|
|
534
|
+
strcpy(name, line + 1);
|
|
535
|
+
addReferenceCoordinate(refCoords, name, 1, -1, true);
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
index++;
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
sortReferenceCoordinateTable(refCoords);
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
#define FASTQ 1
|
|
546
|
+
#define FASTA 2
|
|
547
|
+
#define FASTA_GZ 5
|
|
548
|
+
#define FASTQ_GZ 6
|
|
549
|
+
#define SAM 8
|
|
550
|
+
#define BAM 9
|
|
551
|
+
#define RAW 10
|
|
552
|
+
#define RAW_GZ 11
|
|
553
|
+
#define AUTO 12
|
|
554
|
+
|
|
555
|
+
static gzFile openFastXFile(int fileType, char*filename)
|
|
556
|
+
{
|
|
557
|
+
gzFile file;
|
|
558
|
+
char c;
|
|
559
|
+
|
|
560
|
+
// Choose file or stdin
|
|
561
|
+
if (strcmp(filename, "-")==0) {
|
|
562
|
+
file = gzdopen(fileno(stdin), "rb");
|
|
563
|
+
SET_BINARY_MODE(stdin);
|
|
564
|
+
} else {
|
|
565
|
+
file = gzopen(filename, "rb");
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
// Verify filetype
|
|
569
|
+
c = gzgetc(file);
|
|
570
|
+
switch (fileType) {
|
|
571
|
+
case FASTA:
|
|
572
|
+
case FASTA_GZ:
|
|
573
|
+
if (c != EOF && c!='>')
|
|
574
|
+
exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
|
|
575
|
+
break;
|
|
576
|
+
case FASTQ:
|
|
577
|
+
case FASTQ_GZ:
|
|
578
|
+
if (c != EOF && c!='@')
|
|
579
|
+
exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
|
|
580
|
+
break;
|
|
581
|
+
}
|
|
582
|
+
gzungetc(c, file);
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
if (file != NULL) {
|
|
586
|
+
char *type;
|
|
587
|
+
switch (fileType) {
|
|
588
|
+
case FASTA:
|
|
589
|
+
case FASTA_GZ: type = "FastA"; break;
|
|
590
|
+
case FASTQ:
|
|
591
|
+
case FASTQ_GZ: type = "FastQ"; break;
|
|
592
|
+
default: type = ""; break;
|
|
593
|
+
}
|
|
594
|
+
velvetLog("Reading %s file %s;\n", type, filename);
|
|
595
|
+
} else
|
|
596
|
+
exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
|
|
597
|
+
|
|
598
|
+
return file;
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
typedef struct {
|
|
602
|
+
gzFile gzFile;
|
|
603
|
+
AutoFile *autoFile;
|
|
604
|
+
} FileGZOrAuto;
|
|
605
|
+
|
|
606
|
+
// Read callback registered with KSEQ_INIT: fetches up to `size` bytes into
// `ptr` from whichever handle is set, preferring the zlib one, and returns
// what the underlying read call returned.
size_t fileGZOrAuto_read(FileGZOrAuto kseq_file, void *ptr, size_t size)
{
	if (!kseq_file.gzFile)
		return fread(ptr, 1, size, kseq_file.autoFile->file);

	return gzread(kseq_file.gzFile, ptr, size);
}
|
|
613
|
+
|
|
614
|
+
// Closes whichever handle is set: the zlib one with gzclose(), otherwise
// the auto-detected one with closeFileAuto().
void fileGZOrAuto_close(FileGZOrAuto kseq_file)
{
	if (!kseq_file.gzFile) {
		closeFileAuto(kseq_file.autoFile);
		return;
	}

	gzclose(kseq_file.gzFile);
}
|
|
621
|
+
|
|
622
|
+
// Maps the first character of a sequence file to a format name for log
// messages: '>' is "FastA", '@' is "FastQ", anything else is "Unknown".
char const* charToType(char c)
{
	if (c == '>')
		return "FastA";

	if (c == '@')
		return "FastQ";

	return "Unknown";
}
|
|
630
|
+
|
|
631
|
+
// Define mode to use kseq in
|
|
632
|
+
KSEQ_INIT(FileGZOrAuto, fileGZOrAuto_read)
|
|
633
|
+
|
|
634
|
+
// Read in FastA or FastQ files in compressed or gz format
|
|
635
|
+
static void readFastXFile(int fileType, SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum * sequenceIndex, ReferenceCoordinateTable * refCoords)
|
|
636
|
+
{
|
|
637
|
+
kseq_t *seq;
|
|
638
|
+
FileGZOrAuto file;
|
|
639
|
+
IDnum counter = 0;
|
|
640
|
+
|
|
641
|
+
file.gzFile = file.autoFile = NULL;
|
|
642
|
+
if (fileType == AUTO) {
|
|
643
|
+
file.autoFile = openFileAuto(filename);
|
|
644
|
+
if (!file.autoFile)
|
|
645
|
+
exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename);
|
|
646
|
+
velvetLog("Reading file '%s' using '%s' as %s\n", filename, file.autoFile->decompressor, charToType(file.autoFile->first_char));
|
|
647
|
+
} else
|
|
648
|
+
file.gzFile = openFastXFile(fileType, filename);
|
|
649
|
+
|
|
650
|
+
initFastX(seqWriteInfo, cat);
|
|
651
|
+
// Read a sequence at a time
|
|
652
|
+
seq = kseq_init(file);
|
|
653
|
+
while (kseq_read(seq) >= 0) {
|
|
654
|
+
counter++;
|
|
655
|
+
writeSeqName(seq->name.s, seqWriteInfo, cat, sequenceIndex);
|
|
656
|
+
writeSequence(seq->seq.s, seqWriteInfo);
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
kseq_destroy(seq);
|
|
660
|
+
fileGZOrAuto_close(file);
|
|
661
|
+
|
|
662
|
+
if (cat == REFERENCE) {
|
|
663
|
+
fillReferenceCoordinateTable(filename, refCoords, counter);
|
|
664
|
+
}
|
|
665
|
+
cleanupFastX(seqWriteInfo, cat);
|
|
666
|
+
|
|
667
|
+
velvetLog("%li sequences found\n", (long) counter);
|
|
668
|
+
velvetLog("Done\n");
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
// Reads two FastA/FastQ files in lock-step and writes their records
// interleaved (one from filename1, then one from filename2).
//
// The program exits if `cat` is REFERENCE, if either file cannot be opened
// in AUTO mode, or if the second file holds fewer or more records than the
// first. The logged total counts the records of both files.
static void readFastXPair(int fileType, SequencesWriter *seqWriteInfo, char *filename1, char *filename2, Category cat, IDnum * sequenceIndex)
{
	kseq_t *seq1, *seq2;
	FileGZOrAuto file1, file2;
	IDnum counter = 0;

	if (cat==REFERENCE)
		exitErrorf(EXIT_FAILURE, false, "Cannot read reference sequence in 'separate' read mode");

	// Cleared separately: the two members have unrelated pointer types,
	// so a chained assignment would mix incompatible pointers.
	file1.gzFile = NULL;
	file1.autoFile = NULL;
	file2.gzFile = NULL;
	file2.autoFile = NULL;

	if (fileType == AUTO) {
		file1.autoFile = openFileAuto(filename1);
		if (!file1.autoFile)
			exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename1);
		velvetLog("Reading file '%s' using '%s' as %s\n", filename1, file1.autoFile->decompressor, charToType(file1.autoFile->first_char));
		file2.autoFile = openFileAuto(filename2);
		if (!file2.autoFile)
			exitErrorf(EXIT_FAILURE, false, "Unable to open file '%s' in auto mode", filename2);
		velvetLog("Reading file '%s' using '%s' as %s\n", filename2, file2.autoFile->decompressor, charToType(file2.autoFile->first_char));
	} else {
		file1.gzFile = openFastXFile(fileType, filename1);
		file2.gzFile = openFastXFile(fileType, filename2);
	}
	initFastX(seqWriteInfo, cat);

	// Read a sequence at a time
	seq1 = kseq_init(file1);
	seq2 = kseq_init(file2);
	while (kseq_read(seq1) >= 0) {
		counter++;
		writeSeqName(seq1->name.s, seqWriteInfo, cat, sequenceIndex);
		writeSequence(seq1->seq.s, seqWriteInfo);

		if (kseq_read(seq2) < 0)
			exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too few sequences", filename2);

		counter++;
		writeSeqName(seq2->name.s, seqWriteInfo, cat, sequenceIndex);
		writeSequence(seq2->seq.s, seqWriteInfo);
	}
	// The left file is exhausted; the right one must be too.
	if (kseq_read(seq2) >= 0)
		exitErrorf(EXIT_FAILURE, false, "Right sequence file '%s' has too many sequences", filename2);

	kseq_destroy(seq1);
	kseq_destroy(seq2);

	fileGZOrAuto_close(file1);
	fileGZOrAuto_close(file2);

	cleanupFastX(seqWriteInfo, cat);

	velvetLog("%li sequences found in total in the paired sequence files\n", (long) counter);
	velvetLog("Done\n");
}
|
|
726
|
+
|
|
727
|
+
// Records one mapping of the current read onto `refCoord` and increments
// that entry's counter.
//
// The signed reference ID is orientation * referenceID on a positive-strand
// entry and -orientation * referenceID otherwise; the offset is
// pos - start, respectively finish - pos - strlen(seq).
// Binary mode: a RefInfoList element is appended to the writer's list
// (*refTail is updated to the new tail). Text mode: a line
// "M<TAB>id<TAB>offset" is appended to `buffer`.
static void addMapping(boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
	long referenceID;
	long long offset;

	if (refCoord->positive_strand) {
		referenceID = (long) orientation * refCoord->referenceID;
		offset = (long long) (pos - refCoord->start);
	} else {
		referenceID = (long) -orientation * refCoord->referenceID;
		// Cast keeps the subtraction in signed arithmetic.
		offset = (long long) (refCoord->finish - pos - (Coordinate) strlen(seq));
	}

	if (isCreateBinary()) {
		RefInfoList *refElem = callocOrExit(1, RefInfoList);

		seqWriteInfo->m_bIsRef = true;
		refElem->m_elem.m_referenceID = referenceID;
		refElem->m_elem.m_pos = offset;
		refElem->next = NULL;
		if (seqWriteInfo->m_refInfoHead == NULL) {
			seqWriteInfo->m_refInfoHead = refElem;
		} else {
			(*refTail)->next = refElem;
		}
		*refTail = refElem;
		seqWriteInfo->m_refCnt++;
	} else {
		// Append after the existing text: passing `buffer` as both the
		// destination and a "%s" argument of snprintf() is undefined.
		size_t used = strlen(buffer);

		snprintf(buffer + used, *buffer_size - used, "M\t%li\t%lli\n", referenceID, offset);

		if (*buffer_size - strlen(buffer) < 100) {
			*buffer_size += 1000;
			// NOTE(review): `buffer` is passed by value, so if the
			// reallocation moves the block the caller keeps the old
			// pointer. Fixing this needs a char ** parameter.
			buffer = reallocOrExit(buffer, *buffer_size, char);
		}
	}

	// Increment counter
	refCoord->counter++;
}
|
|
761
|
+
|
|
762
|
+
// Writes out the previously buffered read together with its mappings and
// post-increments *sequenceIndex.
//
// Binary mode: the record is inserted under the name
// ">" + previous_qname + previous_qname_pairing; inputCnySeqFileStart() is
// called first when prev_cat differs from cat. Text mode: header line,
// sequence in FastA layout, then the accumulated mapping lines from
// `buffer`, which is emptied afterwards.
static void writeMappedSequence(IDnum * sequenceIndex, Category cat, Category prev_cat, char * previous_seq, char * previous_qname, char * previous_qname_pairing, char * buffer, SequencesWriter * seqWriteInfo) {
	char print_qname[5000];

	if (isCreateBinary()) {
		// prev_cat is a by-value parameter: updating it here could not
		// reach the caller, so only the comparison is kept.
		if (prev_cat != cat)
			inputCnySeqFileStart(cat, seqWriteInfo);

		cnySeqInsertStart(seqWriteInfo);
		cnySeqInsertNucleotideString(previous_seq, seqWriteInfo);
		// Bounded: name plus pairing suffix plus '>' may exceed the buffer.
		snprintf(print_qname, sizeof(print_qname), ">%s%s", previous_qname, previous_qname_pairing);
		cnySeqInsertSequenceName(print_qname, (long) ((*sequenceIndex)++), seqWriteInfo, cat);
		cnySeqInsertEnd(seqWriteInfo);
	} else {
		velvetFprintf(seqWriteInfo->m_pFile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
			      (long) ((*sequenceIndex)++), (int) cat);
		writeFastaSequence(seqWriteInfo->m_pFile, previous_seq);
		velvetFprintf(seqWriteInfo->m_pFile, "%s", buffer);
		strcpy(buffer, "");
	}
}
|
|
782
|
+
|
|
783
|
+
// Walks a SAM CIGAR string and records mappings for its aligned segments.
//
// Digits accumulate into an operation length. M/=/X record a mapping at the
// current `pos` (only while pos lies before refCoord->finish, or finish is
// negative); S/I move pos back by the length; D/N move it forward; H/P are
// ignored. A CIGAR of "*" records nothing. Any other character aborts the
// program.
static void readCigar(char * cigar, boolean orientation, Coordinate pos, char * seq, ReferenceCoordinate * refCoord, char * buffer, SequencesWriter * seqWriteInfo, RefInfoList ** refTail, size_t * buffer_size) {
	long long cigar_num = 0;
	size_t cigar_length = strlen(cigar);
	size_t cigar_index;

	if (cigar_length == 1 && cigar[0] == '*')
		return;

	for (cigar_index = 0; cigar_index < cigar_length; cigar_index++) {
		char c = cigar[cigar_index];

		if (c == 'M' || c == '=' || c == 'X') {
			if (refCoord->finish < 0 || pos < refCoord->finish)
				addMapping(orientation, pos, seq, refCoord, buffer, seqWriteInfo, refTail, buffer_size);
			cigar_num = 0;
		} else if (c == 'S' || c == 'I') {
			pos -= cigar_num;
			cigar_num = 0;
		} else if (c == 'D' || c == 'N') {
			pos += cigar_num;
			cigar_num = 0;
		} else if (c == 'H' || c == 'P') {
			cigar_num = 0;
		} else if (isdigit((unsigned char) c)) {
			// Cast required: isdigit() is undefined for negative chars.
			cigar_num = 10 * cigar_num + (c - '0');
		} else {
			abort();
		}
	}
}
|
|
814
|
+
|
|
815
|
+
// Reads the alignment records of a SAM text file (or standard input when
// filename is "-") and writes each read through writeMappedSequence().
//
//   seqWriteInfo   output state; its reference mask fields are reset here
//   filename       path of the SAM file, or "-" for stdin
//   cat            category requested on the command line; REFERENCE is
//                  rejected with exit(1)
//   sequenceIndex  running sequence counter, forwarded to writeMappedSequence()
//   refCoords      table searched for a reference region covering each
//                  mapped read
//
// A read is written only once the NEXT record is seen (or at end of file),
// because whether it counts as paired depends on whether the following record
// has the same name. For an odd (paired) category, a read whose neighbours
// carry different names is written with category cat - 1.
//
// Header lines ('@') are skipped. Records with too few fields or with SEQ "*"
// are reported on stderr and ignored.
static void readSAMFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
{
	char line[5000];
	unsigned long lineno;
	IDnum readCount = 0;
	char previous_qname_pairing[10] = "";
	char previous_qname[5000];
	char previous_seq[5000];
	boolean previous_paired = false;
	Category prev_cat = cat;
	Category apparentCat;
	ReferenceCoordinate * refCoord;
	RefInfoList *refTail = NULL;
	size_t buffer_size = 5000;
	char * buffer;
	FILE *file;

	seqWriteInfo->m_referenceMask = NULL; // no ref masks for SAM/BAM
	seqWriteInfo->m_position = 0;
	seqWriteInfo->m_openMask = false;

	if (cat == REFERENCE) {
		velvetLog("SAM file %s cannot contain reference sequences.\n", filename);
		velvetLog("Please check the command line.\n");
#ifdef DEBUG
		abort();
#endif
		exit(1);
	}

	file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
	if (file)
		velvetLog("Reading SAM file %s\n", filename);
	else
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);

	buffer = callocOrExit(buffer_size, char);

	if (isCreateBinary()) {
		inputCnySeqFileStart(cat, seqWriteInfo);
	}

	previous_qname[0] = '\0';
	previous_seq[0] = '\0';

	for (lineno = 1; fgets(line, sizeof(line), file); lineno++) {
		char *qname, *flag, *rname, *posField, *cigar, *seq;
		const char *qname_pairing = "";
		long long pos = 0;
		int orientation = 1;
		int flagbits;
		boolean same_name;
		int i;

		// Header line
		if (line[0] == '@')
			continue;

		// Once strtok() runs out of fields it keeps returning NULL, so only
		// the last token needs checking before any of them is used.
		qname = strtok(line, "\t");
		flag = strtok(NULL, "\t");
		rname = strtok(NULL, "\t");
		posField = strtok(NULL, "\t");

		// Mapping score
		(void) strtok(NULL, "\t");
		cigar = strtok(NULL, "\t");

		// Columns 7,8,9 are paired name, position and score
		for (i = 7; i < 10; i++)
			(void) strtok(NULL, "\t");
		seq = strtok(NULL, "\t");

		if (seq == NULL) {
			velvetFprintf(stderr,
				      "Line #%lu: ignoring SAM record with too few fields\n",
				      lineno);
			continue;
		}

		if (strcmp(seq, "*") == 0) {
			velvetFprintf(stderr,
				      "Line #%lu: ignoring SAM record with omitted SEQ field\n",
				      lineno);
			continue;
		}

		// An unparsable position falls back to 0 rather than staying
		// uninitialised
		if (sscanf(posField, "%lli", &pos) != 1)
			pos = 0;

		// Accept flags represented in either decimal or hex:
		flagbits = strtol(flag, NULL, 0);

		// 0x4: read is unmapped, so its reference name is meaningless
		if (flagbits & 0x4)
			rname[0] = '\0';

		// 0x40 / 0x80: first / second read of a pair
		if (flagbits & 0x40)
			qname_pairing = "/1";
		else if (flagbits & 0x80)
			qname_pairing = "/2";

		// 0x10: read is stored reverse-complemented
		if (flagbits & 0x10) {
			orientation = -1;
			reverseComplementSequence(seq);
		}

		// Determine if paired to previous read
		same_name = (strcmp(qname, previous_qname) == 0);
		if (readCount && (!same_name || strcmp(qname_pairing, previous_qname_pairing) != 0)) {
			if (cat % 2 && !same_name && !previous_paired)
				apparentCat = cat - 1;
			else
				apparentCat = cat;

			previous_paired = (cat % 2 && same_name);

			writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, buffer, seqWriteInfo);
			prev_cat = apparentCat;
		}

		if (!(flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, rname, (Coordinate) pos, (Coordinate) pos + strlen(seq) - 1, orientation))) {
			readCigar(cigar, orientation, pos, seq, refCoord, buffer, seqWriteInfo, &refTail, &buffer_size);
		}

		// qname and seq are substrings of line, so they fit the
		// equally-sized "previous" buffers
		strcpy(previous_qname, qname);
		strcpy(previous_qname_pairing, qname_pairing);
		strcpy(previous_seq, seq);
		velvetifySequence(previous_seq, seqWriteInfo);

		readCount++;
	}

	// Flush the last pending read
	if (readCount) {
		if (cat % 2 && !previous_paired)
			apparentCat = cat - 1;
		else
			apparentCat = cat;
		writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, buffer, seqWriteInfo);
	}

	free(buffer);
	fclose(file);
	velvetLog("%lu reads found.\n", (unsigned long) readCount);
	velvetLog("Done\n");
}
|
|
944
|
+
|
|
945
|
+
// Pulls the next four bytes from the compressed stream and decodes them with
// int32(). Terminates the program if fewer than four bytes are available.
static int readBAMint32(gzFile file)
{
	unsigned char raw[4];
	const int bytesRead = gzread(file, raw, 4);

	if (bytesRead != 4)
		exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");

	return int32(raw);
}
|
|
953
|
+
|
|
954
|
+
// Reads the alignment records of a BAM file (or standard input when filename
// is "-") and writes each read through writeMappedSequence().
//
//   seqWriteInfo   output state; its reference mask fields are reset here
//   filename       path of the BAM file, or "-" for stdin
//   cat            category requested on the command line; REFERENCE is
//                  rejected with exit(1)
//   sequenceIndex  running sequence counter, forwarded to writeMappedSequence()
//   refCoords      table searched for a reference region covering each
//                  mapped read
//
// As in readSAMFile(), a read is written only once the next record (or end of
// file) shows whether it is paired with its neighbour.
//
// Every length taken from the file is validated before it is used to index
// the record buffer. The CIGAR text and the remembered previous sequence are
// kept in buffers that grow on demand, so neither is limited to 5000 bytes.
// A malformed file ends the program with an error message.
static void readBAMFile(SequencesWriter *seqWriteInfo, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
{
	// Letters for the packed CIGAR operation codes and 4-bit base codes
	static const char decode_ops[] = "MIDNSHP=X";
	static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
	size_t seqCapacity = 0;
	char *seq = NULL;
	size_t cigarCapacity = 0;
	char *cigar = NULL;
	size_t previousSeqCapacity = 0;
	char *previous_seq = NULL;
	size_t bufferCapacity = 4;
	unsigned char *buffer;
	unsigned long recno, readCount;
	int i, refCount;
	gzFile file;
	char previous_qname_pairing[10] = "";
	char previous_qname[5000];
	boolean previous_paired = false;
	Category prev_cat = cat;
	Category apparentCat;
	char ** refNames;
	ReferenceCoordinate * refCoord;
	RefInfoList *refTail = NULL;
	size_t mapBuffer_size = 1000;
	char * mapBuffer;

	seqWriteInfo->m_referenceMask = NULL; // no ref masks for SAM/BAM
	seqWriteInfo->m_position = 0;
	seqWriteInfo->m_openMask = false;

	if (cat == REFERENCE) {
		velvetLog("BAM file %s cannot contain reference sequences.\n", filename);
		velvetLog("Please check the command line.\n");
#ifdef DEBUG
		abort();
#endif
		exit(1);
	}

	buffer = mallocOrExit(bufferCapacity, unsigned char);
	mapBuffer = callocOrExit(mapBuffer_size, char);

	if (strcmp(filename, "-") != 0)
		file = gzopen(filename, "rb");
	else {
		file = gzdopen(fileno(stdin), "rb");
		SET_BINARY_MODE(stdin);
	}

	if (file != NULL)
		velvetLog("Reading BAM file %s\n", filename);
	else
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);

	if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
		exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);

	// Skip header text
	if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
		exitErrorf(EXIT_FAILURE, false, "gzseek failed");

	// Read the reference names from the header reference list
	refCount = readBAMint32(file);
	if (refCount < 0)
		exitErrorf(EXIT_FAILURE, false, "BAM file header corrupted");
	refNames = callocOrExit(refCount, char *);
	for (i = 0; i < refCount; i++) {
		int strLength;

		if (gzread(file, buffer, 4) != 4)
			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");

		// The stored name length includes the terminator, so it is >= 1
		strLength = int32(buffer);
		if (strLength < 1 || strLength > INT_MAX - 4)
			exitErrorf(EXIT_FAILURE, false, "BAM file header corrupted");

		if (bufferCapacity < (size_t) strLength + 4) {
			bufferCapacity = (size_t) strLength + 4 + 4096;
			buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
		}

		// The name is followed by a 4-byte field that is read and discarded
		if (gzread(file, buffer, 4 + strLength) != 4 + strLength)
			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");

		// The allocation is zero-filled, so copying one byte less than its
		// size keeps the name terminated even if the file's copy is not
		refNames[i] = callocOrExit(strLength, char);
		memcpy(refNames[i], buffer, (size_t) strLength - 1);
	}

	if (isCreateBinary()) {
		inputCnySeqFileStart(cat, seqWriteInfo);
	}

	previous_qname[0] = '\0';
	readCount = 0;
	for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
		int blockSize = int32(buffer);
		int readLength;

		// The fields read below at fixed offsets need at least 32 bytes
		// after the 4-byte block size
		if (blockSize < 32 || blockSize > INT_MAX - 4)
			exitErrorf(EXIT_FAILURE, false, "BAM alignment record corrupted");

		if (bufferCapacity < (size_t) blockSize + 4) {
			bufferCapacity = (size_t) blockSize + 4 + 4096;
			buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
		}

		if (gzread(file, &buffer[4], blockSize) != blockSize)
			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");

		readLength = int32(&buffer[20]);
		if (readLength < 0)
			exitErrorf(EXIT_FAILURE, false, "BAM alignment record corrupted");

		if (readLength == 0) {
			velvetFprintf(stderr,
				      "Record #%lu: ignoring BAM record with omitted SEQ field\n",
				      recno);
		}
		else {
			int readNameLength = buffer[12];
			// Flag bits in the upper 16 bits, CIGAR op count in the lower 16
			uint32_t flag_nc = (uint32_t) int32(&buffer[16]);
			int flagbits = (int) (flag_nc >> 16);
			int cigarLength = (int) (flag_nc & 0xffff);
			char *qname = (char *)&buffer[36];
			const unsigned char *rawcigar = &buffer[36 + readNameLength];
			const unsigned char *rawseq =
			    &buffer[36 + readNameLength + 4 * cigarLength];
			int rID = int32(&buffer[4]);
			// NOTE: BAM file coords are 0-based, not 1-based like SAM files
			long long pos = (long long) int32(&buffer[8]) + 1;
			int orientation = 1;
			const char *qname_pairing = "";
			size_t variableBytes, cigarNeeded, cigarUsed, seqLength;
			boolean same_name;

			// Read name, CIGAR and packed bases must all lie inside the
			// record, and the name must be a terminated string
			variableBytes = 32 + (size_t) readNameLength
			    + 4 * (size_t) cigarLength
			    + ((size_t) readLength + 1) / 2;
			if (readNameLength < 1
			    || variableBytes > (size_t) blockSize
			    || buffer[36 + readNameLength - 1] != '\0')
				exitErrorf(EXIT_FAILURE, false, "BAM alignment record corrupted");

			if (flagbits & 0x40)
				qname_pairing = "/1";
			else if (flagbits & 0x80)
				qname_pairing = "/2";

			// Render the packed CIGAR as text. A 28-bit length prints as at
			// most 9 digits, so 11 bytes per operation is always enough.
			cigarNeeded = (size_t) cigarLength * 11 + 1;
			if (cigarCapacity < cigarNeeded) {
				cigarCapacity = cigarNeeded;
				cigar = reallocOrExit(cigar, cigarCapacity, char);
			}
			cigar[0] = '\0';
			cigarUsed = 0;
			for (i = 0; i < cigarLength; i++) {
				uint32_t packed;
				unsigned int op;

				// memcpy avoids a misaligned load through a cast pointer.
				// NOTE(review): like the original cast, this reads the
				// value in host byte order - confirm for big-endian hosts.
				memcpy(&packed, rawcigar + 4 * (size_t) i, sizeof packed);
				op = packed & 0xf;
				if (op >= sizeof(decode_ops) - 1)
					exitErrorf(EXIT_FAILURE, false, "BAM alignment record corrupted");

				cigarUsed += (size_t) sprintf(cigar + cigarUsed, "%lu%c",
							      (unsigned long) (packed >> 4),
							      decode_ops[op]);
			}

			if (seqCapacity < (size_t) readLength + 1) {
				seqCapacity = (size_t) readLength * 2 + 1;
				seq = reallocOrExit(seq, seqCapacity, char);
			}

			// Two bases per byte, high nibble first. For an odd length
			// the extra slot written here is overwritten by the terminator.
			for (i = 0; i < readLength; i += 2) {
				unsigned int packed = *(rawseq++);
				seq[i] = decode_bases[packed >> 4];
				seq[i+1] = decode_bases[packed & 0xf];
			}
			seq[readLength] = '\0';

			if (flagbits & 0x10) {
				orientation = -1;
				reverseComplementSequence(seq);
			}

			// Determine if paired to previous read
			same_name = (strcmp(qname, previous_qname) == 0);
			if (readCount > 0 && (!same_name || strcmp(qname_pairing, previous_qname_pairing) != 0)) {
				if (cat % 2 && !same_name && !previous_paired)
					apparentCat = cat - 1;
				else
					apparentCat = cat;

				previous_paired = (cat % 2 && same_name);

				writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, mapBuffer, seqWriteInfo);
				prev_cat = apparentCat;
			}

			// Only look up reference names for ids inside the header list
			if (!(flagbits & 0x4) && rID >= 0 && rID < refCount
			    && (refCoord = findReferenceCoordinate(refCoords, refNames[rID], (Coordinate) pos, (Coordinate) pos + strlen(seq) - 1, orientation)))
				readCigar(cigar, orientation, pos, seq, refCoord, mapBuffer, seqWriteInfo, &refTail, &mapBuffer_size);

			// The name is at most 255 bytes; the sequence can be any length
			strcpy(previous_qname, qname);
			strcpy(previous_qname_pairing, qname_pairing);
			seqLength = strlen(seq);
			if (previousSeqCapacity < seqLength + 1) {
				previousSeqCapacity = seqLength * 2 + 1;
				previous_seq = reallocOrExit(previous_seq, previousSeqCapacity, char);
			}
			memcpy(previous_seq, seq, seqLength + 1);
			velvetifySequence(previous_seq, seqWriteInfo);

			readCount++;
		}
	}

	// Flush the last pending read
	if (readCount) {
		if (cat % 2 && !previous_paired)
			apparentCat = cat - 1;
		else
			apparentCat = cat;
		writeMappedSequence(sequenceIndex, apparentCat, prev_cat, previous_seq, previous_qname, previous_qname_pairing, mapBuffer, seqWriteInfo);
	}

	for (i = 0; i < refCount; i++)
		free(refNames[i]);
	free(refNames);
	free(previous_seq);
	free(cigar);
	free(seq);
	free(buffer);
	free(mapBuffer);

	gzclose(file);
	velvetLog("%lu reads found.\n", readCount);
	velvetLog("Done\n");
}
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
static void printUsage()
|
|
1146
|
+
{
|
|
1147
|
+
puts("Usage:");
|
|
1148
|
+
puts("./velveth directory hash_length {[-file_format][-read_type][-separate|-interleaved] filename} [options]");
|
|
1149
|
+
puts("");
|
|
1150
|
+
puts("\tdirectory\t\t: directory name for output files");
|
|
1151
|
+
printf("\thash_length\t\t: odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
|
|
1152
|
+
puts("\tfilename\t\t: path to sequence file or - for standard input");
|
|
1153
|
+
puts("");
|
|
1154
|
+
puts("File format options:");
|
|
1155
|
+
puts("\t-fasta");
|
|
1156
|
+
puts("\t-fastq");
|
|
1157
|
+
puts("\t-raw");
|
|
1158
|
+
puts("\t-fasta.gz");
|
|
1159
|
+
puts("\t-fastq.gz");
|
|
1160
|
+
puts("\t-raw.gz");
|
|
1161
|
+
puts("\t-sam");
|
|
1162
|
+
puts("\t-bam");
|
|
1163
|
+
puts("\t-fmtAuto");
|
|
1164
|
+
puts("");
|
|
1165
|
+
puts("Read type options:");
|
|
1166
|
+
puts("\t-short");
|
|
1167
|
+
puts("\t-shortPaired");
|
|
1168
|
+
puts("\t-short2");
|
|
1169
|
+
puts("\t-shortPaired2");
|
|
1170
|
+
puts("\t-long");
|
|
1171
|
+
puts("\t-longPaired");
|
|
1172
|
+
puts("\t-reference");
|
|
1173
|
+
puts("");
|
|
1174
|
+
puts("Options:");
|
|
1175
|
+
puts("\t-strand_specific\t: for strand specific transcriptome sequencing data (default: off)");
|
|
1176
|
+
puts("");
|
|
1177
|
+
puts("Output:");
|
|
1178
|
+
puts("\tdirectory/Roadmaps");
|
|
1179
|
+
puts("\tdirectory/Sequences");
|
|
1180
|
+
puts("\t\t[Both files are picked up by graph, so please leave them there]");
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
// General argument parser for most functions
|
|
1184
|
+
// Basically a reused portion of toplevel code dumped into here
|
|
1185
|
+
void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand, boolean * noHash)
|
|
1186
|
+
{
|
|
1187
|
+
int argIndex = 1;
|
|
1188
|
+
int filetype = FASTA;
|
|
1189
|
+
Category cat = 0;
|
|
1190
|
+
IDnum sequenceIndex = 1;
|
|
1191
|
+
short short_var;
|
|
1192
|
+
ReferenceCoordinateTable * refCoords = newReferenceCoordinateTable();
|
|
1193
|
+
boolean reuseSequences = false;
|
|
1194
|
+
boolean separate_pair_files = false;
|
|
1195
|
+
|
|
1196
|
+
if (argc < 2) {
|
|
1197
|
+
printUsage();
|
|
1198
|
+
#ifdef DEBUG
|
|
1199
|
+
abort();
|
|
1200
|
+
#endif
|
|
1201
|
+
exit(1);
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
for (argIndex = 1; argIndex < argc; argIndex++) {
|
|
1205
|
+
if (strcmp(argv[argIndex], "-strand_specific") == 0) {
|
|
1206
|
+
*double_strand = false;
|
|
1207
|
+
reference_coordinate_double_strand = false;
|
|
1208
|
+
} else if (strcmp(argv[argIndex], "-reuse_Sequences") == 0) {
|
|
1209
|
+
reuseSequences = true;
|
|
1210
|
+
} else if (strcmp(argv[argIndex], "-reuse_binary") == 0) {
|
|
1211
|
+
reuseSequences = true;
|
|
1212
|
+
} else if (strcmp(argv[argIndex], "-noHash") == 0) {
|
|
1213
|
+
*noHash = true;
|
|
1214
|
+
}
|
|
1215
|
+
}
|
|
1216
|
+
|
|
1217
|
+
if (reuseSequences)
|
|
1218
|
+
return;
|
|
1219
|
+
|
|
1220
|
+
SequencesWriter * seqWriteInfo = NULL;
|
|
1221
|
+
if (isCreateBinary()) {
|
|
1222
|
+
seqWriteInfo = openCnySeqForWrite(filename);
|
|
1223
|
+
seqWriteInfo->m_unifiedSeqFileHeader.m_bDoubleStrand = *double_strand;
|
|
1224
|
+
// file is already open
|
|
1225
|
+
} else {
|
|
1226
|
+
seqWriteInfo = callocOrExit(1, SequencesWriter);
|
|
1227
|
+
seqWriteInfo->m_pFile = fopen(filename, "w");
|
|
1228
|
+
}
|
|
1229
|
+
|
|
1230
|
+
for (argIndex = 1; argIndex < argc; argIndex++) {
|
|
1231
|
+
if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
|
|
1232
|
+
|
|
1233
|
+
if (strcmp(argv[argIndex], "-fastq") == 0)
|
|
1234
|
+
filetype = FASTQ;
|
|
1235
|
+
else if (strcmp(argv[argIndex], "-fasta") == 0)
|
|
1236
|
+
filetype = FASTA;
|
|
1237
|
+
else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
|
|
1238
|
+
filetype = FASTQ_GZ;
|
|
1239
|
+
else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
|
|
1240
|
+
filetype = FASTA_GZ;
|
|
1241
|
+
else if (strcmp(argv[argIndex], "-sam") == 0)
|
|
1242
|
+
filetype = SAM;
|
|
1243
|
+
else if (strcmp(argv[argIndex], "-bam") == 0)
|
|
1244
|
+
filetype = BAM;
|
|
1245
|
+
else if (strcmp(argv[argIndex], "-raw") == 0)
|
|
1246
|
+
filetype = RAW;
|
|
1247
|
+
else if (strcmp(argv[argIndex], "-raw.gz") == 0)
|
|
1248
|
+
filetype = RAW_GZ;
|
|
1249
|
+
else if (strcmp(argv[argIndex], "-fmtAuto") == 0)
|
|
1250
|
+
filetype = AUTO;
|
|
1251
|
+
else if (strcmp(argv[argIndex], "-short") == 0)
|
|
1252
|
+
cat = 0;
|
|
1253
|
+
else if (strcmp(argv[argIndex], "-shortPaired") ==
|
|
1254
|
+
0)
|
|
1255
|
+
cat = 1;
|
|
1256
|
+
else if (strncmp
|
|
1257
|
+
(argv[argIndex], "-shortPaired",
|
|
1258
|
+
12) == 0) {
|
|
1259
|
+
sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
|
|
1260
|
+
cat = (Category) short_var;
|
|
1261
|
+
if (cat < 1 || cat > CATEGORIES) {
|
|
1262
|
+
velvetLog("Unknown option: %s\n",
|
|
1263
|
+
argv[argIndex]);
|
|
1264
|
+
#ifdef DEBUG
|
|
1265
|
+
abort();
|
|
1266
|
+
#endif
|
|
1267
|
+
exit(1);
|
|
1268
|
+
}
|
|
1269
|
+
cat--;
|
|
1270
|
+
cat *= 2;
|
|
1271
|
+
cat++;
|
|
1272
|
+
} else if (strncmp(argv[argIndex], "-short", 6) ==
|
|
1273
|
+
0) {
|
|
1274
|
+
sscanf(argv[argIndex], "-short%hd", &short_var);
|
|
1275
|
+
cat = (Category) short_var;
|
|
1276
|
+
if (cat < 1 || cat > CATEGORIES) {
|
|
1277
|
+
velvetLog("Unknown option: %s\n",
|
|
1278
|
+
argv[argIndex]);
|
|
1279
|
+
#ifdef DEBUG
|
|
1280
|
+
abort();
|
|
1281
|
+
#endif
|
|
1282
|
+
exit(1);
|
|
1283
|
+
}
|
|
1284
|
+
cat--;
|
|
1285
|
+
cat *= 2;
|
|
1286
|
+
} else if (strcmp(argv[argIndex], "-long") == 0)
|
|
1287
|
+
cat = LONG; // CATEGORIES * 2;
|
|
1288
|
+
else if (strcmp(argv[argIndex], "-longPaired") == 0)
|
|
1289
|
+
cat = LONG_PAIRED; // CATEGORIES * 2 + 1;
|
|
1290
|
+
else if (strcmp(argv[argIndex], "-reference") == 0)
|
|
1291
|
+
cat = REFERENCE; // CATEGORIES * 2 + 2
|
|
1292
|
+
else if (strcmp(argv[argIndex], "-strand_specific") == 0) {
|
|
1293
|
+
*double_strand = false;
|
|
1294
|
+
reference_coordinate_double_strand = false;
|
|
1295
|
+
} else if (strcmp(argv[argIndex], "-noHash") == 0) {
|
|
1296
|
+
;
|
|
1297
|
+
} else if (strcmp(argv[argIndex], "-create_binary") == 0) {
|
|
1298
|
+
;
|
|
1299
|
+
} else if (strcmp(argv[argIndex], "-interleaved") == 0) {
|
|
1300
|
+
separate_pair_files = false;
|
|
1301
|
+
} else if (strcmp(argv[argIndex], "-separate") == 0) {
|
|
1302
|
+
separate_pair_files = true;
|
|
1303
|
+
}
|
|
1304
|
+
else {
|
|
1305
|
+
velvetLog("Unknown option: %s\n",
|
|
1306
|
+
argv[argIndex]);
|
|
1307
|
+
#ifdef DEBUG
|
|
1308
|
+
abort();
|
|
1309
|
+
#endif
|
|
1310
|
+
exit(1);
|
|
1311
|
+
}
|
|
1312
|
+
|
|
1313
|
+
continue;
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
if (cat == -1)
|
|
1317
|
+
continue;
|
|
1318
|
+
|
|
1319
|
+
switch (filetype) {
|
|
1320
|
+
case FASTA:
|
|
1321
|
+
case FASTQ:
|
|
1322
|
+
case FASTA_GZ:
|
|
1323
|
+
case FASTQ_GZ:
|
|
1324
|
+
case AUTO:
|
|
1325
|
+
// Separate files for paired reads? Note odd categories used for paired read type
|
|
1326
|
+
if (separate_pair_files && cat%2==1) {
|
|
1327
|
+
argIndex++;
|
|
1328
|
+
if (argIndex>=argc)
|
|
1329
|
+
exitErrorf(EXIT_FAILURE, false, "Require left & right filename for -separate mode");
|
|
1330
|
+
readFastXPair(filetype, seqWriteInfo, argv[argIndex-1], argv[argIndex], cat, &sequenceIndex);
|
|
1331
|
+
} else {
|
|
1332
|
+
readFastXFile(filetype, seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
|
|
1333
|
+
}
|
|
1334
|
+
break;
|
|
1335
|
+
case RAW:
|
|
1336
|
+
if (separate_pair_files && cat%2==1) {
|
|
1337
|
+
exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
|
|
1338
|
+
}
|
|
1339
|
+
readRawFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
|
|
1340
|
+
break;
|
|
1341
|
+
case RAW_GZ:
|
|
1342
|
+
if (separate_pair_files && cat%2==1) {
|
|
1343
|
+
exitErrorf(EXIT_FAILURE, false, "Currently do not support -separate mode for RAW");
|
|
1344
|
+
}
|
|
1345
|
+
readRawGZFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex);
|
|
1346
|
+
break;
|
|
1347
|
+
case SAM:
|
|
1348
|
+
readSAMFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
|
|
1349
|
+
break;
|
|
1350
|
+
case BAM:
|
|
1351
|
+
readBAMFile(seqWriteInfo, argv[argIndex], cat, &sequenceIndex, refCoords);
|
|
1352
|
+
break;
|
|
1353
|
+
default:
|
|
1354
|
+
velvetLog("Screw up in parser... exiting\n");
|
|
1355
|
+
#ifdef DEBUG
|
|
1356
|
+
abort();
|
|
1357
|
+
#endif
|
|
1358
|
+
exit(1);
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1361
|
+
|
|
1362
|
+
destroyReferenceCoordinateTable(refCoords);
|
|
1363
|
+
if (isCreateBinary()) {
|
|
1364
|
+
closeCnySeqForWrite(seqWriteInfo);
|
|
1365
|
+
} else {
|
|
1366
|
+
fclose(seqWriteInfo->m_pFile);
|
|
1367
|
+
}
|
|
1368
|
+
if (seqWriteInfo) {
|
|
1369
|
+
free(seqWriteInfo);
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1372
|
+
|
|
1373
|
+
// Builds reads->mateReads: for each read, the index of its mate, or -1 when
// it has none.
//
// Reads are scanned in order. Inside a run of one odd (paired) category,
// reads alternate between "first of a pair" (mate is the next read) and
// "second of a pair" (mate is the previous read). When the category changes
// while a first read is still waiting for its mate, that read is detached:
// its mate entry goes back to -1 and its category is decremented.
void createReadPairingArray(ReadSet* reads)
{
	const IDnum total = reads->readCount;
	IDnum *mates = mallocOrExit(total, IDnum);
	Category activeCat = 0;
	boolean awaitingMate = false;
	IDnum i;

	for (i = 0; i < total; i++)
		mates[i] = -1;

	reads->mateReads = mates;

	for (i = 0; i < total; i++) {
		if (reads->categories[i] == activeCat) {
			// Still inside the same run; only paired runs need work
			if (!(activeCat & 1))
				continue;

			if (awaitingMate) {
				mates[i] = i - 1;
				awaitingMate = false;
			} else {
				mates[i] = i + 1;
				awaitingMate = true;
			}
			continue;
		}

		// The category changed: detach a first read left without a mate
		if ((activeCat & 1) && awaitingMate) {
			mates[i - 1] = -1;
			reads->categories[i - 1]--;
			awaitingMate = false;
		}

		activeCat = reads->categories[i];

		// Entering a paired category: this read opens a new pair
		if (activeCat & 1) {
			mates[i] = i + 1;
			awaitingMate = true;
		}
	}
}
|
|
1431
|
+
|
|
1432
|
+
// Counts how many distinct odd (paired) categories occur among the reads.
// Categories 2k and 2k + 1 share slot k of the bookkeeping array, and the
// scan stops early once every slot has been filled.
int pairedCategories(ReadSet * reads)
{
	boolean seen[CATEGORIES + 1];
	int distinct = 0;
	IDnum i;

	memset(seen, 0, sizeof(seen));

	for (i = 0; i < reads->readCount; i++) {
		const Category readCat = reads->categories[i];

		if (!(readCat & 1))
			continue;
		if (seen[readCat / 2])
			continue;

		seen[readCat / 2] = true;
		distinct++;
		if (distinct > CATEGORIES)
			break;
	}

	return distinct;
}
|
|
1451
|
+
|
|
1452
|
+
// Tests the bit for read `index` in the reads->secondInPair bitmap, which
// stores eight reads per byte with the lowest bit first.
// The result is the masked byte: non-zero when the bit is set.
boolean isSecondInPair(ReadSet * reads, IDnum index)
{
	const IDnum byteOffset = index / 8;
	const int bitMask = 1 << (index & 7);

	return reads->secondInPair[byteOffset] & bitMask;
}
|
|
1456
|
+
|
|
1457
|
+
// Rebuilds the reads->secondInPair bitmap, replacing any previous one.
//
// Inside a run of one odd (paired) category, reads alternate between first
// and second of a pair, and the bit is set for every second read.
// When a new paired category begins right after a paired read that was not a
// second read, that earlier read is demoted to the unpaired (even) category
// directly below. The last read of the set is demoted in the same way if it
// is not a second read.
//
// An empty read set is handled: the bitmap is still reallocated, but no read
// is inspected.
void computeSecondInPair(ReadSet * reads)
{
	IDnum index;
	Category previousCat = 0;
	int phase = 0;

	free(reads->secondInPair);
	reads->secondInPair = callocOrExit((reads->readCount + 7) / 8, unsigned char);

	for (index = 0; index < reads->readCount; index++) {
		const Category currentCat = reads->categories[index];

		if (currentCat & 1) {
			if (previousCat != currentCat) {
				// First read of a new paired run
				phase = 1;
				if (index > 0 && (previousCat & 1) && !isSecondInPair(reads, index - 1))
					reads->categories[index - 1] = (reads->categories[index - 1] / 2) * 2;
			} else if (phase == 0) {
				phase = 1;
			} else {
				reads->secondInPair[index / 8] |= (1 << (index & 7));
				phase = 0;
			}
		}

		previousCat = currentCat;
	}

	// Safeguard against odd sets of reads. Skipped when there are no reads,
	// where index readCount - 1 would lie outside both arrays.
	if (reads->readCount > 0 && !isSecondInPair(reads, reads->readCount - 1)) {
		reads->categories[reads->readCount - 1] = (reads->categories[reads->readCount - 1] / 2) * 2;
	}
}
|
|
1499
|
+
|
|
1500
|
+
// Unpairs every flagged read that currently has an odd (paired) category:
// both the read and its neighbouring mate are moved to the even category
// directly below. The mate is the previous read when the flagged read is the
// second of its pair, otherwise the next one.
// Does nothing when dubiousReads or reads->mateReads is NULL.
void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
{
	const IDnum total = reads->readCount;
	IDnum i;

	if (!dubiousReads || !reads->mateReads)
		return;

	for (i = 0; i < total; i++) {
		IDnum mate;

		if (dubiousReads[i] && reads->categories[i] % 2 != 0) {
			mate = isSecondInPair(reads, i) ? i - 1 : i + 1;

			reads->categories[i] = (reads->categories[i] / 2) * 2;
			reads->categories[mate] = (reads->categories[mate] / 2) * 2;
		}
	}
}
|
|
1523
|
+
|
|
1524
|
+
ReadSet *importReadSet(char *filename)
|
|
1525
|
+
{
|
|
1526
|
+
FILE *file = fopen(filename, "r");
|
|
1527
|
+
char *sequence = NULL;
|
|
1528
|
+
Coordinate bpCount = 0;
|
|
1529
|
+
const int maxline = 5000;
|
|
1530
|
+
char line[5000];
|
|
1531
|
+
IDnum sequenceCount, sequenceIndex;
|
|
1532
|
+
ReadSet *reads;
|
|
1533
|
+
short int temp_short;
|
|
1534
|
+
int lineLength;
|
|
1535
|
+
|
|
1536
|
+
if (file != NULL)
|
|
1537
|
+
velvetLog("Reading read set file %s;\n", filename);
|
|
1538
|
+
else
|
|
1539
|
+
exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
|
|
1540
|
+
|
|
1541
|
+
reads = newReadSet();
|
|
1542
|
+
|
|
1543
|
+
// Count number of separate sequences
|
|
1544
|
+
sequenceCount = 0;
|
|
1545
|
+
while (fgets(line, maxline, file) != NULL)
|
|
1546
|
+
if (line[0] == '>')
|
|
1547
|
+
sequenceCount++;
|
|
1548
|
+
fclose(file);
|
|
1549
|
+
velvetLog("%li sequences found\n", (long) sequenceCount);
|
|
1550
|
+
|
|
1551
|
+
reads->readCount = sequenceCount;
|
|
1552
|
+
|
|
1553
|
+
if (reads->readCount == 0) {
|
|
1554
|
+
reads->sequences = NULL;
|
|
1555
|
+
reads->categories = NULL;
|
|
1556
|
+
return reads;
|
|
1557
|
+
}
|
|
1558
|
+
|
|
1559
|
+
reads->sequences = callocOrExit(sequenceCount, char *);
|
|
1560
|
+
reads->categories = callocOrExit(sequenceCount, Category);
|
|
1561
|
+
// Counting base pair length of each sequence:
|
|
1562
|
+
file = fopen(filename, "r");
|
|
1563
|
+
sequenceIndex = -1;
|
|
1564
|
+
while (fgets(line, maxline, file) != NULL) {
|
|
1565
|
+
if (line[0] == '>') {
|
|
1566
|
+
|
|
1567
|
+
// Reading category info
|
|
1568
|
+
sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
|
|
1569
|
+
&temp_short);
|
|
1570
|
+
reads->categories[sequenceIndex + 1] = (Category) temp_short;
|
|
1571
|
+
|
|
1572
|
+
if (sequenceIndex != -1)
|
|
1573
|
+
reads->sequences[sequenceIndex] =
|
|
1574
|
+
mallocOrExit(bpCount + 1, char);
|
|
1575
|
+
sequenceIndex++;
|
|
1576
|
+
bpCount = 0;
|
|
1577
|
+
} if (line[0] == 'M') {;
|
|
1578
|
+
// Map line
|
|
1579
|
+
} else {
|
|
1580
|
+
bpCount += (Coordinate) strlen(line) - 1;
|
|
1581
|
+
|
|
1582
|
+
if (sizeof(ShortLength) == sizeof(int16_t) && (bpCount > SHRT_MAX || bpCount < 0)) {
|
|
1583
|
+
velvetLog("Read %li of length %lli, longer than limit %i\n",
|
|
1584
|
+
(long) sequenceIndex + 1, (long long) bpCount, SHRT_MAX);
|
|
1585
|
+
velvetLog("You should modify recompile with the LONGSEQUENCES option (cf. manual)\n");
|
|
1586
|
+
exit(1);
|
|
1587
|
+
}
|
|
1588
|
+
}
|
|
1589
|
+
}
|
|
1590
|
+
|
|
1591
|
+
//velvetLog("Sequence %d has length %d\n", sequenceIndex, bpCount);
|
|
1592
|
+
reads->sequences[sequenceIndex] =
|
|
1593
|
+
mallocOrExit(bpCount + 1, char);
|
|
1594
|
+
fclose(file);
|
|
1595
|
+
|
|
1596
|
+
// Reopen file and memorize line:
|
|
1597
|
+
file = fopen(filename, "r");
|
|
1598
|
+
sequenceIndex = -1;
|
|
1599
|
+
while (fgets(line, maxline, file)) {
|
|
1600
|
+
if (line[0] == '>') {
|
|
1601
|
+
if (sequenceIndex != -1) {
|
|
1602
|
+
sequence[bpCount] = '\0';
|
|
1603
|
+
}
|
|
1604
|
+
sequenceIndex++;
|
|
1605
|
+
bpCount = 0;
|
|
1606
|
+
//velvetLog("Starting to read sequence %d\n",
|
|
1607
|
+
// sequenceIndex);
|
|
1608
|
+
sequence = reads->sequences[sequenceIndex];
|
|
1609
|
+
} else if (line[0] == 'M') {;
|
|
1610
|
+
// Map line
|
|
1611
|
+
} else {
|
|
1612
|
+
lineLength = strlen(line) - 1;
|
|
1613
|
+
strncpy(sequence + bpCount, line, lineLength);
|
|
1614
|
+
bpCount += (Coordinate) lineLength;
|
|
1615
|
+
}
|
|
1616
|
+
}
|
|
1617
|
+
|
|
1618
|
+
sequence[bpCount] = '\0';
|
|
1619
|
+
fclose(file);
|
|
1620
|
+
computeSecondInPair(reads);
|
|
1621
|
+
|
|
1622
|
+
velvetLog("Done\n");
|
|
1623
|
+
return reads;
|
|
1624
|
+
|
|
1625
|
+
}
|
|
1626
|
+
|
|
1627
|
+
void logInstructions(int argc, char **argv, char *directory)
|
|
1628
|
+
{
|
|
1629
|
+
int index;
|
|
1630
|
+
char *logFilename =
|
|
1631
|
+
mallocOrExit(strlen(directory) + 100, char);
|
|
1632
|
+
FILE *logFile;
|
|
1633
|
+
time_t date;
|
|
1634
|
+
char *string;
|
|
1635
|
+
|
|
1636
|
+
time(&date);
|
|
1637
|
+
string = ctime(&date);
|
|
1638
|
+
|
|
1639
|
+
strcpy(logFilename, directory);
|
|
1640
|
+
strcat(logFilename, "/Log");
|
|
1641
|
+
logFile = fopen(logFilename, "a");
|
|
1642
|
+
|
|
1643
|
+
if (logFile == NULL)
|
|
1644
|
+
exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
|
|
1645
|
+
|
|
1646
|
+
velvetFprintf(logFile, "%s", string);
|
|
1647
|
+
|
|
1648
|
+
for (index = 0; index < argc; index++)
|
|
1649
|
+
velvetFprintf(logFile, " %s", argv[index]);
|
|
1650
|
+
|
|
1651
|
+
velvetFprintf(logFile, "\n");
|
|
1652
|
+
|
|
1653
|
+
velvetFprintf(logFile, "Version %i.%i.%2.2i%s\n", VERSION_NUMBER,
|
|
1654
|
+
RELEASE_NUMBER, UPDATE_NUMBER, VERSION_BRANCH);
|
|
1655
|
+
velvetFprintf(logFile, "Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)\n");
|
|
1656
|
+
velvetFprintf(logFile, "This is free software; see the source for copying conditions. There is NO\n");
|
|
1657
|
+
velvetFprintf(logFile, "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
|
|
1658
|
+
velvetFprintf(logFile, "Compilation settings:\n");
|
|
1659
|
+
velvetFprintf(logFile, "CATEGORIES = %i\n", CATEGORIES);
|
|
1660
|
+
velvetFprintf(logFile, "MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
|
|
1661
|
+
#ifdef _OPENMP
|
|
1662
|
+
velvetFprintf(logFile, "OPENMP\n");
|
|
1663
|
+
#endif
|
|
1664
|
+
#ifdef LONGSEQUENCES
|
|
1665
|
+
velvetFprintf(logFile, "LONGSEQUENCES\n");
|
|
1666
|
+
#endif
|
|
1667
|
+
#ifdef BIGASSEMBLY
|
|
1668
|
+
velvetFprintf(logFile, "BIGASSEMBLY\n");
|
|
1669
|
+
#endif
|
|
1670
|
+
#ifdef COLOR
|
|
1671
|
+
velvetFprintf(logFile, "COLOR\n");
|
|
1672
|
+
#endif
|
|
1673
|
+
#ifdef DEBUG
|
|
1674
|
+
velvetFprintf(logFile, "DEBUG\n");
|
|
1675
|
+
#endif
|
|
1676
|
+
velvetFprintf(logFile, "\n");
|
|
1677
|
+
|
|
1678
|
+
fclose(logFile);
|
|
1679
|
+
free(logFilename);
|
|
1680
|
+
}
|
|
1681
|
+
|
|
1682
|
+
// Releases a ReadSet together with the arrays it points to.
// Passing NULL is allowed and does nothing.
//
// sequences, labels, confidenceScores and kmerProbabilities are
// per-read pointer arrays: each of their readCount entries is freed
// before the array itself. tSequences, tSeqMem, mateReads, categories
// and secondInPair are each released with a single free().
void destroyReadSet(ReadSet * reads)
{
	IDnum readIndex;

	if (!reads)
		return;

	if (reads->sequences) {
		for (readIndex = 0; readIndex < reads->readCount; readIndex++)
			free(reads->sequences[readIndex]);
		free(reads->sequences);
	}

	// free(NULL) is a no-op, so these fields need no guard.
	free(reads->tSequences);
	free(reads->tSeqMem);

	if (reads->labels) {
		for (readIndex = 0; readIndex < reads->readCount; readIndex++)
			free(reads->labels[readIndex]);
	}
	free(reads->labels);

	if (reads->confidenceScores) {
		for (readIndex = 0; readIndex < reads->readCount; readIndex++)
			free(reads->confidenceScores[readIndex]);
	}
	free(reads->confidenceScores);

	if (reads->kmerProbabilities) {
		for (readIndex = 0; readIndex < reads->readCount; readIndex++)
			free(reads->kmerProbabilities[readIndex]);
	}
	free(reads->kmerProbabilities);

	free(reads->mateReads);
	free(reads->categories);
	free(reads->secondInPair);

	// The container goes last, once nothing reads its fields any more.
	free(reads);
}
|
|
1722
|
+
|
|
1723
|
+
// Returns a newly allocated array with one entry per read in `reads`.
// Entry i is the length of the i-th tight string in reads->tSequences
// minus (wordLength - 1).
// NOTE(review): this is presumably the number of wordLength-sized
// k-mers in the read -- confirm with callers.
// The array is not freed here; the caller is responsible for it.
ShortLength *getSequenceLengths(ReadSet * reads, int wordLength)
{
	const int overlap = wordLength - 1;
	ShortLength *result = callocOrExit(reads->readCount, ShortLength);
	IDnum readIndex;

	for (readIndex = 0; readIndex < reads->readCount; readIndex++) {
		TightString *tString =
		    getTightStringInArray(reads->tSequences, readIndex);

		result[readIndex] = getLength(tString) - overlap;
	}

	return result;
}
|