miga-base 1.2.17.0 → 1.2.17.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +2 -2
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
- data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
- data/utils/FastAAI/FastAAI +3659 -0
- data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
- data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
- data/utils/FastAAI/README.md +84 -0
- data/utils/enveomics/Docs/recplot2.md +244 -0
- data/utils/enveomics/Examples/aai-matrix.bash +66 -0
- data/utils/enveomics/Examples/ani-matrix.bash +66 -0
- data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
- data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
- data/utils/enveomics/LICENSE.txt +73 -0
- data/utils/enveomics/Makefile +52 -0
- data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
- data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
- data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
- data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
- data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
- data/utils/enveomics/Manifest/Tasks/mapping.json +165 -0
- data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
- data/utils/enveomics/Manifest/Tasks/other.json +906 -0
- data/utils/enveomics/Manifest/Tasks/remote.json +356 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
- data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
- data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
- data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
- data/utils/enveomics/Manifest/categories.json +165 -0
- data/utils/enveomics/Manifest/examples.json +162 -0
- data/utils/enveomics/Manifest/tasks.json +4 -0
- data/utils/enveomics/README.md +42 -0
- data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
- data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
- data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +138 -0
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
- data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
- data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
- data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
- data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
- data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
- data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
- data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
- data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
- data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
- data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
- data/utils/enveomics/Scripts/Chao1.pl +97 -0
- data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
- data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
- data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
- data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
- data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
- data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
- data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
- data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
- data/utils/enveomics/Scripts/FastA.length.pl +38 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
- data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
- data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
- data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
- data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
- data/utils/enveomics/Scripts/FastA.split.pl +55 -0
- data/utils/enveomics/Scripts/FastA.split.rb +79 -0
- data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
- data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
- data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
- data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
- data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
- data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
- data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
- data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
- data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
- data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
- data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
- data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
- data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
- data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
- data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
- data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
- data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
- data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
- data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
- data/utils/enveomics/Scripts/SRA.download.bash +67 -0
- data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
- data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
- data/utils/enveomics/Scripts/Table.barplot.R +31 -0
- data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
- data/utils/enveomics/Scripts/Table.filter.pl +61 -0
- data/utils/enveomics/Scripts/Table.merge.pl +77 -0
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/Table.replace.rb +69 -0
- data/utils/enveomics/Scripts/Table.round.rb +63 -0
- data/utils/enveomics/Scripts/Table.split.pl +57 -0
- data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
- data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
- data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
- data/utils/enveomics/Scripts/aai.rb +421 -0
- data/utils/enveomics/Scripts/ani.rb +362 -0
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/clust.rand.rb +102 -0
- data/utils/enveomics/Scripts/gi2tax.rb +103 -0
- data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
- data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
- data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
- data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
- data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
- data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
- data/utils/enveomics/Scripts/ogs.rb +104 -0
- data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +108 -0
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/Tests/Makefile +10 -0
- data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
- data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
- data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
- data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
- data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
- data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
- data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
- data/utils/enveomics/Tests/alkB.nwk +1 -0
- data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
- data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
- data/utils/enveomics/Tests/hiv1.faa +59 -0
- data/utils/enveomics/Tests/hiv1.fna +134 -0
- data/utils/enveomics/Tests/hiv2.faa +70 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
- data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
- data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
- data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
- data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
- data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
- data/utils/enveomics/build_enveomics_r.bash +45 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
- data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
- data/utils/enveomics/enveomics.R/R/autoprune.R +167 -0
- data/utils/enveomics/enveomics.R/R/barplot.R +203 -0
- data/utils/enveomics/enveomics.R/R/cliopts.R +141 -0
- data/utils/enveomics/enveomics.R/R/df2dist.R +192 -0
- data/utils/enveomics/enveomics.R/R/growthcurve.R +349 -0
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/recplot.R +419 -0
- data/utils/enveomics/enveomics.R/R/recplot2.R +1698 -0
- data/utils/enveomics/enveomics.R/R/tribs.R +638 -0
- data/utils/enveomics/enveomics.R/R/utils.R +90 -0
- data/utils/enveomics/enveomics.R/README.md +81 -0
- data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
- data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +111 -0
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +34 -0
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +25 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +63 -0
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +46 -0
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +78 -0
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +147 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +52 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +21 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +34 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +31 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
- data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
- data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +81 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +49 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +48 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +22 -0
- data/utils/enveomics/globals.mk +8 -0
- data/utils/enveomics/manifest.json +9 -0
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- metadata +268 -6
@@ -0,0 +1,1555 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import os
|
5
|
+
import subprocess
|
6
|
+
import tempfile
|
7
|
+
import argparse
|
8
|
+
import multiprocessing
|
9
|
+
import re
|
10
|
+
import shutil
|
11
|
+
from datetime import datetime
|
12
|
+
|
13
|
+
#Reads a file with adapters and uses them as the starting set for adapter identification. By default, uses the current MiGA adapter list as of Feb. 23, 2021
|
14
|
+
def read_adapters(adapters_fasta):
    """Load the starting set of adapters for adapter identification.

    Args:
        adapters_fasta: Path to a FASTA file of adapter sequences, or the
            literal string "internal" to use the adapters produced by
            generate_adapters_temporary_file().

    Returns:
        A tuple (adapters, adapters_fasta, cleanup) where adapters maps each
        header (the text after '>') to its sequence with wrapped lines
        joined, adapters_fasta is the path that was read (or the one returned
        by generate_adapters_temporary_file()), and cleanup is True only when
        the internal adapter list was used.
    """
    if adapters_fasta == "internal":
        adapters, adapters_fasta = generate_adapters_temporary_file()
        # NOTE(review): cleanup presumably tells the caller to remove the
        # generated file afterwards - confirm against the caller.
        return adapters, adapters_fasta, True

    adapters = {}
    current_seq = ""
    current_id = ""

    # The context manager closes the file even if reading raises.
    with open(adapters_fasta, "r") as adapt:
        for line in adapt:
            if line.startswith(">"):
                # A new header ends the previous record; keep it only if it
                # actually carried sequence.
                if current_seq:
                    adapters[current_id] = current_seq
                current_id = line.strip()[1:]
                current_seq = ""
            else:
                current_seq += line.strip()

    # Flush the final record under the same rule as the others, so an empty
    # file or a trailing header without sequence adds no empty adapter.
    if current_seq:
        adapters[current_id] = current_seq

    return adapters, adapters_fasta, False
|
42
|
+
|
43
|
+
#Only contains adapters we already recognize as part of a kit. It will need to be updated as new ones are added.
|
44
|
+
def family_detection(adapter_seqs):
    """Expand a set of adapters with the rest of each recognized kit family.

    Every sequence in adapter_seqs is compared against the built-in adapter
    families below. When a sequence exactly matches a member of a family,
    every other member of that family whose sequence is not already present
    is added to adapter_seqs (the dict is modified in place).

    Args:
        adapter_seqs: dict mapping adapter id to adapter sequence.

    Returns:
        A new dict with the same entries as the expanded adapter_seqs, each
        key prefixed with '>' so it can be printed as a FASTA header.
    """
    #Some sequences repeat between families; a sequence that is already present is never added a second time.
    known_families = {
        #MiGA adapters
        'singleend': {'Illumina_Single_End_Apapter_1': 'ACACTCTTTCCCTACACGACGCTGTTCCATCT', 'Illumina_Single_End_Apapter_2': 'CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT', 'Illumina_Single_End_PCR_Primer_1': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Single_End_PCR_Primer_2': 'CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT', 'Illumina_Single_End_Sequencing_Primer': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT'},
        'pairedend': {'Illumina_Paired_End_Adapter_1': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Paired_End_Adapter_2': 'CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT', 'Illumina_Paried_End_PCR_Primer_1': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Paired_End_PCR_Primer_2': 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT', 'Illumina_Paried_End_Sequencing_Primer_1': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Paired_End_Sequencing_Primer_2': 'CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT'},
        'dpnII': {'Illumina_DpnII_expression_Adapter_1': 'ACAGGTTCAGAGTTCTACAGTCCGAC', 'Illumina_DpnII_expression_Adapter_2': 'CAAGCAGAAGACGGCATACGA', 'Illumina_DpnII_expression_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA', 'Illumina_DpnII_expression_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA', 'Illumina_DpnII_expression_Sequencing_Primer': 'CGACAGGTTCAGAGTTCTACAGTCCGACGATC', 'Illumina_NlaIII_expression_Adapter_1': 'ACAGGTTCAGAGTTCTACAGTCCGACATG', 'Illumina_NlaIII_expression_Adapter_2': 'CAAGCAGAAGACGGCATACGA', 'Illumina_NlaIII_expression_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA', 'Illumina_NlaIII_expression_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA', 'Illumina_NlaIII_expression_Sequencing_Primer': 'CCGACAGGTTCAGAGTTCTACAGTCCGACATG'},
        'smallrna': {'Illumina_Small_RNA_Adapter_1': 'GTTCAGAGTTCTACAGTCCGACGATC', 'Illumina_Small_RNA_Adapter_2': 'TCGTATGCCGTCTTCTGCTTGT', 'Illumina_Small_RNA_RT_Primer': 'CAAGCAGAAGACGGCATACGA', 'Illumina_Small_RNA_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA', 'Illumina_Small_RNA_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA', 'Illumina_Small_RNA_Sequencing_Primer': 'CGACAGGTTCAGAGTTCTACAGTCCGACGATC'},
        'multiplex': {'Illumina_Multiplexing_Adapter_1': 'GATCGGAAGAGCACACGTCT', 'Illumina_Multiplexing_Adapter_2': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Multiplexing_PCR_Primer_1.01': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Multiplexing_PCR_Primer_2.01': 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT', 'Illumina_Multiplexing_Read1_Sequencing_Primer': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'Illumina_Multiplexing_Index_Sequencing_Primer': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC', 'Illumina_Multiplexing_Read2_Sequencing_Primer': 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'},
        'pcr': {'Illumina_PCR_Primer_Index_1': 'CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_2': 'CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_3': 'CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_4': 'CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_5': 'CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_6': 'CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_7': 'CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_8': 'CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_9': 'CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_10': 'CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_11': 'CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC', 'Illumina_PCR_Primer_Index_12': 'CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC'},
        'dpnIIgex': {'Illumina_DpnII_Gex_Adapter_1': 'GATCGTCGGACTGTAGAACTCTGAAC', 'Illumina_DpnII_Gex_Adapter_1.01': 'ACAGGTTCAGAGTTCTACAGTCCGAC', 'Illumina_DpnII_Gex_Adapter_2': 'CAAGCAGAAGACGGCATACGA', 'Illumina_DpnII_Gex_Adapter_2.01': 'TCGTATGCCGTCTTCTGCTTG', 'Illumina_DpnII_Gex_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA', 'Illumina_DpnII_Gex_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA', 'Illumina_DpnII_Gex_Sequencing_Primer': 'CGACAGGTTCAGAGTTCTACAGTCCGACGATC', 'Illumina_NlaIII_Gex_Adapter_1.01': 'TCGGACTGTAGAACTCTGAAC', 'Illumina_NlaIII_Gex_Adapter_1.02': 'ACAGGTTCAGAGTTCTACAGTCCGACATG', 'Illumina_NlaIII_Gex_Adapter_2.01': 'CAAGCAGAAGACGGCATACGA', 'Illumina_NlaIII_Gex_Adapter_2.02': 'TCGTATGCCGTCTTCTGCTTG', 'Illumina_NlaIII_Gex_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA', 'Illumina_NlaIII_Gex_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA', 'Illumina_NlaIII_Gex_Sequencing_Primer': 'CCGACAGGTTCAGAGTTCTACAGTCCGACATG'},
        'otherrna': {'Illumina_5p_RNA_Adapter': 'GTTCAGAGTTCTACAGTCCGACGATC', 'Illumina_RNA_Adapter1': 'TCGTATGCCGTCTTCTGCTTGT', 'Illumina_Small_RNA_3p_Adapter_1': 'ATCTCGTATGCCGTCTTCTGCTTG'},
        'trueseq': {'TruSeq_Universal_Adapter': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT', 'TruSeq_Adapter_Index_1': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_2': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_3': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_4': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_5': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_6': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_7': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_8': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_9': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_10': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_11': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG', 'TruSeq_Adapter_Index_12': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG'},
        'rnapcr': {'Illumina_RNA_RT_Primer': 'GCCTTGGCACCCGAGAATTCCA', 'Illumina_RNA_PCR_Primer': 'AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA', 'RNA_PCR_Primer_Index_1': 'CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_2': 'CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_3': 'CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_4': 'CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_5': 'CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_6': 'CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_7': 'CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_8': 'CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_9': 'CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_10': 'CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_11': 'CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_12': 'CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_13': 'CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_14': 'CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_15': 'CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_16': 'CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_17': 'CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_18': 'CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_19': 'CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_20':
            'CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_21': 'CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_22': 'CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_23': 'CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_24': 'CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_25': 'CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_26': 'CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_27': 'CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_28': 'CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_29': 'CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_30': 'CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_31': 'CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_32': 'CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_33': 'CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_34': 'CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_35': 'CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_36': 'CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_37': 'CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_38': 'CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_39': 'CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_40': 'CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_41':
            'CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_42': 'CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_43': 'CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_44': 'CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_45': 'CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_46': 'CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_47': 'CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA', 'RNA_PCR_Primer_Index_48': 'CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA'},
        'abi': {'ABI_Dynabead_EcoP_Oligo': 'CTGATCTAGAGGTACCGGATCCCAGCAGT', 'ABI_Solid3_Adapter_A': 'CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG', 'ABI_Solid3_Adapter_B': 'CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT', 'ABI_Solid3_5_AMP_Primer': 'CCACTACGCCTCCGCTTTCCTCTCTATG', 'ABI_Solid3_3_AMP_Primer': 'CTGCCCCGGGTTCCTCATTCT', 'ABI_Solid3_EF1_alpha_Sense_Primer': 'CATGTGTGTTGAGAGCTTC', 'ABI_Solid3_EF1_alpha_Antisense_Primer': 'GAAAACCAAAGTGGTCCAC', 'ABI_Solid3_GAPDH_Forward_Primer': 'TTAGCACCCCTGGCCAAGG', 'ABI_Solid3_GAPDH_Reverse_Primer': 'CTTACTCCTTGGAGGCCATG'},
        'trueseq2': {'TruSeq2_SE': 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG', 'TruSeq2_PE_f': 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT', 'TruSeq2_PE_r': 'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG', 'TruSeq3_IndexedAdapter': 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC', 'TruSeq3_UniversalAdapter': 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'},
        'nextera': {'Nextera_PE_PrefixNX/1': 'AGATGTGTATAAGAGACAG', 'Nextera_PE_PrefixNX/2': 'AGATGTGTATAAGAGACAG', 'Nextera_PE_Trans1': 'TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG', 'Nextera_PE_Trans1_rc': 'CTGTCTCTTATACACATCTGACGCTGCCGACGA', 'Nextera_PE_Trans2': 'GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG', 'Nextera_PE_Trans2_rc': 'CTGTCTCTTATACACATCTCCGAGCCCACGAGAC'},
        #FaQCs adapters for safety
        'cre-loxp': {'cre-loxp-forward' : 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG', 'cre-loxp-reverse' : 'AGCATATTGAAGCATATTACATACGATATGCTTCAATAATGC'},
        'truseq1': {'TruSeq-adapter-1' : 'GGGGTAGTGTGGATCCTCCTCTAGGCAGTTGGGTTATTCTAGAAGCAGATGTGTTGGCTGTTTCTGAAACTCTGGAAAA', 'TruSeq-adapter-3' : 'CAACAGCCGGTCAAAACATCTGGAGGGTAAGCCATAAACACCTCAACAGAAAA'},
        'pcr_primer': {'PCR-primer-1' : 'CGATAACTTCGTATAATGTATGCTATACGAAGTTATTACG', 'PCR-primer-2' : 'GCATAACTTCGTATAGCATACATTATACGAAGTTATACGA'},
        'nextera_junction': {'Nextera-junction-adapter-1' : 'CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG'},
        'Nextera-primer-adapter': {'Nextera-primer-adapter-1' : 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC', 'Nextera-primer-adapter-2' : 'GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT'},
    }

    #Families hit by at least one supplied sequence, kept in order of first detection.
    matched_families = []
    for user_seq in adapter_seqs.values():
        for family_name, members in known_families.items():
            if family_name in matched_families:
                continue
            if user_seq in members.values():
                matched_families.append(family_name)

    #Pull in every member of each matched family, skipping any sequence that is already present so it is not repeated under another name.
    for family_name in matched_families:
        for adapter_id, adapter_seq in known_families[family_name].items():
            if adapter_seq in adapter_seqs.values():
                continue
            adapter_seqs[adapter_id] = adapter_seq

    #Prefix each id with '>' so the result prints directly as FASTA headers.
    return {">" + adapter_id: adapter_seq for adapter_id, adapter_seq in adapter_seqs.items()}
|
112
|
+
|
113
|
+
#This contains code which generates a complete list of illumina adapters from scratch
|
114
|
+
def generate_adapters_temporary_file():
    '''
    Build the built-in adapter/primer table and write it to a temporary FASTA file.

    The table is grouped into adapter "families", marked with comments below.
    Per the original author's note, any adapter recognized during preprocessing
    is meant to pull every member of its family into the final filtering fasta,
    e.g. seeing Illumina_Single_End_Apapter_1 will include
    Illumina_Single_End_Apapter_1, Illumina_Single_End_Apapter_2,
    Illumina_Single_End_PCR_Primer_1, Illumina_Single_End_PCR_Primer_2, and
    Illumina_Single_End_Sequencing_Primer.

    The IDs (including their historical spellings such as "Apapter" and
    "Paried") are written to the FASTA headers, so they are kept exactly as is.

    Returns:
        (adapters_dict, name): the ID -> sequence dict, and the path of the
        temporary FASTA file. The file is created with delete=False, so the
        caller is responsible for removing it.
    '''
    adapters_dict = {}

    #Single end family
    adapters_dict["Illumina_Single_End_Apapter_1"] = "ACACTCTTTCCCTACACGACGCTGTTCCATCT"
    adapters_dict["Illumina_Single_End_Apapter_2"] = "CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT"
    adapters_dict["Illumina_Single_End_PCR_Primer_1"] = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Single_End_PCR_Primer_2"] = "CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT"
    adapters_dict["Illumina_Single_End_Sequencing_Primer"] = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"

    #Paired end family
    adapters_dict["Illumina_Paired_End_Adapter_1"] = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Paired_End_Adapter_2"] = "CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"
    adapters_dict["Illumina_Paried_End_PCR_Primer_1"] = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Paired_End_PCR_Primer_2"] = "CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT"
    adapters_dict["Illumina_Paried_End_Sequencing_Primer_1"] = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Paired_End_Sequencing_Primer_2"] = "CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT"

    #DpnII family
    adapters_dict["Illumina_DpnII_expression_Adapter_1"] = "ACAGGTTCAGAGTTCTACAGTCCGAC"
    adapters_dict["Illumina_DpnII_expression_Adapter_2"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_DpnII_expression_PCR_Primer_1"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_DpnII_expression_PCR_Primer_2"] = "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"
    adapters_dict["Illumina_DpnII_expression_Sequencing_Primer"] = "CGACAGGTTCAGAGTTCTACAGTCCGACGATC"
    adapters_dict["Illumina_NlaIII_expression_Adapter_1"] = "ACAGGTTCAGAGTTCTACAGTCCGACATG"
    adapters_dict["Illumina_NlaIII_expression_Adapter_2"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_NlaIII_expression_PCR_Primer_1"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_NlaIII_expression_PCR_Primer_2"] = "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"
    adapters_dict["Illumina_NlaIII_expression_Sequencing_Primer"] = "CCGACAGGTTCAGAGTTCTACAGTCCGACATG"

    #Small RNA family
    adapters_dict["Illumina_Small_RNA_Adapter_1"] = "GTTCAGAGTTCTACAGTCCGACGATC"
    adapters_dict["Illumina_Small_RNA_Adapter_2"] = "TCGTATGCCGTCTTCTGCTTGT"
    adapters_dict["Illumina_Small_RNA_RT_Primer"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_Small_RNA_PCR_Primer_1"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_Small_RNA_PCR_Primer_2"] = "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"
    adapters_dict["Illumina_Small_RNA_Sequencing_Primer"] = "CGACAGGTTCAGAGTTCTACAGTCCGACGATC"

    #Multiplexing Family
    adapters_dict["Illumina_Multiplexing_Adapter_1"] = "GATCGGAAGAGCACACGTCT"
    adapters_dict["Illumina_Multiplexing_Adapter_2"] = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Multiplexing_PCR_Primer_1.01"] = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Multiplexing_PCR_Primer_2.01"] = "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT"
    adapters_dict["Illumina_Multiplexing_Read1_Sequencing_Primer"] = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["Illumina_Multiplexing_Index_Sequencing_Primer"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
    adapters_dict["Illumina_Multiplexing_Read2_Sequencing_Primer"] = "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT"

    #PCR primer family
    adapters_dict["Illumina_PCR_Primer_Index_1"] = "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_2"] = "CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_3"] = "CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_4"] = "CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_5"] = "CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_6"] = "CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_7"] = "CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_8"] = "CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_9"] = "CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_10"] = "CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_11"] = "CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC"
    adapters_dict["Illumina_PCR_Primer_Index_12"] = "CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC"

    #DpnII Gex family
    adapters_dict["Illumina_DpnII_Gex_Adapter_1"] = "GATCGTCGGACTGTAGAACTCTGAAC"
    adapters_dict["Illumina_DpnII_Gex_Adapter_1.01"] = "ACAGGTTCAGAGTTCTACAGTCCGAC"
    adapters_dict["Illumina_DpnII_Gex_Adapter_2"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_DpnII_Gex_Adapter_2.01"] = "TCGTATGCCGTCTTCTGCTTG"
    adapters_dict["Illumina_DpnII_Gex_PCR_Primer_1"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_DpnII_Gex_PCR_Primer_2"] = "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"
    adapters_dict["Illumina_DpnII_Gex_Sequencing_Primer"] = "CGACAGGTTCAGAGTTCTACAGTCCGACGATC"
    adapters_dict["Illumina_NlaIII_Gex_Adapter_1.01"] = "TCGGACTGTAGAACTCTGAAC"
    adapters_dict["Illumina_NlaIII_Gex_Adapter_1.02"] = "ACAGGTTCAGAGTTCTACAGTCCGACATG"
    adapters_dict["Illumina_NlaIII_Gex_Adapter_2.01"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_NlaIII_Gex_Adapter_2.02"] = "TCGTATGCCGTCTTCTGCTTG"
    adapters_dict["Illumina_NlaIII_Gex_PCR_Primer_1"] = "CAAGCAGAAGACGGCATACGA"
    adapters_dict["Illumina_NlaIII_Gex_PCR_Primer_2"] = "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA"
    adapters_dict["Illumina_NlaIII_Gex_Sequencing_Primer"] = "CCGACAGGTTCAGAGTTCTACAGTCCGACATG"

    #Other RNA family
    adapters_dict["Illumina_5p_RNA_Adapter"] = "GTTCAGAGTTCTACAGTCCGACGATC"
    adapters_dict["Illumina_RNA_Adapter1"] = "TCGTATGCCGTCTTCTGCTTGT"
    adapters_dict["Illumina_Small_RNA_3p_Adapter_1"] = "ATCTCGTATGCCGTCTTCTGCTTG"

    #TruSeq family
    adapters_dict["TruSeq_Universal_Adapter"] = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    adapters_dict["TruSeq_Adapter_Index_1"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_2"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_3"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_4"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_5"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_6"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_7"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_8"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_9"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_10"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_11"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq_Adapter_Index_12"] = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG"

    #RNA PCR family
    adapters_dict["Illumina_RNA_RT_Primer"] = "GCCTTGGCACCCGAGAATTCCA"
    adapters_dict["Illumina_RNA_PCR_Primer"] = "AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA"
    adapters_dict["RNA_PCR_Primer_Index_1"] = "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_2"] = "CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_3"] = "CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_4"] = "CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_5"] = "CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_6"] = "CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_7"] = "CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_8"] = "CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_9"] = "CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_10"] = "CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_11"] = "CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_12"] = "CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_13"] = "CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_14"] = "CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_15"] = "CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_16"] = "CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_17"] = "CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_18"] = "CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_19"] = "CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_20"] = "CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_21"] = "CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_22"] = "CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_23"] = "CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_24"] = "CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_25"] = "CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_26"] = "CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_27"] = "CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_28"] = "CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_29"] = "CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_30"] = "CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_31"] = "CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_32"] = "CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_33"] = "CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_34"] = "CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_35"] = "CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_36"] = "CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_37"] = "CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_38"] = "CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_39"] = "CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_40"] = "CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_41"] = "CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_42"] = "CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_43"] = "CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_44"] = "CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_45"] = "CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_46"] = "CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_47"] = "CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
    adapters_dict["RNA_PCR_Primer_Index_48"] = "CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"

    #ABI family
    adapters_dict["ABI_Dynabead_EcoP_Oligo"] = "CTGATCTAGAGGTACCGGATCCCAGCAGT"
    adapters_dict["ABI_Solid3_Adapter_A"] = "CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG"
    adapters_dict["ABI_Solid3_Adapter_B"] = "CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT"
    adapters_dict["ABI_Solid3_5_AMP_Primer"] = "CCACTACGCCTCCGCTTTCCTCTCTATG"
    adapters_dict["ABI_Solid3_3_AMP_Primer"] = "CTGCCCCGGGTTCCTCATTCT"
    adapters_dict["ABI_Solid3_EF1_alpha_Sense_Primer"] = "CATGTGTGTTGAGAGCTTC"
    adapters_dict["ABI_Solid3_EF1_alpha_Antisense_Primer"] = "GAAAACCAAAGTGGTCCAC"
    adapters_dict["ABI_Solid3_GAPDH_Forward_Primer"] = "TTAGCACCCCTGGCCAAGG"
    adapters_dict["ABI_Solid3_GAPDH_Reverse_Primer"] = "CTTACTCCTTGGAGGCCATG"

    #TruSeq2 family
    adapters_dict["TruSeq2_SE"] = "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"
    adapters_dict["TruSeq2_PE_f"] = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
    adapters_dict["TruSeq2_PE_r"] = "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"
    adapters_dict["TruSeq3_IndexedAdapter"] = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
    adapters_dict["TruSeq3_UniversalAdapter"] = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA"

    #Nextera Family
    adapters_dict["Nextera_PE_PrefixNX/1"] = "AGATGTGTATAAGAGACAG"
    adapters_dict["Nextera_PE_PrefixNX/2"] = "AGATGTGTATAAGAGACAG"
    adapters_dict["Nextera_PE_Trans1"] = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
    adapters_dict["Nextera_PE_Trans1_rc"] = "CTGTCTCTTATACACATCTGACGCTGCCGACGA"
    adapters_dict["Nextera_PE_Trans2"] = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG"
    adapters_dict["Nextera_PE_Trans2_rc"] = "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC"

    #The context manager guarantees the handle is flushed and closed even if a write fails.
    #delete = False keeps the file on disk after closing so its path can be handed to other tools.
    with tempfile.NamedTemporaryFile(mode = "w", delete = False) as all_adapters:
        for adapt in adapters_dict:
            print(">"+adapt, file = all_adapters)
            print(adapters_dict[adapt], file = all_adapters)
        name = all_adapters.name

    return adapters_dict, name
|
311
|
+
|
312
|
+
#FaQCs supports external adapter sequences, but has no option to EXCLUDE its own internal adapters while doing so.
|
313
|
+
#This function returns a dict of ID:adapter for the FaQCs internal sequences so that Multitrim doesn't break should a FaQCs adapter
|
314
|
+
#appear in parse_adapters
|
315
|
+
def faqcs_internal_adapters():
    '''
    Return an ID -> sequence dict of the adapters FaQCs carries by default.

    A fresh dict is built on every call, so callers may modify the result freely.
    '''
    return {
        #Cre-loxp family
        "cre-loxp-forward": "TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG",
        "cre-loxp-reverse": "AGCATATTGAAGCATATTACATACGATATGCTTCAATAATGC",

        #TruSeq 1 family
        "TruSeq-adapter-1": "GGGGTAGTGTGGATCCTCCTCTAGGCAGTTGGGTTATTCTAGAAGCAGATGTGTTGGCTGTTTCTGAAACTCTGGAAAA",
        "TruSeq-adapter-3": "CAACAGCCGGTCAAAACATCTGGAGGGTAAGCCATAAACACCTCAACAGAAAA",

        #PCR primers
        "PCR-primer-1": "CGATAACTTCGTATAATGTATGCTATACGAAGTTATTACG",
        "PCR-primer-2": "GCATAACTTCGTATAGCATACATTATACGAAGTTATACGA",

        #Nextera Junction family
        "Nextera-junction-adapter-1": "CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG",

        #Nextera-primer-adapter family; listed under their FaQCs-internal names so they stay detectable by ID
        "Nextera-primer-adapter-1": "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
        "Nextera-primer-adapter-2": "GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT",
    }
|
339
|
+
|
340
|
+
#Get file names right up front for ease of use
|
341
|
+
def names_pe(forward, reverse, outdir = ".", prefix = ""):
    '''
    Derive every paired-end output name from the two input read paths.

    Each input is reduced to its file name with a trailing ".gz" (if any) and
    then one further extension removed. Returns, in order: pre-trim QC name
    (forward, reverse), post-trim QC name (forward, reverse), and post-trim
    reads path (forward, reverse; these two end in ".fq").
    '''
    def stem_of(path):
        #File name only, minus an optional ".gz" and then one more extension.
        stem = os.path.basename(os.path.normpath(path))
        if stem.endswith(".gz"):
            stem = stem[:-3]
        return os.path.splitext(stem)[0]

    fwd_stem = stem_of(forward)
    rev_stem = stem_of(reverse)

    #Everything lands in outdir and carries the user's prefix.
    location = outdir + "/" + prefix

    pre_qc_f = location + "1.pre_trim_QC_" + fwd_stem
    pre_qc_r = location + "2.pre_trim_QC_" + rev_stem

    post_qc_f = location + "1.post_trim_QC_" + fwd_stem
    post_qc_r = location + "2.post_trim_QC_" + rev_stem

    post_trim_reads_f = location + "1.post_trim_" + fwd_stem + ".fq"
    post_trim_reads_r = location + "2.post_trim_" + rev_stem + ".fq"

    return pre_qc_f, pre_qc_r, post_qc_f, post_qc_r, post_trim_reads_f, post_trim_reads_r
|
362
|
+
|
363
|
+
#Get file names right up front for ease of use
|
364
|
+
def names_se(reads, outdir = ".", prefix = ""):
    '''
    Derive every single-end output name from the input read path.

    The input is reduced to its file name with a trailing ".gz" (if any) and
    then one further extension removed. Returns, in order: the pre-trim QC
    name, the post-trim QC name, and the post-trim reads path (ending in ".fq").
    '''
    stem = os.path.basename(os.path.normpath(reads))
    if stem.endswith(".gz"):
        stem = stem[:-3]
    stem = os.path.splitext(stem)[0]

    #Everything lands in outdir and carries the user's prefix plus an "unpaired." tag.
    location = outdir + "/" + prefix + "unpaired."

    return (
        location + "pre_trim_QC_" + stem,
        location + "post_trim_QC_" + stem,
        location + "post_trim_" + stem + ".fq",
    )
|
375
|
+
|
376
|
+
#DSRC needs its own. Whoops.
|
377
|
+
def do_falco(read_name_tool):
    '''
    Run falco on one read file and keep only its HTML report.

    Falco does not support naming files, but does support selecting output directory.

    As we are possibly generating multiple falco reports simultaneously,
    we get around this issue by generating the generically named files in a temp dir
    and then move the results to the final location with an appropriate rename.

    Args:
        read_name_tool: a 3-item sequence of [reads path, output name without
            extension, falco binary path]. It is packed into one argument so the
            function can be used with multiprocessing.Pool.map.

    Returns:
        None. The report is written to output_name + ".html".

    Raises:
        Whatever shutil.move raises if falco did not leave a fastqc_report.html
        in the temp dir. The temp dir is removed either way.
    '''
    reads = read_name_tool[0]
    output_name = read_name_tool[1]
    falco_path = read_name_tool[2]

    #temp directory, private to this call
    loc = tempfile.mkdtemp()

    try:
        #falco command
        command = [falco_path, "--quiet", "-o", loc, reads]

        #Working perfectly, the falco call should not produce any output. Until falco has bugs patched,
        #it's not working perfectly, so its stdout/stderr are deliberately left visible.
        subprocess.call(command)

        #move the results and rename; only the html report is kept.
        shutil.move(loc+"/fastqc_report.html", output_name + ".html")
    finally:
        #Cleanup, including when falco failed and the move above raised.
        shutil.rmtree(loc)

    return None
|
413
|
+
|
414
|
+
#do all QC at once - now old
|
415
|
+
def falco_qc_pe(pre_trim_reads_f, pre_trim_reads_r, post_trim_reads_f, post_trim_reads_r, pre_name_f, post_name_f, pre_name_r, post_name_r, threads, falco_binary):
    '''
    Generate the four paired-end falco reports (pre/post trim, forward/reverse) in parallel.

    The post-trim read paths have ".gz" appended before being passed to falco,
    so the trimmed reads are expected to be compressed by the time this runs.
    At most 4 worker processes are used, since there are only 4 reports.
    '''
    #Each entry is [reads, report name, falco binary], the shape do_falco unpacks.
    commands = [
        [pre_trim_reads_f, pre_name_f, falco_binary],
        [pre_trim_reads_r, pre_name_r, falco_binary],
        [post_trim_reads_f+".gz", post_name_f, falco_binary],
        [post_trim_reads_r+".gz", post_name_r, falco_binary],
    ]

    print("Generating QC reports.")

    pool = multiprocessing.Pool(min(4, threads))
    try:
        pool.map(do_falco, commands)
    finally:
        #Release the workers even if one of the falco jobs raised.
        pool.close()
        pool.join()
|
430
|
+
|
431
|
+
#do all QC at once - now old
|
432
|
+
def falco_qc_se(pre_trim_reads, post_trim_reads, pre_name, post_name, threads, falco_binary):
    '''
    Generate the two single-end falco reports (pre and post trim) in parallel.

    The post-trim read path has ".gz" appended before being passed to falco,
    so the trimmed reads are expected to be compressed by the time this runs.
    At most 2 worker processes are used, since there are only 2 reports.
    '''
    #Each entry is [reads, report name, falco binary], the shape do_falco unpacks.
    commands = [
        [pre_trim_reads, pre_name, falco_binary],
        [post_trim_reads+".gz", post_name, falco_binary],
    ]

    print("Generating QC reports.")

    pool = multiprocessing.Pool(min(2, threads))
    try:
        pool.map(do_falco, commands)
    finally:
        #Release the workers even if one of the falco jobs raised.
        pool.close()
        pool.join()
|
445
|
+
|
446
|
+
def do_seqtk(read_tool):
    '''
    Subsample one read file with seqtk and store the result in a temporary file.

    Args:
        read_tool: a 2-item sequence of [reads path, seqtk binary path]. It is
            packed into one argument so the function can be used with
            multiprocessing.Pool.map.

    Returns:
        The path of a temporary file holding seqtk's stdout. The file is created
        with delete=False, so the caller is responsible for removing it.
    '''
    sample = read_tool[0]
    seqtk_path = read_tool[1]

    print("Subsampling:", sample)

    #-s 100 specifies seed as 100. The number chosen is arbitrary, and is only specified so that results are deterministic and reproducible.
    #The trailing 100000 is seqtk's sample-size argument.
    command = [seqtk_path, "sample", "-s", "100", sample, "100000"]

    #Run seqtk before creating the temp file, so a failed launch (e.g. a missing binary) does not leave an orphan file behind.
    ps = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines = True)

    #The context manager guarantees the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile("w", delete=False) as temp:
        temp.write(ps.stdout)
        name = temp.name

    return name
|
465
|
+
|
466
|
+
#Subsample reads; identify adapters with FaQCs
|
467
|
+
def adapter_identification_pe(artificial_artifacts, seqtk_binary, faqcs_binary, forward = "", reverse = "", threads = 1, output = ".", minimum_presence = 0.1, prefix = "", phred_fmt = "33"):
    '''
    Subsample paired-end reads, run FaQCs in QC-only mode on the subsample, and
    report which adapters were seen often enough.

    Args:
        artificial_artifacts: path of the adapter FASTA handed to FaQCs via --artifactFile.
        seqtk_binary, faqcs_binary: paths of the external tools.
        forward, reverse: paths of the paired read files.
        threads: thread count passed to FaQCs; also caps the subsampling pool at 2 workers.
        output: directory FaQCs writes into; the stats report is read from here.
        minimum_presence: an adapter is kept when its parsed value is >= this.
        prefix: prepended to the FaQCs output prefix "Subsample_Adapter_Detection".
        phred_fmt: value passed to FaQCs --ascii.

    Returns:
        A list of adapter IDs whose reported value met minimum_presence.
    '''
    #Subsample forward and reverse reads, in parallel when threads allow.
    seqtk_commands = [[forward, seqtk_binary], [reverse, seqtk_binary]]

    pool = multiprocessing.Pool(min(2, threads))
    try:
        seqtk_samples = pool.map(do_seqtk, seqtk_commands)
    finally:
        #Release the workers even if subsampling raised.
        pool.close()
        pool.join()

    #proper naming; an empty prefix simply yields the bare name.
    report_prefix = prefix + "Subsample_Adapter_Detection"
    pdf_name = report_prefix + "_qc_report.pdf"

    #FaQCs PE with adapter file
    faqcs_subset_command = [faqcs_binary, "-t", str(threads), "--qc_only", "-d", output, "--artifactFile", artificial_artifacts, "--ascii", phred_fmt]
    faqcs_subset_command.extend(["--prefix", report_prefix])
    #forward strand
    faqcs_subset_command.extend(["-1", seqtk_samples[0]])
    #reverse strand
    faqcs_subset_command.extend(["-2", seqtk_samples[1]])

    #Raw string: "\d" in a normal string literal is an invalid escape sequence.
    number_pattern = re.compile(r"\d+\.\d+")

    try:
        print("Detecting adapters now... ", end = "")
        ps = subprocess.Popen(faqcs_subset_command)
        ps.wait()

        #The PDF report is not wanted; only the stats text is used.
        os.remove(output + "/" + pdf_name)

        #Adapter detection from output of FaQCs
        detected_adapters = {}
        begin_assessment = False
        with open(output + "/" + report_prefix + ".stats.txt") as detection_report:
            for line in detection_report:
                if not begin_assessment:
                    #Everything after this header line is treated as one adapter per line.
                    if line.strip().startswith("Reads with Adapters/Primers:"):
                        begin_assessment = True
                else:
                    #First field is the adapter ID; the fourth field holds a decimal number
                    #(presumably the percent of reads with that adapter - verify against FaQCs output).
                    segment = line.strip().split()
                    detected_adapters[segment[0]] = float(number_pattern.findall(segment[3])[0])

        clean_detection = [adapter for adapter in detected_adapters if detected_adapters[adapter] >= minimum_presence]
    finally:
        #Cleans up after itself, including when FaQCs or the report parsing failed.
        for item in seqtk_samples:
            os.remove(item)

    print("Detection done!")

    #Return the IDs of the adapters that passed the threshold
    return clean_detection
|
536
|
+
|
537
|
+
#Subsample reads; identify adapters with FaQCs
|
538
|
+
def adapter_identification_se(artificial_artifacts, seqtk_binary, faqcs_binary, unpaired = "", threads = 1, output = ".", minimum_presence = 0.1, prefix = "", phred_fmt = "33"):
    '''
    Subsample single-end reads, run FaQCs in QC-only mode on the subsample, and
    report which adapters were seen often enough.

    Args:
        artificial_artifacts: path of the adapter FASTA handed to FaQCs via --artifactFile.
        seqtk_binary, faqcs_binary: paths of the external tools.
        unpaired: path of the read file.
        threads: thread count passed to FaQCs.
        output: directory FaQCs writes into; the stats report is read from here.
        minimum_presence: an adapter is kept when its parsed value is >= this.
        prefix: prepended to the FaQCs output prefix "Subsample_Adapter_Detection".
        phred_fmt: value passed to FaQCs --ascii.

    Returns:
        A list of adapter IDs whose reported value met minimum_presence.
    '''
    #Subsample the reads; this is the path of a single temporary file.
    seqtk_samples = do_seqtk([unpaired, seqtk_binary])

    #proper naming; an empty prefix simply yields the bare name.
    report_prefix = prefix + "Subsample_Adapter_Detection"
    pdf_name = report_prefix + "_qc_report.pdf"

    #FaQCs SE with adapter file
    faqcs_subset_command = [faqcs_binary, "-t", str(threads), "--qc_only", "-d", output, "--artifactFile", artificial_artifacts, "--ascii", phred_fmt]
    faqcs_subset_command.extend(["--prefix", report_prefix])
    #unpaired reads
    faqcs_subset_command.extend(["-u", seqtk_samples])

    #Raw string: "\d" in a normal string literal is an invalid escape sequence.
    number_pattern = re.compile(r"\d+\.\d+")

    try:
        print("Detecting adapters now... ", end = "")
        ps = subprocess.Popen(faqcs_subset_command)
        ps.wait()

        #The PDF report is not wanted; only the stats text is used.
        os.remove(output + "/" + pdf_name)

        #Adapter detection from output of FaQCs
        detected_adapters = {}
        begin_assessment = False
        with open(output + "/" + report_prefix + ".stats.txt") as detection_report:
            for line in detection_report:
                if not begin_assessment:
                    #Everything after this header line is treated as one adapter per line.
                    if line.strip().startswith("Reads with Adapters/Primers:"):
                        begin_assessment = True
                else:
                    #First field is the adapter ID; the fourth field holds a decimal number
                    #(presumably the percent of reads with that adapter - verify against FaQCs output).
                    segment = line.strip().split()
                    detected_adapters[segment[0]] = float(number_pattern.findall(segment[3])[0])

        clean_detection = [adapter for adapter in detected_adapters if detected_adapters[adapter] >= minimum_presence]
    finally:
        #Cleans up after itself, including when FaQCs or the report parsing failed.
        os.remove(seqtk_samples)

    print("Detection done!")

    #Return the IDs of the adapters that passed the threshold
    return clean_detection
|
596
|
+
|
597
|
+
#gets adapter families for later use
|
598
|
+
def parse_adapters(full_list, detected_adapters, output, prefix = ""):
    '''
    Resolve detected adapter IDs to sequences, expand them through
    family_detection, and write the result as a FASTA file.

    Args:
        full_list: dict of adapter ID -> sequence to look IDs up in.
        detected_adapters: iterable of the detected adapter IDs.
        output: directory the FASTA is written into.
        prefix: prepended to the file name "detected_adapters.fasta".

    Returns:
        The path of the written FASTA file. This file is deliberately not
        temporary: it records the adapters found and can be reused by the user.
    '''
    print("Creating specific adapters file for you.")

    faqcs_internal_adapter_list = faqcs_internal_adapters()

    detected_seqs = {}
    for adapter_id in detected_adapters:
        found = False
        if adapter_id in full_list:
            found = True
            print("Adapter sequence:", adapter_id, "detected.")
            detected_seqs[adapter_id] = full_list[adapter_id]
        #Checked independently of the lookup above: an ID present in both lists is reported twice and the FaQCs-internal sequence wins.
        if adapter_id in faqcs_internal_adapter_list:
            found = True
            print("Adapter sequence:", adapter_id, "detected. This adapter is part of a non-optional internal list used by FaQCs and will be included.")
            detected_seqs[adapter_id] = faqcs_internal_adapter_list[adapter_id]
        #Skip adapter if it cannot be found. Should never happen, now that FaQCs' adapters will always be found and other seqs must be from internal or supplied sequences file
        if not found:
            print("Adapter sequence:", adapter_id, "not found in Multitrim's adapter list! It will NOT be included in trimming.")

    #family_detection is defined elsewhere in this file; its keys are printed as-is as the header lines
    #(presumably already ">"-prefixed - confirm against family_detection).
    adapters_by_family = family_detection(detected_seqs)

    subset_path = output + "/" + prefix + "detected_adapters.fasta"

    #The context manager guarantees the FASTA is flushed and closed even if a write fails.
    with open(subset_path, "w") as subset:
        for adapter in adapters_by_family:
            print(adapter, file = subset)
            print(adapters_by_family[adapter], file = subset)

    return subset_path
|
633
|
+
|
634
|
+
#paired end version of the full trim; trims using detected adapters with FaQCs -q 27, then fastp --cut_right window 3 qual 20
|
635
|
+
def full_trim_pe(forward_in, reverse_in, forward_out, reverse_out, directory, adapters, threads, faqcs, fastp, score, minlen, window, window_qual, prefix, compressor, compress_level, phred_fmt = "33", advanced = False, skip_fastp = False, skip_faqcs = False):
    '''
    Paired-end trimming driver.

    Builds a FaQCs command for the raw reads and a fastp command that reads
    either the FaQCs outputs or, when skip_faqcs is set, the raw reads.
    Both commands are always built; each is only executed when its skip
    flag is False. Report files written by the tools and the FaQCs
    intermediates are deleted, and the final reads are handed to
    compress_results().
    '''
    #File names used by the two tools inside the working directory.
    faqcs_fwd = directory+"/reads.1.trimmed.fastq"
    faqcs_rev = directory+"/reads.2.trimmed.fastq"
    faqcs_orphans = directory+"/reads.unpaired.trimmed.fastq"
    fastp_json = directory + "/" + prefix + "post_trim_fastp.json"
    fastp_html = directory + "/" + prefix + "post_trim_fastp.html"

    faqcs_command = [
        faqcs, "-t", str(threads),
        "-1", forward_in, "-2", reverse_in,
        "--artifactFile", adapters,
        "-q", str(score),
        "--min_L", str(minlen),
        "--prefix", "reads",
        "--trim_only",
        "-d", directory,
        "--ascii", phred_fmt,
    ]

    fastp_command = [
        fastp, "--thread", str(threads),
        "--adapter_fasta", adapters,
        "-l", str(minlen),
        "--json", fastp_json,
        "--html", fastp_html,
    ]

    #fastp consumes the raw reads when FaQCs is skipped, else the FaQCs outputs.
    if skip_faqcs:
        fastp_command += ["-i", forward_in, "-I", reverse_in]
    else:
        fastp_command += ["-i", faqcs_fwd, "-I", faqcs_rev]

    #Outputs do not depend on where the inputs came from.
    fastp_command += ["-o", forward_out, "-O", reverse_out]

    if int(window) > 0:
        fastp_command += [
            "--cut_right",
            "--cut_right_window_size", str(window),
            "--cut_right_mean_quality", str(window_qual),
        ]

    if phred_fmt != "33":
        fastp_command += ["--phred64"]

    if advanced:
        fastp_command += ["--trim_poly_g", "--low_complexity_filter"]

    time_format = "%d/%m/%Y %H:%M:%S"

    #Issue the commands that were not skipped.
    if not skip_faqcs:
        print("Trimming with FaQCs. Started at:", datetime.now().strftime(time_format))
        subprocess.run(faqcs_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(directory + "/" + "reads.stats.txt")

    if not skip_fastp:
        print("Trimming with Fastp. Started at:", datetime.now().strftime(time_format))
        subprocess.run(fastp_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(fastp_json)
        os.remove(fastp_html)

    if skip_fastp:
        #FaQCs-only run: drop the orphan reads and give the FaQCs files their final names.
        os.remove(faqcs_orphans)
        shutil.move(faqcs_fwd, forward_out)
        shutil.move(faqcs_rev, reverse_out)
    elif not skip_faqcs:
        #Both tools ran: every FaQCs file is an intermediate and is removed.
        os.remove(faqcs_fwd)
        os.remove(faqcs_rev)
        os.remove(faqcs_orphans)

    compress_results([forward_out, reverse_out], threads, compressor, compress_level)

    return None
|
725
|
+
|
726
|
+
#single end version of the full trim; trims using detected adapters with FaQCs -q 27, then fastp --cut_right window 3 qual 20
|
727
|
+
def full_trim_se(reads_in, reads_out, directory, adapters, threads, faqcs, fastp, score, minlen, window, window_qual, prefix, compressor, compress_level, phred_fmt = "33", advanced = False, skip_fastp = False, skip_faqcs = False):
    '''
    Single-end trimming driver.

    Builds a FaQCs command for the raw reads and a fastp command that reads
    either the FaQCs output or, when skip_faqcs is set, the raw reads.
    Both commands are always built; each is only executed when its skip
    flag is False. Report files written by the tools and the FaQCs
    intermediate are deleted, and the final reads are handed to
    compress_results().
    '''
    #File names used by the two tools inside the working directory.
    faqcs_reads = directory+"/reads.unpaired.trimmed.fastq"
    fastp_json = directory + "/" + prefix + "post_trim_fastp.json"
    fastp_html = directory + "/" + prefix + "post_trim_fastp.html"

    faqcs_command = [
        faqcs, "-t", str(threads),
        "-u", reads_in,
        "--artifactFile", adapters,
        "-q", str(score),
        "--min_L", str(minlen),
        "--prefix", "reads",
        "--trim_only",
        "-d", directory,
        "--ascii", phred_fmt,
    ]

    fastp_command = [
        fastp, "--thread", str(threads),
        "--adapter_fasta", adapters,
        "-l", str(minlen),
        "--json", fastp_json,
        "--html", fastp_html,
    ]

    #fastp consumes the raw reads when FaQCs is skipped, else the FaQCs output.
    fastp_command += ["-i", reads_in if skip_faqcs else faqcs_reads]

    #Output does not depend on where the input came from.
    fastp_command += ["-o", reads_out]

    if int(window) > 0:
        fastp_command += [
            "--cut_right",
            "--cut_right_window_size", str(window),
            "--cut_right_mean_quality", str(window_qual),
        ]

    if phred_fmt != "33":
        fastp_command += ["--phred64"]

    if advanced:
        fastp_command += ["--trim_poly_g", "--low_complexity_filter"]

    time_format = "%d/%m/%Y %H:%M:%S"

    #Issue the commands that were not skipped.
    if not skip_faqcs:
        print("Trimming with FaQCs. Started at:", datetime.now().strftime(time_format))
        subprocess.run(faqcs_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(directory + "/" + "reads.stats.txt")

    if not skip_fastp:
        print("Trimming with Fastp. Started at:", datetime.now().strftime(time_format))
        subprocess.run(fastp_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(fastp_json)
        os.remove(fastp_html)

    if skip_fastp:
        #FaQCs-only run: its output becomes the final file.
        shutil.move(faqcs_reads, reads_out)
    elif not skip_faqcs:
        #Both tools ran: the FaQCs output was only an intermediate.
        os.remove(faqcs_reads)

    compress_results([reads_out], threads, compressor, compress_level)

    return None
|
795
|
+
|
796
|
+
#compress results using selected compressor.
|
797
|
+
def compress_results(output_files, threads, compressor, level):
    """Announce and run compression of the trimmed reads.

    ``compressor`` must be one of "gzip", "pigz" or "dsrc"; any other value
    raises ValueError from the label lookup. Only gzip and pigz trigger any
    work here; the DSRC branch is not wired up.
    """
    #Display label for the feedback message, matched by position.
    known_compressors = ["gzip", "pigz", "dsrc"]
    display_labels = ["GZIP", "PIGZ", "DSRC-2"]
    pretty_compressor = display_labels[known_compressors.index(compressor)]

    printable_time = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print("Beginning compression of trimmed reads using", pretty_compressor, "at:", printable_time)

    #The compression modules report file sizes, runtimes and compression ratio.
    if compressor == "gzip":
        gzip_compress_module(output_files, threads, level)
    elif compressor == "pigz":
        pigz_compress_module(output_files, threads, level)
    #DSRC support is not enabled; dsrc_compress_module is intentionally not called.

    print("Outputs compressed!")

    return None
|
823
|
+
|
824
|
+
#gzip is NOT threaded, so we open up to 4 threads and compress each input simultaneously, feeding the results to falco as we go.
|
825
|
+
def gzip_compress_module(outputs, threads, level):
    """Compress every file in ``outputs`` with gzip, one worker process per file.

    gzip itself is single-threaded, so parallelism comes from running
    several gzip processes at once, capped at ``threads`` workers.

    Parameters:
        outputs: list of file paths to compress in place.
        threads: maximum number of worker processes.
        level: gzip compression level, appended to the ``-f`` flag.

    Returns:
        None. An empty ``outputs`` list is a no-op.
    """
    #Nothing to do; also avoids asking multiprocessing for a zero-sized pool.
    if not outputs:
        return None

    #One gzip command per file; do_gzip_pretty reads the path from index 2.
    gzip_arguments = [["gzip", "-f"+str(level), file] for file in outputs]

    #Don't open more workers than there are files, and never fewer than one.
    workers = max(1, min(threads, len(outputs)))

    pool = multiprocessing.Pool(workers)
    try:
        pool.map(do_gzip_pretty, gzip_arguments)
    finally:
        #Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    return None
|
839
|
+
|
840
|
+
#The particular parallelization for this is a bother.
|
841
|
+
def do_gzip_pretty(compress_argument):
    """Run one gzip command and report its timing and size change.

    ``compress_argument`` is the argument list for gzip; the file being
    compressed is its third element, and the result is expected at that
    path plus ".gz".
    """
    target = compress_argument[2]

    began = datetime.now()
    size_before = os.path.getsize(target)

    subprocess.call(compress_argument)

    size_after = os.path.getsize(target+".gz")
    finished = datetime.now()

    pretty_print_file_size(target, size_before, size_after, began, finished)

    return None
|
854
|
+
|
855
|
+
#pigz is threaded, so compression happens 1 file at a time using all threads, then falco QC 4 using the gzip approach above since the result is in gzip format
|
856
|
+
def pigz_compress_module(outputs, threads, level):
    """Compress each file in ``outputs`` with pigz, one file at a time.

    pigz is given all ``threads`` for every file, so the files are handled
    sequentially. Timing and size change are reported per file.
    """
    level_flag = "-"+str(level)
    thread_count = str(threads)

    for target in outputs:
        began = datetime.now()
        size_before = os.path.getsize(target)

        subprocess.call(["pigz", "-f", level_flag, "-p", thread_count, target])

        size_after = os.path.getsize(target+".gz")
        finished = datetime.now()

        pretty_print_file_size(target, size_before, size_after, began, finished)

    return None
|
870
|
+
|
871
|
+
#Unfinished, Has more moving parts to take care of.
|
872
|
+
#DSRC-2 is threaded, but the compressed format is not supported by falco. Thus, we run QC, THEN compress each file 1 at a time using all threads.
|
873
|
+
def dsrc_compress_module(inputs, outputs, threads, level):
    """Compress each file in ``outputs`` with DSRC-2 (unfinished feature).

    Parameters:
        inputs: currently unused. NOTE(review): presumably the untrimmed
            reads, reserved for a pre-compression QC step; confirm before
            enabling this code path.
        outputs: list of file paths to compress; each result is written to
            the same path plus ".dsrc".
        threads: requested thread count, capped at 64 for DSRC.
        level: DSRC compression mode, passed as ``-m<level>``.

    Returns:
        None.

    NOTE(review): compressing ``outputs`` is an assumption based on the
    gzip/pigz modules, which compress the trimmed outputs. The previous
    body iterated over an undefined name, so the intent could not be read
    from the code. No QC step is run here despite the messages printed.
    """
    print("DSRC-2 will also produce QC reports at this time!")

    #DSRC only accepts up to 64 threads. The numeric value is kept separate
    #from the formatted flag so it is never compared as a string.
    thread_arg = "-t"+str(min(threads, 64))
    level_arg = "-m"+str(level)

    for file in outputs:
        output_file_name = file+".dsrc"

        start_time = datetime.now()
        initial_size = os.path.getsize(file)

        compress_command = ["dsrc", "c", thread_arg, level_arg, file, output_file_name]
        subprocess.run(compress_command)

        ending_size = os.path.getsize(output_file_name)
        end_time = datetime.now()

        pretty_print_file_size(file, initial_size, ending_size, start_time, end_time)

    print("Compression and QC complete!")
|
903
|
+
|
904
|
+
#Unfinished.
|
905
|
+
#Function for checking if an input file is a DSRC archive - these have to be decompressed for trimming, since the tools don't directly support such archives.
|
906
|
+
def check_is_dsrc(file):
    """Test whether ``file`` is a DSRC archive by trying to decompress it.

    The file is decompressed with ``dsrc d`` into a fresh temporary
    directory. The input counts as a DSRC archive only if that produces
    the output file.

    Parameters:
        file: path of the file to test.

    Returns:
        ``(True, path_to_decompressed_copy)`` when decompression produced a
        file; the caller then owns that temporary copy. Otherwise
        ``(False, file)``, and the temporary directory is removed.
    """
    base_name = os.path.basename(file)
    loc = tempfile.mkdtemp()
    tempout = loc + "/" + base_name
    is_dsrc = False

    try:
        dsrc_decomp = ["dsrc", "d", file, tempout]
        subprocess.run(dsrc_decomp, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
        #Success is judged by whether the output file was created.
        is_dsrc = os.path.exists(tempout)
    except (OSError, ValueError, subprocess.SubprocessError):
        #The dsrc tool is missing or could not be launched: not a usable archive.
        is_dsrc = False

    if is_dsrc:
        return True, tempout

    #Remove the scratch directory on every negative outcome, not only when
    #launching dsrc failed.
    shutil.rmtree(loc, ignore_errors=True)
    return False, file
|
930
|
+
|
931
|
+
#Convert a file's size in bytes to human-readable format.
|
932
|
+
def humansize(nbytes):
    """Format a byte count as a short string such as '1.5 KB'.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (PB) is reached, then printed with at most two decimals and with
    trailing zeros removed.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    largest = len(units) - 1

    unit_index = 0
    while unit_index < largest and nbytes >= 1024:
        nbytes = nbytes / 1024.
        unit_index += 1

    number = '%.2f' % nbytes
    number = number.rstrip('0')
    number = number.rstrip('.')

    return '%s %s' % (number, units[unit_index])
|
940
|
+
|
941
|
+
#Print well-formatted compression time info.
|
942
|
+
def pretty_print_file_size(name, start, end, start_time, end_time):
    """Print how long a compression took and how much it shrank the file.

    Parameters:
        name: label for the file, printed first.
        start: size in bytes before compression.
        end: size in bytes after compression.
        start_time, end_time: datetime objects bracketing the compression.

    Returns:
        None.
    """
    elapsed = end_time - start_time

    #A timedelta exposes only days/seconds/microseconds, so break the total
    #down by hand; hours are not wrapped at 24.
    total_seconds = int(elapsed.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    runtime = '%02d:%02d:%02d' % (hours, minutes, seconds)

    #An empty original file has no meaningful ratio; avoid dividing by zero.
    if start:
        percent = str(round((end/start)*100, 2))
    else:
        percent = "n/a"

    print(name, "compressed! Compression took:", runtime, "and the file was compressed to", percent, "percent of original size from", humansize(start), "to", humansize(end))

    return None
|
965
|
+
|
966
|
+
#Stolen from a SO thread on how to issue usage information on an error.
|
967
|
+
class MyParser(argparse.ArgumentParser):
    """ArgumentParser that shows the full help text whenever parsing fails."""

    def error(self, message):
        """Report ``message`` on stderr, print the help, and exit with status 2."""
        error_line = 'error: %s\n' % message
        sys.stderr.write(error_line)
        self.print_help()
        raise SystemExit(2)
|
972
|
+
|
973
|
+
#Option parsing
|
974
|
+
def gather_opts():
    """Build Multitrim's command-line parser and parse ``sys.argv``.

    Returns:
        A ``(parser, options)`` tuple. The parser is returned as well so the
        caller can print usage on demand.

    All value options are left as the strings (or defaults) argparse
    produces; no ``type=`` conversion is applied here.
    """
    #The description only names options that actually exist (--max).
    parser = MyParser(description=''' This program is designed to facilitate effective trimming of your reads.
It will help to identify the presence of adapters in your reads, trim those adapters and the reads efficiently,
and produce several before and after quality reports in addition to the trimmed reads. This is a pipeline incorporating
FaQCs, fastp, falco, and seqtk commands, in addition to several python operations which exist to facilitate adapter finding and
subsetting. Use --max to let Multitrim use every available processor.''')

    #Threading: use all available cores, or an explicit count.
    parser.add_argument("--max", dest = "Sheev", action = 'store_true', help = "Attempts to detect and use all available processors for threading.")
    parser.add_argument("--threads", "-t", dest = "threads", default = 1, help = "Number of threads to use for parallel processes. Default 1")

    #File inputs
    parser.add_argument("--forward", "-1", dest = "f", default = "", help = "Forward Strand Reads (use -u for unpaired reads)")
    parser.add_argument("--reverse", "-2", dest = "r", default = "", help = "Reverse Strand Reads (use -u for unpaired reads)")
    parser.add_argument("--unpaired", "-u", dest = "u", default = "", help = "Unpaired Reads")

    #Final output directory and naming convention
    parser.add_argument("--output", "-o", dest = "outdir", default = ".", help = "Directory to send final outputs.")
    parser.add_argument("--prefix", "-p", dest = "pref", default = "", help = "Prefix to place on outputs.")

    #Adapter detection options
    parser.add_argument("--min_adapt_pres", "-m", dest = "minpres", default = 0.1, help = "Minimum presence of an adapter for it to be considered present in a set of reads. Default 0.1, so an adapter is considered present if detected in 0.1 percent of reads.")
    parser.add_argument("--adapters", "-a", dest = "adapter_fasta", default = "internal", help = "Supply a custom set of adapters for adapter detection. Detected adapters can come only from this set. Multitrim uses the MiGA adapter set by default.")

    #Options shared by both trimmers
    parser.add_argument("--min_L", "-l", dest = "length", default = "50", help = "Minimum read length. Default 50 base pairs.")
    parser.add_argument("--phred_fmt", dest = "phred", default = "33", help = "Phred q score format (default 33)")
    parser.add_argument("--advanced", dest = "advanced", action = 'store_true', help = "Apply advanced trimming options (poly-G tail, low-complexity). Only useful for reads sequenced with 2-dye chemistry.")

    #FaQCs options
    parser.add_argument("--score", "-s", dest= "score", default = "27", help = "FaQCs quality target. Default 27")
    parser.add_argument("--skip_faqcs", dest = "skip_fq", action = 'store_true', help = "Do not trim with FaQCs (use fastp only). Cannot skip both.")

    #fastp options
    parser.add_argument("--window", "-w", dest = "mid", default = "3", help = "Trimmomatic-like sliding window. Default 3.")
    parser.add_argument("--window_qual", "-q", dest = "mid_q", default = "20", help = "Trim quality cutoff for trimmomatic window. Default 20.")
    parser.add_argument("--skip_fastp", dest = "skip_fp", action = 'store_true', help = "Do not trim with fastp (use FaQCs only). Cannot skip both.")

    #Compression options. A level of -1 means "not set"; the caller replaces
    #it with the compressor's default. DSRC is not offered yet.
    parser.add_argument("--zip", dest = "compressor", default = "gzip", help = "Select a compressor for outputs. Supported options are: 'gzip' (default), 'pigz'")
    parser.add_argument("--level", dest = "zip_level", default = -1, help = "Choose a compression level for outputs. gzip and pigz take values 1-9 with default 6")

    parser.add_argument("--resources", dest = "resource_list", action = 'store_true', help = "Print a list of resources used by Multitrim and quit.")

    return(parser, parser.parse_args())
|
1033
|
+
|
1034
|
+
def print_resources():
    """Print the project links and the FaQCs mandatory adapters."""
    print("Multitrim github: https://github.com/KGerhardt/multitrim")
    print("MiGA adapters available at: https://github.com/bio-miga/miga/blob/main/utils/adapters.fa")

    #One line per built-in adapter: its ID followed by the stored value.
    mandatory = faqcs_internal_adapters()
    print("FaQCs mandatory adapters are:")
    for adapter_id in mandatory:
        print(adapter_id, mandatory[adapter_id])

    print("FaQCs github: https://github.com/LANL-Bioinformatics/FaQCs")
    print("fastp github: https://github.com/OpenGene/fastp")
    print("Falco github: https://github.com/smithlabcode/falco")
|
1044
|
+
|
1045
|
+
#Program Control
|
1046
|
+
def main():
    """Program control for Multitrim.

    Parses the command line, validates the options and input files, creates
    the output directory if needed, detects adapters, trims the reads in
    paired-end or single-end mode, and runs the falco QC step.

    Every early exit uses the same status as before: 1 when both trimmers
    are skipped, 0 for all other exits.
    """
    #Keep the parser on hand so usage can be printed as needed.
    help_message, options = gather_opts()

    if options.resource_list:
        print_resources()
        sys.exit()

    #Allows for the script to take no inputs and print help/usage
    if len(sys.argv) == 1:
        help_message.print_help(sys.stderr)
        sys.exit()

    skip_fq = options.skip_fq
    skip_fp = options.skip_fp

    if skip_fp and skip_fq:
        print("Cannot skip both trimming tools. This would result in no trim at all. Exiting program.")
        sys.exit(1)

    #File name prefix; make sure a non-empty prefix ends with an underscore.
    prefix = str(options.pref)
    if prefix != "" and not prefix.endswith("_"):
        prefix = prefix + "_"

    #Tool names
    fp = "fastp"
    fq = "FaQCs"
    stk = "seqtk"
    that_aint_falco = "falco"

    #Get the reads
    f = options.f
    r = options.r
    u = options.u

    #phred format
    phred = str(options.phred)

    #Number of threads. os.sched_getaffinity does not exist on every
    #platform, so its absence is handled explicitly.
    threads = int(options.threads)
    try:
        available_cores = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):
        available_cores = None

    if options.Sheev:
        #--max: use every core the process is allowed to run on.
        if available_cores is None:
            print("Cannot detect how many cores are available! Defaulting to 1. Use --threads to specify more cores if you see this message.")
            threads = 1
        else:
            threads = available_cores
    elif available_cores is not None:
        #Do not let a user request more processors than are available.
        threads = min(threads, available_cores)

    #A thread count below 1 would break the tools and the worker pool.
    threads = max(1, threads)

    #No reads shorter than minlen
    minlen = str(options.length)

    #Advanced trimming options (fastp only: --trim_poly_g, --low_complexity_filter)
    advanced = options.advanced

    #fastp sliding window width and minimum average quality
    mid = str(options.mid)
    mid_q = str(options.mid_q)

    #FaQCs target score. Lower = less aggressive, higher = more aggressive
    score = str(options.score)

    #Directory to place results. Created if needed, but only one level deep.
    final_output = options.outdir

    #Autocomplete may include the trailing slash; drop it.
    if final_output.endswith("/"):
        final_output = final_output[:-1]

    #Check to make sure reads were actually supplied
    if f == "" and r == "" and u == "":
        print("I need to be given reads! Use -1 and -2 for paired-end reads, or -u for unpaired reads. Exiting program.")
        sys.exit()

    #A forward file must be paired with a reverse file if either is supplied
    if (f == "") != (r == ""):
        print("If you have paired reads, I need both the forward and reverse files. If you just want to process one, use -u to specify it. Exiting program.")
        sys.exit()

    #fastp cannot take both unpaired and paired reads simultaneously
    if u != "" and (r != "" or f != ""):
        print("Multitrim cannot process paired and unpaired reads in the same run. Use -1 and -2 for paired-end reads, or -u for unpaired reads. Exiting program.")
        sys.exit()

    #Determine single or paired end mode
    paired_end = (u == "")

    if paired_end:
        #Report every missing file before exiting.
        quit_out = False
        if not os.path.exists(f):
            print("Forward Reads: " + f + " not found. Multitrim will exit.")
            quit_out = True
        if not os.path.exists(r):
            print("Reverse Reads: " + r + " not found. Multitrim will exit.")
            quit_out = True

        if quit_out:
            sys.exit()
    else:
        if not os.path.exists(u):
            print("Unpaired Reads: " + u + " not found. Multitrim will exit.")
            sys.exit()

    #Create the output directory if needed. This has to happen after the
    #checks above, or the directory could be made by a run that then quits.
    if final_output != "." and not os.path.exists(final_output):
        try:
            os.mkdir(final_output)
        except OSError:
            print("Multitrim wasn't able to find or create the specified output directory. Exiting program.")
            sys.exit()

    compressor = options.compressor
    compression_level = int(options.zip_level)

    #DSRC is not supported yet; only gzip and pigz are accepted.
    if compressor not in ['gzip', 'pigz']:
        print("Chosen compressor '"+compressor+"' not supported! Supported options are: 'gzip', 'pigz'")
        sys.exit()

    #-1 means the user did not choose a level; use the gzip/pigz default.
    if compression_level == -1:
        compression_level = 6

    if not 1 <= compression_level <= 9:
        print("Compression level", compression_level, "not acceptable! For GZIP and PIGZ, supported compression levels are 1-9!")
        sys.exit()

    #Check for input adapter file or default to internal list.
    input_adapters = options.adapter_fasta
    if not os.path.exists(input_adapters) and input_adapters != "internal":
        print("Adapters file", input_adapters, "not found! Exiting.")
        sys.exit()

    #Adapters are detected if minpres% of reads have that adapter according to FaQCs stats.
    minpres = float(options.minpres)

    #Reads user adapters or creates the complete adapters file from scratch
    adapter_set, complete_adapter_file_name, needs_cleanup = read_adapters(input_adapters)

    #User feedback
    if final_output == ".":
        print("Placing results in:", os.getcwd())
    else:
        print("Placing results in:", final_output)

    if options.Sheev:
        print("Using all available cores. Number of cores:", threads)
    else:
        print("Working with", threads, "threads.")

    if paired_end:
        #Two inputs; paired end behavior
        print("Primary Strand Reads:", f, "\nReverse Strand Reads:", r)
        print("Adapters considered detected if present in "+ str(minpres) + " % of reads.")

        pre_qc_f, pre_qc_r, post_qc_f, post_qc_r, post_trim_f, post_trim_r = names_pe(f, r, final_output, prefix)

        adapters_detected = adapter_identification_pe(complete_adapter_file_name, stk, fq, f, r, threads, final_output, minpres, prefix, phred)
        cleaned_adapters = parse_adapters(adapter_set, adapters_detected, final_output, prefix)

        if needs_cleanup:
            print("Removing automatically generated adapters...")
            os.remove(complete_adapter_file_name)

        full_trim_pe(f, r, post_trim_f, post_trim_r, final_output, cleaned_adapters, threads, fq, fp, score, minlen, mid, mid_q, prefix, compressor, compression_level, phred, advanced, skip_fp, skip_fq)

        #falco is skipped for DSRC output (not selectable at present).
        if compressor not in ["dsrc"]:
            falco_qc_pe(f, r, post_trim_f, post_trim_r, pre_qc_f, post_qc_f, pre_qc_r, post_qc_r, threads, that_aint_falco)

    else:
        #One input; single end behavior
        print("Unpaired Reads:", u)
        print("Adapters considered detected if present in "+ str(minpres) + " % of reads.")

        pre_qc, post_qc, post_trim = names_se(u, final_output, prefix)

        adapters_detected = adapter_identification_se(complete_adapter_file_name, stk, fq, u, threads, final_output, minpres, prefix, phred)
        cleaned_adapters = parse_adapters(adapter_set, adapters_detected, final_output, prefix)

        if needs_cleanup:
            print("Removing automatically generated adapters...")
            os.remove(complete_adapter_file_name)

        full_trim_se(u, post_trim, final_output, cleaned_adapters, threads, fq, fp, score, minlen, mid, mid_q, prefix, compressor, compression_level, phred, advanced, skip_fp, skip_fq)

        #falco is skipped for DSRC output (not selectable at present).
        if compressor not in ["dsrc"]:
            falco_qc_se(u, post_trim, pre_qc, post_qc, threads, that_aint_falco)

    print("Trimming complete.")
|
1279
|
+
|
1280
|
+
#just runs main
|
1281
|
+
#Script entry point: run the pipeline only when executed directly, not on import.
if "__main__" == __name__:
    main()
|
1283
|
+
|
1284
|
+
#End of functional components of Multitrim.
|
1285
|
+
|
1286
|
+
#Below are leftover creation functions that could be used to update the list of adapters. Not used in the program proper.
|
1287
|
+
|
1288
|
+
#Regenerate the adapters file output from a fasta. This is a utility function I do not expect to see used in the final product.
|
1289
|
+
def fasta_to_permanent_python(original_adapters_fasta):
    """Print Python assignment statements that rebuild a FASTA as a dict.

    Reads the FASTA file at ``original_adapters_fasta`` and prints one line
    per record of the form ``adapters_dict["<header>"] = "<sequence>"``.
    The header is kept verbatim, including its leading ">"; multi-line
    sequences are concatenated. Nothing is returned.

    Blank lines are skipped rather than treated as end-of-file, and an
    empty file produces no output.
    """
    fasta_seq_dict = {}

    current_id = None
    seq_chunks = []

    # The context manager guarantees the handle is closed even if reading fails.
    with open(original_adapters_fasta, "r") as fasta:
        for raw_line in fasta:
            current_line = raw_line.strip()

            #A blank line carries no data; it must not end the parse.
            if not current_line:
                continue

            #The first non-blank line is always taken as a record ID, as before.
            if current_id is None or current_line.startswith(">"):
                if current_id is not None:
                    fasta_seq_dict[current_id] = "".join(seq_chunks)
                current_id = current_line
                seq_chunks = []
            else:
                seq_chunks.append(current_line)

    #The last record has no following header to trigger its storage.
    if current_id is not None:
        fasta_seq_dict[current_id] = "".join(seq_chunks)

    for contig in fasta_seq_dict:
        print("adapters_dict[\""+contig+"\"] = \""+ fasta_seq_dict[contig]+"\"")
|
1318
|
+
|
1319
|
+
#These spit out spoofed python code for the in-built creation of an adapters file to supply tools, without the external need for this file.
|
1320
|
+
#I just copy-paste the results to the generate_adapters_temporary_file function's body to get the results.
|
1321
|
+
|
1322
|
+
#As above, this prints out a python-correct set of commands for me to copy-paste.
|
1323
|
+
#This one produces a set of "families" for adapters, where each is the set of adapters in a kit.
|
1324
|
+
def fasta_to_families(original_adapters_fasta):
    """Print Python assignments mapping each FASTA record to an adapter family.

    Families are assigned purely by record position: the first
    ``whichfam[0]`` headers belong to ``families[0]``, the next
    ``whichfam[1]`` to ``families[1]``, and so on. One line of the form
    ``adapters_fam_dict["<name>"] = "<family>"`` is printed per distinct
    header, where ``<name>`` is the header without its leading ">".
    Nothing is returned.

    Raises:
        IndexError: if the FASTA holds more headers than the family size
            table accounts for.
    """
    #Number of consecutive FASTA records belonging to each family, in order.
    whichfam = [5, 6, 10, 6, 7, 12, 14, 3, 13, 50, 9, 5, 6]

    families = [
        "singleend",
        "pairedend",
        "dpnII",
        "smallrna",
        "multiplex",
        "pcr",
        "dpnIIgex",
        "otherrna",
        "trueseq",
        "rnapcr",
        "abi",
        "trueseq2",
        "nextera",
    ]

    #Expand the sizes into one family label per expected record position.
    famlist = []
    for family_name, family_size in zip(families, whichfam):
        famlist.extend([family_name] * family_size)

    fasta_fam_dict = {}
    record_index = 0

    #The context manager closes the file even when the size check below raises.
    with open(original_adapters_fasta, "r") as fasta:
        for line in fasta:
            header = line.strip()
            if not header.startswith(">"):
                continue

            if record_index >= len(famlist):
                raise IndexError(
                    "FASTA has more than " + str(len(famlist))
                    + " records; the family size table does not cover record "
                    + str(record_index + 1)
                )

            fasta_fam_dict[header[1:]] = famlist[record_index]
            record_index += 1

    for contig in fasta_fam_dict:
        print("adapters_fam_dict[\""+contig+"\"] = \""+ fasta_fam_dict[contig]+"\"")
|
1366
|
+
|
1367
|
+
#identifies the same adapters as in the full file with a family of origin, so that all adapters in a family can be selected.
|
1368
|
+
def create_adapter_families():
    """Return a dict mapping each known adapter name to its family label.

    The table below lists the families in order, each with its member
    adapter names; the returned dict preserves that order. Adapter names
    are reproduced exactly as spelled in the adapter set (including
    spellings such as "Apapter" and "Paried"), since they are lookup keys.
    """
    def numbered(stem, last):
        #Names stem1 .. stem<last>, in ascending order.
        return [stem + str(number) for number in range(1, last + 1)]

    kits = (
        ("singleend", [
            "Illumina_Single_End_Apapter_1",
            "Illumina_Single_End_Apapter_2",
            "Illumina_Single_End_PCR_Primer_1",
            "Illumina_Single_End_PCR_Primer_2",
            "Illumina_Single_End_Sequencing_Primer",
        ]),
        ("pairedend", [
            "Illumina_Paired_End_Adapter_1",
            "Illumina_Paired_End_Adapter_2",
            "Illumina_Paried_End_PCR_Primer_1",
            "Illumina_Paired_End_PCR_Primer_2",
            "Illumina_Paried_End_Sequencing_Primer_1",
            "Illumina_Paired_End_Sequencing_Primer_2",
        ]),
        ("dpnII", [
            "Illumina_DpnII_expression_Adapter_1",
            "Illumina_DpnII_expression_Adapter_2",
            "Illumina_DpnII_expression_PCR_Primer_1",
            "Illumina_DpnII_expression_PCR_Primer_2",
            "Illumina_DpnII_expression_Sequencing_Primer",
            "Illumina_NlaIII_expression_Adapter_1",
            "Illumina_NlaIII_expression_Adapter_2",
            "Illumina_NlaIII_expression_PCR_Primer_1",
            "Illumina_NlaIII_expression_PCR_Primer_2",
            "Illumina_NlaIII_expression_Sequencing_Primer",
        ]),
        ("smallrna", [
            "Illumina_Small_RNA_Adapter_1",
            "Illumina_Small_RNA_Adapter_2",
            "Illumina_Small_RNA_RT_Primer",
            "Illumina_Small_RNA_PCR_Primer_1",
            "Illumina_Small_RNA_PCR_Primer_2",
            "Illumina_Small_RNA_Sequencing_Primer",
        ]),
        ("multiplex", [
            "Illumina_Multiplexing_Adapter_1",
            "Illumina_Multiplexing_Adapter_2",
            "Illumina_Multiplexing_PCR_Primer_1.01",
            "Illumina_Multiplexing_PCR_Primer_2.01",
            "Illumina_Multiplexing_Read1_Sequencing_Primer",
            "Illumina_Multiplexing_Index_Sequencing_Primer",
            "Illumina_Multiplexing_Read2_Sequencing_Primer",
        ]),
        ("pcr", numbered("Illumina_PCR_Primer_Index_", 12)),
        ("dpnIIgex", [
            "Illumina_DpnII_Gex_Adapter_1",
            "Illumina_DpnII_Gex_Adapter_1.01",
            "Illumina_DpnII_Gex_Adapter_2",
            "Illumina_DpnII_Gex_Adapter_2.01",
            "Illumina_DpnII_Gex_PCR_Primer_1",
            "Illumina_DpnII_Gex_PCR_Primer_2",
            "Illumina_DpnII_Gex_Sequencing_Primer",
            "Illumina_NlaIII_Gex_Adapter_1.01",
            "Illumina_NlaIII_Gex_Adapter_1.02",
            "Illumina_NlaIII_Gex_Adapter_2.01",
            "Illumina_NlaIII_Gex_Adapter_2.02",
            "Illumina_NlaIII_Gex_PCR_Primer_1",
            "Illumina_NlaIII_Gex_PCR_Primer_2",
            "Illumina_NlaIII_Gex_Sequencing_Primer",
        ]),
        ("otherrna", [
            "Illumina_5p_RNA_Adapter",
            "Illumina_RNA_Adapter1",
            "Illumina_Small_RNA_3p_Adapter_1",
        ]),
        ("trueseq",
            ["TruSeq_Universal_Adapter"]
            + numbered("TruSeq_Adapter_Index_", 12)),
        ("rnapcr",
            ["Illumina_RNA_RT_Primer", "Illumina_RNA_PCR_Primer"]
            + numbered("RNA_PCR_Primer_Index_", 48)),
        ("abi", [
            "ABI_Dynabead_EcoP_Oligo",
            "ABI_Solid3_Adapter_A",
            "ABI_Solid3_Adapter_B",
            "ABI_Solid3_5_AMP_Primer",
            "ABI_Solid3_3_AMP_Primer",
            "ABI_Solid3_EF1_alpha_Sense_Primer",
            "ABI_Solid3_EF1_alpha_Antisense_Primer",
            "ABI_Solid3_GAPDH_Forward_Primer",
            "ABI_Solid3_GAPDH_Reverse_Primer",
        ]),
        ("trueseq2", [
            "TruSeq2_SE",
            "TruSeq2_PE_f",
            "TruSeq2_PE_r",
            "TruSeq3_IndexedAdapter",
            "TruSeq3_UniversalAdapter",
        ]),
        ("nextera", [
            "Nextera_PE_PrefixNX/1",
            "Nextera_PE_PrefixNX/2",
            "Nextera_PE_Trans1",
            "Nextera_PE_Trans1_rc",
            "Nextera_PE_Trans2",
            "Nextera_PE_Trans2_rc",
        ]),
    )

    adapters_fam_dict = {}
    for kit_label, adapter_names in kits:
        for adapter_name in adapter_names:
            adapters_fam_dict[adapter_name] = kit_label

    return adapters_fam_dict
|
1519
|
+
|
1520
|
+
#creates python code for family:id:sequence dictionary for kit detection code. Relies on the current state of the generate_adapters_temporary_file function.
|
1521
|
+
def create_seq_to_fam():
    """Print Python assignments for a family -> {adapter id -> sequence} dict.

    Calls generate_adapters_temporary_file() for the adapter id -> sequence
    mapping and create_adapter_families() for the adapter -> family mapping,
    then prints one ``fam_to_id_to_seq['<family>'] = {...}`` line per family.
    Nothing is returned.

    The file created by generate_adapters_temporary_file() is removed before
    returning, including when a lookup fails.

    Raises:
        KeyError: if an adapter id has no entry in the family mapping.
    """
    #NOTE(review): second value is passed to os.remove, so presumably a file path - confirm against generate_adapters_temporary_file.
    adapter_seqs, name = generate_adapters_temporary_file()

    try:
        fams = create_adapter_families()

        fam_to_id_to_seq = {}

        for adapter_id in adapter_seqs:
            #Family keys omit the first character of the id (presumably the FASTA ">" marker).
            bare_id = adapter_id[1:]
            family = fams[bare_id]
            fam_to_id_to_seq.setdefault(family, {})[bare_id] = adapter_seqs[adapter_id]

        for fam in fam_to_id_to_seq:
            print("fam_to_id_to_seq['"+fam+"'] =", fam_to_id_to_seq[fam])
    finally:
        #Always clean up the generated file, even if a lookup above raised.
        os.remove(name)
|