miga-base 1.2.17.0 → 1.2.17.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (265) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +2 -2
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI +3659 -0
  7. data/utils/FastAAI/FastAAI-legacy/FastAAI +1336 -0
  8. data/utils/FastAAI/FastAAI-legacy/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/FastAAI/README.md +84 -0
  10. data/utils/enveomics/Docs/recplot2.md +244 -0
  11. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  13. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  14. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  15. data/utils/enveomics/LICENSE.txt +73 -0
  16. data/utils/enveomics/Makefile +52 -0
  17. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  18. data/utils/enveomics/Manifest/Tasks/blasttab.json +790 -0
  19. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  20. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  21. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  22. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  23. data/utils/enveomics/Manifest/Tasks/mapping.json +165 -0
  24. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  25. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  26. data/utils/enveomics/Manifest/Tasks/remote.json +356 -0
  27. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +650 -0
  28. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  29. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  30. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  31. data/utils/enveomics/Manifest/categories.json +165 -0
  32. data/utils/enveomics/Manifest/examples.json +162 -0
  33. data/utils/enveomics/Manifest/tasks.json +4 -0
  34. data/utils/enveomics/README.md +42 -0
  35. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  36. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  37. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  38. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  39. data/utils/enveomics/Scripts/BedGraph.tad.rb +138 -0
  40. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  41. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  42. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  43. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  44. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  45. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  46. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  47. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  48. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  49. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  50. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  51. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  52. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  53. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  54. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  55. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  56. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  57. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  58. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +123 -0
  59. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  60. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  61. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  62. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  63. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  64. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  65. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  66. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  67. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  68. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  69. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  70. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  71. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  72. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  73. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  74. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  75. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  76. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  77. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  78. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  79. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  80. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  81. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  82. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  83. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  84. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  85. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  86. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  87. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  88. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  89. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  90. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  91. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  92. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  93. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  94. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  95. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  96. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  97. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  98. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  99. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  100. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  101. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  102. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  103. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  104. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  105. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  106. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  107. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  108. data/utils/enveomics/Scripts/SRA.download.bash +67 -0
  109. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  110. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  111. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  112. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  113. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  114. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  115. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  116. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  117. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  118. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  119. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  120. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  121. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  122. data/utils/enveomics/Scripts/aai.rb +421 -0
  123. data/utils/enveomics/Scripts/ani.rb +362 -0
  124. data/utils/enveomics/Scripts/anir.rb +137 -0
  125. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  126. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  127. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  128. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  129. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  130. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  131. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  132. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  133. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  134. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  135. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  136. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  137. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +88 -0
  138. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  139. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  140. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  141. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  142. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  143. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  144. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  145. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +74 -0
  146. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  147. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  148. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  149. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  150. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  151. data/utils/enveomics/Scripts/ogs.rb +104 -0
  152. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  153. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  154. data/utils/enveomics/Scripts/rbm.rb +108 -0
  155. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  156. data/utils/enveomics/Tests/Makefile +10 -0
  157. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  158. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  159. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  160. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  161. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  162. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  163. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  164. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  165. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  166. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  167. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  168. data/utils/enveomics/Tests/alkB.nwk +1 -0
  169. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  170. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  171. data/utils/enveomics/Tests/hiv1.faa +59 -0
  172. data/utils/enveomics/Tests/hiv1.fna +134 -0
  173. data/utils/enveomics/Tests/hiv2.faa +70 -0
  174. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  175. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  176. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  177. data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
  178. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  179. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  180. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  181. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  182. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  183. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  184. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  185. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  186. data/utils/enveomics/build_enveomics_r.bash +45 -0
  187. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  188. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  189. data/utils/enveomics/enveomics.R/R/autoprune.R +167 -0
  190. data/utils/enveomics/enveomics.R/R/barplot.R +203 -0
  191. data/utils/enveomics/enveomics.R/R/cliopts.R +141 -0
  192. data/utils/enveomics/enveomics.R/R/df2dist.R +192 -0
  193. data/utils/enveomics/enveomics.R/R/growthcurve.R +349 -0
  194. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  195. data/utils/enveomics/enveomics.R/R/recplot.R +419 -0
  196. data/utils/enveomics/enveomics.R/R/recplot2.R +1698 -0
  197. data/utils/enveomics/enveomics.R/R/tribs.R +638 -0
  198. data/utils/enveomics/enveomics.R/R/utils.R +90 -0
  199. data/utils/enveomics/enveomics.R/README.md +81 -0
  200. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  201. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  202. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  203. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  204. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  205. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  206. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +47 -0
  207. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  208. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  209. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +26 -0
  210. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +26 -0
  211. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +44 -0
  212. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +111 -0
  213. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  214. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +34 -0
  215. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +25 -0
  216. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +59 -0
  217. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +63 -0
  218. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +46 -0
  219. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +78 -0
  220. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  221. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  222. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +147 -0
  223. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  224. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +27 -0
  225. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  226. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +28 -0
  227. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +24 -0
  228. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +22 -0
  229. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +22 -0
  230. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +52 -0
  231. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  232. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +21 -0
  233. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  234. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +34 -0
  235. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +23 -0
  236. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +24 -0
  237. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +31 -0
  238. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +56 -0
  239. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +20 -0
  240. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  241. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  242. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  243. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  244. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  245. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  246. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  247. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  248. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  249. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  250. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  251. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  252. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +81 -0
  253. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +49 -0
  254. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +48 -0
  255. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  256. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +22 -0
  257. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +22 -0
  258. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +22 -0
  259. data/utils/enveomics/globals.mk +8 -0
  260. data/utils/enveomics/manifest.json +9 -0
  261. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  262. data/utils/multitrim/README.md +67 -0
  263. data/utils/multitrim/multitrim.py +1555 -0
  264. data/utils/multitrim/multitrim.yml +13 -0
  265. metadata +268 -6
@@ -0,0 +1,1555 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import os
5
+ import subprocess
6
+ import tempfile
7
+ import argparse
8
+ import multiprocessing
9
+ import re
10
+ import shutil
11
+ from datetime import datetime
12
+
13
+ #Reads a file with adapters and uses them as the starting set for adapter identification. By default, uses the current MiGA adapter list as of Feb. 23, 2021
14
def read_adapters(adapters_fasta):
    """Load the starting set of adapters used for adapter identification.

    Args:
        adapters_fasta: Path to a FASTA file of adapter sequences, or the
            string "internal" to use the built-in adapter list produced by
            generate_adapters_temporary_file().

    Returns:
        A tuple ``(adapters, adapters_fasta, cleanup)``. ``adapters`` maps
        each adapter ID (the header line without its leading ">") to its
        sequence, ``adapters_fasta`` is the path of the FASTA file holding
        those adapters, and ``cleanup`` is True only when that file was
        generated here for the "internal" case (presumably so the caller
        can delete it afterwards).

    Raises:
        OSError: If a user-supplied FASTA path cannot be opened.
    """
    if adapters_fasta == "internal":
        adapters, adapters_fasta = generate_adapters_temporary_file()
        return adapters, adapters_fasta, True

    adapters = {}
    current_id = ""
    seq_parts = []

    #The context manager guarantees the handle is closed even if reading fails.
    with open(adapters_fasta, "r") as adapt:
        for line in adapt:
            if line.startswith(">"):
                #A new header ends the previous record; records without any sequence are skipped.
                if seq_parts:
                    adapters[current_id] = "".join(seq_parts)
                current_id = line.strip()[1:]
                seq_parts = []
            else:
                chunk = line.strip()
                if chunk:
                    seq_parts.append(chunk)

    #Flush the last record, applying the same "must have sequence" rule as above
    #so an empty file or a dangling header never produces an empty adapter.
    if seq_parts:
        adapters[current_id] = "".join(seq_parts)

    return adapters, adapters_fasta, False
42
+
43
+ #Only contains adapters we already recognize as part of a kit. It will need to be updated as new ones are added.
44
def family_detection(adapter_seqs):
    """Expand a set of adapters with every member of each kit family it touches.

    A family is "detected" when any sequence in ``adapter_seqs`` is exactly
    equal to one of that family's sequences. Every sequence of a detected
    family that is not already present (by sequence, not by ID) is then
    added under its preset ID.

    Families currently known: singleend, pairedend, dpnII, smallrna,
    multiplex, pcr, dpnIIgex, otherrna, trueseq, rnapcr, abi, trueseq2,
    nextera, cre-loxp, truseq1, pcr_primer, nextera_junction and
    Nextera-primer-adapter.

    Args:
        adapter_seqs: Dict mapping adapter ID to sequence. NOTE: this dict
            is updated in place with the added family members.

    Returns:
        A new dict with the same entries as the updated ``adapter_seqs``,
        in the same order, with ">" prepended to every ID.
    """
    #There are some repeats in adapters. All are added - this is meant to make the program as conservative as possible.
    fam_to_id_to_seq = {
        #MiGA adapters
        'singleend': {
            'Illumina_Single_End_Apapter_1': 'ACACTCTTTCCCTACACGACGCTGTTCCATCT',
            'Illumina_Single_End_Apapter_2': 'CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT',
            'Illumina_Single_End_PCR_Primer_1': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Single_End_PCR_Primer_2': 'CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT',
            'Illumina_Single_End_Sequencing_Primer': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
        },
        'pairedend': {
            'Illumina_Paired_End_Adapter_1': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Paired_End_Adapter_2': 'CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT',
            'Illumina_Paried_End_PCR_Primer_1': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Paired_End_PCR_Primer_2': 'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT',
            'Illumina_Paried_End_Sequencing_Primer_1': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Paired_End_Sequencing_Primer_2': 'CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT',
        },
        'dpnII': {
            'Illumina_DpnII_expression_Adapter_1': 'ACAGGTTCAGAGTTCTACAGTCCGAC',
            'Illumina_DpnII_expression_Adapter_2': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_DpnII_expression_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_DpnII_expression_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA',
            'Illumina_DpnII_expression_Sequencing_Primer': 'CGACAGGTTCAGAGTTCTACAGTCCGACGATC',
            'Illumina_NlaIII_expression_Adapter_1': 'ACAGGTTCAGAGTTCTACAGTCCGACATG',
            'Illumina_NlaIII_expression_Adapter_2': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_NlaIII_expression_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_NlaIII_expression_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA',
            'Illumina_NlaIII_expression_Sequencing_Primer': 'CCGACAGGTTCAGAGTTCTACAGTCCGACATG',
        },
        'smallrna': {
            'Illumina_Small_RNA_Adapter_1': 'GTTCAGAGTTCTACAGTCCGACGATC',
            'Illumina_Small_RNA_Adapter_2': 'TCGTATGCCGTCTTCTGCTTGT',
            'Illumina_Small_RNA_RT_Primer': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_Small_RNA_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_Small_RNA_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA',
            'Illumina_Small_RNA_Sequencing_Primer': 'CGACAGGTTCAGAGTTCTACAGTCCGACGATC',
        },
        'multiplex': {
            'Illumina_Multiplexing_Adapter_1': 'GATCGGAAGAGCACACGTCT',
            'Illumina_Multiplexing_Adapter_2': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Multiplexing_PCR_Primer_1.01': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Multiplexing_PCR_Primer_2.01': 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT',
            'Illumina_Multiplexing_Read1_Sequencing_Primer': 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'Illumina_Multiplexing_Index_Sequencing_Primer': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
            'Illumina_Multiplexing_Read2_Sequencing_Primer': 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT',
        },
        'pcr': {
            'Illumina_PCR_Primer_Index_1': 'CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_2': 'CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_3': 'CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_4': 'CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_5': 'CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_6': 'CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_7': 'CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_8': 'CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_9': 'CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_10': 'CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_11': 'CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC',
            'Illumina_PCR_Primer_Index_12': 'CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC',
        },
        'dpnIIgex': {
            'Illumina_DpnII_Gex_Adapter_1': 'GATCGTCGGACTGTAGAACTCTGAAC',
            'Illumina_DpnII_Gex_Adapter_1.01': 'ACAGGTTCAGAGTTCTACAGTCCGAC',
            'Illumina_DpnII_Gex_Adapter_2': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_DpnII_Gex_Adapter_2.01': 'TCGTATGCCGTCTTCTGCTTG',
            'Illumina_DpnII_Gex_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_DpnII_Gex_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA',
            'Illumina_DpnII_Gex_Sequencing_Primer': 'CGACAGGTTCAGAGTTCTACAGTCCGACGATC',
            'Illumina_NlaIII_Gex_Adapter_1.01': 'TCGGACTGTAGAACTCTGAAC',
            'Illumina_NlaIII_Gex_Adapter_1.02': 'ACAGGTTCAGAGTTCTACAGTCCGACATG',
            'Illumina_NlaIII_Gex_Adapter_2.01': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_NlaIII_Gex_Adapter_2.02': 'TCGTATGCCGTCTTCTGCTTG',
            'Illumina_NlaIII_Gex_PCR_Primer_1': 'CAAGCAGAAGACGGCATACGA',
            'Illumina_NlaIII_Gex_PCR_Primer_2': 'AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA',
            'Illumina_NlaIII_Gex_Sequencing_Primer': 'CCGACAGGTTCAGAGTTCTACAGTCCGACATG',
        },
        'otherrna': {
            'Illumina_5p_RNA_Adapter': 'GTTCAGAGTTCTACAGTCCGACGATC',
            'Illumina_RNA_Adapter1': 'TCGTATGCCGTCTTCTGCTTGT',
            'Illumina_Small_RNA_3p_Adapter_1': 'ATCTCGTATGCCGTCTTCTGCTTG',
        },
        'trueseq': {
            'TruSeq_Universal_Adapter': 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT',
            'TruSeq_Adapter_Index_1': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_2': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_3': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_4': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_5': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_6': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_7': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_8': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_9': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_10': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_11': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq_Adapter_Index_12': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG',
        },
        'rnapcr': {
            'Illumina_RNA_RT_Primer': 'GCCTTGGCACCCGAGAATTCCA',
            'Illumina_RNA_PCR_Primer': 'AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA',
            'RNA_PCR_Primer_Index_1': 'CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_2': 'CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_3': 'CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_4': 'CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_5': 'CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_6': 'CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_7': 'CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_8': 'CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_9': 'CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_10': 'CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_11': 'CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_12': 'CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_13': 'CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_14': 'CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_15': 'CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_16': 'CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_17': 'CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_18': 'CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_19': 'CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_20': 'CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_21': 'CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_22': 'CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_23': 'CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_24': 'CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_25': 'CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_26': 'CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_27': 'CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_28': 'CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_29': 'CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_30': 'CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_31': 'CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_32': 'CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_33': 'CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_34': 'CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_35': 'CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_36': 'CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_37': 'CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_38': 'CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_39': 'CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_40': 'CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_41': 'CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_42': 'CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_43': 'CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_44': 'CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_45': 'CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_46': 'CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_47': 'CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
            'RNA_PCR_Primer_Index_48': 'CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA',
        },
        'abi': {
            'ABI_Dynabead_EcoP_Oligo': 'CTGATCTAGAGGTACCGGATCCCAGCAGT',
            'ABI_Solid3_Adapter_A': 'CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG',
            'ABI_Solid3_Adapter_B': 'CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT',
            'ABI_Solid3_5_AMP_Primer': 'CCACTACGCCTCCGCTTTCCTCTCTATG',
            'ABI_Solid3_3_AMP_Primer': 'CTGCCCCGGGTTCCTCATTCT',
            'ABI_Solid3_EF1_alpha_Sense_Primer': 'CATGTGTGTTGAGAGCTTC',
            'ABI_Solid3_EF1_alpha_Antisense_Primer': 'GAAAACCAAAGTGGTCCAC',
            'ABI_Solid3_GAPDH_Forward_Primer': 'TTAGCACCCCTGGCCAAGG',
            'ABI_Solid3_GAPDH_Reverse_Primer': 'CTTACTCCTTGGAGGCCATG',
        },
        'trueseq2': {
            'TruSeq2_SE': 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG',
            'TruSeq2_PE_f': 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT',
            'TruSeq2_PE_r': 'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
            'TruSeq3_IndexedAdapter': 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
            'TruSeq3_UniversalAdapter': 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA',
        },
        'nextera': {
            'Nextera_PE_PrefixNX/1': 'AGATGTGTATAAGAGACAG',
            'Nextera_PE_PrefixNX/2': 'AGATGTGTATAAGAGACAG',
            'Nextera_PE_Trans1': 'TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG',
            'Nextera_PE_Trans1_rc': 'CTGTCTCTTATACACATCTGACGCTGCCGACGA',
            'Nextera_PE_Trans2': 'GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG',
            'Nextera_PE_Trans2_rc': 'CTGTCTCTTATACACATCTCCGAGCCCACGAGAC',
        },
        #FaQCs adapters for safety
        'cre-loxp': {
            'cre-loxp-forward': 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG',
            'cre-loxp-reverse': 'AGCATATTGAAGCATATTACATACGATATGCTTCAATAATGC',
        },
        'truseq1': {
            'TruSeq-adapter-1': 'GGGGTAGTGTGGATCCTCCTCTAGGCAGTTGGGTTATTCTAGAAGCAGATGTGTTGGCTGTTTCTGAAACTCTGGAAAA',
            'TruSeq-adapter-3': 'CAACAGCCGGTCAAAACATCTGGAGGGTAAGCCATAAACACCTCAACAGAAAA',
        },
        'pcr_primer': {
            'PCR-primer-1': 'CGATAACTTCGTATAATGTATGCTATACGAAGTTATTACG',
            'PCR-primer-2': 'GCATAACTTCGTATAGCATACATTATACGAAGTTATACGA',
        },
        'nextera_junction': {
            'Nextera-junction-adapter-1': 'CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG',
        },
        'Nextera-primer-adapter': {
            'Nextera-primer-adapter-1': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
            'Nextera-primer-adapter-2': 'GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT',
        },
    }

    #Per-family sequence sets, built once so detection is a hash lookup instead of a scan over dict values.
    family_sequences = {
        family: set(members.values())
        for family, members in fam_to_id_to_seq.items()
    }

    #Families are recorded in order of first detection: user sequences in their given order, then table order.
    detected_fams = []
    for seq in adapter_seqs.values():
        for family, sequences in family_sequences.items():
            if seq in sequences and family not in detected_fams:
                detected_fams.append(family)

    #If a family was detected, add ALL of its sequences except the ones already present.
    for family in detected_fams:
        for adapter_id, sequence in fam_to_id_to_seq[family].items():
            #Checked against the live dict on purpose: sequences repeated within or across
            #families (or supplied by the user under another name) are added only once.
            if sequence not in adapter_seqs.values():
                adapter_seqs[adapter_id] = sequence

    #add '>' to the start of each ID so entries can be printed directly as FASTA headers.
    return {">" + adapter_id: sequence for adapter_id, sequence in adapter_seqs.items()}
112
+
113
+ #This contains code which generates a complete list of illumina adapters from scratch
114
def generate_adapters_temporary_file():
    """Write the complete built-in adapter list to a temporary FASTA file.

    The adapters are grouped into families, marked with comments below. Any
    adapter recognized in one of these during preprocessing will include all
    of the members of its family in the final filtering fasta, e.g. seeing
    Illumina_Single_End_Apapter_1 will include Illumina_Single_End_Apapter_1,
    Illumina_Single_End_Apapter_2, Illumina_Single_End_PCR_Primer_1,
    Illumina_Single_End_PCR_Primer_2 and Illumina_Single_End_Sequencing_Primer.

    Returns:
        A tuple ``(adapters_dict, name)``: the dict mapping adapter ID to
        sequence, and the path of the FASTA file that was written. The file
        is created with ``delete=False``, so it persists after this function
        returns and is not removed here.
    """
    adapters_dict = {
        #Single end family
        "Illumina_Single_End_Apapter_1": "ACACTCTTTCCCTACACGACGCTGTTCCATCT",
        "Illumina_Single_End_Apapter_2": "CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT",
        "Illumina_Single_End_PCR_Primer_1": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Single_End_PCR_Primer_2": "CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT",
        "Illumina_Single_End_Sequencing_Primer": "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",

        #Paired end family
        "Illumina_Paired_End_Adapter_1": "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Paired_End_Adapter_2": "CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT",
        "Illumina_Paried_End_PCR_Primer_1": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Paired_End_PCR_Primer_2": "CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT",
        "Illumina_Paried_End_Sequencing_Primer_1": "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Paired_End_Sequencing_Primer_2": "CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT",

        #DpnII family
        "Illumina_DpnII_expression_Adapter_1": "ACAGGTTCAGAGTTCTACAGTCCGAC",
        "Illumina_DpnII_expression_Adapter_2": "CAAGCAGAAGACGGCATACGA",
        "Illumina_DpnII_expression_PCR_Primer_1": "CAAGCAGAAGACGGCATACGA",
        "Illumina_DpnII_expression_PCR_Primer_2": "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
        "Illumina_DpnII_expression_Sequencing_Primer": "CGACAGGTTCAGAGTTCTACAGTCCGACGATC",
        "Illumina_NlaIII_expression_Adapter_1": "ACAGGTTCAGAGTTCTACAGTCCGACATG",
        "Illumina_NlaIII_expression_Adapter_2": "CAAGCAGAAGACGGCATACGA",
        "Illumina_NlaIII_expression_PCR_Primer_1": "CAAGCAGAAGACGGCATACGA",
        "Illumina_NlaIII_expression_PCR_Primer_2": "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
        "Illumina_NlaIII_expression_Sequencing_Primer": "CCGACAGGTTCAGAGTTCTACAGTCCGACATG",

        #Small RNA family
        "Illumina_Small_RNA_Adapter_1": "GTTCAGAGTTCTACAGTCCGACGATC",
        "Illumina_Small_RNA_Adapter_2": "TCGTATGCCGTCTTCTGCTTGT",
        "Illumina_Small_RNA_RT_Primer": "CAAGCAGAAGACGGCATACGA",
        "Illumina_Small_RNA_PCR_Primer_1": "CAAGCAGAAGACGGCATACGA",
        "Illumina_Small_RNA_PCR_Primer_2": "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
        "Illumina_Small_RNA_Sequencing_Primer": "CGACAGGTTCAGAGTTCTACAGTCCGACGATC",

        #Multiplexing Family
        "Illumina_Multiplexing_Adapter_1": "GATCGGAAGAGCACACGTCT",
        "Illumina_Multiplexing_Adapter_2": "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Multiplexing_PCR_Primer_1.01": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Multiplexing_PCR_Primer_2.01": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT",
        "Illumina_Multiplexing_Read1_Sequencing_Primer": "ACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "Illumina_Multiplexing_Index_Sequencing_Primer": "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
        "Illumina_Multiplexing_Read2_Sequencing_Primer": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT",

        #PCR primer family
        "Illumina_PCR_Primer_Index_1": "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_2": "CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_3": "CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_4": "CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_5": "CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_6": "CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_7": "CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_8": "CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_9": "CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_10": "CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_11": "CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC",
        "Illumina_PCR_Primer_Index_12": "CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC",

        #DpnII Gex family
        "Illumina_DpnII_Gex_Adapter_1": "GATCGTCGGACTGTAGAACTCTGAAC",
        "Illumina_DpnII_Gex_Adapter_1.01": "ACAGGTTCAGAGTTCTACAGTCCGAC",
        "Illumina_DpnII_Gex_Adapter_2": "CAAGCAGAAGACGGCATACGA",
        "Illumina_DpnII_Gex_Adapter_2.01": "TCGTATGCCGTCTTCTGCTTG",
        "Illumina_DpnII_Gex_PCR_Primer_1": "CAAGCAGAAGACGGCATACGA",
        "Illumina_DpnII_Gex_PCR_Primer_2": "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
        "Illumina_DpnII_Gex_Sequencing_Primer": "CGACAGGTTCAGAGTTCTACAGTCCGACGATC",
        "Illumina_NlaIII_Gex_Adapter_1.01": "TCGGACTGTAGAACTCTGAAC",
        "Illumina_NlaIII_Gex_Adapter_1.02": "ACAGGTTCAGAGTTCTACAGTCCGACATG",
        "Illumina_NlaIII_Gex_Adapter_2.01": "CAAGCAGAAGACGGCATACGA",
        "Illumina_NlaIII_Gex_Adapter_2.02": "TCGTATGCCGTCTTCTGCTTG",
        "Illumina_NlaIII_Gex_PCR_Primer_1": "CAAGCAGAAGACGGCATACGA",
        "Illumina_NlaIII_Gex_PCR_Primer_2": "AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA",
        "Illumina_NlaIII_Gex_Sequencing_Primer": "CCGACAGGTTCAGAGTTCTACAGTCCGACATG",

        #Other RNA family
        "Illumina_5p_RNA_Adapter": "GTTCAGAGTTCTACAGTCCGACGATC",
        "Illumina_RNA_Adapter1": "TCGTATGCCGTCTTCTGCTTGT",
        "Illumina_Small_RNA_3p_Adapter_1": "ATCTCGTATGCCGTCTTCTGCTTG",

        #TrueSeq family
        "TruSeq_Universal_Adapter": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
        "TruSeq_Adapter_Index_1": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_2": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_3": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_4": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_5": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_6": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_7": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_8": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_9": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_10": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_11": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq_Adapter_Index_12": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG",

        #RNA PCR family
        "Illumina_RNA_RT_Primer": "GCCTTGGCACCCGAGAATTCCA",
        "Illumina_RNA_PCR_Primer": "AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA",
        "RNA_PCR_Primer_Index_1": "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_2": "CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_3": "CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_4": "CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_5": "CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_6": "CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_7": "CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_8": "CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_9": "CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_10": "CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_11": "CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_12": "CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_13": "CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_14": "CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_15": "CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_16": "CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_17": "CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_18": "CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_19": "CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_20": "CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_21": "CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_22": "CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_23": "CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_24": "CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_25": "CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_26": "CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_27": "CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_28": "CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_29": "CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_30": "CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_31": "CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_32": "CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_33": "CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_34": "CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_35": "CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_36": "CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_37": "CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_38": "CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_39": "CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_40": "CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_41": "CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_42": "CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_43": "CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_44": "CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_45": "CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_46": "CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_47": "CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",
        "RNA_PCR_Primer_Index_48": "CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA",

        #ABI family
        "ABI_Dynabead_EcoP_Oligo": "CTGATCTAGAGGTACCGGATCCCAGCAGT",
        "ABI_Solid3_Adapter_A": "CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG",
        "ABI_Solid3_Adapter_B": "CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT",
        "ABI_Solid3_5_AMP_Primer": "CCACTACGCCTCCGCTTTCCTCTCTATG",
        "ABI_Solid3_3_AMP_Primer": "CTGCCCCGGGTTCCTCATTCT",
        "ABI_Solid3_EF1_alpha_Sense_Primer": "CATGTGTGTTGAGAGCTTC",
        "ABI_Solid3_EF1_alpha_Antisense_Primer": "GAAAACCAAAGTGGTCCAC",
        "ABI_Solid3_GAPDH_Forward_Primer": "TTAGCACCCCTGGCCAAGG",
        "ABI_Solid3_GAPDH_Reverse_Primer": "CTTACTCCTTGGAGGCCATG",

        #TrueSeq2 family
        "TruSeq2_SE": "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG",
        "TruSeq2_PE_f": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT",
        "TruSeq2_PE_r": "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG",
        "TruSeq3_IndexedAdapter": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
        "TruSeq3_UniversalAdapter": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA",

        #Nextera Family
        "Nextera_PE_PrefixNX/1": "AGATGTGTATAAGAGACAG",
        "Nextera_PE_PrefixNX/2": "AGATGTGTATAAGAGACAG",
        "Nextera_PE_Trans1": "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG",
        "Nextera_PE_Trans1_rc": "CTGTCTCTTATACACATCTGACGCTGCCGACGA",
        "Nextera_PE_Trans2": "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG",
        "Nextera_PE_Trans2_rc": "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC",
    }

    #delete = False keeps the file on disk after it is closed; the context manager
    #makes sure the handle is closed even if a write fails part-way through.
    with tempfile.NamedTemporaryFile(mode = "w", delete = False) as all_adapters:
        for adapter_id, sequence in adapters_dict.items():
            all_adapters.write(">" + adapter_id + "\n")
            all_adapters.write(sequence + "\n")
        name = all_adapters.name

    return adapters_dict, name
311
+
312
+ #FaQCs supports external adapter sequences, but has no option to EXCLUDE its own internal adapters while doing so.
313
+ #This function returns a dict of ID:adapter for the FaQCs internal sequences so that Multitrim doesn't break should a FaQCs adapter
314
+ #appear in parse_adapters
315
def faqcs_internal_adapters():
    '''
    Return the adapters that FaQCs applies by default, as a dict of ID:sequence.

    A fresh dict is built on every call, so callers may modify the result freely.
    '''
    return {
        #Cre-loxp family
        "cre-loxp-forward": "TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG",
        "cre-loxp-reverse": "AGCATATTGAAGCATATTACATACGATATGCTTCAATAATGC",

        #TruSeq 1 family
        "TruSeq-adapter-1": "GGGGTAGTGTGGATCCTCCTCTAGGCAGTTGGGTTATTCTAGAAGCAGATGTGTTGGCTGTTTCTGAAACTCTGGAAAA",
        "TruSeq-adapter-3": "CAACAGCCGGTCAAAACATCTGGAGGGTAAGCCATAAACACCTCAACAGAAAA",

        #PCR primers
        "PCR-primer-1": "CGATAACTTCGTATAATGTATGCTATACGAAGTTATTACG",
        "PCR-primer-2": "GCATAACTTCGTATAGCATACATTATACGAAGTTATACGA",

        #Nextera Junction family
        "Nextera-junction-adapter-1": "CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG",

        #Nextera-primer-adapter family; duplicates of adapters listed elsewhere, kept here
        #so they remain detectable under the names FaQCs uses internally
        "Nextera-primer-adapter-1": "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC",
        "Nextera-primer-adapter-2": "GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT",
    }
339
+
340
+ #Get file names right up front for ease of use
341
def names_pe(forward, reverse, outdir = ".", prefix = ""):
    '''
    Build every output name used for a paired-end run.

    Each read file is reduced to its base name with a trailing ".gz" and then one
    further extension removed. Returns, in order: pre-trim QC names (forward, reverse),
    post-trim QC names (forward, reverse) and post-trim read files (forward, reverse).
    '''
    stems = []
    for read_path in (forward, reverse):
        stem = os.path.basename(os.path.normpath(read_path))
        if stem.endswith(".gz"):
            stem = stem[:-3]
        stems.append(os.path.splitext(stem)[0])
    fwd_stem, rev_stem = stems

    #Forward outputs are tagged "1.", reverse outputs "2."
    fwd_root = outdir + "/" + prefix + "1."
    rev_root = outdir + "/" + prefix + "2."

    return (
        fwd_root + "pre_trim_QC_" + fwd_stem,
        rev_root + "pre_trim_QC_" + rev_stem,
        fwd_root + "post_trim_QC_" + fwd_stem,
        rev_root + "post_trim_QC_" + rev_stem,
        fwd_root + "post_trim_" + fwd_stem + ".fq",
        rev_root + "post_trim_" + rev_stem + ".fq",
    )
362
+
363
+ #Get file names right up front for ease of use
364
def names_se(reads, outdir = ".", prefix = ""):
    '''
    Build every output name used for a single-end run.

    The read file is reduced to its base name with a trailing ".gz" and then one
    further extension removed. Returns the pre-trim QC name, the post-trim QC name
    and the post-trim read file, in that order.
    '''
    stem = os.path.basename(os.path.normpath(reads))
    if stem.endswith(".gz"):
        stem = stem[:-3]
    stem = os.path.splitext(stem)[0]

    root = outdir + "/" + prefix + "unpaired."

    return (
        root + "pre_trim_QC_" + stem,
        root + "post_trim_QC_" + stem,
        root + "post_trim_" + stem + ".fq",
    )
375
+
376
+ #DSRC needs its own. Whoops.
377
def do_falco(read_name_tool):
    '''
    Run falco on one read file and store its HTML report under a chosen name.

    Falco does not support naming files, but does support selecting output directory.
    As we are possibly generating multiple falco reports simultaneously, the generically
    named files are written to a temp dir and the report is then moved to its final
    location with an appropriate rename.

    read_name_tool is a 3-item sequence: [reads, output_name, falco_path]. The report
    ends up at output_name + ".html". Returns None.

    If falco produces no report, the exception raised by shutil.move propagates; the
    temp dir is removed either way.
    '''
    reads = read_name_tool[0]
    output_name = read_name_tool[1]
    falco_path = read_name_tool[2]

    #temp directory
    loc = tempfile.mkdtemp()

    try:
        #falco's own output is deliberately not silenced so that its errors stay visible
        command = [falco_path, "--quiet", "-o", loc, reads]
        subprocess.call(command)

        #Only the html report is kept.
        shutil.move(loc+"/fastqc_report.html", output_name + ".html")
    finally:
        #Cleanup must also happen when falco failed, otherwise temp dirs pile up
        shutil.rmtree(loc, ignore_errors=True)

    return None
413
+
414
+ #do all QC at once - now old
415
def falco_qc_pe(pre_trim_reads_f, pre_trim_reads_r, post_trim_reads_f, post_trim_reads_r, pre_name_f, post_name_f, pre_name_r, post_name_r, threads, falco_binary):
    '''
    Produce falco reports for the pre- and post-trim reads of a paired-end run.

    Post-trim reads are read from their ".gz" files. Up to four reports are
    generated in parallel, limited by threads.
    '''
    #One [reads, report name, falco binary] job per report, in the form do_falco expects
    jobs = [
        [pre_trim_reads_f, pre_name_f, falco_binary],
        [pre_trim_reads_r, pre_name_r, falco_binary],
        [post_trim_reads_f + ".gz", post_name_f, falco_binary],
        [post_trim_reads_r + ".gz", post_name_r, falco_binary],
    ]

    print("Generating QC reports.")

    workers = multiprocessing.Pool(min(4, threads))
    workers.map(do_falco, jobs)
    workers.close()
430
+
431
+ #do all QC at once - now old
432
def falco_qc_se(pre_trim_reads, post_trim_reads, pre_name, post_name, threads, falco_binary):
    '''
    Produce falco reports for the pre- and post-trim reads of a single-end run.

    Post-trim reads are read from their ".gz" file. The two reports are generated
    in parallel when threads allows it.
    '''
    #One [reads, report name, falco binary] job per report, in the form do_falco expects
    jobs = [
        [pre_trim_reads, pre_name, falco_binary],
        [post_trim_reads + ".gz", post_name, falco_binary],
    ]

    print("Generating QC reports.")

    workers = multiprocessing.Pool(min(2, threads))
    workers.map(do_falco, jobs)
    workers.close()
445
+
446
def do_seqtk(read_tool):
    '''
    Subsample a read file with seqtk and return the path of the subsample.

    read_tool is a 2-item sequence: [sample, seqtk_path]. The subsample (100000 reads
    requested) is written to a temporary file that is NOT deleted automatically;
    the caller is responsible for removing it.
    '''
    sample = read_tool[0]
    seqtk_path = read_tool[1]

    print("Subsampling:", sample)

    #-s 100 specifies seed as 100. The number chosen is arbitrary, and it is only specified so that results are deterministic and reproducible.
    command = [seqtk_path, "sample", "-s", "100", sample, "100000"]

    #seqtk writes straight into the temp file, so the subsample is never held in memory,
    #and the handle is closed even if launching seqtk fails.
    with tempfile.NamedTemporaryFile("w", delete=False) as temp:
        subprocess.run(command, stdout=temp)
        name = temp.name

    return name
465
+
466
+ #Subsample reads; identify adapters with FaQCs
467
def adapter_identification_pe(artificial_artifacts, seqtk_binary, faqcs_binary, forward = "", reverse = "", threads = 1, output = ".", minimum_presence = 0.1, prefix = "", phred_fmt = "33"):
    '''
    Subsample paired-end reads and identify the adapters present using FaQCs.

    Both read files are subsampled with seqtk, FaQCs is run in --qc_only mode on the
    subsamples with artificial_artifacts as its adapter file, and the resulting
    stats report in the output directory is parsed.

    Returns a list of adapter IDs whose reported value is >= minimum_presence.
    The subsample temp files and the FaQCs PDF report are removed; the stats
    report itself is left in place.
    '''
    #seqtk forward and reverse
    seqtk_commands = [[forward, seqtk_binary], [reverse, seqtk_binary]]

    pool = multiprocessing.Pool(min(2, threads))
    seqtk_samples = pool.map(do_seqtk, seqtk_commands)
    pool.close()

    #An empty prefix simply yields the bare name, so no special case is needed
    run_name = prefix + "Subsample_Adapter_Detection"
    pdf_name = run_name + "_qc_report.pdf"

    #FaQCs PE with adapter file
    faqcs_subset_command = [faqcs_binary, "-t", str(threads), "--qc_only", "-d", output, "--artifactFile", artificial_artifacts, "--ascii", phred_fmt]
    faqcs_subset_command += ["--prefix", run_name]
    faqcs_subset_command += ["-1", seqtk_samples[0]]
    faqcs_subset_command += ["-2", seqtk_samples[1]]

    #Compiled once; a raw string avoids the invalid "\d" escape of a plain literal
    number = re.compile(r"\d+\.\d+")

    detected_adapters = {}
    try:
        print("Detecting adapters now... ", end = "")
        subprocess.run(faqcs_subset_command)

        os.remove(output + "/" + pdf_name)

        #Adapter detection from output of FaQCs
        begin_assessment = False
        with open(output + "/" + run_name + ".stats.txt") as detection_report:
            for line in detection_report:
                line = line.strip()
                if not begin_assessment:
                    #Everything up to and including this header line is skipped
                    if line.startswith("Reads with Adapters/Primers:"):
                        begin_assessment = True
                    continue
                #First field is used as the adapter ID, the first decimal number in the
                #fourth field as its presence (presumably a percentage - verify against FaQCs output)
                segment = line.split()
                detected_adapters[segment[0]] = float(number.findall(segment[3])[0])
    finally:
        #Cleans up after itself, including when FaQCs or the parsing failed.
        for item in seqtk_samples:
            os.remove(item)

    clean_detection = [adapter for adapter in detected_adapters if detected_adapters[adapter] >= minimum_presence]

    print("Detection done!")

    #Return the IDs of the adapters considered present
    return clean_detection
536
+
537
+ #Subsample reads; identify adapters with FaQCs
538
def adapter_identification_se(artificial_artifacts, seqtk_binary, faqcs_binary, unpaired = "", threads = 1, output = ".", minimum_presence = 0.1, prefix = "", phred_fmt = "33"):
    '''
    Subsample single-end reads and identify the adapters present using FaQCs.

    The read file is subsampled with seqtk, FaQCs is run in --qc_only mode on the
    subsample with artificial_artifacts as its adapter file, and the resulting
    stats report in the output directory is parsed.

    Returns a list of adapter IDs whose reported value is >= minimum_presence.
    The subsample temp file and the FaQCs PDF report are removed; the stats
    report itself is left in place.
    '''
    #seqtk on the single read file; the result is the path of the subsample
    seqtk_samples = do_seqtk([unpaired, seqtk_binary])

    #An empty prefix simply yields the bare name, so no special case is needed
    run_name = prefix + "Subsample_Adapter_Detection"
    pdf_name = run_name + "_qc_report.pdf"

    #FaQCs SE with adapter file
    faqcs_subset_command = [faqcs_binary, "-t", str(threads), "--qc_only", "-d", output, "--artifactFile", artificial_artifacts, "--ascii", phred_fmt]
    faqcs_subset_command += ["--prefix", run_name]
    faqcs_subset_command += ["-u", seqtk_samples]

    #Compiled once; a raw string avoids the invalid "\d" escape of a plain literal
    number = re.compile(r"\d+\.\d+")

    detected_adapters = {}
    try:
        print("Detecting adapters now... ", end = "")
        subprocess.run(faqcs_subset_command)

        os.remove(output + "/" + pdf_name)

        #Adapter detection from output of FaQCs
        begin_assessment = False
        with open(output + "/" + run_name + ".stats.txt") as detection_report:
            for line in detection_report:
                line = line.strip()
                if not begin_assessment:
                    #Everything up to and including this header line is skipped
                    if line.startswith("Reads with Adapters/Primers:"):
                        begin_assessment = True
                    continue
                #First field is used as the adapter ID, the first decimal number in the
                #fourth field as its presence (presumably a percentage - verify against FaQCs output)
                segment = line.split()
                detected_adapters[segment[0]] = float(number.findall(segment[3])[0])
    finally:
        #Cleans up after itself, including when FaQCs or the parsing failed.
        os.remove(seqtk_samples)

    clean_detection = [adapter for adapter in detected_adapters if detected_adapters[adapter] >= minimum_presence]

    print("Detection done!")

    #Return the IDs of the adapters considered present
    return clean_detection
596
+
597
+ #gets adapter families for later use
598
def parse_adapters(full_list, detected_adapters, output, prefix = ""):
    '''
    Write the detected adapters to <output>/<prefix>detected_adapters.fasta.

    full_list maps adapter ID to sequence; detected_adapters is just a list of the
    user's detected adapters by ID. IDs are looked up in full_list and in the FaQCs
    internal adapters (the internal sequence wins when an ID is in both). The
    collected sequences are passed through family_detection before being written.

    Returns the path of the fasta file. The file is deliberately not temporary: it
    both identifies the adapters present in a dataset and can be reused by the user.
    '''
    print("Creating specific adapters file for you.")

    builtin_adapters = faqcs_internal_adapters()

    detected_seqs = {}
    for adapter_id in detected_adapters:
        in_supplied = adapter_id in full_list
        in_builtin = adapter_id in builtin_adapters

        if in_supplied:
            print("Adapter sequence:", adapter_id, "detected.")
            detected_seqs[adapter_id] = full_list[adapter_id]
        if in_builtin:
            print("Adapter sequence:", adapter_id, "detected. This adapter is part of a non-optional internal list used by FaQCs and will be included.")
            detected_seqs[adapter_id] = builtin_adapters[adapter_id]
        #Skip adapter if it cannot be found. Should never happen, since FaQCs' adapters are always known and other seqs must come from the internal or supplied sequences file
        if not (in_supplied or in_builtin):
            print("Adapter sequence:", adapter_id, "not found in Multitrim's adapter list! It will NOT be included in trimming.")

    adapters_by_family = family_detection(detected_seqs)

    fasta_path = output + "/" + prefix + "detected_adapters.fasta"

    subset = open(fasta_path, "w")
    for header in adapters_by_family:
        print(header, file = subset)
        print(adapters_by_family[header], file = subset)
    subset.close()

    return fasta_path
633
+
634
+ #paired end version of the full trim; trims using detected adapters with FaQCs -q 27, then fastp --cut_right window 3 qual 20
635
def full_trim_pe(forward_in, reverse_in, forward_out, reverse_out, directory, adapters, threads, faqcs, fastp, score, minlen, window, window_qual, prefix, compressor, compress_level, phred_fmt = "33", advanced = False, skip_fastp = False, skip_faqcs = False):
    '''
    Paired-end trim: FaQCs on the untrimmed reads, then fastp on the FaQCs outputs.

    Either tool can be skipped. Both commands are always built, but a skipped tool's
    command is never issued. The final reads are written to forward_out/reverse_out
    and then handed to compress_results. Intermediate FaQCs files and the fastp
    json/html reports are removed. Returns None.
    '''
    #Names FaQCs gives its outputs when run with --prefix reads
    faqcs_forward = directory+"/reads.1.trimmed.fastq"
    faqcs_reverse = directory+"/reads.2.trimmed.fastq"
    faqcs_orphans = directory+"/reads.unpaired.trimmed.fastq"

    fastp_json = directory + "/" + prefix + "post_trim_fastp.json"
    fastp_html = directory + "/" + prefix + "post_trim_fastp.html"

    faqcs_command = [faqcs, "-t", str(threads), "-1", forward_in, "-2", reverse_in, "--artifactFile", adapters, "-q", str(score), "--min_L", str(minlen), "--prefix", "reads", "--trim_only", "-d", directory, "--ascii", phred_fmt]
    fastp_command = [fastp, "--thread", str(threads), "--adapter_fasta", adapters, "-l", str(minlen), "--json", fastp_json, "--html", fastp_html]

    #fastp reads the raw inputs when FaQCs is skipped, otherwise the FaQCs outputs
    if skip_faqcs:
        fastp_command += ["-i", forward_in, "-I", reverse_in]
    else:
        fastp_command += ["-i", faqcs_forward, "-I", faqcs_reverse]

    #Outputs are the same regardless of inputs
    fastp_command += ["-o", forward_out, "-O", reverse_out]

    if int(window) > 0:
        fastp_command += ["--cut_right", "--cut_right_window_size", str(window), "--cut_right_mean_quality", str(window_qual)]

    if phred_fmt != "33":
        fastp_command.append("--phred64")

    if advanced:
        fastp_command += ["--trim_poly_g", "--low_complexity_filter"]

    time_format = "%d/%m/%Y %H:%M:%S"

    #Manage issuing of commands
    if not skip_faqcs:
        print("Trimming with FaQCs. Started at:", datetime.now().strftime(time_format))
        subprocess.run(faqcs_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(directory + "/" + "reads.stats.txt")

    if not skip_fastp:
        print("Trimming with Fastp. Started at:", datetime.now().strftime(time_format))
        subprocess.run(fastp_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(fastp_json)
        os.remove(fastp_html)

    if skip_fastp:
        #FaQCs results are final: drop the unpaired reads (not wanted for paired end) and rename the rest
        os.remove(faqcs_orphans)
        shutil.move(faqcs_forward, forward_out)
        shutil.move(faqcs_reverse, reverse_out)
    elif not skip_faqcs:
        #fastp results are final: every FaQCs intermediate goes, the unpaired reads included
        os.remove(faqcs_forward)
        os.remove(faqcs_reverse)
        os.remove(faqcs_orphans)

    compress_results([forward_out, reverse_out], threads, compressor, compress_level)

    return None
725
+
726
+ #single end version of the full trim; trims using detected adapters with FaQCs -q 27, then fastp --cut_right window 3 qual 20
727
def full_trim_se(reads_in, reads_out, directory, adapters, threads, faqcs, fastp, score, minlen, window, window_qual, prefix, compressor, compress_level, phred_fmt = "33", advanced = False, skip_fastp = False, skip_faqcs = False):
    '''
    Single-end trim: FaQCs on the untrimmed reads, then fastp on the FaQCs output.

    Either tool can be skipped. Both commands are always built, but a skipped tool's
    command is never issued. The final reads are written to reads_out and then handed
    to compress_results. The intermediate FaQCs file and the fastp json/html reports
    are removed. Returns None.
    '''
    #Name FaQCs gives its output when run with --prefix reads
    faqcs_reads = directory+"/reads.unpaired.trimmed.fastq"

    fastp_json = directory + "/" + prefix + "post_trim_fastp.json"
    fastp_html = directory + "/" + prefix + "post_trim_fastp.html"

    faqcs_command = [faqcs, "-t", str(threads), "-u", reads_in, "--artifactFile", adapters, "-q", str(score), "--min_L", str(minlen), "--prefix", "reads", "--trim_only", "-d", directory, "--ascii", phred_fmt]
    fastp_command = [fastp, "--thread", str(threads), "--adapter_fasta", adapters, "-l", str(minlen), "--json", fastp_json, "--html", fastp_html]

    #fastp reads the raw input when FaQCs is skipped, otherwise the FaQCs output
    fastp_command += ["-i", reads_in if skip_faqcs else faqcs_reads]

    #Output is the same regardless of input
    fastp_command += ["-o", reads_out]

    if int(window) > 0:
        fastp_command += ["--cut_right", "--cut_right_window_size", str(window), "--cut_right_mean_quality", str(window_qual)]

    if phred_fmt != "33":
        fastp_command.append("--phred64")

    if advanced:
        fastp_command += ["--trim_poly_g", "--low_complexity_filter"]

    time_format = "%d/%m/%Y %H:%M:%S"

    #Manage issuing of commands
    if not skip_faqcs:
        print("Trimming with FaQCs. Started at:", datetime.now().strftime(time_format))
        subprocess.run(faqcs_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(directory + "/" + "reads.stats.txt")

    if not skip_fastp:
        print("Trimming with Fastp. Started at:", datetime.now().strftime(time_format))
        subprocess.run(fastp_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        os.remove(fastp_json)
        os.remove(fastp_html)

    if skip_fastp:
        #FaQCs result is final: give it the proper name
        shutil.move(faqcs_reads, reads_out)
    elif not skip_faqcs:
        #fastp result is final: the FaQCs intermediate goes
        os.remove(faqcs_reads)

    compress_results([reads_out], threads, compressor, compress_level)

    return None
795
+
796
+ #compress results using selected compressor.
797
def compress_results(output_files, threads, compressor, level):
    '''
    Compress the trimmed reads with the selected compressor and report progress.

    compressor must be one of "gzip", "pigz" or "dsrc"; any other value raises
    ValueError. Only "gzip" and "pigz" actually compress anything - the DSRC
    branch is disabled until dsrc_compress_module is finished. Returns None.
    '''
    known = ("gzip", "pigz", "dsrc")
    display_names = ("GZIP", "PIGZ", "DSRC-2")

    #Just for printing feedback.
    pretty_compressor = display_names[known.index(compressor)]

    started = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print("Beginning compression of trimmed reads using", pretty_compressor, "at:", started)

    #These get the file sizes, runtimes, compression ratio
    if compressor == "gzip":
        gzip_compress_module(output_files, threads, level)

    if compressor == "pigz":
        pigz_compress_module(output_files, threads, level)

    #Disabled until DSRC support is complete:
    #if compressor == "dsrc":
    #    dsrc_compress_module(output_files, threads, level)

    print("Outputs compressed!")

    return None
823
+
824
+ #gzip is NOT threaded, so we open up to 4 threads and compress each input simultaneously, feeding the results to falco as we go.
825
def gzip_compress_module(outputs, threads, level):
    '''
    Compress every file in outputs with gzip, one worker process per file.

    gzip itself is not threaded, so parallelism comes from compressing several
    files at once; no more workers are opened than there are files. Returns None.
    '''
    #One ready-to-run gzip command per file; -f forces overwrite, the digit is the level
    gzip_arguments = [["gzip", "-f"+str(level), path] for path in outputs]

    workers = multiprocessing.Pool(min(threads, len(outputs)))
    workers.map(do_gzip_pretty, gzip_arguments)
    workers.close()

    return None
839
+
840
+ #The particular parallelization for this is a bother.
841
def do_gzip_pretty(compress_argument):
    '''
    Run one gzip command and print how long it took and how much it saved.

    compress_argument is the full command list; its third item is the file being
    compressed, and the result is expected at that name plus ".gz". Returns None.
    '''
    target = compress_argument[2]

    began = datetime.now()
    size_before = os.path.getsize(target)

    subprocess.call(compress_argument)

    size_after = os.path.getsize(target+".gz")
    finished = datetime.now()

    pretty_print_file_size(target, size_before, size_after, began, finished)

    return None
854
+
855
+ #pigz is threaded, so compression happens 1 file at a time using all threads, then falco QC 4 using the gzip approach above since the result is in gzip format
856
def pigz_compress_module(outputs, threads, level):
    '''
    Compress every file in outputs with pigz, one file at a time.

    pigz is threaded, so each file gets all threads. Timing and size statistics
    are printed per file; the result is expected at the file name plus ".gz".
    Returns None.
    '''
    for target in outputs:
        began = datetime.now()
        size_before = os.path.getsize(target)

        subprocess.call(["pigz", "-f", "-"+str(level), "-p", str(threads), target])

        size_after = os.path.getsize(target+".gz")
        finished = datetime.now()

        pretty_print_file_size(target, size_before, size_after, began, finished)

    return None
870
+
871
+ #Unfinished, Has more moving parts to take care of.
872
+ #DSRC-2 is threaded, but the compressed format is not supported by falco. Thus, we run QC, THEN compress each file 1 at a time using all threads.
873
def dsrc_compress_module(inputs, outputs, threads, level):
    '''
    Compress every file in outputs with DSRC-2, one file at a time using all threads.

    Still unfinished: the falco QC step announced in the printed messages is not
    implemented here, and inputs is accepted but not used yet.
    NOTE(review): the files to compress are assumed to be outputs (the trimmed
    reads) - confirm once DSRC support is wired into compress_results.

    Each file is written to <file>.dsrc and the original is left in place.
    '''
    print("DSRC-2 will also produce QC reports at this time!")

    #DSRC only accepts up to 64 threads
    thread_count = min(int(threads), 64)

    #DSRC-formatted args; built separately so the numeric thread count stays usable
    thread_flag = "-t"+str(thread_count)
    level_flag = "-m"+str(level)

    #TODO: falco goes here for DSRC-2, must be run on the uncompressed files.

    for file in outputs:
        output_file_name = file+".dsrc"

        start_time = datetime.now()
        initial_size = os.path.getsize(file)

        compress_command = ["dsrc", "c", thread_flag, level_flag, file, output_file_name]
        subprocess.run(compress_command)

        ending_size = os.path.getsize(output_file_name)
        end_time = datetime.now()

        pretty_print_file_size(file, initial_size, ending_size, start_time, end_time)

    print("Compression and QC complete!")
903
+
904
+ #Unfinished.
905
+ #Function for checking if an input file is a DSRC archive - these have to be decompressed for trimming, since the tools don't directly support such archives.
906
def check_is_dsrc(file):
    '''
    Check whether file is a DSRC archive by trying to decompress it.

    Returns (is_dsrc, path). When decompression produced a file, path is the
    decompressed copy inside a fresh temp dir, which the caller is responsible
    for removing. Otherwise path is file itself and the temp dir is removed.
    '''
    #We're going to make a file in a temporary directory and use it
    base_name = os.path.basename(file)
    loc = tempfile.mkdtemp()
    tempout = loc + "/" + base_name
    is_dsrc = False

    #Attempt to DSRC decompress into the temp file
    try:
        dsrc_decomp = ["dsrc", "d", file, tempout]
        subprocess.run(dsrc_decomp, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
        #DSRC only creates the file if it's successful in opening the file and DSRC can be called in the first place.
        #isfile rather than exists: with an empty base name, tempout is the temp dir itself
        is_dsrc = os.path.isfile(tempout)
    except (OSError, subprocess.SubprocessError):
        #Typically the dsrc tool is absent or not executable; treat as "not a DSRC archive"
        is_dsrc = False

    if not is_dsrc:
        #Nothing was decompressed, so the temp dir has no further use
        shutil.rmtree(loc, ignore_errors=True)
        return False, file

    return True, tempout
930
+
931
+ #Convert a file's size in bytes to human-readable format.
932
def humansize(nbytes):
    '''
    Format a size in bytes with a binary (1024-based) unit, e.g. 1536 -> "1.5 KB".

    At most two decimals are shown and trailing zeros are dropped.
    '''
    suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    i = 0
    while nbytes >= 1024 and i < len(suffixes)-1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])

#Print well-formatted compression time info.
def pretty_print_file_size(name, start, end, start_time, end_time):
    '''
    Print how long compressing name took and how much smaller the file became.

    start and end are the sizes in bytes before and after compression;
    start_time and end_time are the datetimes around the compression call.
    Returns None.
    '''
    #A timedelta only stores days/seconds/microseconds, so the clock fields
    #have to be derived from the total number of seconds.
    total_seconds = int((end_time - start_time).total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    runtime = '%02d:%02d:%02d' % (hours, minutes, seconds)

    #An empty input has no meaningful ratio; avoid dividing by zero
    if start > 0:
        percent = str(round((end/start)*100, 2))
    else:
        percent = "n/a"

    print(name, "compressed! Compression took:", runtime, "and the file was compressed to", percent, "percent of original size from", humansize(start), "to", humansize(end))

    return None
965
+
966
+ #Stolen from a SO thread on how to issue usage information on an error.
967
class MyParser(argparse.ArgumentParser):
    '''ArgumentParser that follows a usage error with the full help text.'''

    def error(self, message):
        '''Report message on stderr, print the help, then exit with status 2.'''
        print('error: %s' % message, file=sys.stderr)
        self.print_help()
        raise SystemExit(2)
972
+
973
+ #Option parsing
974
def gather_opts():
    '''
    Define Multitrim's command line options and parse sys.argv.

    Returns (parser, options): the parser is handed back as well so the caller can
    print usage on demand. No option declares a type, so values given on the command
    line arrive as strings while some defaults (threads, min_adapt_pres, level) are numbers.
    '''
    #argparse re-wraps the description, so only the wording matters here.
    parser = MyParser(description=(
        "This program is designed to facilitate effective trimming of your reads. "
        "It will help to identify the presence of adapters in your reads, trim those adapters and the reads efficiently, "
        "and produce several before and after quality reports in addition to the trimmed reads. This is a pipeline incorporating "
        "FaQCs, falco, and seqtk commands, in addition to several python operations which exist to facilitate adapter finding and "
        "subsetting. You should usually use --max to run with all available processors."))
    #Use all available cores.
    parser.add_argument("--max", dest = "Sheev", action = 'store_true', help = "Attempts to detect and use all available processors for threading.")
    #Or this many threads.
    parser.add_argument("--threads", "-t", dest = "threads", default = 1, help = "Number of threads to use for parallel processes. Default 1")

    #file inputs
    parser.add_argument("--forward", "-1", dest = "f", default = "", help = "Forward Strand Reads (use -u for unpaired reads)")
    parser.add_argument("--reverse", "-2", dest = "r", default = "", help = "Reverse Strand Reads (use -u for unpaired reads)")
    parser.add_argument("--unpaired", "-u", dest = "u", default = "", help = "Unpaired Reads")

    #final out directory
    parser.add_argument("--output", "-o", dest = "outdir", default = ".", help = "Directory to send final outputs.")
    #naming convention
    parser.add_argument("--prefix", "-p", dest = "pref", default = "", help = "Prefix to place on outputs.")

    #Adapter detection opts
    parser.add_argument("--min_adapt_pres", "-m", dest = "minpres", default = 0.1, help = "Minimum presence of an adapter for it to be considered present in a set of reads. Default 0.1, so an adapter is considered present if detected in 0.1 percent of reads.")

    parser.add_argument("--adapters", "-a", dest = "adapter_fasta", default = "internal", help = "Supply a custom set of adapters for adapter detection. Detected adapters can come only from this set. Multitrim uses the MiGA adapter set by default.")
    #parser.add_argument("--kits", dest = "adapter_families", default = "internal", help = "Supply a 2-column, comma separated list of adapter IDs and kits of origin. When an adapter is detected, all adapters in the same seq. prep kit are also considered detected when using the default MiGA adapters.")

    #Shared options
    parser.add_argument("--min_L", "-l", dest = "length", default = "50", help = "Minimum read length. Default 50 base pairs.")
    parser.add_argument("--phred_fmt", dest = "phred", default = "33", help = "Phred q score format (default 33)")
    parser.add_argument("--advanced", dest = "advanced", action = 'store_true', help = "Apply advanced trimming options (poly-G tail, low-complexity). Only useful for reads sequenced with 2-dye chemistry.")

    #FaQCs opts
    parser.add_argument("--score", "-s", dest= "score", default = "27", help = "FaQCs quality target. Default 27")
    parser.add_argument("--skip_faqcs", dest = "skip_fq", action = 'store_true', help = "Do not trim with FaQCs (use fastp only). Cannot skip both.")

    #fastp opts
    parser.add_argument("--window", "-w", dest = "mid", default = "3", help = "Trimmomatic-like sliding window. Default 3.")
    parser.add_argument("--window_qual", "-q", dest = "mid_q", default = "20", help = "Trim quality cutoff for trimmomatic window. Default 20.")
    parser.add_argument("--skip_fastp", dest = "skip_fp", action = 'store_true', help = "Do not trim with fastp (use FaQCs only). Cannot skip both.")

    #Tool locations are not configurable for now:
    #parser.add_argument("--falco", dest = "falco_path", default = "falco", help = "Location of Falco QC binary.")
    #parser.add_argument("--seqtk", dest = "seqtk_path", default = "seqtk", help = "Location of SeqTK binary.")
    #parser.add_argument("--faqcs", dest = "faqcs_path", default = "FaQCs", help = "Location of FaQCs binary.")
    #parser.add_argument("--fastp", dest = "fastp_path", default = "fastp", help = "Location of fastp binary.")

    #Update with DSRC later
    #parser.add_argument("--zip", dest = "compressor", default = "gzip", help = "Select a compressor for outputs. Supported options are: 'gzip' (default), 'pigz', 'dsrc'")
    #parser.add_argument("--level", dest = "zip_level", default = -1, help = "Choose a compression level for outputs. gzip, pigz take values 1-9 with default 6. DSRC takes 0-2 with default 0. Higher values are slower but compress better.")

    parser.add_argument("--zip", dest = "compressor", default = "gzip", help = "Select a compressor for outputs. Supported options are: 'gzip' (default), 'pigz'")
    #-1 default value used for automating pigz/gzip vs dsrc selection
    parser.add_argument("--level", dest = "zip_level", default = -1, help = "Choose a compression level for outputs. gzip and pigz take values 1-9 with default 6")

    parser.add_argument("--resources", dest = "resource_list", action = 'store_true', help = "Print a list of resources used by Multitrim and quit.")

    return(parser, parser.parse_args())
1033
+
1034
def print_resources():
    """Print the links to Multitrim and its companion tools, plus the FaQCs mandatory adapters.

    Output goes to stdout, one item per line. The adapters come from
    faqcs_internal_adapters() and are printed as "<identifier> <value>".
    """
    print("Multitrim github: https://github.com/KGerhardt/multitrim")
    print("MiGA adapters available at: https://github.com/bio-miga/miga/blob/main/utils/adapters.fa")

    #Fetched only after the first two links are printed, matching the listing order.
    mandatory_adapters = faqcs_internal_adapters()
    print("FaQCs mandatory adapters are:")
    for adapter_name in mandatory_adapters:
        print(adapter_name, mandatory_adapters[adapter_name])

    tool_links = (
        "FaQCs github: https://github.com/LANL-Bioinformatics/FaQCs",
        "fastp github: https://github.com/OpenGene/fastp",
        "Falco github: https://github.com/smithlabcode/falco",
    )
    for link in tool_links:
        print(link)
1044
+
1045
+ #Program Control
1046
def main():
    """Run Multitrim from the command line.

    Parses the options with gather_opts(), validates them, detects adapters,
    trims the reads (paired-end when -u is empty, single-end otherwise) and
    finally runs the Falco QC step.

    Exit status:
        0 when --resources or the bare usage message is requested.
        1 when the options are invalid (no reads, missing files, unsupported
          compressor or compression level, missing adapters file, output
          directory that cannot be created, or both trimmers skipped).

    All validation happens before the output directory is created, so a run
    that is going to abort does not leave an empty directory behind.
    """
    #Keep the parser on hand so usage can be printed as needed.
    help_message, options = gather_opts()

    if options.resource_list:
        print_resources()
        sys.exit(0)

    #Allows for the script to take no inputs and print help/usage
    if len(sys.argv) == 1:
        help_message.print_help(sys.stderr)
        sys.exit(0)

    skip_fq = options.skip_fq
    skip_fp = options.skip_fp

    if skip_fp and skip_fq:
        print("Cannot skip both trimming tools. This would result in no trim at all. Exiting program.")
        sys.exit(1)

    #file name prefix; always ends with "_" when non-empty so it can be prepended directly
    prefix = str(options.pref)
    if prefix != "" and not prefix.endswith("_"):
        prefix = prefix + "_"

    #Tool names
    fp = "fastp"
    fq = "FaQCs"
    stk = "seqtk"
    that_aint_falco = "falco"

    #Get the reads
    f = options.f
    r = options.r
    u = options.u

    #phred format
    phred = str(options.phred)

    #num threads
    threads = int(options.threads)

    #os.sched_getaffinity is not provided on every platform; None marks "could not detect".
    try:
        available_cores = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):
        available_cores = None

    if options.Sheev:
        #--max flag: use all the cores the process is allowed to run on.
        if available_cores is None:
            print("Cannot detect how many cores are available! Defaulting to 1. Use --threads to specify more cores if you see this message.")
            threads = 1
        else:
            threads = available_cores
    elif available_cores is not None:
        #Ensure a user doesn't request more procs than available.
        threads = min(threads, available_cores)

    #No reads shorter than minlen
    minlen = str(options.length)

    #advanced trimming opts
    #FaQCs:
    #	currently no advanced opts
    #Fastp:
    #	--trim_poly_g
    #	--low_complexity_filter
    advanced = options.advanced

    #These options control the trimming behavior for fastp, correspond to sliding window width and avg. quality min.
    mid = str(options.mid)
    mid_q = str(options.mid_q)

    #faqcs target score. Lower = less aggressive, higher = more aggressive
    score = str(options.score)

    #directory to place results. Creates if needed, but won't create multiple dirs.
    final_output = options.outdir

    #Autocomplete may include the slash, but I don't want it
    if final_output.endswith("/"):
        final_output = final_output[:-1]

    #Check to make sure it actually has data, or exits
    if f == "" and r == "" and u == "":
        print("I need to be given reads! Use -1 and -2 for paired-end reads, or -u for unpaired reads. Exiting program.")
        sys.exit(1)

    #Check to make sure that a forward read is paired with a reverse read if either is supplied
    if (f == "") != (r == ""):
        print("If you have paired reads, I need both the forward and reverse files. If you just want to process one, use -u to specify it. Exiting program.")
        sys.exit(1)

    #fastp cannot take both unpaired and paired simultaneously
    if u != "" and (r != "" or f != ""):
        print("Paired and unpaired reads cannot be processed in the same run. Use -1 and -2 for paired-end reads, or -u for unpaired reads, but not both. Exiting program.")
        sys.exit(1)

    #Determine single or paired end mode
    paired_end = (u == "")

    if paired_end:
        #Report every missing file before exiting, not just the first one.
        quit_out = False
        if not os.path.exists(f):
            print("Forward Reads: " + f + " not found. Multitrim will exit.")
            quit_out = True
        if not os.path.exists(r):
            print("Reverse Reads: " + r + " not found. Multitrim will exit.")
            quit_out = True

        if quit_out:
            sys.exit(1)
    else:
        if not os.path.exists(u):
            print("Unpaired Reads: " + u + " not found. Multitrim will exit.")
            sys.exit(1)

    compressor = options.compressor
    #DSRC is not supported yet; when it is, it will need its own level range.
    if compressor not in ['gzip', 'pigz']:
        print("Chosen compressor '"+compressor+"' not supported! Supported options are: 'gzip', 'pigz'")
        sys.exit(1)

    try:
        compression_level = int(options.zip_level)
    except ValueError:
        print("Compression level", options.zip_level, "not acceptable! For GZIP and PIGZ, supported compression levels are 1-9!")
        sys.exit(1)

    #-1 is the "not specified" default; gzip and pigz then use level 6.
    if compression_level == -1:
        compression_level = 6

    if not 1 <= compression_level <= 9:
        print("Compression level", compression_level, "not acceptable! For GZIP and PIGZ, supported compression levels are 1-9!")
        sys.exit(1)

    #Check for input adapter file or default to internal list.
    input_adapters = options.adapter_fasta
    if input_adapters != "internal" and not os.path.exists(input_adapters):
        print("Adapters file", input_adapters, "not found! Exiting.")
        sys.exit(1)

    #Adapters detected if minpres% of reads have that specific adapter present according to FaQCs stats.
    minpres = float(options.minpres)

    #Check if a directory is specified and which doesn't exist; try to create if needed or exit gracefully.
    #This has to happen last, or it risks making the directory when the program is otherwise going to quit.
    if final_output != "." and not os.path.exists(final_output):
        try:
            os.mkdir(final_output)
        except OSError:
            print("Multitrim wasn't able to find or create the specified output directory. Exiting program.")
            sys.exit(1)

    #Reads user adapters or creates all adapters file from scratch
    adapter_set, complete_adapter_file_name, needs_cleanup = read_adapters(input_adapters)

    #User feedback
    if final_output == ".":
        print("Placing results in:", os.getcwd())
    else:
        print("Placing results in:", final_output)

    if options.Sheev:
        print("Using all available cores. Number of cores:", threads)
    else:
        print("Working with", threads, "threads.")

    if paired_end:
        #two inputs; paired end behavior
        print("Primary Strand Reads:", f, "\nReverse Strand Reads:", r)
        #User feedback
        print("Adapters considered detected if present in "+ str(minpres) + " % of reads.")

        pre_qc_f, pre_qc_r, post_qc_f, post_qc_r, post_trim_f, post_trim_r = names_pe(f, r, final_output, prefix)

        adapters_detected = adapter_identification_pe(complete_adapter_file_name, stk, fq, f, r, threads, final_output, minpres, prefix, phred)
        cleaned_adapters = parse_adapters(adapter_set, adapters_detected, final_output, prefix)

        if needs_cleanup:
            print("Removing automatically generated adapters...")
            os.remove(complete_adapter_file_name)

        full_trim_pe(f, r, post_trim_f, post_trim_r, final_output, cleaned_adapters, threads, fq, fp, score, minlen, mid, mid_q, prefix, compressor, compression_level, phred, advanced, skip_fp, skip_fq)

        #Always true today; kept so the QC step is skipped if DSRC output is ever enabled.
        if compressor not in ["dsrc"]:
            falco_qc_pe(f, r, post_trim_f, post_trim_r, pre_qc_f, post_qc_f, pre_qc_r, post_qc_r, threads, that_aint_falco)

    else:
        #one input; SE behavior
        print("Unpaired Reads:", u)
        #User feedback
        print("Adapters considered detected if present in "+ str(minpres) + " % of reads.")

        pre_qc, post_qc, post_trim = names_se(u, final_output, prefix)

        adapters_detected = adapter_identification_se(complete_adapter_file_name, stk, fq, u, threads, final_output, minpres, prefix, phred)
        cleaned_adapters = parse_adapters(adapter_set, adapters_detected, final_output, prefix)

        if needs_cleanup:
            print("Removing automatically generated adapters...")
            os.remove(complete_adapter_file_name)

        full_trim_se(u, post_trim, final_output, cleaned_adapters, threads, fq, fp, score, minlen, mid, mid_q, prefix, compressor, compression_level, phred, advanced, skip_fp, skip_fq)

        #Always true today; kept so the QC step is skipped if DSRC output is ever enabled.
        if compressor not in ["dsrc"]:
            falco_qc_se(u, post_trim, pre_qc, post_qc, threads, that_aint_falco)

    print("Trimming complete.")

#Run main() only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
1283
+
1284
+ #End of functional components of Multitrim.
1285
+
1286
+ #Below are leftover creation functions that could be used to update the list of adapters. Not used in the program proper.
1287
+
1288
+ #Regenerate adapters file output from a fasta. This is a utility function I do not expect to see used in the final product.
1289
def fasta_to_permanent_python(original_adapters_fasta):
    """Print Python assignments that rebuild the adapters dictionary from a FASTA file.

    One line is printed per record, in file order, in the form
    adapters_dict["<header>"] = "<sequence>"
    where <header> is the full header line (including the leading ">") and
    <sequence> is the concatenation of the record's sequence lines.

    Blank lines are skipped instead of being treated as end of file, and
    sequence lines appearing before the first header are ignored. When a
    header is repeated, the last occurrence wins.

    Parameters:
        original_adapters_fasta: path of the FASTA file to read.
    """
    fasta_seq_dict = {}
    current_id = None

    #The with-block closes the file even if reading fails part-way through.
    with open(original_adapters_fasta, "r") as fasta:
        for raw_line in fasta:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(">"):
                current_id = line
                fasta_seq_dict[current_id] = ""
            elif current_id is not None:
                fasta_seq_dict[current_id] += line

    for contig in fasta_seq_dict:
        print("adapters_dict[\""+contig+"\"] = \""+ fasta_seq_dict[contig]+"\"")
1318
+
1319
+ #These spit out spoofed python code for the in-built creation of an adapters file to supply tools, without the external need for this file.
1320
+ #I just copy-paste the results to the generate_adapters_temporary_file function's body to get the results.
1321
+
1322
+ #As above, this prints out a python-correct set of commands for me to copy-paste.
1323
+ #This one produces a set of "families" for adapters, where each is the set of adapters in a kit.
1324
def fasta_to_families(original_adapters_fasta):
    """Print Python assignments mapping each FASTA record to its adapter family (kit).

    Families are assigned purely by position: the first 5 headers belong to
    "singleend", the next 6 to "pairedend", and so on following the table
    below. One line is printed per distinct header, in the form
    adapters_fam_dict["<header without '>'>"] = "<family>"

    Parameters:
        original_adapters_fasta: path of the FASTA file to read.

    Raises:
        IndexError: if the file holds more headers than the table covers.
    """
    #(family name, number of consecutive records belonging to it)
    family_sizes = (
        ("singleend", 5),
        ("pairedend", 6),
        ("dpnII", 10),
        ("smallrna", 6),
        ("multiplex", 7),
        ("pcr", 12),
        ("dpnIIgex", 14),
        ("otherrna", 3),
        ("trueseq", 13),
        ("rnapcr", 50),
        ("abi", 9),
        ("trueseq2", 5),
        ("nextera", 6),
    )

    #Expand the table into one family label per record position.
    famlist = [family for family, size in family_sizes for _ in range(size)]

    fasta_fam_dict = {}

    #The with-block closes the file even when the lookup below raises.
    with open(original_adapters_fasta, "r") as fasta:
        position = 0
        for line in fasta:
            header = line.strip()
            if header.startswith(">"):
                fasta_fam_dict[header[1:]] = famlist[position]
                position += 1

    for contig in fasta_fam_dict:
        print("adapters_fam_dict[\""+contig+"\"] = \""+ fasta_fam_dict[contig]+"\"")
1366
+
1367
+ #identifies the same adapters as in the full file with a family of origin, so that all adapters in a family can be selected.
1368
def create_adapter_families():
    """Return a dict mapping every known adapter identifier to its family (kit) name.

    The identifiers are listed kit by kit in the table below; the returned
    dict keeps that order. Spellings such as "Apapter" and "Paried" are part
    of the identifiers and are kept as they are.
    """
    def numbered(stem, last):
        #Identifiers stem1 .. stem<last>, in ascending order.
        return [stem + str(number) for number in range(1, last + 1)]

    kits = [
        ("singleend", [
            "Illumina_Single_End_Apapter_1",
            "Illumina_Single_End_Apapter_2",
            "Illumina_Single_End_PCR_Primer_1",
            "Illumina_Single_End_PCR_Primer_2",
            "Illumina_Single_End_Sequencing_Primer",
        ]),
        ("pairedend", [
            "Illumina_Paired_End_Adapter_1",
            "Illumina_Paired_End_Adapter_2",
            "Illumina_Paried_End_PCR_Primer_1",
            "Illumina_Paired_End_PCR_Primer_2",
            "Illumina_Paried_End_Sequencing_Primer_1",
            "Illumina_Paired_End_Sequencing_Primer_2",
        ]),
        ("dpnII", [
            "Illumina_DpnII_expression_Adapter_1",
            "Illumina_DpnII_expression_Adapter_2",
            "Illumina_DpnII_expression_PCR_Primer_1",
            "Illumina_DpnII_expression_PCR_Primer_2",
            "Illumina_DpnII_expression_Sequencing_Primer",
            "Illumina_NlaIII_expression_Adapter_1",
            "Illumina_NlaIII_expression_Adapter_2",
            "Illumina_NlaIII_expression_PCR_Primer_1",
            "Illumina_NlaIII_expression_PCR_Primer_2",
            "Illumina_NlaIII_expression_Sequencing_Primer",
        ]),
        ("smallrna", [
            "Illumina_Small_RNA_Adapter_1",
            "Illumina_Small_RNA_Adapter_2",
            "Illumina_Small_RNA_RT_Primer",
            "Illumina_Small_RNA_PCR_Primer_1",
            "Illumina_Small_RNA_PCR_Primer_2",
            "Illumina_Small_RNA_Sequencing_Primer",
        ]),
        ("multiplex", [
            "Illumina_Multiplexing_Adapter_1",
            "Illumina_Multiplexing_Adapter_2",
            "Illumina_Multiplexing_PCR_Primer_1.01",
            "Illumina_Multiplexing_PCR_Primer_2.01",
            "Illumina_Multiplexing_Read1_Sequencing_Primer",
            "Illumina_Multiplexing_Index_Sequencing_Primer",
            "Illumina_Multiplexing_Read2_Sequencing_Primer",
        ]),
        ("pcr", numbered("Illumina_PCR_Primer_Index_", 12)),
        ("dpnIIgex", [
            "Illumina_DpnII_Gex_Adapter_1",
            "Illumina_DpnII_Gex_Adapter_1.01",
            "Illumina_DpnII_Gex_Adapter_2",
            "Illumina_DpnII_Gex_Adapter_2.01",
            "Illumina_DpnII_Gex_PCR_Primer_1",
            "Illumina_DpnII_Gex_PCR_Primer_2",
            "Illumina_DpnII_Gex_Sequencing_Primer",
            "Illumina_NlaIII_Gex_Adapter_1.01",
            "Illumina_NlaIII_Gex_Adapter_1.02",
            "Illumina_NlaIII_Gex_Adapter_2.01",
            "Illumina_NlaIII_Gex_Adapter_2.02",
            "Illumina_NlaIII_Gex_PCR_Primer_1",
            "Illumina_NlaIII_Gex_PCR_Primer_2",
            "Illumina_NlaIII_Gex_Sequencing_Primer",
        ]),
        ("otherrna", [
            "Illumina_5p_RNA_Adapter",
            "Illumina_RNA_Adapter1",
            "Illumina_Small_RNA_3p_Adapter_1",
        ]),
        ("trueseq", ["TruSeq_Universal_Adapter"] + numbered("TruSeq_Adapter_Index_", 12)),
        ("rnapcr", [
            "Illumina_RNA_RT_Primer",
            "Illumina_RNA_PCR_Primer",
        ] + numbered("RNA_PCR_Primer_Index_", 48)),
        ("abi", [
            "ABI_Dynabead_EcoP_Oligo",
            "ABI_Solid3_Adapter_A",
            "ABI_Solid3_Adapter_B",
            "ABI_Solid3_5_AMP_Primer",
            "ABI_Solid3_3_AMP_Primer",
            "ABI_Solid3_EF1_alpha_Sense_Primer",
            "ABI_Solid3_EF1_alpha_Antisense_Primer",
            "ABI_Solid3_GAPDH_Forward_Primer",
            "ABI_Solid3_GAPDH_Reverse_Primer",
        ]),
        ("trueseq2", [
            "TruSeq2_SE",
            "TruSeq2_PE_f",
            "TruSeq2_PE_r",
            "TruSeq3_IndexedAdapter",
            "TruSeq3_UniversalAdapter",
        ]),
        ("nextera", [
            "Nextera_PE_PrefixNX/1",
            "Nextera_PE_PrefixNX/2",
            "Nextera_PE_Trans1",
            "Nextera_PE_Trans1_rc",
            "Nextera_PE_Trans2",
            "Nextera_PE_Trans2_rc",
        ]),
    ]

    adapters_fam_dict = {}
    for family, adapter_ids in kits:
        for adapter_id in adapter_ids:
            adapters_fam_dict[adapter_id] = family

    return(adapters_fam_dict)
1519
+
1520
+ #creates python code for family:id:sequence dictionary for kit detection code. Relies on the current state of the generate_adapters_temporary_file function.
1521
def create_seq_to_fam():
    """Print Python code for the family -> adapter id -> sequence dictionary used by kit detection.

    Relies on the current state of generate_adapters_temporary_file(), whose
    first return value is indexed by header (the leading character of each
    header is dropped to obtain the adapter id) and whose second return value
    is passed to os.remove(), so it is presumably the path of a file that
    function created.

    One line is printed per family, in the form
    fam_to_id_to_seq['<family>'] = {<adapter id>: <sequence>, ...}

    Raises:
        KeyError: if an adapter id has no entry in create_adapter_families().
    """
    adapter_seqs, name = generate_adapters_temporary_file()

    #Remove the generated file even when a lookup below fails.
    try:
        fams = create_adapter_families()

        fam_to_id_to_seq = {}

        for header in adapter_seqs:
            adapter_id = header[1:]
            family = fams[adapter_id]
            fam_to_id_to_seq.setdefault(family, {})[adapter_id] = adapter_seqs[header]

        for fam in fam_to_id_to_seq:
            print("fam_to_id_to_seq['"+fam+"'] =", fam_to_id_to_seq[fam])
    finally:
        os.remove(name)