miga-base 0.7.26.0 → 0.7.26.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +1 -1
  3. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  4. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  5. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  6. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  7. data/utils/FastAAI/README.md +84 -0
  8. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  9. data/utils/enveomics/Docs/recplot2.md +244 -0
  10. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  11. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  12. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  13. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  14. data/utils/enveomics/LICENSE.txt +73 -0
  15. data/utils/enveomics/Makefile +52 -0
  16. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  17. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  18. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  19. data/utils/enveomics/Manifest/Tasks/fasta.json +766 -0
  20. data/utils/enveomics/Manifest/Tasks/fastq.json +243 -0
  21. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  22. data/utils/enveomics/Manifest/Tasks/mapping.json +67 -0
  23. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  24. data/utils/enveomics/Manifest/Tasks/other.json +829 -0
  25. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  26. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +501 -0
  27. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  28. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  29. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  30. data/utils/enveomics/Manifest/categories.json +156 -0
  31. data/utils/enveomics/Manifest/examples.json +154 -0
  32. data/utils/enveomics/Manifest/tasks.json +4 -0
  33. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  34. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  35. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  36. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  37. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  38. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  39. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  40. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  41. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  42. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  43. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  44. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  45. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  46. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  47. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  48. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  49. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  50. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  51. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  52. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  53. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  54. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  55. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  56. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  57. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  58. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  59. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  60. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  61. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  62. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  63. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  64. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  65. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  66. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  67. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  68. data/utils/enveomics/README.md +42 -0
  69. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  70. data/utils/enveomics/Scripts/Aln.cat.rb +163 -0
  71. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  72. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  73. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  74. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  75. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  76. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  77. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  78. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  79. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  80. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  81. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  82. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  83. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  84. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  85. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  86. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  87. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  88. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  89. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  90. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  91. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  92. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  93. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  94. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  95. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  96. data/utils/enveomics/Scripts/FastA.N50.pl +56 -0
  97. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  98. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  99. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  100. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  101. data/utils/enveomics/Scripts/FastA.fragment.rb +92 -0
  102. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  103. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  104. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  105. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  106. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  107. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  108. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  109. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  110. data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
  111. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  112. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  113. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  114. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  115. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  116. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  117. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  118. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  119. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  120. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  121. data/utils/enveomics/Scripts/FastQ.tag.rb +63 -0
  122. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  123. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  124. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  125. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  126. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  127. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  128. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  129. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  130. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  131. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  132. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  133. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  134. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  135. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  136. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  137. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  138. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  139. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  140. data/utils/enveomics/Scripts/SRA.download.bash +57 -0
  141. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  142. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  143. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  144. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  145. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  146. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  147. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  148. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  149. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  150. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  151. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  152. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  153. data/utils/enveomics/Scripts/aai.rb +418 -0
  154. data/utils/enveomics/Scripts/ani.rb +362 -0
  155. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  156. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  157. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  158. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  159. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  160. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  161. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  162. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  163. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  164. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  165. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  166. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +30 -0
  167. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  168. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  169. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  170. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  171. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  172. data/utils/enveomics/Scripts/ogs.rb +104 -0
  173. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  174. data/utils/enveomics/Scripts/rbm.rb +146 -0
  175. data/utils/enveomics/Tests/Makefile +10 -0
  176. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  177. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  178. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  179. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  180. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  181. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  182. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  183. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  184. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  185. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  186. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  187. data/utils/enveomics/Tests/alkB.nwk +1 -0
  188. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  189. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  190. data/utils/enveomics/Tests/hiv1.faa +59 -0
  191. data/utils/enveomics/Tests/hiv1.fna +134 -0
  192. data/utils/enveomics/Tests/hiv2.faa +70 -0
  193. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  194. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  195. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  196. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  197. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  198. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  199. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  200. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  201. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  202. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  203. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  204. data/utils/enveomics/build_enveomics_r.bash +45 -0
  205. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  206. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  207. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  208. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  209. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  210. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  211. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  212. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  213. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  214. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  215. data/utils/enveomics/enveomics.R/R/utils.R +50 -0
  216. data/utils/enveomics/enveomics.R/README.md +80 -0
  217. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  218. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  219. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  220. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  221. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  222. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  223. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  224. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  225. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  226. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  227. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  228. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -0
  229. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -0
  230. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -0
  231. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  232. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  233. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -0
  234. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -0
  235. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -0
  236. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -0
  237. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -0
  238. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -0
  239. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  240. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  241. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -0
  242. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  243. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  244. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  245. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  246. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -0
  247. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  248. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  249. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -0
  250. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  251. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  252. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  253. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  254. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -0
  255. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  256. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -0
  257. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +37 -0
  258. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -0
  259. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  260. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  261. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -0
  262. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -0
  263. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  264. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  265. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  266. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  267. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -0
  268. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -0
  269. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -0
  270. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -0
  271. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  272. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  273. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  274. data/utils/enveomics/globals.mk +8 -0
  275. data/utils/enveomics/manifest.json +9 -0
  276. metadata +277 -4
@@ -0,0 +1,147 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ $:.push File.expand_path("../lib", __FILE__)
7
+ require "enveomics_rb/enveomics"
8
+ require "enveomics_rb/vcf"
9
+
10
# Command-line options, filled in by the parser below.
#   :file       -- input VCF (mandatory)
#   :seqs       -- gene sequences in FastA (mandatory)
#   :syn_frx    -- fraction used to estimate sites instead of counting them
#   :codon_file -- optional per-variant codon report
o = {}
OptionParser.new do |opt|
  # Leading spaces of each banner line are removed by the trailing gsub, so the
  # text can be indented together with the code.
  opt.banner = "
    Estimates the Ka/Ks ratio from the SNPs in a VCF file. Ka and Ks are corrected
    using pseudo-counts, but no corrections for multiple substitutions are
    applied.

    Usage: #{$0} [options]".gsub(/^ +/,"")
  opt.separator ""
  opt.separator "Mandatory"
  opt.on("-i", "--input FILE",
    "Input file in Variant Call Format (VCF)."){ |v| o[:file] = v}
  opt.on("-s", "--seqs FILE",
    "Input gene sequences (nucleotides) in FastA format."){ |v| o[:seqs] = v}
  opt.separator ""
  opt.separator "Parameters"
  # NOTE: the Ka/Ks step of this script multiplies this value by the gene
  # length to obtain the *non-synonymous* sites (and its complement for the
  # synonymous ones), so the help text describes it that way.
  opt.on("-f", "--syn-frx FLOAT",
    "Fraction of non-synonymous substitutions (despite the option name). If",
    "passed, the number of sites is estimated (not counted per gene), speeding",
    "up the computation ~10X."
    ){ |v| o[:syn_frx] = v.to_f }
  # 0.760417 = 1 - 138/576: the share of single-nucleotide substitutions in
  # the 64-codon table that change the encoded residue.
  opt.on("-b", "--syn-bacterial-code",
    "Sets --syn-frx to 0.760417, approximately the proportion of",
    "non-synonymous substitutions in the bacterial code."
    ){ o[:syn_frx] = 0.760417 }
  opt.separator ""
  opt.separator "Miscellaneous"
  opt.on("-c", "--codon-file FILE",
    "Output file including the codons of substitution variants."
    ){ |v| o[:codon_file] = v }
  opt.on("-h", "--help", "Display this screen.") do
    puts opt
    exit
  end
  opt.separator ""
end.parse!

# Both inputs are required; stop early with a readable message otherwise.
abort "--input is mandatory" if o[:file].nil?
abort "--seqs is mandatory" if o[:seqs].nil?
47
+
48
# Codon table (11. The Bacterial, Archaeal and Plant Plastid Code)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG11
#
# The five strings are column-aligned: position i of Base1/Base2/Base3 spells
# one codon, and position i of AAs/Starts gives its residue and start flag.
t = {
  AAs:    "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
  Starts: "---M------**--*----M------------MMMM---------------M------------",
  Base1:  "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
  Base2:  "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
  Base3:  "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
}
# Global lookups used by syn?: codon => residue, and codon => start marker.
$codon_aa = {}
$codon_st = {}
codons = t[:Base1].chars.zip(t[:Base2].chars, t[:Base3].chars).map(&:join)
codons.each_with_index do |codon, idx|
  $codon_aa[codon] = t[:AAs][idx]
  $codon_st[codon] = t[:Starts][idx]
end
64
+
65
##
# Is the change from +cod+ to +cod_alt+ synonymous? +start_codon+ indicates
# whether the codon is the first one in the gene, in which case the start
# markers ($codon_st) are compared instead of the residues ($codon_aa).
#
# Codons are upcased before the lookup because the tables only hold uppercase
# keys; without this, any lowercase (e.g. soft-masked) codon would look up as
# nil on both sides and always be reported as synonymous.
#
# Codons absent from the tables (e.g. containing N) still look up as nil, so
# two such codons compare as equal.
def syn?(cod, cod_alt, start_codon=false)
  table = start_codon ? $codon_st : $codon_aa
  table[cod.to_s.upcase] == table[cod_alt.to_s.upcase]
end
73
+
74
##
# Estimates the fraction of substitutions at position +pos+ (1-based) of the
# sequence +seq+ that are synonymous, considering each of the nucleotides in
# +alts+ as the replacement.
#
# When the global $codon_fh is set, one tab-separated line is written to it
# with: number of synonymous alternatives, reference codon, and the
# comma-separated alternative codons.
#
# Returns a Float (synonymous alternatives / number of alternatives); the
# result is NaN when +alts+ is empty.
def syn_fraction(seq, pos, alts)
  cod_let = (pos - 1) % 3        # offset of the position within its codon
  cod_pos = (pos - 1) - cod_let  # 0-based index of the codon's first base
  cod = seq[cod_pos, 3]
  # Build one alternative codon per allele, leaving the reference untouched.
  cod_alts = alts.map do |alt|
    cod.dup.tap { |cod_alt| cod_alt[cod_let] = alt }
  end
  # Positions 1-3 belong to the first codon, which is compared as a start.
  syn = cod_alts.count { |cod_alt| syn?(cod, cod_alt, pos <= 3) }
  $codon_fh.puts [syn, cod, cod_alts.join(",")].join("\t") unless $codon_fh.nil?
  syn.to_f / alts.size
end
92
+
93
# Read sequences
# Loads the FastA file given by --seqs into a Hash of ID => sequence, where the
# ID is the first whitespace-delimited word after ">" and the sequence keeps
# only ASCII letters (line breaks, digits and gaps are dropped).
seqs = {}
File.open(o[:seqs], "r") do |fh|
  id = nil
  fh.each_line do |ln|
    if ln =~ /^>(\S+)/
      id = $1
      seqs[id] = String.new
    else
      residues = ln.gsub(/[^A-Za-z]/, "")
      # Blank (or letter-free) lines carry no sequence data.
      next if residues.empty?
      # Letters before any header cannot be assigned to a sequence; fail with a
      # readable message instead of a NoMethodError on nil.
      abort "Sequence data found before the first FastA header: #{o[:seqs]}" if
        id.nil?
      # Append in place rather than rebuilding the string on every line.
      seqs[id] << residues
    end
  end
end
106
+
107
# Process variants
# $codon_fh is the optional codon report read by syn_fraction; it is only open
# while the variants are traversed, so the per-site counting further below does
# not write to it.
$codon_fh = nil
unless o[:codon_file].nil?
  $codon_fh = File.open(o[:codon_file], "w")
  $codon_fh.puts "#" + %w[Syn Ref Alt].join("\t")
end
vcf = VCF.new(o[:file])
# gen maps each sequence ID to [non-synonymous subs, synonymous subs].
gen = {}
begin
  vcf.each_variant do |v|
    next if v.indel?
    seq = seqs[v.chrom]
    # Fail with a readable message when the VCF names an unknown sequence.
    raise "Sequence not found in --seqs: #{v.chrom}" if seq.nil?
    raise "REF doesn't match VCF:\n#{v}" unless seq[v.pos-1] == v.ref
    gen[v.chrom] ||= [0.0, 0.0]
    syn = syn_fraction(seq, v.pos, v.alt.split(","))
    gen[v.chrom][0] += 1.0-syn
    gen[v.chrom][1] += syn
  end
ensure
  # Close the codon report even if a variant raised.
  $codon_fh.close unless $codon_fh.nil?
  $codon_fh = nil
end

# Ka/Ks
puts "#" +
  "SeqID KaKs Ka Ks NonSynSubs SynSubs NonSynSites SynSites".tr(" ","\t")
gen.each do |k,v|
  # v grows from [nonsyn subs, syn subs] to
  # [nonsyn subs, syn subs, nonsyn sites, syn sites].
  if o[:syn_frx].nil?
    # Count sites: try the three other nucleotides at every position.
    v[2] = 0.0
    v[3] = 0.0
    (1 .. seqs[k].size).each do |pos|
      alts = %w(A C T G) - [seqs[k][pos-1]]
      syn = syn_fraction(seqs[k], pos, alts)
      v[2] += 1.0-syn
      v[3] += syn
    end
  else
    # Estimate sites: note that o[:syn_frx] scales the non-synonymous column
    # (v[2]) and its complement scales the synonymous one (v[3]).
    v[2] = seqs[k].size.to_f*o[:syn_frx]
    v[3] = seqs[k].size.to_f*(1.0-o[:syn_frx])
  end
  # Pseudo-counts: +1 on substitutions, +2 on sites.
  ka = (v[0] + 1) / (v[2] + 2)
  ks = (v[1] + 1) / (v[3] + 2)
  puts ([k, ka/ks, ka, ks] + v).join("\t")
end
147
+
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ $:.push File.expand_path(File.dirname(__FILE__) + "/lib")
7
+ require "enveomics_rb/enveomics"
8
+ require "enveomics_rb/vcf"
9
+
10
# Default filtering thresholds; each can be overridden from the command line.
o = {min_dp:4, max_dp:Float::INFINITY, min_ref_dp:2, min_alt_dp:2, min_qual:0.0,
  indels:false, min_ic:0.0}
OptionParser.new do |opt|
  # Leading spaces of each banner line are removed by the trailing gsub.
  opt.banner = "
    Counts the number of Single-Nucleotide Polymorphisms (SNPs) in a VCF file.

    Usage: #{$0} [options]".gsub(/^ +/,"")
  opt.separator ""
  opt.separator "Mandatory"
  opt.on("-i", "--input FILE",
    "Input file in Variant Call Format (VCF)."){ |v| o[:file] = v}
  opt.separator ""
  opt.separator "Parameters"
  opt.on("-o", "--out FILE",
    "Output (filtered) file in Variant Call Format (VCF)."){ |v| o[:out] = v}
  opt.on("-m", "--min-dp INT",
    "Minimum number of reads covering the position. By default: #{o[:min_dp]}."
    ){ |v| o[:min_dp] = v.to_i }
  # The literal word "Infinity" (as printed in the help) disables the cap.
  opt.on("-M", "--max-dp INT",
    "Maximum number of reads covering the position. By default: #{o[:max_dp]}."
    ){ |v| o[:max_dp] = (v=="Infinity" ? Float::INFINITY : v.to_i) }
  opt.on("-r", "--min-ref-dp INT",
    "Minimum number of reads supporting allele REF. " +
    "By default: #{o[:min_ref_dp]}."
    ){ |v| o[:min_ref_dp] = v.to_i }
  opt.on("-a", "--min-alt-dp INT",
    "Minimum number of reads supporting allele ALT. " +
    "By default: #{o[:min_alt_dp]}."
    ){ |v| o[:min_alt_dp] = v.to_i }
  # Stores into :min_qual (the key tested by the quality filter); writing it to
  # :max_dp would silently turn -q into a maximum-depth cap.
  opt.on("-q", "--min-quality FLOAT",
    "Minimum quality of the position mapping. By default: #{o[:min_qual]}."
    ){ |v| o[:min_qual] = v.to_f }
  opt.on("-s", "--min-shannon FLOAT",
    "Minimum information content (in bits, from 0 to 1). " +
    "By default: #{o[:min_ic]}"){ |v| o[:min_ic] = v.to_f }
  opt.on("--[no-]indels",
    "Process (or ignore) indels. By default: ignore."
    ){ |v| o[:indels] = v }
  opt.on("-h", "--help", "Display this screen.") do
    puts opt
    exit
  end
  opt.separator ""
end.parse!

abort "--input is mandatory" if o[:file].nil?
56
+
57
# Traverse the VCF, keep the variants passing every threshold, optionally copy
# them (preceded by the original header lines) to --out, and print a summary.
vcf = VCF.new(o[:file])
c = 0       # variants kept
dp = 0      # summed depth of kept variants
ref_dp = 0  # summed REF-allele depth
alt_dp = 0  # summed ALT-allele depth
h = 0       # summed information content
ofh = nil
unless o[:out].nil?
  ofh = File.open(o[:out], "w")
  # The block parameter is deliberately not called `h`, so it cannot be
  # confused with the accumulator above.
  vcf.each_header{ |header| ofh.print header }
end
begin
  vcf.each_variant do |v|
    next if v.indel? and not o[:indels]
    next if v.dp < o[:min_dp]
    next if v.dp > o[:max_dp]
    next if v.ref_dp < o[:min_ref_dp]
    next if v.alt_dp < o[:min_alt_dp]
    next if v.qual < o[:min_qual]
    next if v.shannon < o[:min_ic]
    c += 1
    dp += v.dp
    ref_dp += v.ref_dp
    alt_dp += v.alt_dp
    h += v.shannon
    ofh.print v.to_s unless ofh.nil?
  end
ensure
  # Close the filtered VCF even if a variant raised.
  ofh.close unless ofh.nil?
end

# The averages are NaN when no variant passed the filters (0.0/0).
puts "SNPs: #{c}", "Information content: #{h}",
  "Average SNP depth: #{dp.to_f/c}",
  "Average REF allele depth: #{ref_dp.to_f/c}",
  "Average ALT allele depth: #{alt_dp.to_f/c}"
88
+
@@ -0,0 +1,418 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R
4
+ # @license Artistic-2.0
5
+
6
+ require 'optparse'
7
+ require 'tmpdir'
8
+ require 'zlib'
9
+ has_rest_client = true
10
+ has_sqlite3 = true
11
+ begin
12
+ require 'rubygems'
13
+ require 'restclient'
14
+ rescue LoadError
15
+ has_rest_client = false
16
+ end
17
+ begin
18
+ require 'sqlite3'
19
+ rescue LoadError
20
+ has_sqlite3 = false
21
+ end
22
+
23
# Defaults for every option; the parser below overrides them in place.
o = {
  bits: 0,
  id: 20,
  len: 0,
  hits: 50,
  q: false,
  bin: '',
  program: 'blast+',
  thr: 1,
  dec: 2,
  auto: false,
  lookupfirst: false,
  dbrbm: true,
  nucl: false,
  len_fraction: 0.0,
  max_actg: 0.95
}
# Without arguments, behave as if help had been requested.
ARGV.push('-h') if ARGV.empty?
OptionParser.new do |opts|
  # The banner text is kept flush-left: it is printed verbatim.
  opts.banner = "
Calculates the Average Amino Acid Identity between two genomes

Usage: #{$0} [options]"

  opts.separator ''
  opts.separator 'Mandatory'
  opts.on(
    '-1', '--seq1 FILE',
    'Path to the FastA file (.gz allowed) containing the genome 1 (proteins)'
  ) { |val| o[:seq1] = val }
  opts.on(
    '-2', '--seq2 FILE',
    'Path to the FastA file (.gz allowed) containing the genome 2 (proteins)'
  ) { |val| o[:seq2] = val }
  # The hint shown depends on whether rest-client could be loaded.
  ncbi_note =
    if has_rest_client
      ' Alternatively, you can supply the NCBI-acc of a ' \
        'genome (nucleotides) with the format ncbi:CP014272 instead of files'
    else
      ' Install rest-client to enable NCBI-acc support'
    end
  opts.separator ncbi_note

  opts.separator ''
  opts.separator 'Search Options'
  opts.on(
    '-l', '--len INT', Integer,
    "Minimum alignment length (in residues). By default: #{o[:len]}"
  ) { |val| o[:len] = val }
  opts.on(
    '-L', '--len-fraction NUM', Float,
    'Minimum alignment length as a fraction of the shorter sequence',
    "(range 0-1). By default: #{o[:len_fraction]}"
  ) { |val| o[:len_fraction] = val }
  opts.on(
    '-i', '--id FLOAT', Float,
    "Minimum alignment identity (in %). By default: #{o[:id]}"
  ) { |val| o[:id] = val }
  opts.on(
    '-s', '--bitscore FLOAT', Float,
    "Minimum bit score (in bits). By default: #{o[:bits]}"
  ) { |val| o[:bits] = val }
  opts.on(
    '-n', '--hits INT', Integer,
    "Minimum number of hits. By default: #{o[:hits]}"
  ) { |val| o[:hits] = val }
  opts.on(
    '-N', '--nucl',
    'The input sequences are nucleotides (genes), not proteins'
  ) { |val| o[:nucl] = val }
  opts.on(
    '--max-actg FLOAT', Float,
    'Maximum fraction of ACTGN in the sequences before assuming nucleotides',
    "By default: #{o[:max_actg]}"
  ) { |val| o[:max_actg] = val }

  opts.separator ''
  opts.separator 'Software Options'
  opts.on(
    '-b', '--bin DIR',
    'Path to the directory containing the binaries of the search program'
  ) { |val| o[:bin] = val }
  opts.on(
    '-p', '--program STR',
    'Search program to be used. One of: blast+ (default), blast, blat, diamond'
  ) { |val| o[:program] = val }
  opts.on(
    '-t', '--threads INT', Integer,
    "Number of parallel threads to be used. By default: #{o[:thr]}"
  ) { |val| o[:thr] = val }

  opts.separator ''
  opts.separator 'SQLite3 Options'
  # Only shown when the sqlite3 gem could not be loaded.
  opts.separator ' Install sqlite3 gem to enable database support' if !has_sqlite3
  opts.on(
    '-S', '--sqlite3 FILE',
    'Path to the SQLite3 database to create (or update) with the results'
  ) { |val| o[:sqlite3] = val }
  opts.on(
    '--name1 STR',
    'Name of --seq1 to use in --sqlite3. By default determined by filename'
  ) { |val| o[:seq1name] = val }
  opts.on(
    '--name2 STR',
    'Name of --seq2 to use in --sqlite3. By default determined by filename'
  ) { |val| o[:seq2name] = val }
  opts.on(
    '--[no-]save-rbm',
    'Save (or don\'t save) the reciprocal best matches in the --sqlite3 db',
    "By default: #{o[:dbrbm]}"
  ) { |val| o[:dbrbm] = val }
  opts.on(
    '--lookup-first',
    'Indicates if the AAI should be looked up first in the database',
    'Requires --sqlite3, --auto, --name1, and --name2',
    'Incompatible with --res, --tab, --out, and --rbm'
  ) { |val| o[:lookupfirst] = val }

  opts.separator ''
  opts.separator 'Other Output Options'
  opts.on(
    '-d', '--dec INT', Integer,
    "Decimal positions to report. By default: #{o[:dec]}"
  ) { |val| o[:dec] = val }
  opts.on(
    '-R', '--rbm FILE',
    'Saves a file with the reciprocal best matches'
  ) { |val| o[:rbm] = val }
  opts.on(
    '-o', '--out FILE',
    'Saves a file describing the alignments used for two-way AAI'
  ) { |val| o[:out] = val }
  opts.on(
    '-r', '--res FILE',
    'Saves a file with the final results'
  ) { |val| o[:res] = val }
  opts.on(
    '-T', '--tab FILE',
    'Saves a file with the final two-way results in a tab-delimited form',
    'The columns are (in that order):',
    'AAI, standard deviation, proteins used, proteins in the smallest genome'
  ) { |val| o[:tab] = val }
  opts.on(
    '-a', '--auto',
    'ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)'
  ) { o[:auto] = true }
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') do
    o[:q] = true
  end
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  opts.separator ''
end.parse!
158
+
159
# Check input
# Each failed check terminates the script through abort; the checks run in the
# order listed.
{ seq1: '-1 is mandatory', seq2: '-2 is mandatory' }.each do |key, msg|
  abort msg if o[key].nil?
end
abort '-p diamond is incompatible with -N' if o[:program] == 'diamond' && o[:nucl]
if !o[:sqlite3].nil? && !has_sqlite3
  abort 'SQLite3 requested (-S) but sqlite3 not supported: gem install sqlite3'
end
# A non-empty binaries directory gets a trailing slash appended.
o[:bin] += '/' unless o[:bin].empty?
if o[:lookupfirst]
  # Options that --lookup-first depends on.
  [
    [o[:seq1name].nil?, '--lookup-first requires --name1'],
    [o[:seq2name].nil?, '--lookup-first requires --name2'],
    [o[:sqlite3].nil?,  '--lookup-first needs --sqlite3'],
    [!o[:auto],         '--lookup-first requires --auto']
  ].each { |failed, msg| abort msg if failed }
  # Output options that cannot be combined with --lookup-first.
  %i[res tab out rbm].each do |key|
    abort "--lookup-first conflicts with --#{key}" unless o[key].nil?
  end
end
178
+
179
# Create SQLite3 file: open (or create) the database and make sure both the
# `rbm` table (one row per reciprocal best match) and the `aai` table (one row
# per compared pair) exist.
unless o[:sqlite3].nil?
  $stderr.puts "Accessing SQLite3 file: #{o[:sqlite3]}." unless o[:q]
  sqlite_db = SQLite3::Database.new(o[:sqlite3])
  [
    ['rbm', 'seq1 varchar(256), seq2 varchar(256), id1 varchar(256), ' \
            'id2 varchar(256), id float, evalue float, bitscore float'],
    ['aai', 'seq1 varchar(256), seq2 varchar(256), aai float, sd float, ' \
            'n int, omega int']
  ].each do |table, columns|
    sqlite_db.execute("create table if not exists #{table}( #{columns} )")
  end
end
189
+
190
# Look-up first: if the AAI of this pair is already stored (in either
# orientation), print the stored value and stop without recomputing.
if o[:lookupfirst]
  lookup_sql = "select aai from aai where seq1=? and seq2=?"
  name_pair = [o[:seq1name], o[:seq2name]]
  val = sqlite_db.execute(lookup_sql, name_pair)
  # Nothing stored as (name1, name2): try the pair as (name2, name1)
  val = sqlite_db.execute(lookup_sql, name_pair.reverse) if val.empty?
  unless val.empty?
    puts val.first.first
    exit
  end
end
201
+
202
# Main computation. Everything intermediate (renumbered copies of both inputs,
# search databases, and raw search tables) is written under the directory
# yielded by Dir.mktmpdir.
#
# Steps:
# 1. Copy each input to "#{dir}/seqN.fa" with sequences renamed 1, 2, 3, ...
#    while recording lengths and original IDs, then format it as a database
#    for the selected search program.
# 2. Search each dataset against the other, keep the first acceptable hit per
#    query, and report the one-way AAI of each direction.
# 3. Report the two-way AAI from the hits that are reciprocal best matches.
Dir.mktmpdir do |dir|
  $stderr.puts "Temporal directory: #{dir}." unless o[:q]

  # Create databases.
  $stderr.puts "Creating databases." unless o[:q]
  minfrg = nil   # Smallest number of sequences found in either input
  seq_names = [] # Dataset names, in the order [seq1, seq2]
  seq_len = {}   # seq_len[key][i]: letters in the i-th sequence (1-based)
  actg_cnt = {}  # actg_cnt[key]: number of A/C/T/G/N letters in the input
  ori_ids = {}   # ori_ids[key][i]: original ID of the i-th sequence (1-based)
  [:seq1, :seq2].each do |seq|
    abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead." if
      /^gi:/.match(o[seq])
    acc = /^ncbi:(\S+)/.match(o[seq])
    unless acc.nil?
      # The input is an NCBI accession ("ncbi:ACC"): download its proteins
      abort "NCBI-acc requested, but rest-client not supported. First " +
        "install gem rest-client." unless has_rest_client
      abort "NCBI-acc are currently not supported with --nucl. Please use " +
        "ani.rb instead." if o[:nucl]
      $stderr.puts " Downloading dataset from NCBI:#{acc[1]}." unless o[:q]
      # ELink: protein records linked to the nucleotide accession
      responseLink = RestClient.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
        {params:{db:"protein",dbfrom:"nuccore",id:acc[1],idtype:"acc"}})
      abort "Unable to reach NCBI EUtils, error code " +
        responseLink.code.to_s + "." unless responseLink.code == 200
      fromId = true
      protIds = []
      o[seq] = "#{dir}/ncbi-#{seq.to_s}.fa"
      fo = File.open(o[seq], "w")
      # The first <Id> element is skipped and all later ones are collected
      # (presumably the first one echoes the queried record -- confirm against
      # the ELink response format)
      responseLink.to_str.each_line.grep(/\s<Id>/) do |ln|
        idMatch = /<Id>(\S+)<\/Id>/.match(ln)
        unless idMatch.nil?
          protIds.push(idMatch[1]) unless fromId
          fromId = false
        end
      end
      # EFetch: the collected IDs come from the protein database (db:"protein"
      # in the ELink call above), so they must be fetched from that database
      response = RestClient.post(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        db:"protein",rettype:"fasta",id:protIds.join(","),idtype:"acc")
      abort "Unable to reach NCBI EUtils, error code " +
        response.code.to_s + "." unless response.code == 200
      fo.puts response.to_str
      fo.close
      seq_names << ( o[ "#{seq}name".to_sym ].nil? ?
        "ncbi:#{acc[1]}" :
        o[ "#{seq}name".to_sym ])
    else
      # Local file: the name defaults to the file name without its extension
      seq_names << ( o[ "#{seq}name".to_sym ].nil? ?
        File.basename(o[seq], ".*") :
        o[ "#{seq}name".to_sym ])
    end
    $stderr.puts " Reading FastA file: #{o[seq]}" unless o[:q]
    unless o[:sqlite3].nil?
      # Drop previously stored results for this pair. Note that seq_names
      # holds a single name during the first pass of this loop, so both
      # placeholders only receive a value on the second pass.
      sqlite_db.execute "delete from rbm where seq1=? and seq2=?", seq_names
      sqlite_db.execute "delete from aai where seq1=? and seq2=?", seq_names
    end
    # Index 0 is a placeholder so that sequence numbers can be used directly
    # as (1-based) array indexes
    ori_ids[seq] = [nil]
    seq_len[seq] = [0]
    actg_cnt[seq] = 0
    seqs = 0
    fi = File.extname(o[seq]) == '.gz' ?
      Zlib::GzipReader.open(o[seq]) :
      File.open(o[seq], 'r')
    File.open("#{dir}/#{seq.to_s}.fa", 'w') do |fo|
      fi.each_line do |ln|
        if ln =~ /^>(\S+)/
          # Rename each sequence with a running number; the original ID is
          # only remembered if it will be reported (--rbm or --sqlite3)
          seqs += 1
          ori_ids[seq] << $1 unless o[:rbm].nil? and o[:sqlite3].nil?
          seq_len[seq][seqs] = 0
          fo.puts ">#{seqs}"
        else
          fo.puts ln
          seq_len[seq][seqs] += ln.chomp.gsub(/[^A-Za-z]/,"").length
          actg_cnt[seq] += ln.chomp.gsub(/[^ACTGNactgn]/,"").length
        end
      end
    end
    fi.close
    unless o[:nucl]
      # Refuse inputs that look like nucleotides when proteins are expected
      actg_frx = actg_cnt[seq].to_f/seq_len[seq].inject(:+).to_f
      abort "Input sequences appear to be nucleotides " +
        "(ACTGN fraction: %.2f%%)." % (actg_frx*100) if actg_frx > o[:max_actg]
    end
    $stderr.puts " File contains #{seqs} sequences." unless o[:q]
    minfrg ||= seqs
    minfrg = seqs if minfrg > seqs
    # Format the renumbered file as a database for the selected program
    case o[:program].downcase
    when "blast"
      `"#{o[:bin]}formatdb" -i "#{dir}/#{seq}.fa" \
        -p #{o[:nucl] ? "F" : "T"}`
    when "blast+"
      `"#{o[:bin]}makeblastdb" -in "#{dir}/#{seq}.fa" \
        -dbtype #{o[:nucl] ? "nucl" : "prot"}`
    when "blat"
      # Nothing to do
    when "diamond"
      `"#{o[:bin]}diamond" makedb --in "#{dir}/#{seq}.fa" \
        --db "#{dir}/#{seq}.fa.dmnd" --threads "#{o[:thr]}" \
        --quiet`
    else
      abort "Unsupported program: #{o[:program]}."
    end
  end

  # Best-hits.
  $stderr.puts "Running one-way comparisons." unless o[:q]
  rbh = [] # rbh[a] = b: sequence a of seq1 has sequence b of seq2 as best hit
  id2 = 0  # Sum of identities over reciprocal best matches
  sq2 = 0  # Sum of squared identities over reciprocal best matches
  n2 = 0   # Number of reciprocal best matches
  unless o[:out].nil?
    fo = File.open(o[:out], "w")
    fo.puts %w(identity aln.len mismatch gap.open evalue bitscore).join("\t")
  end
  res = File.open(o[:res], "w") unless o[:res].nil?
  rbm = File.open(o[:rbm], "w") unless o[:rbm].nil?
  [1,2].each do |i|
    # i = 1 searches seq1 against seq2; i = 2 searches seq2 against seq1
    qry_seen = []
    q = "#{dir}/seq#{i}.fa"
    s = "#{dir}/seq#{i==1?2:1}.fa"
    case o[:program].downcase
    when "blast"
      `"#{o[:bin]}blastall" -p blast#{o[:nucl] ? "n": "p"} -d "#{s}" \
        -i "#{q}" -v 1 -b 1 -a #{o[:thr]} -m 8 -o "#{dir}/#{i}.tab"`
    when "blast+"
      `"#{o[:bin]}blast#{o[:nucl] ? "n" : "p"}" -db "#{s}" -query "#{q}" \
        -max_target_seqs 1 -num_threads #{o[:thr]} -outfmt 6 \
        -out "#{dir}/#{i}.tab"`
    when "blat"
      `"#{o[:bin]}blat" "#{s}" "#{q}" #{"-prot" unless o[:nucl]} -out=blast8 \
        "#{dir}/#{i}.tab.uns"`
      `sort -k 1 "#{dir}/#{i}.tab.uns" > "#{dir}/#{i}.tab"`
    when "diamond"
      `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
        --query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" --quiet \
        && "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
        --out "#{dir}/#{i}.tab" --quiet`
    else
      abort "Unsupported program: #{o[:program]}."
    end
    fh = File.open("#{dir}/#{i}.tab", "r")
    id = 0 # Sum of identities of the accepted hits in this direction
    sq = 0 # Sum of squared identities of the accepted hits in this direction
    n = 0  # Number of accepted hits in this direction
    fh.each_line do |ln|
      ln.chomp!
      # Tabular columns used below: 0 = query, 1 = subject, 2 = identity,
      # 3 = alignment length, 10 = e-value, 11 = bit score
      row = ln.split(/\t/)
      # Only the first acceptable hit of each query is used
      next unless qry_seen[ row[0].to_i ].nil?
      # Independent filters: failing any single one discards the hit
      next if row[3].to_i < o[:len]
      next if row[2].to_f < o[:id]
      next if row[11].to_f < o[:bits]
      # Alignment length as a fraction of the shorter of the two sequences
      next if row[3].to_f / [
        seq_len[i==1 ? :seq1 : :seq2][row[0].to_i],
        seq_len[i==1 ? :seq2 : :seq1][row[1].to_i]
      ].min < o[:len_fraction]
      qry_seen[ row[0].to_i ] = 1
      id += row[2].to_f
      sq += row[2].to_f ** 2
      n += 1
      if i==1
        # First direction: remember the best hit of each seq1 sequence
        rbh[ row[0].to_i ] = row[1].to_i
      else
        # Second direction: the hit is reciprocal if the subject (in seq1)
        # had this very query (in seq2) as its own best hit
        if !rbh[ row[1].to_i ].nil? and rbh[ row[1].to_i ]==row[0].to_i
          id2 += row[2].to_f
          sq2 += row[2].to_f**2
          n2 += 1
          fo.puts [row[2..5],row[10..11]].join("\t") unless o[:out].nil?
          rbm.puts [ori_ids[:seq1][row[1].to_i],
            ori_ids[:seq2][row[0].to_i], row[2..5], row[8..9],
            row[6..7], row[10..11]].join("\t") unless o[:rbm].nil?
          sqlite_db.execute("insert into rbm values(?,?,?,?,?,?,?)",
            seq_names + [ori_ids[:seq1][row[1].to_i],
            ori_ids[:seq2][row[0].to_i], row[2], row[10], row[11]]
            ) if not o[:sqlite3].nil? and o[:dbrbm]
        end
      end
    end
    fh.close
    # One-way report. The SD is computed as sqrt(mean(x^2) - mean(x)^2)
    if n < o[:hits]
      puts "Insufficient hits to estimate one-way AAI: #{n}." unless o[:auto]
      res.puts "Insufficient hits to estimate one-way AAI: #{n}" unless
        o[:res].nil?
    else
      printf "! One-way AAI %d: %.#{o[:dec]}f%% (SD: %.#{o[:dec]}f%%), " +
        "from %i proteins.\n", i, id/n, (sq/n - (id/n)**2)**0.5, n unless
        o[:auto]
      res.puts sprintf "<b>One-way AAI %d:</b> %.#{o[:dec]}f%% " +
        "(SD: %.#{o[:dec]}f%%), from %i proteins.<br/>", i, id/n,
        (sq/n - (id/n)**2)**0.5, n unless o[:res].nil?
    end
  end
  rbm.close unless o[:rbm].nil?
  # Two-way report, from the reciprocal best matches only
  if n2 < o[:hits]
    puts "Insufficient hits to estimate two-way AAI: #{n2}" unless o[:auto]
    res.puts "Insufficient hits to estimate two-way AAI: #{n2}" unless
      o[:res].nil?
  else
    printf "! Two-way AAI : %.#{o[:dec]}f%% (SD: %.#{o[:dec]}f%%), from %i" +
      " proteins.\n", id2/n2, (sq2/n2 - (id2/n2)**2)**0.5, n2 unless o[:auto]
    res.puts sprintf "<b>Two-way AAI:</b> %.#{o[:dec]}f%% (SD: " +
      "%.#{o[:dec]}f%%), from %i proteins.<br/>", id2/n2,
      (sq2/n2 - (id2/n2)**2)**0.5, n2 unless o[:res].nil?
    unless o[:tab].nil?
      # Columns: AAI, SD, proteins used, proteins in the smallest genome
      tab = File.open(o[:tab], "w")
      tab.printf "%.#{o[:dec]}f\t%.#{o[:dec]}f\t%i\t%i\n", id2/n2,
        (sq2/n2 - (id2/n2)**2)**0.5, n2, minfrg
      tab.close
    end
    sqlite_db.execute("insert into aai values(?,?,?,?,?,?)",
      seq_names + [id2/n2, (sq2/n2 - (id2/n2)**2)**0.5, n2, minfrg]) unless
      o[:sqlite3].nil?
    # With --auto, the bare two-way AAI is the only thing printed to STDOUT
    puts id2/n2 if o[:auto]
  end
  res.close unless o[:res].nil?
  fo.close unless o[:out].nil?
end
418
+