miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @update Mar-23-2016
6
+ # @license artistic license 2.0
7
+ #
8
+
9
+ require "optparse"
10
+
11
# Builds the gene => labels index from a whog file.
#
# A line of the form "[LETTERS] COGnnnn description" opens a COG entry, a line
# made only of underscores closes it, and indented lines list the gene IDs
# that belong to the entry currently open.
#
# @param path [String] path to the whog file
# @param cog [Boolean] label genes with the COG ID instead of one label per
#   category letter
# @param desc [Boolean] append the COG description to the COG ID (only has an
#   effect together with +cog+)
# @return [Hash{String=>Array<String>}] labels collected for each gene ID
def parse_whog(path, cog: false, desc: false)
  cat = {}
  labels = []
  # Block form guarantees the handle is closed, even when +abort+ is raised.
  File.open(path, "r") do |fh|
    fh.each_line do |ln|
      ln.chomp!
      next if /^\s*$/.match ln

      if (m = /^\[([A-Z]+)\] (COG\d+) (.*)/.match(ln))
        labels = cog ? [m[2] + (desc ? " #{m[3]}" : "")] : m[1].split(//)
      elsif /^_+$/.match ln
        labels = []
      elsif (m = /^\s+(?:.+?:\s+)?(.*)/.match(ln))
        m[1].split(/\s+/).each { |g| (cat[g] ||= []).concat(labels) }
      else
        abort "Impossible to parse line #{fh.lineno}: #{ln}"
      end
    end
  end
  cat
end

# Prints the tabular BLAST with the subject (second column) replaced by each
# of its labels, one output row per label.
#
# @param path [String] path to the tabular BLAST file
# @param cat [Hash{String=>Array<String>}] index returned by +parse_whog+
# @param warnings [Boolean] report subjects without labels on $stderr
# @return [void]
def annotate_blast(path, cat, warnings: true)
  File.open(path, "r") do |fh|
    fh.each_line do |ln|
      # Chomp before splitting so a subject in the last column still matches;
      # the -1 limit keeps trailing empty columns in the output.
      row = ln.chomp.split(/\t/, -1)
      labels = cat[row[1]]
      if labels.nil?
        if warnings
          $stderr.puts "Warning: line #{fh.lineno}: #{row[1]}: " \
                       "Impossible to find category."
        end
      else
        labels.each do |c|
          row[1] = c
          puts row.join("\t")
        end
      end
    end
  end
end

# :q silences progress messages, :w keeps warnings enabled.
o = {:cog=>false, :desc=>false, :q=>false, :w=>true}
ARGV << "-h" if ARGV.size==0
OptionParser.new do |opts|
  opts.banner = "Replaces the COG gene IDs in a BLAST for the COG category."
  opts.separator ""
  opts.separator "Mandatory"
  opts.on("-w", "--whog FILE", "Path to the whog file."){ |v| o[:whog]=v }
  opts.on("-i", "--blast FILE",
    "Path to the Tabular BLAST file with COG IDs as subject."
    ){ |v| o[:blast]=v }
  opts.separator ""
  opts.separator "Optional"
  opts.on("-g", "--cog",
    "If set, returns the COG ID, not the COG category."){ o[:cog]=true }
  opts.on("-d", "--desc",
    "Includes COG description (requires -g/--cog)."){ o[:desc]=true }
  opts.on("-n", "--noverbose", "Run quietly, but show warnings."){ o[:q]=true }
  opts.on("-q", "--quiet", "Run quietly."){ o[:q]=true; o[:w]=false }
  opts.on("-h", "--help", "Display this screen") do
    puts opts
    exit
  end
  opts.separator ""
end.parse!

abort "-w/--whog is mandatory." if o[:whog].nil?
abort "-i/--blast is mandatory." if o[:blast].nil?

$stderr.puts "Parsing whog file." unless o[:q]
cat = parse_whog(o[:whog], cog: o[:cog], desc: o[:desc])

$stderr.puts "Parsing BLAST." unless o[:q]
annotate_blast(o[:blast], cat, warnings: o[:w])
76
+
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update: Mar-23-2015
5
+ # @license: artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+ use Getopt::Std;
11
+
12
# Command-line flags: -s (list holds subject IDs), -i (invert the selection).
my %o;
getopts('si', \%o);
my ($list, $blast) = @ARGV;

($list and $blast) or die "
.Description:
Extracts a subset of hits (queries or subjects) from a tabular BLAST.

.Usage: $0 [options] list.txt blast.txt > subset.txt

Options:
-s If set, assumes that list.txt contains subject IDs.
By default: assumes query IDs.
-i If set, reports the inverse of the list (i.e., reports
only hits absent in the list).

list.txt List of IDs to extract.
blast.txt Tabular BLAST file containing the superset of hits.
subset.txt Tabular BLAST file to be created.

";

# Load the IDs into a hash for constant-time membership tests.
open my $list_fh, "<", $list or die "Cannot read file: $list: $!\n";
my %li = map { chomp; $_ => 1 } <$list_fh>;
close $list_fh;

# Column holding the ID to test: 1 = subject, 0 = query.
my $col = $o{s} ? 1 : 0;

open my $blast_fh, "<", $blast or die "Cannot read file: $blast: $!\n";
while (my $ln = <$blast_fh>) {
  chomp $ln;
  my @ln = split("\t", $ln);
  # A row lacking the tested column (e.g. a blank line) is treated as absent
  # from the list instead of triggering an uninitialized-value warning.
  my $good = (defined $ln[$col] and exists $li{ $ln[$col] });
  $good = !$good if $o{i};
  print "$ln\n" if $good;
}
close $blast_fh;
47
+
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use LWP::Simple;
6
+ use JSON;
7
+ use File::Copy;
8
+
9
# Positional arguments. Only the BLAST file is mandatory.
my ($blast, $cache_file, $max_cache) = @ARGV;
unless ($blast) {
  die "
Description:
Takes a BLAST against KEGG_PEP (or KO) and retrieves the pathways in which the subject
peptides are involved.

Usage:
$0 blast.txt[ cache_file] > output.txt

blast.txt Input (filtered) BLAST file.
cache_file (optional) File containing the saved cache. If unset, the
cache won't be recoverable across instances of this script.
It is strongly recommended to set a file. Multiple
parallel instances of this script may use the same cache
file.
output.txt Tab-delimited output file, with the columns:
o Query ID
o Subject ID
o Pathway ID
o Pathway (reference) description
o Organism

";
}

# NOTE(review): $max_cache is accepted as a third argument but is not read
# anywhere else in the visible script -- confirm whether it is still needed.
$max_cache  = 0  unless $max_cache;
# An empty cache file name disables the on-disk cache.
$cache_file = "" unless $cache_file;
35
+
36
# Loads the JSON cache saved in a file.
#
# Argument: path to the cache file (may be empty).
# Returns:  (hash reference with the cache, number of top-level keys).
#           An empty hash and 0 when the path is empty or the file is
#           missing or has zero size.
# Dies if the file cannot be opened.
sub read_cache($){
  my ($path) = @_;
  my ($cache, $n) = ({}, 0);
  return ($cache, $n) unless $path and -s $path;

  # "$path.tmp" exists while write_cache is saving; wait until it is removed.
  while (-e "$path.tmp") {
    print STDERR "Locked cache (read), waiting 1 sec.\n";
    sleep 1;
  }

  open my $fh, "<", $path or die "Cannot read file: $path: $!\n";
  # Slurp the whole file at once.
  my $json = do { local $/; join "", <$fh> };
  close $fh;

  $cache = decode_json($json);
  $n = scalar keys %$cache;
  return ($cache, $n);
}
55
+
56
# Merges the in-memory cache with the one on disk and saves the result.
#
# Arguments: hash reference with the cache, path to the cache file.
# Nothing is done when the path is empty. Entries already in memory take
# precedence over entries read back from the file. The previous file is kept
# as "<cache_file>.pre", and "<cache_file>.tmp" is used as staging file while
# saving (read_cache waits while that file exists).
# Dies if any of the files cannot be created or removed.
sub write_cache($$){
  my ($cache, $cache_file) = @_;
  return unless $cache_file;

  # Get previously saved entries.
  my ($saved) = &read_cache($cache_file);
  for my $k (keys %$saved) {
    next if $k eq "###:paths";
    $cache->{$k} ||= $saved->{$k};
  }
  $cache->{'###:paths'} ||= {};
  for my $p (keys %{ $saved->{'###:paths'} }) {
    $cache->{'###:paths'}->{$p} ||= $saved->{'###:paths'}->{$p};
  }

  # Save merged cache, backing up the current file first.
  if (-s $cache_file) {
    # The message names the backup file actually being created (.pre).
    copy($cache_file, "$cache_file.pre")
      or die "Cannot create file: $cache_file.pre: $!\n";
  }
  my $json = encode_json($cache);
  while (-e "$cache_file.tmp") {
    print STDERR "Locked cache (write), waiting 1 sec.\n";
    sleep 1;
  }
  open my $fh, ">", "$cache_file.tmp"
    or die "Cannot create file: $cache_file.tmp: $!\n";
  print $fh $json;
  close $fh;
  copy("$cache_file.tmp", $cache_file)
    or die "Cannot create file: $cache_file: $!\n";
  unlink "$cache_file.tmp" or die "Cannot unlink file: $cache_file.tmp: $!\n";
}
82
+
83
# Retrieves the description of the pathways missing from the cache.
#
# Arguments: hash reference with the cache, array reference of pathway IDs.
# Returns:   the same cache reference, with the new descriptions stored under
#            the '###:paths' key.
# IDs are requested in batches of up to 100; a batch with an empty or failed
# response is skipped without error.
sub download_pathways($$){
  my ($cache, $ids) = @_;
  my @pending = grep { not exists $cache->{'###:paths'}->{$_} } @$ids;
  while (@pending) {
    my @batch = splice(@pending, 0, 100);
    my $path = get("http://rest.kegg.jp/list/" . join("+", @batch));
    next unless $path;
    chomp $path;
    for my $entry (split /\n/, $path) {
      my @wl = split /\t/, $entry;
      # Only the first " - " becomes a tab, splitting the text in two columns.
      $wl[1] =~ s/ - /\t/;
      $cache->{'###:paths'}->{ $wl[0] } = $wl[1];
    }
  }
  return $cache;
}
103
+
104
# Retrieves the pathways linked to a set of genes and stores them in the cache.
#
# Arguments: hash reference with the cache, array reference of gene IDs.
# Returns:   the cache reference, after also fetching the descriptions of the
#            linked pathways through download_pathways.
# Dies if a response row does not have exactly two columns, or if a returned
# gene ID cannot be matched (case-insensitively) to a requested one.
sub download($$){
  my ($cache, $todownload) = @_;
  $cache->{'###:paths'} ||= {};
  return $cache unless @$todownload;

  # Every requested gene gets an entry, even if no pathway is linked to it.
  $cache->{$_} = [] for @$todownload;
  my $list = get("http://rest.kegg.jp/link/pathway/" . join("+", @$todownload)) || "";
  chomp $list;

  my @pathids = ();
  for my $res (split /\n/, $list) {
    my @rel = split /\t/, $res;
    $#rel == 1 or die "Unexpected number of columns:\n$res\n";
    my ($gene, $path_id) = @rel;
    push @pathids, $path_id;
    unless (exists $cache->{$gene}) {
      # The response may spell the ID with a different case than the request.
      for my $requested (@$todownload) {
        $gene = $requested if lc $requested eq lc $gene;
      }
      die "Cannot find corresponding request.\n" unless exists $cache->{$gene};
    }
    push @{ $cache->{$gene} }, $path_id;
  }
  return &download_pathways($cache, \@pathids);
}
129
+
130
# Prints one tab-delimited row per (hit, pathway) pair to STDOUT.
#
# Arguments: hash reference with the cache, array reference of hits (each hit
#            is an array reference: query ID first, subject ID second).
# Pathways with IDs starting with "path:ko" followed by a digit are skipped.
# A pathway without a cached description is downloaded on the spot.
# Dies if a subject is not in the cache or a pathway cannot be resolved.
sub print_out($$){
  my ($cache, $hits) = @_;
  for my $hit (@$hits) {
    my ($query, $gene) = ($hit->[0], $hit->[1]);
    exists $cache->{$gene}
      or die "Impossible to find gene in cache: ".$gene."\n";
    for my $path (@{ $cache->{$gene} }) {
      next if $path =~ /^path:ko\d/;
      unless (exists $cache->{'###:paths'}->{$path}) {
        print STDERR "Cannot find pathway in cache: $path (from ".$gene."), emergency download\n";
        $cache = &download_pathways($cache, [$path]);
        exists $cache->{'###:paths'}->{$path}
          or die "Impossible to find pathway: $path.\n";
      }
      my $row = join("\t", $query, $gene, $path, $cache->{'###:paths'}->{$path});
      print $row, "\n";
    }
  }
}
145
+
146
# Load the saved cache, if any.
print STDERR "Loading cache.\n";
my ($cache, $n) = &read_cache($cache_file);
print STDERR " $n entries loaded.\n";

# Collect the pathways referenced by cached genes that lack a description.
my @nopath = ();
for my $gene (grep { $_ ne "###:paths" } keys %$cache) {
  push @nopath, grep { not exists $cache->{'###:paths'}->{$_} } @{ $cache->{$gene} };
}
if (@nopath) {
  print STDERR " Sanitizing ".@nopath." pathways in cache.\n";
  # Repair the cache in requests of up to 15 pathways each.
  while (@nopath) {
    my @paths = splice(@nopath, 0, 15);
    $cache = &download_pathways($cache, \@paths);
  }
  &write_cache($cache, $cache_file);
}

my ($lines, $downs) = (0, 0);
my @buff = ();        # Hits read but not yet printed.
my @todownload = ();  # Subjects of the buffered hits missing from the cache.
print STDERR "Mapping genes.\n";
open my $blast_fh, "<", $blast or die "Cannot read file: $blast: $!\n";
while (my $row = <$blast_fh>) {
  chomp $row;
  my @l = split /\t/, $row;
  print STDERR " Mapping line ".(++$lines).". \r";
  # Flush once 99 subjects are pending, and save the cache every 10 flushes.
  if (($#todownload + 2) % 100 == 0) {
    print STDERR "+\r";
    print STDERR " *\r" unless ++$downs % 10;
    $cache = &download($cache, \@todownload);
    @todownload = ();
    &print_out($cache, \@buff);
    @buff = ();
    &write_cache($cache, $cache_file) unless $downs % 10;
  }
  push @buff, \@l;
  push @todownload, $l[1] unless exists $cache->{ $l[1] };
}
print STDERR "\nDone.\n";
close $blast_fh;

# Process whatever is left in the buffers.
$cache = &download($cache, \@todownload);
&print_out($cache, \@buff);
&write_cache($cache, $cache_file);
194
+
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env perl
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @update Mar-23-2015
6
+ # @license artistic license 2.0
7
+ #
8
+
9
+ use warnings;
10
+ use strict;
11
+ use Getopt::Std;
12
+
13
+
14
# Prints the usage text on STDERR and terminates the script (via die).
# Called by the script when the positional arguments are missing or when -h
# is passed. The first positional argument is labelled 'genes.txt' in both
# the synopsis and the argument list so the two agree.
sub HELP_MESSAGE { die "
Usage:
$0 [options] genes.txt blast.txt ... > blast_metaxa.txt

genes.txt File containing the genes in any supported format
(see option -f).
blast.txt ... One or more tabular BLAST files.
blast_metaxa.txt Input file for MeTaxa.

Options:
-l <float> Minimum fraction of the gene aligned to consider a
hit. By default: 0.75. Ignored if -f 'no'.
-f <str> Format of the genes prediction. Any of:
o gff2: GFF v2 as produced by MetaGeneMark.hmm.
o gff3: GFF v3 with id field in the last column.
o tab: Tabular file with columns gene, gene length,
and contig.
o no: Ignores genes file.
By default: gff2.
-q Run quietly.
-h Display this message and exit.

";}
37
+
38
# Parse command-line switches (-l, -f take values; -q, -h are flags) and the
# positional arguments: the genes file followed by one or more BLAST files.
my %o;
getopts('l:f:qh',\%o);
my($gff, @blasts) = @ARGV;
($gff and $#blasts>=0) or &HELP_MESSAGE;
$o{h} and &HELP_MESSAGE;
$o{f} ||= "gff2";
$o{f} = lc $o{f};
# Reject unknown formats right away instead of waiting for the first data
# line of the genes file (which may never come if that file is empty).
grep { $o{f} eq $_ } qw(gff2 gff3 tab no)
  or die "Unsupported format: ".$o{f}.".\n";
# Only fall back to the default when -l was not given at all: a plain `||=`
# would also override an explicit `-l 0`, because 0 is false in Perl.
$o{l} = 0.75 unless defined $o{l};
46
+
47
# Build %gene: gene ID => [contig name, gene length divided by 3], read from
# the genes file in the format selected with -f. Skipped entirely for -f no.
my %gene;
if($o{f} ne 'no'){
  print STDERR "Reading genes collection.\n" unless $o{q};
  open GFF, "<", $gff or die "Cannot read file: $gff: $!\n";
  while(<GFF>){
    # Ignore comments and blank lines.
    next if /^#/ or /^\s*$/;
    chomp;
    my @col = split /\t/;
    my ($id, $ctg, $len);
    if($o{f} eq 'gff2' or $o{f} eq 'gff3'){
      exists $col[8] or die "Cannot parse line $.: $_\n";
      if($o{f} eq 'gff2'){
        # The whole attributes column is the ID, with 'gene_id ' made
        # space-free.
        ($id = $col[8]) =~ s/gene_id /gene_id_/;
      }else{
        # NOTE(review): this matches a lowercase 'id=' only; GFF3 files that
        # use 'ID=' would not be parsed -- confirm the intended inputs.
        $col[8] =~ /id=([^;]+)/ or die "Cannot parse line $.: $_\n";
        $id = $1;
      }
      # Keep only the first word of the sequence name.
      ($ctg = $col[0]) =~ s/ .*//;
      # Columns 4 and 5 are start and end; the span is divided by 3.
      $len = (1+$col[4]-$col[3])/3;
    }elsif($o{f} eq 'tab'){
      exists $col[2] or die "Cannot parse line $.: $_\n";
      $col[1]+0 or die "$col[0]: Length zero.\n";
      ($id, $ctg, $len) = ($col[0], $col[2], $col[1]/3);
    }else{
      die "Unsupported format: ".$o{f}.".\n";
    }
    $gene{$id} = [$ctg, $len];
  }
  close GFF;
}
78
+
79
# Reformat every BLAST file: each reported line is the original columns plus
# the contig, the query ID and the GI number parsed from the subject ID.
my $i=0; # Lines read
my $p=0; # Lines reported
print STDERR "Generating MeTaxa input.\n" unless $o{q};
for my $blast (@blasts){
  print STDERR " o $blast\n" unless $o{q};
  open BLAST, "<", $blast or die "Cannot read file: $blast: $!\n";
  while(<BLAST>){
    chomp;
    my @fields = split /\t/;
    $i++;
    # Without a genes file the query itself stands in for the contig.
    my $ctg = $fields[0];
    unless($o{f} eq 'no'){
      exists $gene{$fields[0]}
        or die "Cannot find contig for gene $fields[0].\n";
      my ($gene_ctg, $gene_len) = @{ $gene{$fields[0]} };
      # Column 4 must cover at least the -l fraction of the gene length.
      next unless $fields[3] >= $o{l}*$gene_len;
      $ctg = $gene_ctg;
    }
    my ($gi) = $fields[1] =~ m/gi\|(\d+)\|/;
    defined $gi or die "Cannot parse GI in $fields[1].\n";
    print join("\t", @fields, $ctg, $fields[0], $gi), "\n";
    $p++;
  }
  close BLAST;
}
print STDERR " Found $i results, reported $p.\n" unless $o{q};
104
+
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @update: Jul-29-2015
6
+ # @license artistic license 2.0
7
+ #
8
+
9
+ require 'optparse'
10
+
11
# Defaults for every tunable; each one can be overridden on the command line.
opts = {:minscore=>0, :besthits=>0, :orient=>0, :sisprefix=>"_"}
# With no arguments at all, behave as if -h had been passed.
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opt|
  opt.separator "Identifies the best hits of paired-reads."
  opt.separator ""
  opt.on("-i", "--blast FILE", "Input BLAST file."){ |v| opts[:blast]=v }
  opt.on("-s", "--minscore FLOAT", "Minimum (summed) Bit-Score to consider a pair-match."){ |v| opts[:minscore] = v.to_f }
  opt.on("-b", "--besthits INT", "Outputs top best-hits only (use 0 to output all the paired hits)."){ |v| opts[:besthits]=v.to_i }
  opt.on("-o", "--orient INT", "Checks the orientation of the hit. Values are: 0, no checking; 1, same direction; 2,",
    "inwards; 3, outwards; 4, different direction (i.e., 2 or 3)."){ |v| opts[:orient]=v.to_i }
  opt.on("-p", "--sisprefix STR", "Sister read number prefix in the name of the reads. Escape characters as dots (\\.),",
    "parenthesis (\\(, \\), \\[, \\]), or other characters with special meaning in regular expressions",
    "(\\*, \\+, \\^, \\$, \\|). This prefix allows regular expressions (for example, use ':|\\.' to use any of",
    "colon or dot). Notice that the prefix will not be included in the base name reported in the output."){ |v| opts[:sisprefix]=v }
  opt.on("-h","--help","Display this screen") do
    puts opt
    exit
  end
  opt.separator ""
  opt.separator "Output:"
  opt.separator " Tab-delimited flat file, with the following columns:"
  opt.separator " 1. Query ID (without the \"sister\" identifier)."
  opt.separator " 2. Subject ID."
  opt.separator " 3. Bit score (summed from both sister reads)."
  opt.separator " 4/5. From/To (subject) coordinates for read 1."
  opt.separator " 6/7. From/To (subject) coordinates for read 2."
  opt.separator " 8. Reads orientation (1: same direction, 2: inwards, 3: outwards)."
  opt.separator " 9. Estimated insert size."
  opt.separator ""
  opt.separator "Important note: This script assumes that paired hits are next to each other."
  opt.separator " If this is not the case (e.g., because the blast was concatenated),"
  opt.separator " you must sort the input before running this script."
  opt.separator ""
end.parse!
abort "-i/--blast is mandatory." if opts[:blast].nil?
# File.exist? rather than File.exists?: the latter was deprecated and then
# removed in Ruby 3.2, where calling it raises NoMethodError.
abort "-i/--blast must exist." unless File.exist? opts[:blast]
47
+
48
# One line of a tabular BLAST file, i.e., the hit of a single read.
#
# Columns are read by position (0-based): 1 is the subject, 6/7 the query
# from/to, 8/9 the subject from/to, and 11 the bit score. Missing columns
# yield 0 (or 0.0 for the score) and a nil subject.
class SingleHit
  attr_reader :sbj, :score, :orient, :sfrom, :sto, :qfrom, :qto

  # blast_ln:: String with one tab-delimited BLAST line. The trailing line
  #            terminator is removed from it in place.
  def initialize(blast_ln)
    blast_ln.chomp!
    cols = blast_ln.split("\t")
    @sbj = cols[1]
    @qfrom, @qto, @sfrom, @sto = cols.values_at(6, 7, 8, 9).map(&:to_i)
    @score = cols[11].to_f
    # 1 when the subject coordinates increase, -1 otherwise.
    @orient = @sfrom < @sto ? 1 : -1
  end
end
62
# A pair of single hits (one per sister read) against the same subject.
class DoubleHit
  attr_reader :name, :sbj, :score, :orient, :hitA, :hitB

  # name:: Base name of the read pair.
  # hitA:: Hit of the first sister read.
  # hitB:: Hit of the second sister read.
  #
  # Raises a RuntimeError if the two hits have different subjects.
  def initialize(name, hitA, hitB)
    unless hitA.sbj == hitB.sbj
      raise "Trying to set DoubleHit from hits with different subjects"
    end
    @name, @hitA, @hitB = name, hitA, hitB
    @sbj = hitA.sbj
    @score = hitA.score + hitB.score
    # 1: both reads in the same direction; 2: first read forward and second
    # reverse; 3: any other combination.
    @orient =
      if hitA.orient == hitB.orient
        1
      elsif hitA.orient > 0 && hitB.orient < 0
        2
      else
        3
      end
  end

  # Tab-delimited, newline-terminated representation: name, subject, summed
  # score, subject from/to of each read, orientation, and the distance
  # between the outermost subject coordinates of the two reads.
  def to_s
    lo, hi = [@hitA.sfrom, @hitB.sfrom, @hitA.sto, @hitB.sto].minmax
    [
      @name, @sbj, @score,
      @hitA.sfrom, @hitA.sto,
      @hitB.sfrom, @hitB.sto,
      @orient, hi - lo
    ].join("\t") + "\n"
  end
end
82
# All the hits of a read pair: the hits of the first sister read (hitsA) and
# those of the second (hitsB), plus the logic to combine them into paired
# hits (DoubleHit) against a common subject.
#
# Filtering is controlled by three class-wide settings:
# minscore:: Paired hits with a summed score less than or equal to this
#            value are discarded.
# orient::   0 for no orientation check; 1, 2 or 3 to require exactly that
#            DoubleHit orientation; 4 to require orientation 2 or 3.
# besthits:: Number of highest-scoring paired hits to keep; 0 keeps all.
class PairedHits
  attr_reader :name, :hitsA, :hitsB
  @@minscore = 0
  @@orient = 0
  @@besthits = 0

  # name:: Base name of the read pair.
  def initialize(name)
    @name = name
    @hitsA = []
    @hitsB = []
    @hits = []
  end

  # Returns the paired hits passing the class-wide filters, sorted by
  # increasing summed score. When besthits is non-zero only that many
  # highest-scoring paired hits are returned (still in increasing order).
  def hits
    @hits = []
    # Search for paired hits
    @hitsA.each do |hitA|
      @hitsB.each do |hitB|
        next unless hitA.sbj == hitB.sbj
        hit = DoubleHit.new(@name, hitA, hitB)
        # Minimum bit-score check
        next if hit.score <= @@minscore
        # "typical" orientation check
        next if (1 .. 3).include?(@@orient) && @@orient != hit.orient
        # "different-orientation" check
        next if @@orient == 4 && !(2 .. 3).include?(hit.orient)
        @hits.push(hit)
      end
    end
    # Sort the hits (ascending score)
    @hits.sort! { |x, y| x.score <=> y.score }
    return @hits if @@besthits == 0
    # The best hits are at the END of the ascending list; taking from the
    # front would return the lowest-scoring pairs instead.
    @hits.last(@@besthits)
  end

  # Returns the (mutable) list of hits of sister read +x+: the first read
  # for 1, the second read for any other value.
  def hitsX(x)
    x == 1 ? @hitsA : @hitsB
  end

  # Class methods
  def PairedHits.minscore=(value)
    @@minscore = value
  end

  def PairedHits.orient=(value)
    @@orient = value
  end

  def PairedHits.besthits=(value)
    @@besthits = value
  end
end
133
+
134
# Propagate the command-line filters to the pairing logic.
PairedHits.minscore = opts[:minscore]
PairedHits.orient = opts[:orient]
PairedHits.besthits = opts[:besthits]

# Main loop: read the BLAST file line by line, group consecutive lines that
# share the same base read name (the name up to the sister prefix), and print
# the paired hits of each group as soon as the base name changes. Hits of the
# same pair must therefore be adjacent in the input.
begin
  # Captures the base read name and the sister number (1 or 2). Compiled once
  # rather than on every line; an invalid prefix is reported by the rescue
  # clause below.
  sister_re = /^([^\s]*)(?:#{opts[:sisprefix]})([12])/
  # Block form guarantees the file is closed even if parsing fails.
  File.open(opts[:blast], "r") do |f|
    curr_pair = PairedHits.new(" ")
    f.each_line do |ln|
      m = sister_re.match(ln)
      raise "Impossible to parse read name in line #{f.lineno} using sister prefix '#{opts[:sisprefix]}':\n#{ln}" unless m
      if m[1] != curr_pair.name
        curr_pair.hits.each { |hit| puts hit.to_s }
        curr_pair = PairedHits.new(m[1])
      end
      curr_pair.hitsX(m[2].to_i).push(SingleHit.new(ln))
    end
    # Flush the last pair.
    curr_pair.hits.each { |hit| puts hit.to_s }
  end
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Signal the failure to the caller: the output is incomplete at this point.
  exit 1
end
157
+