miga-base 0.7.26.0 → 1.0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/init.rb +11 -7
  11. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  12. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  13. data/lib/miga/cli/action/tax_dist.rb +2 -2
  14. data/lib/miga/cli/action/wf.rb +5 -4
  15. data/lib/miga/common.rb +1 -0
  16. data/lib/miga/daemon.rb +11 -4
  17. data/lib/miga/dataset/result.rb +10 -6
  18. data/lib/miga/json.rb +5 -4
  19. data/lib/miga/metadata.rb +5 -1
  20. data/lib/miga/parallel.rb +36 -0
  21. data/lib/miga/project.rb +8 -8
  22. data/lib/miga/project/base.rb +4 -4
  23. data/lib/miga/project/result.rb +2 -2
  24. data/lib/miga/sqlite.rb +10 -2
  25. data/lib/miga/version.rb +23 -9
  26. data/scripts/aai_distances.bash +16 -18
  27. data/scripts/ani_distances.bash +16 -17
  28. data/scripts/assembly.bash +31 -16
  29. data/scripts/haai_distances.bash +3 -27
  30. data/scripts/miga.bash +6 -4
  31. data/scripts/p.bash +1 -1
  32. data/scripts/read_quality.bash +9 -18
  33. data/scripts/trimmed_fasta.bash +14 -30
  34. data/scripts/trimmed_reads.bash +36 -36
  35. data/test/parallel_test.rb +31 -0
  36. data/test/project_test.rb +2 -1
  37. data/test/remote_dataset_test.rb +1 -1
  38. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Archaea_SCG.hmm +41964 -0
  39. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Bacteria_SCG.hmm +32439 -0
  40. data/utils/FastAAI/00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm +62056 -0
  41. data/utils/FastAAI/FastAAI/FastAAI +1336 -0
  42. data/utils/FastAAI/README.md +84 -0
  43. data/utils/FastAAI/kAAI_v1.0_virus.py +1296 -0
  44. data/utils/distance/commands.rb +1 -0
  45. data/utils/distance/database.rb +0 -1
  46. data/utils/distance/runner.rb +2 -4
  47. data/utils/enveomics/Docs/recplot2.md +244 -0
  48. data/utils/enveomics/Examples/aai-matrix.bash +66 -0
  49. data/utils/enveomics/Examples/ani-matrix.bash +66 -0
  50. data/utils/enveomics/Examples/essential-phylogeny.bash +105 -0
  51. data/utils/enveomics/Examples/unus-genome-phylogeny.bash +100 -0
  52. data/utils/enveomics/LICENSE.txt +73 -0
  53. data/utils/enveomics/Makefile +52 -0
  54. data/utils/enveomics/Manifest/Tasks/aasubs.json +103 -0
  55. data/utils/enveomics/Manifest/Tasks/blasttab.json +786 -0
  56. data/utils/enveomics/Manifest/Tasks/distances.json +161 -0
  57. data/utils/enveomics/Manifest/Tasks/fasta.json +802 -0
  58. data/utils/enveomics/Manifest/Tasks/fastq.json +291 -0
  59. data/utils/enveomics/Manifest/Tasks/graphics.json +126 -0
  60. data/utils/enveomics/Manifest/Tasks/mapping.json +137 -0
  61. data/utils/enveomics/Manifest/Tasks/ogs.json +382 -0
  62. data/utils/enveomics/Manifest/Tasks/other.json +906 -0
  63. data/utils/enveomics/Manifest/Tasks/remote.json +355 -0
  64. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +638 -0
  65. data/utils/enveomics/Manifest/Tasks/tables.json +308 -0
  66. data/utils/enveomics/Manifest/Tasks/trees.json +68 -0
  67. data/utils/enveomics/Manifest/Tasks/variants.json +111 -0
  68. data/utils/enveomics/Manifest/categories.json +165 -0
  69. data/utils/enveomics/Manifest/examples.json +154 -0
  70. data/utils/enveomics/Manifest/tasks.json +4 -0
  71. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +69 -0
  72. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -0
  73. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -0
  74. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -0
  75. data/utils/enveomics/Pipelines/assembly.pbs/README.md +189 -0
  76. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +112 -0
  77. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +23 -0
  78. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +44 -0
  79. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +50 -0
  80. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +37 -0
  81. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +68 -0
  82. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +49 -0
  83. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +80 -0
  84. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +57 -0
  85. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +63 -0
  86. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +38 -0
  87. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +73 -0
  88. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +21 -0
  89. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +72 -0
  90. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +98 -0
  91. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -0
  92. data/utils/enveomics/Pipelines/blast.pbs/README.md +127 -0
  93. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +109 -0
  94. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +128 -0
  95. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +16 -0
  96. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +22 -0
  97. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +26 -0
  98. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +89 -0
  99. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +29 -0
  100. data/utils/enveomics/Pipelines/idba.pbs/README.md +49 -0
  101. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +95 -0
  102. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +56 -0
  103. data/utils/enveomics/Pipelines/trim.pbs/README.md +54 -0
  104. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +70 -0
  105. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +130 -0
  106. data/utils/enveomics/README.md +42 -0
  107. data/utils/enveomics/Scripts/AAsubs.log2ratio.rb +171 -0
  108. data/utils/enveomics/Scripts/Aln.cat.rb +221 -0
  109. data/utils/enveomics/Scripts/Aln.convert.pl +35 -0
  110. data/utils/enveomics/Scripts/AlphaDiversity.pl +152 -0
  111. data/utils/enveomics/Scripts/BedGraph.tad.rb +93 -0
  112. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  113. data/utils/enveomics/Scripts/BlastPairwise.AAsubs.pl +102 -0
  114. data/utils/enveomics/Scripts/BlastTab.addlen.rb +63 -0
  115. data/utils/enveomics/Scripts/BlastTab.advance.bash +48 -0
  116. data/utils/enveomics/Scripts/BlastTab.best_hit_sorted.pl +55 -0
  117. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +104 -0
  118. data/utils/enveomics/Scripts/BlastTab.cogCat.rb +76 -0
  119. data/utils/enveomics/Scripts/BlastTab.filter.pl +47 -0
  120. data/utils/enveomics/Scripts/BlastTab.kegg_pep2path_rest.pl +194 -0
  121. data/utils/enveomics/Scripts/BlastTab.metaxaPrep.pl +104 -0
  122. data/utils/enveomics/Scripts/BlastTab.pairedHits.rb +157 -0
  123. data/utils/enveomics/Scripts/BlastTab.recplot2.R +48 -0
  124. data/utils/enveomics/Scripts/BlastTab.seqdepth.pl +86 -0
  125. data/utils/enveomics/Scripts/BlastTab.seqdepth_ZIP.pl +119 -0
  126. data/utils/enveomics/Scripts/BlastTab.seqdepth_nomedian.pl +86 -0
  127. data/utils/enveomics/Scripts/BlastTab.subsample.pl +47 -0
  128. data/utils/enveomics/Scripts/BlastTab.sumPerHit.pl +114 -0
  129. data/utils/enveomics/Scripts/BlastTab.taxid2taxrank.pl +90 -0
  130. data/utils/enveomics/Scripts/BlastTab.topHits_sorted.rb +101 -0
  131. data/utils/enveomics/Scripts/Chao1.pl +97 -0
  132. data/utils/enveomics/Scripts/CharTable.classify.rb +234 -0
  133. data/utils/enveomics/Scripts/EBIseq2tax.rb +83 -0
  134. data/utils/enveomics/Scripts/FastA.N50.pl +60 -0
  135. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  136. data/utils/enveomics/Scripts/FastA.filter.pl +52 -0
  137. data/utils/enveomics/Scripts/FastA.filterLen.pl +28 -0
  138. data/utils/enveomics/Scripts/FastA.filterN.pl +60 -0
  139. data/utils/enveomics/Scripts/FastA.fragment.rb +100 -0
  140. data/utils/enveomics/Scripts/FastA.gc.pl +42 -0
  141. data/utils/enveomics/Scripts/FastA.interpose.pl +93 -0
  142. data/utils/enveomics/Scripts/FastA.length.pl +38 -0
  143. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  144. data/utils/enveomics/Scripts/FastA.per_file.pl +36 -0
  145. data/utils/enveomics/Scripts/FastA.qlen.pl +57 -0
  146. data/utils/enveomics/Scripts/FastA.rename.pl +65 -0
  147. data/utils/enveomics/Scripts/FastA.revcom.pl +23 -0
  148. data/utils/enveomics/Scripts/FastA.sample.rb +98 -0
  149. data/utils/enveomics/Scripts/FastA.slider.pl +85 -0
  150. data/utils/enveomics/Scripts/FastA.split.pl +55 -0
  151. data/utils/enveomics/Scripts/FastA.split.rb +79 -0
  152. data/utils/enveomics/Scripts/FastA.subsample.pl +131 -0
  153. data/utils/enveomics/Scripts/FastA.tag.rb +65 -0
  154. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  155. data/utils/enveomics/Scripts/FastA.wrap.rb +48 -0
  156. data/utils/enveomics/Scripts/FastQ.filter.pl +54 -0
  157. data/utils/enveomics/Scripts/FastQ.interpose.pl +90 -0
  158. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  159. data/utils/enveomics/Scripts/FastQ.offset.pl +90 -0
  160. data/utils/enveomics/Scripts/FastQ.split.pl +53 -0
  161. data/utils/enveomics/Scripts/FastQ.tag.rb +70 -0
  162. data/utils/enveomics/Scripts/FastQ.test-error.rb +81 -0
  163. data/utils/enveomics/Scripts/FastQ.toFastA.awk +24 -0
  164. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  165. data/utils/enveomics/Scripts/GenBank.add_fields.rb +84 -0
  166. data/utils/enveomics/Scripts/HMM.essential.rb +351 -0
  167. data/utils/enveomics/Scripts/HMM.haai.rb +168 -0
  168. data/utils/enveomics/Scripts/HMMsearch.extractIds.rb +83 -0
  169. data/utils/enveomics/Scripts/JPlace.distances.rb +88 -0
  170. data/utils/enveomics/Scripts/JPlace.to_iToL.rb +320 -0
  171. data/utils/enveomics/Scripts/M5nr.getSequences.rb +81 -0
  172. data/utils/enveomics/Scripts/MeTaxa.distribution.pl +198 -0
  173. data/utils/enveomics/Scripts/MyTaxa.fragsByTax.pl +35 -0
  174. data/utils/enveomics/Scripts/MyTaxa.seq-taxrank.rb +49 -0
  175. data/utils/enveomics/Scripts/NCBIacc2tax.rb +92 -0
  176. data/utils/enveomics/Scripts/Newick.autoprune.R +27 -0
  177. data/utils/enveomics/Scripts/RAxML-EPA.to_iToL.pl +228 -0
  178. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  179. data/utils/enveomics/Scripts/RefSeq.download.bash +48 -0
  180. data/utils/enveomics/Scripts/SRA.download.bash +55 -0
  181. data/utils/enveomics/Scripts/TRIBS.plot-test.R +36 -0
  182. data/utils/enveomics/Scripts/TRIBS.test.R +39 -0
  183. data/utils/enveomics/Scripts/Table.barplot.R +31 -0
  184. data/utils/enveomics/Scripts/Table.df2dist.R +30 -0
  185. data/utils/enveomics/Scripts/Table.filter.pl +61 -0
  186. data/utils/enveomics/Scripts/Table.merge.pl +77 -0
  187. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  188. data/utils/enveomics/Scripts/Table.replace.rb +69 -0
  189. data/utils/enveomics/Scripts/Table.round.rb +63 -0
  190. data/utils/enveomics/Scripts/Table.split.pl +57 -0
  191. data/utils/enveomics/Scripts/Taxonomy.silva2ncbi.rb +227 -0
  192. data/utils/enveomics/Scripts/VCF.KaKs.rb +147 -0
  193. data/utils/enveomics/Scripts/VCF.SNPs.rb +88 -0
  194. data/utils/enveomics/Scripts/aai.rb +419 -0
  195. data/utils/enveomics/Scripts/ani.rb +362 -0
  196. data/utils/enveomics/Scripts/anir.rb +137 -0
  197. data/utils/enveomics/Scripts/clust.rand.rb +102 -0
  198. data/utils/enveomics/Scripts/gi2tax.rb +103 -0
  199. data/utils/enveomics/Scripts/in_silico_GA_GI.pl +96 -0
  200. data/utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz +0 -0
  201. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  202. data/utils/enveomics/Scripts/lib/enveomics.R +1 -0
  203. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  204. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  205. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +24 -0
  206. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  207. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  208. data/utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb +253 -0
  209. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  210. data/utils/enveomics/Scripts/lib/enveomics_rb/og.rb +182 -0
  211. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  212. data/utils/enveomics/Scripts/lib/enveomics_rb/remote_data.rb +74 -0
  213. data/utils/enveomics/Scripts/lib/enveomics_rb/seq_range.rb +237 -0
  214. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  215. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  216. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  217. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  218. data/utils/enveomics/Scripts/lib/enveomics_rb/vcf.rb +135 -0
  219. data/utils/enveomics/Scripts/ogs.annotate.rb +88 -0
  220. data/utils/enveomics/Scripts/ogs.core-pan.rb +160 -0
  221. data/utils/enveomics/Scripts/ogs.extract.rb +125 -0
  222. data/utils/enveomics/Scripts/ogs.mcl.rb +186 -0
  223. data/utils/enveomics/Scripts/ogs.rb +104 -0
  224. data/utils/enveomics/Scripts/ogs.stats.rb +131 -0
  225. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  226. data/utils/enveomics/Scripts/rbm.rb +100 -0
  227. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  228. data/utils/enveomics/Tests/Makefile +10 -0
  229. data/utils/enveomics/Tests/Mgen_M2288.faa +3189 -0
  230. data/utils/enveomics/Tests/Mgen_M2288.fna +8282 -0
  231. data/utils/enveomics/Tests/Mgen_M2321.fna +8288 -0
  232. data/utils/enveomics/Tests/Nequ_Kin4M.faa +2970 -0
  233. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.tribs.Rdata +0 -0
  234. data/utils/enveomics/Tests/Xanthomonas_oryzae-PilA.txt +7 -0
  235. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai-mat.tsv +17 -0
  236. data/utils/enveomics/Tests/Xanthomonas_oryzae.aai.tsv +137 -0
  237. data/utils/enveomics/Tests/a_mg.cds-go.blast.tsv +123 -0
  238. data/utils/enveomics/Tests/a_mg.reads-cds.blast.tsv +200 -0
  239. data/utils/enveomics/Tests/a_mg.reads-cds.counts.tsv +55 -0
  240. data/utils/enveomics/Tests/alkB.nwk +1 -0
  241. data/utils/enveomics/Tests/anthrax-cansnp-data.tsv +13 -0
  242. data/utils/enveomics/Tests/anthrax-cansnp-key.tsv +17 -0
  243. data/utils/enveomics/Tests/hiv1.faa +59 -0
  244. data/utils/enveomics/Tests/hiv1.fna +134 -0
  245. data/utils/enveomics/Tests/hiv2.faa +70 -0
  246. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv +233 -0
  247. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.lim +1 -0
  248. data/utils/enveomics/Tests/hiv_mix-hiv1.blast.tsv.rec +233 -0
  249. data/utils/enveomics/Tests/phyla_counts.tsv +10 -0
  250. data/utils/enveomics/Tests/primate_lentivirus.ogs +11 -0
  251. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv1.rbm +9 -0
  252. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-hiv2.rbm +8 -0
  253. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv1-siv.rbm +6 -0
  254. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-hiv2.rbm +9 -0
  255. data/utils/enveomics/Tests/primate_lentivirus.rbm/hiv2-siv.rbm +6 -0
  256. data/utils/enveomics/Tests/primate_lentivirus.rbm/siv-siv.rbm +6 -0
  257. data/utils/enveomics/build_enveomics_r.bash +45 -0
  258. data/utils/enveomics/enveomics.R/DESCRIPTION +31 -0
  259. data/utils/enveomics/enveomics.R/NAMESPACE +39 -0
  260. data/utils/enveomics/enveomics.R/R/autoprune.R +155 -0
  261. data/utils/enveomics/enveomics.R/R/barplot.R +184 -0
  262. data/utils/enveomics/enveomics.R/R/cliopts.R +135 -0
  263. data/utils/enveomics/enveomics.R/R/df2dist.R +154 -0
  264. data/utils/enveomics/enveomics.R/R/growthcurve.R +331 -0
  265. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  266. data/utils/enveomics/enveomics.R/R/recplot.R +354 -0
  267. data/utils/enveomics/enveomics.R/R/recplot2.R +1631 -0
  268. data/utils/enveomics/enveomics.R/R/tribs.R +583 -0
  269. data/utils/enveomics/enveomics.R/R/utils.R +80 -0
  270. data/utils/enveomics/enveomics.R/README.md +81 -0
  271. data/utils/enveomics/enveomics.R/data/growth.curves.rda +0 -0
  272. data/utils/enveomics/enveomics.R/data/phyla.counts.rda +0 -0
  273. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +16 -0
  274. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +16 -0
  275. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +16 -0
  276. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +25 -0
  277. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +46 -0
  278. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -0
  279. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +47 -0
  280. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -0
  281. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -0
  282. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +40 -0
  283. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +103 -0
  284. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +67 -0
  285. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -0
  286. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -0
  287. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +45 -0
  288. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +44 -0
  289. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +47 -0
  290. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +75 -0
  291. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  292. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +44 -0
  293. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +139 -0
  294. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +45 -0
  295. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -0
  296. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +77 -0
  297. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -0
  298. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -0
  299. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -0
  300. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -0
  301. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +47 -0
  302. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -0
  303. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -0
  304. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +45 -0
  305. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -0
  306. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -0
  307. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -0
  308. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -0
  309. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +52 -0
  310. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -0
  311. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +51 -0
  312. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +43 -0
  313. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +82 -0
  314. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +59 -0
  315. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -0
  316. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +36 -0
  317. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  318. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +68 -0
  319. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -0
  320. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -0
  321. data/utils/enveomics/enveomics.R/man/growth.curves.Rd +14 -0
  322. data/utils/enveomics/enveomics.R/man/phyla.counts.Rd +13 -0
  323. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +78 -0
  324. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +46 -0
  325. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +45 -0
  326. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +125 -0
  327. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -0
  328. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -0
  329. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -0
  330. data/utils/enveomics/globals.mk +8 -0
  331. data/utils/enveomics/manifest.json +9 -0
  332. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  333. data/utils/multitrim/README.md +67 -0
  334. data/utils/multitrim/multitrim.py +1555 -0
  335. data/utils/multitrim/multitrim.yml +13 -0
  336. data/utils/requirements.txt +4 -3
  337. metadata +304 -3
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @update Oct-13-2015
6
+ # @license Artistic License 2.0
7
+ #
8
+
9
+ $:.push File.expand_path(File.dirname(__FILE__) + "/lib")
10
+ require "enveomics_rb/remote_data"
11
+ use "nokogiri"
12
+
13
+ #================================[ Options parsing ]
14
# Global options with their defaults. :infile is only set when -I is passed.
$o = {
  q: false, ids: [], dbfrom: "uniprotkb", header: true,
  ret: "ScientificName",
  ranks: %w(superkingdom phylum class order family genus species)}

OptionParser.new do |opt|
  # Leading spaces are stripped from every banner line by the gsub below.
  opt.banner = "
    Maps a list of EBI-supported IDs to their corresponding NCBI taxonomy using
    EBI RESTful API. Avoid using this script on millions of entries at a time,
    since each entry elicits requests to EBI and NCBI servers.

    Usage: #{$0} [options]".gsub(/^ +/,"")
  opt.separator ""
  opt.on("-i", "--ids ID1,ID2,...", Array,
    "Comma-separated list of EBI IDs. Required unless -I is passed."
    ){ |v| $o[:ids]=v }
  opt.on("-I", "--infile FILE",
    "Raw text file containing the list of EBI IDs, one per line.",
    "Required unless -i is passed."){ |v| $o[:infile]=v }
  opt.on("-d", "--database DB",
    "EBI database defining the EBI IDs. By default: " + $o[:dbfrom].to_s + "."
    ){ |v| $o[:dbfrom]=v }
  opt.on("-r", "--ranks RANK1,RANK2,...", Array,
    "Taxonomic ranks to report. By default:",
    $o[:ranks].join(",") + "."){ |v| $o[:ranks]=v }
  opt.on("-n", "--noheader",
    "Do not include a header in the output."){ $o[:header]=false }
  opt.on("-t", "--taxids",
    "Return Taxonomy IDs instead of scientific names."){ $o[:ret]="TaxId" }
  opt.on("-q", "--quiet", "Run quietly."){ |v| $o[:q]=true }
  opt.on("-h", "--help","Display this screen") do
    puts opt
    exit
  end
  opt.separator ""
end.parse!

#================================[ Main ]
begin
  # IDs from -I are appended to whatever was passed with -i.
  $o[:ids] += File.readlines($o[:infile]).map{ |l| l.chomp } unless
    $o[:infile].nil?
  $o[:ranks].map!{ |r| r.downcase }
  puts((["ID", "TaxId"] + $o[:ranks].map{ |r| r.capitalize }).join("\t")) if
    $o[:header]
  $o[:ids].each do |id|
    # Reduce pipe-delimited identifiers (xx|ACC|NAME) to their third field.
    id = $1 if id =~ /^[a-z]+\|\S+\|(\S+)/
    taxid = RemoteData.ebiseq2taxid(id, $o[:dbfrom])
    if taxid.nil?
      warn "Cannot find link to taxonomy: #{id}"
      next
    end
    # taxid is known to be non-nil from here on.
    doc = Nokogiri::XML( RemoteData.efetch({db: "taxonomy", id: taxid}) )
    taxonomy = {}
    # The taxon itself first, then every ancestor listed in its lineage.
    taxonomy[ doc.at_xpath("/TaxaSet/Taxon/Rank").content ] =
      doc.at_xpath("/TaxaSet/Taxon/#{$o[:ret]}").content
    doc.xpath("/TaxaSet/Taxon/LineageEx/Taxon").each do |taxon|
      taxonomy[ taxon.at_xpath("./Rank").content ] =
        taxon.at_xpath("./#{$o[:ret]}").content
    end
    # Ranks absent from the lineage are reported as empty columns.
    puts(([id, taxid] +
      $o[:ranks].map{ |rank| taxonomy[ rank ] || "" }).join("\t"))
  end # $o[:ids].each
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Signal the failure to the caller instead of exiting with status 0.
  exit 1
end
83
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license: Artistic-2.0
5
+
6
+ use strict;
7
+ use warnings;
8
+ use List::Util qw/sum min max/;
9
+
10
# Positional arguments: FastA path, optional minimum length, optional N** value.
my ($seqs, $minlen, $n__) = @ARGV;
$seqs or die "
Description:
Calculates the N50 value of a set of sequences. Alternatively, it
can calculate other N** values. It also calculates the total number
of sequences, the total added length, and the longest sequence length.

Usage:
$0 seqs.fa [minlen [**]]

seqs.fa A FastA file containing the sequences
minlen (optional) The minimum length to take into consideration
By default: 0
** (optional) Value N** to calculate. By default: 50 (N50)

";

$minlen ||= 0;
$n__ ||= 50;

# Collect the length of every sequence, one array entry per defline.
my @len = ();
open my $seq_fh, "<", $seqs or die "Cannot open file: $seqs: $!\n";
while(<$seq_fh>){
  if(/^>/){
    push @len, 0;
  }else{
    next if /^;/;
    # Lines before the first defline belong to no sequence; indexing
    # $len[-1] on an empty array would be a fatal error.
    next unless @len;
    chomp;
    s/\W//g;
    $len[-1] += length $_;
  }
}
close $seq_fh;

# Sort from longest to shortest so that the cumulative sum follows the
# standard definition: N** is the length L such that sequences of length >= L
# add up to at least **% of the total length.
@len = sort { $b <=> $a } grep { $_ >= $minlen } @len;
my $tot = (sum(@len) || 0);

my $thr = $n__ * $tot / 100;
my $pos = 0;
for(@len){
  $pos += $_;
  if($pos >= $thr){
    print "N$n__: $_\n";
    last;
  }
}

print "Sequences: " . scalar(@len) . "\n";
print "Total length: $tot\n";
# The list is sorted in descending order, so the longest sequence is first.
print "Longest sequence: " . (@len ? $len[0] : 0) . "\n";
60
+
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @license Artistic-2.0
6
+ #
7
+
8
+ require 'optparse'
9
+
10
# Command-line options; running without arguments shows the help screen.
o = { q: false }
ARGV.push('-h') if ARGV.empty?

parser = OptionParser.new do |opt|
  opt.banner = "
Extracts a list of sequences and/or coordinates from multi-FastA files.

Usage: #{$0} [options]"
  opt.separator ''
  opt.separator 'Mandatory'
  opt.on('-i', '--in PATH', 'Input FastA file.') { |path| o[:i] = path }
  opt.on('-o', '--out PATH', 'Output FastA file.') { |path| o[:o] = path }
  opt.on(
    '-c', '--coords STRING',
    'Comma-delimited list of coordinates (mandatory unless -C is passed).',
    'The format of the coordinates is "SEQ:FROM..TO" or "SEQ:FROM~LEN":',
    'SEQ: Sequence ID, or * (asterisk) to extract range from all sequences',
    'FROM: Integer, position of the first base to include (can be negative)',
    'TO: Integer, last base to include (can be negative)',
    'LEN: Length of the range to extract'
  ) { |coords| o[:c] = coords }
  opt.separator ''
  opt.separator 'Options'
  opt.on(
    '-C', '--coords-file PATH',
    'File containing the coordinates, one per line.',
    'Each line must follow the format described for -c.'
  ) { |path| o[:C] = path }
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).') { o[:q] = true }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end
parser.parse!

# Input, output, and at least one source of coordinates are required.
abort '-i is mandatory.' unless o[:i]
abort '-o is mandatory.' unless o[:o]
abort '-c is mandatory.' unless o[:c] || o[:C]
45
+
46
+ # Classes to parse coordinates
47
# A single coordinate specification in the form "SEQ:FROM..TO" or
# "SEQ:FROM~LEN". Positions are 1-based; zero or negative positions are
# counted from the end of the sequence (-1 is the last base).
class SeqCoords
  attr_reader :id, :from, :to, :length, :str

  # Parses +str+ into its components.
  # Raises RuntimeError if +str+ does not contain a valid specification.
  def initialize(str)
    @str = str
    m = /(\S+):(-?\d+)(~|\.\.)(-?\d+)/.match(str)
    raise "Cannot parse coordinates: #{str}" if m.nil?

    @id = m[1]
    @from = m[2].to_i
    if m[3] == '~'
      @length = m[4].to_i
    else
      @to = m[4].to_i
    end
  end

  # Returns the requested range of +seq+ as a String, or nil if these
  # coordinates do not apply to the sequence +id+ (or the range starts
  # beyond the end of +seq+).
  def extract(id, seq)
    return nil unless concerns?(id)

    first = index_of(from, seq.length)
    return seq[first, length] if to.nil?

    seq[first..index_of(to, seq.length)]
  end

  # Do these coordinates apply to the sequence with ID +seq_id+?
  # The ID "*" applies to every sequence.
  def concerns?(seq_id)
    id == '*' || id == seq_id
  end

  private

  # Converts a 1-based position (or a non-positive position counted from the
  # end) into the 0-based index that String#[] expects.
  def index_of(pos, seq_len)
    pos > 0 ? pos - 1 : seq_len + pos
  end
end
78
+
79
# An ordered list of SeqCoords that can be applied together to a sequence.
class SeqCoordsCollection
  class << self
    # Builds a collection from a comma-delimited string of coordinates.
    # Empty entries (e.g., from doubled commas) are ignored.
    def from_str(str)
      c = new
      str.split(',').each do |i|
        next if i.strip.empty?

        c << SeqCoords.new(i)
      end
      c
    end

    # Builds a collection from the file in +path+, one coordinate per line.
    # Blank lines are ignored so that a trailing empty line does not abort
    # the run with a parsing error.
    def from_file(path)
      c = new
      File.open(path, 'r') do |fh|
        fh.each do |ln|
          ln = ln.chomp
          next if ln.strip.empty?

          c << SeqCoords.new(ln)
        end
      end
      c
    end
  end

  attr_reader :collection

  def initialize
    @collection = []
  end

  # Appends +coords+ (a SeqCoords object) to the collection.
  def <<(coords)
    @collection << coords
  end

  # Applies every coordinate to the sequence +seq+ with ID +id+ and returns
  # an Array with the extracted fragments, skipping the coordinates that
  # yield nothing.
  def extract(id, seq)
    @collection.map { |c| c.extract(id, seq) }.compact
  end
end
109
+
110
+ # Functions to parse sequences
111
# Processes one FastA record: cleans the sequence +sq+ in place (only letters
# are kept), extracts every fragment that @coll yields for the ID +id+, and
# writes each one to @ofh as its own entry named ">ID:N", with N counting
# from 1. Updates the @n_in and @n_out counters. Records without an ID or
# without sequence are ignored.
def do_stuff(id, sq)
  return if id.nil? || sq.empty?

  @n_in += 1
  sq.gsub!(/[^A-Za-z]/, '')
  @coll.extract(id, sq).each_with_index do |fragment, idx|
    @ofh.puts ">#{id}:#{idx + 1}"
    @ofh.puts fragment
    @n_out += 1
  end
end
122
+
123
+ # Parse coordinates
124
$stderr.puts 'Parsing coordinates' unless o[:q]
# -c takes precedence over -C when both are passed.
@coll = o[:c].nil? ? SeqCoordsCollection.from_file(o[:C]) :
                     SeqCoordsCollection.from_str(o[:c])
$stderr.puts " Coordinates found: #{@coll.collection.size}" unless o[:q]

# Parse sequences
$stderr.puts 'Parsing sequences' unless o[:q]
@n_in = 0
@n_out = 0
# The block form guarantees that the output is closed even on failure.
File.open(o[:o], 'w') do |ofh|
  @ofh = ofh # do_stuff writes through this instance variable
  File.open(o[:i], 'r') do |fh|
    id = nil
    sq = ''
    fh.each do |ln|
      next if ln =~ /^;/

      if ln =~ /^>(\S+)/
        new_id = $1
        # Flush the record accumulated so far under its own ID *before*
        # switching to the ID of the defline that was just read.
        do_stuff(id, sq)
        id = new_id
        sq = ''
      else
        sq << ln
      end
    end
    # Flush the last record of the file.
    do_stuff(id, sq)
  end
end
$stderr.puts " Input sequences: #{@n_in}" unless o[:q]
$stderr.puts " Output fragments: #{@n_out}" unless o[:q]
152
+
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+ use Getopt::Std;
11
+
12
# Prints the usage message and terminates the script.
sub HELP_MESSAGE { die "
.Description:
Extracts a subset of sequences from a FastA file.

.Usage: $0 [options] list.txt seqs.fa > subset.fa

[options]
-r Reverse list. Extracts sequences NOT present in the list.
-q Runs quietly.
-h Prints this message and exits.

[mandatory]
list.txt List of sequences to extract.
seqs.fa FastA file containing the superset of sequences.
subset.fa FastA file to be created.

" }

my %o=();
getopts('rhq', \%o);
my($list, $fa) = @ARGV;
($list and $fa) or &HELP_MESSAGE;
$o{h} and &HELP_MESSAGE;

# Load the list of identifiers as hash keys for constant-time lookups.
print STDERR "Reading list.\n" unless $o{q};
open my $li_fh, "<", $list or die "Cannot read file: $list: $!\n";
# Remove the line terminator, including the carriage return of CRLF files,
# which would otherwise keep every entry from matching.
my %li = map { s/\r?\n\z//; $_ => 1 } <$li_fh>;
close $li_fh;

print STDERR "Filtering FastA.\n" unless $o{q};
open my $fa_fh, "<", $fa or die "Cannot read file: $fa: $!\n";
# $good is decided at each defline and applies to the sequence lines below it.
my $good = 0;
while(my $ln = <$fa_fh>){
  next if $ln =~ /^;/;
  chomp $ln;
  if($ln =~ m/^>((\S+).*)/){
    # An entry is listed if the list contains the full defline (with or
    # without the leading '>') or only the sequence ID.
    $good = (exists $li{$1} or exists $li{">$1"} or exists $li{$2} or exists $li{$ln});
  }elsif($ln =~ m/^>/){
    # Deflines without an ID are never printed, with or without -r.
    $good = $o{r};
    print STDERR "Warning: Non-canonical defline, line $.: $ln\n";
  }
  print "$ln\n" if (($good and not $o{r}) or ($o{r} and not $good));
}
close $fa_fh;
52
+
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use Bio::SeqIO;
6
+
7
# Positional arguments: the FastA file and the minimum sequence length.
my ($file, $min) = @ARGV[0, 1];
($file and $min) or die <<HELP

This script will filter a multi fastA file by length

Usage "perl $0 fastafile minlenght "
HELP
;
my $seq_in = Bio::SeqIO->new(-format => 'fasta', -file => $file);

# Print only the records that are at least $min long, as single-line FastA.
while (my $record = $seq_in->next_seq()) {
  my $id = $record->primary_id;
  chomp $id;
  my $seq = $record->seq;
  chomp $seq;
  next if length($seq) < $min;
  print ">$id", "\n", $seq, "\n";
}
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Oct-07-2015
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ use warnings;
9
+ use strict;
10
+
11
# Filters FastA entries by N-content and by presence of long homopolymers.
#
# Arguments (from @ARGV):
#   $file     input FastA file
#   $content  maximum proportion of Ns (1 turns the check off, default 0.5)
#   $stretch  homopolymer run length that rejects an entry (0 turns the
#             check off, default 100)
my ($file, $content, $stretch) = @ARGV;
$file or die <<HELP;

Description:
Filter sequences by N-content and presence of long homopolymers.
Usage:
$0 sequences.fa [content [stretch]] > filtered.fa
Where:
sequences.fa Input file in FastA format
content A number between 0 and 1 indicating the maximum proportion of Ns
(1 to turn off, 0.5 by default)
stretch A number indicating the maximum number of consecutive identical
nucleotides allowed (0 to turn off, 100 by default)
filtered.fa Filtered set of sequences.

HELP

# Apply the defaults only when the argument is absent or empty. Using `||=`
# here would also replace an explicit 0, which made "content 0" (no Ns
# allowed) and the documented "stretch 0 to turn off" impossible to request.
$content = 0.5 unless defined $content and length $content;
$stretch = 100 unless defined $stretch and length $stretch;
$content += 0; # force numeric context
$stretch += 0;

my $good = 0; # entries that passed the filters
my $N = 0;    # entries read

FASTA: {
  # Read one entry per iteration by splitting the input at each "\n>".
  local $/ = "\n>";
  open(my $fh, "<", $file) or die "I can not open the file: $file: $!\n";
  SEQ: while (<$fh>) {
    $N++;
    s/^;.*//gm; # blank out lines starting with ';'
    s/>//g;     # drop the '>' markers left by the record separator
    my ($n, $s) = split /\n/, $_, 2;
    # A record with no newline yields no sequence part, and an empty record
    # yields no name; default both to avoid operating on undef.
    $n = '' unless defined $n;
    $s = '' unless defined $s;
    # Only uppercase A, C, T, G and N take part in the checks below.
    (my $clean = $s) =~ s/[^ACTGN]//g;
    if ($content < 1) {
      my $Ns = ($clean =~ tr/N//);
      next SEQ if $Ns > length($clean) * $content;
    }
    if ($stretch > 0) {
      for my $nuc (qw(A C T G N)) {
        next SEQ if $clean =~ m/[$nuc]{$stretch}/;
      }
    }
    print ">$n\n$s\n";
    $good++;
  }
  close $fh;
  print STDERR "Total sequences: $N\nAfter filtering: $good\n";
}
58
+
59
+
60
+
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # frozen_string_literal: true
4
+
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/stats'
8
+ $VERSION = 1.0
9
+
10
# Option defaults. :in and :out are added by the parser, and :completeness
# stays nil until -c is given.
o = { q: false, completeness: nil, minlen: 500, shuffle: true }

parser = OptionParser.new do |opts|
  opts.version = $VERSION
  Enveomics.opt_banner(
    opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
    "#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
  )

  opts.separator 'Mandatory'
  opts.on(
    '-i', '--in FILE',
    'Path to the FastA file containing the complete sequences',
    'Supports compression with .gz extension, use - for STDIN'
  ) do |path|
    o[:in] = path
  end
  opts.on(
    '-o', '--out FILE', 'Path to the FastA to create',
    'Supports compression with .gz extension, use - for STDOUT'
  ) do |path|
    o[:out] = path
  end
  opts.on(
    '-c', '--completeness FLOAT',
    'Fraction of genome completeness to simulate from 0 to 1'
  ) do |fraction|
    o[:completeness] = fraction.to_f
  end

  opts.separator ''
  opts.separator 'Options'
  opts.on(
    '-m', '--minlen INT',
    "Minimum fragment length to report. By default: #{o[:minlen]}"
  ) do |len|
    o[:minlen] = len.to_i
  end
  opts.on(
    '-s', '--sorted', 'Keep fragments sorted as in the input file',
    'By default, fragments are shuffled'
  ) do |sorted|
    o[:shuffle] = !sorted
  end
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') do
    o[:q] = true
  end
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  opts.separator ''
end
parser.parse!

# All three mandatory options must be present; checked in -i, -o, -c order.
{ in: '-i', out: '-o', completeness: '-c' }.each do |key, flag|
  raise Enveomics::OptionError, "#{flag} is mandatory" if o[key].nil?
end
51
+
52
# Main routine: reads the input FastA, cuts every sequence into fragments of
# random length, and writes the fragments with at least o[:minlen] residues.
#
# Any error is reported on STDERR with its backtrace and the script exits
# with status 1, so calling pipelines can detect the failure.
begin
  # Read input sequences
  g_id = []
  g_seq = []
  ifh = reader(o[:in])
  ifh.each_line do |ln|
    if ln =~ /^>(\S*)/
      g_id << Regexp.last_match(1)
      # Mutable buffer: a bare '' literal is frozen in this file, and
      # appending in place avoids re-copying the sequence on every line.
      g_seq << String.new
    elsif g_seq.empty?
      # Text before the first defline belongs to no entry. Skip blank
      # lines; report anything else instead of failing on nil.
      next if ln.strip.empty?

      raise "Sequence data found before the first defline in #{o[:in]}"
    else
      g_seq.last << ln.gsub(/[^A-Za-z]/, '')
    end
  end
  ifh.close

  # Fragment genomes
  f = {}
  binlen = [1, (o[:minlen].to_f / (1.5**2)).ceil].max
  # Parameter of the geometric draw, kept within [0.001, 1.0]
  prob = [0.001, [1.0, 1.0 - (o[:completeness] / 1.25 + 0.1)].min].max
  g_id.zip(g_seq).each do |id, seq|
    until seq.empty?
      rand_x =
        Enveomics::Stats.r_geom(prob).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
      frag_len = [0, (rand_x * binlen).round].max
      frag = seq[0, frag_len]
      # Test the actual fragment: the slice is shorter than frag_len when it
      # reaches the end of the sequence, and must still honor --minlen.
      f["#{f.size + 1}_#{id}"] = frag if frag.length >= o[:minlen]
      # Move past the fragment plus one extra position, so that every
      # iteration consumes at least one residue even when frag_len is 0.
      seq = seq[(frag_len + 1)..-1] || ''
    end
  end

  # Save output
  k = f.keys
  k.shuffle! if o[:shuffle]
  ofh = writer(o[:out])
  k.each do |frag_id|
    ofh.puts ">#{frag_id}"
    ofh.puts f[frag_id].gsub(/(\S{50})/, "\\1\n") # wrap at 50 columns
  end
  ofh.close
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Without an explicit status the script would end with exit code 0.
  exit 1
end
100
+