RubyGems - miga-base - Versions diffs - 0.7.26.3 → 1.0.0.sr1 - Mend

miga-base 0.7.26.3 → 1.0.0.sr1

Files changed (105) hide show

checksums.yaml +4 -4
data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
data/lib/miga/cli/action/doctor.rb +50 -19
data/lib/miga/cli/action/doctor/base.rb +20 -18
data/lib/miga/cli/action/init.rb +11 -7
data/lib/miga/cli/action/init/files_helper.rb +1 -0
data/lib/miga/cli/action/ncbi_get.rb +3 -3
data/lib/miga/cli/action/tax_dist.rb +2 -2
data/lib/miga/cli/action/wf.rb +5 -4
data/lib/miga/daemon.rb +11 -4
data/lib/miga/dataset/result.rb +10 -6
data/lib/miga/json.rb +1 -2
data/lib/miga/metadata.rb +5 -1
data/lib/miga/parallel.rb +11 -6
data/lib/miga/project.rb +8 -8
data/lib/miga/project/base.rb +4 -4
data/lib/miga/project/result.rb +2 -2
data/lib/miga/sqlite.rb +7 -0
data/lib/miga/version.rb +23 -9
data/scripts/aai_distances.bash +16 -18
data/scripts/ani_distances.bash +16 -17
data/scripts/assembly.bash +31 -16
data/scripts/haai_distances.bash +3 -27
data/scripts/miga.bash +6 -4
data/scripts/p.bash +1 -1
data/scripts/read_quality.bash +9 -18
data/scripts/trimmed_fasta.bash +14 -30
data/scripts/trimmed_reads.bash +36 -36
data/test/parallel_test.rb +31 -0
data/test/project_test.rb +2 -1
data/utils/distance/commands.rb +1 -0
data/utils/distance/runner.rb +2 -4
data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
data/utils/enveomics/Manifest/Tasks/other.json +77 -0
data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
data/utils/enveomics/Manifest/categories.json +13 -4
data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
data/utils/enveomics/Scripts/SRA.download.bash +6 -8
data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
data/utils/enveomics/Scripts/aai.rb +3 -2
data/utils/enveomics/Scripts/anir.rb +137 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
data/utils/enveomics/Scripts/rbm.rb +87 -133
data/utils/enveomics/Scripts/sam.filter.rb +148 -0
data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
data/utils/enveomics/enveomics.R/R/utils.R +30 -0
data/utils/enveomics/enveomics.R/README.md +1 -0
data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
data/utils/multitrim/Multitrim How-To.pdf +0 -0
data/utils/multitrim/README.md +67 -0
data/utils/multitrim/multitrim.py +1555 -0
data/utils/multitrim/multitrim.yml +13 -0
data/utils/requirements.txt +4 -3
metadata +33 -6
data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30

data/test/parallel_test.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+require 'test_helper'
+class ParallelTest < Test::Unit::TestCase
+  include TestHelper
+  def test_distribute
+    declare_forks
+    base = tmpfile('base')
+    assert(!File.exist?("#{base}-3"))
+    MiGA::Parallel.distribute((0..3), 2) do |o, _k, t|
+      File.open("#{base}-#{o}", 'w') { |fh| fh.puts t }
+    end
+    assert(File.exist?("#{base}-3"))
+    assert(!File.exist?("#{base}-4"))
+    t = (0..3).map { |i| File.read("#{base}-#{i}").chomp.to_i }
+    assert_equal([0, 0, 1, 1], t.sort)
+  end
+  def test_thread_enum
+    MiGA::Parallel.thread_enum(%w[a b c d], 3, 1) do |o, _k, _t|
+      assert_equal('b', o)
+    end
+    n = 0
+    MiGA::Parallel.thread_enum(0..19, 4, 0) { n += 1 }
+    assert_equal(5, n)
+  end
+end

data/test/project_test.rb CHANGED Viewed

@@ -27,10 +27,11 @@ class ProjectTest < Test::Unit::TestCase
   def test_create
     assert_equal(tmpfile('create'), project('create').path)
     assert_path_exist(tmpfile('create'))
-    assert_raise do
+    err = capture_stderr do
       ENV['MIGA_HOME'] = tmpfile('chez-moi')
       project('cuckoo')
     end
+    assert_match(/Projects cannot be processed yet/, err.string)
   end
   def test_load

data/utils/distance/commands.rb CHANGED Viewed

@@ -169,6 +169,7 @@ module MiGA::DistanceRunner::Commands
         aai_data[out[1]] = [out[6].to_f, 0, 0, 0] if out[6] !~ /^>/
       end
     end
+    puts "Results: #{haai_data.size} | Inferences: #{aai_data.size}"
     batch_data_to_db(:haai, haai_data)
     batch_data_to_db(:aai, aai_data)

data/utils/distance/runner.rb CHANGED Viewed

@@ -18,10 +18,8 @@ class MiGA::DistanceRunner
       @ref_project = MiGA::Project.load(ref_path)
       raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
     elsif !opts[:run_taxonomy] && dataset.option(:db_project)
-      ref_path = dataset.option(:db_project)
-      if project.option(:db_proj_dir)
-        ref_path = File.expand_path(ref_path, project.option(:db_proj_dir))
-      end
+      ref_location = project.option(:db_proj_dir) || File.dirname(project.path)
+      ref_path = File.expand_path(dataset.option(:db_project), ref_location)
       @ref_project = MiGA::Project.load(ref_path)
       raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
     else

data/utils/enveomics/Manifest/Tasks/fasta.json CHANGED Viewed

@@ -4,7 +4,8 @@
       "task": "FastA.N50.pl",
       "description": ["Calculates the N50 value of a set of sequences.",
         "Alternatively, it can calculate other N** values. It also calculates",
-        "the total number of sequences and the total added length."],
+        "the total number of sequences, the total added length, and the",
+        "longest sequence length."],
       "help_arg": "",
       "see_also": ["FastA.length.pl"],
       "options": [
@@ -354,14 +355,14 @@
           "opt": "--in",
           "arg": "in_file",
           "mandatory": true,
-          "description": "Input FastA file."
+          "description": "Input FastA file (supports .gz compression)."
         },
         {
           "name": "Output file",
           "opt": "--out",
           "arg": "out_file",
           "mandatory": true,
-          "description": "Output FastA file."
+          "description": "Output FastA file (supports .gz compression)."
         },
         {
           "opt": "--fraction",
@@ -733,6 +734,41 @@
         }
       ]
     },
+    {
+      "task": "FastA.toFastQ.rb",
+      "description": "Creates a FastQ-compliant file from a FastA file.",
+      "see_also": "FastQ.toFastA.awk",
+      "help_arg": "--help",
+      "options": [
+        {
+          "name": "Input FastA",
+          "opt": "--in",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": "Input FastA file (supports .gz compression)."
+        },
+        {
+          "name": "Output FastQ",
+          "opt": "--out",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": "Output FastQ file (supports .gz compression)."
+        },
+        {
+          "opt": "--quality",
+          "arg": "integer",
+          "default": 31,
+          "description": ["PHRED quality score to use (fixed), in the range",
+            "[-5, 41]."]
+        },
+        {
+          "opt": "--encoding",
+          "arg": "integer",
+          "default": 33,
+          "description": "Base encoding (33 or 64)."
+        }
+      ]
+    },
     {
       "task": "FastA.wrap.rb",
       "description": "Wraps sequences in a FastA to a given line length.",

data/utils/enveomics/Manifest/Tasks/fastq.json CHANGED Viewed

@@ -81,6 +81,47 @@
         }
       ]
     },
+    {
+      "task": "FastQ.maskQual.rb",
+      "description": "Masks low-quality bases in a FastQ file.",
+      "help_arg": "--help",
+      "options": [
+        {
+          "opt": "--input",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": ["Path to the FastQ file containing the sequences.",
+            "Supports compression with .gz extension."]
+        },
+        {
+          "opt": "--output",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["Path to the output FastQ file.",
+            "Supports compression with .gz extension."]
+        },
+        {
+          "opt": "--qual",
+          "arg": "integer",
+          "default": 15,
+          "description": "Minimum quality score to allow a base."
+        },
+        {
+          "opt": "--offset",
+          "arg": "integer",
+          "default": 33,
+          "description": "Q-score offset."
+        },
+        {
+          "opt": "--fasta",
+          "description": "Output sequences in FastA format."
+        },
+        {
+          "opt": "--quiet",
+          "description": "Run quietly."
+        }
+      ]
+    },
     {
       "task": "FastQ.offset.pl",
       "description": ["There are several FastQ formats. This script takes a",
@@ -160,14 +201,20 @@
           "opt": "--in",
           "arg": "in_file",
           "mandatory": true,
-          "description": "FastQ file containing the sequences."
+          "description": [
+            "FastQ file containing the sequences.",
+            "Supports compression with .gz extension."
+          ]
         },
         {
           "name": "Output file",
           "opt": "--out",
           "arg": "out_file",
           "mandatory": true,
-          "description": "FastQ to create."
+          "description": [
+            "FastQ to create.",
+            "Supports compression with .gz extension."
+          ]
         },
         {
           "opt": "--prefix",
@@ -188,6 +235,7 @@
     {
       "task": "FastQ.toFastA.awk",
       "description": "Translates FastQ files into FastA.",
+      "see_also": "FastA.toFastQ.rb",
       "help_arg": "'' --help",
       "options": [
         "<",

data/utils/enveomics/Manifest/Tasks/mapping.json CHANGED Viewed

@@ -62,6 +62,76 @@
           "description": "Window size, in base pairs."
         }
       ]
+    },
+    {
+      "task": "sam.filter.rb",
+      "description": ["Filters a SAM or BAM file by target sequences and/or",
+        "identity."],
+      "see_also": ["anir.rb"],
+      "help_arg": "--help",
+      "options": [
+        {
+          "opt": "--genome",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": ["Genome assembly.",
+            "Supports compression with .gz extension."]
+        },
+        {
+          "opt": "--mapping",
+          "arg": "in_file",
+          "mandatory": true,
+          "description": ["Mapping file.",
+            "Supports compression with .gz extension."]
+        },
+        {
+          "opt": "--out-sam",
+          "arg": "out_file",
+          "mandatory": true,
+          "description": ["Output filtered file in SAM format.",
+            "Supports compression with .gz extension."]
+        },
+        {
+          "opt": "--g-format",
+          "arg": "select",
+          "values": ["fasta", "list"],
+          "default": "fasta",
+          "description": ["Genome assembly format."]
+        },
+        {
+          "opt": "--m-format",
+          "arg": "select",
+          "values": ["sam", "bam"],
+          "default": "sam",
+          "description": ["Mapping file format. SAM supports compression with",
+            ".gz file extension."]
+        },
+        {
+          "opt": "--identity",
+          "arg": "float",
+          "description": "Set a fixed threshold of percent identity.",
+          "default": 95.0
+        },
+        {
+          "opt": "--no-header",
+          "description": "Do not include the headers."
+        },
+        {
+          "opt": "--threads",
+          "arg": "integer",
+          "description": "Threads to use.",
+          "default": 2
+        },
+        {
+          "opt": "--log",
+          "arg": "out_file",
+          "description": "Log file to save output."
+        },
+        {
+          "opt": "--quiet",
+          "description": "Run quietly."
+        }
+      ]
     }
   ]
 }

data/utils/enveomics/Manifest/Tasks/other.json CHANGED Viewed

@@ -824,6 +824,83 @@
           "description": "Features to map in GFF."
         }
       ]
+    },
+    {
+      "task": "Table.prefScore.R",
+      "description": ["Estimate preference score of species based on occupancy",
+        "in biased sample sets."],
+      "help_arg": "--help",
+      "requires": [ { "r_package": "optparse" } ],
+      "options": [
+        {
+          "name": "Occupancy matrix",
+          "opt": "--x",
+          "arg": "in_file",
+          "description": ["A tab-delimited table of presence/absence (1/0)",
+            "with species as rows and samples as columns."],
+          "mandatory": true
+        },
+        {
+          "name": "Sample set",
+          "opt": "--set",
+          "arg": "in_file",
+          "description": ["A list of sample names that constitute the test",
+            "set, one per line."],
+          "mandatory": true
+        },
+        {
+          "opt": "--ignore",
+          "arg": "in_file",
+          "description": ["A list of species to exclude from the analysis,",
+            "one per line."]
+        },
+        {
+          "name": "Significance threshold",
+          "opt": "--signif-thr",
+          "arg": "float",
+          "description": "Absolute value of the significance threshold."
+        },
+        {
+          "opt": "--col-above",
+          "arg": "string",
+          "description": "Color for points significantly above zero.",
+          "default": "#941100"
+        },
+        {
+          "opt": "--col-equal",
+          "arg": "string",
+          "description": ["Color for points not significantly different from",
+            "zero."],
+          "default": "#BDBDBD"
+        },
+        {
+          "opt": "--col-below",
+          "arg": "string",
+          "description": "Color for points significantly below zero.",
+          "default": "#2F5496"
+        },
+        {
+          "name": "Output preference scores",
+          "arg": "out_file",
+          "description": "Output raw-text file with preference scores.",
+          "mandatory": true
+        },
+        {
+          "name": "Graphical output",
+          "arg": "out_file",
+          "description": "Output PDF file with preference scores plot."
+        },
+        {
+          "name": "Width",
+          "arg": "float",
+          "description": "Width of the plot in inches (7 by default)."
+        },
+        {
+          "name": "Height",
+          "arg": "float",
+          "description": "Height of the plot in inches (7 by default)."
+        }
+      ]
     }
   ]
 }

data/utils/enveomics/Manifest/Tasks/sequence-identity.json CHANGED Viewed

@@ -362,6 +362,139 @@
         }
       ]
     },
+    {
+      "task": "anir.rb",
+      "description": ["Estimates ANIr: the Average Nucleotide Identity of",
+        "reads against a genome."],
+      "help_arg": "--help",
+      "see_also": ["ani.rb", "sam.filter.rb"],
+      "options": [
+        {
+          "opt": "--reads",
+          "arg": "in_file",
+          "description": "Metagenomic reads."
+        },
+        {
+          "opt": "--genome",
+          "arg": "in_file",
+          "description": "Genome assembly."
+        },
+        {
+          "opt": "--mapping",
+          "arg": "in_file",
+          "description": "Mapping file."
+        },
+        {
+          "opt": "--list",
+          "arg": "out_file",
+          "description": "Output file with identities."
+        },
+        {
+          "opt": "--hist",
+          "arg": "out_file",
+          "description": "Output file with histogram."
+        },
+        {
+          "opt": "--tab",
+          "arg": "out_file",
+          "description": "Output file with results in tabular format."
+        },
+        {
+          "name": "Reads format",
+          "opt": "--r-format",
+          "arg": "select",
+          "description": ["Metagenomic reads format: fastq or fasta.",
+            "Both options support compression with .gz file extension."],
+          "values": ["fastq", "fasta"],
+          "default": "fastq"
+        },
+        {
+          "name": "Reads type",
+          "opt": "--r-type",
+          "arg": "select",
+          "description": ["Type of metagenomic reads: Single reads (single),",
+            "coupled reads in separate files (-m must be comma-delimited;",
+            "coupled), or coupled reads in a single interposed file",
+            "(interleaved)."],
+          "values": ["single", "coupled", "interleaved"],
+          "default": "single"
+        },
+        {
+          "name": "Genome format",
+          "opt": "--g-format",
+          "arg": "select",
+          "description": ["Genome assembly format: fasta or list.",
+            "Both options support compression with .gz file extension.",
+            "If passed in mapping-read mode, filters only matches to these",
+            "contigs."],
+          "values": ["fasta", "list"],
+          "default": "fasta"
+        },
+        {
+          "name": "Mapping format",
+          "opt": "--m-format",
+          "arg": "select",
+          "description": ["Mapping file format: sam, bam, tab, or list.",
+            "All except bam support compression with .gz file extension."],
+          "values": ["sam", "bam", "tab", "list"],
+          "default": "sam"
+        },
+        {
+          "opt": "--identity",
+          "arg": "float",
+          "description": "Set a fixed threshold of percent identity.",
+          "default": 95.0
+        },
+        {
+          "opt": "--algorithm",
+          "arg": "select",
+          "description": ["Set an algorithm to automatically detect identity",
+            "threshold: Valley detection by E-M of Gaussian Mixture Model",
+            "(gmm), fixed threshold (see Identity; fix),",
+            "Pick gmm or fix depending on bimodality (see Bimodality; auto)."],
+          "values": ["gmm", "fix", "auto"],
+          "default": "auto"
+        },
+        {
+          "opt": "--bimodality",
+          "arg": "float",
+          "description": ["Threshold of bimodality below which the algorithm",
+            "is set to fix. The coefficient used is the de Michele & Accatino",
+            "(2014) B index."],
+          "default": 0.5
+        },
+        {
+          "opt": "--coefficient",
+          "arg": "select",
+          "description": ["Coefficient of bimodality for Algorithm auto: ",
+            "Sarle's bimodality coefficient b (sarle), or",
+            "de Michele and Accatino (2014 PLoS ONE) B index",
+            "(use with Bimodality 0.1, dma)."],
+          "values": ["sarle", "dma"],
+          "default": "sarle"
+        },
+        {
+          "opt": "--bin-size",
+          "arg": "float",
+          "description": "Width of histogram bins (in percent identity).",
+          "default": 1.0
+        },
+        {
+          "opt": "--threads",
+          "arg": "integer",
+          "description": "Threads to use."
+        },
+        {
+          "opt": "--log",
+          "arg": "out_file",
+          "description": "Log file to save output."
+        },
+        {
+          "opt": "--quiet",
+          "description": "Run quietly."
+        }
+      ]
+    },
     {
       "task": "HMM.haai.rb",
       "description": ["Estimates Average Amino Acid Identity (AAI) from the",
@@ -407,10 +540,14 @@
         "sequences."],
       "help_arg": "--help",
       "cite":[
+        ["Camacho et al, 2009, BMC Bioinf (BLAST+)",
+          "https://doi.org/10.1186/1471-2105-10-421"],
         ["Altschul et al, 2000, JMB (BLAST)",
           "http://dx.doi.org/10.1016/S0022-2836(05)80360-2"],
         ["Buchfink B, Xie C, Huson D, 2015, Nat Meth (Diamond)",
-          "https://dx.doi.org/10.1038/nmeth.3176"]
+          "https://dx.doi.org/10.1038/nmeth.3176"],
+        ["Kent, 2002, Genome Res (BLAT)",
+          "https://doi.org/10.1101/gr.229202"]
       ],
       "options": [
         {