RubyGems - miga-base - Versions diffs - 0.4.3.0 → 0.5.0.0 - Mend

miga-base 0.4.3.0 → 0.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (120)

checksums.yaml +4 -4
data/README.md +1 -1
data/lib/miga/cli.rb +43 -223
data/lib/miga/cli/action/add.rb +91 -62
data/lib/miga/cli/action/classify_wf.rb +97 -0
data/lib/miga/cli/action/daemon.rb +14 -10
data/lib/miga/cli/action/derep_wf.rb +95 -0
data/lib/miga/cli/action/doctor.rb +83 -55
data/lib/miga/cli/action/get.rb +68 -52
data/lib/miga/cli/action/get_db.rb +206 -0
data/lib/miga/cli/action/index_wf.rb +31 -0
data/lib/miga/cli/action/init.rb +115 -190
data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
data/lib/miga/cli/action/ls.rb +20 -11
data/lib/miga/cli/action/ncbi_get.rb +199 -157
data/lib/miga/cli/action/preproc_wf.rb +46 -0
data/lib/miga/cli/action/quality_wf.rb +45 -0
data/lib/miga/cli/action/stats.rb +147 -99
data/lib/miga/cli/action/summary.rb +10 -4
data/lib/miga/cli/action/tax_dist.rb +61 -46
data/lib/miga/cli/action/tax_test.rb +46 -39
data/lib/miga/cli/action/wf.rb +178 -0
data/lib/miga/cli/base.rb +11 -0
data/lib/miga/cli/objects_helper.rb +88 -0
data/lib/miga/cli/opt_helper.rb +160 -0
data/lib/miga/daemon.rb +7 -4
data/lib/miga/dataset/base.rb +5 -5
data/lib/miga/project/base.rb +4 -4
data/lib/miga/project/result.rb +2 -1
data/lib/miga/remote_dataset/base.rb +5 -5
data/lib/miga/remote_dataset/download.rb +1 -1
data/lib/miga/version.rb +3 -3
data/scripts/cds.bash +3 -1
data/scripts/essential_genes.bash +1 -0
data/scripts/stats.bash +1 -1
data/scripts/trimmed_fasta.bash +5 -3
data/utils/distance/runner.rb +3 -0
data/utils/distance/temporal.rb +10 -1
data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
data/utils/enveomics/Scripts/SRA.download.bash +1 -1
data/utils/enveomics/Scripts/aai.rb +163 -128
data/utils/enveomics/build_enveomics_r.bash +11 -10
data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
data/utils/enveomics/enveomics.R/R/utils.R +31 -15
data/utils/enveomics/enveomics.R/README.md +7 -0
data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
data/utils/find-medoid.R +3 -2
data/utils/representatives.rb +5 -3
data/utils/subclade/pipeline.rb +22 -11
data/utils/subclade/runner.rb +5 -1
data/utils/subclades-compile.rb +1 -1
data/utils/subclades.R +9 -3
metadata +15 -4
data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26

data/lib/miga/daemon.rb CHANGED

@@ -35,13 +35,16 @@ class MiGA::Daemon < MiGA::MiGA
   ##
   # Initialize an unactive daemon for the MiGA::Project +project+. See #daemon
-  # to wake the daemon.
-  def initialize(project)
+  # to wake the daemon. If passed, +json+ must be the path to a daemon
+  # definition in json format. Otherwise, the project-stored daemon definition
+  # is used. In either case, missing variables are used as defined in
+  # ~/.miga_daemon.json.
+  def initialize(project, json = nil)
     $_MIGA_DAEMON_LAIR << self
     @project = project
+    json ||= File.expand_path('daemon/daemon.json', project.path)
     @runopts = MiGA::Json.parse(
-      File.expand_path('daemon/daemon.json', project.path),
-      default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
+      json, default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
     @jobs_to_run = []
     @jobs_running = []
     @loop_i = -1

data/lib/miga/dataset/base.rb CHANGED

@@ -35,14 +35,14 @@ module MiGA::Dataset::Base
   ##
   # Supported dataset types.
   @@KNOWN_TYPES = {
-    genome: {description: 'The genome from an isolate.', multi: false},
-    scgenome: {description: 'A Single-cell Amplidied Genome (SAG).',
+    genome: {description: 'The genome from an isolate', multi: false},
+    scgenome: {description: 'A Single-cell Amplified Genome (SAG)',
       multi: false},
-    popgenome: {description: 'A Metagenome-Assembled Genome (MAG).',
+    popgenome: {description: 'A Metagenome-Assembled Genome (MAG)',
       :multi=>false},
-    metagenome: {description: 'A metagenome (excluding viromes).',
+    metagenome: {description: 'A metagenome (excluding viromes)',
       multi: true},
-    virome: {description: 'A viral metagenome.', multi: true}
+    virome: {description: 'A viral metagenome', multi: true}
   }
   ##

data/lib/miga/project/base.rb CHANGED

@@ -76,13 +76,13 @@ module MiGA::Project::Base
   # Supported types of projects.
   @@KNOWN_TYPES = {
     mixed: {
-      description: "Mixed collection of genomes, metagenomes, and viromes.",
+      description: "Mixed collection of genomes, metagenomes, and viromes",
       single: true, multi: true},
-    genomes: {description: "Collection of genomes.",
+    genomes: {description: "Collection of genomes",
       single: true, multi: false},
-    clade: {description: "Collection of closely-related genomes (ANI >= 90%).",
+    clade: {description: "Collection of closely-related genomes (ANI >= 90%)",
       single: true, multi: false},
-    metagenomes: {description: "Collection of metagenomes and/or viromes.",
+    metagenomes: {description: "Collection of metagenomes and/or viromes",
       single: false, multi: true}
   }

data/lib/miga/project/result.rb CHANGED

@@ -110,7 +110,8 @@ module MiGA::Project::Result
       r.add_file(:proposal,      'miga-project.proposed-clades')
       r.add_file(:clades_aai90,  'miga-project.aai90-clades')
       r.add_file(:clades_ani95,  'miga-project.ani95-clades')
-      r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
+      r.add_file(:clades_gsp,  'miga-project.gsp-clades')
+      r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
       r
     end

data/lib/miga/remote_dataset/base.rb CHANGED

@@ -35,9 +35,9 @@ module MiGA::RemoteDataset::Base
   @@UNIVERSE = {
     web: {
       dbs: {
-        assembly: {stage: :assembly, format: :fasta},
-        assembly_gz: {stage: :assembly, format: :fasta_gz},
-        text: {stage: :metadata, format: :text}
+        assembly: { stage: :assembly, format: :fasta },
+        assembly_gz: { stage: :assembly, format: :fasta_gz },
+        text: { stage: :metadata, format: :text }
       },
       url: '%2$s',
       method: :net
@@ -59,8 +59,8 @@ module MiGA::RemoteDataset::Base
     },
     ncbi_map: {
       dbs: {
-        nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
-          format: :json},
+        nuccore: { stage: :metadata, map_to: [:biosample, :assembly],
+          format: :json },
         biosample: {stage: :metadata, map_to: [:assembly], format: :json}
       },
       url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",

data/lib/miga/remote_dataset/download.rb CHANGED

@@ -84,7 +84,7 @@ class MiGA::RemoteDataset
       end
       doc
     end
     ##
     # Looks for the entry +id+ in +dbfrom+, and returns the linked
     # identifier in +db+ (or nil).

data/lib/miga/version.rb CHANGED

@@ -10,15 +10,15 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.4, 3, 0]
+  VERSION = [0.5, 0, 0]
   ##
   # Nickname for the current major.minor version.
-  VERSION_NAME = 'aquatint'
+  VERSION_NAME = 'collotype'
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.new(2019, 9, 10)
+  VERSION_DATE = Date.new(2019, 11, 25)
   ##
   # Reference of MiGA.

data/scripts/cds.bash CHANGED

@@ -36,7 +36,9 @@ perl -pe 's/ID=([0-9]+_[0-9]+);/ID=gene_$1;/' "$DATASET.gff3" \
 mv "$DATASET.gff3.t" "$DATASET.gff3"
 # Gzip
-gzip -9 -f "$DATASET.gff3"
+for ext in gff3 faa fna ; do
+  [[ -e "$DATASET.$ext" ]] && gzip -9 -f "$DATASET.$ext"
+done
 # Finalize
 miga date > "$DATASET.done"

data/scripts/essential_genes.bash CHANGED

@@ -9,6 +9,7 @@ cd "$PROJECT/data/07.annotation/01.function/01.essential"
 # Initialize
 miga date > "${DATASET}.start"
 FAA="../../../06.cds/${DATASET}.faa"
+[[ -s "$FAA" ]] || FAA="${FAA}.gz"
 # Check if there are any proteins
 if [[ ! -s $FAA ]] ; then

data/scripts/stats.bash CHANGED

@@ -12,7 +12,7 @@ cd "$DIR"
 miga date > "$DATASET.start"
 # Calculate statistics
-for i in raw_reads trimmed_fasta assembly cds essential_genes distances ; do
+for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
   echo "# $i"
   miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
 done

data/scripts/trimmed_fasta.bash CHANGED

@@ -13,9 +13,11 @@ miga date > "$DATASET.start"
 # Gunzip (if necessary)
 for sis in 1 2 ; do
-  [[ -e "../02.trimmed_reads/$b.$sis.clipped.fastq.gz" \
-    && ! -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
-      && gunzip "../02.trimmed_reads/$b.$sis.clipped.fastq.gz"
+  for ext in clipped clipped.single ; do
+    [[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
+      && ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
+        && gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
+  done
 done
 miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f

data/utils/distance/runner.rb CHANGED

@@ -23,6 +23,9 @@ class MiGA::DistanceRunner
     @dataset = project.dataset(dataset_name)
     @home = File.expand_path('data/09.distances', project.path)
     # Default opts
+    if project.metadata[:aai_save_rbm] == false
+      @opts[:aai_save_rbm] ||= 'no-save-rbm'
+    end
     @opts[:aai_save_rbm] ||= ENV.fetch('MIGA_AAI_SAVE_RBM') do
       project.is_clade? ? 'save-rbm' : 'no-save-rbm'
     end

data/utils/distance/temporal.rb CHANGED

@@ -1,5 +1,6 @@
 require 'tmpdir'
+require 'zlib'
 module MiGA::DistanceRunner::Temporal
@@ -9,7 +10,15 @@ module MiGA::DistanceRunner::Temporal
     rf.each do |res, file|
       r = dataset.result(res)
       f = r.nil? ? nil : r.file_path(file)
-      FileUtils.cp(f, tmp_file("#{file}.fa")) unless f.nil?
+      unless f.nil?
+        if f =~ /\.gz/
+          File.open(tmp_file("#{file}.fa"), 'w') do |ofh|
+            Zlib::GzipReader.open(f) { |ifh| ofh.print ifh.read }
+          end
+        else
+          FileUtils.cp(f, tmp_file("#{file}.fa"))
+        end
+      end
     end
   end

data/utils/enveomics/Manifest/Tasks/fasta.json CHANGED

@@ -610,6 +610,11 @@
           "opt": "--defline",
           "description": "Keep the original defline after a space."
         },
+        {
+          "opt": "--list",
+          "arg": "in_file",
+          "description": "Reads a list of IDS."
+        },
         {
           "opt": "--quiet",
           "description": "Run quietly (no STDERR output)."

data/utils/enveomics/Manifest/Tasks/sequence-identity.json CHANGED

@@ -388,6 +388,13 @@
           "arg": "out_file",
           "description": "Output file containing the aligned proteins."
         },
+	{
+	  "opt": "--components",
+	  "arg": "out_file",
+	  "description": ["Output file containing the components of the",
+	    "estimation. Tab-delimited file with model name, matches, and",
+	    "columns."]
+	},
         {
           "opt": "--quiet",
           "description": "Run quietly (no STDERR output)."

data/utils/enveomics/Scripts/BlastTab.addlen.rb CHANGED

@@ -2,46 +2,46 @@
 #
 # @author: Luis M. Rodriguez-R
-# @update: Feb-06-2015
 # @license: artistic license 2.0
 #
 require 'optparse'
-o = {:subject=>FALSE, :quiet=>FALSE}
-ARGV << '-h' if ARGV.size==0
+o = { sbj: false, q: false }
+ARGV << '-h' if ARGV.size == 0
 OptionParser.new do |opts|
-   opts.banner = "
-Appends an extra column to a BLAST with the length of the query or the subject sequence.
-You can pipe two instances to add both:
-   cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
+  opts.banner = "
+Appends an extra column to a BLAST with the length of the query or the subject
+sequence. You can pipe two instances to add both:
+  cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
 Usage: #{$0} [options] < input.blast > output.blast"
-   opts.separator ""
-   opts.separator "Mandatory"
-   opts.on("-f", "--fasta FILE", "Path to the FastA file"){ |v| o[:fasta] = v }
-   opts.separator ""
-   opts.separator "Options"
-   opts.on("-s", "--subject",
-   	"Use the subject column of the BLAST, by default the query column is used"){ o[:subject] = TRUE }
-   opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:quiet] = TRUE }
-   opts.on("-h", "--help", "Display this screen") do
-      puts opts
-      exit
-   end
-   opts.separator ""
+  opts.separator ''
+  opts.separator 'Mandatory'
+  opts.on('-f', '--fasta FILE', 'Path to the FastA file'){ |v| o[:fasta] = v }
+  opts.separator ''
+  opts.separator 'Options'
+  opts.on('-s', '--subject',
+    'Use the subject column of the BLAST, by default the query column is used'
+    ){ o[:sbj] = true }
+  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
+  opts.on('-h', '--help', 'Display this screen') do
+    puts opts
+    exit
+  end
+  opts.separator ''
 end.parse!
-abort "-f is mandatory" if o[:fasta].nil?
+abort '-f is mandatory' if o[:fasta].nil?
 len = {}
-id  = ""
-$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:quiet]
-fh = File.open(o[:fasta], "r")
+id  = ''
+$stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:q]
+fh = File.open(o[:fasta], 'r')
 fh.each_line do |ln|
    defline = /^>(\S+)/.match(ln)
    if defline.nil?
       ln.gsub! /[^A-Za-z]/, ''
-      abort "Error: Unsupported format, expecting FastA" if len[id].nil?
+      abort 'Error: Unsupported format, expecting FastA' if len[id].nil?
       len[id] = len[id] + ln.size
    else
       id = defline[1]
@@ -50,12 +50,14 @@ fh.each_line do |ln|
 end
 fh.close
-$stderr.puts "Appending #{o[:subject]?"subject":"query"} length column" unless o[:quiet]
+unless o[:q]
+  $stderr.puts 'Appending %s length column' % (o[:sbj] ? 'subject' : 'query')
+end
 ARGF.each_line do |ln|
-   ln.chomp!
-   row = ln.split /\t/
-   id = o[:subject] ? row[1] : row[0];
-   abort "Impossible to find sequence of #{id}" if len[id].nil?
-   puts "#{ln}\t#{len[id]}"
+  ln.chomp!
+  row = ln.split /\t/
+  id = o[:sbj] ? row[1] : row[0]
+  abort "Impossible to find sequence of #{id}" if len[id].nil?
+  puts "#{ln}\t#{len[id]}"
 end

data/utils/enveomics/Scripts/FastA.tag.rb CHANGED

@@ -1,64 +1,65 @@
 #!/usr/bin/env ruby
-#
 # @author  Luis M. Rodriguez-R
-# @update  Oct-07-2015
 # @license artistic license 2.0
-#
 require 'optparse'
-o = {:q=>FALSE, :p=>"", :s=>"", :d=>FALSE}
+o = {q: false, p: '', s: '', d: false}
 ARGV << '-h' if ARGV.size==0
 OptionParser.new do |opts|
-   opts.banner = "
+  opts.banner = "
 Generates easy-to-parse tagged reads from FastA files.
 Usage: #{$0} [options]"
-   opts.separator ""
-   opts.separator "Mandatory"
-   opts.on("-i", "--in FILE",
-      "Path to the FastA file containing the sequences."){ |v| o[:in] = v }
-   opts.on("-o", "--out FILE",
-      "Path to the FastA to create."){ |v| o[:out] = v }
-   opts.separator ""
-   opts.separator "ID options"
-   opts.on("-p", "--prefix STR", "Prefix to use in all IDs."){ |v| o[:p] = v }
-   opts.on("-s", "--suffix STR", "Suffix to use in all IDs."){ |v| o[:s] = v }
-   opts.on("-d", "--defline",
-      "Keep the original defline after a space."){ o[:d]=TRUE }
-   opts.separator ""
-   opts.separator "Other Options"
-   opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
-   opts.on("-h", "--help", "Display this screen") do
-      puts opts
-      exit
-   end
-   opts.separator ""
+  opts.separator ''
+  opts.separator 'Mandatory'
+  opts.on('-i', '--in FILE',
+    'Path to the FastA file containing the sequences.'){ |v| o[:in] = v }
+  opts.on('-o', '--out FILE',
+    'Path to the FastA to create.'){ |v| o[:out] = v }
+  opts.separator ''
+  opts.separator 'ID options'
+  opts.on('-p', '--prefix STR', 'Prefix to use in all IDs.'){ |v| o[:p] = v }
+  opts.on('-s', '--suffix STR', 'Suffix to use in all IDs.'){ |v| o[:s] = v }
+  opts.on('-d', '--defline',
+    'Keep the original defline after a space.'){ o[:d] = true }
+  opts.on('-l', '--list FILE',
+    'Reads a list of IDS.'){ |v| o[:l] = v }
+  opts.separator ''
+  opts.separator 'Other Options'
+  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
+  opts.on('-h', '--help', 'Display this screen') do
+    puts opts
+    exit
+  end
+  opts.separator ''
 end.parse!
-abort "-i is mandatory" if o[:in].nil?
-abort "-o is mandatory" if o[:out].nil?
+abort '-i is mandatory' if o[:in].nil?
+abort '-o is mandatory' if o[:out].nil?
 begin
-   ifh = File.open(o[:in], 'r');
-   ofh = File.open(o[:out], 'w');
-   i=0
-   while ln=ifh.gets
+  list = o[:l].nil? ? nil :
+    File.readlines(o[:l]).map{ |i| i.chomp.gsub(/^>/, '') }
+  ofh = File.open(o[:out], 'w')
+  i = 0
+  File.open(o[:in], 'r') do |ifh|
+    ifh.each do |ln|
       ln.chomp!
       next if ln =~ /^;/
       unless /^>/.match(ln).nil?
-	 i+=1
-	 ofh.puts ">#{o[:p]}#{i}#{o[:s]}#{ o[:d]?" #{ln[1, ln.size-1]}":"" }"
+        i += 1
+        new_id = o[:l].nil? ? i : list.shift
+        ofh.puts ">#{o[:p]}#{new_id}#{o[:s]}#{o[:d]?" #{ln[1, ln.size-1]}":''}"
       else
-         ofh.puts ln
+        ofh.puts ln
       end
-   end
-   ifh.close
-   ofh.close
+    end
+  end
+  ofh.close
 rescue => err
-   $stderr.puts "Exception: #{err}\n\n"
-   err.backtrace.each { |l| $stderr.puts l + "\n" }
-   err
+  $stderr.puts "Exception: #{err}\n\n"
+  err.backtrace.each { |l| $stderr.puts l + "\n" }
+  err
 end

data/utils/enveomics/Scripts/HMM.essential.rb CHANGED

@@ -8,8 +8,10 @@ require 'enveomics_rb/enveomics'
 use 'tmpdir'
 use 'zlib'
-o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
-  archaea: false, genomeeq: false, metagenome: false, list: false}
+o = {
+  bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
+  archaea: false, genomeeq: false, metagenome: false, list: false
+}
 OptionParser.new do |opts|
   opts.banner = "
 Finds and extracts a collection of essential proteins suitable for genome
@@ -26,65 +28,86 @@ Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
 Usage: #{$0} [options]"
   opts.separator ''
   opts.separator 'Mandatory'
-  opts.on('-i', '--in FILE',
-    'Path to the FastA file containing all the proteins in a genome.'
-    ){ |v| o[:in] = v }
+  opts.on(
+    '-i', '--in FILE',
+    'Path to the FastA file (.gz allowed) with all the proteins in a genome'
+  ) { |v| o[:in] = v }
   opts.separator ''
   opts.separator 'Report Options'
-  opts.on('-o', '--out FILE',
-    'Path to the output FastA file with the translated essential genes.',
-    'By default the file is not produced.'){ |v| o[:out] = v }
-  opts.on('-m', '--per-model STR',
+  opts.on(
+    '-o', '--out FILE',
+    'Path to the output FastA file with the translated essential genes',
+    'By default the file is not produced'
+  ) { |v| o[:out] = v }
+  opts.on(
+    '-m', '--per-model STR',
     'Prefix of translated genes in independent files with the name of the',
-    'model appended. By default files are not produced.'
-    ){ |v| o[:permodel] = v }
-  opts.on('-R', '--report FILE',
-    'Path to the report file. By default, the report is sent to the STDOUT.'
-    ){ |v| o[:report] = v }
-  opts.on('--hmm-out FILE',
-    'Save HMMsearch output in this file. By default, not saved.'
-    ){ |v| o[:hmmout] = v }
-  opts.on('--alignments FILE',
+    'model appended. By default files are not produced'
+  ) { |v| o[:permodel] = v }
+  opts.on(
+    '-R', '--report FILE',
+    'Path to the report file. By default, the report is sent to the STDOUT'
+  ) { |v| o[:report] = v }
+  opts.on(
+    '--hmm-out FILE',
+    'Save HMMsearch output in this file. By default, not saved'
+  ) { |v| o[:hmmout] = v }
+  opts.on(
+    '--alignments FILE',
     'Save the aligned proteins in this file. By default, not saved'
-    ){ |v| o[:alignments] = v }
-  opts.on('-B', '--bacteria',
-    'If set, ignores models typically missing in Bacteria.'
-    ){ |v| o[:bacteria] = v }
-  opts.on('-A', '--archaea',
-    'If set, ignores models typically missing in Archaea.'
-    ){ |v| o[:archaea] = v }
-  opts.on('-G', '--genome-eq',
-    'If set, ignores models not suitable for genome-equivalents estimations.',
-    'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
-    ){ |v| o[:genomeeq] = v }
-  opts.on('-r', '--rename STR',
+  ) { |v| o[:alignments] = v }
+  opts.on(
+    '-B', '--bacteria',
+    'If set, ignores models typically missing in Bacteria'
+  ) { |v| o[:bacteria] = v }
+  opts.on(
+    '-A', '--archaea',
+    'If set, ignores models typically missing in Archaea'
+  ) { |v| o[:archaea] = v }
+  opts.on(
+    '-G', '--genome-eq',
+    'If set, ignores models not suitable for genome-equivalents estimations',
+    'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
+  ) { |v| o[:genomeeq] = v }
+  opts.on(
+    '-r', '--rename STR',
     'If set, renames the sequences with the string provided and appends it',
-    'with pipe and the gene name (except in --per-model files).'
-    ){ |v| o[:rename]=v }
-  opts.on('-n', '--no-stats',
-    'If set, no statistics are reported on genome evaluation.'
-    ){ |v| o[:stats] = v }
-  opts.on('-s', '--no-genes',
-    'If set, statistics won\'t include the lists of missing/multi-copy genes.'
-    ){ |v| o[:genes] = v }
-  opts.on('-M', '--metagenome',
+    'with pipe and the gene name (except in --per-model files)'
+  ) { |v| o[:rename] = v }
+  opts.on(
+    '-n', '--no-stats',
+    'If set, no statistics are reported on genome evaluation'
+  ) { |v| o[:stats] = v }
+  opts.on(
+    '-s', '--no-genes',
+    'If set, statistics won\'t include the lists of missing/multi-copy genes'
+  ) { |v| o[:genes] = v }
+  opts.on(
+    '-M', '--metagenome',
     'If set, it allows for multiple copies of each gene and turns on',
-    'metagenomic report mode.'){ |v| o[:metagenome] = v }
+    'metagenomic report mode'
+  ) { |v| o[:metagenome] = v }
   opts.separator ''
   opts.separator 'Other Options'
-  opts.on('-L', '--list-models',
+  opts.on(
+    '-L', '--list-models',
     'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
-    'and -q; ignores all other parameters.'){ |v| o[:list] = v }
-  opts.on('-b', '--bin DIR',
-    'Path to the directory containing the binaries of HMMer 3.0+.'
-    ){ |v| o[:bin] = v }
-  opts.on('--model-file',
-    'External file containing models to search.'){ |v| o[:model_file] = v }
-  opts.on('-t', '--threads INT',
-    "Number of parallel threads to be used.  By default: #{o[:thr]}."
-    ){ |v| o[:thr] = v.to_i }
-  opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
-  opts.on('-h', '--help', 'Display this screen.') do
+    'and -q; ignores all other parameters'
+  ) { |v| o[:list] = v }
+  opts.on(
+    '-b', '--bin DIR',
+    'Path to the directory containing the binaries of HMMer 3.0+'
+  ) { |v| o[:bin] = v }
+  opts.on(
+    '--model-file',
+    'External file containing models to search'
+  ) { |v| o[:model_file] = v }
+  opts.on(
+    '-t', '--threads INT', Integer,
+    "Number of parallel threads to be used.  By default: #{o[:thr]}"
+  ) { |v| o[:thr] = v }
+  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
+  opts.on('-h', '--help', 'Display this screen') do
     puts opts
     exit
   end
@@ -112,6 +135,13 @@ TIGR00389 TIGR00436 tRNA-synth_1d}
 begin
   Dir.mktmpdir do |dir|
     $stderr.puts "Temporal directory: #{dir}." unless o[:q]
+    if o[:in] =~ /\.gz/
+      tmp_in = File.expand_path('sequences.fa', dir)
+      Zlib::GzipReader.open(o[:in]) do |ifh|
+        File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
+      end
+      o[:in] = tmp_in
+    end
     # Create database.
     $stderr.puts 'Searching models.' unless o[:q]
@@ -144,9 +174,9 @@ begin
         'This script requires HMMER 3.0+.'
     end
     o[:hmmout] ||= "#{dir}/hmmsearch"
-    `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
-      -A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
-      > #{dir}/hmmsearch.log`
+    `'#{o[:bin]}hmmsearch' --cpu #{o[:thr]} --tblout '#{o[:hmmout]}' \
+      -A '#{dir}/a.sto' --cut_tc --notextw '#{dir}/essential.hmm' '#{o[:in]}' \
+      > '#{dir}/hmmsearch.log'`
     # Parse output
     $stderr.puts 'Parsing results.' unless o[:q]