RubyGems - miga-base - Versions diffs - 0.5.0.0 → 0.5.1.0 - Mend

miga-base 0.5.0.0 → 0.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

checksums.yaml +4 -4
data/lib/miga/cli/action/doctor.rb +6 -1
data/lib/miga/cli/action/init.rb +1 -1
data/lib/miga/cli/action/quality_wf.rb +1 -0
data/lib/miga/cli/action/stats.rb +9 -8
data/lib/miga/cli/action/wf.rb +5 -0
data/lib/miga/cli/objects_helper.rb +1 -0
data/lib/miga/common/format.rb +5 -2
data/lib/miga/daemon.rb +2 -2
data/lib/miga/project/dataset.rb +8 -7
data/lib/miga/version.rb +2 -2
data/scripts/essential_genes.bash +9 -8
data/scripts/mytaxa.bash +3 -1
data/scripts/mytaxa_scan.bash +15 -8
data/utils/domain-ess-genes.rb +63 -0
data/utils/enveomics/Manifest/Tasks/other.json +21 -2
data/utils/enveomics/Manifest/examples.json +4 -4
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
data/utils/enveomics/Scripts/HMM.essential.rb +54 -17
data/utils/enveomics/Scripts/lib/data/{essential.hmm.gz → dupont_2012_essential.hmm.gz} +0 -0
data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
data/utils/enveomics/enveomics.R/R/df2dist.R +16 -17
data/utils/enveomics/enveomics.R/R/recplot2.R +20 -15
data/utils/enveomics/enveomics.R/README.md +1 -0
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +5 -4
data/utils/find-medoid.R +6 -1
data/utils/mytaxa_scan.rb +49 -46
data/utils/ref-tree.R +6 -1
data/utils/subclades-nj.R +6 -1
data/utils/subclades.R +6 -1
metadata +6 -6
data/utils/arch-ess-genes.rb +0 -57

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e370d282f1b28480765e1b91fcb7d8921d12baa31d22db1318975a1c2a79e19a
-  data.tar.gz: e7fb3941fd3381e0e9696a2c577aeb157657335e56434e7c6d6650be7ba45e98
+  metadata.gz: f6888c1ce3756b8cc708736c0da052e5a7396277e0c903ebcfc083f17b6915e7
+  data.tar.gz: d998f6e087316a81de4aa8897452344c1987ce0cb9807f4a1e11a29f52dfbcf2
 SHA512:
-  metadata.gz: 4642a212e1b4021e211fd144b515ff49e9ddb7a9b2292430553307a7ae165e4d8d5e6fd8426757f15ea6e70f4c3efbb055e0439497172cc1f91186d522c82635
-  data.tar.gz: 8d5d3ded3c03e56505572102110a4bca4b84d06b2e73bcf208856610a4cd6e60092ce6d54d47dcef2c8acf85f1cce5f8461097e8699344df6738ed8493215112
+  metadata.gz: c6f7f8af791664b2bb704744535e0e39c4d5fc06521beb8feb57f658d6187a667100fde4312a3a8e5f47f5dd9d4b3c06326584d95e80262dc0e02a91795e192c
+  data.tar.gz: 4f633972d8ccc1cc06cc14ca6c48b50759d63af72ba75514a0e877a58af4e1d407fda2c2608c2077dd541e30e86add8359bc2e0838d93a19f7cc1bd5c5f5fff2

data/lib/miga/cli/action/doctor.rb CHANGED Viewed

@@ -104,6 +104,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
         unless ok
           cli.say "  > Registering again #{d.name}:#{r_k}"
           d.add_result(r_k, true, force: true)
+          sr = d.result(:stats) and sr.remove!
         end
       end
     end
@@ -123,7 +124,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
           changed = true
         end
       end
-      d.add_result(:cds, true, force: true) if changed
+      if changed
+        d.add_result(:cds, true, force: true)
+        sr = d.result(:stats) and sr.remove!
+      end
     end
   end
@@ -136,6 +140,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
       if dir.nil?
         cli.say "  > Removing #{d.name}:essential_genes"
         res.remove!
+        sr = d.result(:stats) and sr.remove!
         next
       end
       next if Dir["#{dir}/*.faa"].empty?

data/lib/miga/cli/action/init.rb CHANGED Viewed

@@ -220,7 +220,7 @@ BASH
   def check_r_packages(paths)
     cli.puts 'Looking for R packages:'
-    %w(enveomics.R ape cluster vegan).each do |pkg|
+    %w(ape cluster vegan).each do |pkg|
       cli.print "Testing #{pkg}... "
       if test_r_package(cli, paths, pkg)
         cli.puts 'yes.'

data/lib/miga/cli/action/quality_wf.rb CHANGED Viewed

@@ -25,6 +25,7 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
       %w[project_stats haai_distances aai_distances ani_distances clade_finding]
         .map { |i| ["run_#{i}", false] }
     ]
+    p_metadata[:ess_coll] = cli[:ess_coll]
     d_metadata = { run_distances: false }
     d_metadata[:run_mytaxa_scan] = false unless cli[:mytaxa]
     p = create_project(:assembly, p_metadata, d_metadata)

data/lib/miga/cli/action/stats.rb CHANGED Viewed

@@ -122,17 +122,18 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
         end
       end
     else
-      # Fix estimate for Archaea
-      if !d.metadata[:tax].nil? &&
-            d.metadata[:tax].in?(Taxonomy.new('d:Archaea')) &&
-            r.file_path(:bac_report).nil?
-        scr = "#{MiGA.root_path}/utils/arch-ess-genes.rb"
+      # Fix estimate by domain
+      if !(tax = d.metadata[:tax]).nil? &&
+            %w[Archaea Bacteria].include?(tax[:d]) &&
+            r.file_path(:raw_report).nil?
+        scr = "#{MiGA.root_path}/utils/domain-ess-genes.rb"
         rep = r.file_path(:report)
         rc_p = File.expand_path('.miga_rc', ENV['HOME'])
         rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
-        $stderr.print `#{rc} ruby '#{scr}' '#{rep}' '#{rep}.archaea'`
-        r.add_file(:bac_report, "#{d.name}.ess/log")
-        r.add_file(:report, "#{d.name}.ess/log.archaea")
+        $stderr.print `#{rc} ruby '#{scr}' \
+          '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
+        r.add_file(:raw_report, "#{d.name}.ess/log")
+        r.add_file(:report, "#{d.name}.ess/log.domain")
       end
       # Extract/compute quality values
       stats = {completeness: [0.0, '%'], contamination: [0.0, '%']}

data/lib/miga/cli/action/wf.rb CHANGED Viewed

@@ -24,6 +24,11 @@ module MiGA::Cli::Action::Wf
     opt.separator "    FILES...: #{files_desc}"
     opt.separator ''
     opt.separator 'Workflow Control Options'
+    opt.on(
+      '-C', '--collection STRING',
+      'Collection of essential genes to use as reference',
+      'One of: dupont_2012 (default), lee_2019'
+    ) { |v| cli[:ess_coll] = v }
     if params[:ncbi]
       opt.on(
         '-T', '--ncbi-taxon STRING',

data/lib/miga/cli/objects_helper.rb CHANGED Viewed

@@ -66,6 +66,7 @@ module MiGA::Cli::ObjectsHelper
   end
   def add_metadata(obj, cli = self)
+    raise "Unsupported object: #{obj.class}" unless obj.respond_to? :metadata
     cli[:metadata].split(',').each do |pair|
       (k,v) = pair.split('=')
       case v

data/lib/miga/common/format.rb CHANGED Viewed

@@ -25,10 +25,13 @@ module MiGA::Common::Format
   # Cleans a FastA file in place.
   def clean_fasta_file(file)
     tmp_fh = nil
+    tmp_path = nil
     begin
       if file =~ /\.gz/
         tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
-        tmp_fh = Zlib::GzipWriter.open(tmp_path)
+        File.unlink tmp_path
+        tmp_path += '.gz'
+        tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
         fh = Zlib::GzipReader.open(file)
       else
         tmp_fh = Tempfile.new('MiGA')
@@ -50,7 +53,7 @@ module MiGA::Common::Format
       tmp_fh.print buffer.wrap_width(80)
       tmp_fh.close
       fh.close
-      FileUtils.cp(tmp_path, file)
+      FileUtils.mv(tmp_path, file)
     ensure
       begin
         tmp_fh.close unless tmp_fh.nil?

data/lib/miga/daemon.rb CHANGED Viewed

@@ -285,10 +285,10 @@ class MiGA::Daemon < MiGA::MiGA
       if [nil, '', 0].include? job[:pid]
         job[:pid] = nil
         @jobs_to_run << job
-        say "Unsuccessful #{job[:task_name]}, rescheduling."
+        say "Unsuccessful #{job[:task_name]}, rescheduling"
       else
         @jobs_running << job
-        say "Spawned pid:#{job[:pid]} for #{job[:task_name]}."
+        say "Spawned pid:#{job[:pid]} for #{job[:task_name]}"
       end
     end
 end

data/lib/miga/project/dataset.rb CHANGED Viewed

@@ -4,7 +4,7 @@
 ##
 # Helper module including specific functions handle datasets.
 module MiGA::Project::Dataset
   ##
   # Returns Array of MiGA::Dataset.
   def datasets
@@ -23,7 +23,7 @@ module MiGA::Project::Dataset
   def dataset_names_hash
     @dataset_names_hash ||= Hash[dataset_names.map{ |i| [i,true] }]
   end
   ##
   # Returns MiGA::Dataset.
   def dataset(name)
@@ -47,18 +47,19 @@ module MiGA::Project::Dataset
       end
     end
   end
   ##
   # Add dataset identified by +name+ and return MiGA::Dataset.
   def add_dataset(name)
     unless metadata[:datasets].include? name
       MiGA::Dataset.new(self, name)
       @metadata[:datasets] << name
+      @dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
       save
     end
     dataset(name)
   end
   ##
   # Unlink dataset identified by +name+ and return MiGA::Dataset.
   def unlink_dataset(name)
@@ -68,7 +69,7 @@ module MiGA::Project::Dataset
     save
     d
   end
   ##
   # Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
   # supported by File#generic_transfer.
@@ -116,7 +117,7 @@ module MiGA::Project::Dataset
     end
     datasets.uniq - metadata[:datasets]
   end
   ##
   # Are all the datasets in the project preprocessed? Save intermediate results
   # if +save+ (until the first incomplete dataset is reached).
@@ -149,6 +150,6 @@ module MiGA::Project::Dataset
   def each_dataset_profile_advance(&blk)
     each_dataset { |ds| blk.call(ds.profile_advance) }
   end
 end

data/lib/miga/version.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.5, 0, 0]
+  VERSION = [0.5, 1, 0]
   ##
   # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.new(2019, 11, 25)
+  VERSION_DATE = Date.new(2020, 1, 6)
   ##
   # Reference of MiGA.

data/scripts/essential_genes.bash CHANGED Viewed

@@ -22,18 +22,19 @@ fi
 # Find and extract essential genes
 [[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
 mkdir "${DATASET}.ess"
-TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
+TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" \
   --metadata "type" | awk '{print $2}')
+COLL=$(miga about -P "$PROJECT" -m ess_coll)
+[[ "$COLL" == "?" ]] && COLL=dupont_2012
+CMD="HMM.essential.rb \
+  -i '$FAA' -o '${DATASET}.ess.faa' -m '${DATASET}.ess/' \
+  -t '$CORES' -r '$DATASET' --collection '$COLL'"
 if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
-  HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
-    -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
-    > "${DATASET}.ess/log"
+  CMD="$CMD --metagenome"
 else
-  HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
-    -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
-    --alignments "${DATASET}.ess/proteins.aln" \
-    > "${DATASET}.ess/log"
+  CMD="$CMD --alignments '${DATASET}.ess/proteins.aln'"
 fi
+$CMD > "${DATASET}.ess/log"
 # Reduce files
 if exists "$DATASET".ess/*.faa ; then

data/scripts/mytaxa.bash CHANGED Viewed

@@ -38,7 +38,9 @@ else
     fi
     # Execute search
-    diamond blastp -q "../../../06.cds/$DATASET.faa" -d "$MT/AllGenomes.faa" \
+    FAA="../../../06.cds/$DATASET.faa"
+    [[ -s "$FAA" ]] || FAA="${FAA}.gz"
+    diamond blastp -q "$FAA" -d "$MT/AllGenomes.faa" \
       -a "$DATASET.daa" -k 5 -p "$CORES" --min-score 60
     diamond view -a "$DATASET.daa" -o "$DATASET.blast"

data/scripts/mytaxa_scan.bash CHANGED Viewed

@@ -39,12 +39,13 @@ else
       exit 1
     fi
+    FAA="../../../06.cds/$DATASET.faa"
+    [[ -s "$FAA" ]] || FAA="${FAA}.gz"
     if [[ ! -s "$DATASET.mytaxa" ]] ; then
       # Execute search
       if [[ ! -s "$DATASET.blast" ]] ; then
-        diamond blastp -q "../../../06.cds/$DATASET.faa" \
-          -d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60 \
-          -a "$DATASET.daa" -t "$TMPDIR"
+        diamond blastp -q "$FAA" -a "$DATASET.daa" -t "$TMPDIR" \
+          -d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
         diamond view -a "$DATASET.daa" -o "$DATASET.blast" -t "$TMPDIR"
       fi
@@ -53,8 +54,7 @@ else
         | sort -k 13 > "$DATASET.mytaxain"
       "$MT/MyTaxa" "$DATASET.mytaxain" "$DATASET.mytaxa" "0.5"
     fi
-    ruby "$MIGA/utils/mytaxa_scan.rb" "../../../06.cds/$DATASET.faa" \
-          "$DATASET.mytaxa" "$DATASET.wintax"
+    ruby "$MIGA/utils/mytaxa_scan.rb" "$FAA" "$DATASET.mytaxa" "$DATASET.wintax"
     echo "
     source('$MIGA/utils/mytaxa_scan.R');
     pdf('$DATASET.pdf', 12, 7);
@@ -70,11 +70,18 @@ else
         let i=$i+1
         awk "NR==$win" "$DATASET.wintax.genes" | tr "\\t" "\\n" \
           > "$DATASET.reg/$i.ids"
-        FastA.filter.pl -q "$DATASET.reg/$i.ids" \
-          "../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
+        if [[ "$FAA" == *.gz ]] ; then
+          gzip -c -d "$FAA" \
+            | FastA.filter.pl -q "$DATASET.reg/$i.ids" /dev/stdin \
+            > "$DATASET.reg/$i.faa"
+        else
+          FastA.filter.pl -q "$DATASET.reg/$i.ids" "$FAA" \
+            > "$DATASET.reg/$i.faa"
+        fi
       done
       # Archive regions
-      tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
+      tar -cf "$DATASET.reg.tar" "$DATASET.reg"
+      gzip -9 "$DATASET.reg.tar"
       rm -r "$DATASET.reg"
     fi

data/utils/domain-ess-genes.rb ADDED Viewed

@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+esslog = ARGV.shift
+outlog = ARGV.shift
+domain = ARGV.shift
+def quality(hsh)
+  q = {}
+  q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
+  q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
+  q[:cmp] = 100.0*q[:found].to_f/hsh.size
+  q[:cnt] = 100.0*q[:multi].to_f/hsh.size
+  q
+end
+# Find collection and detected anomalies
+cnt_ref = {}
+at = :header
+collection = 'dupont_2012'
+File.open(esslog, 'r') do |fh|
+  fh.each_line do |ln|
+    v = ln.chomp.gsub(/^! +/, '')
+    if v == 'Multiple copies: '
+      at = :multi
+    elsif v == 'Missing genes: '
+      at = :missing
+    elsif v =~ /Collection: (\S+)/
+      collection = $1
+    elsif at == :multi
+      v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
+      cnt_ref[$2] = $1.to_i
+    elsif at == :missing
+      v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
+      cnt_ref[$1] = 0
+    end
+  end
+end
+# Find expected genes for domain
+n_dom = Hash[
+  `HMM.essential.rb -L -q '-#{domain}' -c '#{collection}'`
+    .chomp.split("\n").map { |i| i.split("\t") }
+]
+l_dom = n_dom.keys
+cnt_dom = {}
+l_dom.each { |i| cnt_dom[i] = cnt_ref[i] || 1 }
+#  Correct report
+q = quality(cnt_dom)
+File.open(outlog, 'w') do |ofh|
+  ofh.puts "! Collection: #{collection} #{domain}"
+  ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_dom.size}."
+  ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
+  ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
+  if q[:multi] > 0
+    ofh.puts "! Multiple copies: "
+    cnt_dom.each{ |k,v| ofh.puts "!   #{v} #{k}: #{n_dom[k]}." if v>1 }
+  end
+  if q[:found] < cnt_dom.size
+    ofh.puts "! Missing genes: "
+    cnt_dom.each{ |k,v| ofh.puts "!   #{k}: #{n_dom[k]}." if v==0 }
+  end
+end

data/utils/enveomics/Manifest/Tasks/other.json CHANGED Viewed

@@ -371,8 +371,18 @@
           "source_url": "http://hmmer.janelia.org/software"
         }
       ],
-      "cite": [["Eddy, 2011, PLoS CB",
-        "http://dx.doi.org/10.1371/journal.pcbi.1002195"]],
+      "cite": [
+        ["Eddy, 2011, PLoS CB",
+          "http://dx.doi.org/10.1371/journal.pcbi.1002195"],
+        ["Dupont et al, 2012, ISME J",
+          "https://doi.org/10.1038/ismej.2011.189"],
+        ["Rodriguez-R et al, 2014, ISME J",
+          "https://doi.org/10.1038/ismej.2015.5"],
+        ["Lee, 2019, Bioinf",
+          "https://doi.org/10.1093/bioinformatics/btz188"],
+        ["Eren et al, 2015, PeerJ",
+          "https://doi.org/10.7717/peerj.1319"]
+      ],
       "options": [
         {
           "name": "Input file",
@@ -381,6 +391,15 @@
           "mandatory": true,
           "description": "FastA file containing all the proteins in the genome."
         },
+        {
+          "opt": "--collection",
+          "arg": "string",
+          "default": "dupont_2012",
+          "description": ["Reference collection of essential proteins to use.",
+            "One of: dupont_2012 (default, Dupont et al 2012 modified by",
+            "Rodriguez-R et al 2015), or lee_2019 (Lee 2019 modified by Eren",
+            "et al 2015)."]
+        },
         {
           "name": "Output file",
           "opt": "--out",

data/utils/enveomics/Manifest/examples.json CHANGED Viewed

@@ -64,15 +64,15 @@
       "task": "HMM.essential.rb",
       "description": ["Typical single-copy bacterial genes present in",
         "Mycoplasma genitalium."],
-      "values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
-        null,null,null,null,null,null,null,null]
+      "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
+        null,null,null,null,null,null,null,null,null]
     },
     {
       "task": "HMM.essential.rb",
       "description": ["Typical single-copy archaeal genes present in",
         "Nanoarchaeum equitans."],
-      "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
-        null,null,null,null,null,null,null,null]
+      "values": ["Mgen_M2288.faa",null,null,null,null,null,null,null,true,null,
+        null,null,null,null,null,null,null,null,null]
     },
     {
       "task": "Newick.autoprune.R",

data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl CHANGED Viewed

	@@ -1 +1 @@
1	- ~~utils/enveomics/Pipelines/assembly.pbs/../../~~Scripts/FastA.N50.pl
1	+ ../../Scripts/FastA.N50.pl

data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl CHANGED Viewed

	@@ -1 +1 @@
1	- ~~utils/enveomics/Pipelines/assembly.pbs/../../~~Scripts/FastA.filterN.pl
1	+ ../../Scripts/FastA.filterN.pl

data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl CHANGED Viewed

	@@ -1 +1 @@
1	- ~~utils/enveomics/Pipelines/assembly.pbs/../../~~Scripts/FastA.length.pl
1	+ ../../Scripts/FastA.length.pl

data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl CHANGED Viewed

	@@ -1 +1 @@
1	- ~~utils/enveomics/Pipelines/blast.pbs/../../~~Scripts/FastA.split.pl
1	+ ../../Scripts/FastA.split.pl

data/utils/enveomics/Scripts/HMM.essential.rb CHANGED Viewed

@@ -10,7 +10,8 @@ use 'zlib'
 o = {
   bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
-  archaea: false, genomeeq: false, metagenome: false, list: false
+  archaea: false, genomeeq: false, metagenome: false, list: false,
+  collection: 'dupont_2012'
 }
 OptionParser.new do |opts|
   opts.banner = "
@@ -33,7 +34,15 @@ Usage: #{$0} [options]"
     'Path to the FastA file (.gz allowed) with all the proteins in a genome'
   ) { |v| o[:in] = v }
   opts.separator ''
-  opts.separator 'Report Options'
+  opts.separator 'Options'
+  opts.on(
+    '-c', '--collection STR',
+    'Reference collection of essential proteins to use. One of:',
+    '> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
+    '  modified by https://doi.org/10.1038/ismej.2015.5',
+    '> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
+    '  modified by https://doi.org/10.7717/peerj.1319'
+  ) { |v| o[:collection] = v }
   opts.on(
     '-o', '--out FILE',
     'Path to the output FastA file with the translated essential genes',
@@ -117,20 +126,44 @@ abort '-i is mandatory' if o[:in].nil? and not o[:list]
 o[:bin] = o[:bin] + '/' if o[:bin].size > 0
 o[:rename] = nil if o[:metagenome]
-not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
-TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062 TIGR00082 TIGR00086
-TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158 TIGR00165 TIGR00166 TIGR00168
-TIGR00362 TIGR00388 TIGR00396 TIGR00409 TIGR00418 TIGR00420 TIGR00422 TIGR00436
-TIGR00459 TIGR00460 TIGR00472 TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663
-TIGR00775 TIGR00810 TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964
-TIGR00967 TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
-TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059 TIGR01063
-TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169 TIGR01171 TIGR01391
-TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013 TIGR02027 TIGR02191 TIGR02350
-TIGR02386 TIGR02387 TIGR02397 TIGR02432 TIGR02729 TIGR03263 TIGR03594}
-not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
-not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
-TIGR00389 TIGR00436 tRNA-synth_1d}
+case o[:collection]
+when 'dupont_2012'
+  not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
+  TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
+  TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
+  TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
+  TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
+  TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
+  TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
+  TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
+  TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
+  TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
+  TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
+  TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
+  TIGR02729 TIGR03263 TIGR03594}
+  not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
+  not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
+  TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d}
+when 'lee_2019'
+  not_in_archaea = %w{ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
+  EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
+  Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
+  Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
+  Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
+  Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
+  RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC}
+  not_in_bacteria = %w{AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
+  CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
+  DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
+  Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
+  Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
+  Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
+  Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
+  Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I}
+  not_as_genomeeq = not_in_archaea + not_in_bacteria
+else
+  raise "Unsupported collection: '#{o[:collection]}'"
+end
 begin
   Dir.mktmpdir do |dir|
@@ -148,7 +181,8 @@ begin
     models = {}
     model_id = nil
     dbh = File.open("#{dir}/essential.hmm", 'w')
-    o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
+    o[:model_file] ||= File.expand_path(
+      "../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
     mfh = (File.extname(o[:model_file]) == '.gz') ?
       Zlib::GzipReader.open(o[:model_file]) :
       File.open(o[:model_file], 'r')
@@ -201,6 +235,9 @@ begin
     # Report statistics
     if o[:stats]
       reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
+      modifiers = [:bacteria, :archaea, :genomeeq]
+        .map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
+      reph.puts "! Collection: #{o[:collection]} #{modifiers}"
       if o[:metagenome]
         reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
         gc = [0] * (models.size - genes.size) +

data/utils/enveomics/Scripts/lib/data/{essential.hmm.gz → dupont_2012_essential.hmm.gz} RENAMED Viewed

File without changes

data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz ADDED Viewed

Binary file

data/utils/enveomics/Scripts/lib/enveomics.R CHANGED Viewed

	@@ -1 +1 @@
1	- ~~utils/~~enveomics~~/Scripts/lib/../../enveomics~~.R
1	+ ../../enveomics.R

data/utils/enveomics/enveomics.R/DESCRIPTION CHANGED Viewed

@@ -1,5 +1,5 @@
 Package: enveomics.R
-Version: 1.7.0
+Version: 1.7.1
 Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
 	   email="lmrodriguezr@gmail.com"))
 Title: Various Utilities for Microbial Genomics and Metagenomics

data/utils/enveomics/enveomics.R/R/df2dist.R CHANGED Viewed

@@ -25,25 +25,24 @@
 enve.df2dist <- function(
   x,
-  obj1.index=1,
-  obj2.index=2,
-  dist.index=3,
-  default.d=NA,
-  max.sim=0
+  obj1.index = 1,
+  obj2.index = 2,
+  dist.index = 3,
+  default.d = NA,
+  max.sim = 0
 ){
-  x <- as.data.frame(x);
-  a <- as.character(x[, obj1.index]);
-  b <- as.character(x[, obj2.index]);
-  d <- as.double(x[, dist.index]);
-  if(max.sim!=0) d <- (max.sim - d)/max.sim
-  ids <- unique(c(a,b));
-  m <- matrix(default.d, nrow=length(ids), ncol=length(ids), dimnames=list(ids, ids));
+  x <- as.data.frame(x)
+  a <- as.character(x[, obj1.index])
+  b <- as.character(x[, obj2.index])
+  d <- as.double(x[, dist.index])
+  if(max.sim != 0) d <- (max.sim - d) / max.sim
+  ids <- unique(c(a,b))
+  m <- matrix(default.d,
+    nrow = length(ids), ncol = length(ids), dimnames = list(ids, ids))
   diag(m) <- 0.0
-  for(i in 1:nrow(x)){
-    m[a[i], b[i]] <- d[i];
-  }
-  m <- pmin(m, t(m), na.rm=TRUE)
-  return(as.dist(m));
+  m[cbind(a,b)] <- d
+  m <- pmin(m, t(m), na.rm = TRUE)
+  return(as.dist(m))
 }
 #' Enveomics: Data Frame to Dist (Group)

data/utils/enveomics/enveomics.R/R/recplot2.R CHANGED Viewed

@@ -666,15 +666,16 @@ enve.recplot2.findPeaks <- function(
 #' A vector of number of components to evaluate.
 #' @param criterion
 #' Criterion to use for components selection. Must be one of:
-#' \code{aic} (Akaike Information Criterion),
-#' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
+#' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
+#' (Bayesian Information Criterion or Schwarz Criterion).
 #' @param merge.tol
 #' When attempting to merge peaks with very similar sequencing depth, use
 #' this number of significant digits (in log-scale).
 #' @param verbose
 #' Display (mostly debugging) information.
 #' @param ...
-#' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
+#' Any additional parameters supported by
+#' \code{\link{enve.recplot2.findPeaks.em}}.
 #'
 #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
 #'
@@ -684,10 +685,10 @@ enve.recplot2.findPeaks <- function(
 enve.recplot2.findPeaks.emauto <- function(
   x,
-  components=seq(1,10),
-  criterion='aic',
-  merge.tol=2L,
-  verbose=FALSE,
+  components = seq(1, 5),
+  criterion = 'aic',
+  merge.tol = 2L,
+  verbose = FALSE,
   ...
 ){
   best <- list(crit=0, pstore=list())
@@ -758,19 +759,19 @@ enve.recplot2.findPeaks.emauto <- function(
 enve.recplot2.findPeaks.em <- function(
   x,
-  max.iter=1000,
-  ll.diff.res=1e-8,
-  components=2,
-  rm.top=0.05,
-  verbose=FALSE,
+  max.iter = 1000,
+  ll.diff.res = 1e-8,
+  components = 2,
+  rm.top = 0.05,
+  verbose = FALSE,
   init,
-  log=TRUE
+  log = TRUE
 ){
   # Essential vars
   pos.binsize  <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
   lsd1  <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
-  lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
+  lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
   if(log) lsd1 <- log(lsd1)
   # 1. Initialize
@@ -779,7 +780,7 @@ enve.recplot2.findPeaks.em <- function(
     init <- list(
       mu = tapply(lsd1, km.clust, mean),
       sd = tapply(lsd1, km.clust, sd),
-      alpha = table(km.clust)/length(km.clust)
+      alpha = table(km.clust) / length(km.clust)
     )
   }
   m.step <- init
@@ -795,6 +796,7 @@ enve.recplot2.findPeaks.em <- function(
     ll.diff <- abs(cur.ll - e.step[["ll"]])
     cur.ll <- e.step[["ll"]]
     if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
+    if(is.na(ll.diff) || ll.diff == Inf) break
     if(ll.diff <= ll.diff.res) break
   }
@@ -1431,6 +1433,9 @@ enve.recplot2.findPeaks.__em_e <- function
                                               theta[['sd']][i])*theta[['alpha']][i]))
   sum.of.components <- rowSums(product)
   posterior <- product / sum.of.components
+  for(i in which(sum.of.components == Inf)) {
+    cat(i,'/',nrow(product), ':', product[i,], '\n')
+  }
   return(list(ll=sum(log(sum.of.components)), posterior=posterior))
 }

data/utils/enveomics/enveomics.R/README.md CHANGED Viewed

@@ -52,6 +52,7 @@ For additional information on recruitment plots, see the
 [Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
 ## Changelog
+* 1.7.1: Improved efficiency of `enve.df2dist` about five-fold.
 * 1.7.0: Uniformized output for `enve.recplot2.extractWindows` and
   `enve.recplot2.coordinates` to ease automation. Thanks to Tomeu Viver and
   Roth Conrad for troubleshooting.

data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd CHANGED Viewed

@@ -4,7 +4,7 @@
 \alias{enve.recplot2.findPeaks.emauto}
 \title{Enveomics: Recruitment Plot (2) Emauto Peak Finder}
 \usage{
-enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
+enve.recplot2.findPeaks.emauto(x, components = seq(1, 5),
   criterion = "aic", merge.tol = 2L, verbose = FALSE, ...)
 }
 \arguments{
@@ -13,15 +13,16 @@ enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
 \item{components}{A vector of number of components to evaluate.}
 \item{criterion}{Criterion to use for components selection. Must be one of:
-\code{aic} (Akaike Information Criterion),
-\code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).}
+\code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
+(Bayesian Information Criterion or Schwarz Criterion).}
 \item{merge.tol}{When attempting to merge peaks with very similar sequencing depth, use
 this number of significant digits (in log-scale).}
 \item{verbose}{Display (mostly debugging) information.}
-\item{...}{Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.}
+\item{...}{Any additional parameters supported by
+\code{\link{enve.recplot2.findPeaks.em}}.}
 }
 \value{
 Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.

data/utils/find-medoid.R CHANGED Viewed

@@ -7,7 +7,12 @@
 #= Load stuff
 argv <- commandArgs(trailingOnly = T)
 suppressPackageStartupMessages(library(ape))
-suppressPackageStartupMessages(library(enveomics.R))
+if(Sys.getenv('MIGA') == ''){
+  suppressPackageStartupMessages(library(enveomics.R))
+}else{
+  source(file.path(Sys.getenv('MIGA'),
+    'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
+}
 find_medoids <- function(ani.df, out, clades) {
   if(nrow(ani.df) == 0) return(NULL)

data/utils/mytaxa_scan.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 #!/usr/bin/env ruby
+require 'zlib'
 abort "
 Usage:
 #{$0} {FastA file} {MyTaxa file} {Data output}
@@ -7,52 +9,53 @@ Usage:
 " if ARGV[2].nil?
 begin
-   # Get arguments
-   faa, mytaxa, outdata = ARGV
-   winsize = 10
-   # Extract gene IDs
-   ids = File.open(faa).grep(/^>/).map{|dl| dl.chomp.sub(/^>/,"").sub(/\s.*/,"")}
-   tax = Hash[ids.map{|k| [k, "NA"]}]
-   # Get MyTaxa distributions
-   k, l = nil
-   File.open(mytaxa).each do |ln|
-      ln.chomp!
-      if $.%2 == 1
-	 k, l = ln.split /\t/
-      else
-	 tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
-      end
-   end
-   all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
-   # Estimate Windows and save gene IDs
-   fh = File.open(outdata + ".genes", "w")
-   c = []
-   c << all_tax.map{|t| tax.values.count(t) }
-   n_wins = (ids.size/winsize).ceil
-   (0 .. (n_wins-1)).each do |win|
-      k = ids[win*winsize, winsize]
-      win_t = tax.values_at(*k)
-      fh.puts k.join("\t")
-      c << all_tax.map{|t| win_t.count(t)}
-   end
-   p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
-   fh.close
-   # Save window profiles
-   fh = File.open(outdata, "w")
-   fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
-   fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
-   (0 .. (all_tax.size - 1)).each do |row|
-      fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
-   end
-   fh.close
+  # Get arguments
+  faa, mytaxa, outdata = ARGV
+  winsize = 10
+  # Extract gene IDs
+  ifh = faa =~ /\.gz/ ? Zlib::GzipReader.open(faa) : File.open(faa, 'r')
+  ids = ifh.each_line.grep(/^>/).map{|dl| dl.chomp.sub(/^>/,'').sub(/\s.*/,'')}
+  ifh.close
+  tax = Hash[ids.map{|k| [k, "NA"]}]
+  # Get MyTaxa distributions
+  k, l = nil
+  File.open(mytaxa).each do |ln|
+    ln.chomp!
+    if $.%2 == 1
+      k, l = ln.split /\t/
+    else
+      tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
+    end
+  end
+  all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
+  # Estimate Windows and save gene IDs
+  fh = File.open(outdata + ".genes", "w")
+  c = []
+  c << all_tax.map{|t| tax.values.count(t) }
+  n_wins = (ids.size/winsize).ceil
+  (0 .. (n_wins-1)).each do |win|
+    k = ids[win*winsize, winsize]
+    win_t = tax.values_at(*k)
+    fh.puts k.join("\t")
+    c << all_tax.map{|t| win_t.count(t)}
+  end
+  p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
+  fh.close
+  # Save window profiles
+  fh = File.open(outdata, "w")
+  fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
+  fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
+  (0 .. (all_tax.size - 1)).each do |row|
+    fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
+  end
+  fh.close
 rescue => err
-   $stderr.puts "Exception: #{err}\n\n"
-   err.backtrace.each { |l| $stderr.puts l + "\n" }
-   err
+  $stderr.puts "Exception: #{err}\n\n"
+  err.backtrace.each { |l| $stderr.puts l + "\n" }
+  err
 end

data/utils/ref-tree.R CHANGED Viewed

@@ -7,7 +7,12 @@
 #= Load stuff
 argv <- commandArgs(trailingOnly=T)
 suppressPackageStartupMessages(library(ape))
-suppressPackageStartupMessages(library(enveomics.R))
+if(Sys.getenv('MIGA') == ''){
+  suppressPackageStartupMessages(library(enveomics.R))
+}else{
+  source(file.path(Sys.getenv('MIGA'),
+    'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
+}
 inst <- c("phangorn", "phytools") %in% rownames(installed.packages())
 if(inst[1]){
   suppressPackageStartupMessages(library(phangorn))

data/utils/subclades-nj.R CHANGED Viewed

@@ -12,7 +12,12 @@ suppressPackageStartupMessages(library(cluster))
 suppressPackageStartupMessages(library(phytools))
 suppressPackageStartupMessages(library(phangorn))
 suppressPackageStartupMessages(library(parallel))
-suppressPackageStartupMessages(library(enveomics.R))
+if(Sys.getenv('MIGA') == ''){
+  suppressPackageStartupMessages(library(enveomics.R))
+}else{
+  source(file.path(Sys.getenv('MIGA'),
+    'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
+}
 #= Main function
 subclades <- function(ani_file, out_base, thr=1, ani=c()) {

data/utils/subclades.R CHANGED Viewed

@@ -10,7 +10,12 @@ suppressPackageStartupMessages(library(ape))
 suppressPackageStartupMessages(library(vegan))
 suppressPackageStartupMessages(library(cluster))
 suppressPackageStartupMessages(library(parallel))
-suppressPackageStartupMessages(library(enveomics.R))
+if(Sys.getenv('MIGA') == ''){
+  suppressPackageStartupMessages(library(enveomics.R))
+}else{
+  source(file.path(Sys.getenv('MIGA'),
+    'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
+}
 #= Main function
 subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 0.5.0.0
+  version: 0.5.1.0
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-11-25 00:00:00.000000000 Z
+date: 2020-01-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons
@@ -197,7 +197,6 @@ files:
 - test/taxonomy_test.rb
 - test/test_helper.rb
 - utils/adapters.fa
-- utils/arch-ess-genes.rb
 - utils/cleanup-databases.rb
 - utils/core-pan-plot.R
 - utils/distance/base.rb
@@ -207,6 +206,7 @@ files:
 - utils/distance/runner.rb
 - utils/distance/temporal.rb
 - utils/distances.rb
+- utils/domain-ess-genes.rb
 - utils/enveomics/Docs/recplot2.md
 - utils/enveomics/Examples/aai-matrix.bash
 - utils/enveomics/Examples/ani-matrix.bash
@@ -356,7 +356,8 @@ files:
 - utils/enveomics/Scripts/clust.rand.rb
 - utils/enveomics/Scripts/gi2tax.rb
 - utils/enveomics/Scripts/in_silico_GA_GI.pl
-- utils/enveomics/Scripts/lib/data/essential.hmm.gz
+- utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz
+- utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz
 - utils/enveomics/Scripts/lib/enveomics.R
 - utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
 - utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
@@ -514,8 +515,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: MiGA

data/utils/arch-ess-genes.rb DELETED Viewed

@@ -1,57 +0,0 @@
-#!/usr/bin/env ruby
-esslog = ARGV.shift
-outlog = ARGV.shift
-l_all = `HMM.essential.rb -l -q`.chomp.split("\n").map{ |i| i.gsub(/\t.*/,"") }
-n_arc = Hash[
-  `HMM.essential.rb -l -q -A`.chomp.split("\n").map{ |i| i.split("\t") }
-]
-l_arc = n_arc.keys
-def quality(hsh)
-  q = {}
-  q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
-  q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
-  q[:cmp] = 100.0*q[:found].to_f/hsh.size
-  q[:cnt] = 100.0*q[:multi].to_f/hsh.size
-  q
-end
-cnt_ref = {}
-l_all.each{ |i| cnt_ref[i] = 1 }
-at = :header
-File.open(esslog, "r") do |fh|
-  fh.each_line do |ln|
-    v = ln.chomp.gsub(/^! +/, "")
-    if v=="Multiple copies: "
-      at = :multi
-    elsif v=="Missing genes: "
-      at = :missing
-    elsif at==:multi
-      v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
-      cnt_ref[$2] = $1.to_i
-    elsif at==:missing
-      v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
-      cnt_ref[$1] = 0
-    end
-  end
-end
-cnt_arc = {}
-l_arc.each{ |i| cnt_arc[i] = cnt_ref[i] }
-q = quality(cnt_arc)
-File.open(outlog, "w") do |ofh|
-  ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_arc.size}."
-  ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
-  ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
-  if q[:multi] > 0
-    ofh.puts "! Multiple copies: "
-    cnt_arc.each{ |k,v| ofh.puts "!   #{v} #{k}: #{n_arc[k]}." if v>1 }
-  end
-  if q[:found] < cnt_arc.size
-    ofh.puts "! Missing genes: "
-    cnt_arc.each{ |k,v| ofh.puts "!   #{k}: #{n_arc[k]}." if v==0 }
-  end
-end