RubyGems - miga-base - Versions diffs - 0.3.11.2 → 0.3.12.0 - Mend

miga-base 0.3.11.2 → 0.3.12.0

Files changed (12) hide show

checksums.yaml +4 -4
data/lib/miga/dataset/result.rb +2 -2
data/lib/miga/version.rb +2 -2
data/scripts/essential_genes.bash +14 -14
data/test/daemon_test.rb +1 -0
data/utils/enveomics/Manifest/Tasks/other.json +13 -0
data/utils/enveomics/Manifest/Tasks/sequence-identity.json +33 -1
data/utils/enveomics/Manifest/categories.json +2 -0
data/utils/enveomics/Manifest/examples.json +4 -4
data/utils/enveomics/Scripts/HMM.essential.rb +235 -205
data/utils/enveomics/Scripts/HMM.haai.rb +159 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 71f390ca4ceb03f4d0dfeac23f55939aaf91d135bbd22129102468d8e25095e0
-  data.tar.gz: e139fb696e76345da577f4e8fc4cd4b0bf11efaee23423e5693b5f46b1ee06b6
+  metadata.gz: 48d903a383d237f7b236d8ad1706a5fb017b31d320768353a1bc33846ea0d471
+  data.tar.gz: 9b448f00992aa4152df34ded6a48105afe5c1daf1e994737f253b519f81c998f
 SHA512:
-  metadata.gz: 3cc2ec5d43cc613ab69debed6e623d6793190dae0e6e12954af5d1edf58ced8c2b87fdaa55b9a47b428ed3ed833cf19418875c40b1b8efc9ea84167c94b31fd9
-  data.tar.gz: 5ec76e8f90c73b274b2d4b67ed3b09bd085126940e2c7a5d0941a7be390a0c79c739956690e31902a70a7b1b148f05d08dc665ec555c5c079dd0ba2655736b98
+  metadata.gz: 1579eecdab3c38bda21678baa4c903c85dfbb07e19f993212d89f6b62e1561d4044b41cfc98d80e43780d1fd172ef607a92d5320ff570de87dafda03520f2d59
+  data.tar.gz: d083fc8ae10735f647681d3924c8990951fc2a6ba7d501c10b89d32b0f00088a5128dfca2b24df118b2f8a56e3a5853d210933e300d7ab7bd3bd14c4bab90b8a

data/lib/miga/dataset/result.rb CHANGED Viewed

@@ -226,8 +226,8 @@ module MiGA::Dataset::Result
     def add_result_essential_genes(base, _opts)
       return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
       r = MiGA::Result.new("#{base}.json")
-      add_files_to_ds_result(r, name, ess_genes: ".ess.faa",
-        collection: ".ess", report: ".ess/log")
+      add_files_to_ds_result(r, name, ess_genes: '.ess.faa',
+        collection: '.ess', report: '.ess/log', alignments: '.ess/proteins.aln')
     end
     ##

data/lib/miga/version.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.3, 11, 2]
+  VERSION = [0.3, 12, 0]
   ##
   # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.new(2019, 04, 20)
+  VERSION_DATE = Date.new(2019, 04, 26)
   ##
   # Reference of MiGA.

data/scripts/essential_genes.bash CHANGED Viewed

@@ -7,31 +7,31 @@ SCRIPT="essential_genes"
 cd "$PROJECT/data/07.annotation/01.function/01.essential"
 # Initialize
-miga date > "$DATASET.start"
-FAA="../../../06.cds/$DATASET.faa"
+miga date > "${DATASET}.start"
+FAA="../../../06.cds/${DATASET}.faa"
 # Check if there are any proteins
 if [[ ! -s $FAA ]] ; then
   echo Empty protein set, bypassing essential genes
-  rm "$DATASET.start"
-  miga create_dataset -P "$PROJECT" -D "$DATASET" \
-    -m run_essential_genes=false --update
+  rm "${DATASET}.start"
+  miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
   exit 0
 fi
 # Find and extract essential genes
-[[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
-mkdir "$DATASET.ess"
+[[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
+mkdir "${DATASET}.ess"
 TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
   --metadata "type" | awk '{print $2}')
 if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
-  HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
-    -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
-    > "$DATASET.ess/log"
+  HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
+    -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
+    > "${DATASET}.ess/log"
 else
-  HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
-    -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
-    > "$DATASET.ess/log"
+  HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
+    -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
+    --alignments "${DATASET}.ess/proteins.aln" \
+    > "${DATASET}.ess/log"
 fi
 # Reduce files
@@ -42,5 +42,5 @@ if exists "$DATASET".ess/*.faa ; then
 fi
 # Finalize
-miga date > "$DATASET.done"
+miga date > "${DATASET}.done"
 miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f

data/test/daemon_test.rb CHANGED Viewed

@@ -37,6 +37,7 @@ class DaemonTest < Test::Unit::TestCase
       File.expand_path("data/01.raw_reads/ds1.1.fastq", p.path))
     FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
       File.expand_path("data/01.raw_reads/ds1.done", p.path))
+    ds.first_preprocessing(true)
     out = capture_stdout do
       d.check_datasets
     end

data/utils/enveomics/Manifest/Tasks/other.json CHANGED Viewed

@@ -401,6 +401,19 @@
           "description": ["Path to the report file. By default, the report is",
             "sent to the STDOUT."]
         },
+        {
+          "name": "HMMsearch output",
+          "opt": "--hmm-out",
+          "arg": "out_file",
+          "description": ["Save HMMsearch output in this file. By default,",
+            "not saved."]
+        },
+        {
+          "opt": "--alignments",
+          "arg": "out_file",
+          "description": ["Save the aligned proteins in this file. By default,",
+            "not saved."]
+        },
         {
           "opt": "--bacteria",
           "description": "If set, ignores models typically missing in Bacteria."

data/utils/enveomics/Manifest/Tasks/sequence-identity.json CHANGED Viewed

@@ -189,7 +189,7 @@
       "description": ["Calculates the Average Nucleotide Identity between two",
         "genomes."],
       "help_arg": "--help",
-      "see_also": ["aai.rb","rbm.rb"],
+      "see_also": ["aai.rb","rbm.rb","HMM.essential.rb"],
       "cite": [
         ["Konstantinidis & Tiedje, 2005, PNAS",
           "http://dx.doi.org/10.1073%2Fpnas.0409727102"],
@@ -362,6 +362,38 @@
         }
       ]
     },
+    {
+      "task": "HMM.haai.rb",
+      "description": ["Estimates Average Amino Acid Identity (AAI) from the",
+        "essential genes extracted and aligned by HMM.essential.rb (see",
+        "Alignments)."],
+      "help_arg": "--help",
+      "see_also": ["HMM.essential.rb","aai.rb"],
+      "options": [
+        {
+          "name": "Alignments 1",
+          "opt": "-1",
+          "arg": "in_file",
+          "description": "Input alignments file for genome 1."
+        },
+        {
+          "name": "Alignments 2",
+          "opt": "-2",
+          "arg": "in_file",
+          "description": "Input alignments file for genome 2."
+        },
+        {
+          "name": "Alignment output",
+          "opt": "--aln-out",
+          "arg": "out_file",
+          "description": "Output file containing the aligned proteins."
+        },
+        {
+          "opt": "--quiet",
+          "description": "Run quietly (no STDERR output)."
+        }
+      ]
+    },
     {
       "task": "rbm.rb",
       "description": ["Finds the reciprocal best matches between two sets of",

data/utils/enveomics/Manifest/categories.json CHANGED Viewed

@@ -29,6 +29,7 @@
       "Execution": [
         "aai.rb",
         "ani.rb",
+        "HMM.haai.rb",
         "rbm.rb"
       ]
     },
@@ -101,6 +102,7 @@
       ],
       "Search": [
         "HMM.essential.rb",
+        "HMM.haai.rb",
         "HMMsearch.extractIds.rb",
         "ogs.annotate.rb",
         "ogs.core-pan.rb",

data/utils/enveomics/Manifest/examples.json CHANGED Viewed

@@ -64,15 +64,15 @@
       "task": "HMM.essential.rb",
       "description": ["Typical single-copy bacterial genes present in",
         "Mycoplasma genitalium."],
-      "values": ["Mgen_M2288.faa",null,null,null,true,null,null,null,null,null,
-        null,null,null,null,null,null]
+      "values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
+        null,null,null,null,null,null,null,null]
     },
     {
       "task": "HMM.essential.rb",
       "description": ["Typical single-copy archaeal genes present in",
         "Nanoarchaeum equitans."],
-      "values": ["Mgen_M2288.faa",null,null,null,null,true,null,null,null,null,
-        null,null,null,null,null,null]
+      "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
+        null,null,null,null,null,null,null,null]
     },
     {
       "task": "Newick.autoprune.R",

data/utils/enveomics/Scripts/HMM.essential.rb CHANGED Viewed

@@ -1,20 +1,17 @@
 #!/usr/bin/env ruby
-#
 # @author  Luis M. Rodriguez-R
 # @license artistic license 2.0
-# @update  Mar-23-2016
-#
-$:.push File.expand_path("../lib", __FILE__)
-require "enveomics_rb/enveomics"
-use "tmpdir"
-use "zlib"
+$:.push File.expand_path('../lib', __FILE__)
+require 'enveomics_rb/enveomics'
+use 'tmpdir'
+use 'zlib'
-o = {bin:"", thr:2, q:false, stats:true, genes:true, bacteria:false,
-   archaea:false, genomeeq:false, metagenome:false, list:false}
+o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
+  archaea: false, genomeeq: false, metagenome: false, list: false}
 OptionParser.new do |opts|
-   opts.banner = "
+  opts.banner = "
 Finds and extracts a collection of essential proteins suitable for genome
 completeness evaluation and phylogenetic analyses. Important note: most complete
 bacterial genomes contain only 106/111 genes in this collection, therefore
@@ -27,68 +24,74 @@ completeness (e.g., Nanoarchaeum equitans returns 88.5%).
 Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
 Usage: #{$0} [options]"
-   opts.separator ""
-   opts.separator "Mandatory"
-   opts.on("-i", "--in FILE",
-      "Path to the FastA file containing all the proteins in a genome."
-      ){ |v| o[:in] = v }
-   opts.separator ""
-   opts.separator "Report Options"
-   opts.on("-o", "--out FILE",
-      "Path to the output FastA file with the translated essential genes.",
-      "By default the file is not produced."){ |v| o[:out] = v }
-   opts.on("-m", "--per-model STR",
-      "Prefix of translated genes in independent files with the name of the",
-      "model appended. By default files are not produced."
-      ){ |v| o[:permodel] = v }
-   opts.on("-R", "--report FILE",
-      "Path to the report file. By default, the report is sent to the STDOUT."
-      ){ |v| o[:report] = v }
-   opts.on("-B", "--bacteria",
-      "If set, ignores models typically missing in Bacteria."
-      ){ |v| o[:bacteria] = v }
-   opts.on("-A", "--archaea",
-      "If set, ignores models typically missing in Archaea."
-      ){ |v| o[:archaea] = v }
-   opts.on("-G", "--genome-eq",
-      "If set, ignores models not suitable for genome-equivalents estimations.",
-      "See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940."
-      ){ |v| o[:genomeeq] = v }
-   opts.on("-r", "--rename STR",
-      "If set, renames the sequences with the string provided and appends it",
-      "with pipe and the gene name (except in --per-model files)."
-      ){ |v| o[:rename]=v }
-   opts.on("-n", "--no-stats",
-      "If set, no statistics are reported on genome evaluation."
-      ){ |v| o[:stats] = v }
-   opts.on("-s", "--no-genes",
-      "If set, statistics won't include the lists of missing/multi-copy genes."
-      ){ |v| o[:genes] = v }
-   opts.on("-M", "--metagenome",
-      "If set, it allows for multiple copies of each gene and turns on",
-      "metagenomic report mode."){ |v| o[:metagenome] = v }
-   opts.separator ""
-   opts.separator "Other Options"
-   opts.on("-L", "--list-models",
-      "If set, it only lists the models and exits. Compatible with -A, -B, -G,",
-      "and -q; ignores all other parameters."){ |v| o[:list] = v }
-   opts.on("-b", "--bin DIR",
-      "Path to the directory containing the binaries of HMMer 3.0+."
-      ){ |v| o[:bin] = v }
-   opts.on("--model-file",
-      "External file containing models to search."){ |v| o[:model_file] = v }
-   opts.on("-t", "--threads INT",
-      "Number of parallel threads to be used.  By default: #{o[:thr]}."
-      ){ |v| o[:thr] = v.to_i }
-   opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
-   opts.on("-h", "--help", "Display this screen.") do
-      puts opts
-      exit
-   end
-   opts.separator ""
+  opts.separator ''
+  opts.separator 'Mandatory'
+  opts.on('-i', '--in FILE',
+    'Path to the FastA file containing all the proteins in a genome.'
+    ){ |v| o[:in] = v }
+  opts.separator ''
+  opts.separator 'Report Options'
+  opts.on('-o', '--out FILE',
+    'Path to the output FastA file with the translated essential genes.',
+    'By default the file is not produced.'){ |v| o[:out] = v }
+  opts.on('-m', '--per-model STR',
+    'Prefix of translated genes in independent files with the name of the',
+    'model appended. By default files are not produced.'
+    ){ |v| o[:permodel] = v }
+  opts.on('-R', '--report FILE',
+    'Path to the report file. By default, the report is sent to the STDOUT.'
+    ){ |v| o[:report] = v }
+  opts.on('--hmm-out FILE',
+    'Save HMMsearch output in this file. By default, not saved.'
+    ){ |v| o[:hmmout] = v }
+  opts.on('--alignments FILE',
+    'Save the aligned proteins in this file. By default, not saved'
+    ){ |v| o[:alignments] = v }
+  opts.on('-B', '--bacteria',
+    'If set, ignores models typically missing in Bacteria.'
+    ){ |v| o[:bacteria] = v }
+  opts.on('-A', '--archaea',
+    'If set, ignores models typically missing in Archaea.'
+    ){ |v| o[:archaea] = v }
+  opts.on('-G', '--genome-eq',
+    'If set, ignores models not suitable for genome-equivalents estimations.',
+    'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
+    ){ |v| o[:genomeeq] = v }
+  opts.on('-r', '--rename STR',
+    'If set, renames the sequences with the string provided and appends it',
+    'with pipe and the gene name (except in --per-model files).'
+    ){ |v| o[:rename]=v }
+  opts.on('-n', '--no-stats',
+    'If set, no statistics are reported on genome evaluation.'
+    ){ |v| o[:stats] = v }
+  opts.on('-s', '--no-genes',
+    'If set, statistics won\'t include the lists of missing/multi-copy genes.'
+    ){ |v| o[:genes] = v }
+  opts.on('-M', '--metagenome',
+    'If set, it allows for multiple copies of each gene and turns on',
+    'metagenomic report mode.'){ |v| o[:metagenome] = v }
+  opts.separator ''
+  opts.separator 'Other Options'
+  opts.on('-L', '--list-models',
+    'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
+    'and -q; ignores all other parameters.'){ |v| o[:list] = v }
+  opts.on('-b', '--bin DIR',
+    'Path to the directory containing the binaries of HMMer 3.0+.'
+    ){ |v| o[:bin] = v }
+  opts.on('--model-file',
+    'External file containing models to search.'){ |v| o[:model_file] = v }
+  opts.on('-t', '--threads INT',
+    "Number of parallel threads to be used.  By default: #{o[:thr]}."
+    ){ |v| o[:thr] = v.to_i }
+  opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
+  opts.on('-h', '--help', 'Display this screen.') do
+    puts opts
+    exit
+  end
+  opts.separator ''
 end.parse!
-abort "-i is mandatory" if o[:in].nil? and not o[:list]
-o[:bin] = o[:bin]+"/" if o[:bin].size > 0
+abort '-i is mandatory' if o[:in].nil? and not o[:list]
+o[:bin] = o[:bin] + '/' if o[:bin].size > 0
 o[:rename] = nil if o[:metagenome]
 not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
@@ -107,148 +110,175 @@ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
 TIGR00389 TIGR00436 tRNA-synth_1d}
 begin
-   Dir.mktmpdir do |dir|
-      $stderr.puts "Temporal directory: #{dir}." unless o[:q]
+  Dir.mktmpdir do |dir|
+    $stderr.puts "Temporal directory: #{dir}." unless o[:q]
+    # Create database.
+    $stderr.puts 'Searching models.' unless o[:q]
+    models = {}
+    model_id = nil
+    dbh = File.open("#{dir}/essential.hmm", 'w')
+    o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
+    mfh = (File.extname(o[:model_file]) == '.gz') ?
+      Zlib::GzipReader.open(o[:model_file]) :
+      File.open(o[:model_file], 'r')
+    while ln = mfh.gets
+      dbh.print ln
+      ln.chomp!
+      model_id = $1 if ln =~ /^NAME\s+(.+)/
+      models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
+    end
+    dbh.close
+    mfh.close
+    models.delete_if { |m| not_in_archaea.include? m  } if o[:archaea]
+    models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
+    models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
+    if o[:list]
+      models.each_pair{ |id,desc| puts [id,desc].join("\t") }
+      exit
+    end
+    # Check HMMer version and run HMMsearch.
+    if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
+      raise 'You have provided an unsupported version of HMMER. ' +
+        'This script requires HMMER 3.0+.'
+    end
+    o[:hmmout] ||= "#{dir}/hmmsearch"
+    `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
+      -A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
+      > #{dir}/hmmsearch.log`
-      # Create database.
-      $stderr.puts "Searching models." unless o[:q]
-      models = {}
-      model_id = nil
-      dbh = File.open("#{dir}/essential.hmm", "w")
-      o[:model_file] ||= File.expand_path("../lib/data/essential.hmm.gz",
-					     __FILE__)
-      mfh = (File.extname(o[:model_file])==".gz") ?
-	 Zlib::GzipReader.open(o[:model_file]) :
-	 File.open(o[:model_file],"r")
-      while ln = mfh.gets
-	 dbh.print ln
-	 ln.chomp!
-	 model_id = $1 if ln =~ /^NAME\s+(.+)/
-	 models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
+    # Parse output
+    $stderr.puts 'Parsing results.' unless o[:q]
+    trash = []
+    genes = {}
+    File.open(o[:hmmout], 'r') do |resh|
+      while ln = resh.gets
+         next if ln =~ /^#/
+         r = ln.split /\s+/
+         next unless models.include? r[2]
+         if o[:metagenome]
+           genes[ r[2] ] = [] if genes[ r[2] ].nil?
+           genes[ r[2] ] << r[0]
+         elsif genes[ r[2] ].nil?
+           genes[ r[2] ] = r[0]
+         else
+           trash << r[2]
+         end
       end
-      dbh.close
-      mfh.close
-      models.delete_if { |m| not_in_archaea.include? m  } if o[:archaea]
-      models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
-      models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
-      if o[:list]
-	 models.each_pair{ |id,desc| puts [id,desc].join("\t") }
-	 exit
+    end
+    # Report statistics
+    if o[:stats]
+      reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
+      if o[:metagenome]
+        reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
+        gc = [0] * (models.size - genes.size) +
+          genes.values.map{ |g| g.length }.sort
+        reph.printf "! Mean number of copies per model: %.3f.\n",
+          gc.inject(:+).to_f / models.size
+        reph.printf "! Median number of copies per model: %.1f.\n",
+          gc.size.even? ? gc[gc.size/2, 2].inject(:+).to_f / 2 : gc[gc.size/2]
+        if o[:genes] and genes.size != models.size
+          reph.printf "! Missing genes: %s\n",
+            ([''] + models.keys.select{ |m| not genes.keys.include? m }.
+                  map{|m| "#{m}: #{models[m]}."}).join("\n!   ")
+        end
+      else
+        reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
+        reph.printf "! Completeness: %.1f%%.\n",
+          100.0 * genes.size / models.size
+        reph.printf "! Contamination: %.1f%%.\n",
+          100.0 * trash.size / models.size
+        if o[:genes]
+          reph.printf "! Multiple copies: %s\n",
+            ([''] + trash.uniq.
+                  map{ |m| "#{trash.count(m)+1} #{m}: #{models[m]}." }).
+                  join("\n!   ") unless trash.empty?
+          reph.printf "! Missing genes: %s\n",
+            ([''] + models.keys.select{ |m| not genes.keys.include? m }.
+                  map{ |m| "#{m}: #{models[m]}." }).
+                  join("\n!   ") unless genes.size == models.size
+        end
       end
-      # Check HMMer version and run HMMsearch.
-      if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
-	 raise "You have provided an unsupported version of HMMER. " +
-	    "This script requires HMMER 3.0+."
+      reph.close unless o[:report].nil?
+    end
+    # Extract sequences
+    unless o[:out].nil? and o[:permodel].nil?
+      $stderr.puts 'Extracting sequences.' unless o[:q]
+      faah = File.open(o[:in], 'r')
+      outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
+      geneh = nil
+      in_gene = nil
+      unless o[:permodel].nil?
+        genes.keys.each do |m|
+          File.open("#{o[:permodel]}#{m}.faa", 'w').close
+        end
       end
-      `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{dir}/hmmsearch" \
-	 --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
-	 > #{dir}/hmmsearch.log`
-      # Parse output
-      $stderr.puts "Parsing results." unless o[:q]
-      resh = File.open("#{dir}/hmmsearch","r")
-      trash = []
-      genes = {}
-      while ln = resh.gets
-	 next if ln =~ /^#/
-	 r = ln.split /\s+/
-	 next unless models.include? r[2]
-	 if o[:metagenome]
-	    genes[ r[2] ] = [] if genes[ r[2] ].nil?
-	    genes[ r[2] ] << r[0]
-	 elsif genes[ r[2] ].nil?
-	    genes[ r[2] ] = r[0]
-	 else
-	    trash << r[2]
-	 end
+      while ln = faah.gets
+        if ln =~ /^>(\S+)/
+          if o[:metagenome]
+            in_gene = genes.keys.
+              map{ |k| genes[k].include?($1) ? k : nil }.compact.first
+            in_gene = [in_gene, $1] unless in_gene.nil?
+          else
+            in_gene = genes.rassoc($1)
+          end
+          next if in_gene.nil?
+          geneh.close unless geneh.nil?
+          geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa", 'a+') unless
+            o[:permodel].nil?
+          outh.print(o[:rename].nil? ?
+            ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
+          geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless
+            geneh.nil?
+        else
+          next if in_gene.nil?
+          outh.print ln unless outh.nil?
+          geneh.print ln unless geneh.nil?
+        end
       end
+      geneh.close unless geneh.nil?
+      outh.close unless outh.nil?
+      faah.close
+    end
-      # Report statistics
-      if o[:stats]
-	 reph = o[:report].nil? ? $stdout : File.open(o[:report], "w")
-	 if o[:metagenome]
-	    reph.printf "! Essential genes found: %d/%d.\n",
-	       genes.size, models.size
-	    gc = [0]*(models.size - genes.size) +
-	       genes.values.map{|g| g.length}.sort
-	    reph.printf "! Mean number of copies per model: %.3f.\n",
-	       gc.inject(:+).to_f/models.size
-	    reph.printf "! Median number of copies per model: %.1f.\n",
-	       gc.size.even? ? gc[gc.size/2,2].inject(:+).to_f/2 : gc[gc.size/2]
-	    if o[:genes] and genes.size != models.size
-	       reph.printf "! Missing genes: %s\n",
-		  ([""] +
-		     models.keys.select{|m| not genes.keys.include? m
-		     }.map{|m| "#{m}: #{models[m]}."}).join("\n!   ")
-	    end
-	 else
-	    reph.printf "! Essential genes found: %d/%d.\n",
-	       genes.size, models.size
-	    reph.printf "! Completeness: %.1f%%.\n",
-	       100.0*genes.size/models.size
-	    reph.printf "! Contamination: %.1f%%.\n",
-	       100.0*trash.size/models.size
-	    if o[:genes]
-	       reph.printf "! Multiple copies: %s\n",
-		  ([""] +
-		     trash.uniq.map{|m|
-		     "#{trash.count(m)+1} #{m}: #{models[m]}."}
-		  ).join("\n!   ") unless trash.empty?
-	       reph.printf "! Missing genes: %s\n",
-		  ([""] +
-		     models.keys.select{|m| not genes.keys.include? m
-		     }.map{|m| "#{m}: #{models[m]}."}
-		  ).join("\n!   ") unless genes.size==models.size
-	    end
-	 end
-	 reph.close unless o[:report].nil?
+    unless o[:alignments].nil?
+      aln = {}
+      File.open("#{dir}/a.sto", 'r') do |fh|
+        cur_model = nil
+        mask = []
+        fh.each_line do |ln|
+          case ln.chomp
+          when /^# STOCKHOLM/
+            cur_model = nil
+            mask = []
+          when /^#=GS (\S+)\/([\d\-]+)\s+DE/
+            cur_model ||= genes.rassoc($1).first
+            aln[ cur_model ] ||= [ "# #{cur_model} : #{$1} : #{$2}" ]
+          when /^#=GC RF\s+(\S+)/
+            aln[ cur_model ][ 1 ] ||= $1.upcase.tap do |i|
+              mask.each{ |d| i[d] = '' }
+            end
+          when /^[^#]\S*\s+(\S+)/
+            next if aln[ cur_model ][ 2 ]
+            aln[ cur_model ][ 2 ] = $1.upcase
+            mask = aln[ cur_model ][ 2 ].split('').each_with_index.
+                map{ |v, k| v == '.' ? k : nil }.compact.reverse
+            aln[ cur_model ][ 2 ].delete!('.') unless mask.empty?
+          end
+        end
       end
-      # Extract sequences
-      unless o[:out].nil? and o[:permodel].nil?
-	 $stderr.puts "Extracting sequences." unless o[:q]
-	 faah = File.open(o[:in], "r")
-	 outh = o[:out].nil? ? nil : File.open(o[:out], "w")
-	 geneh = nil
-	 in_gene = nil
-	 unless o[:permodel].nil?
-	    genes.keys.each do |m|
-	       File.open("#{o[:permodel]}#{m}.faa", "w").close
-	    end
-	 end
-	 while ln = faah.gets
-	    if ln =~ /^>(\S+)/
-	       if o[:metagenome]
-		  in_gene = genes.keys.map{|k| genes[k].include?($1) ? k : nil
-		     }.compact.first
-		  in_gene = [in_gene, $1] unless in_gene.nil?
-	       else
-		  in_gene = genes.rassoc($1)
-	       end
-	       next if in_gene.nil?
-	       geneh.close unless geneh.nil?
-	       geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa",
-		  "a+") unless o[:permodel].nil?
-	       outh.print(o[:rename].nil? ?
-		  ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
-	       geneh.print(
-		  o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless geneh.nil?
-	    else
-	       next if in_gene.nil?
-	       outh.print ln unless outh.nil?
-	       geneh.print ln unless geneh.nil?
-	    end
-	 end
-	 geneh.close unless geneh.nil?
-	 outh.close unless outh.nil?
-	 faah.close
+      File.open(o[:alignments], 'w') do |fh|
+        aln.each { |k, v| v.each{ |i| fh.puts i } }
       end
+    end
-      $stderr.puts "Done." unless o[:q]
-   end # |dir|
+    $stderr.puts 'Done.' unless o[:q]
+  end # |dir|
 rescue => err
-   $stderr.puts "Exception: #{err}\n\n"
-   err.backtrace.each { |l| $stderr.puts l + "\n" }
-   err
+  $stderr.puts "Exception: #{err}\n\n"
+  err.backtrace.each { |l| $stderr.puts l + "\n" }
+  err
 end

data/utils/enveomics/Scripts/HMM.haai.rb ADDED Viewed

@@ -0,0 +1,159 @@
+#!/usr/bin/env ruby
+# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
+# @license Artistic-2.0
+require 'optparse'
+# Command-line options collected by the parser below; :q starts as false.
+o = {q: false}
+# When called without any argument, act as if -h had been given.
+ARGV << '-h' if ARGV.size==0
+# Each option handler stores its value in +o+: -1 => :a, -2 => :b,
+# -a/--aln-out => :alnout, -q/--quiet => :q. -h prints the usage and exits.
+OptionParser.new do |opt|
+  opt.banner = "
+Estimates Average Amino Acid Identity (AAI) from the essential genes extracted
+and aligned by HMM.essential.rb (see --alignments).
+Usage: #{$0} [options]"
+  opt.separator ''
+  opt.separator 'Mandatory'
+  opt.on('-1 PATH', 'Input alignments file for genome 1.'){ |v| o[:a] = v }
+  opt.on('-2 PATH', 'Input alignments file for genome 2.'){ |v| o[:b] = v }
+  opt.separator ''
+  opt.separator 'Options'
+  opt.on('-a', '--aln-out FILE',
+    'Output file containing the aligned proteins'){ |v| o[:alnout] = v }
+  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
+  opt.on('-h', '--help', 'Display this screen.') do
+    puts opt
+    exit
+  end
+  opt.separator ''
+end.parse!
+# Both input files are required; abort with a message if either is missing.
+abort '-1 is mandatory.' if o[:a].nil?
+abort '-2 is mandatory.' if o[:b].nil?
+##
+# Set of HElement records loaded from one alignments file and indexed by
+# their model ID.
+class HList
+  # Hash of model ID => HElement.
+  attr_accessor :list
+  ##
+  # Reads all lines of +file+ and consumes them three at a time, building one
+  # HElement per group. Each element is stored under its +model_id+, so a
+  # later record with the same model ID replaces an earlier one. A trailing
+  # group with fewer than three lines passes fewer arguments to HElement.new.
+  def initialize(file)
+    @list = {}
+    r = File.readlines(file)
+    while not r.empty?
+      e = HElement.new(*r.shift(3))
+      @list[ e.model_id ] = e
+    end
+  end
+  ##
+  # Returns the HElement stored for +model_id+, or nil if there is none.
+  def [](model_id)
+    list[model_id]
+  end
+  ##
+  # Returns an array of HAln objects: one per model ID of this list that is
+  # also present in +other+ (models missing from +other+ are dropped).
+  def align(other)
+    list.keys.map do |model_id|
+      self[model_id].align(other[model_id]) unless other[model_id].nil?
+    end.compact
+  end
+  ##
+  # Returns the model IDs stored in this list.
+  def models
+    list.keys
+  end
+end
+##
+# One three-line record of an alignments file: a definition line, a model
+# alignment string and a protein alignment string.
+class HElement
+  attr_accessor :defline, :model_id, :protein_id, :protein_coords
+  attr_accessor :model_aln, :protein_aln
+  ##
+  # Stores chomped copies of the three lines. When +defline+ has the form
+  # "# MODEL : PROTEIN : COORDS" the three fields are saved as +model_id+,
+  # +protein_id+ and +protein_coords+; otherwise they remain nil.
+  def initialize(defline, model_aln, protein_aln)
+    @defline = defline.chomp
+    @model_aln = model_aln.chomp
+    @protein_aln = protein_aln.chomp
+    if defline =~ /^# (.+) : (.+) : (.+)/
+      @model_id = $1
+      @protein_id = $2
+      @protein_coords = $3
+    end
+  end
+  ##
+  # Returns a new HElement built from the current defline and alignment
+  # strings, so that mask! on the copy does not alter this object's strings.
+  def dup
+    HElement.new(defline, model_aln, protein_aln)
+  end
+  ##
+  # Returns an HAln object
+  def align(other)
+    HAln.new(self, other)
+  end
+  ##
+  # Returns the indexes of the '.' characters in +model_aln+, highest index
+  # first. The result is memoized and is not refreshed after mask!.
+  # NOTE(review): '.' presumably marks insert columns relative to the model
+  # -- confirm against the writer of the alignments file.
+  def mask
+    @mask ||= model_aln.chars.
+      each_with_index.map{ |v, k| v == '.' ? k : nil }.
+      compact.reverse
+  end
+  ##
+  # For every index of +template+ that is not in this element's own mask,
+  # places a '-' right before the character at that index in both
+  # +model_aln+ and +protein_aln+ (modifying them in place). Indexes are
+  # processed in the order given by +template+; each insertion shifts the
+  # characters that follow by one.
+  def mask!(template)
+    (template - mask).each do |d|
+      @model_aln[d]   = '-' + @model_aln[d]
+      @protein_aln[d] = '-' + @protein_aln[d]
+    end
+  end
+end
+##
+# Pairwise alignment of two elements (expected to belong to the same model),
+# with per-alignment statistics.
+class HAln
+  attr :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id
+  ##
+  # Builds the pairwise alignment of +a+ and +b+. Works on copies of both
+  # elements (via +dup+), so the inputs are left untouched. Each copy
+  # receives gap columns at the mask indexes of the *other* element, taken in
+  # ascending order. +model_id+ is taken from +a+.
+  def initialize(a, b)
+    a_masked = a.dup
+    a_masked.mask! b.mask.reverse
+    b_masked = b.dup
+    # Mirror of the step above. Passing b's own mask here (as was done
+    # before) always gives an empty template, so protein 2 never received
+    # the gap columns for a's masked positions and the two strings drifted
+    # out of register, making the result depend on the order of the inputs.
+    b_masked.mask! a.mask.reverse
+    @protein_1 = a_masked.protein_aln
+    @protein_2 = b_masked.protein_aln
+    @model_id = a.model_id
+    @protein_1_id = a.protein_id + '/' + a.protein_coords
+    @protein_2_id = b.protein_id + '/' + b.protein_coords
+  end
+  ##
+  # Returns a Hash with the keys (in this order) :len, :gaps, :matches and
+  # :id. Columns where both proteins have '-' are ignored; :len counts the
+  # remaining columns, :matches those with identical characters, :gaps the
+  # non-matching ones where either side is '-', and :id is
+  # 100 * matches / len. The result is computed once and then reused.
+  def stats
+    # Reuse the cached result; it must be checked before any reset.
+    return @stats unless @stats.nil?
+    counts = { len: 0, gaps: 0, matches: 0 }
+    protein_1.chars.each_with_index do |v, k|
+      w = protein_2[k]
+      next if v == '-' && w == '-'
+      counts[:len] += 1
+      if v == w
+        counts[:matches] += 1
+      elsif v == '-' || w == '-'
+        counts[:gaps] += 1
+      end
+    end
+    # Float division: with no countable columns (len 0) this yields NaN.
+    counts[:id] = 100.0 * counts[:matches] / counts[:len]
+    @stats = counts
+  end
+  ##
+  # Returns the statistics as space-separated "key:value" pairs.
+  def stats_to_s
+    stats.map{ |k,v| "#{k}:#{v}" }.join " "
+  end
+  ##
+  # Returns a three-line record: a header with the model ID, both protein
+  # IDs and the statistics, followed by the two aligned proteins.
+  def to_s
+    "# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}\n" +
+      protein_1 + "\n" + protein_2 + "\n"
+  end
+end
+# Load both alignment sets and align the models they share.
+hlist1 = HList.new(o[:a])
+hlist2 = HList.new(o[:b])
+haln_arr = hlist1.align(hlist2)
+# Without shared models there is nothing to average (the sums below would be
+# nil), so stop with an explicit message instead.
+abort 'No common models found between the two genomes.' if haln_arr.empty?
+identities    = haln_arr.map{ |i| i.stats[:id] }
+avg_identity  = identities.inject(:+) / identities.size
+avg2_identity = identities.map{ |i| i ** 2 }.inject(:+) / identities.size
+# Population variance as E[x^2] - E[x]^2. Rounding can leave it marginally
+# below zero when all identities are (nearly) equal, which Math.sqrt rejects.
+var_identity  = avg2_identity - avg_identity ** 2
+var_identity  = 0.0 if var_identity < 0
+sd_identity   = Math.sqrt(var_identity)
+puts "Common models: #{haln_arr.size}"
+# Union of the models seen in either genome.
+puts "All models: #{(hlist1.models | hlist2.models).size}"
+puts "Average identity: #{avg_identity.round(2)}%"
+puts "SD identity: #{sd_identity.round(2)}"
+# Optionally save every pairwise alignment (header plus both proteins).
+if o[:alnout]
+  File.open(o[:alnout], 'w') do |fh|
+    haln_arr.each do |i|
+      fh.puts i
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 0.3.11.2
+  version: 0.3.12.0
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-04-20 00:00:00.000000000 Z
+date: 2019-04-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons
@@ -321,6 +321,7 @@ files:
 - utils/enveomics/Scripts/GFF.catsbj.pl
 - utils/enveomics/Scripts/GenBank.add_fields.rb
 - utils/enveomics/Scripts/HMM.essential.rb
+- utils/enveomics/Scripts/HMM.haai.rb
 - utils/enveomics/Scripts/HMMsearch.extractIds.rb
 - utils/enveomics/Scripts/JPlace.distances.rb
 - utils/enveomics/Scripts/JPlace.to_iToL.rb