RubyGems - miga-base - Versions diffs - 0.7.26.3 → 1.0.0.sr1 - Mend

miga-base 0.7.26.3 → 1.0.0.sr1

Files changed (105) hide show

checksums.yaml +4 -4
data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
data/lib/miga/cli/action/doctor.rb +50 -19
data/lib/miga/cli/action/doctor/base.rb +20 -18
data/lib/miga/cli/action/init.rb +11 -7
data/lib/miga/cli/action/init/files_helper.rb +1 -0
data/lib/miga/cli/action/ncbi_get.rb +3 -3
data/lib/miga/cli/action/tax_dist.rb +2 -2
data/lib/miga/cli/action/wf.rb +5 -4
data/lib/miga/daemon.rb +11 -4
data/lib/miga/dataset/result.rb +10 -6
data/lib/miga/json.rb +1 -2
data/lib/miga/metadata.rb +5 -1
data/lib/miga/parallel.rb +11 -6
data/lib/miga/project.rb +8 -8
data/lib/miga/project/base.rb +4 -4
data/lib/miga/project/result.rb +2 -2
data/lib/miga/sqlite.rb +7 -0
data/lib/miga/version.rb +23 -9
data/scripts/aai_distances.bash +16 -18
data/scripts/ani_distances.bash +16 -17
data/scripts/assembly.bash +31 -16
data/scripts/haai_distances.bash +3 -27
data/scripts/miga.bash +6 -4
data/scripts/p.bash +1 -1
data/scripts/read_quality.bash +9 -18
data/scripts/trimmed_fasta.bash +14 -30
data/scripts/trimmed_reads.bash +36 -36
data/test/parallel_test.rb +31 -0
data/test/project_test.rb +2 -1
data/utils/distance/commands.rb +1 -0
data/utils/distance/runner.rb +2 -4
data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
data/utils/enveomics/Manifest/Tasks/other.json +77 -0
data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
data/utils/enveomics/Manifest/categories.json +13 -4
data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
data/utils/enveomics/Scripts/SRA.download.bash +6 -8
data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
data/utils/enveomics/Scripts/aai.rb +3 -2
data/utils/enveomics/Scripts/anir.rb +137 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
data/utils/enveomics/Scripts/rbm.rb +87 -133
data/utils/enveomics/Scripts/sam.filter.rb +148 -0
data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
data/utils/enveomics/enveomics.R/R/utils.R +30 -0
data/utils/enveomics/enveomics.R/README.md +1 -0
data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
data/utils/multitrim/Multitrim How-To.pdf +0 -0
data/utils/multitrim/README.md +67 -0
data/utils/multitrim/multitrim.py +1555 -0
data/utils/multitrim/multitrim.yml +13 -0
data/utils/requirements.txt +4 -3
metadata +33 -6
data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30

data/utils/enveomics/Scripts/FastQ.tag.rb CHANGED Viewed

@@ -1,63 +1,70 @@
 #!/usr/bin/env ruby
-#
-# @author: Luis M. Rodriguez-R
-# @update: Feb-06-2015
-# @license: artistic license 2.0
-#
+# frozen_string_literal: true
-require 'optparse'
+$:.push File.expand_path('../lib', __FILE__)
+require 'enveomics_rb/enveomics'
+$VERSION = 1.1
-o = {:q=>FALSE, :p=>"", :s=>""}
-ARGV << '-h' if ARGV.size==0
+o = { q: false, p: '', s: '' }
 OptionParser.new do |opts|
-   opts.banner = "
-Generates easy-to-parse tagged reads from FastQ files.
+  opts.version = $VERSION
+  Enveomics.opt_banner(
+    opts, 'Generates easy-to-parse tagged reads from FastQ files',
+    "#{File.basename($0)} -i in.fasta -o out.fasta [options]"
+  )
-Usage: #{$0} [options]"
-   opts.separator ""
-   opts.separator "Mandatory"
-   opts.on("-i", "--in FILE", "Path to the FastQ file containing the sequences."){ |v| o[:in] = v }
-   opts.on("-o", "--out FILE", "Path to the FastQ to create."){ |v| o[:out] = v }
-   opts.separator ""
-   opts.separator "ID options"
-   opts.on("-p", "--prefix STR", "Prefix to use in all IDs."){ |v| o[:p] = v }
-   opts.on("-s", "--suffix STR", "Suffix to use in all IDs."){ |v| o[:s] = v }
-   opts.separator ""
-   opts.separator "Other Options"
-   opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
-   opts.on("-h", "--help", "Display this screen") do
-      puts opts
-      exit
-   end
-   opts.separator ""
+  opts.separator 'Mandatory'
+  opts.on(
+    '-i', '--in FILE',
+    'Path to the FastQ file containing the sequences',
+    'Supports compression with .gz extension, use - for STDIN'
+  ) { |v| o[:in] = v }
+  opts.on(
+    '-o', '--out FILE', 'Path to the FastQ to create',
+    'Supports compression with .gz extension, use - for STDOUT'
+  ) { |v| o[:out] = v }
+  opts.separator ''
+  opts.separator 'ID options'
+  opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
+  opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
+  opts.separator ''
+  opts.separator 'Other Options'
+  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
+  opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
+  opts.separator ''
 end.parse!
-abort "-i is mandatory" if o[:in].nil?
-abort "-o is mandatory" if o[:out].nil?
+raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
+raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
 begin
-   ifh = File.open(o[:in], 'r');
-   ofh = File.open(o[:out], 'w');
-   i=0
-   while ln=ifh.gets
-      ln.chomp!
-      if $.%4==1 and not /^@/.match(ln).nil?
-	 i+=1
-	 ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
-      elsif $.%4==2 or $.%4==0
-         ofh.puts ln
-      elsif $.%4==3 and not /^\+/.match(ln).nil?
-         ofh.puts "+"
-      else
-         abort "Impossible to parse line #{$.}: #{ln}.\n"
-      end
-   end
-   ifh.close
-   ofh.close
+  ifh = reader(o[:in])
+  ofh = writer(o[:out])
+  i = 0
+  lno = 0
+  ifh.each do |ln|
+    ln.chomp!
+    lno += 1
+    case lno % 4
+    when 1
+      ln =~ /^@/ or
+        raise Enveomics::ParseError.new("Cannot parse line #{$.}: #{ln}")
+      i += 1
+      ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
+    when 3
+      ln =~ /^\+/ or
+        raise Enveomics::ParseError.new("Cannot parse line #{$.}: #{ln}")
+      ofh.puts '+'
+    else
+      ofh.puts ln
+    end
+  end
+  ifh.close
+  ofh.close
 rescue => err
-   $stderr.puts "Exception: #{err}\n\n"
-   err.backtrace.each { |l| $stderr.puts l + "\n" }
-   err
+  $stderr.puts "Exception: #{err}\n\n"
+  err.backtrace.each { |l| $stderr.puts l + "\n" }
+  err
 end

data/utils/enveomics/Scripts/SRA.download.bash CHANGED Viewed

@@ -1,21 +1,19 @@
 #!/bin/bash
-#
-# @author  Luis M. Rodriguez-R
-# @license artistic license 2.0
-#
-DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
+DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
 DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
 SRX=$1
 DIR=${2:-$SRX}
+VERSION=1.0
 if [[ "$SRX" == "" ]] ; then
 echo "
+[Enveomics Collection: $(basename "$0" .bash) $VERSION]
 Downloads the set of runs from a project, sample, or experiment in SRA.
 Usage:
-$0 <SRA-ID>[ <dir>]
+$(basename "$0") <SRA-ID>[ <dir>]
 <SRA-ID>	ID of the SRA Project, Sample, or Experiment.
 <dir>		Directory where the files are to be downladed. By default,
@@ -34,7 +32,7 @@ function md5value {
   echo "$o"
 }
-curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
+curl -Ls "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
 tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
   srr=$(echo "$ln"|cut -f 1)
   ftp=$(echo "$ln"|cut -f 2)

data/utils/enveomics/Scripts/Table.prefScore.R ADDED Viewed

@@ -0,0 +1,60 @@
+#!/usr/bin/env Rscript
+#= Load stuff
+args <- commandArgs(trailingOnly = FALSE)
+enveomics_R <- file.path(
+  dirname(sub('^--file=', '', args[grep('^--file=', args)])),
+  'lib',
+  'enveomics.R'
+)
+for(file in c('cliopts.R','utils.R','prefscore.R'))
+  source(file.path(enveomics_R, 'R', file))
+#= Generate interface
+opt <- enve.cliopts(
+  enve.prefscore,
+  file.path(enveomics_R, 'man', 'enve.prefscore.Rd'),
+  positional_arguments = c(1, 4),
+  usage = 'usage: %prog [options] output.tsv [output.pdf [width height]]',
+  mandatory = c('x', 'set'),
+  number = c('signif.thr'),
+  ignore = c('plot'),
+  o_desc = list(
+    x = 'A tab-delimited table of presence/absence (1/0) with species as rows and samples as columns.',
+    set = 'A list of sample names that constitute the test set, one per line',
+    ignore = 'A list of species to exclude from the analysis, one per line'
+  )
+)
+#= Set output files
+opt$options[['x']] <- read.table(
+  opt$options[['x']],
+  header = TRUE,
+  row.names = 1,
+  sep = '\t'
+)
+opt$options[['set']] <- read.table(
+  opt$options[['set']],
+  header = FALSE,
+  sep = '\t',
+  as.is = TRUE
+)[,1]
+if(!is.null(opt$options[['ignore']]))
+  opt$options[['ignore']] <- read.table(
+    opt$options[['ignore']],
+    header = FALSE,
+    sep = '\t',
+    as.is = TRUE
+  )[,1]
+if(length(opt$args) > 1) {
+  args <- as.list(opt$args[-1])
+  for(i in 2:3) if(length(args) >= i) args[[i]] <- as.numeric(args[[i]])
+  do.call('pdf', args)
+} else {
+  opt$options[['plot']] <- FALSE
+}
+#= Run it!
+y <- do.call('enve.prefscore', opt$options)
+write.table(y, opt$args[1], quote = FALSE, sep = '\t', col.names = FALSE)
+if(length(opt$args)>1) ttt <- dev.off()

data/utils/enveomics/Scripts/aai.rb CHANGED Viewed

@@ -236,8 +236,9 @@ Dir.mktmpdir do |dir|
         end
       end
       response = RestClient.post(
-        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
-        db:"nuccore",rettype:"fasta",id:protIds.join(","),idtype:"acc")
+        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
+        db: 'protein', rettype: 'fasta', id: protIds.join(','), idtype: 'acc'
+      )
       abort "Unable to reach NCBI EUtils, error code " +
         response.code.to_s + "." unless response.code == 200
       fo.puts response.to_str

data/utils/enveomics/Scripts/anir.rb ADDED Viewed

@@ -0,0 +1,137 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+$:.push File.expand_path('../lib', __FILE__)
+require 'enveomics_rb/enveomics'
+require 'enveomics_rb/anir'
+$VERSION = 1.0
+o = {
+  q: false, threads: 2,
+  r_format: :fastq, g_format: :fasta, m_format: :sam, r_type: :single,
+  identity: 95.0, algorithm: :auto, bimodality: 0.5, bin_size: 1.0,
+  coefficient: :sarle
+}
+OptionParser.new do |opt|
+  cmd = File.basename($0)
+  # Help banner: title, one-line description, and usage examples grouped by
+  # input/output mode and by identity threshold mode
+  opt.banner = <<~BANNER
+    [Enveomics Collection: #{cmd} v#{$VERSION}]
+    Estimates ANIr: the Average Nucleotide Identity of reads against a genome
+    Usage
+        # [ Input/output modes ]
+        # Run mapping and (optionally) save it as SAM
+        # Requires bowtie2
+        #{cmd} -r reads.fastq -g genome.fasta -m out_map.sam [options]
+        # Read mapping from BAM file
+        # Requires samtools
+        #{cmd} -m map.bam --m-format bam [options]
+        # Read mapping from other formats: SAM or Tabular BLAST
+        #{cmd} -m map.blast --m-format tab [options]
+        # Read a list of identities as percentage (contig filtering off)
+        #{cmd} -m identities.txt --m-format list [options]
+        # [ Identity threshold modes ]
+        #{cmd} -i 95 -a fix [options] # Set fixed identity threshold
+        #{cmd} -a gmm [options]       # Find valley by EM of GMM
+        #{cmd} -a auto [options]      # Pick method by bimodality (default)
+  BANNER
+  opt.separator 'Input/Output'
+  opt.on('-r', '--reads PATH', 'Metagenomic reads') { |v| o[:r] = v }
+  opt.on('-g', '--genome PATH', 'Genome assembly') { |v| o[:g] = v }
+  opt.on('-m', '--mapping PATH', 'Mapping file') { |v| o[:m] = v }
+  opt.on('-L', '--list PATH', 'Output file with identities') { |v| o[:L] = v }
+  opt.on('-H', '--hist PATH', 'Output file with histogram') { |v| o[:H] = v }
+  opt.on(
+    '-T', '--tab PATH', 'Output file with results in tabular format'
+  ) { |v| o[:T] = v }
+  opt.separator ''
+  opt.separator 'Formats'
+  opt.on(
+    '--r-format STRING',
+    'Metagenomic reads format: fastq (default) or fasta',
+    'Both options support compression with .gz file extension'
+  ) { |v| o[:r_format] = v.downcase.to_sym }
+  opt.on(
+    '--r-type STRING', 'Type of metagenomic reads:',
+    '~ single (default): Single reads',
+    '~ coupled: Coupled reads in separate files (-m must be comma-delimited)',
+    '~ interleaved: Coupled reads in a single interposed file'
+  ) { |v| o[:r_type] = v.downcase.to_sym }
+  opt.on(
+    '--g-format STRING',
+    'Genome assembly format: fasta (default) or list',
+    'Both options support compression with .gz file extension',
+    'If passed in mapping-read mode, filters only matches to these contigs'
+  ) { |v| o[:g_format] = v.downcase.to_sym }
+  opt.on(
+    '--m-format STRING',
+    'Mapping file format: sam (default), bam, tab, or list',
+    'sam, tab, and list options support compression with .gz file extension'
+  ) { |v| o[:m_format] = v.downcase.to_sym }
+  opt.separator ''
+  opt.separator 'Identity threshold'
+  opt.on(
+    '-i', '--identity FLOAT', Float,
+    "Set a fixed threshold of percent identity (default: #{o[:identity]})"
+  ) { |v| o[:identity] = v }
+  opt.on(
+    '-a', '--algorithm STRING',
+    'Set an algorithm to automatically detect identity threshold:',
+    '~ gmm: Valley detection by E-M of Gaussian Mixture Model',
+    '~ fix: Fixed threshold, see -i',
+    '~ auto (default): Pick gmm or fix depending on bimodality, see -b'
+  ) { |v| o[:algorithm] = v.downcase.to_sym }
+  opt.on(
+    '-b', '--bimodality FLOAT', Float,
+    'Threshold of bimodality below which the algorithm is set to fix',
+    'The coefficient used is the de Michele & Accantino (2014) B index',
+    "By default: #{o[:bimodality]}"
+  ) { |v| o[:bimodality] = v }
+  opt.on(
+    '--coefficient STRING',
+    'Coefficient of bimodality for -a auto:',
+    '~ sarle (default): Sarle\'s bimodality coefficient b',
+    '~ dma: de Michele and Accatino (2014 PLoS ONE) B index, use with -b 0.1'
+  ) { |v| o[:coefficient] = v.downcase.to_sym }
+  opt.on(
+    '--bin-size FLOAT', Float,
+    "Width of histogram bins (in percent identity). By default: #{o[:bin_size]}"
+  ) { |v| o[:bin_size] = v }
+  opt.separator ''
+  opt.separator 'General'
+  opt.on(
+    '-t', '--threads INT', Integer, 'Threads to use'
+  ) { |v| o[:threads] = v }
+  opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
+  opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
+  opt.on('-h', '--help', 'Display this screen') do
+    puts opt
+    exit
+  end
+  opt.separator ''
+end.parse!
+anir = Enveomics::ANIr.new(o)
+anir.go!
+if o[:T]
+  File.open(o[:T], 'w') do |fh|
+    fh.puts "anir\tsd\treads\tid_threshold"
+    fh.puts [
+      anir.sample.mean, anir.sample.sd, anir.sample.n, anir.opts[:identity]
+    ].join("\t")
+  end
+end

data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb ADDED Viewed

@@ -0,0 +1,293 @@
+# frozen_string_literal: true
+require 'enveomics_rb/stats'
+require 'fileutils'
+require 'shellwords'
+require 'tmpdir'
+require 'zlib'
+module Enveomics
+  # Wrapper class for ANIr estimation
+  #
+  # Use as: +ANIr.new(opts).go!+
+  class ANIr
+    # Options hash
+    attr :opts
+    # Identities list (unsorted)
+    attr :identities
+    def initialize(opts)
+      @opts = opts
+      @identities = []
+    end
+    # --------------------------------------------------[ High-level pipelines ]
+    # Perform all the analyses
+    def go!
+      read_input
+      detect_identity
+      estimate_ani_r
+    end
+    # Identify input/output mode and read mapping
+    def read_input
+      if opts[:m_format] != :list
+        @tmpdir = Dir.mktmpdir
+        @filter_contigs = !opts[:g].nil?
+        opts[:m] = File.join(@tmpdir, 'map.sam') if opts[:m].nil?
+        run_mapping unless File.exist? opts[:m]
+        load_contigs_to_filter if @filter_contigs
+      end
+      read_mapping = :"read_mapping_from_#{opts[:m_format]}"
+      raise Enveomics::OptionError.new(
+        "Unsupported mapping format: #{opts[:m_format]}"
+      ) unless respond_to? read_mapping
+      @identities = []
+      send(read_mapping)
+      say "- Unfiltered average identity: #{sample.mean}"
+      say "- Reads mapped: #{sample.n}"
+      save_identities
+      save_histogram
+    ensure
+      @tmpdir ||= nil
+      FileUtils.rm_rf @tmpdir if @tmpdir
+    end
+    # Identify the identity threshold
+    def detect_identity
+      say 'Detecting identity threshold'
+      if opts[:algorithm] == :auto
+        say "- Bimodality: #{bimodality}"
+        opts[:algorithm] = bimodality >= opts[:bimodality] ? :gmm : :fix
+      end
+      say "- Algorithm: #{opts[:algorithm]}"
+      if opts[:algorithm] == :gmm
+        detect_identity_by_gmm
+      end
+    end
+    # Estimate ANIr
+    def estimate_ani_r
+      say 'Estimating ANIr'
+      @sample = nil # Empty cached sample
+      @identities.delete_if { |i| i < opts[:identity] }
+      say "- ANIr: #{sample.mean}"
+    end
+    # -----------------------------------------------------------------[ Utils ]
+    # Show progress unless +opts[:q]+
+    def say(*msg)
+      o = '[%s] %s' % [Time.now, msg.join('')]
+      $stderr.puts(o) unless opts[:q]
+      File.open(opts[:log], 'a') { |fh| fh.puts o } if opts[:log]
+    end
+    # Execute command in the shell
+    def run(cmd)
+      say "  - Running: #{cmd.join(' ')}"
+      `#{cmd.shelljoin} 2>&1 | tee >> #{opts[:log] || '/dev/null'}`
+      unless $?.success?
+        raise Enveomics::CommandError.new("#{cmd.first} failed: #{$?}")
+      end
+    end
+    # Returns an open file handler for the file, supporting .gz
+    def reader(file)
+      file =~ /\.gz$/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
+    end
+    # Is the mapping in SAM format?
+    def sam?
+      opts[:m_format] == :sam
+    end
+    # ------------------------------------------------------------[ Map it out ]
+    # Execute Bowtie2 and generate SAM file
+    def run_mapping
+      say 'Running mapping using Bowtie2'
+      raise Enveomics::OptionError.new(
+        'Only SAM output is supported for mapping'
+      ) unless sam?
+      @filter_contigs = false
+      say '- Indexing input sequences'
+      raise Enveomics::OptionError.new(
+        'Only FastA genome input is supported for mapping'
+      ) unless opts[:g_format] == :fasta
+      idx = File.join(@tmpdir, 'genome.idx')
+      run(['bowtie2-build', opts[:g], idx])
+      say '- Mapping metagenomic reads to genome assembly'
+      cmd = [
+        'bowtie2', '-x', idx, '-p', opts[:threads], '-S', opts[:m], '--no-mixed'
+      ]
+      cmd << '-f' if opts[:r_format] == :fasta
+      cmd +=
+        case opts[:r_type]
+        when :single
+          ['-U', opts[:r]]
+        when :coupled
+          pairs = opts[:r].split(',', 2)
+          ['-1', pairs[0], '-2', pairs[1], '--no-discordant']
+        when :interleaved
+          ['--interleaved', opts[:r], '--no-discordant']
+        else
+          raise Enveomics::OptionError.new(
+            "Unsupported reads type: #{o[:r_type]}"
+          )
+        end
+      run(cmd)
+    end
+    # If +@filter_contigs+ is true, reads the genome assembly and saves contig
+    # names to filter the mapping
+    def load_contigs_to_filter
+      return unless @filter_contigs
+      say 'Loading contigs to filter'
+      reader = reader(opts[:g])
+      @contigs_to_filter =
+        case opts[:g_format]
+        when :fasta
+          reader.each.map { |ln| $1 if ln =~ /^>(\S+)/ }.compact
+        when :list
+          reader.each.map(&:chomp)
+        else
+          raise Enveomics::OptionError.new(
+            "Unsupported genome assembly format: #{opts[:g_format]}"
+          )
+        end
+      reader.close
+      say "- Got #{@contigs_to_filter.size} contigs"
+    end
+    # Reads the mapping file assuming SAM format
+    def read_mapping_from_sam
+      say 'Reading mapping from SAM file'
+      reader = reader(opts[:m])
+      reader.each { |ln| parse_sam_line(ln) }
+      reader.close
+    end
+    # Reads the mapping file assuming BAM format
+    def read_mapping_from_bam
+      say 'Reading mapping from BAM file'
+      IO.popen(['samtools', 'view', opts[:m]].shelljoin) do |fh|
+        fh.each { |ln| parse_sam_line(ln) }
+      end
+    end
+    # Reads the mapping file assuming a Tabular BLAST format
+    def read_mapping_from_tab
+      say 'Reading mapping from Tabular BLAST file'
+      reader = reader(opts[:m])
+      reader.each do |ln|
+        next if ln =~ /^\s*(#.*)?$/ # Comment or empty line
+        row = ln.chomp.split("\t")
+        next if @filter_contigs && !@contigs_to_filter.include?(row[1])
+        @identities << row[2].to_f
+      end
+      reader.close
+    end
+    # Reads the identities from a raw-text list
+    def read_mapping_from_list
+      say 'Reading identities from raw text list'
+      reader = reader(opts[:m])
+      @identities = reader.each.map(&:to_f)
+      reader.close
+    end
+    # Parses one line in SAM format
+    def parse_sam_line(ln)
+      return if ln =~ /^@/ || ln =~ /^\s*$/
+      row = ln.chomp.split("\t")
+      return if row[2] == '*'
+      return if @filter_contigs && !@contigs_to_filter.include?(row[2])
+      length = row[9].size
+      row.shift(11) # Discard non-flag columns
+      flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
+      return if flags['YT'] && !%w[CP UU].include?(flags['YT'])
+      unless flags['MD']
+        raise Enveomics::ParseError.new(
+          "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
+        )
+      end
+      mismatches = flags['MD'].scan(/[^\d]/).count
+      @identities << 100.0 * (length - mismatches) / length
+    end
+    # Save identites as raw text
+    def save_identities
+      return unless opts[:L]
+      say '- Saving identities'
+      File.open(opts[:L], 'w') do |fh|
+        identities.each { |i| fh.puts i }
+      end
+    end
+    # Save identity histogram as raw text
+    def save_histogram
+      return unless opts[:H]
+      say '- Saving histogram'
+      File.open(opts[:H], 'w') do |fh|
+        fh.puts "from\tto\tcount"
+        sample.histo_ranges.each_with_index do |r, k|
+          fh.puts (r + [sample.histo_counts[k]]).join("\t")
+        end
+      end
+    end
+    # -----------------------------------------------------------[ Peak finder ]
+    # Detect identity threshold by gaussian mixture model EM
+    def detect_identity_by_gmm
+      model_identities_by_gmm_em
+      detect_valley_by_gmm
+    end
+    # Model identities as a 2-gaussian mix by EM
+    def model_identities_by_gmm_em
+      say 'Modeling identities by gaussian mixture model using EM'
+      # TODO: Implement
+      raise Enveomics::UnimplementedError.new('Unimplemented operation')
+    end
+    # Detect valley by gaussian mix
+    def detect_valley_by_gmm
+      say 'Detecting valley by gaussian mixture model'
+      # TODO: Implement
+      raise Enveomics::UnimplementedError.new('Unimplemented operation')
+    end
+    # -----------------------------------------------------------[ Do the math ]
+    # Identities as a Enveomics::Stats::Sample object
+    def sample
+      @sample ||= Enveomics::Stats::Sample.new(
+        identities,
+        effective_range: [nil, 100.0],
+        histo_bin_size: opts[:bin_size]
+      )
+    end
+    # Returns the bimodality coefficient indicated by +opts[:coefficient]+
+    def bimodality
+      @bimodality ||=
+        case opts[:coefficient]
+        when :sarle
+          sample.sarle_bimodality
+        when :dma
+          sample.dma_bimodality
+        else
+          raise Enveomics::OptionError.new(
+            "Unsupported coefficient of bimodality: #{opts[:coefficient]}"
+          )
+        end
+    end
+  end
+end