RubyGems - miga-base - Versions diffs - 0.3.6.3 → 0.3.7.0 - Mend

miga-base 0.3.6.3 → 0.3.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +5 -5
data/actions/init.rb +1 -1
data/actions/tax_dist.rb +32 -26
data/bin/miga +38 -38
data/lib/miga/daemon.rb +11 -5
data/lib/miga/version.rb +1 -1
data/scripts/ogs.bash +4 -2
data/utils/cleanup-databases.rb +6 -5
data/utils/distance/commands.rb +1 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
data/utils/plot-taxdist.R +42 -33
data/utils/requirements.txt +1 -0
metadata +179 -179
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
data/utils/enveomics/Scripts/lib/enveomics.R +0 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz: f989fccd5161208979728e293bfc89d18e24e1ca6941b18a804063749cce56e9
-  data.tar.gz: 7568cb56974c18279428a50f750a5302beae8063a190def53734eb985fe4b56b
+SHA1:
+  metadata.gz: 38acfea63f14c8cca837d84fbca92fa70d134e76
+  data.tar.gz: 6c6f955e4ba5c3a90ead50a3ac38896df4febcf1
 SHA512:
-  metadata.gz: 8d026577fd38d399d643fbc9ef7c01ffa7969d8206d6f859655b9d04e41d7c3e75b1432581add654c5b3d58ce6b2bedf566783c2587165f811e12f6fbf87b695
-  data.tar.gz: 4fae078b18a74314a7654d5c7a83ca7277555977b299931e33ddfabc3356cfa82994b00c6950a646eb8c16289f4cdb34f686574fc9505aacfec9ae826242c742
+  metadata.gz: d4303872589a4d02f75d392957dfbcc06f5d3e1f78107a521833f98c3d11b155dbb16bb386e1ab50723d2cb12bcd2bba77ab87033c6a7916f49ce68ce59b4941
+  data.tar.gz: 05134a38b3082e0a5982beb5f23d2a882ce4bba23c34cda1d81bd697288ac841861b6f2a07eeeafe691fed53816125f3de3fd75bf9a7fbea3d49a5975558cb3f

data/actions/init.rb CHANGED Viewed

@@ -154,7 +154,7 @@ end
 # Check for R packages
 $stderr.puts "Looking for R packages:"
-%w(enveomics.R ape phangorn phytools cluster vegan).each do |pkg|
+%w(enveomics.R ape cluster vegan).each do |pkg|
   $stderr.print "Testing #{pkg}... "
   `echo "library('#{pkg}')" | #{paths["R"].shellescape} --vanilla -q 2>&1`
   if $?.success?

data/actions/tax_dist.rb CHANGED Viewed

@@ -3,34 +3,34 @@
 # @package MiGA
 # @license Artistic-2.0
-require "miga/tax_index"
-require "zlib"
-require "tmpdir"
+require 'miga/tax_index'
+require 'zlib'
+require 'tmpdir'
-o = {q:true, format: :json}
+o = {q: true, format: :json}
 OptionParser.new do |opt|
   opt_banner(opt)
   opt_object(opt, o, [:project])
   opt_filter_datasets(opt, o)
-  opt.on("-i", "--index FILE",
-    "Pre-calculated tax-index (in tabular format) to be used.",
-    "If passed, dataset filtering arguments are ignored."
-    ){ |v| o[:index]=v }
+  opt.on('-i', '--index FILE',
+    'Pre-calculated tax-index (in tabular format) to be used.',
+    'If passed, dataset filtering arguments are ignored.'
+    ){ |v| o[:index] = v }
   opt_common(opt, o)
 end.parse!
 ##=> Functions <=
 # Returns the _cannonical_ ID between strings +a+ and +b+.
-def cannid(a, b) ; [a, b].sort.join("-") ; end
+def cannid(a, b) ; (a > b ? [b, a] : [a, b]).join('-') ; end
 ##=> Main <=
-opt_require(o, project:"-P")
+opt_require(o, project: '-P')
-$stderr.puts "Loading project." unless o[:q]
+$stderr.puts 'Loading project.' unless o[:q]
 p = MiGA::Project.load(o[:project])
 raise "Impossible to load project: #{o[:project]}" if p.nil?
-metric = p.is_clade? ? "ani" : "aai"
+metric = p.is_clade? ? 'ani' : 'aai'
 res_n  = "#{metric}_distances"
 $stderr.puts "Reading distances (1-#{metric.upcase})." unless o[:q]
 res = p.result res_n
@@ -38,31 +38,33 @@ raise "#{res_n} not yet calculated." if res.nil?
 matrix = res.file_path(:matrix)
 raise "#{res_n} has no matrix." if matrix.nil?
 dist = {}
-mfh = matrix=~/\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix,"r")
+mfh = matrix =~ /\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
 mfh.each_line do |ln|
   next if mfh.lineno==1
-  row = ln.chomp.split(/\t/)
-  dist[cannid(row[1], row[2])] = [row[3], 0, ["root:biota"]]
+  row = ln.chomp.split("\t")
+  dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
+  $stderr.print("  Ln:#{mfh.lineno} \r") if !o[:q] and (mfh.lineno % 1_000) == 0
 end
+$stderr.puts "  Lines: #{mfh.lineno}" unless o[:q]
 mfh.close
 Dir.mktmpdir do |dir|
   if o[:index].nil?
-    $stderr.puts "Loading datasets." unless o[:q]
+    $stderr.puts 'Loading datasets.' unless o[:q]
     ds = p.datasets
     ds.select!{ |d| not d.metadata[:tax].nil? }
     ds = filter_datasets!(ds, o)
-    $stderr.puts "Indexing taxonomy." unless o[:q]
+    $stderr.puts 'Indexing taxonomy.' unless o[:q]
     tax_index = MiGA::TaxIndex.new
     ds.each { |d| tax_index << d }
-    tab = File.expand_path("index.tab", dir)
-    File.open(tab, "w") { |fh| fh.print tax_index.to_tab }
+    tab = File.expand_path('index.tab', dir)
+    File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
   else
     tab = o[:index]
   end
-  $stderr.puts "Traversing taxonomy." unless o[:q]
+  $stderr.puts 'Traversing taxonomy.' unless o[:q]
   rank_i = 0
   MiGA::Taxonomy.KNOWN_RANKS.each do |rank|
     $stderr.print "o #{rank}: " unless o[:q]
@@ -70,13 +72,13 @@ Dir.mktmpdir do |dir|
     rank_i += 1
     in_rank = nil
     ds_name = []
-    File.open(tab, "r") do |fh|
+    File.open(tab, 'r') do |fh|
       fh.each_line do |ln|
         if ln =~ /^ {#{(rank_i-1)*2}}\S+:\S+:/
           in_rank = nil
           ds_name = []
         elsif ln =~ /^ {#{rank_i*2}}(#{rank}:(\S+)):/
-          in_rank = $2=="?" ? nil : $1
+          in_rank = $2 == '?' ? nil : $1
           ds_name = []
         elsif ln =~ /^ *# (\S+)/ and not in_rank.nil?
           ds_i = $1
@@ -85,8 +87,8 @@ Dir.mktmpdir do |dir|
             k = cannid(ds_i, ds_j)
             next if dist[k].nil?
             rank_n += 1
-            dist[k][1] = rank_i
-            dist[k][2].unshift in_rank
+            dist[k][3] = rank_i
+            dist[k][4].unshift in_rank
           end
         end
       end
@@ -95,6 +97,10 @@ Dir.mktmpdir do |dir|
   end
 end
+$stderr.puts 'Generating report.' unless o[:q]
 dist.keys.each do |k|
-  puts (k.split("-") + dist[k].flatten).join("\t")
+  dist[k][5] = dist[k][4].reverse.join(' ')
+  dist[k][4] = dist[k][4].first
+  puts (k.split('-') + dist[k]).join("\t")
 end

data/bin/miga CHANGED Viewed

@@ -3,43 +3,43 @@
 # @package MiGA
 # @license Artistic-2.0
-$:.push File.expand_path("../lib", File.dirname(__FILE__))
+$:.push File.expand_path('../../lib', __FILE__)
-require "optparse"
-require "miga"
+require 'optparse'
+require 'miga'
 ##=> Global variables <=
 $task_desc = {
   # Projects
-  new: "Creates an empty MiGA project.",
-  about: "Displays information about a MiGA project.",
-  plugins: "Lists or (un)installs plugins in a MiGA project.",
-  doctor: "Performs consistency checks on a MiGA project.",
+  new:      'Creates an empty MiGA project',
+  about:    'Displays information about a MiGA project',
+  plugins:  'Lists or (un)installs plugins in a MiGA project',
+  doctor:   'Performs consistency checks on a MiGA project',
   # Datasets
-  add: "Creates an empty dataset in a pre-existing MiGA project.",
-  get: "Downloads a dataset from public databases into a MiGA project.",
-  ncbi_get: "Downloads all genomes in a taxon from NCBI into a MiGA project.",
-  rm: "Removes a dataset from an MiGA project.",
-  find: "Finds unregistered datasets based on result files.",
-  ln: "Link datasets (including results) from one project to another.",
-  ls: "Lists all registered datasets in an MiGA project.",
+  add:      'Creates an empty dataset in a pre-existing MiGA project',
+  get:      'Downloads a dataset from public databases into a MiGA project',
+  ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
+  rm:       'Removes a dataset from an MiGA project',
+  find:     'Finds unregistered datasets based on result files',
+  ln:       'Link datasets (including results) from one project to another',
+  ls:       'Lists all registered datasets in an MiGA project',
   # Results
-  add_result: "Registers a result.",
-  stats: "Extracts statistics for the given result.",
-  files: "Lists all registered files from the results of a dataset or project.",
-  run: "Executes locally one step analysis producing the given result.",
-  summary: "Generates a summary table for the statistics of all datasets.",
+  add_result: 'Registers a result',
+  stats:    'Extracts statistics for the given result',
+  files:    'Lists registered files from the results of a dataset or project',
+  run:      'Executes locally one step analysis producing the given result',
+  summary:  'Generates a summary table for the statistics of all datasets',
   # System
-  init: "Initialize MiGA to process new projects.",
-  daemon: "Controls the daemon of a MiGA project.",
-  date: "Returns the current date in standard MiGA format.",
-  console: "Opens an IRB console with MiGA.",
+  init:     'Initialize MiGA to process new projects',
+  daemon:   'Controls the daemon of a MiGA project',
+  date:     'Returns the current date in standard MiGA format',
+  console:  'Opens an IRB console with MiGA',
   # Taxonomy
-  tax_set: "Registers taxonomic information for datasets.",
-  tax_test: "Returns test of taxonomic distributions for query datasets.",
-  tax_index: "Creates a taxonomy-indexed list of the datasets.",
-  tax_dist: "Estimates distributions of distance by taxonomy.",
+  tax_set:  'Registers taxonomic information for datasets',
+  tax_test: 'Returns test of taxonomic distributions for query datasets',
+  tax_index: 'Creates a taxonomy-indexed list of the datasets',
+  tax_dist: 'Estimates distributions of distance by taxonomy',
 }
 $task_alias = {
@@ -178,14 +178,14 @@ def filter_datasets!(ds, o)
 end
 def add_metadata(o, obj)
-  o[:metadata].split(",").each do |pair|
-    (k,v) = pair.split("=")
+  o[:metadata].split(',').each do |pair|
+    (k,v) = pair.split('=')
     case v
       when 'true';  v = true
       when 'false'; v = false
       when 'nil';   v = nil
     end
-    if k=='_step'
+    if k == '_step'
       obj.metadata["_try_#{v}"] ||= 0
       obj.metadata["_try_#{v}"]  += 1
     end
@@ -205,20 +205,20 @@ ARGV[0] = $task_alias[ARGV[0].to_sym] unless
   ARGV[0].nil? or $task_alias[ARGV[0].to_sym].nil?
 case ARGV[0].to_s
-when "-v", "--version"
+when '-v', '--version'
   puts MiGA::MiGA.VERSION
-when "-V", "--long-version"
+when '-V', '--long-version'
   puts MiGA::MiGA.LONG_VERSION
-when "-C", "--citation"
+when '-C', '--citation'
   puts MiGA::MiGA.CITATION
-when "console"
-  require "irb"
-  require "irb/completion"
+when 'console'
+  require 'irb'
+  require 'irb/completion'
   ARGV.shift
   IRB.start
 when *execs
   $task = ARGV.shift.to_sym
-  ARGV << "-h" if ARGV.empty? and not [:date, :init].include? $task
+  ARGV << '-h' if ARGV.empty? and not [:date, :init].include? $task
   begin
     load File.expand_path("../actions/#{$task}.rb", File.dirname(__FILE__))
   rescue => err
@@ -233,7 +233,7 @@ Microbial Genomes Atlas.
 Usage: #{$0} {action} [options]
-#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n")}
+#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n") }
 generic options:
     -h, --help          Display this screen.

data/lib/miga/daemon.rb CHANGED Viewed

@@ -99,11 +99,17 @@ class MiGA::Daemon < MiGA::MiGA
     status = JSON.parse(File.read(f_path), symbolize_names: true)
     status.keys.each do |i|
       status[i].map! do |j|
-        j.tap { |k| k[:ds] = project.dataset(k[:ds_name]) unless k[:ds].nil? }
+        j.tap do |k|
+          unless k[:ds].nil? or k[:ds_name] == 'miga-project'
+            k[:ds] = project.dataset(k[:ds_name])
+          end
+          k[:job] = k[:job].to_sym unless k[:job].nil?
+        end
       end
     end
     @jobs_running = status[:jobs_running]
     @jobs_to_run  = status[:jobs_to_run]
+    say "- jobs left running: #{@jobs_running.size}"
     purge!
     say "- jobs running: #{@jobs_running.size}"
     say "- jobs to run: #{@jobs_to_run.size}"
@@ -171,12 +177,12 @@ class MiGA::Daemon < MiGA::MiGA
   ##
   # Get the taks with key symbol +job+ in dataset +ds+. For project-wide tasks
   # let +ds+ be nil.
-  def get_job(job, ds=nil)
+  def get_job(job, ds = nil)
     (jobs_to_run + jobs_running).find do |j|
-      if ds==nil
-        j[:ds].nil? and j[:job]==job
+      if ds.nil?
+        j[:ds].nil? and j[:job] == job
       else
-        (! j[:ds].nil?) and j[:ds].name==ds.name and j[:job]==job
+        (! j[:ds].nil?) and j[:ds].name == ds.name and j[:job] == job
       end
     end
   end

data/lib/miga/version.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.3, 6, 3]
+  VERSION = [0.3, 7, 0]
   ##
   # Nickname for the current major.minor version.

data/scripts/ogs.bash CHANGED Viewed

@@ -11,7 +11,9 @@ cd "$PROJECT/data/10.clades/03.ogs"
 # Initialize
 miga date > "miga-project.start"
-DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)
+DS=$(miga ls -P "$PROJECT" --ref --no-multi)
+MIN_ID=$(miga about -P "$PROJECT" -m ogs_identity)
+[[ $MIN_ID == "?" ]] && MIN_ID=80
 if [[ ! -s miga-project.ogs ]] ; then
   # Extract RBMs
   if [[ ! -s miga-project.abc ]] ; then
@@ -19,7 +21,7 @@ if [[ ! -s miga-project.ogs ]] ; then
     for i in $DS ; do
       file="miga-project.tmp/$i.abc"
       [[ -s "$file" ]] && continue
-      echo "SELECT seq1,id1,seq2,id2,bitscore from rbm;" \
+      echo "SELECT seq1,id1,seq2,id2,bitscore from rbm where id >= $MIN_ID;" \
         | sqlite3 "../../09.distances/02.aai/$i.db" | tr "\\|" " " \
         | awk '{ print $1">"$2"'"\\t"'"$3">"$4"'"\\t"'"$5 }' \
         > "$file.tmp"

data/utils/cleanup-databases.rb CHANGED Viewed

@@ -6,18 +6,19 @@ require 'miga'
 ARGV[1] or abort "Usage: #{$0} path/to/project threads"
 $stderr.puts "Cleaning databases..."
-ds_list = MiGA::Project.load(ARGV[0]).datasets.
-  select(&:is_ref?).select(&:is_active?)
+p = MiGA::Project.load(ARGV[0])
+ds_names = p.dataset_names
 thr = ARGV[1].to_i
 (0 .. thr-1).each do |t|
   fork do
     k = -1
-    ds_list.each do |i|
+    ds_names.each do |i|
       k = (k+1) % thr
       next unless k == t
-      i.cleanup_distances!
+      d = p.dataset(i)
+      next unless d.is_ref? and d.is_active?
+      d.cleanup_distances!
     end
   end
 end

data/utils/distance/commands.rb CHANGED Viewed

@@ -21,6 +21,7 @@ module MiGA::DistanceRunner::Commands
   ##
   # Estimates AAI against +target+ using hAAI
   def haai(target)
+    return nil if opts[:haai_p] == 'no'
     haai = aai_cmd(tmp_file('ess_genes.fa'),
           target.result(:essential_genes).file_path(:ess_genes),
           dataset.name, target.name, tmp_dbs[:haai],

data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl ADDED Viewed

@@ -0,0 +1,56 @@
+#!/usr/bin/env perl
+#
+# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
+# @update: Oct 07 2015
+# @license: artistic license 2.0
+#
+use strict;
+use warnings;
+use List::Util qw/sum min max/;
+my ($seqs, $minlen, $n__) = @ARGV;
+$seqs or die "
+Description:
+   Calculates the N50 value of a set of sequences.  Alternatively, it
+   can calculate other N** values.  It also calculates the total number
+   of sequences and the total added length.
+Usage:
+   $0 seqs.fa[ minlen[ **]]
+   seqs.fa	A FastA file containing the sequences.
+   minlen	(optional) The minimum length to take into consideration.
+   		By default: 0.
+   **		Value N** to calculate.  By default: 50 (N50).
+";
+$minlen ||= 0;
+$n__    ||= 50;
+my @len = ();
+open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
+while(<SEQ>){
+   if(/^>/){
+      push @len, 0;
+   }else{
+      next if /^;/;
+      chomp;
+      s/\W//g;
+      $len[-1]+=length $_;
+   }
+}
+close SEQ;
+@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
+my $tot = (sum(@len) || 0);
+my $thr = $n__*$tot/100;
+my $pos = 0;
+for(@len){
+   $pos+= $_;
+   if($pos>=$thr){
+      print "N$n__: $_\n";
+      last;
+   }
+}
+print "Sequences: ".scalar(@len)."\n";
+print "Total length: $tot\n";

data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl ADDED Viewed

@@ -0,0 +1,60 @@
+#!/usr/bin/env perl
+#
+# @author  Luis M. Rodriguez-R
+# @update  Oct-07-2015
+# @license artistic license 2.0
+#
+use warnings;
+use strict;
+my($file, $content, $stretch) = @ARGV;
+$file or die <<HELP
+Description:
+   Filter sequences by N-content and presence of long homopolymers.
+Usage:
+   $0 sequences.fa [content [stretch]] > filtered.fa
+Where:
+   sequences.fa	Input file in FastA format
+   content	A number between 0 and 1 indicating the maximum proportion of Ns
+   		(1 to turn off, 0.5 by default)
+   stretch	A number indicating the maximum number of consecutive identical
+   		nucleotides allowed (0 to turn off, 100 by default)
+   filtered.fa	Filtered set of sequences.
+HELP
+;
+($content ||= 0.5)+=0;
+($stretch ||= 100)+=0;
+my $good = 0;
+my $N = 0;
+FASTA: {
+   local $/ = "\n>";
+   open FILE, "<", $file or die "I can not open the file: $file: $!\n";
+   SEQ: while(<FILE>){
+      $N++;
+      s/^;.*//gm;
+      s/>//g;
+      my($n,$s) = split /\n/, $_, 2;
+      (my $clean = $s) =~ s/[^ACTGN]//g;
+      if($content < 1){
+         (my $Ns = $clean) =~ s/[^N]//g;
+	 next SEQ if length($Ns)>length($clean)*$content;
+      }
+      if($stretch > 0){
+         for my $nuc (qw(A C T G N)){
+	    next SEQ if $clean =~ m/[$nuc]{$stretch}/;
+	 }
+      }
+      print ">$n\n$s\n";
+      $good++;
+   }
+   close FILE;
+   print STDERR "Total sequences: $N\nAfter filtering: $good\n";
+}