RubyGems - miga-base - Versions diffs - 0.3.6.3 → 0.3.7.0 - Mend

miga-base 0.3.6.3 → 0.3.7.0

Files changed (21) hide show

checksums.yaml +5 -5
data/actions/init.rb +1 -1
data/actions/tax_dist.rb +32 -26
data/bin/miga +38 -38
data/lib/miga/daemon.rb +11 -5
data/lib/miga/version.rb +1 -1
data/scripts/ogs.bash +4 -2
data/utils/cleanup-databases.rb +6 -5
data/utils/distance/commands.rb +1 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
data/utils/plot-taxdist.R +42 -33
data/utils/requirements.txt +1 -0
metadata +179 -179
data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
data/utils/enveomics/Scripts/lib/enveomics.R +0 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz: f989fccd5161208979728e293bfc89d18e24e1ca6941b18a804063749cce56e9
-  data.tar.gz: 7568cb56974c18279428a50f750a5302beae8063a190def53734eb985fe4b56b
+SHA1:
+  metadata.gz: 38acfea63f14c8cca837d84fbca92fa70d134e76
+  data.tar.gz: 6c6f955e4ba5c3a90ead50a3ac38896df4febcf1
 SHA512:
-  metadata.gz: 8d026577fd38d399d643fbc9ef7c01ffa7969d8206d6f859655b9d04e41d7c3e75b1432581add654c5b3d58ce6b2bedf566783c2587165f811e12f6fbf87b695
-  data.tar.gz: 4fae078b18a74314a7654d5c7a83ca7277555977b299931e33ddfabc3356cfa82994b00c6950a646eb8c16289f4cdb34f686574fc9505aacfec9ae826242c742
+  metadata.gz: d4303872589a4d02f75d392957dfbcc06f5d3e1f78107a521833f98c3d11b155dbb16bb386e1ab50723d2cb12bcd2bba77ab87033c6a7916f49ce68ce59b4941
+  data.tar.gz: 05134a38b3082e0a5982beb5f23d2a882ce4bba23c34cda1d81bd697288ac841861b6f2a07eeeafe691fed53816125f3de3fd75bf9a7fbea3d49a5975558cb3f

data/actions/init.rb CHANGED Viewed

@@ -154,7 +154,7 @@ end
 # Check for R packages
 $stderr.puts "Looking for R packages:"
-%w(enveomics.R ape phangorn phytools cluster vegan).each do |pkg|
+%w(enveomics.R ape cluster vegan).each do |pkg|
   $stderr.print "Testing #{pkg}... "
   `echo "library('#{pkg}')" | #{paths["R"].shellescape} --vanilla -q 2>&1`
   if $?.success?

data/actions/tax_dist.rb CHANGED Viewed

@@ -3,34 +3,34 @@
 # @package MiGA
 # @license Artistic-2.0
-require "miga/tax_index"
-require "zlib"
-require "tmpdir"
+require 'miga/tax_index'
+require 'zlib'
+require 'tmpdir'
-o = {q:true, format: :json}
+o = {q: true, format: :json}
 OptionParser.new do |opt|
   opt_banner(opt)
   opt_object(opt, o, [:project])
   opt_filter_datasets(opt, o)
-  opt.on("-i", "--index FILE",
-    "Pre-calculated tax-index (in tabular format) to be used.",
-    "If passed, dataset filtering arguments are ignored."
-    ){ |v| o[:index]=v }
+  opt.on('-i', '--index FILE',
+    'Pre-calculated tax-index (in tabular format) to be used.',
+    'If passed, dataset filtering arguments are ignored.'
+    ){ |v| o[:index] = v }
   opt_common(opt, o)
 end.parse!
 ##=> Functions <=
 # Returns the _cannonical_ ID between strings +a+ and +b+.
-def cannid(a, b) ; [a, b].sort.join("-") ; end
+def cannid(a, b) ; (a > b ? [b, a] : [a, b]).join('-') ; end
 ##=> Main <=
-opt_require(o, project:"-P")
+opt_require(o, project: '-P')
-$stderr.puts "Loading project." unless o[:q]
+$stderr.puts 'Loading project.' unless o[:q]
 p = MiGA::Project.load(o[:project])
 raise "Impossible to load project: #{o[:project]}" if p.nil?
-metric = p.is_clade? ? "ani" : "aai"
+metric = p.is_clade? ? 'ani' : 'aai'
 res_n  = "#{metric}_distances"
 $stderr.puts "Reading distances (1-#{metric.upcase})." unless o[:q]
 res = p.result res_n
@@ -38,31 +38,33 @@ raise "#{res_n} not yet calculated." if res.nil?
 matrix = res.file_path(:matrix)
 raise "#{res_n} has no matrix." if matrix.nil?
 dist = {}
-mfh = matrix=~/\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix,"r")
+mfh = matrix =~ /\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
 mfh.each_line do |ln|
   next if mfh.lineno==1
-  row = ln.chomp.split(/\t/)
-  dist[cannid(row[1], row[2])] = [row[3], 0, ["root:biota"]]
+  row = ln.chomp.split("\t")
+  dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
+  $stderr.print("  Ln:#{mfh.lineno} \r") if !o[:q] and (mfh.lineno % 1_000) == 0
 end
+$stderr.puts "  Lines: #{mfh.lineno}" unless o[:q]
 mfh.close
 Dir.mktmpdir do |dir|
   if o[:index].nil?
-    $stderr.puts "Loading datasets." unless o[:q]
+    $stderr.puts 'Loading datasets.' unless o[:q]
     ds = p.datasets
     ds.select!{ |d| not d.metadata[:tax].nil? }
     ds = filter_datasets!(ds, o)
-    $stderr.puts "Indexing taxonomy." unless o[:q]
+    $stderr.puts 'Indexing taxonomy.' unless o[:q]
     tax_index = MiGA::TaxIndex.new
     ds.each { |d| tax_index << d }
-    tab = File.expand_path("index.tab", dir)
-    File.open(tab, "w") { |fh| fh.print tax_index.to_tab }
+    tab = File.expand_path('index.tab', dir)
+    File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
   else
     tab = o[:index]
   end
-  $stderr.puts "Traversing taxonomy." unless o[:q]
+  $stderr.puts 'Traversing taxonomy.' unless o[:q]
   rank_i = 0
   MiGA::Taxonomy.KNOWN_RANKS.each do |rank|
     $stderr.print "o #{rank}: " unless o[:q]
@@ -70,13 +72,13 @@ Dir.mktmpdir do |dir|
     rank_i += 1
     in_rank = nil
     ds_name = []
-    File.open(tab, "r") do |fh|
+    File.open(tab, 'r') do |fh|
       fh.each_line do |ln|
         if ln =~ /^ {#{(rank_i-1)*2}}\S+:\S+:/
           in_rank = nil
           ds_name = []
         elsif ln =~ /^ {#{rank_i*2}}(#{rank}:(\S+)):/
-          in_rank = $2=="?" ? nil : $1
+          in_rank = $2 == '?' ? nil : $1
           ds_name = []
         elsif ln =~ /^ *# (\S+)/ and not in_rank.nil?
           ds_i = $1
@@ -85,8 +87,8 @@ Dir.mktmpdir do |dir|
             k = cannid(ds_i, ds_j)
             next if dist[k].nil?
             rank_n += 1
-            dist[k][1] = rank_i
-            dist[k][2].unshift in_rank
+            dist[k][3] = rank_i
+            dist[k][4].unshift in_rank
           end
         end
       end
@@ -95,6 +97,10 @@ Dir.mktmpdir do |dir|
   end
 end
+$stderr.puts 'Generating report.' unless o[:q]
 dist.keys.each do |k|
-  puts (k.split("-") + dist[k].flatten).join("\t")
+  dist[k][5] = dist[k][4].reverse.join(' ')
+  dist[k][4] = dist[k][4].first
+  puts (k.split('-') + dist[k]).join("\t")
 end

data/bin/miga CHANGED Viewed

@@ -3,43 +3,43 @@
 # @package MiGA
 # @license Artistic-2.0
-$:.push File.expand_path("../lib", File.dirname(__FILE__))
+$:.push File.expand_path('../../lib', __FILE__)
-require "optparse"
-require "miga"
+require 'optparse'
+require 'miga'
 ##=> Global variables <=
 $task_desc = {
   # Projects
-  new: "Creates an empty MiGA project.",
-  about: "Displays information about a MiGA project.",
-  plugins: "Lists or (un)installs plugins in a MiGA project.",
-  doctor: "Performs consistency checks on a MiGA project.",
+  new:      'Creates an empty MiGA project',
+  about:    'Displays information about a MiGA project',
+  plugins:  'Lists or (un)installs plugins in a MiGA project',
+  doctor:   'Performs consistency checks on a MiGA project',
   # Datasets
-  add: "Creates an empty dataset in a pre-existing MiGA project.",
-  get: "Downloads a dataset from public databases into a MiGA project.",
-  ncbi_get: "Downloads all genomes in a taxon from NCBI into a MiGA project.",
-  rm: "Removes a dataset from an MiGA project.",
-  find: "Finds unregistered datasets based on result files.",
-  ln: "Link datasets (including results) from one project to another.",
-  ls: "Lists all registered datasets in an MiGA project.",
+  add:      'Creates an empty dataset in a pre-existing MiGA project',
+  get:      'Downloads a dataset from public databases into a MiGA project',
+  ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
+  rm:       'Removes a dataset from an MiGA project',
+  find:     'Finds unregistered datasets based on result files',
+  ln:       'Link datasets (including results) from one project to another',
+  ls:       'Lists all registered datasets in an MiGA project',
   # Results
-  add_result: "Registers a result.",
-  stats: "Extracts statistics for the given result.",
-  files: "Lists all registered files from the results of a dataset or project.",
-  run: "Executes locally one step analysis producing the given result.",
-  summary: "Generates a summary table for the statistics of all datasets.",
+  add_result: 'Registers a result',
+  stats:    'Extracts statistics for the given result',
+  files:    'Lists registered files from the results of a dataset or project',
+  run:      'Executes locally one step analysis producing the given result',
+  summary:  'Generates a summary table for the statistics of all datasets',
   # System
-  init: "Initialize MiGA to process new projects.",
-  daemon: "Controls the daemon of a MiGA project.",
-  date: "Returns the current date in standard MiGA format.",
-  console: "Opens an IRB console with MiGA.",
+  init:     'Initialize MiGA to process new projects',
+  daemon:   'Controls the daemon of a MiGA project',
+  date:     'Returns the current date in standard MiGA format',
+  console:  'Opens an IRB console with MiGA',
   # Taxonomy
-  tax_set: "Registers taxonomic information for datasets.",
-  tax_test: "Returns test of taxonomic distributions for query datasets.",
-  tax_index: "Creates a taxonomy-indexed list of the datasets.",
-  tax_dist: "Estimates distributions of distance by taxonomy.",
+  tax_set:  'Registers taxonomic information for datasets',
+  tax_test: 'Returns test of taxonomic distributions for query datasets',
+  tax_index: 'Creates a taxonomy-indexed list of the datasets',
+  tax_dist: 'Estimates distributions of distance by taxonomy',
 }
 $task_alias = {
@@ -178,14 +178,14 @@ def filter_datasets!(ds, o)
 end
 def add_metadata(o, obj)
-  o[:metadata].split(",").each do |pair|
-    (k,v) = pair.split("=")
+  o[:metadata].split(',').each do |pair|
+    (k,v) = pair.split('=')
     case v
       when 'true';  v = true
       when 'false'; v = false
       when 'nil';   v = nil
     end
-    if k=='_step'
+    if k == '_step'
       obj.metadata["_try_#{v}"] ||= 0
       obj.metadata["_try_#{v}"]  += 1
     end
@@ -205,20 +205,20 @@ ARGV[0] = $task_alias[ARGV[0].to_sym] unless
   ARGV[0].nil? or $task_alias[ARGV[0].to_sym].nil?
 case ARGV[0].to_s
-when "-v", "--version"
+when '-v', '--version'
   puts MiGA::MiGA.VERSION
-when "-V", "--long-version"
+when '-V', '--long-version'
   puts MiGA::MiGA.LONG_VERSION
-when "-C", "--citation"
+when '-C', '--citation'
   puts MiGA::MiGA.CITATION
-when "console"
-  require "irb"
-  require "irb/completion"
+when 'console'
+  require 'irb'
+  require 'irb/completion'
   ARGV.shift
   IRB.start
 when *execs
   $task = ARGV.shift.to_sym
-  ARGV << "-h" if ARGV.empty? and not [:date, :init].include? $task
+  ARGV << '-h' if ARGV.empty? and not [:date, :init].include? $task
   begin
     load File.expand_path("../actions/#{$task}.rb", File.dirname(__FILE__))
   rescue => err
@@ -233,7 +233,7 @@ Microbial Genomes Atlas.
 Usage: #{$0} {action} [options]
-#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n")}
+#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n") }
 generic options:
     -h, --help          Display this screen.

data/lib/miga/daemon.rb CHANGED Viewed

@@ -99,11 +99,17 @@ class MiGA::Daemon < MiGA::MiGA
     status = JSON.parse(File.read(f_path), symbolize_names: true)
     status.keys.each do |i|
       status[i].map! do |j|
-        j.tap { |k| k[:ds] = project.dataset(k[:ds_name]) unless k[:ds].nil? }
+        j.tap do |k|
+          unless k[:ds].nil? or k[:ds_name] == 'miga-project'
+            k[:ds] = project.dataset(k[:ds_name])
+          end
+          k[:job] = k[:job].to_sym unless k[:job].nil?
+        end
       end
     end
     @jobs_running = status[:jobs_running]
     @jobs_to_run  = status[:jobs_to_run]
+    say "- jobs left running: #{@jobs_running.size}"
     purge!
     say "- jobs running: #{@jobs_running.size}"
     say "- jobs to run: #{@jobs_to_run.size}"
@@ -171,12 +177,12 @@ class MiGA::Daemon < MiGA::MiGA
   ##
   # Get the taks with key symbol +job+ in dataset +ds+. For project-wide tasks
   # let +ds+ be nil.
-  def get_job(job, ds=nil)
+  def get_job(job, ds = nil)
     (jobs_to_run + jobs_running).find do |j|
-      if ds==nil
-        j[:ds].nil? and j[:job]==job
+      if ds.nil?
+        j[:ds].nil? and j[:job] == job
       else
-        (! j[:ds].nil?) and j[:ds].name==ds.name and j[:job]==job
+        (! j[:ds].nil?) and j[:ds].name == ds.name and j[:job] == job
       end
     end
   end

data/lib/miga/version.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.3, 6, 3]
+  VERSION = [0.3, 7, 0]
   ##
   # Nickname for the current major.minor version.

data/scripts/ogs.bash CHANGED Viewed

@@ -11,7 +11,9 @@ cd "$PROJECT/data/10.clades/03.ogs"
 # Initialize
 miga date > "miga-project.start"
-DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)
+DS=$(miga ls -P "$PROJECT" --ref --no-multi)
+MIN_ID=$(miga about -P "$PROJECT" -m ogs_identity)
+[[ $MIN_ID == "?" ]] && MIN_ID=80
 if [[ ! -s miga-project.ogs ]] ; then
   # Extract RBMs
   if [[ ! -s miga-project.abc ]] ; then
@@ -19,7 +21,7 @@ if [[ ! -s miga-project.ogs ]] ; then
     for i in $DS ; do
       file="miga-project.tmp/$i.abc"
       [[ -s "$file" ]] && continue
-      echo "SELECT seq1,id1,seq2,id2,bitscore from rbm;" \
+      echo "SELECT seq1,id1,seq2,id2,bitscore from rbm where id >= $MIN_ID;" \
         | sqlite3 "../../09.distances/02.aai/$i.db" | tr "\\|" " " \
         | awk '{ print $1">"$2"'"\\t"'"$3">"$4"'"\\t"'"$5 }' \
         > "$file.tmp"

data/utils/cleanup-databases.rb CHANGED Viewed

@@ -6,18 +6,19 @@ require 'miga'
 ARGV[1] or abort "Usage: #{$0} path/to/project threads"
 $stderr.puts "Cleaning databases..."
-ds_list = MiGA::Project.load(ARGV[0]).datasets.
-  select(&:is_ref?).select(&:is_active?)
+p = MiGA::Project.load(ARGV[0])
+ds_names = p.dataset_names
 thr = ARGV[1].to_i
 (0 .. thr-1).each do |t|
   fork do
     k = -1
-    ds_list.each do |i|
+    ds_names.each do |i|
       k = (k+1) % thr
       next unless k == t
-      i.cleanup_distances!
+      d = p.dataset(i)
+      next unless d.is_ref? and d.is_active?
+      d.cleanup_distances!
     end
   end
 end

data/utils/distance/commands.rb CHANGED Viewed

@@ -21,6 +21,7 @@ module MiGA::DistanceRunner::Commands
   ##
   # Estimates AAI against +target+ using hAAI
   def haai(target)
+    return nil if opts[:haai_p] == 'no'
     haai = aai_cmd(tmp_file('ess_genes.fa'),
           target.result(:essential_genes).file_path(:ess_genes),
           dataset.name, target.name, tmp_dbs[:haai],

data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl ADDED Viewed

@@ -0,0 +1,56 @@
+#!/usr/bin/env perl
+#
+# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
+# @update: Oct 07 2015
+# @license: artistic license 2.0
+#
+use strict;
+use warnings;
+use List::Util qw/sum min max/;
+my ($seqs, $minlen, $n__) = @ARGV;
+$seqs or die "
+Description:
+   Calculates the N50 value of a set of sequences.  Alternatively, it
+   can calculate other N** values.  It also calculates the total number
+   of sequences and the total added length.
+Usage:
+   $0 seqs.fa[ minlen[ **]]
+   seqs.fa	A FastA file containing the sequences.
+   minlen	(optional) The minimum length to take into consideration.
+   		By default: 0.
+   **		Value N** to calculate.  By default: 50 (N50).
+";
+$minlen ||= 0;
+$n__    ||= 50;
+my @len = ();
+open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
+while(<SEQ>){
+   if(/^>/){
+      push @len, 0;
+   }else{
+      next if /^;/;
+      chomp;
+      s/\W//g;
+      $len[-1]+=length $_;
+   }
+}
+close SEQ;
+@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
+my $tot = (sum(@len) || 0);
+my $thr = $n__*$tot/100;
+my $pos = 0;
+for(@len){
+   $pos+= $_;
+   if($pos>=$thr){
+      print "N$n__: $_\n";
+      last;
+   }
+}
+print "Sequences: ".scalar(@len)."\n";
+print "Total length: $tot\n";

data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl ADDED Viewed

@@ -0,0 +1,60 @@
+#!/usr/bin/env perl
+#
+# @author  Luis M. Rodriguez-R
+# @update  Oct-07-2015
+# @license artistic license 2.0
+#
+use warnings;
+use strict;
+my($file, $content, $stretch) = @ARGV;
+$file or die <<HELP
+Description:
+   Filter sequences by N-content and presence of long homopolymers.
+Usage:
+   $0 sequences.fa [content [stretch]] > filtered.fa
+Where:
+   sequences.fa	Input file in FastA format
+   content	A number between 0 and 1 indicating the maximum proportion of Ns
+   		(1 to turn off, 0.5 by default)
+   stretch	A number indicating the maximum number of consecutive identical
+   		nucleotides allowed (0 to turn off, 100 by default)
+   filtered.fa	Filtered set of sequences.
+HELP
+;
+($content ||= 0.5)+=0;
+($stretch ||= 100)+=0;
+my $good = 0;
+my $N = 0;
+FASTA: {
+   local $/ = "\n>";
+   open FILE, "<", $file or die "I can not open the file: $file: $!\n";
+   SEQ: while(<FILE>){
+      $N++;
+      s/^;.*//gm;
+      s/>//g;
+      my($n,$s) = split /\n/, $_, 2;
+      (my $clean = $s) =~ s/[^ACTGN]//g;
+      if($content < 1){
+         (my $Ns = $clean) =~ s/[^N]//g;
+	 next SEQ if length($Ns)>length($clean)*$content;
+      }
+      if($stretch > 0){
+         for my $nuc (qw(A C T G N)){
+	    next SEQ if $clean =~ m/[$nuc]{$stretch}/;
+	 }
+      }
+      print ">$n\n$s\n";
+      $good++;
+   }
+   close FILE;
+   print STDERR "Total sequences: $N\nAfter filtering: $good\n";
+}