RubyGems - miga-base - Versions diffs - 1.3.4.2 → 1.3.5.0 - Mend

miga-base 1.3.4.2 → 1.3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/miga/cli/action/get.rb +6 -1
data/lib/miga/dataset/result/ignore.rb +33 -4
data/lib/miga/result/stats.rb +9 -5
data/lib/miga/version.rb +2 -2
data/scripts/aai_distances.bash +8 -0
data/utils/subclades.R +48 -26
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 786b98a997a9145ad36a42404b756a65ca5f8abe628adf10e27960c58b7d5641
-  data.tar.gz: 95bc1da96a509cbe3c9334b78daf4ebaec9238693ef37fd5f417063943c9cf36
+  metadata.gz: d17244b326441f224e4626b53702018fd0f3915d44e16e9be939a70dd86ceefa
+  data.tar.gz: 8f6c7544ab57957dbfeb53230e56ba9e7eda37c7bab2fea538ee17d9f54fd9f7
 SHA512:
-  metadata.gz: 909a0cf12a2048024d4c8f5ed2210661d96b78b4c6a6efdae650ca49d6f34f8439c812fb14b30df664c8758beb4c532bc4b1c7a6a09e108f7c48418a03d09902
-  data.tar.gz: 2af006dc05851eb66eb7e3f0fec127b10bec73945b5761a3145d9973935a65c93b7b010bb977b4b492f527201f31b68c5caa4c68d8815c2d474f96420c642d31
+  metadata.gz: bd77962a43caa04c72d01ddbe251fb02650f0f003c55b2b13a44ac1affb77b52535e5dd5fd7713c387f5a7bafdd8839255f2cd8d82be8d4c025b562a7beabfbd
+  data.tar.gz: b0cd50e25aa16ce2c64202daa5bbbb256e3f83742513ac984715f1b7c96f7297887856a2853ac6ec49dd43fe63b5c37cd96e83f8047ebcd32fdbc47addf892f7

data/lib/miga/cli/action/get.rb CHANGED Viewed

@@ -120,7 +120,12 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
   def create_remote_dataset(sub_cli, p)
     sub_cli.ensure_par(dataset: '-D', ids: '-I')
     unless sub_cli[:api_key].nil?
-      ENV["#{sub_cli[:universe].to_s.upcase}_API_KEY"] = sub_cli[:api_key]
+      if sub_cli[:universe] == :web && sub_cli[:db] == :assembly_gz
+        ENV['NCBI_API_KEY'] = sub_cli[:api_key]
+      end
+      var_space = sub_cli[:universe].to_s.upcase
+      ENV["#{var_space}_API_KEY"] = sub_cli[:api_key]
     end
     sub_cli.say "Dataset: #{sub_cli[:dataset]}"

data/lib/miga/dataset/result/ignore.rb CHANGED Viewed

@@ -76,18 +76,47 @@ module MiGA::Dataset::Result::Ignore
   ##
   # Ignore +task+ because it's not a reference dataset
   def ignore_noref?(task)
-    self.class.EXCLUDE_NOREF_TASKS.include?(task) && !ref?
+    ignore_by_type?(task, :noref)
   end
   ##
   # Ignore +task+ because it's not a multi dataset
   def ignore_multi?(task)
-    self.class.ONLY_MULTI_TASKS.include?(task) && !multi?
+    ignore_by_type?(task, :multi)
   end
   ##
   # Ignore +task+ because it's not a nonmulti dataset
   def ignore_nonmulti?(task)
-    self.class.ONLY_NONMULTI_TASKS.include?(task) && !nonmulti?
+    ignore_by_type?(task, :nonmulti)
   end
-end
+  ##
+  # Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or
+  # +:nonmulti+
+  def ignore_by_type?(task, type)
+    return false if force_task?(task)
+    test, list =
+      case type.to_sym
+      when :noref
+        [:ref?, self.class.EXCLUDE_NOREF_TASKS]
+      when :multi
+        [:multi?, self.class.ONLY_MULTI_TASKS]
+      when :nonmulti
+        [:nonmulti?, self.class.ONLY_NONMULTI_TASKS]
+      else
+        raise "Unexpected error, unknown type reason: #{type}"
+      end
+    list.include?(task) && !send(test)
+  end
+  ##
+  # Force the +task+ to be executed even if it should otherwise be
+  # ignored due to reasons: +:noref+, +:multi+, or +:nonmulti+. Other
+  # reasons to ignore a task are not affected by metadata forcing
+  def force_task?(task)
+    !!metadata["run_#{task}"]
+  end
+end

data/lib/miga/result/stats.rb CHANGED Viewed

@@ -141,12 +141,16 @@ module MiGA::Result::Stats
       # Determine qualitative range
       stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
       source.metadata[:quality] =
-        case stats[:quality]
-        when 80..100; :excellent
-        when 50..80; :high
-        when 20..50; :intermediate
-        else; :low
+        if stats[:completeness][0] >= 90 && stats[:contamination][0] <= 5
+          :excellent    # Finished or High-quality draft*
+        elsif stats[:completeness][0] >= 50 && stats[:contamination][0] <= 10
+          :high         # Medium-quality draft*
+        elsif stats[:quality] >= 25
+          :intermediate # Low-quality draft* but sufficient for classification
+        else
+          :low          # Low-quality draft* and insufficient for classification
         end
+        # * Bowers et al 2017, DOI: 10.1038/nbt.3893
       source.save
       # Inactivate low-quality datasets

data/lib/miga/version.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module MiGA
   # - String indicating release status:
   #   - rc* release candidate, not released as gem
   #   - [0-9]+ stable release, released as gem
-  VERSION = [1.3, 4, 2].freeze
+  VERSION = [1.3, 5, 0].freeze
   ##
   # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.new(2023, 4, 7)
+  VERSION_DATE = Date.new(2023, 4, 21)
   ##
   # References of MiGA

data/scripts/aai_distances.bash CHANGED Viewed

@@ -18,6 +18,14 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
   for i in $DS ; do
     echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
   done
+  # The following block pipes retrieved data from all databases, reorganizes the
+  # names in canonical order, and removes repeats from the first two columns,
+  # in order to keep only one result per pair. This is not being included into
+  # production, but the code may be useful for extremely large databases.
+  # | tee \
+  # | awk -F"\t" \
+  #   'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
+  # | sort -k 1,2 -u
 ) | gzip -9c > miga-project.txt.gz
 # R-ify

data/utils/subclades.R CHANGED Viewed

@@ -47,7 +47,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
     say("Loading")
     ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
       sep = " ", as.is = TRUE)[,1]
-    a <- read.table(paste(out_base, "classif", sep="."),
+    a <- read.table(paste(out_base, "classif", sep = "."),
       sep = "\t", as.is = TRUE)
     ani.types <- a[,2]
     names(ani.types) <- a[,1]
@@ -70,17 +70,17 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
   say("Recursive search")
   for (i in 1:length(ani.medoids)) {
     medoid <- ani.medoids[i]
-    ds_f <- names(ani.types)[ ani.types==i ]
+    ds_f <- names(ani.types)[ani.types == i]
     say("Analyzing subclade", i, "with medoid:", medoid)
-    dir_f <- paste(out_base, ".sc-", i, sep="")
+    dir_f <- paste(out_base, ".sc-", i, sep = "")
     if (!dir.exists(dir_f)) dir.create(dir_f)
     write.table(ds_f,
-      paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
-      quote=FALSE, col.names=FALSE, row.names=FALSE)
+      paste(out_base, ".sc-", i, "/miga-project.all", sep = ""),
+      quote = FALSE, col.names = FALSE, row.names = FALSE)
     if (length(ds_f) > 8L) {
       ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
       subclades(
-        out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
+        out_base = paste(out_base, ".sc-", i, "/miga-project", sep = ""),
         thr = thr,
         ani.d = ani_subset
       )
@@ -111,7 +111,7 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   # Silhouette
   say("Silhouette")
   nn <- length(labels(ani.d))
-  k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
+  k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
   say("- Make cluster")
   cl <- makeCluster(thr)
   say("- Launch parallel jobs")
@@ -119,8 +119,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
     cl, k,
     function(x) {
       library(cluster)
-      s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
-      c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
+      s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
+      c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
     }
   )
   say("- Stop cluster")
@@ -135,29 +135,38 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   # Classify genomes
   say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
-  ani.cl <- pam(ani.d, top.n)
+  is.huge <- length(labels(ani.d)) > 4e4
+  ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
   ani.types <- ani.cl$clustering
   ani.medoids <- ani.cl$medoids
   # Build tree
-  say("Tree")
-  ani.ph <- bionj(ani.d)
-  say("- Write")
-  express.ori <- options("expressions")$expressions
-  if(express.ori < ani.ph$Nnode * 4){
-    options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
+  if (is.huge) {
+    say("Bypassing tree for large set")
+    write.table(
+      '{}', file = paste(out_base, ".nwk", sep = ""),
+      col.names = FALSE, row.names = FALSE, quote = FALSE
+    )
+  } else {
+    say("Tree")
+    ani.ph <- bionj(ani.d)
+    say("- Write")
+    express.ori <- options("expressions")$expressions
+    if(express.ori < ani.ph$Nnode * 4){
+      options(expressions = min(c(5e7, ani.ph$Nnode * 4)))
+    }
+    write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
+    options(expressions = express.ori)
   }
-  write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
-  options(expressions=express.ori)
   # Generate graphic report
   say("Graphic report")
   pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
   layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
   plot_distances(ani.d)
-  plot_silhouette(k, s[1,], s[2,], ds, top.n)
+  plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
   plot_clustering(ani.cl, ani.d, ani.types)
-  plot_tree(ani.ph, ani.types, ani.medoids)
+  if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
   dev.off()
   # Save results
@@ -198,7 +207,7 @@ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
     classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
   }
   write.table(
-    classif, paste(out_base, "classif", sep="."),
+    classif, paste(out_base, "classif", sep = "."),
     quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
   )
 }
@@ -249,7 +258,8 @@ plot_clustering <- function (cl, dist, types) {
   top.n <- length(cl$medoids)
   col <- ggplotColours(top.n)
   plot(silhouette(cl), col = col)
-  if (length(labels(dist)) <= 15) {
+  dist.n <- length(labels(dist))
+  if (dist.n <= 15 | dist.n > 4e4) {
     plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
     plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
   } else {
@@ -261,7 +271,7 @@ plot_clustering <- function (cl, dist, types) {
       )
       plot(
         ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
-	xlab = "Component 3", ylab="Component 4"
+	xlab = "Component 3", ylab = "Component 4"
       )
     }else{
       for (i in 1:2)
@@ -305,16 +315,28 @@ ani_distance <- function (ani_file, sel) {
   if (nrow(sim) == 0) return(NULL)
   # Apply filter (if requested)
+  ids <- NULL
   if (!is.na(sel) && file.exists(sel)) {
     say("Filter selection")
-    lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
-    sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
+    ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
+    sim <- sim[sim$a %in% ids & sim$b %in% ids, ]
+  } else {
+    ids <- with(sim, unique(c(a, b)))
   }
   # Transform to distances
   say("Distances")
   sim$d <- 1 - (sim$value / 100)
-  return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
+  return(as.dist(with(sim, {
+    out <- matrix(
+      max(d) * 1.2, nrow = length(ids), ncol = length(ids),
+      dimnames = list(ids, ids)
+    )
+    out[cbind(ids, ids)] <- 0
+    out[cbind(a, b)] <- d
+    out[cbind(b, a)] <- d
+    out
+  })))
 }
 #= Main

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 1.3.4.2
+  version: 1.3.5.0
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-04-07 00:00:00.000000000 Z
+date: 2023-04-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons