RubyGems - miga-base - Versions diffs - 1.3.7.3 → 1.3.8.1 - Mend

miga-base 1.3.7.3 → 1.3.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: da6f90e73e7f3f740623ad77e09886b1c7618de2f484ba412e53d329fb771c12
-  data.tar.gz: 662eda2643f1695285e02ef18076aa3c3bed86bfbbd2d0fc9c91481fbe6fb384
+  metadata.gz: 415756b71662dd191b51679aa75a4a4c75bb5ec00d6aacfa7fa6436cdd983819
+  data.tar.gz: a4f441f9d6ff7bb06f2a5fe5534c1d52cea5ead3b82737d3799cb65f46bf81b0
 SHA512:
-  metadata.gz: 268833ab588449626fd9ee1c8374daea22482fe85d73570c6c9bec58dd1b00c8385e53f8c9d26f0903cf88451f715194c2ddd5bb330eff0a5695ffb9f0bcb51f
-  data.tar.gz: 1dabe6af4158fbb26c760aa2c4673d1cdba45b832a54eb01995418bf11a8cb7c02708e99c54a1862d9bf501101f0fc55afc6f9b44d487c37d7d35cd61a1ff688
+  metadata.gz: f44c2506cb04763e21a9ffa81060214ca8f9d3772107b4a55820320838e08de20eac3e2d27805b62a7399b9aead3fd54eeb306a522f8886c3323bfc2149116b4
+  data.tar.gz: '00439963a16f86a3b8539fe0e212b63b7b62ac3eaedc8adebb88f8baae0a688a41a9518e6bbc69fc8db0b58e5551edf44b4c1764e64564aca372f358a6853d78'

data/lib/miga/version.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module MiGA
   # - String indicating release status:
   #   - rc* release candidate, not released as gem
   #   - [0-9]+ stable release, released as gem
-  VERSION = [1.3, 7, 3].freeze
+  VERSION = [1.3, 8, 1].freeze
   ##
   # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.new(2023, 6, 29)
+  VERSION_DATE = Date.new(2023, 7, 5)
   ##
   # References of MiGA

data/utils/subclades.R CHANGED Viewed

@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
 #= Heavy-lifter
 subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   # Get ANI distances
-  if (length(ani.d) > 0) {
+  if (length(ani.d) > 0L) {
     # Just use ani.d (and save in dist_rds)
     if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
   } else if (file.exists(dist_rds)) {
@@ -107,21 +107,32 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
     stop("Cannot find input matrix", out_base)
   }
   if (length(labels(ani.d)) <= 8L) return(list())
+  # Subsample huge collections
+  nMax <- 65536L
+  nn <- length(labels(ani.d))
+  is.huge <- nn > nMax
+  if (is.huge) {
+    say("Subsampling large collection")
+    ids <- sample(labels(ani.d), nMax)
+    ani.d.ori <- ani.d
+    ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
+  }
   # Silhouette
   say("Silhouette")
-  nn <- length(labels(ani.d))
   k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
   say("- Make cluster")
   cl <- makeCluster(thr)
   say("- Launch parallel jobs")
   s <- parSapply(
     cl, k,
-    function(x) {
+    function(x, ani.d) {
       library(cluster)
       s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
       c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
-    }
+    },
+    ani.d = ani.d
   )
   say("- Stop cluster")
   stopCluster(cl)
@@ -135,13 +146,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   # Classify genomes
   say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
-  is.huge <- length(labels(ani.d)) > 4e4
-  ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
+  is.large <- nn > 3e4
+  ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
   ani.types <- ani.cl$clustering
   ani.medoids <- ani.cl$medoids
-  # Build tree
+  # Classify excluded genomes (for huge collections)
   if (is.huge) {
+    say("Classifying excluded genomes")
+    ani.d <- ani.d.ori
+    # Find closest medoid for missing genomes
+    missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
+    for (i in missing)
+      ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
+    # Reorder
+    ani.types <- ani.types[labels(ani.d)]
+    # Save missing genomes for inspection
+    write.table(
+      missing, paste0(out_base, ".missing.txt"),
+      quote = FALSE, col.names = FALSE, row.names = FALSE
+    )
+  }
+  # Build tree
+  if (is.large) {
     say("Bypassing tree for large set")
     write.table(
       '{}', file = paste(out_base, ".nwk", sep = ""),
@@ -165,8 +193,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
   plot_distances(ani.d)
   plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
-  plot_clustering(ani.cl, ani.d, ani.types)
-  if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
+  if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
+  if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
   dev.off()
   # Save results
@@ -310,33 +338,48 @@ ani_distance <- function (ani_file, sel) {
   } else {
     sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
   }
+  # Extract individual variables to deal with very large matrices
+  a <- sim$a
+  b <- sim$b
+  d <- 1 - (sim$value / 100)
   # If there is no data, end process
-  if (nrow(sim) == 0) return(NULL)
+  if (length(a) == 0) return(NULL)
   # Apply filter (if requested)
   ids <- NULL
   if (!is.na(sel) && file.exists(sel)) {
     say("Filter selection")
     ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
-    sim <- sim[which(sim$a %in% ids & sim$b %in% ids), ]
+    sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
+    a <- a[sel.idx]
+    b <- b[sel.idx]
+    d <- d[sel.idx]
   } else {
-    ids <- with(sim, unique(c(a, b)))
+    ids <- unique(c(a, b))
   }
-  # Transform to distances
+  # Transform to dist object
   say("Distances")
-  sim$d <- 1 - (sim$value / 100)
-  return(as.dist(with(sim, {
-    out <- matrix(
-      min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
-      dimnames = list(ids, ids)
-    )
-    out[cbind(ids, ids)] <- 0
-    out[cbind(a, b)] <- d
-    out[cbind(b, a)] <- d
-    out
-  })))
+  out <- matrix(
+    min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
+    dimnames = list(ids, ids)
+  )
+  diag(out) <- 0
+  # Split task to reduce peak RAM and support very large matrices
+  # - Note that `k` is subsetting by index, but it's defined as numeric
+  #   instead of integer. The reason is that integer overflow occurs
+  #   at just over 2e9, whereas numerics can represent much larger
+  #   numbers without problems
+  i <- 0
+  while (i < length(a)) {
+    k <- seq(i + 1, min(i + 1e8, length(a)))
+    out[cbind(a[k], b[k])] <- d[k]
+    out[cbind(b[k], a[k])] <- d[k]
+    i <- i + 1e8
+  }
+  return(as.dist(out))
 }
 #= Main

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 1.3.7.3
+  version: 1.3.8.1
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-06-29 00:00:00.000000000 Z
+date: 2023-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons