RubyGems - miga-base - Versions diffs - 1.3.7.3 → 1.3.8.0 - Mend

miga-base 1.3.7.3 → 1.3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: da6f90e73e7f3f740623ad77e09886b1c7618de2f484ba412e53d329fb771c12
-  data.tar.gz: 662eda2643f1695285e02ef18076aa3c3bed86bfbbd2d0fc9c91481fbe6fb384
+  metadata.gz: 26e4d6df1a418582af445818b9965eaee00bb76282e5acd9fc1480d6b9d3e57b
+  data.tar.gz: 16c6f25a55191ca185fd4f5718e721780e718555525301ce98613290f7f38cc5
 SHA512:
-  metadata.gz: 268833ab588449626fd9ee1c8374daea22482fe85d73570c6c9bec58dd1b00c8385e53f8c9d26f0903cf88451f715194c2ddd5bb330eff0a5695ffb9f0bcb51f
-  data.tar.gz: 1dabe6af4158fbb26c760aa2c4673d1cdba45b832a54eb01995418bf11a8cb7c02708e99c54a1862d9bf501101f0fc55afc6f9b44d487c37d7d35cd61a1ff688
+  metadata.gz: c38ef4522680e357f95c239de8529bd1c7f1be307243000e38dabfa592d99226ca5e228b56efa8918ad3ab55de75858f8fdec3ecaeef5d6471ec37af1a0d2fdc
+  data.tar.gz: 3ab0c91d807c27ceaeb412dffbdda2da29d8c558772988b5d8af6063ef9dee206329ff7caf9e5c5795b5ee5b0f019b8b8a0906eb3bb6daca4611b394b5b8df6d

data/lib/miga/version.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module MiGA
   # - String indicating release status:
   #   - rc* release candidate, not released as gem
   #   - [0-9]+ stable release, released as gem
-  VERSION = [1.3, 7, 3].freeze
+  VERSION = [1.3, 8, 0].freeze
   ##
   # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.new(2023, 6, 29)
+  VERSION_DATE = Date.new(2023, 7, 5)
   ##
   # References of MiGA

data/utils/subclades.R CHANGED Viewed

@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
 #= Heavy-lifter
 subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   # Get ANI distances
-  if (length(ani.d) > 0) {
+  if (length(ani.d) > 0L) {
     # Just use ani.d (and save in dist_rds)
     if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
   } else if (file.exists(dist_rds)) {
@@ -107,10 +107,20 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
     stop("Cannot find input matrix", out_base)
   }
   if (length(labels(ani.d)) <= 8L) return(list())
+  # Subsample huge collections
+  nMax <- 65536L
+  nn <- length(labels(ani.d))
+  is.huge <- nn > nMax
+  if (is.huge) {
+    say("Subsampling large collection")
+    ids <- sample(labels(ani.d), nMax)
+    ani.d.ori <- ani.d
+    ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
+  }
   # Silhouette
   say("Silhouette")
-  nn <- length(labels(ani.d))
   k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
   say("- Make cluster")
   cl <- makeCluster(thr)
@@ -135,13 +145,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   # Classify genomes
   say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
-  is.huge <- length(labels(ani.d)) > 4e4
-  ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
+  is.large <- nn > 3e4
+  ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
   ani.types <- ani.cl$clustering
   ani.medoids <- ani.cl$medoids
-  # Build tree
+  # Classify excluded genomes (for huge collections)
   if (is.huge) {
+    say("Classifying excluded genomes")
+    ani.d <- ani.d.ori
+    # Find closest medoid for missing genomes
+    missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
+    for (i in missing)
+      ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
+    # Reorder
+    ani.types <- ani.types[labels(ani.d)]
+    # Save missing genomes for inspection
+    write.table(
+      missing, paste0(out_base, ".missing.txt"),
+      quote = FALSE, col.names = FALSE, row.names = FALSE
+    )
+  }
+  # Build tree
+  if (is.large) {
     say("Bypassing tree for large set")
     write.table(
       '{}', file = paste(out_base, ".nwk", sep = ""),
@@ -165,8 +192,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
   layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
   plot_distances(ani.d)
   plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
-  plot_clustering(ani.cl, ani.d, ani.types)
-  if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
+  if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
+  if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
   dev.off()
   # Save results
@@ -310,33 +337,48 @@ ani_distance <- function (ani_file, sel) {
   } else {
     sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
   }
+  # Extract individual variables to deal with very large matrices
+  a <- sim$a
+  b <- sim$b
+  d <- 1 - (sim$value / 100)
   # If there is no data, end process
-  if (nrow(sim) == 0) return(NULL)
+  if (length(a) == 0) return(NULL)
   # Apply filter (if requested)
   ids <- NULL
   if (!is.na(sel) && file.exists(sel)) {
     say("Filter selection")
     ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
-    sim <- sim[which(sim$a %in% ids & sim$b %in% ids), ]
+    sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
+    a <- a[sel.idx]
+    b <- b[sel.idx]
+    d <- d[sel.idx]
   } else {
-    ids <- with(sim, unique(c(a, b)))
+    ids <- unique(c(a, b))
   }
-  # Transform to distances
+  # Transform to dist object
   say("Distances")
-  sim$d <- 1 - (sim$value / 100)
-  return(as.dist(with(sim, {
-    out <- matrix(
-      min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
-      dimnames = list(ids, ids)
-    )
-    out[cbind(ids, ids)] <- 0
-    out[cbind(a, b)] <- d
-    out[cbind(b, a)] <- d
-    out
-  })))
+  out <- matrix(
+    min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
+    dimnames = list(ids, ids)
+  )
+  diag(out) <- 0
+  # Split task to reduce peak RAM and support very large matrices
+  # - Note that `k` holds the indices used for subsetting, but it's defined
+  #   as numeric instead of integer. The reason is that integer overflow
+  #   occurs at just over 2e9, whereas numerics can represent much larger
+  #   numbers without problems
+  i <- 0
+  while (i < length(a)) {
+    k <- seq(i + 1, min(i + 1e8, length(a)))
+    out[cbind(a[k], b[k])] <- d[k]
+    out[cbind(b[k], a[k])] <- d[k]
+    i <- i + 1e8
+  }
+  return(as.dist(out))
 }
 #= Main

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 1.3.7.3
+  version: 1.3.8.0
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-06-29 00:00:00.000000000 Z
+date: 2023-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons