miga-base 1.3.7.3 → 1.3.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +2 -2
  3. data/utils/subclades.R +68 -25
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da6f90e73e7f3f740623ad77e09886b1c7618de2f484ba412e53d329fb771c12
4
- data.tar.gz: 662eda2643f1695285e02ef18076aa3c3bed86bfbbd2d0fc9c91481fbe6fb384
3
+ metadata.gz: 415756b71662dd191b51679aa75a4a4c75bb5ec00d6aacfa7fa6436cdd983819
4
+ data.tar.gz: a4f441f9d6ff7bb06f2a5fe5534c1d52cea5ead3b82737d3799cb65f46bf81b0
5
5
  SHA512:
6
- metadata.gz: 268833ab588449626fd9ee1c8374daea22482fe85d73570c6c9bec58dd1b00c8385e53f8c9d26f0903cf88451f715194c2ddd5bb330eff0a5695ffb9f0bcb51f
7
- data.tar.gz: 1dabe6af4158fbb26c760aa2c4673d1cdba45b832a54eb01995418bf11a8cb7c02708e99c54a1862d9bf501101f0fc55afc6f9b44d487c37d7d35cd61a1ff688
6
+ metadata.gz: f44c2506cb04763e21a9ffa81060214ca8f9d3772107b4a55820320838e08de20eac3e2d27805b62a7399b9aead3fd54eeb306a522f8886c3323bfc2149116b4
7
+ data.tar.gz: '00439963a16f86a3b8539fe0e212b63b7b62ac3eaedc8adebb88f8baae0a688a41a9518e6bbc69fc8db0b58e5551edf44b4c1764e64564aca372f358a6853d78'
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 7, 3].freeze
15
+ VERSION = [1.3, 8, 1].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 6, 29)
23
+ VERSION_DATE = Date.new(2023, 7, 5)
24
24
 
25
25
  ##
26
26
  # References of MiGA
data/utils/subclades.R CHANGED
@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
97
97
  #= Heavy-lifter
98
98
  subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
99
99
  # Get ANI distances
100
- if (length(ani.d) > 0) {
100
+ if (length(ani.d) > 0L) {
101
101
  # Just use ani.d (and save in dist_rds)
102
102
  if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
103
103
  } else if (file.exists(dist_rds)) {
@@ -107,21 +107,32 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
107
107
  stop("Cannot find input matrix", out_base)
108
108
  }
109
109
  if (length(labels(ani.d)) <= 8L) return(list())
110
-
110
+
111
+ # Subsample huge collections
112
+ nMax <- 65536L
113
+ nn <- length(labels(ani.d))
114
+ is.huge <- nn > nMax
115
+ if (is.huge) {
116
+ say("Subsampling large collection")
117
+ ids <- sample(labels(ani.d), nMax)
118
+ ani.d.ori <- ani.d
119
+ ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
120
+ }
121
+
111
122
  # Silhouette
112
123
  say("Silhouette")
113
- nn <- length(labels(ani.d))
114
124
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
115
125
  say("- Make cluster")
116
126
  cl <- makeCluster(thr)
117
127
  say("- Launch parallel jobs")
118
128
  s <- parSapply(
119
129
  cl, k,
120
- function(x) {
130
+ function(x, ani.d) {
121
131
  library(cluster)
122
132
  s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
123
133
  c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
124
- }
134
+ },
135
+ ani.d = ani.d
125
136
  )
126
137
  say("- Stop cluster")
127
138
  stopCluster(cl)
@@ -135,13 +146,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
135
146
 
136
147
  # Classify genomes
137
148
  say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
138
- is.huge <- length(labels(ani.d)) > 4e4
139
- ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
149
+ is.large <- nn > 3e4
150
+ ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
140
151
  ani.types <- ani.cl$clustering
141
152
  ani.medoids <- ani.cl$medoids
142
153
 
143
- # Build tree
154
+ # Classify excluded genomes (for huge collections)
144
155
  if (is.huge) {
156
+ say("Classifying excluded genomes")
157
+ ani.d <- ani.d.ori
158
+ # Find closest medoid for missing genomes
159
+ missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
160
+ for (i in missing)
161
+ ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
162
+ # Reorder
163
+ ani.types <- ani.types[labels(ani.d)]
164
+ # Save missing genomes for inspection
165
+ write.table(
166
+ missing, paste0(out_base, ".missing.txt"),
167
+ quote = FALSE, col.names = FALSE, row.names = FALSE
168
+ )
169
+ }
170
+
171
+ # Build tree
172
+ if (is.large) {
145
173
  say("Bypassing tree for large set")
146
174
  write.table(
147
175
  '{}', file = paste(out_base, ".nwk", sep = ""),
@@ -165,8 +193,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
165
193
  layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
166
194
  plot_distances(ani.d)
167
195
  plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
168
- plot_clustering(ani.cl, ani.d, ani.types)
169
- if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
196
+ if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
197
+ if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
170
198
  dev.off()
171
199
 
172
200
  # Save results
@@ -310,33 +338,48 @@ ani_distance <- function (ani_file, sel) {
310
338
  } else {
311
339
  sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
312
340
  }
341
+
342
+ # Extract individual variables to deal with very large matrices
343
+ a <- sim$a
344
+ b <- sim$b
345
+ d <- 1 - (sim$value / 100)
313
346
 
314
347
  # If there is no data, end process
315
- if (nrow(sim) == 0) return(NULL)
348
+ if (length(a) == 0) return(NULL)
316
349
 
317
350
  # Apply filter (if requested)
318
351
  ids <- NULL
319
352
  if (!is.na(sel) && file.exists(sel)) {
320
353
  say("Filter selection")
321
354
  ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
322
- sim <- sim[which(sim$a %in% ids & sim$b %in% ids), ]
355
+ sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
356
+ a <- a[sel.idx]
357
+ b <- b[sel.idx]
358
+ d <- d[sel.idx]
323
359
  } else {
324
- ids <- with(sim, unique(c(a, b)))
360
+ ids <- unique(c(a, b))
325
361
  }
326
362
 
327
- # Transform to distances
363
+ # Transform to dist object
328
364
  say("Distances")
329
- sim$d <- 1 - (sim$value / 100)
330
- return(as.dist(with(sim, {
331
- out <- matrix(
332
- min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
333
- dimnames = list(ids, ids)
334
- )
335
- out[cbind(ids, ids)] <- 0
336
- out[cbind(a, b)] <- d
337
- out[cbind(b, a)] <- d
338
- out
339
- })))
365
+ out <- matrix(
366
+ min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
367
+ dimnames = list(ids, ids)
368
+ )
369
+ diag(out) <- 0
370
+ # Split task to reduce peak RAM and support very large matrices
371
+ # - Note that `k` is subsetting by index, but it's defined as numeric
372
+ # instead of integer. The reason is that integer overflow occurs
373
+ # at just over 2e9, whereas numerics can represent much larger
374
+ # numbers without problems
375
+ i <- 0
376
+ while (i < length(a)) {
377
+ k <- seq(i + 1, min(i + 1e8, length(a)))
378
+ out[cbind(a[k], b[k])] <- d[k]
379
+ out[cbind(b[k], a[k])] <- d[k]
380
+ i <- i + 1e8
381
+ }
382
+ return(as.dist(out))
340
383
  }
341
384
 
342
385
  #= Main
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.7.3
4
+ version: 1.3.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-29 00:00:00.000000000 Z
11
+ date: 2023-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons