miga-base 1.3.7.3 → 1.3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +2 -2
  3. data/utils/subclades.R +65 -23
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da6f90e73e7f3f740623ad77e09886b1c7618de2f484ba412e53d329fb771c12
4
- data.tar.gz: 662eda2643f1695285e02ef18076aa3c3bed86bfbbd2d0fc9c91481fbe6fb384
3
+ metadata.gz: 26e4d6df1a418582af445818b9965eaee00bb76282e5acd9fc1480d6b9d3e57b
4
+ data.tar.gz: 16c6f25a55191ca185fd4f5718e721780e718555525301ce98613290f7f38cc5
5
5
  SHA512:
6
- metadata.gz: 268833ab588449626fd9ee1c8374daea22482fe85d73570c6c9bec58dd1b00c8385e53f8c9d26f0903cf88451f715194c2ddd5bb330eff0a5695ffb9f0bcb51f
7
- data.tar.gz: 1dabe6af4158fbb26c760aa2c4673d1cdba45b832a54eb01995418bf11a8cb7c02708e99c54a1862d9bf501101f0fc55afc6f9b44d487c37d7d35cd61a1ff688
6
+ metadata.gz: c38ef4522680e357f95c239de8529bd1c7f1be307243000e38dabfa592d99226ca5e228b56efa8918ad3ab55de75858f8fdec3ecaeef5d6471ec37af1a0d2fdc
7
+ data.tar.gz: 3ab0c91d807c27ceaeb412dffbdda2da29d8c558772988b5d8af6063ef9dee206329ff7caf9e5c5795b5ee5b0f019b8b8a0906eb3bb6daca4611b394b5b8df6d
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 7, 3].freeze
15
+ VERSION = [1.3, 8, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 6, 29)
23
+ VERSION_DATE = Date.new(2023, 7, 5)
24
24
 
25
25
  ##
26
26
  # References of MiGA
data/utils/subclades.R CHANGED
@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
97
97
  #= Heavy-lifter
98
98
  subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
99
99
  # Get ANI distances
100
- if (length(ani.d) > 0) {
100
+ if (length(ani.d) > 0L) {
101
101
  # Just use ani.d (and save in dist_rds)
102
102
  if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
103
103
  } else if (file.exists(dist_rds)) {
@@ -107,10 +107,20 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
107
107
  stop("Cannot find input matrix", out_base)
108
108
  }
109
109
  if (length(labels(ani.d)) <= 8L) return(list())
110
-
110
+
111
+ # Subsample huge collections
112
+ nMax <- 65536L
113
+ nn <- length(labels(ani.d))
114
+ is.huge <- nn > nMax
115
+ if (is.huge) {
116
+ say("Subsampling large collection")
117
+ ids <- sample(labels(ani.d), nMax)
118
+ ani.d.ori <- ani.d
119
+ ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
120
+ }
121
+
111
122
  # Silhouette
112
123
  say("Silhouette")
113
- nn <- length(labels(ani.d))
114
124
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
115
125
  say("- Make cluster")
116
126
  cl <- makeCluster(thr)
@@ -135,13 +145,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
135
145
 
136
146
  # Classify genomes
137
147
  say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
138
- is.huge <- length(labels(ani.d)) > 4e4
139
- ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
148
+ is.large <- nn > 3e4
149
+ ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
140
150
  ani.types <- ani.cl$clustering
141
151
  ani.medoids <- ani.cl$medoids
142
152
 
143
- # Build tree
153
+ # Classify excluded genome (for huge collections)
144
154
  if (is.huge) {
155
+ say("Classifying excluded genomes")
156
+ ani.d <- ani.d.ori
157
+ # Find closest medoid for missing genomes
158
+ missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
159
+ for (i in missing)
160
+ ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
161
+ # Reorder
162
+ ani.types <- ani.types[labels(ani.d)]
163
+ # Save missing genomes for inspection
164
+ write.table(
165
+ missing, paste0(out_base, ".missing.txt"),
166
+ quote = FALSE, col.names = FALSE, row.names = FALSE
167
+ )
168
+ }
169
+
170
+ # Build tree
171
+ if (is.large) {
145
172
  say("Bypassing tree for large set")
146
173
  write.table(
147
174
  '{}', file = paste(out_base, ".nwk", sep = ""),
@@ -165,8 +192,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
165
192
  layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
166
193
  plot_distances(ani.d)
167
194
  plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
168
- plot_clustering(ani.cl, ani.d, ani.types)
169
- if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
195
+ if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
196
+ if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
170
197
  dev.off()
171
198
 
172
199
  # Save results
@@ -310,33 +337,48 @@ ani_distance <- function (ani_file, sel) {
310
337
  } else {
311
338
  sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
312
339
  }
340
+
341
+ # Extract individual variables to deal with very large matrices
342
+ a <- sim$a
343
+ b <- sim$b
344
+ d <- 1 - (sim$value / 100)
313
345
 
314
346
  # If there is no data, end process
315
- if (nrow(sim) == 0) return(NULL)
347
+ if (length(a) == 0) return(NULL)
316
348
 
317
349
  # Apply filter (if requested)
318
350
  ids <- NULL
319
351
  if (!is.na(sel) && file.exists(sel)) {
320
352
  say("Filter selection")
321
353
  ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
322
- sim <- sim[which(sim$a %in% ids & sim$b %in% ids), ]
354
+ sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
355
+ a <- a[sel.idx]
356
+ b <- b[sel.idx]
357
+ d <- d[sel.idx]
323
358
  } else {
324
- ids <- with(sim, unique(c(a, b)))
359
+ ids <- unique(c(a, b))
325
360
  }
326
361
 
327
- # Transform to distances
362
+ # Transform to dist object
328
363
  say("Distances")
329
- sim$d <- 1 - (sim$value / 100)
330
- return(as.dist(with(sim, {
331
- out <- matrix(
332
- min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
333
- dimnames = list(ids, ids)
334
- )
335
- out[cbind(ids, ids)] <- 0
336
- out[cbind(a, b)] <- d
337
- out[cbind(b, a)] <- d
338
- out
339
- })))
364
+ out <- matrix(
365
+ min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
366
+ dimnames = list(ids, ids)
367
+ )
368
+ diag(out) <- 0
369
+ # Split task to reduce peak RAM and support very large matrices
370
+ # - Note that `k` is subsetting by index, but it's defined as numeric
371
+ # instead of integer. The reason is that integer overflow occurs
372
+ # at just over 2e9, whereas numerics can represent much larger
373
+ # numbers without problems
374
+ i <- 0
375
+ while (i < length(a)) {
376
+ k <- seq(i + 1, min(i + 1e8, length(a)))
377
+ out[cbind(a[k], b[k])] <- d[k]
378
+ out[cbind(b[k], a[k])] <- d[k]
379
+ i <- i + 1e8
380
+ }
381
+ return(as.dist(out))
340
382
  }
341
383
 
342
384
  #= Main
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.7.3
4
+ version: 1.3.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-29 00:00:00.000000000 Z
11
+ date: 2023-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons