miga-base 1.3.7.3 → 1.3.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/version.rb +2 -2
  3. data/utils/subclades.R +65 -23
  4. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da6f90e73e7f3f740623ad77e09886b1c7618de2f484ba412e53d329fb771c12
4
- data.tar.gz: 662eda2643f1695285e02ef18076aa3c3bed86bfbbd2d0fc9c91481fbe6fb384
3
+ metadata.gz: 26e4d6df1a418582af445818b9965eaee00bb76282e5acd9fc1480d6b9d3e57b
4
+ data.tar.gz: 16c6f25a55191ca185fd4f5718e721780e718555525301ce98613290f7f38cc5
5
5
  SHA512:
6
- metadata.gz: 268833ab588449626fd9ee1c8374daea22482fe85d73570c6c9bec58dd1b00c8385e53f8c9d26f0903cf88451f715194c2ddd5bb330eff0a5695ffb9f0bcb51f
7
- data.tar.gz: 1dabe6af4158fbb26c760aa2c4673d1cdba45b832a54eb01995418bf11a8cb7c02708e99c54a1862d9bf501101f0fc55afc6f9b44d487c37d7d35cd61a1ff688
6
+ metadata.gz: c38ef4522680e357f95c239de8529bd1c7f1be307243000e38dabfa592d99226ca5e228b56efa8918ad3ab55de75858f8fdec3ecaeef5d6471ec37af1a0d2fdc
7
+ data.tar.gz: 3ab0c91d807c27ceaeb412dffbdda2da29d8c558772988b5d8af6063ef9dee206329ff7caf9e5c5795b5ee5b0f019b8b8a0906eb3bb6daca4611b394b5b8df6d
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 7, 3].freeze
15
+ VERSION = [1.3, 8, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 6, 29)
23
+ VERSION_DATE = Date.new(2023, 7, 5)
24
24
 
25
25
  ##
26
26
  # References of MiGA
data/utils/subclades.R CHANGED
@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
97
97
  #= Heavy-lifter
98
98
  subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
99
99
  # Get ANI distances
100
- if (length(ani.d) > 0) {
100
+ if (length(ani.d) > 0L) {
101
101
  # Just use ani.d (and save in dist_rds)
102
102
  if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
103
103
  } else if (file.exists(dist_rds)) {
@@ -107,10 +107,20 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
107
107
  stop("Cannot find input matrix", out_base)
108
108
  }
109
109
  if (length(labels(ani.d)) <= 8L) return(list())
110
-
110
+
111
+ # Subsample huge collections
112
+ nMax <- 65536L
113
+ nn <- length(labels(ani.d))
114
+ is.huge <- nn > nMax
115
+ if (is.huge) {
116
+ say("Subsampling large collection")
117
+ ids <- sample(labels(ani.d), nMax)
118
+ ani.d.ori <- ani.d
119
+ ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
120
+ }
121
+
111
122
  # Silhouette
112
123
  say("Silhouette")
113
- nn <- length(labels(ani.d))
114
124
  k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
115
125
  say("- Make cluster")
116
126
  cl <- makeCluster(thr)
@@ -135,13 +145,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
135
145
 
136
146
  # Classify genomes
137
147
  say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
138
- is.huge <- length(labels(ani.d)) > 4e4
139
- ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
148
+ is.large <- nn > 3e4
149
+ ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
140
150
  ani.types <- ani.cl$clustering
141
151
  ani.medoids <- ani.cl$medoids
142
152
 
143
- # Build tree
153
+ # Classify excluded genomes (for huge collections)
144
154
  if (is.huge) {
155
+ say("Classifying excluded genomes")
156
+ ani.d <- ani.d.ori
157
+ # Find closest medoid for missing genomes
158
+ missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
159
+ for (i in missing)
160
+ ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
161
+ # Reorder
162
+ ani.types <- ani.types[labels(ani.d)]
163
+ # Save missing genomes for inspection
164
+ write.table(
165
+ missing, paste0(out_base, ".missing.txt"),
166
+ quote = FALSE, col.names = FALSE, row.names = FALSE
167
+ )
168
+ }
169
+
170
+ # Build tree
171
+ if (is.large) {
145
172
  say("Bypassing tree for large set")
146
173
  write.table(
147
174
  '{}', file = paste(out_base, ".nwk", sep = ""),
@@ -165,8 +192,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
165
192
  layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
166
193
  plot_distances(ani.d)
167
194
  plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
168
- plot_clustering(ani.cl, ani.d, ani.types)
169
- if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
195
+ if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
196
+ if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
170
197
  dev.off()
171
198
 
172
199
  # Save results
@@ -310,33 +337,48 @@ ani_distance <- function (ani_file, sel) {
310
337
  } else {
311
338
  sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
312
339
  }
340
+
341
+ # Extract individual variables to deal with very large matrices
342
+ a <- sim$a
343
+ b <- sim$b
344
+ d <- 1 - (sim$value / 100)
313
345
 
314
346
  # If there is no data, end process
315
- if (nrow(sim) == 0) return(NULL)
347
+ if (length(a) == 0) return(NULL)
316
348
 
317
349
  # Apply filter (if requested)
318
350
  ids <- NULL
319
351
  if (!is.na(sel) && file.exists(sel)) {
320
352
  say("Filter selection")
321
353
  ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
322
- sim <- sim[which(sim$a %in% ids & sim$b %in% ids), ]
354
+ sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
355
+ a <- a[sel.idx]
356
+ b <- b[sel.idx]
357
+ d <- d[sel.idx]
323
358
  } else {
324
- ids <- with(sim, unique(c(a, b)))
359
+ ids <- unique(c(a, b))
325
360
  }
326
361
 
327
- # Transform to distances
362
+ # Transform to dist object
328
363
  say("Distances")
329
- sim$d <- 1 - (sim$value / 100)
330
- return(as.dist(with(sim, {
331
- out <- matrix(
332
- min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
333
- dimnames = list(ids, ids)
334
- )
335
- out[cbind(ids, ids)] <- 0
336
- out[cbind(a, b)] <- d
337
- out[cbind(b, a)] <- d
338
- out
339
- })))
364
+ out <- matrix(
365
+ min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
366
+ dimnames = list(ids, ids)
367
+ )
368
+ diag(out) <- 0
369
+ # Split task to reduce peak RAM and support very large matrices
370
+ # - Note that `k` is subsetting by index, but it's defined as numeric
371
+ # instead of integer. The reason is that integer overflow occurs
372
+ # at just over 2e9, whereas numerics can represent much larger
373
+ # numbers without problems
374
+ i <- 0
375
+ while (i < length(a)) {
376
+ k <- seq(i + 1, min(i + 1e8, length(a)))
377
+ out[cbind(a[k], b[k])] <- d[k]
378
+ out[cbind(b[k], a[k])] <- d[k]
379
+ i <- i + 1e8
380
+ }
381
+ return(as.dist(out))
340
382
  }
341
383
 
342
384
  #= Main
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.7.3
4
+ version: 1.3.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-29 00:00:00.000000000 Z
11
+ date: 2023-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons