miga-base 1.3.7.3 → 1.3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +2 -2
- data/utils/subclades.R +65 -23
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26e4d6df1a418582af445818b9965eaee00bb76282e5acd9fc1480d6b9d3e57b
|
4
|
+
data.tar.gz: 16c6f25a55191ca185fd4f5718e721780e718555525301ce98613290f7f38cc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c38ef4522680e357f95c239de8529bd1c7f1be307243000e38dabfa592d99226ca5e228b56efa8918ad3ab55de75858f8fdec3ecaeef5d6471ec37af1a0d2fdc
|
7
|
+
data.tar.gz: 3ab0c91d807c27ceaeb412dffbdda2da29d8c558772988b5d8af6063ef9dee206329ff7caf9e5c5795b5ee5b0f019b8b8a0906eb3bb6daca4611b394b5b8df6d
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 7, 3].freeze
|
15
|
+
VERSION = [1.3, 8, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2023,
|
23
|
+
VERSION_DATE = Date.new(2023, 7, 5)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/utils/subclades.R
CHANGED
@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
97
97
|
#= Heavy-lifter
|
98
98
|
subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
99
99
|
# Get ANI distances
|
100
|
-
if (length(ani.d) >
|
100
|
+
if (length(ani.d) > 0L) {
|
101
101
|
# Just use ani.d (and save in dist_rds)
|
102
102
|
if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
|
103
103
|
} else if (file.exists(dist_rds)) {
|
@@ -107,10 +107,20 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
107
107
|
stop("Cannot find input matrix", out_base)
|
108
108
|
}
|
109
109
|
if (length(labels(ani.d)) <= 8L) return(list())
|
110
|
-
|
110
|
+
|
111
|
+
# Subsample huge collections
|
112
|
+
nMax <- 65536L
|
113
|
+
nn <- length(labels(ani.d))
|
114
|
+
is.huge <- nn > nMax
|
115
|
+
if (is.huge) {
|
116
|
+
say("Subsampling large collection")
|
117
|
+
ids <- sample(labels(ani.d), nMax)
|
118
|
+
ani.d.ori <- ani.d
|
119
|
+
ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
|
120
|
+
}
|
121
|
+
|
111
122
|
# Silhouette
|
112
123
|
say("Silhouette")
|
113
|
-
nn <- length(labels(ani.d))
|
114
124
|
k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
|
115
125
|
say("- Make cluster")
|
116
126
|
cl <- makeCluster(thr)
|
@@ -135,13 +145,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
135
145
|
|
136
146
|
# Classify genomes
|
137
147
|
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
138
|
-
is.
|
139
|
-
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.
|
148
|
+
is.large <- nn > 3e4
|
149
|
+
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
|
140
150
|
ani.types <- ani.cl$clustering
|
141
151
|
ani.medoids <- ani.cl$medoids
|
142
152
|
|
143
|
-
#
|
153
|
+
# Classify excluded genomes (for huge collections)
|
144
154
|
if (is.huge) {
|
155
|
+
say("Classifying excluded genomes")
|
156
|
+
ani.d <- ani.d.ori
|
157
|
+
# Find closest medoid for missing genomes
|
158
|
+
missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
|
159
|
+
for (i in missing)
|
160
|
+
ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
|
161
|
+
# Reorder
|
162
|
+
ani.types <- ani.types[labels(ani.d)]
|
163
|
+
# Save missing genomes for inspection
|
164
|
+
write.table(
|
165
|
+
missing, paste0(out_base, ".missing.txt"),
|
166
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE
|
167
|
+
)
|
168
|
+
}
|
169
|
+
|
170
|
+
# Build tree
|
171
|
+
if (is.large) {
|
145
172
|
say("Bypassing tree for large set")
|
146
173
|
write.table(
|
147
174
|
'{}', file = paste(out_base, ".nwk", sep = ""),
|
@@ -165,8 +192,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
165
192
|
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
166
193
|
plot_distances(ani.d)
|
167
194
|
plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
|
168
|
-
plot_clustering(ani.cl, ani.d, ani.types)
|
169
|
-
if (!is.
|
195
|
+
if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
|
196
|
+
if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
|
170
197
|
dev.off()
|
171
198
|
|
172
199
|
# Save results
|
@@ -310,33 +337,48 @@ ani_distance <- function (ani_file, sel) {
|
|
310
337
|
} else {
|
311
338
|
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
|
312
339
|
}
|
340
|
+
|
341
|
+
# Extract individual variables to deal with very large matrices
|
342
|
+
a <- sim$a
|
343
|
+
b <- sim$b
|
344
|
+
d <- 1 - (sim$value / 100)
|
313
345
|
|
314
346
|
# If there is no data, end process
|
315
|
-
if (
|
347
|
+
if (length(a) == 0) return(NULL)
|
316
348
|
|
317
349
|
# Apply filter (if requested)
|
318
350
|
ids <- NULL
|
319
351
|
if (!is.na(sel) && file.exists(sel)) {
|
320
352
|
say("Filter selection")
|
321
353
|
ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
322
|
-
|
354
|
+
sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
|
355
|
+
a <- a[sel.idx]
|
356
|
+
b <- b[sel.idx]
|
357
|
+
d <- d[sel.idx]
|
323
358
|
} else {
|
324
|
-
ids <-
|
359
|
+
ids <- unique(c(a, b))
|
325
360
|
}
|
326
361
|
|
327
|
-
# Transform to
|
362
|
+
# Transform to dist object
|
328
363
|
say("Distances")
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
364
|
+
out <- matrix(
|
365
|
+
min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
|
366
|
+
dimnames = list(ids, ids)
|
367
|
+
)
|
368
|
+
diag(out) <- 0
|
369
|
+
# Split task to reduce peak RAM and support very large matrices
|
370
|
+
# - Note that `k` is subsetting by index, but it's defined as numeric
|
371
|
+
# instead of integer. The reason is that integer overflow occurs
|
372
|
+
# at just over 2e9, whereas numerics can represent much larger
|
373
|
+
# numbers without problems
|
374
|
+
i <- 0
|
375
|
+
while (i < length(a)) {
|
376
|
+
k <- seq(i + 1, min(i + 1e8, length(a)))
|
377
|
+
out[cbind(a[k], b[k])] <- d[k]
|
378
|
+
out[cbind(b[k], a[k])] <- d[k]
|
379
|
+
i <- i + 1e8
|
380
|
+
}
|
381
|
+
return(as.dist(out))
|
340
382
|
}
|
341
383
|
|
342
384
|
#= Main
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.7.3
|
4
|
+
version: 1.3.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|