miga-base 1.3.7.3 → 1.3.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/version.rb +2 -2
- data/utils/subclades.R +68 -25
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 415756b71662dd191b51679aa75a4a4c75bb5ec00d6aacfa7fa6436cdd983819
|
|
4
|
+
data.tar.gz: a4f441f9d6ff7bb06f2a5fe5534c1d52cea5ead3b82737d3799cb65f46bf81b0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f44c2506cb04763e21a9ffa81060214ca8f9d3772107b4a55820320838e08de20eac3e2d27805b62a7399b9aead3fd54eeb306a522f8886c3323bfc2149116b4
|
|
7
|
+
data.tar.gz: '00439963a16f86a3b8539fe0e212b63b7b62ac3eaedc8adebb88f8baae0a688a41a9518e6bbc69fc8db0b58e5551edf44b4c1764e64564aca372f358a6853d78'
|
data/lib/miga/version.rb
CHANGED
|
@@ -12,7 +12,7 @@ module MiGA
|
|
|
12
12
|
# - String indicating release status:
|
|
13
13
|
# - rc* release candidate, not released as gem
|
|
14
14
|
# - [0-9]+ stable release, released as gem
|
|
15
|
-
VERSION = [1.3,
|
|
15
|
+
VERSION = [1.3, 8, 1].freeze
|
|
16
16
|
|
|
17
17
|
##
|
|
18
18
|
# Nickname for the current major.minor version.
|
|
@@ -20,7 +20,7 @@ module MiGA
|
|
|
20
20
|
|
|
21
21
|
##
|
|
22
22
|
# Date of the current gem release.
|
|
23
|
-
VERSION_DATE = Date.new(2023,
|
|
23
|
+
VERSION_DATE = Date.new(2023, 7, 5)
|
|
24
24
|
|
|
25
25
|
##
|
|
26
26
|
# References of MiGA
|
data/utils/subclades.R
CHANGED
|
@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
|
97
97
|
#= Heavy-lifter
|
|
98
98
|
subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
99
99
|
# Get ANI distances
|
|
100
|
-
if (length(ani.d) >
|
|
100
|
+
if (length(ani.d) > 0L) {
|
|
101
101
|
# Just use ani.d (and save in dist_rds)
|
|
102
102
|
if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
|
|
103
103
|
} else if (file.exists(dist_rds)) {
|
|
@@ -107,21 +107,32 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
|
107
107
|
stop("Cannot find input matrix", out_base)
|
|
108
108
|
}
|
|
109
109
|
if (length(labels(ani.d)) <= 8L) return(list())
|
|
110
|
-
|
|
110
|
+
|
|
111
|
+
# Subsample huge collections
|
|
112
|
+
nMax <- 65536L
|
|
113
|
+
nn <- length(labels(ani.d))
|
|
114
|
+
is.huge <- nn > nMax
|
|
115
|
+
if (is.huge) {
|
|
116
|
+
say("Subsampling large collection")
|
|
117
|
+
ids <- sample(labels(ani.d), nMax)
|
|
118
|
+
ani.d.ori <- ani.d
|
|
119
|
+
ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
|
|
120
|
+
}
|
|
121
|
+
|
|
111
122
|
# Silhouette
|
|
112
123
|
say("Silhouette")
|
|
113
|
-
nn <- length(labels(ani.d))
|
|
114
124
|
k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
|
|
115
125
|
say("- Make cluster")
|
|
116
126
|
cl <- makeCluster(thr)
|
|
117
127
|
say("- Launch parallel jobs")
|
|
118
128
|
s <- parSapply(
|
|
119
129
|
cl, k,
|
|
120
|
-
function(x) {
|
|
130
|
+
function(x, ani.d) {
|
|
121
131
|
library(cluster)
|
|
122
132
|
s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
|
|
123
133
|
c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
|
|
124
|
-
}
|
|
134
|
+
},
|
|
135
|
+
ani.d = ani.d
|
|
125
136
|
)
|
|
126
137
|
say("- Stop cluster")
|
|
127
138
|
stopCluster(cl)
|
|
@@ -135,13 +146,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
|
135
146
|
|
|
136
147
|
# Classify genomes
|
|
137
148
|
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
|
138
|
-
is.
|
|
139
|
-
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.
|
|
149
|
+
is.large <- nn > 3e4
|
|
150
|
+
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
|
|
140
151
|
ani.types <- ani.cl$clustering
|
|
141
152
|
ani.medoids <- ani.cl$medoids
|
|
142
153
|
|
|
143
|
-
#
|
|
154
|
+
# Classify excluded genomes (for huge collections)
|
|
144
155
|
if (is.huge) {
|
|
156
|
+
say("Classifying excluded genomes")
|
|
157
|
+
ani.d <- ani.d.ori
|
|
158
|
+
# Find closest medoid for missing genomes
|
|
159
|
+
missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
|
|
160
|
+
for (i in missing)
|
|
161
|
+
ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
|
|
162
|
+
# Reorder
|
|
163
|
+
ani.types <- ani.types[labels(ani.d)]
|
|
164
|
+
# Save missing genomes for inspection
|
|
165
|
+
write.table(
|
|
166
|
+
missing, paste0(out_base, ".missing.txt"),
|
|
167
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE
|
|
168
|
+
)
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
# Build tree
|
|
172
|
+
if (is.large) {
|
|
145
173
|
say("Bypassing tree for large set")
|
|
146
174
|
write.table(
|
|
147
175
|
'{}', file = paste(out_base, ".nwk", sep = ""),
|
|
@@ -165,8 +193,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
|
165
193
|
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
|
166
194
|
plot_distances(ani.d)
|
|
167
195
|
plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
|
|
168
|
-
plot_clustering(ani.cl, ani.d, ani.types)
|
|
169
|
-
if (!is.
|
|
196
|
+
if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
|
|
197
|
+
if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
|
|
170
198
|
dev.off()
|
|
171
199
|
|
|
172
200
|
# Save results
|
|
@@ -310,33 +338,48 @@ ani_distance <- function (ani_file, sel) {
|
|
|
310
338
|
} else {
|
|
311
339
|
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
|
|
312
340
|
}
|
|
341
|
+
|
|
342
|
+
# Extract individual variables to deal with very large matrices
|
|
343
|
+
a <- sim$a
|
|
344
|
+
b <- sim$b
|
|
345
|
+
d <- 1 - (sim$value / 100)
|
|
313
346
|
|
|
314
347
|
# If there is no data, end process
|
|
315
|
-
if (
|
|
348
|
+
if (length(a) == 0) return(NULL)
|
|
316
349
|
|
|
317
350
|
# Apply filter (if requested)
|
|
318
351
|
ids <- NULL
|
|
319
352
|
if (!is.na(sel) && file.exists(sel)) {
|
|
320
353
|
say("Filter selection")
|
|
321
354
|
ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
|
322
|
-
|
|
355
|
+
sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
|
|
356
|
+
a <- a[sel.idx]
|
|
357
|
+
b <- b[sel.idx]
|
|
358
|
+
d <- d[sel.idx]
|
|
323
359
|
} else {
|
|
324
|
-
ids <-
|
|
360
|
+
ids <- unique(c(a, b))
|
|
325
361
|
}
|
|
326
362
|
|
|
327
|
-
# Transform to
|
|
363
|
+
# Transform to dist object
|
|
328
364
|
say("Distances")
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
365
|
+
out <- matrix(
|
|
366
|
+
min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
|
|
367
|
+
dimnames = list(ids, ids)
|
|
368
|
+
)
|
|
369
|
+
diag(out) <- 0
|
|
370
|
+
# Split task to reduce peak RAM and support very large matrices
|
|
371
|
+
# - Note that `k` is subsetting by index, but it's defined as numeric
|
|
372
|
+
# instead of integer. The reason is that integer overflow occurs
|
|
373
|
+
# at just over 2e9, whereas numerics can represent much larger
|
|
374
|
+
# numbers without problems
|
|
375
|
+
i <- 0
|
|
376
|
+
while (i < length(a)) {
|
|
377
|
+
k <- seq(i + 1, min(i + 1e8, length(a)))
|
|
378
|
+
out[cbind(a[k], b[k])] <- d[k]
|
|
379
|
+
out[cbind(b[k], a[k])] <- d[k]
|
|
380
|
+
i <- i + 1e8
|
|
381
|
+
}
|
|
382
|
+
return(as.dist(out))
|
|
340
383
|
}
|
|
341
384
|
|
|
342
385
|
#= Main
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: miga-base
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.3.
|
|
4
|
+
version: 1.3.8.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Luis M. Rodriguez-R
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-
|
|
11
|
+
date: 2023-07-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: daemons
|