miga-base 1.3.7.2 → 1.3.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/rm.rb +1 -1
- data/lib/miga/version.rb +2 -2
- data/utils/subclades.R +66 -24
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26e4d6df1a418582af445818b9965eaee00bb76282e5acd9fc1480d6b9d3e57b
|
4
|
+
data.tar.gz: 16c6f25a55191ca185fd4f5718e721780e718555525301ce98613290f7f38cc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c38ef4522680e357f95c239de8529bd1c7f1be307243000e38dabfa592d99226ca5e228b56efa8918ad3ab55de75858f8fdec3ecaeef5d6471ec37af1a0d2fdc
|
7
|
+
data.tar.gz: 3ab0c91d807c27ceaeb412dffbdda2da29d8c558772988b5d8af6063ef9dee206329ff7caf9e5c5795b5ee5b0f019b8b8a0906eb3bb6daca4611b394b5b8df6d
|
data/lib/miga/cli/action/rm.rb
CHANGED
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 7, 2].freeze
|
15
|
+
VERSION = [1.3, 8, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2023,
|
23
|
+
VERSION_DATE = Date.new(2023, 7, 5)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/utils/subclades.R
CHANGED
@@ -97,7 +97,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
97
97
|
#= Heavy-lifter
|
98
98
|
subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
99
99
|
# Get ANI distances
|
100
|
-
if (length(ani.d) >
|
100
|
+
if (length(ani.d) > 0L) {
|
101
101
|
# Just use ani.d (and save in dist_rds)
|
102
102
|
if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
|
103
103
|
} else if (file.exists(dist_rds)) {
|
@@ -107,10 +107,20 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
107
107
|
stop("Cannot find input matrix", out_base)
|
108
108
|
}
|
109
109
|
if (length(labels(ani.d)) <= 8L) return(list())
|
110
|
-
|
110
|
+
|
111
|
+
# Subsample huge collections
|
112
|
+
nMax <- 65536L
|
113
|
+
nn <- length(labels(ani.d))
|
114
|
+
is.huge <- nn > nMax
|
115
|
+
if (is.huge) {
|
116
|
+
say("Subsampling large collection")
|
117
|
+
ids <- sample(labels(ani.d), nMax)
|
118
|
+
ani.d.ori <- ani.d
|
119
|
+
ani.d <- as.dist(as.matrix(ani.d)[ids, ids])
|
120
|
+
}
|
121
|
+
|
111
122
|
# Silhouette
|
112
123
|
say("Silhouette")
|
113
|
-
nn <- length(labels(ani.d))
|
114
124
|
k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
|
115
125
|
say("- Make cluster")
|
116
126
|
cl <- makeCluster(thr)
|
@@ -135,13 +145,30 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
135
145
|
|
136
146
|
# Classify genomes
|
137
147
|
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
138
|
-
is.
|
139
|
-
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.
|
148
|
+
is.large <- nn > 3e4
|
149
|
+
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.large)
|
140
150
|
ani.types <- ani.cl$clustering
|
141
151
|
ani.medoids <- ani.cl$medoids
|
142
152
|
|
143
|
-
#
|
153
|
+
# Classify excluded genome (for huge collections)
|
144
154
|
if (is.huge) {
|
155
|
+
say("Classifying excluded genomes")
|
156
|
+
ani.d <- ani.d.ori
|
157
|
+
# Find closest medoid for missing genomes
|
158
|
+
missing <- labels(ani.d)[!labels(ani.d) %in% names(ani.types)]
|
159
|
+
for (i in missing)
|
160
|
+
ani.types[i] <- which.min(as.matrix(ani.d)[ani.medoids, i])
|
161
|
+
# Reorder
|
162
|
+
ani.types <- ani.types[labels(ani.d)]
|
163
|
+
# Save missing genomes for inspection
|
164
|
+
write.table(
|
165
|
+
missing, paste0(out_base, ".missing.txt"),
|
166
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE
|
167
|
+
)
|
168
|
+
}
|
169
|
+
|
170
|
+
# Build tree
|
171
|
+
if (is.large) {
|
145
172
|
say("Bypassing tree for large set")
|
146
173
|
write.table(
|
147
174
|
'{}', file = paste(out_base, ".nwk", sep = ""),
|
@@ -165,8 +192,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
165
192
|
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
166
193
|
plot_distances(ani.d)
|
167
194
|
plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
|
168
|
-
plot_clustering(ani.cl, ani.d, ani.types)
|
169
|
-
if (!is.
|
195
|
+
if (!is.huge) plot_clustering(ani.cl, ani.d, ani.types)
|
196
|
+
if (!is.large) plot_tree(ani.ph, ani.types, ani.medoids)
|
170
197
|
dev.off()
|
171
198
|
|
172
199
|
# Save results
|
@@ -310,33 +337,48 @@ ani_distance <- function (ani_file, sel) {
|
|
310
337
|
} else {
|
311
338
|
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
|
312
339
|
}
|
340
|
+
|
341
|
+
# Extract individual variables to deal with very large matrices
|
342
|
+
a <- sim$a
|
343
|
+
b <- sim$b
|
344
|
+
d <- 1 - (sim$value / 100)
|
313
345
|
|
314
|
-
# If there is
|
315
|
-
if (
|
346
|
+
# If there is no data, end process
|
347
|
+
if (length(a) == 0) return(NULL)
|
316
348
|
|
317
349
|
# Apply filter (if requested)
|
318
350
|
ids <- NULL
|
319
351
|
if (!is.na(sel) && file.exists(sel)) {
|
320
352
|
say("Filter selection")
|
321
353
|
ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
322
|
-
|
354
|
+
sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
|
355
|
+
a <- a[sel.idx]
|
356
|
+
b <- b[sel.idx]
|
357
|
+
d <- d[sel.idx]
|
323
358
|
} else {
|
324
|
-
ids <-
|
359
|
+
ids <- unique(c(a, b))
|
325
360
|
}
|
326
361
|
|
327
|
-
# Transform to
|
362
|
+
# Transform to dist object
|
328
363
|
say("Distances")
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
364
|
+
out <- matrix(
|
365
|
+
min(max(d) * 1.2, 1.0), nrow = length(ids), ncol = length(ids),
|
366
|
+
dimnames = list(ids, ids)
|
367
|
+
)
|
368
|
+
diag(out) <- 0
|
369
|
+
# Split task to reduce peak RAM and support very large matrices
|
370
|
+
# - Note that `k` is subsetting by index, but it's defined as numeric
|
371
|
+
# instead of integer. The reason is that integer overflow occurs
|
372
|
+
# at just over 2e9, whereas numerics can represent much larger
|
373
|
+
# numbers without problems
|
374
|
+
i <- 0
|
375
|
+
while (i < length(a)) {
|
376
|
+
k <- seq(i + 1, min(i + 1e8, length(a)))
|
377
|
+
out[cbind(a[k], b[k])] <- d[k]
|
378
|
+
out[cbind(b[k], a[k])] <- d[k]
|
379
|
+
i <- i + 1e8
|
380
|
+
}
|
381
|
+
return(as.dist(out))
|
340
382
|
}
|
341
383
|
|
342
384
|
#= Main
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.7.2
|
4
|
+
version: 1.3.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|