miga-base 1.3.4.2 → 1.3.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/get.rb +6 -1
- data/lib/miga/dataset/result/ignore.rb +33 -4
- data/lib/miga/result/stats.rb +9 -5
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +8 -0
- data/utils/subclades.R +48 -26
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d17244b326441f224e4626b53702018fd0f3915d44e16e9be939a70dd86ceefa
|
4
|
+
data.tar.gz: 8f6c7544ab57957dbfeb53230e56ba9e7eda37c7bab2fea538ee17d9f54fd9f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd77962a43caa04c72d01ddbe251fb02650f0f003c55b2b13a44ac1affb77b52535e5dd5fd7713c387f5a7bafdd8839255f2cd8d82be8d4c025b562a7beabfbd
|
7
|
+
data.tar.gz: b0cd50e25aa16ce2c64202daa5bbbb256e3f83742513ac984715f1b7c96f7297887856a2853ac6ec49dd43fe63b5c37cd96e83f8047ebcd32fdbc47addf892f7
|
data/lib/miga/cli/action/get.rb
CHANGED
@@ -120,7 +120,12 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
|
|
120
120
|
def create_remote_dataset(sub_cli, p)
|
121
121
|
sub_cli.ensure_par(dataset: '-D', ids: '-I')
|
122
122
|
unless sub_cli[:api_key].nil?
|
123
|
-
|
123
|
+
if sub_cli[:universe] == :web && sub_cli[:db] == :assembly_gz
|
124
|
+
ENV['NCBI_API_KEY'] = sub_cli[:api_key]
|
125
|
+
end
|
126
|
+
|
127
|
+
var_space = sub_cli[:universe].to_s.upcase
|
128
|
+
ENV["#{var_space}_API_KEY"] = sub_cli[:api_key]
|
124
129
|
end
|
125
130
|
|
126
131
|
sub_cli.say "Dataset: #{sub_cli[:dataset]}"
|
@@ -76,18 +76,47 @@ module MiGA::Dataset::Result::Ignore
|
|
76
76
|
##
|
77
77
|
# Ignore +task+ because it's not a reference dataset
|
78
78
|
def ignore_noref?(task)
|
79
|
-
|
79
|
+
ignore_by_type?(task, :noref)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
83
83
|
# Ignore +task+ because it's not a multi dataset
|
84
84
|
def ignore_multi?(task)
|
85
|
-
|
85
|
+
ignore_by_type?(task, :multi)
|
86
86
|
end
|
87
87
|
|
88
88
|
##
|
89
89
|
# Ignore +task+ because it's not a nonmulti dataset
|
90
90
|
def ignore_nonmulti?(task)
|
91
|
-
|
91
|
+
ignore_by_type?(task, :nonmulti)
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
|
+
##
|
95
|
+
# Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or
|
96
|
+
# +:nonmulti+
|
97
|
+
def ignore_by_type?(task, type)
|
98
|
+
return false if force_task?(task)
|
99
|
+
|
100
|
+
test, list =
|
101
|
+
case type.to_sym
|
102
|
+
when :noref
|
103
|
+
[:ref?, self.class.EXCLUDE_NOREF_TASKS]
|
104
|
+
when :multi
|
105
|
+
[:multi?, self.class.ONLY_MULTI_TASKS]
|
106
|
+
when :nonmulti
|
107
|
+
[:nonmulti?, self.class.ONLY_NONMULTI_TASKS]
|
108
|
+
else
|
109
|
+
raise "Unexpected error, unknown type reason: #{type}"
|
110
|
+
end
|
111
|
+
|
112
|
+
list.include?(task) && !send(test)
|
113
|
+
end
|
114
|
+
|
115
|
+
##
|
116
|
+
# Force the +task+ to be executed even if it should otherwise be
|
117
|
+
# ignored due to reasons: +:noref+, +:multi+, or +:nonmulti+. Other
|
118
|
+
# reasons to ignore a task are not affected by metadata forcing
|
119
|
+
def force_task?(task)
|
120
|
+
!!metadata["run_#{task}"]
|
121
|
+
end
|
122
|
+
end
|
data/lib/miga/result/stats.rb
CHANGED
@@ -141,12 +141,16 @@ module MiGA::Result::Stats
|
|
141
141
|
# Determine qualitative range
|
142
142
|
stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
|
143
143
|
source.metadata[:quality] =
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
if stats[:completeness][0] >= 90 && stats[:contamination][0] <= 5
|
145
|
+
:excellent # Finished or High-quality draft*
|
146
|
+
elsif stats[:completeness][0] >= 50 && stats[:contamination][0] <= 10
|
147
|
+
:high # Medium-quality draft*
|
148
|
+
elsif stats[:quality] >= 25
|
149
|
+
:intermediate # Low-quality draft* but sufficient for classification
|
150
|
+
else
|
151
|
+
:low # Low-quality draft* and insufficient for classification
|
149
152
|
end
|
153
|
+
# * Bowers et al 2017, DOI: 10.1038/nbt.3893
|
150
154
|
source.save
|
151
155
|
|
152
156
|
# Inactivate low-quality datasets
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 4, 2].freeze
|
15
|
+
VERSION = [1.3, 5, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2023, 4,
|
23
|
+
VERSION_DATE = Date.new(2023, 4, 21)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -18,6 +18,14 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
18
18
|
for i in $DS ; do
|
19
19
|
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
20
20
|
done
|
21
|
+
# The following block pipes retrieved data from all databases, reorganizes the
|
22
|
+
# names in canonical order, and removes repeats from the first two columns,
|
23
|
+
# in order to keep only one result per pair. This is not being included into
|
24
|
+
# production, but the code may be useful for extremely large databases.
|
25
|
+
# | tee \
|
26
|
+
# | awk -F"\t" \
|
27
|
+
# 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
|
28
|
+
# | sort -k 1,2 -u
|
21
29
|
) | gzip -9c > miga-project.txt.gz
|
22
30
|
|
23
31
|
# R-ify
|
data/utils/subclades.R
CHANGED
@@ -47,7 +47,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
47
47
|
say("Loading")
|
48
48
|
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
49
49
|
sep = " ", as.is = TRUE)[,1]
|
50
|
-
a <- read.table(paste(out_base, "classif", sep="."),
|
50
|
+
a <- read.table(paste(out_base, "classif", sep = "."),
|
51
51
|
sep = "\t", as.is = TRUE)
|
52
52
|
ani.types <- a[,2]
|
53
53
|
names(ani.types) <- a[,1]
|
@@ -70,17 +70,17 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
70
70
|
say("Recursive search")
|
71
71
|
for (i in 1:length(ani.medoids)) {
|
72
72
|
medoid <- ani.medoids[i]
|
73
|
-
ds_f <- names(ani.types)[
|
73
|
+
ds_f <- names(ani.types)[ani.types == i]
|
74
74
|
say("Analyzing subclade", i, "with medoid:", medoid)
|
75
|
-
dir_f <- paste(out_base, ".sc-", i, sep="")
|
75
|
+
dir_f <- paste(out_base, ".sc-", i, sep = "")
|
76
76
|
if (!dir.exists(dir_f)) dir.create(dir_f)
|
77
77
|
write.table(ds_f,
|
78
|
-
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
79
|
-
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
78
|
+
paste(out_base, ".sc-", i, "/miga-project.all", sep = ""),
|
79
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE)
|
80
80
|
if (length(ds_f) > 8L) {
|
81
81
|
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
82
82
|
subclades(
|
83
|
-
out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
83
|
+
out_base = paste(out_base, ".sc-", i, "/miga-project", sep = ""),
|
84
84
|
thr = thr,
|
85
85
|
ani.d = ani_subset
|
86
86
|
)
|
@@ -111,7 +111,7 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
111
111
|
# Silhouette
|
112
112
|
say("Silhouette")
|
113
113
|
nn <- length(labels(ani.d))
|
114
|
-
k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
|
114
|
+
k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
|
115
115
|
say("- Make cluster")
|
116
116
|
cl <- makeCluster(thr)
|
117
117
|
say("- Launch parallel jobs")
|
@@ -119,8 +119,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
119
119
|
cl, k,
|
120
120
|
function(x) {
|
121
121
|
library(cluster)
|
122
|
-
s <- pam(ani.d, x, do.swap = FALSE,
|
123
|
-
c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
|
122
|
+
s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
|
123
|
+
c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
|
124
124
|
}
|
125
125
|
)
|
126
126
|
say("- Stop cluster")
|
@@ -135,29 +135,38 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
135
135
|
|
136
136
|
# Classify genomes
|
137
137
|
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
138
|
-
|
138
|
+
is.huge <- length(labels(ani.d)) > 4e4
|
139
|
+
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
|
139
140
|
ani.types <- ani.cl$clustering
|
140
141
|
ani.medoids <- ani.cl$medoids
|
141
142
|
|
142
143
|
# Build tree
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
if (is.huge) {
|
145
|
+
say("Bypassing tree for large set")
|
146
|
+
write.table(
|
147
|
+
'{}', file = paste(out_base, ".nwk", sep = ""),
|
148
|
+
col.names = FALSE, row.names = FALSE, quote = FALSE
|
149
|
+
)
|
150
|
+
} else {
|
151
|
+
say("Tree")
|
152
|
+
ani.ph <- bionj(ani.d)
|
153
|
+
say("- Write")
|
154
|
+
express.ori <- options("expressions")$expressions
|
155
|
+
if(express.ori < ani.ph$Nnode * 4){
|
156
|
+
options(expressions = min(c(5e7, ani.ph$Nnode * 4)))
|
157
|
+
}
|
158
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
159
|
+
options(expressions = express.ori)
|
149
160
|
}
|
150
|
-
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
151
|
-
options(expressions=express.ori)
|
152
161
|
|
153
162
|
# Generate graphic report
|
154
163
|
say("Graphic report")
|
155
164
|
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
156
165
|
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
157
166
|
plot_distances(ani.d)
|
158
|
-
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
167
|
+
plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
|
159
168
|
plot_clustering(ani.cl, ani.d, ani.types)
|
160
|
-
plot_tree(ani.ph, ani.types, ani.medoids)
|
169
|
+
if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
|
161
170
|
dev.off()
|
162
171
|
|
163
172
|
# Save results
|
@@ -198,7 +207,7 @@ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
|
|
198
207
|
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
199
208
|
}
|
200
209
|
write.table(
|
201
|
-
classif, paste(out_base, "classif", sep="."),
|
210
|
+
classif, paste(out_base, "classif", sep = "."),
|
202
211
|
quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
|
203
212
|
)
|
204
213
|
}
|
@@ -249,7 +258,8 @@ plot_clustering <- function (cl, dist, types) {
|
|
249
258
|
top.n <- length(cl$medoids)
|
250
259
|
col <- ggplotColours(top.n)
|
251
260
|
plot(silhouette(cl), col = col)
|
252
|
-
|
261
|
+
dist.n <- length(labels(dist))
|
262
|
+
if (dist.n <= 15 | dist.n > 4e4) {
|
253
263
|
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
254
264
|
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
255
265
|
} else {
|
@@ -261,7 +271,7 @@ plot_clustering <- function (cl, dist, types) {
|
|
261
271
|
)
|
262
272
|
plot(
|
263
273
|
ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
|
264
|
-
xlab = "Component 3", ylab="Component 4"
|
274
|
+
xlab = "Component 3", ylab = "Component 4"
|
265
275
|
)
|
266
276
|
}else{
|
267
277
|
for (i in 1:2)
|
@@ -305,16 +315,28 @@ ani_distance <- function (ani_file, sel) {
|
|
305
315
|
if (nrow(sim) == 0) return(NULL)
|
306
316
|
|
307
317
|
# Apply filter (if requested)
|
318
|
+
ids <- NULL
|
308
319
|
if (!is.na(sel) && file.exists(sel)) {
|
309
320
|
say("Filter selection")
|
310
|
-
|
311
|
-
sim <- sim[sim$a %in%
|
321
|
+
ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
322
|
+
sim <- sim[sim$a %in% ids & sim$b %in% ids, ]
|
323
|
+
} else {
|
324
|
+
ids <- with(sim, unique(c(a, b)))
|
312
325
|
}
|
313
326
|
|
314
327
|
# Transform to distances
|
315
328
|
say("Distances")
|
316
329
|
sim$d <- 1 - (sim$value / 100)
|
317
|
-
return(
|
330
|
+
return(as.dist(with(sim, {
|
331
|
+
out <- matrix(
|
332
|
+
max(d) * 1.2, nrow = length(ids), ncol = length(ids),
|
333
|
+
dimnames = list(ids, ids)
|
334
|
+
)
|
335
|
+
out[cbind(ids, ids)] <- 0
|
336
|
+
out[cbind(a, b)] <- d
|
337
|
+
out[cbind(b, a)] <- d
|
338
|
+
out
|
339
|
+
})))
|
318
340
|
}
|
319
341
|
|
320
342
|
#= Main
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.4.2
|
4
|
+
version: 1.3.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-04-
|
11
|
+
date: 2023-04-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|