miga-base 1.3.4.2 → 1.3.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/get.rb +6 -1
- data/lib/miga/dataset/result/ignore.rb +33 -4
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +8 -0
- data/utils/subclades.R +48 -26
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '04760780c4ec69edaeb55949c2f15933c7d9ecc1c620bbeca28a01679f957db3'
|
4
|
+
data.tar.gz: 885e191e8c77a7e117d1e22875f6c90615f7cc001d0215930fd690022a88fb0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2b47739ea450c9217119ad61317559e7429f87b04531ad624555538885207ffbb054715d18f60757a41f5c7816cb3cdc06b4b81a378b509ac3c1518452ba02ec
|
7
|
+
data.tar.gz: a3c08caf1d98ea5de2c7b137004c126a5b1541403cacd8ab88d0fa3da8a0617fe380448370f3e038bd70d3a4342b20d6476a94a5062c6aec9615838f694c5e6a
|
data/lib/miga/cli/action/get.rb
CHANGED
@@ -120,7 +120,12 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
|
|
120
120
|
def create_remote_dataset(sub_cli, p)
|
121
121
|
sub_cli.ensure_par(dataset: '-D', ids: '-I')
|
122
122
|
unless sub_cli[:api_key].nil?
|
123
|
-
|
123
|
+
if sub_cli[:universe] == :web && sub_cli[:db] == :assembly_gz
|
124
|
+
ENV['NCBI_API_KEY'] = sub_cli[:api_key]
|
125
|
+
end
|
126
|
+
|
127
|
+
var_space = sub_cli[:universe].to_s.upcase
|
128
|
+
ENV["#{var_space}_API_KEY"] = sub_cli[:api_key]
|
124
129
|
end
|
125
130
|
|
126
131
|
sub_cli.say "Dataset: #{sub_cli[:dataset]}"
|
@@ -76,18 +76,47 @@ module MiGA::Dataset::Result::Ignore
|
|
76
76
|
##
|
77
77
|
# Ignore +task+ because it's not a reference dataset
|
78
78
|
def ignore_noref?(task)
|
79
|
-
|
79
|
+
ignore_by_type?(task, :noref)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
83
83
|
# Ignore +task+ because it's not a multi dataset
|
84
84
|
def ignore_multi?(task)
|
85
|
-
|
85
|
+
ignore_by_type?(task, :multi)
|
86
86
|
end
|
87
87
|
|
88
88
|
##
|
89
89
|
# Ignore +task+ because it's not a nonmulti dataset
|
90
90
|
def ignore_nonmulti?(task)
|
91
|
-
|
91
|
+
ignore_by_type?(task, :nonmulti)
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
|
+
##
|
95
|
+
# Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or
|
96
|
+
# +:nonmulti+
|
97
|
+
def ignore_by_type?(task, type)
|
98
|
+
return false if force_task?(task)
|
99
|
+
|
100
|
+
test, list =
|
101
|
+
case type.to_sym
|
102
|
+
when :noref
|
103
|
+
[:ref?, self.class.EXCLUDE_NOREF_TASKS]
|
104
|
+
when :multi
|
105
|
+
[:multi?, self.class.ONLY_MULTI_TASKS]
|
106
|
+
when :nonmulti
|
107
|
+
[:nonmulti?, self.class.ONLY_NONMULTI_TASKS]
|
108
|
+
else
|
109
|
+
raise "Unexpected error, unknown type reason: #{type}"
|
110
|
+
end
|
111
|
+
|
112
|
+
list.include?(task) && !send(test)
|
113
|
+
end
|
114
|
+
|
115
|
+
##
|
116
|
+
# Force the +task+ to be executed even if it should otherwise be
|
117
|
+
# ignored due to reasons: +:noref+, +:multi+, or +:nonmulti+. Other
|
118
|
+
# reasons to ignore a task are not affected by metadata forcing
|
119
|
+
def force_task?(task)
|
120
|
+
!!metadata["run_#{task}"]
|
121
|
+
end
|
122
|
+
end
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 4,
|
15
|
+
VERSION = [1.3, 4, 3].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2023, 4,
|
23
|
+
VERSION_DATE = Date.new(2023, 4, 20)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -18,6 +18,14 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
18
18
|
for i in $DS ; do
|
19
19
|
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
20
20
|
done
|
21
|
+
# The following block pipes retrieved data from all databases, reorganizes the
|
22
|
+
# names in canonical order, and removes repeats from the first two columns,
|
23
|
+
# in order to keep only one result per pair. This is not being included into
|
24
|
+
# production, but the code may be useful for extremely large databases.
|
25
|
+
# | tee \
|
26
|
+
# | awk -F"\t" \
|
27
|
+
# 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
|
28
|
+
# | sort -k 1,2 -u
|
21
29
|
) | gzip -9c > miga-project.txt.gz
|
22
30
|
|
23
31
|
# R-ify
|
data/utils/subclades.R
CHANGED
@@ -47,7 +47,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
47
47
|
say("Loading")
|
48
48
|
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
49
49
|
sep = " ", as.is = TRUE)[,1]
|
50
|
-
a <- read.table(paste(out_base, "classif", sep="."),
|
50
|
+
a <- read.table(paste(out_base, "classif", sep = "."),
|
51
51
|
sep = "\t", as.is = TRUE)
|
52
52
|
ani.types <- a[,2]
|
53
53
|
names(ani.types) <- a[,1]
|
@@ -70,17 +70,17 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
70
70
|
say("Recursive search")
|
71
71
|
for (i in 1:length(ani.medoids)) {
|
72
72
|
medoid <- ani.medoids[i]
|
73
|
-
ds_f <- names(ani.types)[
|
73
|
+
ds_f <- names(ani.types)[ani.types == i]
|
74
74
|
say("Analyzing subclade", i, "with medoid:", medoid)
|
75
|
-
dir_f <- paste(out_base, ".sc-", i, sep="")
|
75
|
+
dir_f <- paste(out_base, ".sc-", i, sep = "")
|
76
76
|
if (!dir.exists(dir_f)) dir.create(dir_f)
|
77
77
|
write.table(ds_f,
|
78
|
-
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
79
|
-
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
78
|
+
paste(out_base, ".sc-", i, "/miga-project.all", sep = ""),
|
79
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE)
|
80
80
|
if (length(ds_f) > 8L) {
|
81
81
|
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
82
82
|
subclades(
|
83
|
-
out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
83
|
+
out_base = paste(out_base, ".sc-", i, "/miga-project", sep = ""),
|
84
84
|
thr = thr,
|
85
85
|
ani.d = ani_subset
|
86
86
|
)
|
@@ -111,7 +111,7 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
111
111
|
# Silhouette
|
112
112
|
say("Silhouette")
|
113
113
|
nn <- length(labels(ani.d))
|
114
|
-
k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
|
114
|
+
k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
|
115
115
|
say("- Make cluster")
|
116
116
|
cl <- makeCluster(thr)
|
117
117
|
say("- Launch parallel jobs")
|
@@ -119,8 +119,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
119
119
|
cl, k,
|
120
120
|
function(x) {
|
121
121
|
library(cluster)
|
122
|
-
s <- pam(ani.d, x, do.swap = FALSE,
|
123
|
-
c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
|
122
|
+
s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
|
123
|
+
c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
|
124
124
|
}
|
125
125
|
)
|
126
126
|
say("- Stop cluster")
|
@@ -135,29 +135,38 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
|
135
135
|
|
136
136
|
# Classify genomes
|
137
137
|
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
138
|
-
|
138
|
+
is.huge <- length(labels(ani.d)) > 4e4
|
139
|
+
ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
|
139
140
|
ani.types <- ani.cl$clustering
|
140
141
|
ani.medoids <- ani.cl$medoids
|
141
142
|
|
142
143
|
# Build tree
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
if (is.huge) {
|
145
|
+
say("Bypassing tree for large set")
|
146
|
+
write.table(
|
147
|
+
'{}', file = paste(out_base, ".nwk", sep = ""),
|
148
|
+
col.names = FALSE, row.names = FALSE, quote = FALSE
|
149
|
+
)
|
150
|
+
} else {
|
151
|
+
say("Tree")
|
152
|
+
ani.ph <- bionj(ani.d)
|
153
|
+
say("- Write")
|
154
|
+
express.ori <- options("expressions")$expressions
|
155
|
+
if(express.ori < ani.ph$Nnode * 4){
|
156
|
+
options(expressions = min(c(5e7, ani.ph$Nnode * 4)))
|
157
|
+
}
|
158
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
159
|
+
options(expressions = express.ori)
|
149
160
|
}
|
150
|
-
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
151
|
-
options(expressions=express.ori)
|
152
161
|
|
153
162
|
# Generate graphic report
|
154
163
|
say("Graphic report")
|
155
164
|
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
156
165
|
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
157
166
|
plot_distances(ani.d)
|
158
|
-
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
167
|
+
plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
|
159
168
|
plot_clustering(ani.cl, ani.d, ani.types)
|
160
|
-
plot_tree(ani.ph, ani.types, ani.medoids)
|
169
|
+
if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
|
161
170
|
dev.off()
|
162
171
|
|
163
172
|
# Save results
|
@@ -198,7 +207,7 @@ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
|
|
198
207
|
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
199
208
|
}
|
200
209
|
write.table(
|
201
|
-
classif, paste(out_base, "classif", sep="."),
|
210
|
+
classif, paste(out_base, "classif", sep = "."),
|
202
211
|
quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
|
203
212
|
)
|
204
213
|
}
|
@@ -249,7 +258,8 @@ plot_clustering <- function (cl, dist, types) {
|
|
249
258
|
top.n <- length(cl$medoids)
|
250
259
|
col <- ggplotColours(top.n)
|
251
260
|
plot(silhouette(cl), col = col)
|
252
|
-
|
261
|
+
dist.n <- length(labels(dist))
|
262
|
+
if (dist.n <= 15 | dist.n > 4e4) {
|
253
263
|
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
254
264
|
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
255
265
|
} else {
|
@@ -261,7 +271,7 @@ plot_clustering <- function (cl, dist, types) {
|
|
261
271
|
)
|
262
272
|
plot(
|
263
273
|
ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
|
264
|
-
xlab = "Component 3", ylab="Component 4"
|
274
|
+
xlab = "Component 3", ylab = "Component 4"
|
265
275
|
)
|
266
276
|
}else{
|
267
277
|
for (i in 1:2)
|
@@ -305,16 +315,28 @@ ani_distance <- function (ani_file, sel) {
|
|
305
315
|
if (nrow(sim) == 0) return(NULL)
|
306
316
|
|
307
317
|
# Apply filter (if requested)
|
318
|
+
ids <- NULL
|
308
319
|
if (!is.na(sel) && file.exists(sel)) {
|
309
320
|
say("Filter selection")
|
310
|
-
|
311
|
-
sim <- sim[sim$a %in%
|
321
|
+
ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
322
|
+
sim <- sim[sim$a %in% ids & sim$b %in% ids, ]
|
323
|
+
} else {
|
324
|
+
ids <- with(sim, unique(c(a, b)))
|
312
325
|
}
|
313
326
|
|
314
327
|
# Transform to distances
|
315
328
|
say("Distances")
|
316
329
|
sim$d <- 1 - (sim$value / 100)
|
317
|
-
return(
|
330
|
+
return(as.dist(with(sim, {
|
331
|
+
out <- matrix(
|
332
|
+
max(d) * 1.2, nrow = length(ids), ncol = length(ids),
|
333
|
+
dimnames = list(ids, ids)
|
334
|
+
)
|
335
|
+
out[cbind(ids, ids)] <- 0
|
336
|
+
out[cbind(a, b)] <- d
|
337
|
+
out[cbind(b, a)] <- d
|
338
|
+
out
|
339
|
+
})))
|
318
340
|
}
|
319
341
|
|
320
342
|
#= Main
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.4.
|
4
|
+
version: 1.3.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-04-
|
11
|
+
date: 2023-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|