miga-base 1.3.4.2 → 1.3.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 786b98a997a9145ad36a42404b756a65ca5f8abe628adf10e27960c58b7d5641
4
- data.tar.gz: 95bc1da96a509cbe3c9334b78daf4ebaec9238693ef37fd5f417063943c9cf36
3
+ metadata.gz: d17244b326441f224e4626b53702018fd0f3915d44e16e9be939a70dd86ceefa
4
+ data.tar.gz: 8f6c7544ab57957dbfeb53230e56ba9e7eda37c7bab2fea538ee17d9f54fd9f7
5
5
  SHA512:
6
- metadata.gz: 909a0cf12a2048024d4c8f5ed2210661d96b78b4c6a6efdae650ca49d6f34f8439c812fb14b30df664c8758beb4c532bc4b1c7a6a09e108f7c48418a03d09902
7
- data.tar.gz: 2af006dc05851eb66eb7e3f0fec127b10bec73945b5761a3145d9973935a65c93b7b010bb977b4b492f527201f31b68c5caa4c68d8815c2d474f96420c642d31
6
+ metadata.gz: bd77962a43caa04c72d01ddbe251fb02650f0f003c55b2b13a44ac1affb77b52535e5dd5fd7713c387f5a7bafdd8839255f2cd8d82be8d4c025b562a7beabfbd
7
+ data.tar.gz: b0cd50e25aa16ce2c64202daa5bbbb256e3f83742513ac984715f1b7c96f7297887856a2853ac6ec49dd43fe63b5c37cd96e83f8047ebcd32fdbc47addf892f7
@@ -120,7 +120,12 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
120
120
  def create_remote_dataset(sub_cli, p)
121
121
  sub_cli.ensure_par(dataset: '-D', ids: '-I')
122
122
  unless sub_cli[:api_key].nil?
123
- ENV["#{sub_cli[:universe].to_s.upcase}_API_KEY"] = sub_cli[:api_key]
123
+ if sub_cli[:universe] == :web && sub_cli[:db] == :assembly_gz
124
+ ENV['NCBI_API_KEY'] = sub_cli[:api_key]
125
+ end
126
+
127
+ var_space = sub_cli[:universe].to_s.upcase
128
+ ENV["#{var_space}_API_KEY"] = sub_cli[:api_key]
124
129
  end
125
130
 
126
131
  sub_cli.say "Dataset: #{sub_cli[:dataset]}"
@@ -76,18 +76,47 @@ module MiGA::Dataset::Result::Ignore
76
76
  ##
77
77
  # Ignore +task+ because it's not a reference dataset
78
78
  def ignore_noref?(task)
79
- self.class.EXCLUDE_NOREF_TASKS.include?(task) && !ref?
79
+ ignore_by_type?(task, :noref)
80
80
  end
81
81
 
82
82
  ##
83
83
  # Ignore +task+ because it's not a multi dataset
84
84
  def ignore_multi?(task)
85
- self.class.ONLY_MULTI_TASKS.include?(task) && !multi?
85
+ ignore_by_type?(task, :multi)
86
86
  end
87
87
 
88
88
  ##
89
89
  # Ignore +task+ because it's not a nonmulti dataset
90
90
  def ignore_nonmulti?(task)
91
- self.class.ONLY_NONMULTI_TASKS.include?(task) && !nonmulti?
91
+ ignore_by_type?(task, :nonmulti)
92
92
  end
93
- end
93
+
94
+ ##
95
+ # Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or
96
+ # +:nonmulti+
97
+ def ignore_by_type?(task, type)
98
+ return false if force_task?(task)
99
+
100
+ test, list =
101
+ case type.to_sym
102
+ when :noref
103
+ [:ref?, self.class.EXCLUDE_NOREF_TASKS]
104
+ when :multi
105
+ [:multi?, self.class.ONLY_MULTI_TASKS]
106
+ when :nonmulti
107
+ [:nonmulti?, self.class.ONLY_NONMULTI_TASKS]
108
+ else
109
+ raise "Unexpected error, unknown type reason: #{type}"
110
+ end
111
+
112
+ list.include?(task) && !send(test)
113
+ end
114
+
115
+ ##
116
+ # Force the +task+ to be executed even if it should otherwise be
117
+ # ignored due to reasons: +:noref+, +:multi+, or +:nonmulti+. Other
118
+ # reasons to ignore a task are not affected by metadata forcing
119
+ def force_task?(task)
120
+ !!metadata["run_#{task}"]
121
+ end
122
+ end
@@ -141,12 +141,16 @@ module MiGA::Result::Stats
141
141
  # Determine qualitative range
142
142
  stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
143
143
  source.metadata[:quality] =
144
- case stats[:quality]
145
- when 80..100; :excellent
146
- when 50..80; :high
147
- when 20..50; :intermediate
148
- else; :low
144
+ if stats[:completeness][0] >= 90 && stats[:contamination][0] <= 5
145
+ :excellent # Finished or High-quality draft*
146
+ elsif stats[:completeness][0] >= 50 && stats[:contamination][0] <= 10
147
+ :high # Medium-quality draft*
148
+ elsif stats[:quality] >= 25
149
+ :intermediate # Low-quality draft* but sufficient for classification
150
+ else
151
+ :low # Low-quality draft* and insufficient for classification
149
152
  end
153
+ # * Bowers et al 2017, DOI: 10.1038/nbt.3893
150
154
  source.save
151
155
 
152
156
  # Inactivate low-quality datasets
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 4, 2].freeze
15
+ VERSION = [1.3, 5, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 4, 7)
23
+ VERSION_DATE = Date.new(2023, 4, 21)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -18,6 +18,14 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
18
18
  for i in $DS ; do
19
19
  echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
20
20
  done
21
+ # The following block pipes retrieved data from all databases, reorganizes the
22
+ # names in canonical order, and removes repeats from the first two columns,
23
+ # in order to keep only one result per pair. This is not being included into
24
+ # production, but the code may be useful for extremely large databases.
25
+ # | tee \
26
+ # | awk -F"\t" \
27
+ # 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
28
+ # | sort -k 1,2 -u
21
29
  ) | gzip -9c > miga-project.txt.gz
22
30
 
23
31
  # R-ify
data/utils/subclades.R CHANGED
@@ -47,7 +47,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
47
47
  say("Loading")
48
48
  ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
49
49
  sep = " ", as.is = TRUE)[,1]
50
- a <- read.table(paste(out_base, "classif", sep="."),
50
+ a <- read.table(paste(out_base, "classif", sep = "."),
51
51
  sep = "\t", as.is = TRUE)
52
52
  ani.types <- a[,2]
53
53
  names(ani.types) <- a[,1]
@@ -70,17 +70,17 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
70
70
  say("Recursive search")
71
71
  for (i in 1:length(ani.medoids)) {
72
72
  medoid <- ani.medoids[i]
73
- ds_f <- names(ani.types)[ ani.types==i ]
73
+ ds_f <- names(ani.types)[ani.types == i]
74
74
  say("Analyzing subclade", i, "with medoid:", medoid)
75
- dir_f <- paste(out_base, ".sc-", i, sep="")
75
+ dir_f <- paste(out_base, ".sc-", i, sep = "")
76
76
  if (!dir.exists(dir_f)) dir.create(dir_f)
77
77
  write.table(ds_f,
78
- paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
79
- quote=FALSE, col.names=FALSE, row.names=FALSE)
78
+ paste(out_base, ".sc-", i, "/miga-project.all", sep = ""),
79
+ quote = FALSE, col.names = FALSE, row.names = FALSE)
80
80
  if (length(ds_f) > 8L) {
81
81
  ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
82
82
  subclades(
83
- out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
83
+ out_base = paste(out_base, ".sc-", i, "/miga-project", sep = ""),
84
84
  thr = thr,
85
85
  ani.d = ani_subset
86
86
  )
@@ -111,7 +111,7 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
111
111
  # Silhouette
112
112
  say("Silhouette")
113
113
  nn <- length(labels(ani.d))
114
- k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
114
+ k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
115
115
  say("- Make cluster")
116
116
  cl <- makeCluster(thr)
117
117
  say("- Launch parallel jobs")
@@ -119,8 +119,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
119
119
  cl, k,
120
120
  function(x) {
121
121
  library(cluster)
122
- s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
123
- c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
122
+ s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
123
+ c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
124
124
  }
125
125
  )
126
126
  say("- Stop cluster")
@@ -135,29 +135,38 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
135
135
 
136
136
  # Classify genomes
137
137
  say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
138
- ani.cl <- pam(ani.d, top.n)
138
+ is.huge <- length(labels(ani.d)) > 4e4
139
+ ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
139
140
  ani.types <- ani.cl$clustering
140
141
  ani.medoids <- ani.cl$medoids
141
142
 
142
143
  # Build tree
143
- say("Tree")
144
- ani.ph <- bionj(ani.d)
145
- say("- Write")
146
- express.ori <- options("expressions")$expressions
147
- if(express.ori < ani.ph$Nnode * 4){
148
- options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
144
+ if (is.huge) {
145
+ say("Bypassing tree for large set")
146
+ write.table(
147
+ '{}', file = paste(out_base, ".nwk", sep = ""),
148
+ col.names = FALSE, row.names = FALSE, quote = FALSE
149
+ )
150
+ } else {
151
+ say("Tree")
152
+ ani.ph <- bionj(ani.d)
153
+ say("- Write")
154
+ express.ori <- options("expressions")$expressions
155
+ if(express.ori < ani.ph$Nnode * 4){
156
+ options(expressions = min(c(5e7, ani.ph$Nnode * 4)))
157
+ }
158
+ write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
159
+ options(expressions = express.ori)
149
160
  }
150
- write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
151
- options(expressions=express.ori)
152
161
 
153
162
  # Generate graphic report
154
163
  say("Graphic report")
155
164
  pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
156
165
  layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
157
166
  plot_distances(ani.d)
158
- plot_silhouette(k, s[1,], s[2,], ds, top.n)
167
+ plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
159
168
  plot_clustering(ani.cl, ani.d, ani.types)
160
- plot_tree(ani.ph, ani.types, ani.medoids)
169
+ if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
161
170
  dev.off()
162
171
 
163
172
  # Save results
@@ -198,7 +207,7 @@ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
198
207
  classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
199
208
  }
200
209
  write.table(
201
- classif, paste(out_base, "classif", sep="."),
210
+ classif, paste(out_base, "classif", sep = "."),
202
211
  quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
203
212
  )
204
213
  }
@@ -249,7 +258,8 @@ plot_clustering <- function (cl, dist, types) {
249
258
  top.n <- length(cl$medoids)
250
259
  col <- ggplotColours(top.n)
251
260
  plot(silhouette(cl), col = col)
252
- if (length(labels(dist)) <= 15) {
261
+ dist.n <- length(labels(dist))
262
+ if (dist.n <= 15 | dist.n > 4e4) {
253
263
  plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
254
264
  plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
255
265
  } else {
@@ -261,7 +271,7 @@ plot_clustering <- function (cl, dist, types) {
261
271
  )
262
272
  plot(
263
273
  ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
264
- xlab = "Component 3", ylab="Component 4"
274
+ xlab = "Component 3", ylab = "Component 4"
265
275
  )
266
276
  }else{
267
277
  for (i in 1:2)
@@ -305,16 +315,28 @@ ani_distance <- function (ani_file, sel) {
305
315
  if (nrow(sim) == 0) return(NULL)
306
316
 
307
317
  # Apply filter (if requested)
318
+ ids <- NULL
308
319
  if (!is.na(sel) && file.exists(sel)) {
309
320
  say("Filter selection")
310
- lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
311
- sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
321
+ ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
322
+ sim <- sim[sim$a %in% ids & sim$b %in% ids, ]
323
+ } else {
324
+ ids <- with(sim, unique(c(a, b)))
312
325
  }
313
326
 
314
327
  # Transform to distances
315
328
  say("Distances")
316
329
  sim$d <- 1 - (sim$value / 100)
317
- return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
330
+ return(as.dist(with(sim, {
331
+ out <- matrix(
332
+ max(d) * 1.2, nrow = length(ids), ncol = length(ids),
333
+ dimnames = list(ids, ids)
334
+ )
335
+ out[cbind(ids, ids)] <- 0
336
+ out[cbind(a, b)] <- d
337
+ out[cbind(b, a)] <- d
338
+ out
339
+ })))
318
340
  }
319
341
 
320
342
  #= Main
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4.2
4
+ version: 1.3.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-07 00:00:00.000000000 Z
11
+ date: 2023-04-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons