miga-base 1.3.4.2 → 1.3.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 786b98a997a9145ad36a42404b756a65ca5f8abe628adf10e27960c58b7d5641
4
- data.tar.gz: 95bc1da96a509cbe3c9334b78daf4ebaec9238693ef37fd5f417063943c9cf36
3
+ metadata.gz: '04760780c4ec69edaeb55949c2f15933c7d9ecc1c620bbeca28a01679f957db3'
4
+ data.tar.gz: 885e191e8c77a7e117d1e22875f6c90615f7cc001d0215930fd690022a88fb0d
5
5
  SHA512:
6
- metadata.gz: 909a0cf12a2048024d4c8f5ed2210661d96b78b4c6a6efdae650ca49d6f34f8439c812fb14b30df664c8758beb4c532bc4b1c7a6a09e108f7c48418a03d09902
7
- data.tar.gz: 2af006dc05851eb66eb7e3f0fec127b10bec73945b5761a3145d9973935a65c93b7b010bb977b4b492f527201f31b68c5caa4c68d8815c2d474f96420c642d31
6
+ metadata.gz: 2b47739ea450c9217119ad61317559e7429f87b04531ad624555538885207ffbb054715d18f60757a41f5c7816cb3cdc06b4b81a378b509ac3c1518452ba02ec
7
+ data.tar.gz: a3c08caf1d98ea5de2c7b137004c126a5b1541403cacd8ab88d0fa3da8a0617fe380448370f3e038bd70d3a4342b20d6476a94a5062c6aec9615838f694c5e6a
@@ -120,7 +120,12 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
120
120
  def create_remote_dataset(sub_cli, p)
121
121
  sub_cli.ensure_par(dataset: '-D', ids: '-I')
122
122
  unless sub_cli[:api_key].nil?
123
- ENV["#{sub_cli[:universe].to_s.upcase}_API_KEY"] = sub_cli[:api_key]
123
+ if sub_cli[:universe] == :web && sub_cli[:db] == :assembly_gz
124
+ ENV['NCBI_API_KEY'] = sub_cli[:api_key]
125
+ end
126
+
127
+ var_space = sub_cli[:universe].to_s.upcase
128
+ ENV["#{var_space}_API_KEY"] = sub_cli[:api_key]
124
129
  end
125
130
 
126
131
  sub_cli.say "Dataset: #{sub_cli[:dataset]}"
@@ -76,18 +76,47 @@ module MiGA::Dataset::Result::Ignore
76
76
  ##
77
77
  # Ignore +task+ because it's not a reference dataset
78
78
  def ignore_noref?(task)
79
- self.class.EXCLUDE_NOREF_TASKS.include?(task) && !ref?
79
+ ignore_by_type?(task, :noref)
80
80
  end
81
81
 
82
82
  ##
83
83
  # Ignore +task+ because it's not a multi dataset
84
84
  def ignore_multi?(task)
85
- self.class.ONLY_MULTI_TASKS.include?(task) && !multi?
85
+ ignore_by_type?(task, :multi)
86
86
  end
87
87
 
88
88
  ##
89
89
  # Ignore +task+ because it's not a nonmulti dataset
90
90
  def ignore_nonmulti?(task)
91
- self.class.ONLY_NONMULTI_TASKS.include?(task) && !nonmulti?
91
+ ignore_by_type?(task, :nonmulti)
92
92
  end
93
- end
93
+
94
+ ##
95
+ # Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or
96
+ # +:nonmulti+
97
+ def ignore_by_type?(task, type)
98
+ return false if force_task?(task)
99
+
100
+ test, list =
101
+ case type.to_sym
102
+ when :noref
103
+ [:ref?, self.class.EXCLUDE_NOREF_TASKS]
104
+ when :multi
105
+ [:multi?, self.class.ONLY_MULTI_TASKS]
106
+ when :nonmulti
107
+ [:nonmulti?, self.class.ONLY_NONMULTI_TASKS]
108
+ else
109
+ raise "Unexpected error, unknown type reason: #{type}"
110
+ end
111
+
112
+ list.include?(task) && !send(test)
113
+ end
114
+
115
+ ##
116
+ # Force the +task+ to be executed even if it should otherwise be
117
+ # ignored due to reasons: +:noref+, +:multi+, or +:nonmulti+. Other
118
+ # reasons to ignore a task are not affected by metadata forcing
119
+ def force_task?(task)
120
+ !!metadata["run_#{task}"]
121
+ end
122
+ end
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 4, 2].freeze
15
+ VERSION = [1.3, 4, 3].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 4, 7)
23
+ VERSION_DATE = Date.new(2023, 4, 20)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -18,6 +18,14 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
18
18
  for i in $DS ; do
19
19
  echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
20
20
  done
21
+ # The following block pipes retrieved data from all databases, reorganizes the
22
+ # names in canonical order, and removes repeats from the first two columns,
23
+ # in order to keep only one result per pair. This is not being included in
24
+ # production, but the code may be useful for extremely large databases.
25
+ # | tee \
26
+ # | awk -F"\t" \
27
+ # 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
28
+ # | sort -k 1,2 -u
21
29
  ) | gzip -9c > miga-project.txt.gz
22
30
 
23
31
  # R-ify
data/utils/subclades.R CHANGED
@@ -47,7 +47,7 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
47
47
  say("Loading")
48
48
  ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
49
49
  sep = " ", as.is = TRUE)[,1]
50
- a <- read.table(paste(out_base, "classif", sep="."),
50
+ a <- read.table(paste(out_base, "classif", sep = "."),
51
51
  sep = "\t", as.is = TRUE)
52
52
  ani.types <- a[,2]
53
53
  names(ani.types) <- a[,1]
@@ -70,17 +70,17 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
70
70
  say("Recursive search")
71
71
  for (i in 1:length(ani.medoids)) {
72
72
  medoid <- ani.medoids[i]
73
- ds_f <- names(ani.types)[ ani.types==i ]
73
+ ds_f <- names(ani.types)[ani.types == i]
74
74
  say("Analyzing subclade", i, "with medoid:", medoid)
75
- dir_f <- paste(out_base, ".sc-", i, sep="")
75
+ dir_f <- paste(out_base, ".sc-", i, sep = "")
76
76
  if (!dir.exists(dir_f)) dir.create(dir_f)
77
77
  write.table(ds_f,
78
- paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
79
- quote=FALSE, col.names=FALSE, row.names=FALSE)
78
+ paste(out_base, ".sc-", i, "/miga-project.all", sep = ""),
79
+ quote = FALSE, col.names = FALSE, row.names = FALSE)
80
80
  if (length(ds_f) > 8L) {
81
81
  ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
82
82
  subclades(
83
- out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
83
+ out_base = paste(out_base, ".sc-", i, "/miga-project", sep = ""),
84
84
  thr = thr,
85
85
  ani.d = ani_subset
86
86
  )
@@ -111,7 +111,7 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
111
111
  # Silhouette
112
112
  say("Silhouette")
113
113
  nn <- length(labels(ani.d))
114
- k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
114
+ k <- min(max(floor(0.005 * nn), 2), 20):min(nn - 1, 100)
115
115
  say("- Make cluster")
116
116
  cl <- makeCluster(thr)
117
117
  say("- Launch parallel jobs")
@@ -119,8 +119,8 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
119
119
  cl, k,
120
120
  function(x) {
121
121
  library(cluster)
122
- s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
123
- c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
122
+ s <- pam(ani.d, x, do.swap = FALSE, variant = "faster")$silinfo
123
+ c(s$avg.width, -sum(ifelse(s$widths[, 3] > 0, 0, s$widths[, 3])))
124
124
  }
125
125
  )
126
126
  say("- Stop cluster")
@@ -135,29 +135,38 @@ subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
135
135
 
136
136
  # Classify genomes
137
137
  say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
138
- ani.cl <- pam(ani.d, top.n)
138
+ is.huge <- length(labels(ani.d)) > 4e4
139
+ ani.cl <- pam(ani.d, top.n, variant = "faster", do.swap = !is.huge)
139
140
  ani.types <- ani.cl$clustering
140
141
  ani.medoids <- ani.cl$medoids
141
142
 
142
143
  # Build tree
143
- say("Tree")
144
- ani.ph <- bionj(ani.d)
145
- say("- Write")
146
- express.ori <- options("expressions")$expressions
147
- if(express.ori < ani.ph$Nnode * 4){
148
- options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
144
+ if (is.huge) {
145
+ say("Bypassing tree for large set")
146
+ write.table(
147
+ '{}', file = paste(out_base, ".nwk", sep = ""),
148
+ col.names = FALSE, row.names = FALSE, quote = FALSE
149
+ )
150
+ } else {
151
+ say("Tree")
152
+ ani.ph <- bionj(ani.d)
153
+ say("- Write")
154
+ express.ori <- options("expressions")$expressions
155
+ if(express.ori < ani.ph$Nnode * 4){
156
+ options(expressions = min(c(5e7, ani.ph$Nnode * 4)))
157
+ }
158
+ write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
159
+ options(expressions = express.ori)
149
160
  }
150
- write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
151
- options(expressions=express.ori)
152
161
 
153
162
  # Generate graphic report
154
163
  say("Graphic report")
155
164
  pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
156
165
  layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
157
166
  plot_distances(ani.d)
158
- plot_silhouette(k, s[1,], s[2,], ds, top.n)
167
+ plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
159
168
  plot_clustering(ani.cl, ani.d, ani.types)
160
- plot_tree(ani.ph, ani.types, ani.medoids)
169
+ if (!is.huge) plot_tree(ani.ph, ani.types, ani.medoids)
161
170
  dev.off()
162
171
 
163
172
  # Save results
@@ -198,7 +207,7 @@ write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
198
207
  classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
199
208
  }
200
209
  write.table(
201
- classif, paste(out_base, "classif", sep="."),
210
+ classif, paste(out_base, "classif", sep = "."),
202
211
  quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
203
212
  )
204
213
  }
@@ -249,7 +258,8 @@ plot_clustering <- function (cl, dist, types) {
249
258
  top.n <- length(cl$medoids)
250
259
  col <- ggplotColours(top.n)
251
260
  plot(silhouette(cl), col = col)
252
- if (length(labels(dist)) <= 15) {
261
+ dist.n <- length(labels(dist))
262
+ if (dist.n <= 15 | dist.n > 4e4) {
253
263
  plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
254
264
  plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
255
265
  } else {
@@ -261,7 +271,7 @@ plot_clustering <- function (cl, dist, types) {
261
271
  )
262
272
  plot(
263
273
  ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
264
- xlab = "Component 3", ylab="Component 4"
274
+ xlab = "Component 3", ylab = "Component 4"
265
275
  )
266
276
  }else{
267
277
  for (i in 1:2)
@@ -305,16 +315,28 @@ ani_distance <- function (ani_file, sel) {
305
315
  if (nrow(sim) == 0) return(NULL)
306
316
 
307
317
  # Apply filter (if requested)
318
+ ids <- NULL
308
319
  if (!is.na(sel) && file.exists(sel)) {
309
320
  say("Filter selection")
310
- lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
311
- sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
321
+ ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
322
+ sim <- sim[sim$a %in% ids & sim$b %in% ids, ]
323
+ } else {
324
+ ids <- with(sim, unique(c(a, b)))
312
325
  }
313
326
 
314
327
  # Transform to distances
315
328
  say("Distances")
316
329
  sim$d <- 1 - (sim$value / 100)
317
- return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
330
+ return(as.dist(with(sim, {
331
+ out <- matrix(
332
+ max(d) * 1.2, nrow = length(ids), ncol = length(ids),
333
+ dimnames = list(ids, ids)
334
+ )
335
+ out[cbind(ids, ids)] <- 0
336
+ out[cbind(a, b)] <- d
337
+ out[cbind(b, a)] <- d
338
+ out
339
+ })))
318
340
  }
319
341
 
320
342
  #= Main
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4.2
4
+ version: 1.3.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-07 00:00:00.000000000 Z
11
+ date: 2023-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons