miga-base 1.1.2.1 → 1.1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/init/daemon_helper.rb +1 -1
- data/lib/miga/cli/action/init/files_helper.rb +1 -1
- data/lib/miga/dataset.rb +2 -2
- data/lib/miga/project/dataset.rb +6 -2
- data/lib/miga/project/result.rb +11 -8
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +8 -8
- data/scripts/ani_distances.bash +8 -8
- data/scripts/haai_distances.bash +1 -1
- data/test/project_test.rb +7 -6
- data/utils/find-medoid.R +18 -14
- data/utils/subclade/pipeline.rb +1 -3
- data/utils/subclades.R +195 -128
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 73de682930481bd837b829588081e2c9e70a87054e9e1d91b7d40bf319030349
|
4
|
+
data.tar.gz: b0ed9f7f1acf8fb2530fde84803938e8f3d7fac3400a629c3db88779dd9a679f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4477253800d6a04f3b8e612ed8f5af8ccdb47e5f6aff6efdf2539d252daa7cbc70d009f28ec75fcada925420590614589effce76e465a93d9cb2a7ce093d79ff
|
7
|
+
data.tar.gz: bbb998c715274dc6b000fa3fc4b9f1f550867b78707b4b5eaa43c24b692939769c2958c3beef0bc10f6dbfd9f38a63b143b2617e44208550b19aa837c45711aa
|
@@ -5,7 +5,7 @@
|
|
5
5
|
module MiGA::Cli::Action::Init::DaemonHelper
|
6
6
|
def configure_daemon
|
7
7
|
cli.puts 'Default daemon configuration:'
|
8
|
-
daemon_f = File.expand_path('.miga_daemon.json', ENV['
|
8
|
+
daemon_f = File.expand_path('.miga_daemon.json', ENV['MIGA_HOME'])
|
9
9
|
unless File.exist?(daemon_f) and cli.ask_user(
|
10
10
|
'A template daemon already exists, do you want to preserve it?',
|
11
11
|
'yes', %w(yes no)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
# Helper module with files configuration functions for MiGA::Cli::Action::Init
|
5
5
|
module MiGA::Cli::Action::Init::FilesHelper
|
6
6
|
def open_rc_file
|
7
|
-
rc_path = File.expand_path('.miga_rc', ENV['
|
7
|
+
rc_path = File.expand_path('.miga_rc', ENV['MIGA_HOME'])
|
8
8
|
if File.exist? rc_path
|
9
9
|
if cli.ask_user(
|
10
10
|
'I found a previous configuration. Do you want to continue?',
|
data/lib/miga/dataset.rb
CHANGED
@@ -107,7 +107,7 @@ class MiGA::Dataset < MiGA::MiGA
|
|
107
107
|
metadata[:warn] = "Inactive: #{reason}" unless reason.nil?
|
108
108
|
metadata[:inactive] = true
|
109
109
|
metadata.save
|
110
|
-
project.recalculate_tasks(
|
110
|
+
project.recalculate_tasks("Reference dataset inactivated: #{name}") if ref?
|
111
111
|
pull_hook :on_inactivate
|
112
112
|
end
|
113
113
|
|
@@ -117,7 +117,7 @@ class MiGA::Dataset < MiGA::MiGA
|
|
117
117
|
metadata[:inactive] = nil
|
118
118
|
metadata[:warn] = nil if metadata[:warn] && metadata[:warn] =~ /^Inactive: /
|
119
119
|
metadata.save
|
120
|
-
project.recalculate_tasks(
|
120
|
+
project.recalculate_tasks("Reference dataset activated: #{name}") if ref?
|
121
121
|
pull_hook :on_activate
|
122
122
|
end
|
123
123
|
|
data/lib/miga/project/dataset.rb
CHANGED
@@ -52,7 +52,9 @@ module MiGA::Project::Dataset
|
|
52
52
|
@metadata[:datasets] << name
|
53
53
|
@dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
|
54
54
|
save
|
55
|
-
|
55
|
+
if d.ref? && d.active?
|
56
|
+
recalculate_tasks("Reference dataset added: #{d.name}")
|
57
|
+
end
|
56
58
|
pull_hook(:on_add_dataset, name)
|
57
59
|
end
|
58
60
|
dataset(name)
|
@@ -66,7 +68,9 @@ module MiGA::Project::Dataset
|
|
66
68
|
|
67
69
|
self.metadata[:datasets].delete(name)
|
68
70
|
save
|
69
|
-
|
71
|
+
if d.ref? && d.active?
|
72
|
+
recalculate_tasks("Reference dataset unlinked: #{d.name}")
|
73
|
+
end
|
70
74
|
pull_hook(:on_unlink_dataset, name)
|
71
75
|
d
|
72
76
|
end
|
data/lib/miga/project/result.rb
CHANGED
@@ -55,10 +55,11 @@ module MiGA::Project::Result
|
|
55
55
|
##
|
56
56
|
# Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
|
57
57
|
def add_result_distances(base, _opts)
|
58
|
-
return nil unless result_files_exist?(base, %w[.
|
58
|
+
return nil unless result_files_exist?(base, %w[.rds .txt])
|
59
59
|
|
60
60
|
r = MiGA::Result.new("#{base}.json")
|
61
|
-
r.add_file(:
|
61
|
+
r.add_file(:rds, 'miga-project.rds')
|
62
|
+
r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
|
62
63
|
r.add_file(:matrix, 'miga-project.txt')
|
63
64
|
r.add_file(:log, 'miga-project.log') # Legacy file
|
64
65
|
r.add_file(:hist, 'miga-project.hist')
|
@@ -82,12 +83,13 @@ module MiGA::Project::Result
|
|
82
83
|
end
|
83
84
|
|
84
85
|
r = add_result_iter_clades(base)
|
85
|
-
r.add_file(:
|
86
|
-
r.add_file(:
|
87
|
-
r.add_file(:
|
88
|
-
r.add_file(:
|
89
|
-
r.add_file(:
|
90
|
-
r.add_file(:
|
86
|
+
r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
|
87
|
+
r.add_file(:aai_tree, 'miga-project.aai.nwk')
|
88
|
+
r.add_file(:proposal, 'miga-project.proposed-clades')
|
89
|
+
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
90
|
+
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
91
|
+
r.add_file(:clades_gsp, 'miga-project.gsp-clades')
|
92
|
+
r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
|
91
93
|
r
|
92
94
|
end
|
93
95
|
|
@@ -105,6 +107,7 @@ module MiGA::Project::Result
|
|
105
107
|
|
106
108
|
r = add_result_iter_clades(base)
|
107
109
|
r.add_file(:ani_tree, 'miga-project.ani.nwk')
|
110
|
+
r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
|
108
111
|
r
|
109
112
|
end
|
110
113
|
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.1,
|
15
|
+
VERSION = [1.1, 3, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem relese.
|
23
|
-
VERSION_DATE = Date.new(2021, 11,
|
23
|
+
VERSION_DATE = Date.new(2021, 11, 21)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
22
22
|
|
23
23
|
# R-ify
|
24
24
|
cat <<R | R --vanilla
|
25
|
-
file <- gzfile(
|
26
|
-
aai <- read.table(file, sep =
|
27
|
-
|
28
|
-
if(sum(aai[,
|
29
|
-
h <- hist(aai[aai[,
|
30
|
-
len <- length(h[[
|
25
|
+
file <- gzfile("miga-project.txt.gz")
|
26
|
+
aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
|
27
|
+
saveRDS(aai, file = "miga-project.rds")
|
28
|
+
if(sum(aai[, "a"] != aai[, "b"]) > 0) {
|
29
|
+
h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[["breaks"]])
|
31
31
|
write.table(
|
32
|
-
cbind(h[[
|
33
|
-
file =
|
32
|
+
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
|
33
|
+
file = "miga-project.hist", quote = FALSE, sep = "\t",
|
34
34
|
col.names = FALSE, row.names = FALSE
|
35
35
|
)
|
36
36
|
}
|
data/scripts/ani_distances.bash
CHANGED
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
22
22
|
|
23
23
|
# R-ify
|
24
24
|
cat <<R | R --vanilla
|
25
|
-
file <- gzfile(
|
26
|
-
ani <- read.table(file, sep =
|
27
|
-
|
28
|
-
if(sum(ani[,
|
29
|
-
h <- hist(ani[ani[,
|
30
|
-
len <- length(h[[
|
25
|
+
file <- gzfile("miga-project.txt.gz")
|
26
|
+
ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
|
27
|
+
saveRDS(ani, file = "miga-project.rds")
|
28
|
+
if(sum(ani[, "a"] != ani[, "b"]) > 0) {
|
29
|
+
h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[["breaks"]])
|
31
31
|
write.table(
|
32
|
-
cbind(h[[
|
33
|
-
file =
|
32
|
+
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
|
33
|
+
file = "miga-project.hist", quote = FALSE, sep = "\t",
|
34
34
|
col.names = FALSE, row.names = FALSE
|
35
35
|
)
|
36
36
|
}
|
data/scripts/haai_distances.bash
CHANGED
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
|
|
15
15
|
# No real need for hAAI distributions at all
|
16
16
|
echo -n "" > miga-project.log
|
17
17
|
echo -n "" > miga-project.txt
|
18
|
-
echo
|
18
|
+
echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
|
19
19
|
|
20
20
|
# Finalize
|
21
21
|
miga_end_project_step "$DIR"
|
data/test/project_test.rb
CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
|
|
82
82
|
def test_add_result
|
83
83
|
p1 = project
|
84
84
|
assert_nil(p1.add_result(:doom))
|
85
|
-
%w[.
|
85
|
+
%w[.rds .log .txt .done].each do |x|
|
86
86
|
assert_nil(p1.add_result(:haai_distances))
|
87
87
|
FileUtils.touch(
|
88
88
|
File.join(
|
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
|
|
117
117
|
# Project tasks
|
118
118
|
expected_files = {
|
119
119
|
project_stats: %w[.taxonomy.json .metadata.db],
|
120
|
-
haai_distances: %w[.
|
121
|
-
aai_distances: %w[.
|
122
|
-
ani_distances: %w[.
|
123
|
-
clade_finding: %w[
|
124
|
-
|
120
|
+
haai_distances: %w[.rds .log .txt],
|
121
|
+
aai_distances: %w[.rds .log .txt],
|
122
|
+
ani_distances: %w[.rds .log .txt],
|
123
|
+
clade_finding: %w[
|
124
|
+
.pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
|
125
|
+
],
|
125
126
|
subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
|
126
127
|
ogs: %w[.ogs .stats]
|
127
128
|
}
|
data/utils/find-medoid.R
CHANGED
@@ -5,26 +5,28 @@
|
|
5
5
|
#
|
6
6
|
|
7
7
|
#= Load stuff
|
8
|
-
argv <- commandArgs(trailingOnly =
|
8
|
+
argv <- commandArgs(trailingOnly = TRUE)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
if(Sys.getenv(
|
10
|
+
if(Sys.getenv("MIGA") == ""){
|
11
11
|
suppressPackageStartupMessages(library(enveomics.R))
|
12
12
|
}else{
|
13
|
-
source(file.path(
|
14
|
-
|
13
|
+
source(file.path(
|
14
|
+
Sys.getenv("MIGA"),
|
15
|
+
"utils", "enveomics", "enveomics.R", "R", "df2dist.R"
|
16
|
+
))
|
15
17
|
}
|
16
18
|
|
17
|
-
find_medoids <- function(ani.df, out, clades) {
|
19
|
+
find_medoids <- function (ani.df, out, clades) {
|
18
20
|
if(nrow(ani.df) == 0) return(NULL)
|
19
21
|
ani.df$d <- 1 - (ani.df$value/100)
|
20
|
-
dist <- enve.df2dist(ani.df,
|
22
|
+
dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
|
21
23
|
dist <- as.matrix(dist)
|
22
|
-
cl <- read.table(clades, header = FALSE, sep =
|
24
|
+
cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
|
23
25
|
cl.s <- c()
|
24
26
|
medoids <- c()
|
25
27
|
for(i in cl){
|
26
|
-
lab <- strsplit(i,
|
27
|
-
cat(
|
28
|
+
lab <- strsplit(i, ",")[[1]]
|
29
|
+
cat("Clade of:", lab[1], "\n")
|
28
30
|
if(length(lab) == 1) {
|
29
31
|
lab.s <- lab
|
30
32
|
} else {
|
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
|
|
32
34
|
}
|
33
35
|
med <- lab.s[1]
|
34
36
|
medoids <- c(medoids, med)
|
35
|
-
cl.s <- c(cl.s, paste(lab.s, collapse =
|
37
|
+
cl.s <- c(cl.s, paste(lab.s, collapse = ","))
|
36
38
|
}
|
37
39
|
write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
|
38
|
-
write.table(
|
39
|
-
|
40
|
+
write.table(
|
41
|
+
cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
|
42
|
+
row.names = FALSE, col.names = FALSE
|
43
|
+
)
|
40
44
|
}
|
41
45
|
|
42
46
|
#= Main
|
43
|
-
|
44
|
-
|
47
|
+
cat("Finding Medoids")
|
48
|
+
ani <- readRDS(argv[1])
|
45
49
|
find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
|
46
50
|
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -44,7 +44,7 @@ module MiGA::SubcladeRunner::Pipeline
|
|
44
44
|
# Find genomospecies medoids
|
45
45
|
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
46
46
|
dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
|
47
|
-
`Rscript '#{src}' ../../09.distances/#{dir}/miga-project.
|
47
|
+
`Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
|
48
48
|
miga-project.gsp-medoids miga-project.gsp-clades`
|
49
49
|
if File.exist? 'miga-project.gsp-clades.sorted'
|
50
50
|
File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
|
@@ -54,8 +54,6 @@ module MiGA::SubcladeRunner::Pipeline
|
|
54
54
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
55
55
|
File.open('miga-project.gsp-clades', 'r') do |ifh|
|
56
56
|
ifh.each_line do |ln|
|
57
|
-
next if $. == 1
|
58
|
-
|
59
57
|
r = ln.chomp.split(',')
|
60
58
|
ofh.puts r.join("\t") if r.size >= 5
|
61
59
|
end
|
data/utils/subclades.R
CHANGED
@@ -10,56 +10,51 @@ suppressPackageStartupMessages(library(ape))
|
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
12
12
|
suppressPackageStartupMessages(library(parallel))
|
13
|
-
if(Sys.getenv(
|
13
|
+
if(Sys.getenv("MIGA") == ""){
|
14
14
|
suppressPackageStartupMessages(library(enveomics.R))
|
15
15
|
}else{
|
16
|
-
source(file.path(
|
17
|
-
|
16
|
+
source(file.path(
|
17
|
+
Sys.getenv("MIGA"),
|
18
|
+
"utils", "enveomics", "enveomics.R", "R", "df2dist.R"
|
19
|
+
))
|
18
20
|
}
|
19
21
|
|
20
22
|
#= Main function
|
21
23
|
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
22
|
-
say(
|
24
|
+
say("==> Out base:", out_base, "<==")
|
23
25
|
|
24
26
|
# Normalize input matrix
|
25
|
-
|
26
|
-
if(!missing(ani_file)){
|
27
|
-
if(length(ani.d) == 0 && !file.exists(
|
27
|
+
dist_rds <- paste(out_base, "dist.rds", sep = ".")
|
28
|
+
if (!missing(ani_file)) {
|
29
|
+
if(length(ani.d) == 0 && !file.exists(dist_rds)){
|
28
30
|
# Read from ani_file
|
29
|
-
|
30
|
-
if(
|
31
|
+
ani.d <- ani_distance(ani_file, sel)
|
32
|
+
if (is.null(ani.d)) {
|
31
33
|
generate_empty_files(out_base)
|
32
34
|
return(NULL)
|
35
|
+
} else {
|
36
|
+
saveRDS(ani.d, dist_rds)
|
33
37
|
}
|
34
|
-
if(!is.na(sel) && file.exists(sel)){
|
35
|
-
say('Filter selection')
|
36
|
-
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
37
|
-
a <- a[a$a %in% lab & a$b %in% lab, ]
|
38
|
-
}
|
39
|
-
say('Distances')
|
40
|
-
a$d <- 1 - (a$value/100)
|
41
|
-
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
|
42
|
-
save(ani.d, file = dist_rdata)
|
43
38
|
}
|
44
39
|
}
|
45
40
|
|
46
41
|
# Read result if the subclade is ready, run it otherwise
|
47
|
-
if(file.exists(paste(out_base,
|
42
|
+
if (file.exists(paste(out_base, "classif", sep = "."))) {
|
48
43
|
say("Loading")
|
49
44
|
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
50
|
-
sep =
|
45
|
+
sep = " ", as.is = TRUE)[,1]
|
51
46
|
a <- read.table(paste(out_base, "classif", sep="."),
|
52
|
-
sep =
|
47
|
+
sep = "\t", as.is = TRUE)
|
53
48
|
ani.types <- a[,2]
|
54
49
|
names(ani.types) <- a[,1]
|
55
|
-
if(length(ani.d) == 0)
|
56
|
-
}else if(length(labels(ani.d)) > 8L){
|
57
|
-
res <- subclade_clustering(out_base, thr, ani.d,
|
58
|
-
if(length(res) == 0) return(NULL)
|
59
|
-
ani.medoids <- res[[
|
60
|
-
ani.types <- res[[
|
61
|
-
ani.d <- res[[
|
62
|
-
}else{
|
50
|
+
if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
|
51
|
+
} else if (length(labels(ani.d)) > 8L) {
|
52
|
+
res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
|
53
|
+
if (length(res) == 0) return(NULL)
|
54
|
+
ani.medoids <- res[["ani.medoids"]]
|
55
|
+
ani.types <- res[["ani.types"]]
|
56
|
+
ani.d <- res[["ani.d"]]
|
57
|
+
} else {
|
63
58
|
ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
|
64
59
|
ani.types <- rep(1, length(labels(ani.d)))
|
65
60
|
names(ani.types) <- labels(ani.d)
|
@@ -69,66 +64,80 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
69
64
|
|
70
65
|
# Recursive search
|
71
66
|
say("Recursive search")
|
72
|
-
for(i in 1:length(ani.medoids)){
|
67
|
+
for (i in 1:length(ani.medoids)) {
|
73
68
|
medoid <- ani.medoids[i]
|
74
69
|
ds_f <- names(ani.types)[ ani.types==i ]
|
75
70
|
say("Analyzing subclade", i, "with medoid:", medoid)
|
76
71
|
dir_f <- paste(out_base, ".sc-", i, sep="")
|
77
|
-
if(!dir.exists(dir_f)) dir.create(dir_f)
|
72
|
+
if (!dir.exists(dir_f)) dir.create(dir_f)
|
78
73
|
write.table(ds_f,
|
79
74
|
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
80
75
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
81
|
-
if(length(ds_f) > 8L){
|
76
|
+
if (length(ds_f) > 8L) {
|
82
77
|
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
83
|
-
subclades(
|
84
|
-
|
78
|
+
subclades(
|
79
|
+
out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
80
|
+
thr = thr,
|
81
|
+
ani.d = ani_subset
|
82
|
+
)
|
85
83
|
}
|
86
84
|
}
|
87
85
|
|
88
86
|
# Declare recursion up-to-here complete
|
89
|
-
write.table(
|
90
|
-
|
87
|
+
write.table(
|
88
|
+
date(), paste(out_base, "ready", sep = "."),
|
89
|
+
quote = FALSE, row.names = FALSE, col.names = FALSE
|
90
|
+
)
|
91
91
|
}
|
92
92
|
|
93
93
|
#= Heavy-lifter
|
94
|
-
subclade_clustering <- function(out_base, thr, ani.d,
|
94
|
+
subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
95
95
|
# Get ANI distances
|
96
|
-
if(length(ani.d) > 0){
|
97
|
-
# Just use ani.d (and save in
|
98
|
-
|
99
|
-
}else if(file.exists(
|
100
|
-
# Read from
|
101
|
-
|
102
|
-
}else{
|
96
|
+
if (length(ani.d) > 0) {
|
97
|
+
# Just use ani.d (and save in dist_rds)
|
98
|
+
if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
|
99
|
+
} else if (file.exists(dist_rds)) {
|
100
|
+
# Read from dist_rds
|
101
|
+
ani.d <- readRDS(dist_rds)
|
102
|
+
} else {
|
103
103
|
stop("Cannot find input matrix", out_base)
|
104
104
|
}
|
105
|
-
if(length(labels(ani.d)) <= 8L) return(list())
|
105
|
+
if (length(labels(ani.d)) <= 8L) return(list())
|
106
106
|
|
107
107
|
# Build tree
|
108
108
|
say("Tree")
|
109
109
|
ani.ph <- bionj(ani.d)
|
110
|
-
|
111
|
-
|
112
|
-
|
110
|
+
say("- Write")
|
111
|
+
express.ori <- options("expressions")$expressions
|
112
|
+
if(express.ori < ani.ph$Nnode * 4){
|
113
|
+
options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
|
113
114
|
}
|
114
|
-
write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
|
115
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
115
116
|
options(expressions=express.ori)
|
116
117
|
|
117
118
|
# Silhouette
|
118
119
|
say("Silhouette")
|
119
120
|
nn <- length(labels(ani.d))
|
120
121
|
k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
|
122
|
+
say("- Make cluster")
|
121
123
|
cl <- makeCluster(thr)
|
122
|
-
|
124
|
+
say("- Launch parallel jobs")
|
125
|
+
s <- parSapply(
|
126
|
+
cl, k,
|
127
|
+
function(x) {
|
123
128
|
library(cluster)
|
124
|
-
s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
|
125
|
-
c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
|
126
|
-
}
|
129
|
+
s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
|
130
|
+
c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
|
131
|
+
}
|
132
|
+
)
|
133
|
+
say("- Stop cluster")
|
127
134
|
stopCluster(cl)
|
128
|
-
|
129
|
-
s.
|
130
|
-
|
131
|
-
|
135
|
+
say("- Calculate custom criteria")
|
136
|
+
s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
|
137
|
+
s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
|
138
|
+
ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
|
139
|
+
if(mean(s[1,] < 0) < 0.75)
|
140
|
+
ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
|
132
141
|
top.n <- k[which.max(ds)]
|
133
142
|
|
134
143
|
# Classify genomes
|
@@ -139,8 +148,8 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
139
148
|
|
140
149
|
# Generate graphic report
|
141
150
|
say("Graphic report")
|
142
|
-
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
143
|
-
layout(matrix(c(1,
|
151
|
+
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
152
|
+
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
144
153
|
plot_distances(ani.d)
|
145
154
|
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
146
155
|
plot_clustering(ani.cl, ani.d, ani.types)
|
@@ -153,112 +162,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
153
162
|
# Return data
|
154
163
|
say("Cluster ready")
|
155
164
|
return(list(
|
156
|
-
ani.medoids=ani.medoids,
|
157
|
-
ani.types=ani.types,
|
158
|
-
ani.d=ani.d
|
165
|
+
ani.medoids = ani.medoids,
|
166
|
+
ani.types = ani.types,
|
167
|
+
ani.d = ani.d
|
159
168
|
))
|
160
169
|
}
|
161
170
|
|
162
171
|
#= Helper functions
|
163
|
-
say <- function(...) {
|
172
|
+
say <- function (...) {
|
173
|
+
message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
|
174
|
+
}
|
164
175
|
|
165
|
-
generate_empty_files <- function(out_base) {
|
166
|
-
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
167
|
-
plot(1, t="n", axes=F)
|
168
|
-
legend("center", "No data", bty="n")
|
176
|
+
generate_empty_files <- function (out_base) {
|
177
|
+
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
178
|
+
plot(1, t = "n", axes = F)
|
179
|
+
legend("center", "No data", bty = "n")
|
169
180
|
dev.off()
|
170
|
-
file.create(paste(out_base,".1.classif",sep=""))
|
171
|
-
file.create(paste(out_base,".1.medoids",sep=""))
|
181
|
+
file.create(paste(out_base, ".1.classif", sep = ""))
|
182
|
+
file.create(paste(out_base, ".1.medoids", sep = ""))
|
172
183
|
}
|
173
184
|
|
174
|
-
write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
|
175
|
-
say(
|
176
|
-
write.table(
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
185
|
+
write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
|
186
|
+
say("Text report")
|
187
|
+
write.table(
|
188
|
+
ani.medoids, paste(out_base, "medoids", sep = "."),
|
189
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE
|
190
|
+
)
|
191
|
+
classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
|
192
|
+
ani.d.m <- 100 - as.matrix(ani.d) * 100
|
193
|
+
for (j in 1:nrow(classif)) {
|
181
194
|
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
182
195
|
}
|
183
|
-
write.table(
|
184
|
-
|
196
|
+
write.table(
|
197
|
+
classif, paste(out_base, "classif", sep="."),
|
198
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
|
199
|
+
)
|
185
200
|
}
|
186
201
|
|
187
|
-
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
202
|
+
plot_silhouette <- function (k, s, ns, ds, top.n) {
|
188
203
|
# s
|
189
|
-
par(mar=c(4,5,1,5)+0.1)
|
190
|
-
plot(
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
204
|
+
par(mar = c(4,5,1,5)+0.1)
|
205
|
+
plot(
|
206
|
+
1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
|
207
|
+
ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
|
208
|
+
)
|
209
|
+
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
|
210
|
+
axis(2, fg = "grey60", col.axis = "grey60")
|
211
|
+
mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
|
212
|
+
|
195
213
|
# ns
|
196
|
-
par(new=TRUE)
|
197
|
-
plot(
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
214
|
+
par(new = TRUE)
|
215
|
+
plot(
|
216
|
+
1, t = "n", bty = "n",
|
217
|
+
xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
|
218
|
+
xlim = range(c(0,k)), ylim = range(ns)
|
219
|
+
)
|
220
|
+
points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
|
221
|
+
axis(4, fg = "darkred", col.axis = "darkred")
|
222
|
+
mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
|
223
|
+
|
202
224
|
# ds
|
203
|
-
par(new=TRUE)
|
204
|
-
plot(
|
205
|
-
|
225
|
+
par(new = TRUE)
|
226
|
+
plot(
|
227
|
+
1, t = "n", bty = "n",
|
228
|
+
xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
|
229
|
+
xlim = range(c(0,k)), ylim = range(ds)
|
230
|
+
)
|
206
231
|
lines(k, ds)
|
207
|
-
abline(v=top.n, lty=2)
|
232
|
+
abline(v = top.n, lty = 2)
|
208
233
|
}
|
209
234
|
|
210
|
-
plot_distances <- function(dist) {
|
211
|
-
par(mar=c(5,4,1,2)+0.1)
|
212
|
-
hist(
|
235
|
+
plot_distances <- function (dist) {
|
236
|
+
par(mar = c(5,4,1,2) + 0.1)
|
237
|
+
hist(
|
238
|
+
dist, border = NA, col = "grey60", breaks = 50,
|
239
|
+
xlab = "Distances", main = ""
|
240
|
+
)
|
213
241
|
}
|
214
242
|
|
215
|
-
plot_clustering <- function(cl, dist, types) {
|
216
|
-
par(mar=c(5,4,4,2)+0.1)
|
243
|
+
plot_clustering <- function (cl, dist, types) {
|
244
|
+
par(mar = c(5,4,4,2) + 0.1)
|
217
245
|
top.n <- length(cl$medoids)
|
218
246
|
col <- ggplotColours(top.n)
|
219
|
-
plot(silhouette(cl), col=col)
|
220
|
-
if(length(labels(dist))<=15){
|
221
|
-
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
222
|
-
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
223
|
-
}else{
|
224
|
-
ani.mds <- cmdscale(dist, k=4)
|
225
|
-
if(ncol(ani.mds)==4){
|
226
|
-
plot(
|
227
|
-
|
228
|
-
|
229
|
-
|
247
|
+
plot(silhouette(cl), col = col)
|
248
|
+
if (length(labels(dist)) <= 15) {
|
249
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
250
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
251
|
+
} else {
|
252
|
+
ani.mds <- cmdscale(dist, k = 4)
|
253
|
+
if (ncol(ani.mds) == 4) {
|
254
|
+
plot(
|
255
|
+
ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
|
256
|
+
xlab = "Component 1", ylab = "Component 2"
|
257
|
+
)
|
258
|
+
plot(
|
259
|
+
ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
|
260
|
+
xlab = "Component 3", ylab="Component 4"
|
261
|
+
)
|
230
262
|
}else{
|
231
|
-
|
232
|
-
|
263
|
+
for (i in 1:2)
|
264
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
233
265
|
}
|
234
266
|
}
|
235
267
|
}
|
236
268
|
|
237
|
-
plot_tree <- function(phy, types, medoids){
|
269
|
+
plot_tree <- function (phy, types, medoids) {
|
238
270
|
layout(1)
|
239
271
|
top.n <- length(unique(types))
|
240
272
|
col <- ggplotColours(top.n)
|
241
273
|
is.medoid <- phy$tip.label %in% medoids
|
242
|
-
phy$tip.label[is.medoid] <- paste(
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
274
|
+
phy$tip.label[is.medoid] <- paste(
|
275
|
+
phy$tip.label[is.medoid],
|
276
|
+
" [", types[phy$tip.label[is.medoid]], "]",
|
277
|
+
sep = ""
|
278
|
+
)
|
279
|
+
plot(
|
280
|
+
phy, cex = ifelse(is.medoid, 1/3, 1/6),
|
281
|
+
font = ifelse(is.medoid, 2, 1),
|
282
|
+
tip.color = col[types[phy$tip.label]]
|
283
|
+
)
|
284
|
+
}
|
285
|
+
|
286
|
+
ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
|
287
|
+
if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
|
288
|
+
hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
|
247
289
|
}
|
248
290
|
|
249
|
-
|
250
|
-
|
251
|
-
|
291
|
+
ani_distance <- function (ani_file, sel) {
|
292
|
+
# Try to locate rds, otherwise read gzipped table
|
293
|
+
rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
|
294
|
+
if (file.exists(rds)) {
|
295
|
+
sim <- readRDS(rds)
|
296
|
+
} else {
|
297
|
+
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
|
298
|
+
}
|
299
|
+
|
300
|
+
# If there is not data end process
|
301
|
+
if (nrow(sim) == 0) return(NULL)
|
302
|
+
|
303
|
+
# Apply filter (if requested)
|
304
|
+
if (!is.na(sel) && file.exists(sel)) {
|
305
|
+
say("Filter selection")
|
306
|
+
lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
307
|
+
sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
|
308
|
+
}
|
309
|
+
|
310
|
+
# Transform to distances
|
311
|
+
say("Distances")
|
312
|
+
sim$d <- 1 - (sim$value / 100)
|
313
|
+
return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
|
252
314
|
}
|
253
315
|
|
254
316
|
#= Main
|
255
317
|
options(warn = 1)
|
256
|
-
if(length(argv) >= 5 & argv[5] ==
|
318
|
+
if (length(argv) >= 5 & argv[5] == "empty") {
|
257
319
|
generate_empty_files(argv[2])
|
258
|
-
write.table(NULL, paste(argv[2], "medoids", sep="."))
|
259
|
-
write.table(NULL, paste(argv[2], "classif", sep="."))
|
260
|
-
write.table(date(), paste(argv[2], "ready", sep="."))
|
320
|
+
write.table(NULL, paste(argv[2], "medoids", sep = "."))
|
321
|
+
write.table(NULL, paste(argv[2], "classif", sep = "."))
|
322
|
+
write.table(date(), paste(argv[2], "ready", sep = "."))
|
261
323
|
}else{
|
262
|
-
subclades(
|
263
|
-
|
324
|
+
subclades(
|
325
|
+
ani_file = argv[1],
|
326
|
+
out_base = argv[2],
|
327
|
+
thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
|
328
|
+
sel = argv[4]
|
329
|
+
)
|
264
330
|
}
|
331
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-11-
|
11
|
+
date: 2021-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|