miga-base 1.1.2.1 → 1.1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/init/daemon_helper.rb +1 -1
- data/lib/miga/cli/action/init/files_helper.rb +1 -1
- data/lib/miga/dataset.rb +2 -2
- data/lib/miga/project/dataset.rb +6 -2
- data/lib/miga/project/result.rb +11 -8
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +8 -8
- data/scripts/ani_distances.bash +8 -8
- data/scripts/haai_distances.bash +1 -1
- data/test/project_test.rb +7 -6
- data/utils/find-medoid.R +18 -14
- data/utils/subclade/pipeline.rb +1 -3
- data/utils/subclades.R +195 -128
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 73de682930481bd837b829588081e2c9e70a87054e9e1d91b7d40bf319030349
|
4
|
+
data.tar.gz: b0ed9f7f1acf8fb2530fde84803938e8f3d7fac3400a629c3db88779dd9a679f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4477253800d6a04f3b8e612ed8f5af8ccdb47e5f6aff6efdf2539d252daa7cbc70d009f28ec75fcada925420590614589effce76e465a93d9cb2a7ce093d79ff
|
7
|
+
data.tar.gz: bbb998c715274dc6b000fa3fc4b9f1f550867b78707b4b5eaa43c24b692939769c2958c3beef0bc10f6dbfd9f38a63b143b2617e44208550b19aa837c45711aa
|
@@ -5,7 +5,7 @@
|
|
5
5
|
module MiGA::Cli::Action::Init::DaemonHelper
|
6
6
|
def configure_daemon
|
7
7
|
cli.puts 'Default daemon configuration:'
|
8
|
-
daemon_f = File.expand_path('.miga_daemon.json', ENV['
|
8
|
+
daemon_f = File.expand_path('.miga_daemon.json', ENV['MIGA_HOME'])
|
9
9
|
unless File.exist?(daemon_f) and cli.ask_user(
|
10
10
|
'A template daemon already exists, do you want to preserve it?',
|
11
11
|
'yes', %w(yes no)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
# Helper module with files configuration functions for MiGA::Cli::Action::Init
|
5
5
|
module MiGA::Cli::Action::Init::FilesHelper
|
6
6
|
def open_rc_file
|
7
|
-
rc_path = File.expand_path('.miga_rc', ENV['
|
7
|
+
rc_path = File.expand_path('.miga_rc', ENV['MIGA_HOME'])
|
8
8
|
if File.exist? rc_path
|
9
9
|
if cli.ask_user(
|
10
10
|
'I found a previous configuration. Do you want to continue?',
|
data/lib/miga/dataset.rb
CHANGED
@@ -107,7 +107,7 @@ class MiGA::Dataset < MiGA::MiGA
|
|
107
107
|
metadata[:warn] = "Inactive: #{reason}" unless reason.nil?
|
108
108
|
metadata[:inactive] = true
|
109
109
|
metadata.save
|
110
|
-
project.recalculate_tasks(
|
110
|
+
project.recalculate_tasks("Reference dataset inactivated: #{name}") if ref?
|
111
111
|
pull_hook :on_inactivate
|
112
112
|
end
|
113
113
|
|
@@ -117,7 +117,7 @@ class MiGA::Dataset < MiGA::MiGA
|
|
117
117
|
metadata[:inactive] = nil
|
118
118
|
metadata[:warn] = nil if metadata[:warn] && metadata[:warn] =~ /^Inactive: /
|
119
119
|
metadata.save
|
120
|
-
project.recalculate_tasks(
|
120
|
+
project.recalculate_tasks("Reference dataset activated: #{name}") if ref?
|
121
121
|
pull_hook :on_activate
|
122
122
|
end
|
123
123
|
|
data/lib/miga/project/dataset.rb
CHANGED
@@ -52,7 +52,9 @@ module MiGA::Project::Dataset
|
|
52
52
|
@metadata[:datasets] << name
|
53
53
|
@dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
|
54
54
|
save
|
55
|
-
|
55
|
+
if d.ref? && d.active?
|
56
|
+
recalculate_tasks("Reference dataset added: #{d.name}")
|
57
|
+
end
|
56
58
|
pull_hook(:on_add_dataset, name)
|
57
59
|
end
|
58
60
|
dataset(name)
|
@@ -66,7 +68,9 @@ module MiGA::Project::Dataset
|
|
66
68
|
|
67
69
|
self.metadata[:datasets].delete(name)
|
68
70
|
save
|
69
|
-
|
71
|
+
if d.ref? && d.active?
|
72
|
+
recalculate_tasks("Reference dataset unlinked: #{d.name}")
|
73
|
+
end
|
70
74
|
pull_hook(:on_unlink_dataset, name)
|
71
75
|
d
|
72
76
|
end
|
data/lib/miga/project/result.rb
CHANGED
@@ -55,10 +55,11 @@ module MiGA::Project::Result
|
|
55
55
|
##
|
56
56
|
# Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
|
57
57
|
def add_result_distances(base, _opts)
|
58
|
-
return nil unless result_files_exist?(base, %w[.
|
58
|
+
return nil unless result_files_exist?(base, %w[.rds .txt])
|
59
59
|
|
60
60
|
r = MiGA::Result.new("#{base}.json")
|
61
|
-
r.add_file(:
|
61
|
+
r.add_file(:rds, 'miga-project.rds')
|
62
|
+
r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
|
62
63
|
r.add_file(:matrix, 'miga-project.txt')
|
63
64
|
r.add_file(:log, 'miga-project.log') # Legacy file
|
64
65
|
r.add_file(:hist, 'miga-project.hist')
|
@@ -82,12 +83,13 @@ module MiGA::Project::Result
|
|
82
83
|
end
|
83
84
|
|
84
85
|
r = add_result_iter_clades(base)
|
85
|
-
r.add_file(:
|
86
|
-
r.add_file(:
|
87
|
-
r.add_file(:
|
88
|
-
r.add_file(:
|
89
|
-
r.add_file(:
|
90
|
-
r.add_file(:
|
86
|
+
r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
|
87
|
+
r.add_file(:aai_tree, 'miga-project.aai.nwk')
|
88
|
+
r.add_file(:proposal, 'miga-project.proposed-clades')
|
89
|
+
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
90
|
+
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
91
|
+
r.add_file(:clades_gsp, 'miga-project.gsp-clades')
|
92
|
+
r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
|
91
93
|
r
|
92
94
|
end
|
93
95
|
|
@@ -105,6 +107,7 @@ module MiGA::Project::Result
|
|
105
107
|
|
106
108
|
r = add_result_iter_clades(base)
|
107
109
|
r.add_file(:ani_tree, 'miga-project.ani.nwk')
|
110
|
+
r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
|
108
111
|
r
|
109
112
|
end
|
110
113
|
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.1,
|
15
|
+
VERSION = [1.1, 3, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2021, 11,
|
23
|
+
VERSION_DATE = Date.new(2021, 11, 21)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
22
22
|
|
23
23
|
# R-ify
|
24
24
|
cat <<R | R --vanilla
|
25
|
-
file <- gzfile(
|
26
|
-
aai <- read.table(file, sep =
|
27
|
-
|
28
|
-
if(sum(aai[,
|
29
|
-
h <- hist(aai[aai[,
|
30
|
-
len <- length(h[[
|
25
|
+
file <- gzfile("miga-project.txt.gz")
|
26
|
+
aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
|
27
|
+
saveRDS(aai, file = "miga-project.rds")
|
28
|
+
if(sum(aai[, "a"] != aai[, "b"]) > 0) {
|
29
|
+
h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[["breaks"]])
|
31
31
|
write.table(
|
32
|
-
cbind(h[[
|
33
|
-
file =
|
32
|
+
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
|
33
|
+
file = "miga-project.hist", quote = FALSE, sep = "\t",
|
34
34
|
col.names = FALSE, row.names = FALSE
|
35
35
|
)
|
36
36
|
}
|
data/scripts/ani_distances.bash
CHANGED
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
22
22
|
|
23
23
|
# R-ify
|
24
24
|
cat <<R | R --vanilla
|
25
|
-
file <- gzfile(
|
26
|
-
ani <- read.table(file, sep =
|
27
|
-
|
28
|
-
if(sum(ani[,
|
29
|
-
h <- hist(ani[ani[,
|
30
|
-
len <- length(h[[
|
25
|
+
file <- gzfile("miga-project.txt.gz")
|
26
|
+
ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
|
27
|
+
saveRDS(ani, file = "miga-project.rds")
|
28
|
+
if(sum(ani[, "a"] != ani[, "b"]) > 0) {
|
29
|
+
h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[["breaks"]])
|
31
31
|
write.table(
|
32
|
-
cbind(h[[
|
33
|
-
file =
|
32
|
+
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
|
33
|
+
file = "miga-project.hist", quote = FALSE, sep = "\t",
|
34
34
|
col.names = FALSE, row.names = FALSE
|
35
35
|
)
|
36
36
|
}
|
data/scripts/haai_distances.bash
CHANGED
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
|
|
15
15
|
# No real need for hAAI distributions at all
|
16
16
|
echo -n "" > miga-project.log
|
17
17
|
echo -n "" > miga-project.txt
|
18
|
-
echo
|
18
|
+
echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
|
19
19
|
|
20
20
|
# Finalize
|
21
21
|
miga_end_project_step "$DIR"
|
data/test/project_test.rb
CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
|
|
82
82
|
def test_add_result
|
83
83
|
p1 = project
|
84
84
|
assert_nil(p1.add_result(:doom))
|
85
|
-
%w[.
|
85
|
+
%w[.rds .log .txt .done].each do |x|
|
86
86
|
assert_nil(p1.add_result(:haai_distances))
|
87
87
|
FileUtils.touch(
|
88
88
|
File.join(
|
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
|
|
117
117
|
# Project tasks
|
118
118
|
expected_files = {
|
119
119
|
project_stats: %w[.taxonomy.json .metadata.db],
|
120
|
-
haai_distances: %w[.
|
121
|
-
aai_distances: %w[.
|
122
|
-
ani_distances: %w[.
|
123
|
-
clade_finding: %w[
|
124
|
-
|
120
|
+
haai_distances: %w[.rds .log .txt],
|
121
|
+
aai_distances: %w[.rds .log .txt],
|
122
|
+
ani_distances: %w[.rds .log .txt],
|
123
|
+
clade_finding: %w[
|
124
|
+
.pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
|
125
|
+
],
|
125
126
|
subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
|
126
127
|
ogs: %w[.ogs .stats]
|
127
128
|
}
|
data/utils/find-medoid.R
CHANGED
@@ -5,26 +5,28 @@
|
|
5
5
|
#
|
6
6
|
|
7
7
|
#= Load stuff
|
8
|
-
argv <- commandArgs(trailingOnly =
|
8
|
+
argv <- commandArgs(trailingOnly = TRUE)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
if(Sys.getenv(
|
10
|
+
if(Sys.getenv("MIGA") == ""){
|
11
11
|
suppressPackageStartupMessages(library(enveomics.R))
|
12
12
|
}else{
|
13
|
-
source(file.path(
|
14
|
-
|
13
|
+
source(file.path(
|
14
|
+
Sys.getenv("MIGA"),
|
15
|
+
"utils", "enveomics", "enveomics.R", "R", "df2dist.R"
|
16
|
+
))
|
15
17
|
}
|
16
18
|
|
17
|
-
find_medoids <- function(ani.df, out, clades) {
|
19
|
+
find_medoids <- function (ani.df, out, clades) {
|
18
20
|
if(nrow(ani.df) == 0) return(NULL)
|
19
21
|
ani.df$d <- 1 - (ani.df$value/100)
|
20
|
-
dist <- enve.df2dist(ani.df,
|
22
|
+
dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
|
21
23
|
dist <- as.matrix(dist)
|
22
|
-
cl <- read.table(clades, header = FALSE, sep =
|
24
|
+
cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
|
23
25
|
cl.s <- c()
|
24
26
|
medoids <- c()
|
25
27
|
for(i in cl){
|
26
|
-
lab <- strsplit(i,
|
27
|
-
cat(
|
28
|
+
lab <- strsplit(i, ",")[[1]]
|
29
|
+
cat("Clade of:", lab[1], "\n")
|
28
30
|
if(length(lab) == 1) {
|
29
31
|
lab.s <- lab
|
30
32
|
} else {
|
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
|
|
32
34
|
}
|
33
35
|
med <- lab.s[1]
|
34
36
|
medoids <- c(medoids, med)
|
35
|
-
cl.s <- c(cl.s, paste(lab.s, collapse =
|
37
|
+
cl.s <- c(cl.s, paste(lab.s, collapse = ","))
|
36
38
|
}
|
37
39
|
write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
|
38
|
-
write.table(
|
39
|
-
|
40
|
+
write.table(
|
41
|
+
cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
|
42
|
+
row.names = FALSE, col.names = FALSE
|
43
|
+
)
|
40
44
|
}
|
41
45
|
|
42
46
|
#= Main
|
43
|
-
|
44
|
-
|
47
|
+
cat("Finding Medoids")
|
48
|
+
ani <- readRDS(argv[1])
|
45
49
|
find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
|
46
50
|
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -44,7 +44,7 @@ module MiGA::SubcladeRunner::Pipeline
|
|
44
44
|
# Find genomospecies medoids
|
45
45
|
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
46
46
|
dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
|
47
|
-
`Rscript '#{src}' ../../09.distances/#{dir}/miga-project.
|
47
|
+
`Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
|
48
48
|
miga-project.gsp-medoids miga-project.gsp-clades`
|
49
49
|
if File.exist? 'miga-project.gsp-clades.sorted'
|
50
50
|
File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
|
@@ -54,8 +54,6 @@ module MiGA::SubcladeRunner::Pipeline
|
|
54
54
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
55
55
|
File.open('miga-project.gsp-clades', 'r') do |ifh|
|
56
56
|
ifh.each_line do |ln|
|
57
|
-
next if $. == 1
|
58
|
-
|
59
57
|
r = ln.chomp.split(',')
|
60
58
|
ofh.puts r.join("\t") if r.size >= 5
|
61
59
|
end
|
data/utils/subclades.R
CHANGED
@@ -10,56 +10,51 @@ suppressPackageStartupMessages(library(ape))
|
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
12
12
|
suppressPackageStartupMessages(library(parallel))
|
13
|
-
if(Sys.getenv(
|
13
|
+
if(Sys.getenv("MIGA") == ""){
|
14
14
|
suppressPackageStartupMessages(library(enveomics.R))
|
15
15
|
}else{
|
16
|
-
source(file.path(
|
17
|
-
|
16
|
+
source(file.path(
|
17
|
+
Sys.getenv("MIGA"),
|
18
|
+
"utils", "enveomics", "enveomics.R", "R", "df2dist.R"
|
19
|
+
))
|
18
20
|
}
|
19
21
|
|
20
22
|
#= Main function
|
21
23
|
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
22
|
-
say(
|
24
|
+
say("==> Out base:", out_base, "<==")
|
23
25
|
|
24
26
|
# Normalize input matrix
|
25
|
-
|
26
|
-
if(!missing(ani_file)){
|
27
|
-
if(length(ani.d) == 0 && !file.exists(
|
27
|
+
dist_rds <- paste(out_base, "dist.rds", sep = ".")
|
28
|
+
if (!missing(ani_file)) {
|
29
|
+
if(length(ani.d) == 0 && !file.exists(dist_rds)){
|
28
30
|
# Read from ani_file
|
29
|
-
|
30
|
-
if(
|
31
|
+
ani.d <- ani_distance(ani_file, sel)
|
32
|
+
if (is.null(ani.d)) {
|
31
33
|
generate_empty_files(out_base)
|
32
34
|
return(NULL)
|
35
|
+
} else {
|
36
|
+
saveRDS(ani.d, dist_rds)
|
33
37
|
}
|
34
|
-
if(!is.na(sel) && file.exists(sel)){
|
35
|
-
say('Filter selection')
|
36
|
-
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
37
|
-
a <- a[a$a %in% lab & a$b %in% lab, ]
|
38
|
-
}
|
39
|
-
say('Distances')
|
40
|
-
a$d <- 1 - (a$value/100)
|
41
|
-
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
|
42
|
-
save(ani.d, file = dist_rdata)
|
43
38
|
}
|
44
39
|
}
|
45
40
|
|
46
41
|
# Read result if the subclade is ready, run it otherwise
|
47
|
-
if(file.exists(paste(out_base,
|
42
|
+
if (file.exists(paste(out_base, "classif", sep = "."))) {
|
48
43
|
say("Loading")
|
49
44
|
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
50
|
-
sep =
|
45
|
+
sep = " ", as.is = TRUE)[,1]
|
51
46
|
a <- read.table(paste(out_base, "classif", sep="."),
|
52
|
-
sep =
|
47
|
+
sep = "\t", as.is = TRUE)
|
53
48
|
ani.types <- a[,2]
|
54
49
|
names(ani.types) <- a[,1]
|
55
|
-
if(length(ani.d) == 0)
|
56
|
-
}else if(length(labels(ani.d)) > 8L){
|
57
|
-
res <- subclade_clustering(out_base, thr, ani.d,
|
58
|
-
if(length(res) == 0) return(NULL)
|
59
|
-
ani.medoids <- res[[
|
60
|
-
ani.types <- res[[
|
61
|
-
ani.d <- res[[
|
62
|
-
}else{
|
50
|
+
if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
|
51
|
+
} else if (length(labels(ani.d)) > 8L) {
|
52
|
+
res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
|
53
|
+
if (length(res) == 0) return(NULL)
|
54
|
+
ani.medoids <- res[["ani.medoids"]]
|
55
|
+
ani.types <- res[["ani.types"]]
|
56
|
+
ani.d <- res[["ani.d"]]
|
57
|
+
} else {
|
63
58
|
ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
|
64
59
|
ani.types <- rep(1, length(labels(ani.d)))
|
65
60
|
names(ani.types) <- labels(ani.d)
|
@@ -69,66 +64,80 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
69
64
|
|
70
65
|
# Recursive search
|
71
66
|
say("Recursive search")
|
72
|
-
for(i in 1:length(ani.medoids)){
|
67
|
+
for (i in 1:length(ani.medoids)) {
|
73
68
|
medoid <- ani.medoids[i]
|
74
69
|
ds_f <- names(ani.types)[ ani.types==i ]
|
75
70
|
say("Analyzing subclade", i, "with medoid:", medoid)
|
76
71
|
dir_f <- paste(out_base, ".sc-", i, sep="")
|
77
|
-
if(!dir.exists(dir_f)) dir.create(dir_f)
|
72
|
+
if (!dir.exists(dir_f)) dir.create(dir_f)
|
78
73
|
write.table(ds_f,
|
79
74
|
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
80
75
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
81
|
-
if(length(ds_f) > 8L){
|
76
|
+
if (length(ds_f) > 8L) {
|
82
77
|
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
83
|
-
subclades(
|
84
|
-
|
78
|
+
subclades(
|
79
|
+
out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
80
|
+
thr = thr,
|
81
|
+
ani.d = ani_subset
|
82
|
+
)
|
85
83
|
}
|
86
84
|
}
|
87
85
|
|
88
86
|
# Declare recursion up-to-here complete
|
89
|
-
write.table(
|
90
|
-
|
87
|
+
write.table(
|
88
|
+
date(), paste(out_base, "ready", sep = "."),
|
89
|
+
quote = FALSE, row.names = FALSE, col.names = FALSE
|
90
|
+
)
|
91
91
|
}
|
92
92
|
|
93
93
|
#= Heavy-lifter
|
94
|
-
subclade_clustering <- function(out_base, thr, ani.d,
|
94
|
+
subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
95
95
|
# Get ANI distances
|
96
|
-
if(length(ani.d) > 0){
|
97
|
-
# Just use ani.d (and save in
|
98
|
-
|
99
|
-
}else if(file.exists(
|
100
|
-
# Read from
|
101
|
-
|
102
|
-
}else{
|
96
|
+
if (length(ani.d) > 0) {
|
97
|
+
# Just use ani.d (and save in dist_rds)
|
98
|
+
if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
|
99
|
+
} else if (file.exists(dist_rds)) {
|
100
|
+
# Read from dist_rds
|
101
|
+
ani.d <- readRDS(dist_rds)
|
102
|
+
} else {
|
103
103
|
stop("Cannot find input matrix", out_base)
|
104
104
|
}
|
105
|
-
if(length(labels(ani.d)) <= 8L) return(list())
|
105
|
+
if (length(labels(ani.d)) <= 8L) return(list())
|
106
106
|
|
107
107
|
# Build tree
|
108
108
|
say("Tree")
|
109
109
|
ani.ph <- bionj(ani.d)
|
110
|
-
|
111
|
-
|
112
|
-
|
110
|
+
say("- Write")
|
111
|
+
express.ori <- options("expressions")$expressions
|
112
|
+
if(express.ori < ani.ph$Nnode * 4){
|
113
|
+
options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
|
113
114
|
}
|
114
|
-
write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
|
115
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
115
116
|
options(expressions=express.ori)
|
116
117
|
|
117
118
|
# Silhouette
|
118
119
|
say("Silhouette")
|
119
120
|
nn <- length(labels(ani.d))
|
120
121
|
k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
|
122
|
+
say("- Make cluster")
|
121
123
|
cl <- makeCluster(thr)
|
122
|
-
|
124
|
+
say("- Launch parallel jobs")
|
125
|
+
s <- parSapply(
|
126
|
+
cl, k,
|
127
|
+
function(x) {
|
123
128
|
library(cluster)
|
124
|
-
s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
|
125
|
-
c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
|
126
|
-
}
|
129
|
+
s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
|
130
|
+
c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
|
131
|
+
}
|
132
|
+
)
|
133
|
+
say("- Stop cluster")
|
127
134
|
stopCluster(cl)
|
128
|
-
|
129
|
-
s.
|
130
|
-
|
131
|
-
|
135
|
+
say("- Calculate custom criteria")
|
136
|
+
s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
|
137
|
+
s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
|
138
|
+
ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
|
139
|
+
if(mean(s[1,] < 0) < 0.75)
|
140
|
+
ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
|
132
141
|
top.n <- k[which.max(ds)]
|
133
142
|
|
134
143
|
# Classify genomes
|
@@ -139,8 +148,8 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
139
148
|
|
140
149
|
# Generate graphic report
|
141
150
|
say("Graphic report")
|
142
|
-
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
143
|
-
layout(matrix(c(1,
|
151
|
+
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
152
|
+
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
144
153
|
plot_distances(ani.d)
|
145
154
|
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
146
155
|
plot_clustering(ani.cl, ani.d, ani.types)
|
@@ -153,112 +162,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
153
162
|
# Return data
|
154
163
|
say("Cluster ready")
|
155
164
|
return(list(
|
156
|
-
ani.medoids=ani.medoids,
|
157
|
-
ani.types=ani.types,
|
158
|
-
ani.d=ani.d
|
165
|
+
ani.medoids = ani.medoids,
|
166
|
+
ani.types = ani.types,
|
167
|
+
ani.d = ani.d
|
159
168
|
))
|
160
169
|
}
|
161
170
|
|
162
171
|
#= Helper functions
|
163
|
-
say <- function(...) {
|
172
|
+
say <- function (...) {
|
173
|
+
message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
|
174
|
+
}
|
164
175
|
|
165
|
-
generate_empty_files <- function(out_base) {
|
166
|
-
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
167
|
-
plot(1, t="n", axes=F)
|
168
|
-
legend("center", "No data", bty="n")
|
176
|
+
generate_empty_files <- function (out_base) {
|
177
|
+
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
178
|
+
plot(1, t = "n", axes = F)
|
179
|
+
legend("center", "No data", bty = "n")
|
169
180
|
dev.off()
|
170
|
-
file.create(paste(out_base,".1.classif",sep=""))
|
171
|
-
file.create(paste(out_base,".1.medoids",sep=""))
|
181
|
+
file.create(paste(out_base, ".1.classif", sep = ""))
|
182
|
+
file.create(paste(out_base, ".1.medoids", sep = ""))
|
172
183
|
}
|
173
184
|
|
174
|
-
write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
|
175
|
-
say(
|
176
|
-
write.table(
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
185
|
+
write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
|
186
|
+
say("Text report")
|
187
|
+
write.table(
|
188
|
+
ani.medoids, paste(out_base, "medoids", sep = "."),
|
189
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE
|
190
|
+
)
|
191
|
+
classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
|
192
|
+
ani.d.m <- 100 - as.matrix(ani.d) * 100
|
193
|
+
for (j in 1:nrow(classif)) {
|
181
194
|
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
182
195
|
}
|
183
|
-
write.table(
|
184
|
-
|
196
|
+
write.table(
|
197
|
+
classif, paste(out_base, "classif", sep="."),
|
198
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
|
199
|
+
)
|
185
200
|
}
|
186
201
|
|
187
|
-
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
202
|
+
plot_silhouette <- function (k, s, ns, ds, top.n) {
|
188
203
|
# s
|
189
|
-
par(mar=c(4,5,1,5)+0.1)
|
190
|
-
plot(
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
204
|
+
par(mar = c(4,5,1,5)+0.1)
|
205
|
+
plot(
|
206
|
+
1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
|
207
|
+
ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
|
208
|
+
)
|
209
|
+
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
|
210
|
+
axis(2, fg = "grey60", col.axis = "grey60")
|
211
|
+
mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
|
212
|
+
|
195
213
|
# ns
|
196
|
-
par(new=TRUE)
|
197
|
-
plot(
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
214
|
+
par(new = TRUE)
|
215
|
+
plot(
|
216
|
+
1, t = "n", bty = "n",
|
217
|
+
xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
|
218
|
+
xlim = range(c(0,k)), ylim = range(ns)
|
219
|
+
)
|
220
|
+
points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
|
221
|
+
axis(4, fg = "darkred", col.axis = "darkred")
|
222
|
+
mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
|
223
|
+
|
202
224
|
# ds
|
203
|
-
par(new=TRUE)
|
204
|
-
plot(
|
205
|
-
|
225
|
+
par(new = TRUE)
|
226
|
+
plot(
|
227
|
+
1, t = "n", bty = "n",
|
228
|
+
xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
|
229
|
+
xlim = range(c(0,k)), ylim = range(ds)
|
230
|
+
)
|
206
231
|
lines(k, ds)
|
207
|
-
abline(v=top.n, lty=2)
|
232
|
+
abline(v = top.n, lty = 2)
|
208
233
|
}
|
209
234
|
|
210
|
-
plot_distances <- function(dist) {
|
211
|
-
par(mar=c(5,4,1,2)+0.1)
|
212
|
-
hist(
|
235
|
+
plot_distances <- function (dist) {
|
236
|
+
par(mar = c(5,4,1,2) + 0.1)
|
237
|
+
hist(
|
238
|
+
dist, border = NA, col = "grey60", breaks = 50,
|
239
|
+
xlab = "Distances", main = ""
|
240
|
+
)
|
213
241
|
}
|
214
242
|
|
215
|
-
plot_clustering <- function(cl, dist, types) {
|
216
|
-
par(mar=c(5,4,4,2)+0.1)
|
243
|
+
plot_clustering <- function (cl, dist, types) {
|
244
|
+
par(mar = c(5,4,4,2) + 0.1)
|
217
245
|
top.n <- length(cl$medoids)
|
218
246
|
col <- ggplotColours(top.n)
|
219
|
-
plot(silhouette(cl), col=col)
|
220
|
-
if(length(labels(dist))<=15){
|
221
|
-
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
222
|
-
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
223
|
-
}else{
|
224
|
-
ani.mds <- cmdscale(dist, k=4)
|
225
|
-
if(ncol(ani.mds)==4){
|
226
|
-
plot(
|
227
|
-
|
228
|
-
|
229
|
-
|
247
|
+
plot(silhouette(cl), col = col)
|
248
|
+
if (length(labels(dist)) <= 15) {
|
249
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
250
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
251
|
+
} else {
|
252
|
+
ani.mds <- cmdscale(dist, k = 4)
|
253
|
+
if (ncol(ani.mds) == 4) {
|
254
|
+
plot(
|
255
|
+
ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
|
256
|
+
xlab = "Component 1", ylab = "Component 2"
|
257
|
+
)
|
258
|
+
plot(
|
259
|
+
ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
|
260
|
+
xlab = "Component 3", ylab="Component 4"
|
261
|
+
)
|
230
262
|
}else{
|
231
|
-
|
232
|
-
|
263
|
+
for (i in 1:2)
|
264
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
233
265
|
}
|
234
266
|
}
|
235
267
|
}
|
236
268
|
|
237
|
-
plot_tree <- function(phy, types, medoids){
|
269
|
+
plot_tree <- function (phy, types, medoids) {
|
238
270
|
layout(1)
|
239
271
|
top.n <- length(unique(types))
|
240
272
|
col <- ggplotColours(top.n)
|
241
273
|
is.medoid <- phy$tip.label %in% medoids
|
242
|
-
phy$tip.label[is.medoid] <- paste(
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
274
|
+
phy$tip.label[is.medoid] <- paste(
|
275
|
+
phy$tip.label[is.medoid],
|
276
|
+
" [", types[phy$tip.label[is.medoid]], "]",
|
277
|
+
sep = ""
|
278
|
+
)
|
279
|
+
plot(
|
280
|
+
phy, cex = ifelse(is.medoid, 1/3, 1/6),
|
281
|
+
font = ifelse(is.medoid, 2, 1),
|
282
|
+
tip.color = col[types[phy$tip.label]]
|
283
|
+
)
|
284
|
+
}
|
285
|
+
|
286
|
+
ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
|
287
|
+
if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
|
288
|
+
hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
|
247
289
|
}
|
248
290
|
|
249
|
-
|
250
|
-
|
251
|
-
|
291
|
+
ani_distance <- function (ani_file, sel) {
|
292
|
+
# Try to locate rds, otherwise read gzipped table
|
293
|
+
rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
|
294
|
+
if (file.exists(rds)) {
|
295
|
+
sim <- readRDS(rds)
|
296
|
+
} else {
|
297
|
+
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
|
298
|
+
}
|
299
|
+
|
300
|
+
# If there is no data, end the process
|
301
|
+
if (nrow(sim) == 0) return(NULL)
|
302
|
+
|
303
|
+
# Apply filter (if requested)
|
304
|
+
if (!is.na(sel) && file.exists(sel)) {
|
305
|
+
say("Filter selection")
|
306
|
+
lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
307
|
+
sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
|
308
|
+
}
|
309
|
+
|
310
|
+
# Transform to distances
|
311
|
+
say("Distances")
|
312
|
+
sim$d <- 1 - (sim$value / 100)
|
313
|
+
return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
|
252
314
|
}
|
253
315
|
|
254
316
|
#= Main
|
255
317
|
options(warn = 1)
|
256
|
-
if(length(argv) >= 5 & argv[5] ==
|
318
|
+
if (length(argv) >= 5 & argv[5] == "empty") {
|
257
319
|
generate_empty_files(argv[2])
|
258
|
-
write.table(NULL, paste(argv[2], "medoids", sep="."))
|
259
|
-
write.table(NULL, paste(argv[2], "classif", sep="."))
|
260
|
-
write.table(date(), paste(argv[2], "ready", sep="."))
|
320
|
+
write.table(NULL, paste(argv[2], "medoids", sep = "."))
|
321
|
+
write.table(NULL, paste(argv[2], "classif", sep = "."))
|
322
|
+
write.table(date(), paste(argv[2], "ready", sep = "."))
|
261
323
|
}else{
|
262
|
-
subclades(
|
263
|
-
|
324
|
+
subclades(
|
325
|
+
ani_file = argv[1],
|
326
|
+
out_base = argv[2],
|
327
|
+
thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
|
328
|
+
sel = argv[4]
|
329
|
+
)
|
264
330
|
}
|
331
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-11-
|
11
|
+
date: 2021-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|