miga-base 1.1.2.2 → 1.1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/init/daemon_helper.rb +1 -1
- data/lib/miga/cli/action/init/files_helper.rb +1 -1
- data/lib/miga/project/result.rb +11 -8
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +8 -8
- data/scripts/ani_distances.bash +8 -8
- data/scripts/haai_distances.bash +1 -1
- data/test/project_test.rb +7 -6
- data/utils/FastAAI/FastAAI +14 -4
- data/utils/distance/commands.rb +5 -2
- data/utils/distance/database.rb +2 -0
- data/utils/find-medoid.R +18 -14
- data/utils/subclade/pipeline.rb +1 -3
- data/utils/subclades.R +195 -128
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d9d8a12b9eaa48b63df43b871f3cc7da598997f3d208cde9fcf31f2d605d66c
|
4
|
+
data.tar.gz: 72d58adbbbea43886e1e60a608f3f7e0da542c26b3b29ecf202a3b34b9f8ac35
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cc2c81a38d915c7bb13d53853e0be3c822a0ba7703d2e97cf16615ac4d8f8b4fc3203bc83b76584b48ddd7f13787545d5327d9dd5cd846bbc1be1db868b120a4
|
7
|
+
data.tar.gz: 8cf8bd65e94e6bb551143ebb4fbb4ca37a74e58f777011e3c2c5823f7c9b6a242de63ddda1c496d9c91df01f2471446989267828b69e0de7fd024d3e84d70afa
|
@@ -5,7 +5,7 @@
|
|
5
5
|
module MiGA::Cli::Action::Init::DaemonHelper
|
6
6
|
def configure_daemon
|
7
7
|
cli.puts 'Default daemon configuration:'
|
8
|
-
daemon_f = File.expand_path('.miga_daemon.json', ENV['
|
8
|
+
daemon_f = File.expand_path('.miga_daemon.json', ENV['MIGA_HOME'])
|
9
9
|
unless File.exist?(daemon_f) and cli.ask_user(
|
10
10
|
'A template daemon already exists, do you want to preserve it?',
|
11
11
|
'yes', %w(yes no)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
# Helper module with files configuration functions for MiGA::Cli::Action::Init
|
5
5
|
module MiGA::Cli::Action::Init::FilesHelper
|
6
6
|
def open_rc_file
|
7
|
-
rc_path = File.expand_path('.miga_rc', ENV['
|
7
|
+
rc_path = File.expand_path('.miga_rc', ENV['MIGA_HOME'])
|
8
8
|
if File.exist? rc_path
|
9
9
|
if cli.ask_user(
|
10
10
|
'I found a previous configuration. Do you want to continue?',
|
data/lib/miga/project/result.rb
CHANGED
@@ -55,10 +55,11 @@ module MiGA::Project::Result
|
|
55
55
|
##
|
56
56
|
# Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
|
57
57
|
def add_result_distances(base, _opts)
|
58
|
-
return nil unless result_files_exist?(base, %w[.
|
58
|
+
return nil unless result_files_exist?(base, %w[.rds .txt])
|
59
59
|
|
60
60
|
r = MiGA::Result.new("#{base}.json")
|
61
|
-
r.add_file(:
|
61
|
+
r.add_file(:rds, 'miga-project.rds')
|
62
|
+
r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
|
62
63
|
r.add_file(:matrix, 'miga-project.txt')
|
63
64
|
r.add_file(:log, 'miga-project.log') # Legacy file
|
64
65
|
r.add_file(:hist, 'miga-project.hist')
|
@@ -82,12 +83,13 @@ module MiGA::Project::Result
|
|
82
83
|
end
|
83
84
|
|
84
85
|
r = add_result_iter_clades(base)
|
85
|
-
r.add_file(:
|
86
|
-
r.add_file(:
|
87
|
-
r.add_file(:
|
88
|
-
r.add_file(:
|
89
|
-
r.add_file(:
|
90
|
-
r.add_file(:
|
86
|
+
r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
|
87
|
+
r.add_file(:aai_tree, 'miga-project.aai.nwk')
|
88
|
+
r.add_file(:proposal, 'miga-project.proposed-clades')
|
89
|
+
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
|
90
|
+
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
|
91
|
+
r.add_file(:clades_gsp, 'miga-project.gsp-clades')
|
92
|
+
r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
|
91
93
|
r
|
92
94
|
end
|
93
95
|
|
@@ -105,6 +107,7 @@ module MiGA::Project::Result
|
|
105
107
|
|
106
108
|
r = add_result_iter_clades(base)
|
107
109
|
r.add_file(:ani_tree, 'miga-project.ani.nwk')
|
110
|
+
r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
|
108
111
|
r
|
109
112
|
end
|
110
113
|
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.1,
|
15
|
+
VERSION = [1.1, 3, 1].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2021, 11,
|
23
|
+
VERSION_DATE = Date.new(2021, 11, 24)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
22
22
|
|
23
23
|
# R-ify
|
24
24
|
cat <<R | R --vanilla
|
25
|
-
file <- gzfile(
|
26
|
-
aai <- read.table(file, sep =
|
27
|
-
|
28
|
-
if(sum(aai[,
|
29
|
-
h <- hist(aai[aai[,
|
30
|
-
len <- length(h[[
|
25
|
+
file <- gzfile("miga-project.txt.gz")
|
26
|
+
aai <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
|
27
|
+
saveRDS(aai, file = "miga-project.rds")
|
28
|
+
if(sum(aai[, "a"] != aai[, "b"]) > 0) {
|
29
|
+
h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[["breaks"]])
|
31
31
|
write.table(
|
32
|
-
cbind(h[[
|
33
|
-
file =
|
32
|
+
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
|
33
|
+
file = "miga-project.hist", quote = FALSE, sep = "\t",
|
34
34
|
col.names = FALSE, row.names = FALSE
|
35
35
|
)
|
36
36
|
}
|
data/scripts/ani_distances.bash
CHANGED
@@ -22,15 +22,15 @@ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
|
22
22
|
|
23
23
|
# R-ify
|
24
24
|
cat <<R | R --vanilla
|
25
|
-
file <- gzfile(
|
26
|
-
ani <- read.table(file, sep =
|
27
|
-
|
28
|
-
if(sum(ani[,
|
29
|
-
h <- hist(ani[ani[,
|
30
|
-
len <- length(h[[
|
25
|
+
file <- gzfile("miga-project.txt.gz")
|
26
|
+
ani <- read.table(file, sep = "\t", header = TRUE, as.is = TRUE)
|
27
|
+
saveRDS(ani, file = "miga-project.rds")
|
28
|
+
if(sum(ani[, "a"] != ani[, "b"]) > 0) {
|
29
|
+
h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[["breaks"]])
|
31
31
|
write.table(
|
32
|
-
cbind(h[[
|
33
|
-
file =
|
32
|
+
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
|
33
|
+
file = "miga-project.hist", quote = FALSE, sep = "\t",
|
34
34
|
col.names = FALSE, row.names = FALSE
|
35
35
|
)
|
36
36
|
}
|
data/scripts/haai_distances.bash
CHANGED
@@ -15,7 +15,7 @@ ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
|
|
15
15
|
# No real need for hAAI distributions at all
|
16
16
|
echo -n "" > miga-project.log
|
17
17
|
echo -n "" > miga-project.txt
|
18
|
-
echo
|
18
|
+
echo 'aai <- NULL; saveRDS(aai, file = "miga-project.rds")' | R --vanilla
|
19
19
|
|
20
20
|
# Finalize
|
21
21
|
miga_end_project_step "$DIR"
|
data/test/project_test.rb
CHANGED
@@ -82,7 +82,7 @@ class ProjectTest < Test::Unit::TestCase
|
|
82
82
|
def test_add_result
|
83
83
|
p1 = project
|
84
84
|
assert_nil(p1.add_result(:doom))
|
85
|
-
%w[.
|
85
|
+
%w[.rds .log .txt .done].each do |x|
|
86
86
|
assert_nil(p1.add_result(:haai_distances))
|
87
87
|
FileUtils.touch(
|
88
88
|
File.join(
|
@@ -117,11 +117,12 @@ class ProjectTest < Test::Unit::TestCase
|
|
117
117
|
# Project tasks
|
118
118
|
expected_files = {
|
119
119
|
project_stats: %w[.taxonomy.json .metadata.db],
|
120
|
-
haai_distances: %w[.
|
121
|
-
aai_distances: %w[.
|
122
|
-
ani_distances: %w[.
|
123
|
-
clade_finding: %w[
|
124
|
-
|
120
|
+
haai_distances: %w[.rds .log .txt],
|
121
|
+
aai_distances: %w[.rds .log .txt],
|
122
|
+
ani_distances: %w[.rds .log .txt],
|
123
|
+
clade_finding: %w[
|
124
|
+
.pdf .classif .medoids .class.tsv .class.nwk .proposed-clades
|
125
|
+
],
|
125
126
|
subclades: %w[.pdf .classif .medoids .class.tsv .class.nwk],
|
126
127
|
ogs: %w[.ogs .stats]
|
127
128
|
}
|
data/utils/FastAAI/FastAAI
CHANGED
@@ -2702,6 +2702,8 @@ def merge_db_opts():
|
|
2702
2702
|
|
2703
2703
|
parser.add_argument('-d', '--donors', dest = 'donors', default = None, help = 'Comma-separated string of paths to one or more donor databases. The genomes FROM the donors will be added TO the recipient and the donors will be unaltered')
|
2704
2704
|
|
2705
|
+
parser.add_argument('--donor_file', dest = 'donor_file', default = None, help = 'Alternative way to supply donors. A file containing paths to the donor databases, 1 per line')
|
2706
|
+
|
2705
2707
|
parser.add_argument('-r', '--recipient', dest = 'recipient', default = None, help = 'Path to the recipient database. Any genomes FROM the donor database not already in the recipient will be added to this database.')
|
2706
2708
|
|
2707
2709
|
parser.add_argument('--verbose', dest = 'verbose', action='store_true', help = 'Print minor updates to console. Major updates are printed regardless.')
|
@@ -2720,16 +2722,23 @@ def merge_db_thread_starter(rev_index, per_db_accs):
|
|
2720
2722
|
|
2721
2723
|
|
2722
2724
|
|
2723
|
-
def merge_db(recipient, donors, verbose, threads):
|
2725
|
+
def merge_db(recipient, donors, donor_file, verbose, threads):
|
2724
2726
|
#Prettier on the CLI
|
2725
|
-
|
2727
|
+
|
2728
|
+
if donor_file is not None:
|
2729
|
+
fh = agnostic_reader(donor_file)
|
2730
|
+
donors = [line.strip() for line in fh]
|
2731
|
+
fh.close()
|
2732
|
+
|
2726
2733
|
if donors is None or recipient is None:
|
2727
2734
|
print("Either donor or target not given. FastAAI is exiting.")
|
2728
2735
|
return None
|
2729
2736
|
|
2730
2737
|
print("")
|
2731
2738
|
|
2732
|
-
|
2739
|
+
if donor_file is None:
|
2740
|
+
donors = donors.split(",")
|
2741
|
+
|
2733
2742
|
valid_donors = []
|
2734
2743
|
for d in donors:
|
2735
2744
|
if os.path.exists(d):
|
@@ -3454,10 +3463,11 @@ def main():
|
|
3454
3463
|
|
3455
3464
|
recipient = opts.recipient
|
3456
3465
|
donors = opts.donors
|
3466
|
+
donor_file = opts.donor_file
|
3457
3467
|
verbose = opts.verbose
|
3458
3468
|
threads = opts.threads
|
3459
3469
|
|
3460
|
-
merge_db(recipient, donors, verbose, threads)
|
3470
|
+
merge_db(recipient, donors, donor_file, verbose, threads)
|
3461
3471
|
|
3462
3472
|
#################### Query files vs DB ########################
|
3463
3473
|
|
data/utils/distance/commands.rb
CHANGED
@@ -151,8 +151,11 @@ module MiGA::DistanceRunner::Commands
|
|
151
151
|
donors << tgt_idx if tgt_idx
|
152
152
|
end
|
153
153
|
return nil if donors.empty?
|
154
|
+
|
155
|
+
# Build target database
|
156
|
+
File.open(f0 = tmp_file, 'w') { |fh| donors.each { |i| fh.puts i } }
|
154
157
|
run_cmd <<~CMD
|
155
|
-
FastAAI merge_db --
|
158
|
+
FastAAI merge_db --donor_file "#{f0}" \
|
156
159
|
--recipient "#{f1 = tmp_file}" --threads #{opts[:thr]}
|
157
160
|
CMD
|
158
161
|
|
@@ -166,7 +169,7 @@ module MiGA::DistanceRunner::Commands
|
|
166
169
|
# Save values in the databases
|
167
170
|
haai_data = {}
|
168
171
|
aai_data = {}
|
169
|
-
# Ugly workaround to the insistence of FastAAI to
|
172
|
+
# Ugly workaround to the insistence of FastAAI not to provide the files
|
170
173
|
# I ask for ;-)
|
171
174
|
qry_results = File.basename(qry_idx, '.faix') + '_results.txt'
|
172
175
|
out_file = File.join(f2, 'results', qry_results)
|
data/utils/distance/database.rb
CHANGED
@@ -127,6 +127,7 @@ module MiGA::DistanceRunner::Database
|
|
127
127
|
db = tmp_dbs[metric]
|
128
128
|
table = metric == :haai ? :aai : metric
|
129
129
|
SQLite3::Database.new(db) do |conn|
|
130
|
+
conn.execute('BEGIN TRANSACTION')
|
130
131
|
data.each do |k, v|
|
131
132
|
sql = <<~SQL
|
132
133
|
insert into #{table} (
|
@@ -135,6 +136,7 @@ module MiGA::DistanceRunner::Database
|
|
135
136
|
SQL
|
136
137
|
conn.execute(sql, [dataset.name, k] + v)
|
137
138
|
end
|
139
|
+
conn.execute('COMMIT')
|
138
140
|
end
|
139
141
|
checkpoint(metric)
|
140
142
|
end
|
data/utils/find-medoid.R
CHANGED
@@ -5,26 +5,28 @@
|
|
5
5
|
#
|
6
6
|
|
7
7
|
#= Load stuff
|
8
|
-
argv <- commandArgs(trailingOnly =
|
8
|
+
argv <- commandArgs(trailingOnly = TRUE)
|
9
9
|
suppressPackageStartupMessages(library(ape))
|
10
|
-
if(Sys.getenv(
|
10
|
+
if(Sys.getenv("MIGA") == ""){
|
11
11
|
suppressPackageStartupMessages(library(enveomics.R))
|
12
12
|
}else{
|
13
|
-
source(file.path(
|
14
|
-
|
13
|
+
source(file.path(
|
14
|
+
Sys.getenv("MIGA"),
|
15
|
+
"utils", "enveomics", "enveomics.R", "R", "df2dist.R"
|
16
|
+
))
|
15
17
|
}
|
16
18
|
|
17
|
-
find_medoids <- function(ani.df, out, clades) {
|
19
|
+
find_medoids <- function (ani.df, out, clades) {
|
18
20
|
if(nrow(ani.df) == 0) return(NULL)
|
19
21
|
ani.df$d <- 1 - (ani.df$value/100)
|
20
|
-
dist <- enve.df2dist(ani.df,
|
22
|
+
dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
|
21
23
|
dist <- as.matrix(dist)
|
22
|
-
cl <- read.table(clades, header = FALSE, sep =
|
24
|
+
cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
|
23
25
|
cl.s <- c()
|
24
26
|
medoids <- c()
|
25
27
|
for(i in cl){
|
26
|
-
lab <- strsplit(i,
|
27
|
-
cat(
|
28
|
+
lab <- strsplit(i, ",")[[1]]
|
29
|
+
cat("Clade of:", lab[1], "\n")
|
28
30
|
if(length(lab) == 1) {
|
29
31
|
lab.s <- lab
|
30
32
|
} else {
|
@@ -32,15 +34,17 @@ find_medoids <- function(ani.df, out, clades) {
|
|
32
34
|
}
|
33
35
|
med <- lab.s[1]
|
34
36
|
medoids <- c(medoids, med)
|
35
|
-
cl.s <- c(cl.s, paste(lab.s, collapse =
|
37
|
+
cl.s <- c(cl.s, paste(lab.s, collapse = ","))
|
36
38
|
}
|
37
39
|
write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
|
38
|
-
write.table(
|
39
|
-
|
40
|
+
write.table(
|
41
|
+
cl.s, paste(clades, ".sorted", sep = ""), quote = FALSE,
|
42
|
+
row.names = FALSE, col.names = FALSE
|
43
|
+
)
|
40
44
|
}
|
41
45
|
|
42
46
|
#= Main
|
43
|
-
|
44
|
-
|
47
|
+
cat("Finding Medoids")
|
48
|
+
ani <- readRDS(argv[1])
|
45
49
|
find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
|
46
50
|
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -44,7 +44,7 @@ module MiGA::SubcladeRunner::Pipeline
|
|
44
44
|
# Find genomospecies medoids
|
45
45
|
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
|
46
46
|
dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
|
47
|
-
`Rscript '#{src}' ../../09.distances/#{dir}/miga-project.
|
47
|
+
`Rscript '#{src}' '../../09.distances/#{dir}/miga-project.rds' \
|
48
48
|
miga-project.gsp-medoids miga-project.gsp-clades`
|
49
49
|
if File.exist? 'miga-project.gsp-clades.sorted'
|
50
50
|
File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
|
@@ -54,8 +54,6 @@ module MiGA::SubcladeRunner::Pipeline
|
|
54
54
|
ofh = File.open('miga-project.proposed-clades', 'w')
|
55
55
|
File.open('miga-project.gsp-clades', 'r') do |ifh|
|
56
56
|
ifh.each_line do |ln|
|
57
|
-
next if $. == 1
|
58
|
-
|
59
57
|
r = ln.chomp.split(',')
|
60
58
|
ofh.puts r.join("\t") if r.size >= 5
|
61
59
|
end
|
data/utils/subclades.R
CHANGED
@@ -10,56 +10,51 @@ suppressPackageStartupMessages(library(ape))
|
|
10
10
|
suppressPackageStartupMessages(library(vegan))
|
11
11
|
suppressPackageStartupMessages(library(cluster))
|
12
12
|
suppressPackageStartupMessages(library(parallel))
|
13
|
-
if(Sys.getenv(
|
13
|
+
if(Sys.getenv("MIGA") == ""){
|
14
14
|
suppressPackageStartupMessages(library(enveomics.R))
|
15
15
|
}else{
|
16
|
-
source(file.path(
|
17
|
-
|
16
|
+
source(file.path(
|
17
|
+
Sys.getenv("MIGA"),
|
18
|
+
"utils", "enveomics", "enveomics.R", "R", "df2dist.R"
|
19
|
+
))
|
18
20
|
}
|
19
21
|
|
20
22
|
#= Main function
|
21
23
|
subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
22
|
-
say(
|
24
|
+
say("==> Out base:", out_base, "<==")
|
23
25
|
|
24
26
|
# Normalize input matrix
|
25
|
-
|
26
|
-
if(!missing(ani_file)){
|
27
|
-
if(length(ani.d) == 0 && !file.exists(
|
27
|
+
dist_rds <- paste(out_base, "dist.rds", sep = ".")
|
28
|
+
if (!missing(ani_file)) {
|
29
|
+
if(length(ani.d) == 0 && !file.exists(dist_rds)){
|
28
30
|
# Read from ani_file
|
29
|
-
|
30
|
-
if(
|
31
|
+
ani.d <- ani_distance(ani_file, sel)
|
32
|
+
if (is.null(ani.d)) {
|
31
33
|
generate_empty_files(out_base)
|
32
34
|
return(NULL)
|
35
|
+
} else {
|
36
|
+
saveRDS(ani.d, dist_rds)
|
33
37
|
}
|
34
|
-
if(!is.na(sel) && file.exists(sel)){
|
35
|
-
say('Filter selection')
|
36
|
-
lab <- read.table(sel, sep='\t', head=FALSE, as.is=TRUE)[,1]
|
37
|
-
a <- a[a$a %in% lab & a$b %in% lab, ]
|
38
|
-
}
|
39
|
-
say('Distances')
|
40
|
-
a$d <- 1 - (a$value/100)
|
41
|
-
ani.d <- enve.df2dist(a, 'a', 'b', 'd', default.d = max(a$d)*1.2)
|
42
|
-
save(ani.d, file = dist_rdata)
|
43
38
|
}
|
44
39
|
}
|
45
40
|
|
46
41
|
# Read result if the subclade is ready, run it otherwise
|
47
|
-
if(file.exists(paste(out_base,
|
42
|
+
if (file.exists(paste(out_base, "classif", sep = "."))) {
|
48
43
|
say("Loading")
|
49
44
|
ani.medoids <- read.table(paste(out_base, "medoids", sep = "."),
|
50
|
-
sep =
|
45
|
+
sep = " ", as.is = TRUE)[,1]
|
51
46
|
a <- read.table(paste(out_base, "classif", sep="."),
|
52
|
-
sep =
|
47
|
+
sep = "\t", as.is = TRUE)
|
53
48
|
ani.types <- a[,2]
|
54
49
|
names(ani.types) <- a[,1]
|
55
|
-
if(length(ani.d) == 0)
|
56
|
-
}else if(length(labels(ani.d)) > 8L){
|
57
|
-
res <- subclade_clustering(out_base, thr, ani.d,
|
58
|
-
if(length(res) == 0) return(NULL)
|
59
|
-
ani.medoids <- res[[
|
60
|
-
ani.types <- res[[
|
61
|
-
ani.d <- res[[
|
62
|
-
}else{
|
50
|
+
if(length(ani.d) == 0) ani.d <- readRDS(dist_rds)
|
51
|
+
} else if (length(labels(ani.d)) > 8L) {
|
52
|
+
res <- subclade_clustering(out_base, thr, ani.d, dist_rds)
|
53
|
+
if (length(res) == 0) return(NULL)
|
54
|
+
ani.medoids <- res[["ani.medoids"]]
|
55
|
+
ani.types <- res[["ani.types"]]
|
56
|
+
ani.d <- res[["ani.d"]]
|
57
|
+
} else {
|
63
58
|
ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
|
64
59
|
ani.types <- rep(1, length(labels(ani.d)))
|
65
60
|
names(ani.types) <- labels(ani.d)
|
@@ -69,66 +64,80 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
69
64
|
|
70
65
|
# Recursive search
|
71
66
|
say("Recursive search")
|
72
|
-
for(i in 1:length(ani.medoids)){
|
67
|
+
for (i in 1:length(ani.medoids)) {
|
73
68
|
medoid <- ani.medoids[i]
|
74
69
|
ds_f <- names(ani.types)[ ani.types==i ]
|
75
70
|
say("Analyzing subclade", i, "with medoid:", medoid)
|
76
71
|
dir_f <- paste(out_base, ".sc-", i, sep="")
|
77
|
-
if(!dir.exists(dir_f)) dir.create(dir_f)
|
72
|
+
if (!dir.exists(dir_f)) dir.create(dir_f)
|
78
73
|
write.table(ds_f,
|
79
74
|
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
80
75
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
81
|
-
if(length(ds_f) > 8L){
|
76
|
+
if (length(ds_f) > 8L) {
|
82
77
|
ani_subset <- as.dist(as.matrix(ani.d)[ds_f, ds_f])
|
83
|
-
subclades(
|
84
|
-
|
78
|
+
subclades(
|
79
|
+
out_base = paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
80
|
+
thr = thr,
|
81
|
+
ani.d = ani_subset
|
82
|
+
)
|
85
83
|
}
|
86
84
|
}
|
87
85
|
|
88
86
|
# Declare recursion up-to-here complete
|
89
|
-
write.table(
|
90
|
-
|
87
|
+
write.table(
|
88
|
+
date(), paste(out_base, "ready", sep = "."),
|
89
|
+
quote = FALSE, row.names = FALSE, col.names = FALSE
|
90
|
+
)
|
91
91
|
}
|
92
92
|
|
93
93
|
#= Heavy-lifter
|
94
|
-
subclade_clustering <- function(out_base, thr, ani.d,
|
94
|
+
subclade_clustering <- function (out_base, thr, ani.d, dist_rds) {
|
95
95
|
# Get ANI distances
|
96
|
-
if(length(ani.d) > 0){
|
97
|
-
# Just use ani.d (and save in
|
98
|
-
|
99
|
-
}else if(file.exists(
|
100
|
-
# Read from
|
101
|
-
|
102
|
-
}else{
|
96
|
+
if (length(ani.d) > 0) {
|
97
|
+
# Just use ani.d (and save in dist_rds)
|
98
|
+
if (!file.exists(dist_rds)) saveRDS(ani.d, dist_rds)
|
99
|
+
} else if (file.exists(dist_rds)) {
|
100
|
+
# Read from dist_rds
|
101
|
+
ani.d <- readRDS(dist_rds)
|
102
|
+
} else {
|
103
103
|
stop("Cannot find input matrix", out_base)
|
104
104
|
}
|
105
|
-
if(length(labels(ani.d)) <= 8L) return(list())
|
105
|
+
if (length(labels(ani.d)) <= 8L) return(list())
|
106
106
|
|
107
107
|
# Build tree
|
108
108
|
say("Tree")
|
109
109
|
ani.ph <- bionj(ani.d)
|
110
|
-
|
111
|
-
|
112
|
-
|
110
|
+
say("- Write")
|
111
|
+
express.ori <- options("expressions")$expressions
|
112
|
+
if(express.ori < ani.ph$Nnode * 4){
|
113
|
+
options(expressions=min(c(5e7, ani.ph$Nnode * 4)))
|
113
114
|
}
|
114
|
-
write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
|
115
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep = ""))
|
115
116
|
options(expressions=express.ori)
|
116
117
|
|
117
118
|
# Silhouette
|
118
119
|
say("Silhouette")
|
119
120
|
nn <- length(labels(ani.d))
|
120
121
|
k <- min(max(floor(0.005 * nn), 2), 20):min(nn-1, 100)
|
122
|
+
say("- Make cluster")
|
121
123
|
cl <- makeCluster(thr)
|
122
|
-
|
124
|
+
say("- Launch parallel jobs")
|
125
|
+
s <- parSapply(
|
126
|
+
cl, k,
|
127
|
+
function(x) {
|
123
128
|
library(cluster)
|
124
|
-
s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
|
125
|
-
c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
|
126
|
-
}
|
129
|
+
s <- pam(ani.d, x, do.swap = FALSE, pamonce = 1)$silinfo
|
130
|
+
c(s$avg.width, -sum(ifelse(s$widths[,3] > 0, 0, s$widths[,3])))
|
131
|
+
}
|
132
|
+
)
|
133
|
+
say("- Stop cluster")
|
127
134
|
stopCluster(cl)
|
128
|
-
|
129
|
-
s.
|
130
|
-
|
131
|
-
|
135
|
+
say("- Calculate custom criteria")
|
136
|
+
s.avg.z <- (s[1,] - mean(s[1,])) / (sd(s[1,]) + 0.0001)
|
137
|
+
s.neg.z <- (s[2,] - mean(s[2,])) / (sd(s[2,]) + 0.01)
|
138
|
+
ds <- s.avg.z - s.neg.z - 2 / (1:length(k)) - (1:length(k)) / 50
|
139
|
+
if(mean(s[1,] < 0) < 0.75)
|
140
|
+
ds[s[1,] < 0] <- mean(ds) # <- k's with negative average
|
132
141
|
top.n <- k[which.max(ds)]
|
133
142
|
|
134
143
|
# Classify genomes
|
@@ -139,8 +148,8 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
139
148
|
|
140
149
|
# Generate graphic report
|
141
150
|
say("Graphic report")
|
142
|
-
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
143
|
-
layout(matrix(c(1,
|
151
|
+
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
152
|
+
layout(matrix(c(rep(1:3, each = 2), 4:5), byrow = TRUE, ncol = 2))
|
144
153
|
plot_distances(ani.d)
|
145
154
|
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
146
155
|
plot_clustering(ani.cl, ani.d, ani.types)
|
@@ -153,112 +162,170 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
153
162
|
# Return data
|
154
163
|
say("Cluster ready")
|
155
164
|
return(list(
|
156
|
-
ani.medoids=ani.medoids,
|
157
|
-
ani.types=ani.types,
|
158
|
-
ani.d=ani.d
|
165
|
+
ani.medoids = ani.medoids,
|
166
|
+
ani.types = ani.types,
|
167
|
+
ani.d = ani.d
|
159
168
|
))
|
160
169
|
}
|
161
170
|
|
162
171
|
#= Helper functions
|
163
|
-
say <- function(...) {
|
172
|
+
say <- function (...) {
|
173
|
+
message(paste("[", date(), "]", ..., "\n"), appendLF = FALSE)
|
174
|
+
}
|
164
175
|
|
165
|
-
generate_empty_files <- function(out_base) {
|
166
|
-
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
167
|
-
plot(1, t="n", axes=F)
|
168
|
-
legend("center", "No data", bty="n")
|
176
|
+
generate_empty_files <- function (out_base) {
|
177
|
+
pdf(paste(out_base, ".pdf", sep = ""), 7, 12)
|
178
|
+
plot(1, t = "n", axes = F)
|
179
|
+
legend("center", "No data", bty = "n")
|
169
180
|
dev.off()
|
170
|
-
file.create(paste(out_base,".1.classif",sep=""))
|
171
|
-
file.create(paste(out_base,".1.medoids",sep=""))
|
181
|
+
file.create(paste(out_base, ".1.classif", sep = ""))
|
182
|
+
file.create(paste(out_base, ".1.medoids", sep = ""))
|
172
183
|
}
|
173
184
|
|
174
|
-
write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
|
175
|
-
say(
|
176
|
-
write.table(
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
185
|
+
write_text_report <- function (out_base, ani.d, ani.medoids, ani.types) {
|
186
|
+
say("Text report")
|
187
|
+
write.table(
|
188
|
+
ani.medoids, paste(out_base, "medoids", sep = "."),
|
189
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE
|
190
|
+
)
|
191
|
+
classif <- cbind(names(ani.types), ani.types, ani.medoids[ani.types], NA)
|
192
|
+
ani.d.m <- 100 - as.matrix(ani.d) * 100
|
193
|
+
for (j in 1:nrow(classif)) {
|
181
194
|
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
182
195
|
}
|
183
|
-
write.table(
|
184
|
-
|
196
|
+
write.table(
|
197
|
+
classif, paste(out_base, "classif", sep="."),
|
198
|
+
quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
|
199
|
+
)
|
185
200
|
}
|
186
201
|
|
187
|
-
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
202
|
+
plot_silhouette <- function (k, s, ns, ds, top.n) {
|
188
203
|
# s
|
189
|
-
par(mar=c(4,5,1,5)+0.1)
|
190
|
-
plot(
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
204
|
+
par(mar = c(4,5,1,5)+0.1)
|
205
|
+
plot(
|
206
|
+
1, t = "n", xlab = "k (clusters)", ylab = "", xlim = range(c(0,k)),
|
207
|
+
ylim = range(s), bty = "n", xaxs = "i", yaxt = "n"
|
208
|
+
)
|
209
|
+
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border = NA, col = "grey80")
|
210
|
+
axis(2, fg = "grey60", col.axis = "grey60")
|
211
|
+
mtext("Mean silhouette", side = 2, line = 3, col = "grey60")
|
212
|
+
|
195
213
|
# ns
|
196
|
-
par(new=TRUE)
|
197
|
-
plot(
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
214
|
+
par(new = TRUE)
|
215
|
+
plot(
|
216
|
+
1, t = "n", bty = "n",
|
217
|
+
xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
|
218
|
+
xlim = range(c(0,k)), ylim = range(ns)
|
219
|
+
)
|
220
|
+
points(k, ns, type = "o", pch = 16, col = rgb(1/2,0,0,3/4))
|
221
|
+
axis(4, fg = "darkred", col.axis = "darkred")
|
222
|
+
mtext("Negative silhouette area", side = 4, line = 3, col = "darkred")
|
223
|
+
|
202
224
|
# ds
|
203
|
-
par(new=TRUE)
|
204
|
-
plot(
|
205
|
-
|
225
|
+
par(new = TRUE)
|
226
|
+
plot(
|
227
|
+
1, t = "n", bty = "n",
|
228
|
+
xlab = "", ylab = "", xaxt = "n", yaxt = "n", xaxs = "i",
|
229
|
+
xlim = range(c(0,k)), ylim = range(ds)
|
230
|
+
)
|
206
231
|
lines(k, ds)
|
207
|
-
abline(v=top.n, lty=2)
|
232
|
+
abline(v = top.n, lty = 2)
|
208
233
|
}
|
209
234
|
|
210
|
-
plot_distances <- function(dist) {
|
211
|
-
par(mar=c(5,4,1,2)+0.1)
|
212
|
-
hist(
|
235
|
+
plot_distances <- function (dist) {
|
236
|
+
par(mar = c(5,4,1,2) + 0.1)
|
237
|
+
hist(
|
238
|
+
dist, border = NA, col = "grey60", breaks = 50,
|
239
|
+
xlab = "Distances", main = ""
|
240
|
+
)
|
213
241
|
}
|
214
242
|
|
215
|
-
plot_clustering <- function(cl, dist, types) {
|
216
|
-
par(mar=c(5,4,4,2)+0.1)
|
243
|
+
plot_clustering <- function (cl, dist, types) {
|
244
|
+
par(mar = c(5,4,4,2) + 0.1)
|
217
245
|
top.n <- length(cl$medoids)
|
218
246
|
col <- ggplotColours(top.n)
|
219
|
-
plot(silhouette(cl), col=col)
|
220
|
-
if(length(labels(dist))<=15){
|
221
|
-
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
222
|
-
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
223
|
-
}else{
|
224
|
-
ani.mds <- cmdscale(dist, k=4)
|
225
|
-
if(ncol(ani.mds)==4){
|
226
|
-
plot(
|
227
|
-
|
228
|
-
|
229
|
-
|
247
|
+
plot(silhouette(cl), col = col)
|
248
|
+
if (length(labels(dist)) <= 15) {
|
249
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
250
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
251
|
+
} else {
|
252
|
+
ani.mds <- cmdscale(dist, k = 4)
|
253
|
+
if (ncol(ani.mds) == 4) {
|
254
|
+
plot(
|
255
|
+
ani.mds[,1], ani.mds[,2], col = col[types], cex = 1/2,
|
256
|
+
xlab = "Component 1", ylab = "Component 2"
|
257
|
+
)
|
258
|
+
plot(
|
259
|
+
ani.mds[,3], ani.mds[,4], col = col[types], cex = 1/2,
|
260
|
+
xlab = "Component 3", ylab="Component 4"
|
261
|
+
)
|
230
262
|
}else{
|
231
|
-
|
232
|
-
|
263
|
+
for (i in 1:2)
|
264
|
+
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", bty = "n")
|
233
265
|
}
|
234
266
|
}
|
235
267
|
}
|
236
268
|
|
237
|
-
plot_tree <- function(phy, types, medoids){
|
269
|
+
plot_tree <- function (phy, types, medoids) {
|
238
270
|
layout(1)
|
239
271
|
top.n <- length(unique(types))
|
240
272
|
col <- ggplotColours(top.n)
|
241
273
|
is.medoid <- phy$tip.label %in% medoids
|
242
|
-
phy$tip.label[is.medoid] <- paste(
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
274
|
+
phy$tip.label[is.medoid] <- paste(
|
275
|
+
phy$tip.label[is.medoid],
|
276
|
+
" [", types[phy$tip.label[is.medoid]], "]",
|
277
|
+
sep = ""
|
278
|
+
)
|
279
|
+
plot(
|
280
|
+
phy, cex = ifelse(is.medoid, 1/3, 1/6),
|
281
|
+
font = ifelse(is.medoid, 2, 1),
|
282
|
+
tip.color = col[types[phy$tip.label]]
|
283
|
+
)
|
284
|
+
}
|
285
|
+
|
286
|
+
ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
|
287
|
+
if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360 / n
|
288
|
+
hcl(h = seq(h[1], h[2], length = n), c = 100, l = 65, alpha = alpha)
|
247
289
|
}
|
248
290
|
|
249
|
-
|
250
|
-
|
251
|
-
|
291
|
+
ani_distance <- function (ani_file, sel) {
|
292
|
+
# Try to locate rds, otherwise read gzipped table
|
293
|
+
rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
|
294
|
+
if (file.exists(rds)) {
|
295
|
+
sim <- readRDS(rds)
|
296
|
+
} else {
|
297
|
+
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
|
298
|
+
}
|
299
|
+
|
300
|
+
# If there is not data end process
|
301
|
+
if (nrow(sim) == 0) return(NULL)
|
302
|
+
|
303
|
+
# Apply filter (if requested)
|
304
|
+
if (!is.na(sel) && file.exists(sel)) {
|
305
|
+
say("Filter selection")
|
306
|
+
lab <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
|
307
|
+
sim <- sim[sim$a %in% lab & sim$b %in% lab, ]
|
308
|
+
}
|
309
|
+
|
310
|
+
# Transform to distances
|
311
|
+
say("Distances")
|
312
|
+
sim$d <- 1 - (sim$value / 100)
|
313
|
+
return(enve.df2dist(sim, "a", "b", "d", default.d = max(sim$d) * 1.2))
|
252
314
|
}
|
253
315
|
|
254
316
|
#= Main
|
255
317
|
options(warn = 1)
|
256
|
-
if(length(argv) >= 5 & argv[5] ==
|
318
|
+
if (length(argv) >= 5 & argv[5] == "empty") {
|
257
319
|
generate_empty_files(argv[2])
|
258
|
-
write.table(NULL, paste(argv[2], "medoids", sep="."))
|
259
|
-
write.table(NULL, paste(argv[2], "classif", sep="."))
|
260
|
-
write.table(date(), paste(argv[2], "ready", sep="."))
|
320
|
+
write.table(NULL, paste(argv[2], "medoids", sep = "."))
|
321
|
+
write.table(NULL, paste(argv[2], "classif", sep = "."))
|
322
|
+
write.table(date(), paste(argv[2], "ready", sep = "."))
|
261
323
|
}else{
|
262
|
-
subclades(
|
263
|
-
|
324
|
+
subclades(
|
325
|
+
ani_file = argv[1],
|
326
|
+
out_base = argv[2],
|
327
|
+
thr = ifelse(is.na(argv[3]), 1, as.numeric(argv[3])),
|
328
|
+
sel = argv[4]
|
329
|
+
)
|
264
330
|
}
|
331
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-11-
|
11
|
+
date: 2021-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|