miga-base 1.3.4.3 → 1.3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/{get_db.rb → db.rb} +44 -4
- data/lib/miga/cli/action/doctor/operations.rb +14 -0
- data/lib/miga/cli/action/doctor.rb +11 -9
- data/lib/miga/cli/action/rm.rb +10 -5
- data/lib/miga/cli/action/wf.rb +4 -0
- data/lib/miga/cli/base.rb +41 -40
- data/lib/miga/result/stats.rb +9 -5
- data/lib/miga/taxonomy.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +24 -16
- data/scripts/distances.bash +1 -1
- data/scripts/essential_genes.bash +21 -3
- data/utils/distance/database.rb +24 -7
- data/utils/distance/pipeline.rb +12 -18
- data/utils/distance/runner.rb +35 -27
- data/utils/distance/temporal.rb +1 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9035e8f6ccd641c75bedc38b5f78d3eb1815c51956d399728b1e862c2920cbb
|
4
|
+
data.tar.gz: 2ffdecb639083a3a40d2bab3a39f1df92cb22279314ede2a5a843b278c8be9ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42db29082a3b8097f8ec6f55839202d79e8d00496d55bab8093e4355a26f9133249506266e7f0eb64acea18fb91885abc493a0fe92c81c0eef27ec11f08d3eff
|
7
|
+
data.tar.gz: 5b4ce9ae27ac4542e2af836176dde909f013461b9916f1d1f4cfc0831d6e494d5de76c57d9c7b64ebd87aefd21a2c79edf55a0192838b6e589be21e9e5a4ebb1
|
@@ -4,12 +4,12 @@
|
|
4
4
|
require 'miga/cli/action'
|
5
5
|
require 'digest/md5'
|
6
6
|
|
7
|
-
class MiGA::Cli::Action::
|
7
|
+
class MiGA::Cli::Action::Db < MiGA::Cli::Action
|
8
8
|
def parse_cli
|
9
9
|
cli.defaults = {
|
10
10
|
database: :recommended,
|
11
11
|
version: :latest,
|
12
|
-
local: File.
|
12
|
+
local: File.join(ENV['MIGA_HOME'], '.miga_db'),
|
13
13
|
host: MiGA::MiGA.known_hosts(:miga_db),
|
14
14
|
pb: true,
|
15
15
|
reuse_archive: false,
|
@@ -40,6 +40,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
40
40
|
'--list-versions',
|
41
41
|
'List available versions of the database and exit'
|
42
42
|
) { |v| cli[:list_versions] = v }
|
43
|
+
opt.on(
|
44
|
+
'--list-local',
|
45
|
+
'List only the versions of the local databases (if any) and exit'
|
46
|
+
) { |v| cli[:list_local] = v }
|
43
47
|
opt.on(
|
44
48
|
'--reuse-archive',
|
45
49
|
'Reuse a previously downloaded archive if available'
|
@@ -48,6 +52,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
48
52
|
'--no-overwrite',
|
49
53
|
'Exit without downloading if the target database already exists'
|
50
54
|
) { |v| cli[:overwrite] = v }
|
55
|
+
opt.on(
|
56
|
+
'--tab',
|
57
|
+
'Return a tab-delimited table'
|
58
|
+
) { |v| cli[:tabular] = v }
|
51
59
|
opt.on('--no-progress', 'Supress progress bars') { |v| cli[:pb] = v }
|
52
60
|
end
|
53
61
|
end
|
@@ -60,6 +68,12 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
60
68
|
return
|
61
69
|
end
|
62
70
|
|
71
|
+
# If dealing with local checks only
|
72
|
+
if cli[:list_local]
|
73
|
+
list_local
|
74
|
+
return
|
75
|
+
end
|
76
|
+
|
63
77
|
# Remote manifest
|
64
78
|
@ftp = remote_connection
|
65
79
|
manif = remote_manifest(@ftp)
|
@@ -89,6 +103,28 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
89
103
|
|
90
104
|
private
|
91
105
|
|
106
|
+
def list_local
|
107
|
+
local_manif = local_manifest
|
108
|
+
raise "Local manifest not found." unless local_manif
|
109
|
+
databases =
|
110
|
+
if %i[recommended test].include?(cli[:database])
|
111
|
+
local_manif[:databases].keys
|
112
|
+
else
|
113
|
+
[cli[:database].to_sym]
|
114
|
+
end
|
115
|
+
cli.table(
|
116
|
+
%w[database version genomes updated path],
|
117
|
+
databases.map do |db|
|
118
|
+
path = File.join(cli[:local], db.to_s)
|
119
|
+
p = MiGA::Project.load(path)
|
120
|
+
if p
|
121
|
+
md = p.metadata
|
122
|
+
[db, md[:release], md[:datasets].count, md[:updated], p.path]
|
123
|
+
end
|
124
|
+
end.compact
|
125
|
+
)
|
126
|
+
end
|
127
|
+
|
92
128
|
def remote_connection
|
93
129
|
cli.say "Connecting to '#{cli[:host]}'"
|
94
130
|
MiGA::MiGA.remote_connection(cli[:host])
|
@@ -113,6 +149,11 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
113
149
|
MiGA::Json.parse(file)
|
114
150
|
end
|
115
151
|
|
152
|
+
def local_manifest
|
153
|
+
file = File.join(cli[:local], '_local_manif.json')
|
154
|
+
MiGA::Json.parse(file) if File.exist?(file)
|
155
|
+
end
|
156
|
+
|
116
157
|
def db_requested(manif)
|
117
158
|
[:recommended, :test].each do |n|
|
118
159
|
if cli[:database] == n
|
@@ -208,8 +249,7 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
208
249
|
|
209
250
|
def register_database(manif, db, ver)
|
210
251
|
cli.say "Registering database locally"
|
211
|
-
local_manif =
|
212
|
-
reg = File.exist?(local_manif) ? MiGA::Json.parse(local_manif) : {}
|
252
|
+
local_manif = local_manifest || {}
|
213
253
|
reg[:last_update] = Time.now.to_s
|
214
254
|
reg[:databases] ||= {}
|
215
255
|
reg[:databases][cli[:database]] ||= {}
|
@@ -1,5 +1,19 @@
|
|
1
1
|
|
2
2
|
module MiGA::Cli::Action::Doctor::Operations
|
3
|
+
##
|
4
|
+
# Perform refdb operation with MiGA::Cli +cli+
|
5
|
+
def check_refdb(cli)
|
6
|
+
cli.say 'Checking index format of reference database'
|
7
|
+
ref_dbs = File.join(ENV['MIGA_HOME'], '.miga_db')
|
8
|
+
manif_file = File.join(ref_dbs, '_local_manif.json')
|
9
|
+
return unless File.size?(manif_file)
|
10
|
+
|
11
|
+
MiGA::Json.parse(manif_file)[:databases]&.keys&.each do |db|
|
12
|
+
p = MiGA::Project.load(File.join(ref_dbs, db.to_s))
|
13
|
+
md = p&.metadata
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
3
17
|
##
|
4
18
|
# Perform status operation with MiGA::Cli +cli+
|
5
19
|
def check_status(cli)
|
@@ -46,16 +46,18 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
46
46
|
end
|
47
47
|
|
48
48
|
@@OPERATIONS = {
|
49
|
+
# TODO Implement this check:
|
50
|
+
# refdb: ['ref-db', 'Check index format of reference databases'],
|
49
51
|
status: ['status', 'Update metadata status of all datasets'],
|
50
|
-
db:
|
51
|
-
bidir:
|
52
|
-
dist:
|
53
|
-
files:
|
54
|
-
cds:
|
55
|
-
ess:
|
56
|
-
mts:
|
57
|
-
start:
|
58
|
-
tax:
|
52
|
+
db: ['databases', 'Check integrity of database files'],
|
53
|
+
bidir: ['bidirectional', 'Check distances are bidirectional'],
|
54
|
+
dist: ['distances', 'Check distance summary tables'],
|
55
|
+
files: ['files', 'Check for outdated files'],
|
56
|
+
cds: ['cds', 'Check for gzipped genes and proteins'],
|
57
|
+
ess: ['essential-genes', 'Check for outdated essential genes'],
|
58
|
+
mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
|
59
|
+
start: ['start', 'Check for lingering .start files'],
|
60
|
+
tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
|
59
61
|
}
|
60
62
|
|
61
63
|
class << self
|
data/lib/miga/cli/action/rm.rb
CHANGED
@@ -7,9 +7,9 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
|
|
7
7
|
def parse_cli
|
8
8
|
cli.defaults = { remove: false }
|
9
9
|
cli.parse do |opt|
|
10
|
-
cli.opt_object(opt)
|
10
|
+
cli.opt_object(opt, %i[project dataset_opt result_opt])
|
11
11
|
opt.on(
|
12
|
-
'-
|
12
|
+
'-R', '--remove',
|
13
13
|
'Also remove all associated files',
|
14
14
|
'By default, only unlinks from metadata'
|
15
15
|
) { |v| cli[:remove] = v }
|
@@ -17,8 +17,13 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def perform
|
20
|
-
|
21
|
-
|
22
|
-
d
|
20
|
+
if r = cli.load_result
|
21
|
+
cli[:remove] ? r.remove! : r.unlink
|
22
|
+
elsif d = cli.load_dataset
|
23
|
+
cli.load_project.unlink_dataset(d.name)
|
24
|
+
d.remove! if cli[:remove]
|
25
|
+
else
|
26
|
+
raise "You must define one of --result or --dataset"
|
27
|
+
end
|
23
28
|
end
|
24
29
|
end
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -21,6 +21,10 @@ module MiGA::Cli::Action::Wf
|
|
21
21
|
'-o', '--out_dir PATH',
|
22
22
|
'(Mandatory) Directory to be created with all output data'
|
23
23
|
) { |v| cli[:outdir] = v }
|
24
|
+
opt.on(
|
25
|
+
'-P', '--project PATH',
|
26
|
+
'::HIDE::' # Applying the principle of least surprise, alias of -o
|
27
|
+
) { |v| cli[:outdir] = v }
|
24
28
|
opt.separator ''
|
25
29
|
opt.separator " FILES...: #{files_desc}"
|
26
30
|
opt.separator ''
|
data/lib/miga/cli/base.rb
CHANGED
@@ -2,60 +2,61 @@
|
|
2
2
|
|
3
3
|
module MiGA::Cli::Base
|
4
4
|
@@TASK_DESC = {
|
5
|
-
generic:
|
5
|
+
generic: 'MiGA: The Microbial Genomes Atlas',
|
6
6
|
# Workflows
|
7
|
-
quality_wf:
|
8
|
-
derep_wf:
|
7
|
+
quality_wf: 'Evaluate the quality of input genomes',
|
8
|
+
derep_wf: 'Dereplicate a collection of input genomes',
|
9
9
|
classify_wf: 'Classify input genomes against a reference database',
|
10
|
-
preproc_wf:
|
11
|
-
index_wf:
|
10
|
+
preproc_wf: 'Preprocess input genomes or metagenomes',
|
11
|
+
index_wf: 'Generate distance indexing of input genomes',
|
12
12
|
# Projects
|
13
|
-
new:
|
14
|
-
about:
|
15
|
-
doctor:
|
16
|
-
|
17
|
-
browse: 'Explore a project locally using a web browser',
|
13
|
+
new: 'Create an empty MiGA project',
|
14
|
+
about: 'Display information about a MiGA project',
|
15
|
+
doctor: 'Perform consistency checks on a MiGA project',
|
16
|
+
browse: 'Explore a project locally using a web browser',
|
18
17
|
# Datasets
|
19
|
-
add:
|
20
|
-
get:
|
21
|
-
ncbi_get:
|
22
|
-
gtdb_get:
|
23
|
-
seqcode_get: 'Download all type genomes from SeqCode
|
24
|
-
rm:
|
25
|
-
find:
|
26
|
-
ln:
|
27
|
-
ls:
|
28
|
-
archive:
|
18
|
+
add: 'Create a dataset in a MiGA project',
|
19
|
+
get: 'Download a dataset from public databases into a MiGA project',
|
20
|
+
ncbi_get: 'Download the genomes of a taxon from NCBI to a MiGA project',
|
21
|
+
gtdb_get: 'Download the genomes of a taxon from GTDB to a MiGA project',
|
22
|
+
seqcode_get: 'Download all type genomes from SeqCode to a MiGA project',
|
23
|
+
rm: 'Remove a dataset from a MiGA project',
|
24
|
+
find: 'Find unregistered datasets based on result files',
|
25
|
+
ln: 'Link datasets (incl. results) from one project to another',
|
26
|
+
ls: 'List all registered datasets in a MiGA project',
|
27
|
+
archive: 'Generate a tar-ball with all files from select datasets',
|
29
28
|
# Results
|
30
|
-
add_result:
|
31
|
-
stats:
|
32
|
-
files:
|
33
|
-
run:
|
34
|
-
summary:
|
35
|
-
next_step:
|
29
|
+
add_result: 'Register a result',
|
30
|
+
stats: 'Extract statistics for the given result',
|
31
|
+
files: 'List registered files from a dataset or project',
|
32
|
+
run: 'Execute locally one step analysis producing the given result',
|
33
|
+
summary: 'Generate a summary table for the statistics of all datasets',
|
34
|
+
next_step: 'Return the next task to run in a dataset or project',
|
36
35
|
# Objects (Datasets or Projects)
|
37
|
-
edit:
|
38
|
-
option:
|
39
|
-
touch:
|
36
|
+
edit: 'Edit the metadata of a dataset or project',
|
37
|
+
option: 'Get or set options of a dataset or project',
|
38
|
+
touch: 'Change the "last modified" time to now without changes',
|
40
39
|
# System
|
41
|
-
init:
|
42
|
-
daemon:
|
43
|
-
lair:
|
44
|
-
|
45
|
-
|
46
|
-
|
40
|
+
init: 'Initialize MiGA to process new projects',
|
41
|
+
daemon: 'Control the daemon of a MiGA project',
|
42
|
+
lair: 'Control groups of daemons for several MiGA projects',
|
43
|
+
db: 'Download a pre-indexed database',
|
44
|
+
date: 'Return the current date in standard MiGA format',
|
45
|
+
console: 'Open an IRB console with MiGA',
|
46
|
+
env: 'Shell code to load MiGA environment',
|
47
47
|
# Taxonomy
|
48
|
-
tax_set:
|
49
|
-
tax_test:
|
50
|
-
tax_index:
|
51
|
-
tax_dist:
|
48
|
+
tax_set: 'Register taxonomic information for datasets',
|
49
|
+
tax_test: 'Return test of taxonomic distributions for query datasets',
|
50
|
+
tax_index: 'Create a taxonomy-indexed list of the datasets',
|
51
|
+
tax_dist: 'Estimate distributions of distance by taxonomy',
|
52
52
|
}
|
53
53
|
|
54
54
|
@@TASK_ALIAS = {
|
55
55
|
# Projects
|
56
56
|
create_project: :new,
|
57
57
|
project_info: :about,
|
58
|
-
download: :
|
58
|
+
download: :db,
|
59
|
+
get_db: :db,
|
59
60
|
# Datasets
|
60
61
|
create_dataset: :add,
|
61
62
|
download_dataset: :get,
|
data/lib/miga/result/stats.rb
CHANGED
@@ -141,12 +141,16 @@ module MiGA::Result::Stats
|
|
141
141
|
# Determine qualitative range
|
142
142
|
stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
|
143
143
|
source.metadata[:quality] =
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
if stats[:completeness][0] >= 90 && stats[:contamination][0] <= 5
|
145
|
+
:excellent # Finished or High-quality draft*
|
146
|
+
elsif stats[:completeness][0] >= 50 && stats[:contamination][0] <= 10
|
147
|
+
:high # Medium-quality draft*
|
148
|
+
elsif stats[:quality] >= 25
|
149
|
+
:intermediate # Low-quality draft* but sufficient for classification
|
150
|
+
else
|
151
|
+
:low # Low-quality draft* and insufficient for classification
|
149
152
|
end
|
153
|
+
# * Bowers et al 2017, DOI: 10.1038/nbt.3893
|
150
154
|
source.save
|
151
155
|
|
152
156
|
# Inactivate low-quality datasets
|
data/lib/miga/taxonomy.rb
CHANGED
@@ -137,6 +137,12 @@ class MiGA::Taxonomy < MiGA::MiGA
|
|
137
137
|
self[:ns]
|
138
138
|
end
|
139
139
|
|
140
|
+
##
|
141
|
+
# Domain of the taxonomy (a String) or +nil+
|
142
|
+
def domain
|
143
|
+
self[:d]
|
144
|
+
end
|
145
|
+
|
140
146
|
##
|
141
147
|
# Get the most general rank as a two-entry Array (rank and value).
|
142
148
|
# If +force_ranks+ is true, it always returns the value for domain (d)
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3,
|
15
|
+
VERSION = [1.3, 6, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem relese.
|
23
|
-
VERSION_DATE = Date.new(2023, 4,
|
23
|
+
VERSION_DATE = Date.new(2023, 4, 25)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
|
|
10
10
|
miga_start_project_step "$DIR"
|
11
11
|
|
12
12
|
# Extract values
|
13
|
-
|
14
|
-
SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
13
|
+
function foreach_database_aai {
|
14
|
+
local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
|
15
|
+
local k=0
|
16
|
+
while [[ -n ${DS[$k]} ]] ; do
|
17
|
+
echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
|
18
|
+
let k=$k+1
|
20
19
|
done
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
}
|
21
|
+
|
22
|
+
function aai_tsv {
|
23
|
+
DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
|
24
|
+
echo "a b value sd n omega" | tr " " "\\t"
|
25
|
+
if [[ ${#DS[@]} -gt 40000 ]] ; then
|
26
|
+
# Use comparisons in strictly one direction only for huge projects
|
27
|
+
foreach_database_aai \
|
28
|
+
| awk -F"\t" 'BEGIN { OFS="\t" }
|
29
|
+
{ if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
|
30
|
+
| sort -k 1,2 -u
|
31
|
+
else
|
32
|
+
foreach_database "$SQL"
|
33
|
+
fi
|
34
|
+
}
|
35
|
+
|
36
|
+
rm -f "miga-project.txt"
|
37
|
+
aai_tsv | gzip -9c > "miga-project.txt.gz"
|
30
38
|
|
31
39
|
# R-ify
|
32
40
|
cat <<R | R --vanilla
|
data/scripts/distances.bash
CHANGED
@@ -9,7 +9,7 @@ cd "$PROJECT/data/09.distances"
|
|
9
9
|
# Initialize
|
10
10
|
miga date > "$DATASET.start"
|
11
11
|
|
12
|
-
# Check quality
|
12
|
+
# Check quality
|
13
13
|
miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
|
14
14
|
inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
|
15
15
|
[[ "$inactive" == "true" ]] && exit
|
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
|
|
12
12
|
[[ -s "$FAA" ]] || FAA="${FAA}.gz"
|
13
13
|
|
14
14
|
# Check if there are any proteins
|
15
|
-
if [[ ! -s $FAA ]] ; then
|
15
|
+
if [[ ! -s "$FAA" ]] ; then
|
16
16
|
echo Empty protein set, bypassing essential genes
|
17
17
|
rm "${DATASET}.start"
|
18
18
|
miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
|
@@ -35,13 +35,31 @@ HMM.essential.rb \
|
|
35
35
|
-t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
|
36
36
|
> "${DATASET}.ess/log"
|
37
37
|
|
38
|
-
# Index for FastAAI
|
38
|
+
# Index for FastAAI and classify (if needed and possible)
|
39
39
|
NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
|
40
40
|
| wc -l | awk '{print $1}')
|
41
|
-
[[ "$NOMULTI" -eq "1" ]]
|
41
|
+
if [[ "$NOMULTI" -eq "1" ]] ; then
|
42
42
|
python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
|
43
43
|
--protein "$FAA" --output_crystal "${DATASET}.crystal" \
|
44
44
|
--compress
|
45
|
+
|
46
|
+
# Classify
|
47
|
+
DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
|
48
|
+
if [[ "$DOMAIN" == "?" ]] ; then
|
49
|
+
REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
|
50
|
+
echo "Phylum-level classification against $REF_PROJ"
|
51
|
+
if [[ -n "$REF_PROJ" ]] ; then
|
52
|
+
cp "${DATASET}.start" "${DATASET}.start.bak"
|
53
|
+
miga date > "${DATASET}.done"
|
54
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
55
|
+
ruby -I "$MIGA/lib" \
|
56
|
+
"$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
|
57
|
+
run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
|
58
|
+
mv "${DATASET}.start.bak" "${DATASET}.start"
|
59
|
+
rm "${DATASET}.done" "${DATASET}.json"
|
60
|
+
fi
|
61
|
+
fi
|
62
|
+
fi
|
45
63
|
|
46
64
|
# Reduce files
|
47
65
|
if exists "$DATASET".ess/*.faa ; then
|
data/utils/distance/database.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'miga/sqlite'
|
2
2
|
|
3
3
|
module MiGA::DistanceRunner::Database
|
4
4
|
##
|
@@ -11,6 +11,8 @@ module MiGA::DistanceRunner::Database
|
|
11
11
|
{ haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
|
12
12
|
@db_counts[m] = 0
|
13
13
|
@dbs[m] = for_ref ? ref_db(m) : query_db(m)
|
14
|
+
@tmp_dbs[m] = tmp_file("#{m}.db")
|
15
|
+
|
14
16
|
# Remove if corrupt
|
15
17
|
if File.size?(dbs[m])
|
16
18
|
begin
|
@@ -21,9 +23,12 @@ module MiGA::DistanceRunner::Database
|
|
21
23
|
FileUtils.rm dbs[m]
|
22
24
|
end
|
23
25
|
end
|
24
|
-
|
25
|
-
|
26
|
-
|
26
|
+
|
27
|
+
# Initialize if it doesn't exist, copy otherwise
|
28
|
+
if File.size? dbs[m]
|
29
|
+
FileUtils.cp(dbs[m], tmp_dbs[m])
|
30
|
+
else
|
31
|
+
SQLite3::Database.new(tmp_dbs[m]) do |conn|
|
27
32
|
conn.execute <<~SQL
|
28
33
|
create table if not exists #{t}(
|
29
34
|
seq1 varchar(256), seq2 varchar(256),
|
@@ -31,10 +36,8 @@ module MiGA::DistanceRunner::Database
|
|
31
36
|
)
|
32
37
|
SQL
|
33
38
|
end
|
39
|
+
FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
|
34
40
|
end
|
35
|
-
# Copy over to (local) temporals
|
36
|
-
@tmp_dbs[m] = tmp_file("#{m}.db")
|
37
|
-
FileUtils.cp(dbs[m], tmp_dbs[m])
|
38
41
|
end
|
39
42
|
end
|
40
43
|
|
@@ -157,6 +160,20 @@ module MiGA::DistanceRunner::Database
|
|
157
160
|
conn.execute(sql).each { |row| data[row.shift] = row }
|
158
161
|
end
|
159
162
|
data
|
163
|
+
rescue => e
|
164
|
+
$stderr.puts "Database file: #{db}" if db ||= nil
|
165
|
+
raise e
|
166
|
+
end
|
167
|
+
|
168
|
+
##
|
169
|
+
# Retrieve the name and AAI of the closest relative from the AAI database
|
170
|
+
def closest_relative
|
171
|
+
db = tmp_dbs[:aai]
|
172
|
+
sql = 'select seq2, aai from aai order by aai desc limit 1'
|
173
|
+
MiGA::SQLite.new(db).run(sql).first
|
174
|
+
rescue => e
|
175
|
+
$stderr.puts "Database file: #{db}" if db ||= nil
|
176
|
+
raise e
|
160
177
|
end
|
161
178
|
|
162
179
|
##
|
data/utils/distance/pipeline.rb
CHANGED
@@ -72,22 +72,13 @@ module MiGA::DistanceRunner::Pipeline
|
|
72
72
|
$stderr.puts "Testing taxonomy | opts = #{opts}"
|
73
73
|
# Get taxonomy of closest relative
|
74
74
|
from_ref_project = (project != ref_project)
|
75
|
-
|
76
|
-
from_ref_project ?
|
77
|
-
File.expand_path('data/09.distances/05.taxonomy', project.path) :
|
78
|
-
home
|
79
|
-
Dir.mkdir res_dir unless Dir.exist? res_dir
|
80
|
-
File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
|
81
|
-
fh.puts Time.now.to_s
|
82
|
-
end
|
83
|
-
dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
|
84
|
-
cr = dataset.closest_relatives(1, from_ref_project)
|
75
|
+
cr = closest_relative
|
85
76
|
return if cr.nil? or cr.empty?
|
86
77
|
|
87
|
-
tax = ref_project.dataset(cr[0]
|
78
|
+
tax = ref_project.dataset(cr[0]).metadata[:tax] || {}
|
88
79
|
|
89
80
|
# Run the test for each rank
|
90
|
-
tax_test = MiGA::TaxDist.aai_pvalues(cr[
|
81
|
+
tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
|
91
82
|
r = tax_test.map do |k, v|
|
92
83
|
sig = ''
|
93
84
|
[0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
|
@@ -95,12 +86,14 @@ module MiGA::DistanceRunner::Pipeline
|
|
95
86
|
end
|
96
87
|
|
97
88
|
# Save test
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
89
|
+
unless opts[:only_domain]
|
90
|
+
File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
|
91
|
+
fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
|
92
|
+
fh.puts ''
|
93
|
+
fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
|
94
|
+
fh.puts ''
|
95
|
+
fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
|
96
|
+
end
|
104
97
|
end
|
105
98
|
return r
|
106
99
|
end
|
@@ -115,6 +108,7 @@ module MiGA::DistanceRunner::Pipeline
|
|
115
108
|
.select { |i| i[1] != '?' && i[2] <= pval }
|
116
109
|
.map { |i| i[0, 2].join(':') }
|
117
110
|
dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
|
111
|
+
$stderr.puts " > #{dataset.metadata[:tax]}"
|
118
112
|
dataset.save
|
119
113
|
end
|
120
114
|
end
|
data/utils/distance/runner.rb
CHANGED
@@ -12,8 +12,9 @@ class MiGA::DistanceRunner
|
|
12
12
|
@home = File.expand_path('data/09.distances', project.path)
|
13
13
|
|
14
14
|
# Default opts
|
15
|
-
if opts[:run_taxonomy] &&
|
16
|
-
|
15
|
+
if opts[:run_taxonomy] &&
|
16
|
+
(opts[:ref_project] || project.option(:ref_project))
|
17
|
+
ref_path = opts[:ref_project] || project.option(:ref_project)
|
17
18
|
@home = File.expand_path('05.taxonomy', @home)
|
18
19
|
@ref_project = MiGA::Project.load(ref_path)
|
19
20
|
raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
|
@@ -73,48 +74,55 @@ class MiGA::DistanceRunner
|
|
73
74
|
# Initialize the databases
|
74
75
|
initialize_dbs! false
|
75
76
|
distances_by_request(tsk[1])
|
77
|
+
|
76
78
|
# Calculate the classification-informed AAI/ANI traverse
|
77
|
-
|
78
|
-
fh = File.open(
|
79
|
+
tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
|
80
|
+
fh = File.open(tmp_results, 'w')
|
79
81
|
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
|
80
82
|
fh.close
|
81
83
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
File.
|
89
|
-
|
90
|
-
|
91
|
-
|
84
|
+
unless opts[:only_domain]
|
85
|
+
results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
|
86
|
+
FileUtils.move(tmp_results, results)
|
87
|
+
|
88
|
+
# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
|
89
|
+
par_dir = File.dirname(File.expand_path(classif, res.dir))
|
90
|
+
par = File.expand_path('miga-project.classif', par_dir)
|
91
|
+
closest = { dataset: nil, ani: 0.0 }
|
92
|
+
sbj_datasets = []
|
93
|
+
if File.size? par
|
94
|
+
File.open(par, 'r') do |fh|
|
95
|
+
fh.each_line do |ln|
|
96
|
+
r = ln.chomp.split("\t")
|
97
|
+
sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
|
98
|
+
end
|
92
99
|
end
|
100
|
+
ani = ani_after_aai(sbj_datasets, 80.0)
|
101
|
+
ani_max = ani.map(&:to_f).each_with_index.max
|
102
|
+
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
|
93
103
|
end
|
94
|
-
ani = ani_after_aai(sbj_datasets, 80.0)
|
95
|
-
ani_max = ani.map(&:to_f).each_with_index.max
|
96
|
-
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
|
97
|
-
end
|
98
104
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
105
|
+
# Calculate all the AAIs/ANIs against the closest ANI95-clade
|
106
|
+
# (if AAI > 80%)
|
107
|
+
cl_path = res.file_path :clades_ani95
|
108
|
+
if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
|
109
|
+
clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
|
110
|
+
sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
|
111
|
+
sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
|
112
|
+
ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
|
113
|
+
end
|
106
114
|
end
|
107
115
|
|
108
116
|
# Finalize
|
109
117
|
[:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
|
110
|
-
build_medoids_tree(tsk[1])
|
118
|
+
build_medoids_tree(tsk[1]) unless opts[:only_domain]
|
111
119
|
transfer_taxonomy(tax_test)
|
112
120
|
end
|
113
121
|
|
114
122
|
# Launch analysis for taxonomy jobs
|
115
123
|
def go_taxonomy!
|
116
124
|
$stderr.puts 'Launching taxonomy analysis'
|
117
|
-
return unless project.option(:ref_project)
|
125
|
+
return unless opts[:ref_project] || project.option(:ref_project)
|
118
126
|
|
119
127
|
go_query! # <- yeah, it's actually the same, just different ref_project
|
120
128
|
end
|
data/utils/distance/temporal.rb
CHANGED
@@ -41,6 +41,7 @@ module MiGA::DistanceRunner::Temporal
|
|
41
41
|
|
42
42
|
# Copies temporal databases back to the MiGA Project
|
43
43
|
def checkpoint!(metric)
|
44
|
+
return if opts[:only_domain]
|
44
45
|
$stderr.puts "Checkpoint (metric = #{metric})"
|
45
46
|
|
46
47
|
# This is simply to test database consistency before overwriting the
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-04-
|
11
|
+
date: 2023-04-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -152,6 +152,7 @@ files:
|
|
152
152
|
- lib/miga/cli/action/console.rb
|
153
153
|
- lib/miga/cli/action/daemon.rb
|
154
154
|
- lib/miga/cli/action/date.rb
|
155
|
+
- lib/miga/cli/action/db.rb
|
155
156
|
- lib/miga/cli/action/derep_wf.rb
|
156
157
|
- lib/miga/cli/action/doctor.rb
|
157
158
|
- lib/miga/cli/action/doctor/base.rb
|
@@ -168,7 +169,6 @@ files:
|
|
168
169
|
- lib/miga/cli/action/find.rb
|
169
170
|
- lib/miga/cli/action/generic.rb
|
170
171
|
- lib/miga/cli/action/get.rb
|
171
|
-
- lib/miga/cli/action/get_db.rb
|
172
172
|
- lib/miga/cli/action/gtdb_get.rb
|
173
173
|
- lib/miga/cli/action/index_wf.rb
|
174
174
|
- lib/miga/cli/action/init.rb
|