miga-base 1.3.5.0 → 1.3.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/{get_db.rb → db.rb} +49 -7
- data/lib/miga/cli/action/doctor/operations.rb +14 -0
- data/lib/miga/cli/action/doctor.rb +11 -9
- data/lib/miga/cli/action/rm.rb +10 -5
- data/lib/miga/cli/action/wf.rb +4 -0
- data/lib/miga/cli/base.rb +41 -40
- data/lib/miga/taxonomy.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +24 -16
- data/scripts/distances.bash +1 -1
- data/scripts/essential_genes.bash +21 -3
- data/utils/distance/database.rb +24 -7
- data/utils/distance/pipeline.rb +12 -18
- data/utils/distance/runner.rb +35 -27
- data/utils/distance/temporal.rb +1 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51d2b93599ee94cbf863b737016a37c32d63bc80c8ef80a8b5dbb52257e6f8b5
|
4
|
+
data.tar.gz: 7dced09173180bd58d8ba0feef1a90ecf00d3fc55154ebae4c34d861b0e1e566
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0bbaf2de062ecbb74a9d9636e2fb1478e56edfc5c23f6c3733a183114af86f5efc6cc678d71b1437e5825edeeb57efe932a00172610edac07172555ff5ef568d
|
7
|
+
data.tar.gz: 610f728fd400987765779a6df88869af8611ea928cbc2853fac673a897cb3fbd09ec3eac2e1dc0d827257ce046cd67ab674a41892536520dd38bb93d9ebb66b7
|
@@ -4,12 +4,12 @@
|
|
4
4
|
require 'miga/cli/action'
|
5
5
|
require 'digest/md5'
|
6
6
|
|
7
|
-
class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
7
|
+
class MiGA::Cli::Action::Db < MiGA::Cli::Action
|
8
8
|
def parse_cli
|
9
9
|
cli.defaults = {
|
10
10
|
database: :recommended,
|
11
11
|
version: :latest,
|
12
|
-
local: File.
|
12
|
+
local: File.join(ENV['MIGA_HOME'], '.miga_db'),
|
13
13
|
host: MiGA::MiGA.known_hosts(:miga_db),
|
14
14
|
pb: true,
|
15
15
|
reuse_archive: false,
|
@@ -40,6 +40,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
40
40
|
'--list-versions',
|
41
41
|
'List available versions of the database and exit'
|
42
42
|
) { |v| cli[:list_versions] = v }
|
43
|
+
opt.on(
|
44
|
+
'--list-local',
|
45
|
+
'List only the versions of the local databases (if any) and exit'
|
46
|
+
) { |v| cli[:list_local] = v }
|
43
47
|
opt.on(
|
44
48
|
'--reuse-archive',
|
45
49
|
'Reuse a previously downloaded archive if available'
|
@@ -48,6 +52,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
48
52
|
'--no-overwrite',
|
49
53
|
'Exit without downloading if the target database already exists'
|
50
54
|
) { |v| cli[:overwrite] = v }
|
55
|
+
opt.on(
|
56
|
+
'--tab',
|
57
|
+
'Return a tab-delimited table'
|
58
|
+
) { |v| cli[:tabular] = v }
|
51
59
|
opt.on('--no-progress', 'Supress progress bars') { |v| cli[:pb] = v }
|
52
60
|
end
|
53
61
|
end
|
@@ -60,6 +68,12 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
60
68
|
return
|
61
69
|
end
|
62
70
|
|
71
|
+
# If dealing with local checks only
|
72
|
+
if cli[:list_local]
|
73
|
+
list_local
|
74
|
+
return
|
75
|
+
end
|
76
|
+
|
63
77
|
# Remote manifest
|
64
78
|
@ftp = remote_connection
|
65
79
|
manif = remote_manifest(@ftp)
|
@@ -89,6 +103,28 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
89
103
|
|
90
104
|
private
|
91
105
|
|
106
|
+
def list_local
|
107
|
+
local_manif = local_manifest
|
108
|
+
raise "Local manifest not found." unless local_manif
|
109
|
+
databases =
|
110
|
+
if %i[recommended test].include?(cli[:database])
|
111
|
+
local_manif[:databases].keys
|
112
|
+
else
|
113
|
+
[cli[:database].to_sym]
|
114
|
+
end
|
115
|
+
cli.table(
|
116
|
+
%w[database version genomes updated path],
|
117
|
+
databases.map do |db|
|
118
|
+
path = File.join(cli[:local], db.to_s)
|
119
|
+
p = MiGA::Project.load(path)
|
120
|
+
if p
|
121
|
+
md = p.metadata
|
122
|
+
[db, md[:release], md[:datasets].count, md[:updated], p.path]
|
123
|
+
end
|
124
|
+
end.compact
|
125
|
+
)
|
126
|
+
end
|
127
|
+
|
92
128
|
def remote_connection
|
93
129
|
cli.say "Connecting to '#{cli[:host]}'"
|
94
130
|
MiGA::MiGA.remote_connection(cli[:host])
|
@@ -113,6 +149,14 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
113
149
|
MiGA::Json.parse(file)
|
114
150
|
end
|
115
151
|
|
152
|
+
def local_manifest_file
|
153
|
+
File.join(cli[:local], '_local_manif.json')
|
154
|
+
end
|
155
|
+
|
156
|
+
def local_manifest
|
157
|
+
MiGA::Json.parse(local_manifest_file) if File.exist?(local_manifest_file)
|
158
|
+
end
|
159
|
+
|
116
160
|
def db_requested(manif)
|
117
161
|
[:recommended, :test].each do |n|
|
118
162
|
if cli[:database] == n
|
@@ -208,15 +252,13 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
|
|
208
252
|
|
209
253
|
def register_database(manif, db, ver)
|
210
254
|
cli.say "Registering database locally"
|
211
|
-
|
212
|
-
reg = File.exist?(local_manif) ? MiGA::Json.parse(local_manif) : {}
|
213
|
-
reg[:last_update] = Time.now.to_s
|
255
|
+
reg = local_manifest.merge(last_update: Time.now.to_s)
|
214
256
|
reg[:databases] ||= {}
|
215
257
|
reg[:databases][cli[:database]] ||= {}
|
216
258
|
reg[:databases][cli[:database]][:manif_last_update] = manif[:last_update]
|
217
259
|
reg[:databases][cli[:database]][:manif_host] = manif[:host]
|
218
|
-
|
260
|
+
reg[:databases][cli[:database]].merge! db
|
219
261
|
reg[:databases][cli[:database]][:local_version] = ver
|
220
|
-
MiGA::Json.generate(reg,
|
262
|
+
MiGA::Json.generate(reg, local_manifest_file)
|
221
263
|
end
|
222
264
|
end
|
@@ -1,5 +1,19 @@
|
|
1
1
|
|
2
2
|
module MiGA::Cli::Action::Doctor::Operations
|
3
|
+
##
|
4
|
+
# Perform refdb operation with MiGA::Cli +cli+
|
5
|
+
def check_refdb(cli)
|
6
|
+
cli.say 'Checking index format of reference database'
|
7
|
+
ref_dbs = File.join(ENV['MIGA_HOME'], '.miga_db')
|
8
|
+
manif_file = File.join(ref_dbs, '_local_manif.json')
|
9
|
+
return unless File.size?(manif_file)
|
10
|
+
|
11
|
+
MiGA::Json.parse(manif_file)[:databases]&.keys&.each do |db|
|
12
|
+
p = MiGA::Project.load(File.join(ref_dbs, db.to_s))
|
13
|
+
md = p&.metadata
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
3
17
|
##
|
4
18
|
# Perform status operation with MiGA::Cli +cli+
|
5
19
|
def check_status(cli)
|
@@ -46,16 +46,18 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
46
46
|
end
|
47
47
|
|
48
48
|
@@OPERATIONS = {
|
49
|
+
# TODO Implement this check:
|
50
|
+
# refdb: ['ref-db', 'Check index format of reference databases'],
|
49
51
|
status: ['status', 'Update metadata status of all datasets'],
|
50
|
-
db:
|
51
|
-
bidir:
|
52
|
-
dist:
|
53
|
-
files:
|
54
|
-
cds:
|
55
|
-
ess:
|
56
|
-
mts:
|
57
|
-
start:
|
58
|
-
tax:
|
52
|
+
db: ['databases', 'Check integrity of database files'],
|
53
|
+
bidir: ['bidirectional', 'Check distances are bidirectional'],
|
54
|
+
dist: ['distances', 'Check distance summary tables'],
|
55
|
+
files: ['files', 'Check for outdated files'],
|
56
|
+
cds: ['cds', 'Check for gzipped genes and proteins'],
|
57
|
+
ess: ['essential-genes', 'Check for outdated essential genes'],
|
58
|
+
mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
|
59
|
+
start: ['start', 'Check for lingering .start files'],
|
60
|
+
tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
|
59
61
|
}
|
60
62
|
|
61
63
|
class << self
|
data/lib/miga/cli/action/rm.rb
CHANGED
@@ -7,9 +7,9 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
|
|
7
7
|
def parse_cli
|
8
8
|
cli.defaults = { remove: false }
|
9
9
|
cli.parse do |opt|
|
10
|
-
cli.opt_object(opt)
|
10
|
+
cli.opt_object(opt, %i[project dataset_opt result_opt])
|
11
11
|
opt.on(
|
12
|
-
'-
|
12
|
+
'-R', '--remove',
|
13
13
|
'Also remove all associated files',
|
14
14
|
'By default, only unlinks from metadata'
|
15
15
|
) { |v| cli[:remove] = v }
|
@@ -17,8 +17,13 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def perform
|
20
|
-
|
21
|
-
|
22
|
-
d
|
20
|
+
if r = cli.load_result
|
21
|
+
cli[:remove] ? r.remove! : r.unlink
|
22
|
+
elsif d = cli.load_dataset
|
23
|
+
cli.load_project.unlink_dataset(d.name)
|
24
|
+
d.remove! if cli[:remove]
|
25
|
+
else
|
26
|
+
raise "You must define one of --result or --dataset"
|
27
|
+
end
|
23
28
|
end
|
24
29
|
end
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -21,6 +21,10 @@ module MiGA::Cli::Action::Wf
|
|
21
21
|
'-o', '--out_dir PATH',
|
22
22
|
'(Mandatory) Directory to be created with all output data'
|
23
23
|
) { |v| cli[:outdir] = v }
|
24
|
+
opt.on(
|
25
|
+
'-P', '--project PATH',
|
26
|
+
'::HIDE::' # Applying the principle of least surprise, alias of -o
|
27
|
+
) { |v| cli[:outdir] = v }
|
24
28
|
opt.separator ''
|
25
29
|
opt.separator " FILES...: #{files_desc}"
|
26
30
|
opt.separator ''
|
data/lib/miga/cli/base.rb
CHANGED
@@ -2,60 +2,61 @@
|
|
2
2
|
|
3
3
|
module MiGA::Cli::Base
|
4
4
|
@@TASK_DESC = {
|
5
|
-
generic:
|
5
|
+
generic: 'MiGA: The Microbial Genomes Atlas',
|
6
6
|
# Workflows
|
7
|
-
quality_wf:
|
8
|
-
derep_wf:
|
7
|
+
quality_wf: 'Evaluate the quality of input genomes',
|
8
|
+
derep_wf: 'Dereplicate a collection of input genomes',
|
9
9
|
classify_wf: 'Classify input genomes against a reference database',
|
10
|
-
preproc_wf:
|
11
|
-
index_wf:
|
10
|
+
preproc_wf: 'Preprocess input genomes or metagenomes',
|
11
|
+
index_wf: 'Generate distance indexing of input genomes',
|
12
12
|
# Projects
|
13
|
-
new:
|
14
|
-
about:
|
15
|
-
doctor:
|
16
|
-
|
17
|
-
browse: 'Explore a project locally using a web browser',
|
13
|
+
new: 'Create an empty MiGA project',
|
14
|
+
about: 'Display information about a MiGA project',
|
15
|
+
doctor: 'Perform consistency checks on a MiGA project',
|
16
|
+
browse: 'Explore a project locally using a web browser',
|
18
17
|
# Datasets
|
19
|
-
add:
|
20
|
-
get:
|
21
|
-
ncbi_get:
|
22
|
-
gtdb_get:
|
23
|
-
seqcode_get: 'Download all type genomes from SeqCode
|
24
|
-
rm:
|
25
|
-
find:
|
26
|
-
ln:
|
27
|
-
ls:
|
28
|
-
archive:
|
18
|
+
add: 'Create a dataset in a MiGA project',
|
19
|
+
get: 'Download a dataset from public databases into a MiGA project',
|
20
|
+
ncbi_get: 'Download the genomes of a taxon from NCBI to a MiGA project',
|
21
|
+
gtdb_get: 'Download the genomes of a taxon from GTDB to a MiGA project',
|
22
|
+
seqcode_get: 'Download all type genomes from SeqCode to a MiGA project',
|
23
|
+
rm: 'Remove a dataset from a MiGA project',
|
24
|
+
find: 'Find unregistered datasets based on result files',
|
25
|
+
ln: 'Link datasets (incl. results) from one project to another',
|
26
|
+
ls: 'List all registered datasets in a MiGA project',
|
27
|
+
archive: 'Generate a tar-ball with all files from select datasets',
|
29
28
|
# Results
|
30
|
-
add_result:
|
31
|
-
stats:
|
32
|
-
files:
|
33
|
-
run:
|
34
|
-
summary:
|
35
|
-
next_step:
|
29
|
+
add_result: 'Register a result',
|
30
|
+
stats: 'Extract statistics for the given result',
|
31
|
+
files: 'List registered files from a dataset or project',
|
32
|
+
run: 'Execute locally one step analysis producing the given result',
|
33
|
+
summary: 'Generate a summary table for the statistics of all datasets',
|
34
|
+
next_step: 'Return the next task to run in a dataset or project',
|
36
35
|
# Objects (Datasets or Projects)
|
37
|
-
edit:
|
38
|
-
option:
|
39
|
-
touch:
|
36
|
+
edit: 'Edit the metadata of a dataset or project',
|
37
|
+
option: 'Get or set options of a dataset or project',
|
38
|
+
touch: 'Change the "last modified" time to now without changes',
|
40
39
|
# System
|
41
|
-
init:
|
42
|
-
daemon:
|
43
|
-
lair:
|
44
|
-
|
45
|
-
|
46
|
-
|
40
|
+
init: 'Initialize MiGA to process new projects',
|
41
|
+
daemon: 'Control the daemon of a MiGA project',
|
42
|
+
lair: 'Control groups of daemons for several MiGA projects',
|
43
|
+
db: 'Download a pre-indexed database',
|
44
|
+
date: 'Return the current date in standard MiGA format',
|
45
|
+
console: 'Open an IRB console with MiGA',
|
46
|
+
env: 'Shell code to load MiGA environment',
|
47
47
|
# Taxonomy
|
48
|
-
tax_set:
|
49
|
-
tax_test:
|
50
|
-
tax_index:
|
51
|
-
tax_dist:
|
48
|
+
tax_set: 'Register taxonomic information for datasets',
|
49
|
+
tax_test: 'Return test of taxonomic distributions for query datasets',
|
50
|
+
tax_index: 'Create a taxonomy-indexed list of the datasets',
|
51
|
+
tax_dist: 'Estimate distributions of distance by taxonomy',
|
52
52
|
}
|
53
53
|
|
54
54
|
@@TASK_ALIAS = {
|
55
55
|
# Projects
|
56
56
|
create_project: :new,
|
57
57
|
project_info: :about,
|
58
|
-
download: :
|
58
|
+
download: :db,
|
59
|
+
get_db: :db,
|
59
60
|
# Datasets
|
60
61
|
create_dataset: :add,
|
61
62
|
download_dataset: :get,
|
data/lib/miga/taxonomy.rb
CHANGED
@@ -137,6 +137,12 @@ class MiGA::Taxonomy < MiGA::MiGA
|
|
137
137
|
self[:ns]
|
138
138
|
end
|
139
139
|
|
140
|
+
##
|
141
|
+
# Domain of the taxonomy (a String) or +nil+
|
142
|
+
def domain
|
143
|
+
self[:d]
|
144
|
+
end
|
145
|
+
|
140
146
|
##
|
141
147
|
# Get the most general rank as a two-entry Array (rank and value).
|
142
148
|
# If +force_ranks+ is true, it always returns the value for domain (d)
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 5, 0].freeze
|
15
|
+
VERSION = [1.3, 6, 1].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2023,
|
23
|
+
VERSION_DATE = Date.new(2023, 5, 1)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/aai_distances.bash
CHANGED
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
|
|
10
10
|
miga_start_project_step "$DIR"
|
11
11
|
|
12
12
|
# Extract values
|
13
|
-
|
14
|
-
SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
13
|
+
function foreach_database_aai {
|
14
|
+
local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
|
15
|
+
local k=0
|
16
|
+
while [[ -n ${DS[$k]} ]] ; do
|
17
|
+
echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
|
18
|
+
let k=$k+1
|
20
19
|
done
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
}
|
21
|
+
|
22
|
+
function aai_tsv {
|
23
|
+
DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
|
24
|
+
echo "a b value sd n omega" | tr " " "\\t"
|
25
|
+
if [[ ${#DS[@]} -gt 40000 ]] ; then
|
26
|
+
# Use comparisons in strictly one direction only for huge projects
|
27
|
+
foreach_database_aai \
|
28
|
+
| awk -F"\t" 'BEGIN { OFS="\t" }
|
29
|
+
{ if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
|
30
|
+
| sort -k 1,2 -u
|
31
|
+
else
|
32
|
+
foreach_database "$SQL"
|
33
|
+
fi
|
34
|
+
}
|
35
|
+
|
36
|
+
rm -f "miga-project.txt"
|
37
|
+
aai_tsv | gzip -9c > "miga-project.txt.gz"
|
30
38
|
|
31
39
|
# R-ify
|
32
40
|
cat <<R | R --vanilla
|
data/scripts/distances.bash
CHANGED
@@ -9,7 +9,7 @@ cd "$PROJECT/data/09.distances"
|
|
9
9
|
# Initialize
|
10
10
|
miga date > "$DATASET.start"
|
11
11
|
|
12
|
-
# Check quality
|
12
|
+
# Check quality
|
13
13
|
miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
|
14
14
|
inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
|
15
15
|
[[ "$inactive" == "true" ]] && exit
|
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
|
|
12
12
|
[[ -s "$FAA" ]] || FAA="${FAA}.gz"
|
13
13
|
|
14
14
|
# Check if there are any proteins
|
15
|
-
if [[ ! -s $FAA ]] ; then
|
15
|
+
if [[ ! -s "$FAA" ]] ; then
|
16
16
|
echo Empty protein set, bypassing essential genes
|
17
17
|
rm "${DATASET}.start"
|
18
18
|
miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
|
@@ -35,13 +35,31 @@ HMM.essential.rb \
|
|
35
35
|
-t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
|
36
36
|
> "${DATASET}.ess/log"
|
37
37
|
|
38
|
-
# Index for FastAAI
|
38
|
+
# Index for FastAAI and classify (if needed and possible)
|
39
39
|
NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
|
40
40
|
| wc -l | awk '{print $1}')
|
41
|
-
[[ "$NOMULTI" -eq "1" ]]
|
41
|
+
if [[ "$NOMULTI" -eq "1" ]] ; then
|
42
42
|
python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
|
43
43
|
--protein "$FAA" --output_crystal "${DATASET}.crystal" \
|
44
44
|
--compress
|
45
|
+
|
46
|
+
# Classify
|
47
|
+
DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
|
48
|
+
if [[ "$DOMAIN" == "?" ]] ; then
|
49
|
+
REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
|
50
|
+
echo "Phylum-level classification against $REF_PROJ"
|
51
|
+
if [[ -n "$REF_PROJ" ]] ; then
|
52
|
+
cp "${DATASET}.start" "${DATASET}.start.bak"
|
53
|
+
miga date > "${DATASET}.done"
|
54
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
55
|
+
ruby -I "$MIGA/lib" \
|
56
|
+
"$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
|
57
|
+
run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
|
58
|
+
mv "${DATASET}.start.bak" "${DATASET}.start"
|
59
|
+
rm "${DATASET}.done" "${DATASET}.json"
|
60
|
+
fi
|
61
|
+
fi
|
62
|
+
fi
|
45
63
|
|
46
64
|
# Reduce files
|
47
65
|
if exists "$DATASET".ess/*.faa ; then
|
data/utils/distance/database.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'miga/sqlite'
|
2
2
|
|
3
3
|
module MiGA::DistanceRunner::Database
|
4
4
|
##
|
@@ -11,6 +11,8 @@ module MiGA::DistanceRunner::Database
|
|
11
11
|
{ haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
|
12
12
|
@db_counts[m] = 0
|
13
13
|
@dbs[m] = for_ref ? ref_db(m) : query_db(m)
|
14
|
+
@tmp_dbs[m] = tmp_file("#{m}.db")
|
15
|
+
|
14
16
|
# Remove if corrupt
|
15
17
|
if File.size?(dbs[m])
|
16
18
|
begin
|
@@ -21,9 +23,12 @@ module MiGA::DistanceRunner::Database
|
|
21
23
|
FileUtils.rm dbs[m]
|
22
24
|
end
|
23
25
|
end
|
24
|
-
|
25
|
-
|
26
|
-
|
26
|
+
|
27
|
+
# Initialize if it doesn't exist, copy otherwise
|
28
|
+
if File.size? dbs[m]
|
29
|
+
FileUtils.cp(dbs[m], tmp_dbs[m])
|
30
|
+
else
|
31
|
+
SQLite3::Database.new(tmp_dbs[m]) do |conn|
|
27
32
|
conn.execute <<~SQL
|
28
33
|
create table if not exists #{t}(
|
29
34
|
seq1 varchar(256), seq2 varchar(256),
|
@@ -31,10 +36,8 @@ module MiGA::DistanceRunner::Database
|
|
31
36
|
)
|
32
37
|
SQL
|
33
38
|
end
|
39
|
+
FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
|
34
40
|
end
|
35
|
-
# Copy over to (local) temporals
|
36
|
-
@tmp_dbs[m] = tmp_file("#{m}.db")
|
37
|
-
FileUtils.cp(dbs[m], tmp_dbs[m])
|
38
41
|
end
|
39
42
|
end
|
40
43
|
|
@@ -157,6 +160,20 @@ module MiGA::DistanceRunner::Database
|
|
157
160
|
conn.execute(sql).each { |row| data[row.shift] = row }
|
158
161
|
end
|
159
162
|
data
|
163
|
+
rescue => e
|
164
|
+
$stderr.puts "Database file: #{db}" if db ||= nil
|
165
|
+
raise e
|
166
|
+
end
|
167
|
+
|
168
|
+
##
|
169
|
+
# Retrieve the name and AAI of the closest relative from the AAI database
|
170
|
+
def closest_relative
|
171
|
+
db = tmp_dbs[:aai]
|
172
|
+
sql = 'select seq2, aai from aai order by aai desc limit 1'
|
173
|
+
MiGA::SQLite.new(db).run(sql).first
|
174
|
+
rescue => e
|
175
|
+
$stderr.puts "Database file: #{db}" if db ||= nil
|
176
|
+
raise e
|
160
177
|
end
|
161
178
|
|
162
179
|
##
|
data/utils/distance/pipeline.rb
CHANGED
@@ -72,22 +72,13 @@ module MiGA::DistanceRunner::Pipeline
|
|
72
72
|
$stderr.puts "Testing taxonomy | opts = #{opts}"
|
73
73
|
# Get taxonomy of closest relative
|
74
74
|
from_ref_project = (project != ref_project)
|
75
|
-
|
76
|
-
from_ref_project ?
|
77
|
-
File.expand_path('data/09.distances/05.taxonomy', project.path) :
|
78
|
-
home
|
79
|
-
Dir.mkdir res_dir unless Dir.exist? res_dir
|
80
|
-
File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
|
81
|
-
fh.puts Time.now.to_s
|
82
|
-
end
|
83
|
-
dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
|
84
|
-
cr = dataset.closest_relatives(1, from_ref_project)
|
75
|
+
cr = closest_relative
|
85
76
|
return if cr.nil? or cr.empty?
|
86
77
|
|
87
|
-
tax = ref_project.dataset(cr[0]
|
78
|
+
tax = ref_project.dataset(cr[0]).metadata[:tax] || {}
|
88
79
|
|
89
80
|
# Run the test for each rank
|
90
|
-
tax_test = MiGA::TaxDist.aai_pvalues(cr[
|
81
|
+
tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
|
91
82
|
r = tax_test.map do |k, v|
|
92
83
|
sig = ''
|
93
84
|
[0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
|
@@ -95,12 +86,14 @@ module MiGA::DistanceRunner::Pipeline
|
|
95
86
|
end
|
96
87
|
|
97
88
|
# Save test
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
89
|
+
unless opts[:only_domain]
|
90
|
+
File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
|
91
|
+
fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
|
92
|
+
fh.puts ''
|
93
|
+
fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
|
94
|
+
fh.puts ''
|
95
|
+
fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
|
96
|
+
end
|
104
97
|
end
|
105
98
|
return r
|
106
99
|
end
|
@@ -115,6 +108,7 @@ module MiGA::DistanceRunner::Pipeline
|
|
115
108
|
.select { |i| i[1] != '?' && i[2] <= pval }
|
116
109
|
.map { |i| i[0, 2].join(':') }
|
117
110
|
dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
|
111
|
+
$stderr.puts " > #{dataset.metadata[:tax]}"
|
118
112
|
dataset.save
|
119
113
|
end
|
120
114
|
end
|
data/utils/distance/runner.rb
CHANGED
@@ -12,8 +12,9 @@ class MiGA::DistanceRunner
|
|
12
12
|
@home = File.expand_path('data/09.distances', project.path)
|
13
13
|
|
14
14
|
# Default opts
|
15
|
-
if opts[:run_taxonomy] &&
|
16
|
-
|
15
|
+
if opts[:run_taxonomy] &&
|
16
|
+
(opts[:ref_project] || project.option(:ref_project))
|
17
|
+
ref_path = opts[:ref_project] || project.option(:ref_project)
|
17
18
|
@home = File.expand_path('05.taxonomy', @home)
|
18
19
|
@ref_project = MiGA::Project.load(ref_path)
|
19
20
|
raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
|
@@ -73,48 +74,55 @@ class MiGA::DistanceRunner
|
|
73
74
|
# Initialize the databases
|
74
75
|
initialize_dbs! false
|
75
76
|
distances_by_request(tsk[1])
|
77
|
+
|
76
78
|
# Calculate the classification-informed AAI/ANI traverse
|
77
|
-
|
78
|
-
fh = File.open(
|
79
|
+
tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
|
80
|
+
fh = File.open(tmp_results, 'w')
|
79
81
|
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
|
80
82
|
fh.close
|
81
83
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
File.
|
89
|
-
|
90
|
-
|
91
|
-
|
84
|
+
unless opts[:only_domain]
|
85
|
+
results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
|
86
|
+
FileUtils.move(tmp_results, results)
|
87
|
+
|
88
|
+
# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
|
89
|
+
par_dir = File.dirname(File.expand_path(classif, res.dir))
|
90
|
+
par = File.expand_path('miga-project.classif', par_dir)
|
91
|
+
closest = { dataset: nil, ani: 0.0 }
|
92
|
+
sbj_datasets = []
|
93
|
+
if File.size? par
|
94
|
+
File.open(par, 'r') do |fh|
|
95
|
+
fh.each_line do |ln|
|
96
|
+
r = ln.chomp.split("\t")
|
97
|
+
sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
|
98
|
+
end
|
92
99
|
end
|
100
|
+
ani = ani_after_aai(sbj_datasets, 80.0)
|
101
|
+
ani_max = ani.map(&:to_f).each_with_index.max
|
102
|
+
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
|
93
103
|
end
|
94
|
-
ani = ani_after_aai(sbj_datasets, 80.0)
|
95
|
-
ani_max = ani.map(&:to_f).each_with_index.max
|
96
|
-
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
|
97
|
-
end
|
98
104
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
105
|
+
# Calculate all the AAIs/ANIs against the closest ANI95-clade
|
106
|
+
# (if AAI > 80%)
|
107
|
+
cl_path = res.file_path :clades_ani95
|
108
|
+
if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
|
109
|
+
clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
|
110
|
+
sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
|
111
|
+
sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
|
112
|
+
ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
|
113
|
+
end
|
106
114
|
end
|
107
115
|
|
108
116
|
# Finalize
|
109
117
|
[:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
|
110
|
-
build_medoids_tree(tsk[1])
|
118
|
+
build_medoids_tree(tsk[1]) unless opts[:only_domain]
|
111
119
|
transfer_taxonomy(tax_test)
|
112
120
|
end
|
113
121
|
|
114
122
|
# Launch analysis for taxonomy jobs
|
115
123
|
def go_taxonomy!
|
116
124
|
$stderr.puts 'Launching taxonomy analysis'
|
117
|
-
return unless project.option(:ref_project)
|
125
|
+
return unless opts[:ref_project] || project.option(:ref_project)
|
118
126
|
|
119
127
|
go_query! # <- yeah, it's actually the same, just different ref_project
|
120
128
|
end
|
data/utils/distance/temporal.rb
CHANGED
@@ -41,6 +41,7 @@ module MiGA::DistanceRunner::Temporal
|
|
41
41
|
|
42
42
|
# Copies temporal databases back to the MiGA Project
|
43
43
|
def checkpoint!(metric)
|
44
|
+
return if opts[:only_domain]
|
44
45
|
$stderr.puts "Checkpoint (metric = #{metric})"
|
45
46
|
|
46
47
|
# This is simply to test database consistency before overwriting the
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.5.0
|
4
|
+
version: 1.3.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -152,6 +152,7 @@ files:
|
|
152
152
|
- lib/miga/cli/action/console.rb
|
153
153
|
- lib/miga/cli/action/daemon.rb
|
154
154
|
- lib/miga/cli/action/date.rb
|
155
|
+
- lib/miga/cli/action/db.rb
|
155
156
|
- lib/miga/cli/action/derep_wf.rb
|
156
157
|
- lib/miga/cli/action/doctor.rb
|
157
158
|
- lib/miga/cli/action/doctor/base.rb
|
@@ -168,7 +169,6 @@ files:
|
|
168
169
|
- lib/miga/cli/action/find.rb
|
169
170
|
- lib/miga/cli/action/generic.rb
|
170
171
|
- lib/miga/cli/action/get.rb
|
171
|
-
- lib/miga/cli/action/get_db.rb
|
172
172
|
- lib/miga/cli/action/gtdb_get.rb
|
173
173
|
- lib/miga/cli/action/index_wf.rb
|
174
174
|
- lib/miga/cli/action/init.rb
|