miga-base 1.3.5.0 → 1.3.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d17244b326441f224e4626b53702018fd0f3915d44e16e9be939a70dd86ceefa
4
- data.tar.gz: 8f6c7544ab57957dbfeb53230e56ba9e7eda37c7bab2fea538ee17d9f54fd9f7
3
+ metadata.gz: a9035e8f6ccd641c75bedc38b5f78d3eb1815c51956d399728b1e862c2920cbb
4
+ data.tar.gz: 2ffdecb639083a3a40d2bab3a39f1df92cb22279314ede2a5a843b278c8be9ee
5
5
  SHA512:
6
- metadata.gz: bd77962a43caa04c72d01ddbe251fb02650f0f003c55b2b13a44ac1affb77b52535e5dd5fd7713c387f5a7bafdd8839255f2cd8d82be8d4c025b562a7beabfbd
7
- data.tar.gz: b0cd50e25aa16ce2c64202daa5bbbb256e3f83742513ac984715f1b7c96f7297887856a2853ac6ec49dd43fe63b5c37cd96e83f8047ebcd32fdbc47addf892f7
6
+ metadata.gz: 42db29082a3b8097f8ec6f55839202d79e8d00496d55bab8093e4355a26f9133249506266e7f0eb64acea18fb91885abc493a0fe92c81c0eef27ec11f08d3eff
7
+ data.tar.gz: 5b4ce9ae27ac4542e2af836176dde909f013461b9916f1d1f4cfc0831d6e494d5de76c57d9c7b64ebd87aefd21a2c79edf55a0192838b6e589be21e9e5a4ebb1
@@ -4,12 +4,12 @@
4
4
  require 'miga/cli/action'
5
5
  require 'digest/md5'
6
6
 
7
- class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
7
+ class MiGA::Cli::Action::Db < MiGA::Cli::Action
8
8
  def parse_cli
9
9
  cli.defaults = {
10
10
  database: :recommended,
11
11
  version: :latest,
12
- local: File.expand_path('.miga_db', ENV['MIGA_HOME']),
12
+ local: File.join(ENV['MIGA_HOME'], '.miga_db'),
13
13
  host: MiGA::MiGA.known_hosts(:miga_db),
14
14
  pb: true,
15
15
  reuse_archive: false,
@@ -40,6 +40,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
40
40
  '--list-versions',
41
41
  'List available versions of the database and exit'
42
42
  ) { |v| cli[:list_versions] = v }
43
+ opt.on(
44
+ '--list-local',
45
+ 'List only the versions of the local databases (if any) and exit'
46
+ ) { |v| cli[:list_local] = v }
43
47
  opt.on(
44
48
  '--reuse-archive',
45
49
  'Reuse a previously downloaded archive if available'
@@ -48,6 +52,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
48
52
  '--no-overwrite',
49
53
  'Exit without downloading if the target database already exists'
50
54
  ) { |v| cli[:overwrite] = v }
55
+ opt.on(
56
+ '--tab',
57
+ 'Return a tab-delimited table'
58
+ ) { |v| cli[:tabular] = v }
51
59
  opt.on('--no-progress', 'Supress progress bars') { |v| cli[:pb] = v }
52
60
  end
53
61
  end
@@ -60,6 +68,12 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
60
68
  return
61
69
  end
62
70
 
71
+ # If dealing with local checks only
72
+ if cli[:list_local]
73
+ list_local
74
+ return
75
+ end
76
+
63
77
  # Remote manifest
64
78
  @ftp = remote_connection
65
79
  manif = remote_manifest(@ftp)
@@ -89,6 +103,28 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
89
103
 
90
104
  private
91
105
 
106
##
# Print a table of the locally available databases (one row per database
# whose project can be loaded from +cli[:local]+).
#
# When +cli[:database]+ is +:recommended+ or +:test+, every database
# registered in the local manifest is listed; otherwise only the requested
# database is checked. Databases whose project cannot be loaded are omitted.
#
# Raises a RuntimeError if there is no local manifest.
def list_local
  local_manif = local_manifest
  raise "Local manifest not found." unless local_manif

  databases =
    if %i[recommended test].include?(cli[:database])
      # A manifest without registered databases is valid (see
      # +register_database+, which creates the key on demand)
      (local_manif[:databases] || {}).keys
    else
      [cli[:database].to_sym]
    end

  rows = databases.map do |db|
    project = MiGA::Project.load(File.join(cli[:local], db.to_s))
    next unless project

    md = project.metadata
    [db, md[:release], md[:datasets].count, md[:updated], project.path]
  end

  cli.table(%w[database version genomes updated path], rows.compact)
end
127
+
92
128
  def remote_connection
93
129
  cli.say "Connecting to '#{cli[:host]}'"
94
130
  MiGA::MiGA.remote_connection(cli[:host])
@@ -113,6 +149,11 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
113
149
  MiGA::Json.parse(file)
114
150
  end
115
151
 
152
##
# Parsed contents of the local manifest (+_local_manif.json+ inside
# +cli[:local]+), or +nil+ if the file doesn't exist or is empty
def local_manifest
  file = File.join(cli[:local], '_local_manif.json')
  # File.size? (rather than File.exist?) so that an empty manifest is treated
  # as absent instead of being handed to the JSON parser
  MiGA::Json.parse(file) if File.size?(file)
end
156
+
116
157
  def db_requested(manif)
117
158
  [:recommended, :test].each do |n|
118
159
  if cli[:database] == n
@@ -208,8 +249,7 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
208
249
 
209
250
  def register_database(manif, db, ver)
210
251
  cli.say "Registering database locally"
211
- local_manif = File.expand_path('_local_manif.json', cli[:local])
212
- reg = File.exist?(local_manif) ? MiGA::Json.parse(local_manif) : {}
252
+ local_manif = local_manifest || {}
213
253
  reg[:last_update] = Time.now.to_s
214
254
  reg[:databases] ||= {}
215
255
  reg[:databases][cli[:database]] ||= {}
@@ -1,5 +1,19 @@
1
1
 
2
2
  module MiGA::Cli::Action::Doctor::Operations
3
##
# Perform refdb operation with MiGA::Cli +cli+
#
# Loads the project (and its metadata) of every database registered in the
# local manifest at +$MIGA_HOME/.miga_db/_local_manif.json+. Returns without
# doing anything else if +MIGA_HOME+ is not set or the manifest is missing or
# empty.
#
# NOTE(review): no actual check is performed on the loaded metadata yet; this
# looks like a placeholder for a pending implementation -- confirm.
def check_refdb(cli)
  cli.say 'Checking index format of reference database'
  miga_home = ENV['MIGA_HOME']
  return unless miga_home # File.join would raise TypeError on nil

  ref_dbs = File.join(miga_home, '.miga_db')
  manif_file = File.join(ref_dbs, '_local_manif.json')
  return unless File.size?(manif_file)

  MiGA::Json.parse(manif_file)[:databases]&.keys&.each do |db|
    MiGA::Project.load(File.join(ref_dbs, db.to_s))&.metadata
  end
end
16
+
3
17
  ##
4
18
  # Perform status operation with MiGA::Cli +cli+
5
19
  def check_status(cli)
@@ -46,16 +46,18 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
46
46
  end
47
47
 
48
48
  @@OPERATIONS = {
49
+ # TODO Implement this check:
50
+ # refdb: ['ref-db', 'Check index format of reference databases'],
49
51
  status: ['status', 'Update metadata status of all datasets'],
50
- db: ['databases', 'Check integrity of database files'],
51
- bidir: ['bidirectional', 'Check distances are bidirectional'],
52
- dist: ['distances', 'Check distance summary tables'],
53
- files: ['files', 'Check for outdated files'],
54
- cds: ['cds', 'Check for gzipped genes and proteins'],
55
- ess: ['essential-genes', 'Check for outdated essential genes'],
56
- mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
57
- start: ['start', 'Check for lingering .start files'],
58
- tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
52
+ db: ['databases', 'Check integrity of database files'],
53
+ bidir: ['bidirectional', 'Check distances are bidirectional'],
54
+ dist: ['distances', 'Check distance summary tables'],
55
+ files: ['files', 'Check for outdated files'],
56
+ cds: ['cds', 'Check for gzipped genes and proteins'],
57
+ ess: ['essential-genes', 'Check for outdated essential genes'],
58
+ mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
59
+ start: ['start', 'Check for lingering .start files'],
60
+ tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
59
61
  }
60
62
 
61
63
  class << self
@@ -7,9 +7,9 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
7
7
  def parse_cli
8
8
  cli.defaults = { remove: false }
9
9
  cli.parse do |opt|
10
- cli.opt_object(opt)
10
+ cli.opt_object(opt, %i[project dataset_opt result_opt])
11
11
  opt.on(
12
- '-r', '--remove',
12
+ '-R', '--remove',
13
13
  'Also remove all associated files',
14
14
  'By default, only unlinks from metadata'
15
15
  ) { |v| cli[:remove] = v }
@@ -17,8 +17,13 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
17
17
  end
18
18
 
19
19
  def perform
20
- d = cli.load_dataset
21
- cli.load_project.unlink_dataset(d.name)
22
- d.remove! if cli[:remove]
20
+ if r = cli.load_result
21
+ cli[:remove] ? r.remove! : r.unlink
22
+ elsif d = cli.load_dataset
23
+ cli.load_project.unlink_dataset(d.name)
24
+ d.remove! if cli[:remove]
25
+ else
26
+ raise "You must define one of --result or --dataset"
27
+ end
23
28
  end
24
29
  end
@@ -21,6 +21,10 @@ module MiGA::Cli::Action::Wf
21
21
  '-o', '--out_dir PATH',
22
22
  '(Mandatory) Directory to be created with all output data'
23
23
  ) { |v| cli[:outdir] = v }
24
+ opt.on(
25
+ '-P', '--project PATH',
26
+ '::HIDE::' # Applying the principle of least surprise, alias of -o
27
+ ) { |v| cli[:outdir] = v }
24
28
  opt.separator ''
25
29
  opt.separator " FILES...: #{files_desc}"
26
30
  opt.separator ''
data/lib/miga/cli/base.rb CHANGED
@@ -2,60 +2,61 @@
2
2
 
3
3
  module MiGA::Cli::Base
4
4
  @@TASK_DESC = {
5
- generic: 'MiGA: The Microbial Genomes Atlas',
5
+ generic: 'MiGA: The Microbial Genomes Atlas',
6
6
  # Workflows
7
- quality_wf: 'Evaluate the quality of input genomes',
8
- derep_wf: 'Dereplicate a collection of input genomes',
7
+ quality_wf: 'Evaluate the quality of input genomes',
8
+ derep_wf: 'Dereplicate a collection of input genomes',
9
9
  classify_wf: 'Classify input genomes against a reference database',
10
- preproc_wf: 'Preprocess input genomes or metagenomes',
11
- index_wf: 'Generate distance indexing of input genomes',
10
+ preproc_wf: 'Preprocess input genomes or metagenomes',
11
+ index_wf: 'Generate distance indexing of input genomes',
12
12
  # Projects
13
- new: 'Create an empty MiGA project',
14
- about: 'Display information about a MiGA project',
15
- doctor: 'Perform consistency checks on a MiGA project',
16
- get_db: 'Download a pre-indexed database',
17
- browse: 'Explore a project locally using a web browser',
13
+ new: 'Create an empty MiGA project',
14
+ about: 'Display information about a MiGA project',
15
+ doctor: 'Perform consistency checks on a MiGA project',
16
+ browse: 'Explore a project locally using a web browser',
18
17
  # Datasets
19
- add: 'Create a dataset in a MiGA project',
20
- get: 'Download a dataset from public databases into a MiGA project',
21
- ncbi_get: 'Download all genomes in a taxon from NCBI into a MiGA project',
22
- gtdb_get: 'Download all genomes in a taxon from GTDB into a MiGA project',
23
- seqcode_get: 'Download all type genomes from SeqCode into a MiGA project',
24
- rm: 'Remove a dataset from a MiGA project',
25
- find: 'Find unregistered datasets based on result files',
26
- ln: 'Link datasets (including results) from one project to another',
27
- ls: 'List all registered datasets in a MiGA project',
28
- archive: 'Generate a tar-ball with all files from select datasets',
18
+ add: 'Create a dataset in a MiGA project',
19
+ get: 'Download a dataset from public databases into a MiGA project',
20
+ ncbi_get: 'Download the genomes of a taxon from NCBI to a MiGA project',
21
+ gtdb_get: 'Download the genomes of a taxon from GTDB to a MiGA project',
22
+ seqcode_get: 'Download all type genomes from SeqCode to a MiGA project',
23
+ rm: 'Remove a dataset from a MiGA project',
24
+ find: 'Find unregistered datasets based on result files',
25
+ ln: 'Link datasets (incl. results) from one project to another',
26
+ ls: 'List all registered datasets in a MiGA project',
27
+ archive: 'Generate a tar-ball with all files from select datasets',
29
28
  # Results
30
- add_result: 'Register a result',
31
- stats: 'Extract statistics for the given result',
32
- files: 'List registered files from the results of a dataset or project',
33
- run: 'Execute locally one step analysis producing the given result',
34
- summary: 'Generate a summary table for the statistics of all datasets',
35
- next_step: 'Return the next task to run in a dataset or project',
29
+ add_result: 'Register a result',
30
+ stats: 'Extract statistics for the given result',
31
+ files: 'List registered files from a dataset or project',
32
+ run: 'Execute locally one step analysis producing the given result',
33
+ summary: 'Generate a summary table for the statistics of all datasets',
34
+ next_step: 'Return the next task to run in a dataset or project',
36
35
  # Objects (Datasets or Projects)
37
- edit: 'Edit the metadata of a dataset or project',
38
- option: 'Get or set options of a dataset or project',
39
- touch: 'Change the "last modified" time to now without changes',
36
+ edit: 'Edit the metadata of a dataset or project',
37
+ option: 'Get or set options of a dataset or project',
38
+ touch: 'Change the "last modified" time to now without changes',
40
39
  # System
41
- init: 'Initialize MiGA to process new projects',
42
- daemon: 'Control the daemon of a MiGA project',
43
- lair: 'Control groups of daemons for several MiGA projects',
44
- date: 'Return the current date in standard MiGA format',
45
- console: 'Open an IRB console with MiGA',
46
- env: 'Shell code to load MiGA environment',
40
+ init: 'Initialize MiGA to process new projects',
41
+ daemon: 'Control the daemon of a MiGA project',
42
+ lair: 'Control groups of daemons for several MiGA projects',
43
+ db: 'Download a pre-indexed database',
44
+ date: 'Return the current date in standard MiGA format',
45
+ console: 'Open an IRB console with MiGA',
46
+ env: 'Shell code to load MiGA environment',
47
47
  # Taxonomy
48
- tax_set: 'Register taxonomic information for datasets',
49
- tax_test: 'Return test of taxonomic distributions for query datasets',
50
- tax_index: 'Create a taxonomy-indexed list of the datasets',
51
- tax_dist: 'Estimate distributions of distance by taxonomy',
48
+ tax_set: 'Register taxonomic information for datasets',
49
+ tax_test: 'Return test of taxonomic distributions for query datasets',
50
+ tax_index: 'Create a taxonomy-indexed list of the datasets',
51
+ tax_dist: 'Estimate distributions of distance by taxonomy',
52
52
  }
53
53
 
54
54
  @@TASK_ALIAS = {
55
55
  # Projects
56
56
  create_project: :new,
57
57
  project_info: :about,
58
- download: :get_db,
58
+ download: :db,
59
+ get_db: :db,
59
60
  # Datasets
60
61
  create_dataset: :add,
61
62
  download_dataset: :get,
data/lib/miga/taxonomy.rb CHANGED
@@ -137,6 +137,12 @@ class MiGA::Taxonomy < MiGA::MiGA
137
137
  self[:ns]
138
138
  end
139
139
 
140
+ ##
141
+ # Domain of the taxonomy (a String) or +nil+, read from the +:d+ rank
142
+ def domain
143
+ self[:d]
144
+ end
145
+
140
146
  ##
141
147
  # Get the most general rank as a two-entry Array (rank and value).
142
148
  # If +force_ranks+ is true, it always returns the value for domain (d)
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 5, 0].freeze
15
+ VERSION = [1.3, 6, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 4, 21)
23
+ VERSION_DATE = Date.new(2023, 4, 25)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
10
10
  miga_start_project_step "$DIR"
11
11
 
12
12
  # Extract values
13
- rm -f miga-project.txt
14
- SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
15
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
16
- (
17
- echo "a b value sd n omega" | tr " " "\\t"
18
- for i in $DS ; do
19
- echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
13
+ function foreach_database_aai {
14
+ local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
15
+ local k=0
16
+ while [[ -n ${DS[$k]} ]] ; do
17
+ echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
18
+ let k=$k+1
20
19
  done
21
- # The following block pipes retrieved data from all databases, reorganizes the
22
- # names in cannonical order, and removes repeats from the first two columns,
23
- # in order to keep only one result per pair. This is not being included into
24
- # production, but the code may be useful for extremely large databases.
25
- # | tee \
26
- # | awk -F"\t" \
27
- # 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
28
- # | sort -k 1,2 -u
29
- ) | gzip -9c > miga-project.txt.gz
20
+ }
21
+
22
# Print the project-wide AAI table as tab-delimited text on STDOUT.
#
# Fills the global array DS with the output of `miga ls` (reference,
# non-multi, active datasets of $PROJECT), which foreach_database_aai reads,
# then prints a header line followed by the rows of every per-dataset
# database. For more than 40000 datasets, each pair is put in canonical order
# (first column <= second column) and repeated pairs are dropped.
function aai_tsv {
  # DS is intentionally global: foreach_database_aai iterates over it
  DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
  echo "a b value sd n omega" | tr " " "\\t"
  if [[ ${#DS[@]} -gt 40000 ]] ; then
    # Use comparisons in strictly one direction only for huge projects
    foreach_database_aai \
      | awk -F"\t" 'BEGIN { OFS="\t" }
          { if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
      | sort -k 1,2 -u
  else
    # The query is defined inside foreach_database_aai, so no argument is
    # passed here
    foreach_database_aai
  fi
}
35
+
36
+ rm -f "miga-project.txt"
37
+ aai_tsv | gzip -9c > "miga-project.txt.gz"
30
38
 
31
39
  # R-ify
32
40
  cat <<R | R --vanilla
@@ -9,7 +9,7 @@ cd "$PROJECT/data/09.distances"
9
9
  # Initialize
10
10
  miga date > "$DATASET.start"
11
11
 
12
- # Check quality first
12
+ # Check quality
13
13
  miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
14
14
  inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
15
15
  [[ "$inactive" == "true" ]] && exit
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
12
12
  [[ -s "$FAA" ]] || FAA="${FAA}.gz"
13
13
 
14
14
  # Check if there are any proteins
15
- if [[ ! -s $FAA ]] ; then
15
+ if [[ ! -s "$FAA" ]] ; then
16
16
  echo Empty protein set, bypassing essential genes
17
17
  rm "${DATASET}.start"
18
18
  miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
@@ -35,13 +35,31 @@ HMM.essential.rb \
35
35
  -t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
36
36
  > "${DATASET}.ess/log"
37
37
 
38
- # Index for FastAAI
38
+ # Index for FastAAI and classify (if needed and possible)
39
39
  NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
40
40
  | wc -l | awk '{print $1}')
41
- [[ "$NOMULTI" -eq "1" ]] && \
41
+ if [[ "$NOMULTI" -eq "1" ]] ; then
42
42
  python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
43
43
  --protein "$FAA" --output_crystal "${DATASET}.crystal" \
44
44
  --compress
45
+
46
+ # Classify
47
+ DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
48
+ if [[ "$DOMAIN" == "?" ]] ; then
49
+ REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
50
+ echo "Phylum-level classification against $REF_PROJ"
51
+ if [[ -n "$REF_PROJ" ]] ; then
52
+ cp "${DATASET}.start" "${DATASET}.start.bak"
53
+ miga date > "${DATASET}.done"
54
+ miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
55
+ ruby -I "$MIGA/lib" \
56
+ "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
57
+ run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
58
+ mv "${DATASET}.start.bak" "${DATASET}.start"
59
+ rm "${DATASET}.done" "${DATASET}.json"
60
+ fi
61
+ fi
62
+ fi
45
63
 
46
64
  # Reduce files
47
65
  if exists "$DATASET".ess/*.faa ; then
@@ -1,4 +1,4 @@
1
- require 'sqlite3'
1
+ require 'miga/sqlite'
2
2
 
3
3
  module MiGA::DistanceRunner::Database
4
4
  ##
@@ -11,6 +11,8 @@ module MiGA::DistanceRunner::Database
11
11
  { haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
12
12
  @db_counts[m] = 0
13
13
  @dbs[m] = for_ref ? ref_db(m) : query_db(m)
14
+ @tmp_dbs[m] = tmp_file("#{m}.db")
15
+
14
16
  # Remove if corrupt
15
17
  if File.size?(dbs[m])
16
18
  begin
@@ -21,9 +23,12 @@ module MiGA::DistanceRunner::Database
21
23
  FileUtils.rm dbs[m]
22
24
  end
23
25
  end
24
- # Initialize if it doesn't exist
25
- unless File.size? dbs[m]
26
- SQLite3::Database.new(dbs[m]) do |conn|
26
+
27
+ # Initialize if it doesn't exist, copy otherwise
28
+ if File.size? dbs[m]
29
+ FileUtils.cp(dbs[m], tmp_dbs[m])
30
+ else
31
+ SQLite3::Database.new(tmp_dbs[m]) do |conn|
27
32
  conn.execute <<~SQL
28
33
  create table if not exists #{t}(
29
34
  seq1 varchar(256), seq2 varchar(256),
@@ -31,10 +36,8 @@ module MiGA::DistanceRunner::Database
31
36
  )
32
37
  SQL
33
38
  end
39
+ FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
34
40
  end
35
- # Copy over to (local) temporals
36
- @tmp_dbs[m] = tmp_file("#{m}.db")
37
- FileUtils.cp(dbs[m], tmp_dbs[m])
38
41
  end
39
42
  end
40
43
 
@@ -157,6 +160,20 @@ module MiGA::DistanceRunner::Database
157
160
  conn.execute(sql).each { |row| data[row.shift] = row }
158
161
  end
159
162
  data
163
+ rescue => e
164
+ $stderr.puts "Database file: #{db}" if db ||= nil
165
+ raise e
166
+ end
167
+
168
##
# Query the temporary AAI database (+tmp_dbs[:aai]+) for the single entry
# with the highest AAI and return the first row of the result, holding the
# +seq2+ and +aai+ columns (+nil+ if the query yields no rows).
#
# If anything fails, the path of the database is reported on STDERR before
# the error is re-raised.
def closest_relative
  db = tmp_dbs[:aai]
  query = 'select seq2, aai from aai order by aai desc limit 1'
  hits = MiGA::SQLite.new(db).run(query)
  hits.first
rescue => e
  # +db+ is still nil if the failure happened while resolving the path
  $stderr.puts "Database file: #{db}" if db
  raise e
end
161
178
 
162
179
  ##
@@ -72,22 +72,13 @@ module MiGA::DistanceRunner::Pipeline
72
72
  $stderr.puts "Testing taxonomy | opts = #{opts}"
73
73
  # Get taxonomy of closest relative
74
74
  from_ref_project = (project != ref_project)
75
- res_dir =
76
- from_ref_project ?
77
- File.expand_path('data/09.distances/05.taxonomy', project.path) :
78
- home
79
- Dir.mkdir res_dir unless Dir.exist? res_dir
80
- File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
81
- fh.puts Time.now.to_s
82
- end
83
- dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
84
- cr = dataset.closest_relatives(1, from_ref_project)
75
+ cr = closest_relative
85
76
  return if cr.nil? or cr.empty?
86
77
 
87
- tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
78
+ tax = ref_project.dataset(cr[0]).metadata[:tax] || {}
88
79
 
89
80
  # Run the test for each rank
90
- tax_test = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax, engine: opts[:aai_p])
81
+ tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
91
82
  r = tax_test.map do |k, v|
92
83
  sig = ''
93
84
  [0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
@@ -95,12 +86,14 @@ module MiGA::DistanceRunner::Pipeline
95
86
  end
96
87
 
97
88
  # Save test
98
- File.open(File.expand_path("#{dataset.name}.intax.txt", home), 'w') do |fh|
99
- fh.puts "Closest relative: #{cr[0][0]} with AAI: #{cr[0][1]}."
100
- fh.puts ''
101
- fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
102
- fh.puts ''
103
- fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
89
+ unless opts[:only_domain]
90
+ File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
91
+ fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
92
+ fh.puts ''
93
+ fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
94
+ fh.puts ''
95
+ fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
96
+ end
104
97
  end
105
98
  return r
106
99
  end
@@ -115,6 +108,7 @@ module MiGA::DistanceRunner::Pipeline
115
108
  .select { |i| i[1] != '?' && i[2] <= pval }
116
109
  .map { |i| i[0, 2].join(':') }
117
110
  dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
111
+ $stderr.puts " > #{dataset.metadata[:tax]}"
118
112
  dataset.save
119
113
  end
120
114
  end
@@ -12,8 +12,9 @@ class MiGA::DistanceRunner
12
12
  @home = File.expand_path('data/09.distances', project.path)
13
13
 
14
14
  # Default opts
15
- if opts[:run_taxonomy] && project.option(:ref_project)
16
- ref_path = project.option(:ref_project)
15
+ if opts[:run_taxonomy] &&
16
+ (opts[:ref_project] || project.option(:ref_project))
17
+ ref_path = opts[:ref_project] || project.option(:ref_project)
17
18
  @home = File.expand_path('05.taxonomy', @home)
18
19
  @ref_project = MiGA::Project.load(ref_path)
19
20
  raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
@@ -73,48 +74,55 @@ class MiGA::DistanceRunner
73
74
  # Initialize the databases
74
75
  initialize_dbs! false
75
76
  distances_by_request(tsk[1])
77
+
76
78
  # Calculate the classification-informed AAI/ANI traverse
77
- results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
78
- fh = File.open(results, 'w')
79
+ tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
80
+ fh = File.open(tmp_results, 'w')
79
81
  classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
80
82
  fh.close
81
83
 
82
- # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
83
- par_dir = File.dirname(File.expand_path(classif, res.dir))
84
- par = File.expand_path('miga-project.classif', par_dir)
85
- closest = { dataset: nil, ani: 0.0 }
86
- sbj_datasets = []
87
- if File.size? par
88
- File.open(par, 'r') do |fh|
89
- fh.each_line do |ln|
90
- r = ln.chomp.split("\t")
91
- sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
84
+ unless opts[:only_domain]
85
+ results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
86
+ FileUtils.move(tmp_results, results)
87
+
88
+ # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
89
+ par_dir = File.dirname(File.expand_path(classif, res.dir))
90
+ par = File.expand_path('miga-project.classif', par_dir)
91
+ closest = { dataset: nil, ani: 0.0 }
92
+ sbj_datasets = []
93
+ if File.size? par
94
+ File.open(par, 'r') do |fh|
95
+ fh.each_line do |ln|
96
+ r = ln.chomp.split("\t")
97
+ sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
98
+ end
92
99
  end
100
+ ani = ani_after_aai(sbj_datasets, 80.0)
101
+ ani_max = ani.map(&:to_f).each_with_index.max
102
+ closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
93
103
  end
94
- ani = ani_after_aai(sbj_datasets, 80.0)
95
- ani_max = ani.map(&:to_f).each_with_index.max
96
- closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
97
- end
98
104
 
99
- # Calculate all the AAIs/ANIs against the closest ANI95-clade (if AAI > 80%)
100
- cl_path = res.file_path :clades_ani95
101
- if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
102
- clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
103
- sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
104
- sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
105
- ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
105
+ # Calculate all the AAIs/ANIs against the closest ANI95-clade
106
+ # (if AAI > 80%)
107
+ cl_path = res.file_path :clades_ani95
108
+ if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
109
+ clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
110
+ sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
111
+ sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
112
+ ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
113
+ end
106
114
  end
107
115
 
108
116
  # Finalize
109
117
  [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
110
- build_medoids_tree(tsk[1])
118
+ build_medoids_tree(tsk[1]) unless opts[:only_domain]
111
119
  transfer_taxonomy(tax_test)
112
120
  end
113
121
 
114
122
  # Launch analysis for taxonomy jobs
115
123
  def go_taxonomy!
116
124
  $stderr.puts 'Launching taxonomy analysis'
117
- return unless project.option(:ref_project)
125
+ return unless opts[:ref_project] || project.option(:ref_project)
118
126
 
119
127
  go_query! # <- yeah, it's actually the same, just different ref_project
120
128
  end
@@ -41,6 +41,7 @@ module MiGA::DistanceRunner::Temporal
41
41
 
42
42
  # Copies temporal databases back to the MiGA Project
43
43
  def checkpoint!(metric)
44
+ return if opts[:only_domain]
44
45
  $stderr.puts "Checkpoint (metric = #{metric})"
45
46
 
46
47
  # This is simply to test database consistency before overwriting the
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.5.0
4
+ version: 1.3.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-21 00:00:00.000000000 Z
11
+ date: 2023-04-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -152,6 +152,7 @@ files:
152
152
  - lib/miga/cli/action/console.rb
153
153
  - lib/miga/cli/action/daemon.rb
154
154
  - lib/miga/cli/action/date.rb
155
+ - lib/miga/cli/action/db.rb
155
156
  - lib/miga/cli/action/derep_wf.rb
156
157
  - lib/miga/cli/action/doctor.rb
157
158
  - lib/miga/cli/action/doctor/base.rb
@@ -168,7 +169,6 @@ files:
168
169
  - lib/miga/cli/action/find.rb
169
170
  - lib/miga/cli/action/generic.rb
170
171
  - lib/miga/cli/action/get.rb
171
- - lib/miga/cli/action/get_db.rb
172
172
  - lib/miga/cli/action/gtdb_get.rb
173
173
  - lib/miga/cli/action/index_wf.rb
174
174
  - lib/miga/cli/action/init.rb