miga-base 1.3.4.3 → 1.3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '04760780c4ec69edaeb55949c2f15933c7d9ecc1c620bbeca28a01679f957db3'
4
- data.tar.gz: 885e191e8c77a7e117d1e22875f6c90615f7cc001d0215930fd690022a88fb0d
3
+ metadata.gz: a9035e8f6ccd641c75bedc38b5f78d3eb1815c51956d399728b1e862c2920cbb
4
+ data.tar.gz: 2ffdecb639083a3a40d2bab3a39f1df92cb22279314ede2a5a843b278c8be9ee
5
5
  SHA512:
6
- metadata.gz: 2b47739ea450c9217119ad61317559e7429f87b04531ad624555538885207ffbb054715d18f60757a41f5c7816cb3cdc06b4b81a378b509ac3c1518452ba02ec
7
- data.tar.gz: a3c08caf1d98ea5de2c7b137004c126a5b1541403cacd8ab88d0fa3da8a0617fe380448370f3e038bd70d3a4342b20d6476a94a5062c6aec9615838f694c5e6a
6
+ metadata.gz: 42db29082a3b8097f8ec6f55839202d79e8d00496d55bab8093e4355a26f9133249506266e7f0eb64acea18fb91885abc493a0fe92c81c0eef27ec11f08d3eff
7
+ data.tar.gz: 5b4ce9ae27ac4542e2af836176dde909f013461b9916f1d1f4cfc0831d6e494d5de76c57d9c7b64ebd87aefd21a2c79edf55a0192838b6e589be21e9e5a4ebb1
@@ -4,12 +4,12 @@
4
4
  require 'miga/cli/action'
5
5
  require 'digest/md5'
6
6
 
7
- class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
7
+ class MiGA::Cli::Action::Db < MiGA::Cli::Action
8
8
  def parse_cli
9
9
  cli.defaults = {
10
10
  database: :recommended,
11
11
  version: :latest,
12
- local: File.expand_path('.miga_db', ENV['MIGA_HOME']),
12
+ local: File.join(ENV['MIGA_HOME'], '.miga_db'),
13
13
  host: MiGA::MiGA.known_hosts(:miga_db),
14
14
  pb: true,
15
15
  reuse_archive: false,
@@ -40,6 +40,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
40
40
  '--list-versions',
41
41
  'List available versions of the database and exit'
42
42
  ) { |v| cli[:list_versions] = v }
43
+ opt.on(
44
+ '--list-local',
45
+ 'List only the versions of the local databases (if any) and exit'
46
+ ) { |v| cli[:list_local] = v }
43
47
  opt.on(
44
48
  '--reuse-archive',
45
49
  'Reuse a previously downloaded archive if available'
@@ -48,6 +52,10 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
48
52
  '--no-overwrite',
49
53
  'Exit without downloading if the target database already exists'
50
54
  ) { |v| cli[:overwrite] = v }
55
+ opt.on(
56
+ '--tab',
57
+ 'Return a tab-delimited table'
58
+ ) { |v| cli[:tabular] = v }
51
59
  opt.on('--no-progress', 'Supress progress bars') { |v| cli[:pb] = v }
52
60
  end
53
61
  end
@@ -60,6 +68,12 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
60
68
  return
61
69
  end
62
70
 
71
+ # If dealing with local checks only
72
+ if cli[:list_local]
73
+ list_local
74
+ return
75
+ end
76
+
63
77
  # Remote manifest
64
78
  @ftp = remote_connection
65
79
  manif = remote_manifest(@ftp)
@@ -89,6 +103,28 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
89
103
 
90
104
  private
91
105
 
106
+ def list_local
107
+ local_manif = local_manifest
108
+ raise "Local manifest not found." unless local_manif
109
+ databases =
110
+ if %i[recommended test].include?(cli[:database])
111
+ local_manif[:databases].keys
112
+ else
113
+ [cli[:database].to_sym]
114
+ end
115
+ cli.table(
116
+ %w[database version genomes updated path],
117
+ databases.map do |db|
118
+ path = File.join(cli[:local], db.to_s)
119
+ p = MiGA::Project.load(path)
120
+ if p
121
+ md = p.metadata
122
+ [db, md[:release], md[:datasets].count, md[:updated], p.path]
123
+ end
124
+ end.compact
125
+ )
126
+ end
127
+
92
128
  def remote_connection
93
129
  cli.say "Connecting to '#{cli[:host]}'"
94
130
  MiGA::MiGA.remote_connection(cli[:host])
@@ -113,6 +149,11 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
113
149
  MiGA::Json.parse(file)
114
150
  end
115
151
 
152
+ def local_manifest
153
+ file = File.join(cli[:local], '_local_manif.json')
154
+ MiGA::Json.parse(file) if File.exist?(file)
155
+ end
156
+
116
157
  def db_requested(manif)
117
158
  [:recommended, :test].each do |n|
118
159
  if cli[:database] == n
@@ -208,8 +249,7 @@ class MiGA::Cli::Action::GetDb < MiGA::Cli::Action
208
249
 
209
250
  def register_database(manif, db, ver)
210
251
  cli.say "Registering database locally"
211
- local_manif = File.expand_path('_local_manif.json', cli[:local])
212
- reg = File.exist?(local_manif) ? MiGA::Json.parse(local_manif) : {}
252
+ local_manif = local_manifest || {}
213
253
  reg[:last_update] = Time.now.to_s
214
254
  reg[:databases] ||= {}
215
255
  reg[:databases][cli[:database]] ||= {}
@@ -1,5 +1,19 @@
1
1
 
2
2
  module MiGA::Cli::Action::Doctor::Operations
3
+ ##
4
+ # Perform refdb operation with MiGA::Cli +cli+
5
+ def check_refdb(cli)
6
+ cli.say 'Checking index format of reference database'
7
+ ref_dbs = File.join(ENV['MIGA_HOME'], '.miga_db')
8
+ manif_file = File.join(ref_dbs, '_local_manif.json')
9
+ return unless File.size?(manif_file)
10
+
11
+ MiGA::Json.parse(manif_file)[:databases]&.keys&.each do |db|
12
+ p = MiGA::Project.load(File.join(ref_dbs, db.to_s))
13
+ md = p&.metadata
14
+ end
15
+ end
16
+
3
17
  ##
4
18
  # Perform status operation with MiGA::Cli +cli+
5
19
  def check_status(cli)
@@ -46,16 +46,18 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
46
46
  end
47
47
 
48
48
  @@OPERATIONS = {
49
+ # TODO Implement this check:
50
+ # refdb: ['ref-db', 'Check index format of reference databases'],
49
51
  status: ['status', 'Update metadata status of all datasets'],
50
- db: ['databases', 'Check integrity of database files'],
51
- bidir: ['bidirectional', 'Check distances are bidirectional'],
52
- dist: ['distances', 'Check distance summary tables'],
53
- files: ['files', 'Check for outdated files'],
54
- cds: ['cds', 'Check for gzipped genes and proteins'],
55
- ess: ['essential-genes', 'Check for outdated essential genes'],
56
- mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
57
- start: ['start', 'Check for lingering .start files'],
58
- tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
52
+ db: ['databases', 'Check integrity of database files'],
53
+ bidir: ['bidirectional', 'Check distances are bidirectional'],
54
+ dist: ['distances', 'Check distance summary tables'],
55
+ files: ['files', 'Check for outdated files'],
56
+ cds: ['cds', 'Check for gzipped genes and proteins'],
57
+ ess: ['essential-genes', 'Check for outdated essential genes'],
58
+ mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
59
+ start: ['start', 'Check for lingering .start files'],
60
+ tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
59
61
  }
60
62
 
61
63
  class << self
@@ -7,9 +7,9 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
7
7
  def parse_cli
8
8
  cli.defaults = { remove: false }
9
9
  cli.parse do |opt|
10
- cli.opt_object(opt)
10
+ cli.opt_object(opt, %i[project dataset_opt result_opt])
11
11
  opt.on(
12
- '-r', '--remove',
12
+ '-R', '--remove',
13
13
  'Also remove all associated files',
14
14
  'By default, only unlinks from metadata'
15
15
  ) { |v| cli[:remove] = v }
@@ -17,8 +17,13 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
17
17
  end
18
18
 
19
19
  def perform
20
- d = cli.load_dataset
21
- cli.load_project.unlink_dataset(d.name)
22
- d.remove! if cli[:remove]
20
+ if r = cli.load_result
21
+ cli[:remove] ? r.remove! : r.unlink
22
+ elsif d = cli.load_dataset
23
+ cli.load_project.unlink_dataset(d.name)
24
+ d.remove! if cli[:remove]
25
+ else
26
+ raise "You must define one of --result or --dataset"
27
+ end
23
28
  end
24
29
  end
@@ -21,6 +21,10 @@ module MiGA::Cli::Action::Wf
21
21
  '-o', '--out_dir PATH',
22
22
  '(Mandatory) Directory to be created with all output data'
23
23
  ) { |v| cli[:outdir] = v }
24
+ opt.on(
25
+ '-P', '--project PATH',
26
+ '::HIDE::' # Applying the principle of least surprise, alias of -o
27
+ ) { |v| cli[:outdir] = v }
24
28
  opt.separator ''
25
29
  opt.separator " FILES...: #{files_desc}"
26
30
  opt.separator ''
data/lib/miga/cli/base.rb CHANGED
@@ -2,60 +2,61 @@
2
2
 
3
3
  module MiGA::Cli::Base
4
4
  @@TASK_DESC = {
5
- generic: 'MiGA: The Microbial Genomes Atlas',
5
+ generic: 'MiGA: The Microbial Genomes Atlas',
6
6
  # Workflows
7
- quality_wf: 'Evaluate the quality of input genomes',
8
- derep_wf: 'Dereplicate a collection of input genomes',
7
+ quality_wf: 'Evaluate the quality of input genomes',
8
+ derep_wf: 'Dereplicate a collection of input genomes',
9
9
  classify_wf: 'Classify input genomes against a reference database',
10
- preproc_wf: 'Preprocess input genomes or metagenomes',
11
- index_wf: 'Generate distance indexing of input genomes',
10
+ preproc_wf: 'Preprocess input genomes or metagenomes',
11
+ index_wf: 'Generate distance indexing of input genomes',
12
12
  # Projects
13
- new: 'Create an empty MiGA project',
14
- about: 'Display information about a MiGA project',
15
- doctor: 'Perform consistency checks on a MiGA project',
16
- get_db: 'Download a pre-indexed database',
17
- browse: 'Explore a project locally using a web browser',
13
+ new: 'Create an empty MiGA project',
14
+ about: 'Display information about a MiGA project',
15
+ doctor: 'Perform consistency checks on a MiGA project',
16
+ browse: 'Explore a project locally using a web browser',
18
17
  # Datasets
19
- add: 'Create a dataset in a MiGA project',
20
- get: 'Download a dataset from public databases into a MiGA project',
21
- ncbi_get: 'Download all genomes in a taxon from NCBI into a MiGA project',
22
- gtdb_get: 'Download all genomes in a taxon from GTDB into a MiGA project',
23
- seqcode_get: 'Download all type genomes from SeqCode into a MiGA project',
24
- rm: 'Remove a dataset from a MiGA project',
25
- find: 'Find unregistered datasets based on result files',
26
- ln: 'Link datasets (including results) from one project to another',
27
- ls: 'List all registered datasets in a MiGA project',
28
- archive: 'Generate a tar-ball with all files from select datasets',
18
+ add: 'Create a dataset in a MiGA project',
19
+ get: 'Download a dataset from public databases into a MiGA project',
20
+ ncbi_get: 'Download the genomes of a taxon from NCBI to a MiGA project',
21
+ gtdb_get: 'Download the genomes of a taxon from GTDB to a MiGA project',
22
+ seqcode_get: 'Download all type genomes from SeqCode to a MiGA project',
23
+ rm: 'Remove a dataset from a MiGA project',
24
+ find: 'Find unregistered datasets based on result files',
25
+ ln: 'Link datasets (incl. results) from one project to another',
26
+ ls: 'List all registered datasets in a MiGA project',
27
+ archive: 'Generate a tar-ball with all files from select datasets',
29
28
  # Results
30
- add_result: 'Register a result',
31
- stats: 'Extract statistics for the given result',
32
- files: 'List registered files from the results of a dataset or project',
33
- run: 'Execute locally one step analysis producing the given result',
34
- summary: 'Generate a summary table for the statistics of all datasets',
35
- next_step: 'Return the next task to run in a dataset or project',
29
+ add_result: 'Register a result',
30
+ stats: 'Extract statistics for the given result',
31
+ files: 'List registered files from a dataset or project',
32
+ run: 'Execute locally one step analysis producing the given result',
33
+ summary: 'Generate a summary table for the statistics of all datasets',
34
+ next_step: 'Return the next task to run in a dataset or project',
36
35
  # Objects (Datasets or Projects)
37
- edit: 'Edit the metadata of a dataset or project',
38
- option: 'Get or set options of a dataset or project',
39
- touch: 'Change the "last modified" time to now without changes',
36
+ edit: 'Edit the metadata of a dataset or project',
37
+ option: 'Get or set options of a dataset or project',
38
+ touch: 'Change the "last modified" time to now without changes',
40
39
  # System
41
- init: 'Initialize MiGA to process new projects',
42
- daemon: 'Control the daemon of a MiGA project',
43
- lair: 'Control groups of daemons for several MiGA projects',
44
- date: 'Return the current date in standard MiGA format',
45
- console: 'Open an IRB console with MiGA',
46
- env: 'Shell code to load MiGA environment',
40
+ init: 'Initialize MiGA to process new projects',
41
+ daemon: 'Control the daemon of a MiGA project',
42
+ lair: 'Control groups of daemons for several MiGA projects',
43
+ db: 'Download a pre-indexed database',
44
+ date: 'Return the current date in standard MiGA format',
45
+ console: 'Open an IRB console with MiGA',
46
+ env: 'Shell code to load MiGA environment',
47
47
  # Taxonomy
48
- tax_set: 'Register taxonomic information for datasets',
49
- tax_test: 'Return test of taxonomic distributions for query datasets',
50
- tax_index: 'Create a taxonomy-indexed list of the datasets',
51
- tax_dist: 'Estimate distributions of distance by taxonomy',
48
+ tax_set: 'Register taxonomic information for datasets',
49
+ tax_test: 'Return test of taxonomic distributions for query datasets',
50
+ tax_index: 'Create a taxonomy-indexed list of the datasets',
51
+ tax_dist: 'Estimate distributions of distance by taxonomy',
52
52
  }
53
53
 
54
54
  @@TASK_ALIAS = {
55
55
  # Projects
56
56
  create_project: :new,
57
57
  project_info: :about,
58
- download: :get_db,
58
+ download: :db,
59
+ get_db: :db,
59
60
  # Datasets
60
61
  create_dataset: :add,
61
62
  download_dataset: :get,
@@ -141,12 +141,16 @@ module MiGA::Result::Stats
141
141
  # Determine qualitative range
142
142
  stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
143
143
  source.metadata[:quality] =
144
- case stats[:quality]
145
- when 80..100; :excellent
146
- when 50..80; :high
147
- when 20..50; :intermediate
148
- else; :low
144
+ if stats[:completeness][0] >= 90 && stats[:contamination][0] <= 5
145
+ :excellent # Finished or High-quality draft*
146
+ elsif stats[:completeness][0] >= 50 && stats[:contamination][0] <= 10
147
+ :high # Medium-quality draft*
148
+ elsif stats[:quality] >= 25
149
+ :intermediate # Low-quality draft* but sufficient for classification
150
+ else
151
+ :low # Low-quality draft* and insufficient for classification
149
152
  end
153
+ # * Bowers et al 2017, DOI: 10.1038/nbt.3893
150
154
  source.save
151
155
 
152
156
  # Inactivate low-quality datasets
data/lib/miga/taxonomy.rb CHANGED
@@ -137,6 +137,12 @@ class MiGA::Taxonomy < MiGA::MiGA
137
137
  self[:ns]
138
138
  end
139
139
 
140
+ ##
141
+ # Domain of the taxonomy (a String) or +nil+
142
+ def domain
143
+ self[:d]
144
+ end
145
+
140
146
  ##
141
147
  # Get the most general rank as a two-entry Array (rank and value).
142
148
  # If +force_ranks+ is true, it always returns the value for domain (d)
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 4, 3].freeze
15
+ VERSION = [1.3, 6, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem relese.
23
- VERSION_DATE = Date.new(2023, 4, 20)
23
+ VERSION_DATE = Date.new(2023, 4, 25)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
10
10
  miga_start_project_step "$DIR"
11
11
 
12
12
  # Extract values
13
- rm -f miga-project.txt
14
- SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
15
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
16
- (
17
- echo "a b value sd n omega" | tr " " "\\t"
18
- for i in $DS ; do
19
- echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
13
+ function foreach_database_aai {
14
+ local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
15
+ local k=0
16
+ while [[ -n ${DS[$k]} ]] ; do
17
+ echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
18
+ let k=$k+1
20
19
  done
21
- # The following block pipes retrieved data from all databases, reorganizes the
22
- # names in cannonical order, and removes repeats from the first two columns,
23
- # in order to keep only one result per pair. This is not being included into
24
- # production, but the code may be useful for extremely large databases.
25
- # | tee \
26
- # | awk -F"\t" \
27
- # 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
28
- # | sort -k 1,2 -u
29
- ) | gzip -9c > miga-project.txt.gz
20
+ }
21
+
22
+ function aai_tsv {
23
+ DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
24
+ echo "a b value sd n omega" | tr " " "\\t"
25
+ if [[ ${#DS[@]} -gt 40000 ]] ; then
26
+ # Use comparisons in strictly one direction only for huge projects
27
+ foreach_database_aai \
28
+ | awk -F"\t" 'BEGIN { OFS="\t" }
29
+ { if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
30
+ | sort -k 1,2 -u
31
+ else
32
+ foreach_database "$SQL"
33
+ fi
34
+ }
35
+
36
+ rm -f "miga-project.txt"
37
+ aai_tsv | gzip -9c > "miga-project.txt.gz"
30
38
 
31
39
  # R-ify
32
40
  cat <<R | R --vanilla
@@ -9,7 +9,7 @@ cd "$PROJECT/data/09.distances"
9
9
  # Initialize
10
10
  miga date > "$DATASET.start"
11
11
 
12
- # Check quality first
12
+ # Check quality
13
13
  miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
14
14
  inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
15
15
  [[ "$inactive" == "true" ]] && exit
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
12
12
  [[ -s "$FAA" ]] || FAA="${FAA}.gz"
13
13
 
14
14
  # Check if there are any proteins
15
- if [[ ! -s $FAA ]] ; then
15
+ if [[ ! -s "$FAA" ]] ; then
16
16
  echo Empty protein set, bypassing essential genes
17
17
  rm "${DATASET}.start"
18
18
  miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
@@ -35,13 +35,31 @@ HMM.essential.rb \
35
35
  -t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
36
36
  > "${DATASET}.ess/log"
37
37
 
38
- # Index for FastAAI
38
+ # Index for FastAAI and classify (if needed and possible)
39
39
  NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
40
40
  | wc -l | awk '{print $1}')
41
- [[ "$NOMULTI" -eq "1" ]] && \
41
+ if [[ "$NOMULTI" -eq "1" ]] ; then
42
42
  python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
43
43
  --protein "$FAA" --output_crystal "${DATASET}.crystal" \
44
44
  --compress
45
+
46
+ # Classify
47
+ DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
48
+ if [[ "$DOMAIN" == "?" ]] ; then
49
+ REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
50
+ echo "Phylum-level classification against $REF_PROJ"
51
+ if [[ -n "$REF_PROJ" ]] ; then
52
+ cp "${DATASET}.start" "${DATASET}.start.bak"
53
+ miga date > "${DATASET}.done"
54
+ miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
55
+ ruby -I "$MIGA/lib" \
56
+ "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
57
+ run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
58
+ mv "${DATASET}.start.bak" "${DATASET}.start"
59
+ rm "${DATASET}.done" "${DATASET}.json"
60
+ fi
61
+ fi
62
+ fi
45
63
 
46
64
  # Reduce files
47
65
  if exists "$DATASET".ess/*.faa ; then
@@ -1,4 +1,4 @@
1
- require 'sqlite3'
1
+ require 'miga/sqlite'
2
2
 
3
3
  module MiGA::DistanceRunner::Database
4
4
  ##
@@ -11,6 +11,8 @@ module MiGA::DistanceRunner::Database
11
11
  { haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
12
12
  @db_counts[m] = 0
13
13
  @dbs[m] = for_ref ? ref_db(m) : query_db(m)
14
+ @tmp_dbs[m] = tmp_file("#{m}.db")
15
+
14
16
  # Remove if corrupt
15
17
  if File.size?(dbs[m])
16
18
  begin
@@ -21,9 +23,12 @@ module MiGA::DistanceRunner::Database
21
23
  FileUtils.rm dbs[m]
22
24
  end
23
25
  end
24
- # Initialize if it doesn't exist
25
- unless File.size? dbs[m]
26
- SQLite3::Database.new(dbs[m]) do |conn|
26
+
27
+ # Initialize if it doesn't exist, copy otherwise
28
+ if File.size? dbs[m]
29
+ FileUtils.cp(dbs[m], tmp_dbs[m])
30
+ else
31
+ SQLite3::Database.new(tmp_dbs[m]) do |conn|
27
32
  conn.execute <<~SQL
28
33
  create table if not exists #{t}(
29
34
  seq1 varchar(256), seq2 varchar(256),
@@ -31,10 +36,8 @@ module MiGA::DistanceRunner::Database
31
36
  )
32
37
  SQL
33
38
  end
39
+ FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
34
40
  end
35
- # Copy over to (local) temporals
36
- @tmp_dbs[m] = tmp_file("#{m}.db")
37
- FileUtils.cp(dbs[m], tmp_dbs[m])
38
41
  end
39
42
  end
40
43
 
@@ -157,6 +160,20 @@ module MiGA::DistanceRunner::Database
157
160
  conn.execute(sql).each { |row| data[row.shift] = row }
158
161
  end
159
162
  data
163
+ rescue => e
164
+ $stderr.puts "Database file: #{db}" if db ||= nil
165
+ raise e
166
+ end
167
+
168
+ ##
169
+ # Retrieve the name and AAI of the closest relative from the AAI database
170
+ def closest_relative
171
+ db = tmp_dbs[:aai]
172
+ sql = 'select seq2, aai from aai order by aai desc limit 1'
173
+ MiGA::SQLite.new(db).run(sql).first
174
+ rescue => e
175
+ $stderr.puts "Database file: #{db}" if db ||= nil
176
+ raise e
160
177
  end
161
178
 
162
179
  ##
@@ -72,22 +72,13 @@ module MiGA::DistanceRunner::Pipeline
72
72
  $stderr.puts "Testing taxonomy | opts = #{opts}"
73
73
  # Get taxonomy of closest relative
74
74
  from_ref_project = (project != ref_project)
75
- res_dir =
76
- from_ref_project ?
77
- File.expand_path('data/09.distances/05.taxonomy', project.path) :
78
- home
79
- Dir.mkdir res_dir unless Dir.exist? res_dir
80
- File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
81
- fh.puts Time.now.to_s
82
- end
83
- dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
84
- cr = dataset.closest_relatives(1, from_ref_project)
75
+ cr = closest_relative
85
76
  return if cr.nil? or cr.empty?
86
77
 
87
- tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
78
+ tax = ref_project.dataset(cr[0]).metadata[:tax] || {}
88
79
 
89
80
  # Run the test for each rank
90
- tax_test = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax, engine: opts[:aai_p])
81
+ tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
91
82
  r = tax_test.map do |k, v|
92
83
  sig = ''
93
84
  [0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
@@ -95,12 +86,14 @@ module MiGA::DistanceRunner::Pipeline
95
86
  end
96
87
 
97
88
  # Save test
98
- File.open(File.expand_path("#{dataset.name}.intax.txt", home), 'w') do |fh|
99
- fh.puts "Closest relative: #{cr[0][0]} with AAI: #{cr[0][1]}."
100
- fh.puts ''
101
- fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
102
- fh.puts ''
103
- fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
89
+ unless opts[:only_domain]
90
+ File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
91
+ fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
92
+ fh.puts ''
93
+ fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
94
+ fh.puts ''
95
+ fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
96
+ end
104
97
  end
105
98
  return r
106
99
  end
@@ -115,6 +108,7 @@ module MiGA::DistanceRunner::Pipeline
115
108
  .select { |i| i[1] != '?' && i[2] <= pval }
116
109
  .map { |i| i[0, 2].join(':') }
117
110
  dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
111
+ $stderr.puts " > #{dataset.metadata[:tax]}"
118
112
  dataset.save
119
113
  end
120
114
  end
@@ -12,8 +12,9 @@ class MiGA::DistanceRunner
12
12
  @home = File.expand_path('data/09.distances', project.path)
13
13
 
14
14
  # Default opts
15
- if opts[:run_taxonomy] && project.option(:ref_project)
16
- ref_path = project.option(:ref_project)
15
+ if opts[:run_taxonomy] &&
16
+ (opts[:ref_project] || project.option(:ref_project))
17
+ ref_path = opts[:ref_project] || project.option(:ref_project)
17
18
  @home = File.expand_path('05.taxonomy', @home)
18
19
  @ref_project = MiGA::Project.load(ref_path)
19
20
  raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
@@ -73,48 +74,55 @@ class MiGA::DistanceRunner
73
74
  # Initialize the databases
74
75
  initialize_dbs! false
75
76
  distances_by_request(tsk[1])
77
+
76
78
  # Calculate the classification-informed AAI/ANI traverse
77
- results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
78
- fh = File.open(results, 'w')
79
+ tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
80
+ fh = File.open(tmp_results, 'w')
79
81
  classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
80
82
  fh.close
81
83
 
82
- # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
83
- par_dir = File.dirname(File.expand_path(classif, res.dir))
84
- par = File.expand_path('miga-project.classif', par_dir)
85
- closest = { dataset: nil, ani: 0.0 }
86
- sbj_datasets = []
87
- if File.size? par
88
- File.open(par, 'r') do |fh|
89
- fh.each_line do |ln|
90
- r = ln.chomp.split("\t")
91
- sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
84
+ unless opts[:only_domain]
85
+ results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
86
+ FileUtils.move(tmp_results, results)
87
+
88
+ # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
89
+ par_dir = File.dirname(File.expand_path(classif, res.dir))
90
+ par = File.expand_path('miga-project.classif', par_dir)
91
+ closest = { dataset: nil, ani: 0.0 }
92
+ sbj_datasets = []
93
+ if File.size? par
94
+ File.open(par, 'r') do |fh|
95
+ fh.each_line do |ln|
96
+ r = ln.chomp.split("\t")
97
+ sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
98
+ end
92
99
  end
100
+ ani = ani_after_aai(sbj_datasets, 80.0)
101
+ ani_max = ani.map(&:to_f).each_with_index.max
102
+ closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
93
103
  end
94
- ani = ani_after_aai(sbj_datasets, 80.0)
95
- ani_max = ani.map(&:to_f).each_with_index.max
96
- closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
97
- end
98
104
 
99
- # Calculate all the AAIs/ANIs against the closest ANI95-clade (if AAI > 80%)
100
- cl_path = res.file_path :clades_ani95
101
- if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
102
- clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
103
- sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
104
- sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
105
- ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
105
+ # Calculate all the AAIs/ANIs against the closest ANI95-clade
106
+ # (if AAI > 80%)
107
+ cl_path = res.file_path :clades_ani95
108
+ if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
109
+ clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
110
+ sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
111
+ sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
112
+ ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
113
+ end
106
114
  end
107
115
 
108
116
  # Finalize
109
117
  [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
110
- build_medoids_tree(tsk[1])
118
+ build_medoids_tree(tsk[1]) unless opts[:only_domain]
111
119
  transfer_taxonomy(tax_test)
112
120
  end
113
121
 
114
122
  # Launch analysis for taxonomy jobs
115
123
  def go_taxonomy!
116
124
  $stderr.puts 'Launching taxonomy analysis'
117
- return unless project.option(:ref_project)
125
+ return unless opts[:ref_project] || project.option(:ref_project)
118
126
 
119
127
  go_query! # <- yeah, it's actually the same, just different ref_project
120
128
  end
@@ -41,6 +41,7 @@ module MiGA::DistanceRunner::Temporal
41
41
 
42
42
  # Copies temporal databases back to the MiGA Project
43
43
  def checkpoint!(metric)
44
+ return if opts[:only_domain]
44
45
  $stderr.puts "Checkpoint (metric = #{metric})"
45
46
 
46
47
  # This is simply to test database consistency before overwriting the
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4.3
4
+ version: 1.3.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-20 00:00:00.000000000 Z
11
+ date: 2023-04-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -152,6 +152,7 @@ files:
152
152
  - lib/miga/cli/action/console.rb
153
153
  - lib/miga/cli/action/daemon.rb
154
154
  - lib/miga/cli/action/date.rb
155
+ - lib/miga/cli/action/db.rb
155
156
  - lib/miga/cli/action/derep_wf.rb
156
157
  - lib/miga/cli/action/doctor.rb
157
158
  - lib/miga/cli/action/doctor/base.rb
@@ -168,7 +169,6 @@ files:
168
169
  - lib/miga/cli/action/find.rb
169
170
  - lib/miga/cli/action/generic.rb
170
171
  - lib/miga/cli/action/get.rb
171
- - lib/miga/cli/action/get_db.rb
172
172
  - lib/miga/cli/action/gtdb_get.rb
173
173
  - lib/miga/cli/action/index_wf.rb
174
174
  - lib/miga/cli/action/init.rb