miga-base 0.4.3.0 → 0.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,97 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
|
7
|
+
require 'miga/cli/action/wf'
|
8
|
+
include MiGA::Cli::Action::Wf
|
9
|
+
|
10
|
+
##
# Registers the defaults and command-line options of the classification
# workflow on +cli+. Shared workflow defaults and options are delegated to
# +default_opts_for_wf+ and +opts_for_wf+, which are defined outside this
# method.
def parse_cli
  default_opts_for_wf
  # With MIGA_HOME unset, File.expand_path resolves against the working dir
  db_home = File.expand_path('.miga_db', ENV['MIGA_HOME'])
  cli.defaults = {
    download: false, summaries: true, pvalue: 0.05, local: db_home
  }
  cli.parse do |parser|
    parser.on(
      '--download-db',
      'Attempt to download the reference database (all default options)',
      'It is recommended to use "miga get_db" separately instead'
    ) do |flag|
      cli[:download] = flag
    end
    parser.on(
      '-n', '--database STRING',
      'Name of the reference database to use',
      'By default, the first locally listed database is used'
    ) do |name|
      cli[:database] = name.to_sym
    end
    parser.on(
      '-p', '--p-value FLOAT', Float,
      'Maximum p-value to transfer taxonomy',
      "By default: #{cli[:pvalue]}"
    ) do |pvalue|
      cli[:pvalue] = pvalue
    end
    parser.on(
      '-l', '--local-dir PATH',
      "Local directory to store the database. By default: #{cli[:local]}"
    ) do |path|
      cli[:local] = path
    end
    parser.on(
      '--db-path STRING',
      'Path to the reference database to use, a fully indexed MiGA project',
      'If defined, --local-dir and --database are ignored'
    ) do |path|
      cli[:db_path] = path
    end
    # OptionParser yields +false+ for a "--no-" switch
    parser.on(
      '--no-summaries',
      'Do not generate intermediate step summaries'
    ) do |flag|
      cli[:summaries] = flag
    end
    opts_for_wf(parser, 'Input genome assemblies (nucleotides, FastA)')
  end
end
|
48
|
+
|
49
|
+
##
# Runs the classification workflow: locates the reference database, creates
# the project with the project-wide steps listed below switched off, runs the
# daemon, writes the summaries and the classification table, and cleans up.
#
# +create_project+, +run_daemon+, +summarize+, +call_cli+ and +cleanup+ are
# defined outside this method; their exact effects are not visible here.
def perform
  # Input data
  ref_db = reference_db
  # NOTE: these flags use String keys ("run_..."), unlike the Symbol keys
  # added right after; kept as-is because the consumer is not visible here.
  p_metadata =
    %w[project_stats haai_distances aai_distances ani_distances clade_finding]
    .map { |i| ["run_#{i}", false] }.to_h
  p_metadata[:ref_project] = ref_db.path
  p_metadata[:tax_pvalue] = cli[:pvalue]
  # The returned project is not needed here, so it is not bound to a local
  # (the former unused local `p` also shadowed Kernel#p).
  create_project(:assembly, p_metadata,
    run_ssu: false, run_mytaxa_scan: false, run_distances: false)
  # Run
  run_daemon
  summarize(%w[cds assembly essential_genes]) if cli[:summaries]
  summarize(['taxonomy']) # always produced, regardless of --no-summaries
  cli.say 'Summary: classification'
  call_cli([
    'ls', '-P', cli[:outdir], '-m', 'tax', '--tab',
    '-o', File.expand_path('classification.tsv', cli[:outdir])
  ])
  cleanup
end
|
71
|
+
|
72
|
+
private
|
73
|
+
|
74
|
+
##
# Locates and loads the reference database as a MiGA::Project.
#
# Resolution order:
# 1. +cli[:db_path]+, if given, is loaded directly.
# 2. Otherwise, if +cli[:download]+ is set, "get_db" is called first.
# 3. The database name is +cli[:database]+ or, if unset, the first entry of
#    the local manifest (+_local_manif.json+ in +cli[:local]+); the project
#    is then expected under +cli[:local]+ with that name.
#
# Returns the loaded project. Raises RuntimeError if no database is listed
# locally or if the resolved path cannot be loaded.
def reference_db
  cli.say 'Locating reference database'
  ref_db_path = cli[:db_path]
  if ref_db_path.nil?
    if cli[:download]
      get_db_call = ['get_db', '-l', cli[:local]]
      # --database is stored as a Symbol; hand it over as a String, the form
      # every argument has on a real command line.
      get_db_call += ['-n', cli[:database].to_s] unless cli[:database].nil?
      call_cli(get_db_call)
    end
    if cli[:database].nil?
      none_listed = 'No locally listed databases, call "miga get_db" first'
      lm_f = File.expand_path('_local_manif.json', cli[:local])
      raise none_listed unless File.size? lm_f
      listed = MiGA::Json.parse(lm_f)[:databases]
      # A manifest without entries would otherwise yield a nil name, which
      # resolves to the local directory itself and fails with a misleading
      # "Cannot locate reference database" error.
      raise none_listed if listed.nil? || listed.empty?
      cli[:database] = listed.keys.first
    end
    ref_db_path = File.expand_path(cli[:database].to_s, cli[:local])
  end
  ref_db = MiGA::Project.load(ref_db_path)
  raise "Cannot locate reference database: #{ref_db_path}" if ref_db.nil?
  cli.say "Reference database: #{ref_db.name}"
  ref_db
end
|
97
|
+
end
|
@@ -18,7 +18,7 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
|
|
18
18
|
run: 'Start the application and stay on top.',
|
19
19
|
zap: 'Set the application to a stopped state.',
|
20
20
|
status: 'Show status (PID) of application instances.'
|
21
|
-
}.each{ |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
|
21
|
+
}.each { |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
|
22
22
|
opt.separator ''
|
23
23
|
|
24
24
|
opt.separator 'MiGA options:'
|
@@ -27,45 +27,49 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
|
|
27
27
|
'--shutdown-when-done',
|
28
28
|
'Exit the daemon when all processing is done',
|
29
29
|
'Otherwise, it will stay idle awaiting for new data (default)'
|
30
|
-
|
30
|
+
) { |v| cli[:shutdown_when_done] = v }
|
31
31
|
opt.on(
|
32
32
|
'--latency INT',
|
33
33
|
'Number of seconds the daemon will be sleeping'
|
34
|
-
|
34
|
+
) { |v| cli[:latency] = v.to_i }
|
35
35
|
opt.on(
|
36
36
|
'--max-jobs INT',
|
37
37
|
'Maximum number of jobs to use simultaneously'
|
38
|
-
|
38
|
+
) { |v| cli[:maxjobs] = v.to_i }
|
39
39
|
opt.on(
|
40
40
|
'--ppn INT',
|
41
41
|
'Maximum number of cores to use in a single job'
|
42
|
-
|
42
|
+
) { |v| cli[:ppn] = v.to_i }
|
43
|
+
opt.on(
|
44
|
+
'--json PATH',
|
45
|
+
'Path to a custom daemon definition in json format'
|
46
|
+
) { |v| cli[:json] = v }
|
43
47
|
cli.opt_common(opt)
|
44
48
|
|
45
49
|
opt.separator 'Daemon options:'
|
46
50
|
opt.on(
|
47
51
|
'-t', '--ontop',
|
48
52
|
'Stay on top (does not daemonize)'
|
49
|
-
|
53
|
+
) { cli[:daemon_opts] << '-t' }
|
50
54
|
opt.on(
|
51
55
|
'-f', '--force',
|
52
56
|
'Force operation'
|
53
|
-
|
57
|
+
) { cli[:daemon_opts] << '-f' }
|
54
58
|
opt.on(
|
55
59
|
'-n', '--no_wait',
|
56
60
|
'Do not wait for processes to stop'
|
57
|
-
|
61
|
+
) { cli[:daemon_opts] << '-n' }
|
58
62
|
opt.on(
|
59
63
|
'--shush',
|
60
64
|
'Silence the daemon'
|
61
|
-
|
65
|
+
) { cli[:daemon_opts] << '--shush' }
|
62
66
|
opt.separator ''
|
63
67
|
end
|
64
68
|
end
|
65
69
|
|
66
70
|
def perform
|
67
71
|
p = cli.load_project
|
68
|
-
d = MiGA::Daemon.new(p)
|
72
|
+
d = MiGA::Daemon.new(p, cli[:json])
|
69
73
|
[:latency, :maxjobs, :ppn, :shutdown_when_done].each do |k|
|
70
74
|
d.runopts(k, cli[k]) unless cli[k].nil?
|
71
75
|
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
|
7
|
+
require 'miga/cli/action/wf'
|
8
|
+
include MiGA::Cli::Action::Wf
|
9
|
+
|
10
|
+
##
# Registers the defaults and command-line options of the dereplication
# workflow on +cli+. Shared workflow options are delegated to
# +default_opts_for_wf+, +opts_for_wf_distances+ and +opts_for_wf+, which are
# defined outside this method.
def parse_cli
  default_opts_for_wf
  workflow_defaults = {
    metric: :ani, threshold: 95.0, criterion: :quality,
    summaries: true, collection: true
  }
  cli.defaults = workflow_defaults
  cli.parse do |parser|
    parser.on(
      '--aai',
      'Use Average Amino Acid Identity (AAI) as genome similarity metric',
      'By default: Use Average Nucleotide Identity (ANI)'
    ) do
      cli[:metric] = :aai
    end
    parser.on(
      '--threshold FLOAT', Float,
      "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
    ) do |percent|
      cli[:threshold] = percent
    end
    parser.on(
      '--medoids',
      'Use medoids as clade representatives',
      'By default: Use genome with the highest quality'
    ) do |_flag|
      cli[:criterion] = :medoids
    end
    # OptionParser yields +false+ for the two "--no-" switches below
    parser.on(
      '--no-collection',
      'Do not generate a dereplicated collection of assemblies'
    ) do |flag|
      cli[:collection] = flag
    end
    parser.on(
      '--no-summaries',
      'Do not generate intermediate step summaries'
    ) do |flag|
      cli[:summaries] = flag
    end
    opts_for_wf_distances(parser)
    opts_for_wf(parser, 'Input genome assemblies (nucleotides, FastA)')
  end
end
|
43
|
+
|
44
|
+
##
# Runs the dereplication workflow: validates the identity threshold, creates
# the project, runs the daemon, extracts genomospecies and representatives,
# writes the optional summaries and cleans up.
#
# Raises RuntimeError if +cli[:threshold]+ is outside [0, 100].
#
# +create_project+, +run_daemon+, +summarize+ and +cleanup+ are defined
# outside this method; their exact effects are not visible here.
def perform
  # Validate first, so an invalid threshold is rejected before
  # create_project is called at all.
  unless (0.0..100.0).cover?(cli[:threshold])
    raise 'The threshold of identity must be in the range [0,100]'
  end
  # Input data
  project = create_project(
    :assembly,
    {
      run_project_stats: false, run_clades: false,
      # The threshold is stored under a metric-specific key
      # (:gsp_ani or :gsp_aai).
      gsp_metric: cli[:metric], :"gsp_#{cli[:metric]}" => cli[:threshold]
    },
    { run_mytaxa_scan: false, run_ssu: false }
  )
  # Run
  run_daemon
  dereplicate(project)
  summarize(%w[cds assembly essential_genes]) if cli[:summaries]
  cleanup
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
##
# Writes +genomospecies.tsv+ in +cli[:outdir]+ with one row per line of the
# +:clades_gsp+ file of project +p+: clade label, representative and
# comma-joined members. Unless +cli[:collection]+ is falsy, it also copies
# the +:largecontigs+ assembly file of every representative into
# +representatives/+ under +cli[:outdir]+.
#
# Raises RuntimeError if the +:clade_finding+ result or its +:clades_gsp+
# file is unavailable.
def dereplicate(p)
  cli.say "Extracting genomospecies clades"
  clade_res = p.result(:clade_finding)
  raise "Result unavailable: run failed" unless clade_res
  clades_file = clade_res.file_path(:clades_gsp)
  raise 'Result incomplete: run failed' unless clades_file
  members = File.readlines(clades_file).map { |ln| ln.chomp.split("\t") }
  # Representatives are matched to clades by position
  rep = representatives(p)
  table = File.expand_path('genomospecies.tsv', cli[:outdir])
  File.open(table, 'w') do |fh|
    fh.puts "Clade\tRepresentative\tMembers"
    members.each.with_index(1) do |clade, n|
      fh.puts ["gsp_#{n}", rep[n - 1], clade.join(',')].join("\t")
    end
  end
  return unless cli[:collection]
  dest = File.expand_path('representatives', cli[:outdir])
  FileUtils.mkdir_p(dest)
  rep.each do |name|
    contigs = p.dataset(name).result(:assembly).file_path(:largecontigs)
    FileUtils.cp(contigs, dest)
  end
end
|
83
|
+
|
84
|
+
##
# Writes +representatives.txt+ in +cli[:outdir]+ and returns its lines
# (without line terminators) as an Array of String.
#
# With +cli[:criterion] == :medoids+ the file is a copy of the
# +:medoids_gsp+ file of the +:clade_finding+ result of project +p+.
# Otherwise it holds the second tab-separated column of the output of
# +utils/representatives.rb+ (under +MiGA::MiGA.root_path+) run on +p.path+.
def representatives(p)
  cli.say "Identifying representatives"
  f = File.expand_path('representatives.txt', cli[:outdir])
  if cli[:criterion] == :medoids
    FileUtils.cp(p.result(:clade_finding).file_path(:medoids_gsp), f)
  else
    src = File.expand_path('utils/representatives.rb', MiGA::MiGA.root_path)
    # Argument-array form: no shell is involved, so paths containing quotes
    # or other shell metacharacters are passed through verbatim.
    listing = IO.popen(['ruby', src, p.path], &:read)
    File.open(f, 'w') do |fh|
      listing.each_line do |ln|
        line = ln.chomp
        cols = line.split("\t", -1)
        # Same selection as `cut -f 2`: second field, or the whole line when
        # it contains no tab
        fh.puts(cols.size > 1 ? cols[1] : line)
      end
    end
  end
  File.readlines(f).map(&:chomp)
end
|
95
|
+
end
|
@@ -7,34 +7,32 @@ require 'sqlite3'
|
|
7
7
|
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
8
8
|
|
9
9
|
def parse_cli
|
10
|
-
@@OPERATIONS.keys.each { |i| cli.defaults = {i => true} }
|
10
|
+
@@OPERATIONS.keys.each { |i| cli.defaults = { i => true } }
|
11
11
|
cli.parse do |opt|
|
12
|
-
operation_n = Hash[@@OPERATIONS.map{ |k,v| [v[0], k] }]
|
12
|
+
operation_n = Hash[@@OPERATIONS.map { |k,v| [v[0], k] }]
|
13
13
|
cli.opt_object(opt, [:project])
|
14
14
|
opt.on(
|
15
15
|
'--ignore TASK1,TASK2', Array,
|
16
16
|
'Do not perform the task(s) listed. Available tasks are:',
|
17
|
-
* @@OPERATIONS.values.map{ |v| "~ #{v[0]}: #{v[1]}" }
|
18
|
-
|
17
|
+
* @@OPERATIONS.values.map { |v| "~ #{v[0]}: #{v[1]}" }
|
18
|
+
) { |v| v.map { |i| cli[operation_n[i]] = false } }
|
19
19
|
opt.on(
|
20
20
|
'--only TASK',
|
21
21
|
'Perform only the specified task (see --ignore)'
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
22
|
+
) do |v|
|
23
|
+
op_k = @@OPERATIONS.find { |_, i| i[0] == v.downcase }.first
|
24
|
+
@@OPERATIONS.keys.each { |i| cli[i] = false }
|
25
|
+
cli[op_k] = true
|
26
|
+
end
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
30
|
def check_sqlite3_database(db_file, metric)
|
31
|
-
|
32
|
-
|
33
|
-
conn.execute("select count(*) from #{metric}").first
|
34
|
-
end
|
35
|
-
rescue SQLite3::SQLException
|
36
|
-
yield
|
31
|
+
SQLite3::Database.new(db_file) do |conn|
|
32
|
+
conn.execute("select count(*) from #{metric}").first
|
37
33
|
end
|
34
|
+
rescue SQLite3::SQLException
|
35
|
+
yield
|
38
36
|
end
|
39
37
|
|
40
38
|
def perform
|
@@ -48,6 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
48
46
|
db: ['databases', 'Check database files integrity'],
|
49
47
|
dist: ['distances', 'Check distance summary tables'],
|
50
48
|
files: ['files', 'Check for outdated files'],
|
49
|
+
cds: ['cds', 'Check for gzipped genes and proteins'],
|
51
50
|
ess: ['essential-genes', 'Check for unarchived essential genes'],
|
52
51
|
mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
|
53
52
|
start: ['start', 'Check for lingering .start files'],
|
@@ -84,41 +83,9 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
84
83
|
res = p.result("#{dist}_distances")
|
85
84
|
next if res.nil?
|
86
85
|
cli.say "Checking #{dist} table for consistent datasets"
|
87
|
-
notok =
|
88
|
-
|
89
|
-
|
90
|
-
lineno = 0
|
91
|
-
fh.each_line do |ln|
|
92
|
-
next if (lineno+=1)==1
|
93
|
-
r = ln.split("\t")
|
94
|
-
if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
|
95
|
-
[1,2].each do |i|
|
96
|
-
if p.dataset(r[i]).nil?
|
97
|
-
notok[r[i]] = true
|
98
|
-
else
|
99
|
-
fix[r[i]] = true
|
100
|
-
end
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
cli.say("- Fixing #{fix.size} datasets") unless fix.empty?
|
107
|
-
fix.keys.each do |d_n|
|
108
|
-
cli.say " > Fixing #{d_n}."
|
109
|
-
p.dataset(d_n).cleanup_distances!
|
110
|
-
end
|
111
|
-
|
112
|
-
unless notok.empty?
|
113
|
-
cli.say '- Unregistered datasets detected: '
|
114
|
-
if notok.size <= 5
|
115
|
-
notok.keys.each { |i| cli.say " > #{i}" }
|
116
|
-
else
|
117
|
-
cli.say " > #{notok.size}, including #{notok.keys.first}"
|
118
|
-
end
|
119
|
-
cli.say '- Removing tables, recompute'
|
120
|
-
res.remove!
|
121
|
-
end
|
86
|
+
notok, fix = check_dist_eval(cli, p, res)
|
87
|
+
check_dist_fix(cli, p, fix)
|
88
|
+
check_dist_recompute(cli, res, notok)
|
122
89
|
end
|
123
90
|
end
|
124
91
|
|
@@ -142,6 +109,24 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
142
109
|
end
|
143
110
|
end
|
144
111
|
|
112
|
+
##
# Gzips the +:cds+ result files (genes, proteins, gff3, gff2, tab) of every
# dataset in the project loaded by +cli+ that are not yet gzipped, and
# re-registers the +:cds+ result of each dataset that was changed.
def check_cds(cli)
  cli.say 'Looking for unzipped genes or proteins'
  cli.load_project.each_dataset do |d|
    res = d.result(:cds)
    next unless res
    changed = false
    [:genes, :proteins, :gff3, :gff2, :tab].each do |f|
      file = res.file_path(f)
      next unless file
      # Anchored: only the file extension counts, not a ".gz" elsewhere in
      # the path (e.g. in a directory name)
      next if file =~ /\.gz\z/
      cli.say " > Gzipping #{d.name} #{f}"
      # Argument-array form: no shell is involved, so any path is safe
      cmdo = IO.popen(['gzip', '-9', file], &:read).chomp
      warn(cmdo) unless cmdo.empty?
      changed = true
    end
    d.add_result(:cds, true, force: true) if changed
  end
end
|
129
|
+
|
145
130
|
def check_ess(cli)
|
146
131
|
cli.say 'Looking for unarchived essential genes'
|
147
132
|
cli.load_project.each_dataset do |d|
|
@@ -153,11 +138,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
153
138
|
res.remove!
|
154
139
|
next
|
155
140
|
end
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
end
|
141
|
+
next if Dir["#{dir}/*.faa"].empty?
|
142
|
+
cli.say " > Fixing #{d.name}"
|
143
|
+
cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
|
144
|
+
warn(cmdo) unless cmdo.empty?
|
161
145
|
end
|
162
146
|
end
|
163
147
|
|
@@ -207,4 +191,48 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
207
191
|
#cli.say 'o Checking for taxonomy/distances consistency'
|
208
192
|
# TODO: Find 95%ANI clusters with entries from different species
|
209
193
|
end
|
194
|
+
|
195
|
+
private
|
196
|
+
|
197
|
+
##
# Scans the gzipped +:matrix+ file of +res+ (first line skipped as header)
# and inspects the dataset names in columns 2 and 3 of every row that
# references at least one dataset unknown to project +p+.
#
# Returns +[notok, fix]+, two Hashes keyed by dataset name (values +true+):
# +notok+ holds the names not found in +p+, +fix+ the registered names that
# share a row with one of them. +cli+ is accepted but not used here.
def check_dist_eval(cli, p, res)
  notok = {}
  fix = {}
  Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
    fh.gets # discard the header line
    fh.each_line do |ln|
      pair = ln.split("\t").values_at(1, 2)
      next unless pair.map { |name| p.dataset(name).nil? }.any?
      pair.each do |name|
        bucket = p.dataset(name).nil? ? notok : fix
        bucket[name] = true
      end
    end
  end
  [notok, fix]
end
|
217
|
+
|
218
|
+
##
# Calls +cleanup_distances!+ on every dataset of project +p+ named by the
# keys of +fix+, reporting progress through +cli+. Does nothing when +fix+
# is empty.
def check_dist_fix(cli, p, fix)
  unless fix.empty?
    cli.say("- Fixing #{fix.size} datasets")
    names = fix.keys
    names.each do |name|
      cli.say " > Fixing #{name}."
      p.dataset(name).cleanup_distances!
    end
  end
end
|
226
|
+
|
227
|
+
##
# Reports the unregistered datasets named by the keys of +notok+ through
# +cli+ (all of them when there are at most five, otherwise the count and
# the first one) and removes the distances result +res+ so that it gets
# recomputed. Does nothing when +notok+ is empty.
#
# The second parameter was formerly named +p+ while the body used +res+,
# which raised NameError whenever +notok+ was not empty; it receives the
# result object positionally.
def check_dist_recompute(cli, res, notok)
  return if notok.empty?
  cli.say '- Unregistered datasets detected: '
  if notok.size <= 5
    notok.keys.each { |i| cli.say " > #{i}" }
  else
    cli.say " > #{notok.size}, including #{notok.keys.first}"
  end
  cli.say '- Removing tables, recompute'
  res.remove!
end
|
210
238
|
end
|