miga-base 0.4.3.0 → 0.5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,97 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
# CLI workflow action that takes input genome assemblies, processes them in a
# new project linked to a reference database, and writes a taxonomy summary
# plus a +classification.tsv+ table in the output directory.
class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  # Registers the defaults and command-line options specific to this
  # workflow, followed by the options shared by all workflows.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      download: false, summaries: true, pvalue: 0.05,
      local: File.expand_path('.miga_db', ENV['MIGA_HOME'])
    }
    cli.parse do |opt|
      opt.on(
        '--download-db',
        'Attempt to download the reference database (all default options)',
        'It is recommended to use "miga get_db" separately instead'
      ) { |v| cli[:download] = v }
      opt.on(
        '-n', '--database STRING',
        'Name of the reference database to use',
        'By default, the first locally listed database is used'
      ) { |v| cli[:database] = v.to_sym }
      opt.on(
        '-p', '--p-value FLOAT', Float,
        'Maximum p-value to transfer taxonomy',
        "By default: #{cli[:pvalue]}"
      ) { |v| cli[:pvalue] = v }
      opt.on(
        '-l', '--local-dir PATH',
        "Local directory to store the database. By default: #{cli[:local]}"
      ) { |v| cli[:local] = v }
      opt.on(
        '--db-path STRING',
        'Path to the reference database to use, a fully indexed MiGA project',
        'If defined, --local-dir and --database are ignored'
      ) { |v| cli[:db_path] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  # Runs the workflow: locates the reference database, creates the project
  # with the project-level steps listed below disabled, runs the daemon,
  # and writes the summaries and the classification table.
  def perform
    # Input data
    ref_db = reference_db
    p_metadata = Hash[
      %w[project_stats haai_distances aai_distances ani_distances clade_finding]
        .map { |i| ["run_#{i}", false] }
    ]
    p_metadata[:ref_project] = ref_db.path
    p_metadata[:tax_pvalue] = cli[:pvalue]
    create_project(:assembly, p_metadata,
      run_ssu: false, run_mytaxa_scan: false, run_distances: false)
    # Run
    run_daemon
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    summarize(['taxonomy'])
    cli.say "Summary: classification"
    call_cli([
      'ls', '-P', cli[:outdir], '-m', 'tax', '--tab',
      '-o', File.expand_path('classification.tsv', cli[:outdir])
    ])
    cleanup
  end

  private

  # Locates and loads the reference database project.
  #
  # If --db-path was given it is used directly. Otherwise the database is
  # optionally downloaded with "get_db" and resolved by name inside the
  # local directory, defaulting to the first locally listed database.
  #
  # Returns the loaded MiGA::Project. Raises a RuntimeError if no database
  # can be located.
  def reference_db
    cli.say "Locating reference database"
    ref_db_path = cli[:db_path]
    if ref_db_path.nil?
      if cli[:download]
        get_db_call = ['get_db', '-l', cli[:local]]
        get_db_call += ['-n', cli[:database]] unless cli[:database].nil?
        call_cli(get_db_call)
      end
      cli[:database] = first_local_database if cli[:database].nil?
      ref_db_path = File.expand_path(cli[:database].to_s, cli[:local])
    end
    ref_db = MiGA::Project.load(ref_db_path)
    raise "Cannot locate reference database: #{ref_db_path}" if ref_db.nil?
    cli.say "Reference database: #{ref_db.name}"
    ref_db
  end

  # Returns the key of the first database listed in the local manifest
  # (+_local_manif.json+ in the local directory). Raises a RuntimeError if
  # the manifest is missing, empty, or lists no databases, so the user gets
  # an actionable message instead of a failure further down.
  def first_local_database
    no_db = 'No locally listed databases, call "miga get_db" first'
    lm_f = File.expand_path('_local_manif.json', cli[:local])
    raise no_db unless File.size? lm_f
    dbs = MiGA::Json.parse(lm_f)[:databases]
    raise no_db if dbs.nil? || dbs.empty?
    dbs.keys.first
  end
end
|
@@ -18,7 +18,7 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
|
|
18
18
|
run: 'Start the application and stay on top.',
|
19
19
|
zap: 'Set the application to a stopped state.',
|
20
20
|
status: 'Show status (PID) of application instances.'
|
21
|
-
}.each{ |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
|
21
|
+
}.each { |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
|
22
22
|
opt.separator ''
|
23
23
|
|
24
24
|
opt.separator 'MiGA options:'
|
@@ -27,45 +27,49 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
|
|
27
27
|
'--shutdown-when-done',
|
28
28
|
'Exit the daemon when all processing is done',
|
29
29
|
'Otherwise, it will stay idle awaiting for new data (default)'
|
30
|
-
|
30
|
+
) { |v| cli[:shutdown_when_done] = v }
|
31
31
|
opt.on(
|
32
32
|
'--latency INT',
|
33
33
|
'Number of seconds the daemon will be sleeping'
|
34
|
-
|
34
|
+
) { |v| cli[:latency] = v.to_i }
|
35
35
|
opt.on(
|
36
36
|
'--max-jobs INT',
|
37
37
|
'Maximum number of jobs to use simultaneously'
|
38
|
-
|
38
|
+
) { |v| cli[:maxjobs] = v.to_i }
|
39
39
|
opt.on(
|
40
40
|
'--ppn INT',
|
41
41
|
'Maximum number of cores to use in a single job'
|
42
|
-
|
42
|
+
) { |v| cli[:ppn] = v.to_i }
|
43
|
+
opt.on(
|
44
|
+
'--json PATH',
|
45
|
+
'Path to a custom daemon definition in json format'
|
46
|
+
) { |v| cli[:json] = v }
|
43
47
|
cli.opt_common(opt)
|
44
48
|
|
45
49
|
opt.separator 'Daemon options:'
|
46
50
|
opt.on(
|
47
51
|
'-t', '--ontop',
|
48
52
|
'Stay on top (does not daemonize)'
|
49
|
-
|
53
|
+
) { cli[:daemon_opts] << '-t' }
|
50
54
|
opt.on(
|
51
55
|
'-f', '--force',
|
52
56
|
'Force operation'
|
53
|
-
|
57
|
+
) { cli[:daemon_opts] << '-f' }
|
54
58
|
opt.on(
|
55
59
|
'-n', '--no_wait',
|
56
60
|
'Do not wait for processes to stop'
|
57
|
-
|
61
|
+
) { cli[:daemon_opts] << '-n' }
|
58
62
|
opt.on(
|
59
63
|
'--shush',
|
60
64
|
'Silence the daemon'
|
61
|
-
|
65
|
+
) { cli[:daemon_opts] << '--shush' }
|
62
66
|
opt.separator ''
|
63
67
|
end
|
64
68
|
end
|
65
69
|
|
66
70
|
def perform
|
67
71
|
p = cli.load_project
|
68
|
-
d = MiGA::Daemon.new(p)
|
72
|
+
d = MiGA::Daemon.new(p, cli[:json])
|
69
73
|
[:latency, :maxjobs, :ppn, :shutdown_when_done].each do |k|
|
70
74
|
d.runopts(k, cli[k]) unless cli[k].nil?
|
71
75
|
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
# CLI workflow action that dereplicates input genome assemblies: it groups
# them into genomospecies clades by ANI (default) or AAI, picks one
# representative per clade, and optionally collects the representative
# assemblies in the output directory.
class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  # Registers the defaults and command-line options specific to this
  # workflow, followed by the distance and general workflow options.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      metric: :ani, threshold: 95.0, criterion: :quality,
      summaries: true, collection: true
    }
    cli.parse do |opt|
      opt.on(
        '--aai',
        'Use Average Amino Acid Identity (AAI) as genome similarity metric',
        'By default: Use Average Nucleotide Identity (ANI)'
      ) { cli[:metric] = :aai }
      opt.on(
        '--threshold FLOAT', Float,
        "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
      ) { |v| cli[:threshold] = v }
      opt.on(
        '--medoids',
        'Use medoids as clade representatives',
        'By default: Use genome with the highest quality'
      ) { cli[:criterion] = :medoids }
      opt.on(
        '--no-collection',
        'Do not generate a dereplicated collection of assemblies'
      ) { |v| cli[:collection] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf_distances(opt)
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  # Runs the workflow. Raises a RuntimeError if the threshold is outside
  # [0,100]; this is checked before anything else so that no project is
  # created with an invalid threshold.
  def perform
    unless cli[:threshold].between?(0.0, 100.0)
      raise "The threshold of identity must be in the range [0,100]"
    end
    # Input data
    p = create_project(:assembly,
      { run_project_stats: false, run_clades: false,
        gsp_metric: cli[:metric], :"gsp_#{cli[:metric]}" => cli[:threshold] },
      { run_mytaxa_scan: false, run_ssu: false })
    # Run
    run_daemon
    dereplicate(p)
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    cleanup
  end

  private

  # Writes +genomospecies.tsv+ in the output directory with one row per
  # clade (clade name, representative, comma-separated members). Unless
  # --no-collection was given, also copies the large-contigs assembly of
  # each representative into +representatives/+.
  #
  # Raises a RuntimeError if the clade-finding result or its genomospecies
  # clades file is unavailable.
  def dereplicate(p)
    cli.say "Extracting genomospecies clades"
    r = p.result(:clade_finding) or raise "Result unavailable: run failed"
    c_f = r.file_path(:clades_gsp) or raise 'Result incomplete: run failed'
    clades = File.readlines(c_f).map { |i| i.chomp.split("\t") }
    rep = representatives(p)
    File.open(File.expand_path('genomospecies.tsv', cli[:outdir]), 'w') do |fh|
      fh.puts "Clade\tRepresentative\tMembers"
      # Clades and representatives are paired by their line position
      clades.each_with_index do |i, k|
        fh.puts ["gsp_#{k+1}", rep[k], i.join(',')].join("\t")
      end
    end
    if cli[:collection]
      dir = File.expand_path('representatives', cli[:outdir])
      FileUtils.mkdir_p(dir)
      rep.each do |i|
        f = p.dataset(i).result(:assembly).file_path(:largecontigs)
        FileUtils.cp(f, dir)
      end
    end
  end

  # Writes +representatives.txt+ in the output directory and returns its
  # lines as an Array of Strings. With --medoids the clade-finding medoids
  # file is copied; otherwise the second column of the output of
  # +utils/representatives.rb+ is used.
  def representatives(p)
    cli.say "Identifying representatives"
    f = File.expand_path('representatives.txt', cli[:outdir])
    if cli[:criterion] == :medoids
      FileUtils.cp(p.result(:clade_finding).file_path(:medoids_gsp), f)
    else
      src = File.expand_path('utils/representatives.rb', MiGA::MiGA.root_path)
      `ruby '#{src}' '#{p.path}' | cut -f 2 > '#{f}'`
    end
    File.readlines(f).map(&:chomp)
  end
end
|
@@ -7,34 +7,32 @@ require 'sqlite3'
|
|
7
7
|
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
8
8
|
|
9
9
|
# Registers the doctor options. Every operation in @@OPERATIONS is enabled
# by default; --ignore disables the listed tasks and --only disables all
# but the named one.
#
# Raises a RuntimeError if --only names a task that is not in @@OPERATIONS.
def parse_cli
  @@OPERATIONS.keys.each { |i| cli.defaults = { i => true } }
  cli.parse do |opt|
    # Maps the user-facing task name (first element of each entry) to its key
    operation_n = Hash[@@OPERATIONS.map { |k, v| [v[0], k] }]
    cli.opt_object(opt, [:project])
    opt.on(
      '--ignore TASK1,TASK2', Array,
      'Do not perform the task(s) listed. Available tasks are:',
      *@@OPERATIONS.values.map { |v| "~ #{v[0]}: #{v[1]}" }
    ) { |v| v.map { |i| cli[operation_n[i]] = false } }
    opt.on(
      '--only TASK',
      'Perform only the specified task (see --ignore)'
    ) do |v|
      op = @@OPERATIONS.find { |_, i| i[0] == v.downcase }
      # Fail with a readable message instead of a NoMethodError on nil
      raise "Unknown task: #{v}" if op.nil?
      @@OPERATIONS.keys.each { |i| cli[i] = false }
      cli[op.first] = true
    end
  end
end
|
29
29
|
|
30
30
|
# Opens the SQLite3 database +db_file+ and counts the rows of the table
# named +metric+. If that raises SQLite3::SQLException, control is handed
# to the block given by the caller.
def check_sqlite3_database(db_file, metric)
  begin
    SQLite3::Database.new(db_file) do |db|
      query = "select count(*) from #{metric}"
      db.execute(query).first
    end
  rescue SQLite3::SQLException
    yield
  end
end
|
39
37
|
|
40
38
|
def perform
|
@@ -48,6 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
48
46
|
db: ['databases', 'Check database files integrity'],
|
49
47
|
dist: ['distances', 'Check distance summary tables'],
|
50
48
|
files: ['files', 'Check for outdated files'],
|
49
|
+
cds: ['cds', 'Check for gzipped genes and proteins'],
|
51
50
|
ess: ['essential-genes', 'Check for unarchived essential genes'],
|
52
51
|
mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
|
53
52
|
start: ['start', 'Check for lingering .start files'],
|
@@ -84,41 +83,9 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
84
83
|
res = p.result("#{dist}_distances")
|
85
84
|
next if res.nil?
|
86
85
|
cli.say "Checking #{dist} table for consistent datasets"
|
87
|
-
notok =
|
88
|
-
|
89
|
-
|
90
|
-
lineno = 0
|
91
|
-
fh.each_line do |ln|
|
92
|
-
next if (lineno+=1)==1
|
93
|
-
r = ln.split("\t")
|
94
|
-
if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
|
95
|
-
[1,2].each do |i|
|
96
|
-
if p.dataset(r[i]).nil?
|
97
|
-
notok[r[i]] = true
|
98
|
-
else
|
99
|
-
fix[r[i]] = true
|
100
|
-
end
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
cli.say("- Fixing #{fix.size} datasets") unless fix.empty?
|
107
|
-
fix.keys.each do |d_n|
|
108
|
-
cli.say " > Fixing #{d_n}."
|
109
|
-
p.dataset(d_n).cleanup_distances!
|
110
|
-
end
|
111
|
-
|
112
|
-
unless notok.empty?
|
113
|
-
cli.say '- Unregistered datasets detected: '
|
114
|
-
if notok.size <= 5
|
115
|
-
notok.keys.each { |i| cli.say " > #{i}" }
|
116
|
-
else
|
117
|
-
cli.say " > #{notok.size}, including #{notok.keys.first}"
|
118
|
-
end
|
119
|
-
cli.say '- Removing tables, recompute'
|
120
|
-
res.remove!
|
121
|
-
end
|
86
|
+
notok, fix = check_dist_eval(cli, p, res)
|
87
|
+
check_dist_fix(cli, p, fix)
|
88
|
+
check_dist_recompute(cli, res, notok)
|
122
89
|
end
|
123
90
|
end
|
124
91
|
|
@@ -142,6 +109,24 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
142
109
|
end
|
143
110
|
end
|
144
111
|
|
112
|
+
# Gzips any genes, proteins, GFF3, GFF2, or tab file of the CDS result of
# each dataset in the project that is not yet compressed, and re-registers
# the CDS result of every dataset where a file was compressed.
#
# +cli+ is the CLI object, used for messages and to load the project.
def check_cds(cli)
  cli.say 'Looking for unzipped genes or proteins'
  cli.load_project.each_dataset do |d|
    res = d.result(:cds) or next
    changed = false
    [:genes, :proteins, :gff3, :gff2, :tab].each do |f|
      file = res.file_path(f) or next
      # Only the file extension counts: a ".gz" elsewhere in the path (e.g.,
      # in a directory name) must not be taken as an already-gzipped file
      next if file.end_with?('.gz')
      cli.say " > Gzipping #{d.name} #{f}"
      cmdo = `gzip -9 '#{file}'`.chomp
      warn(cmdo) unless cmdo.empty?
      changed = true
    end
    d.add_result(:cds, true, force: true) if changed
  end
end
|
129
|
+
|
145
130
|
def check_ess(cli)
|
146
131
|
cli.say 'Looking for unarchived essential genes'
|
147
132
|
cli.load_project.each_dataset do |d|
|
@@ -153,11 +138,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
153
138
|
res.remove!
|
154
139
|
next
|
155
140
|
end
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
end
|
141
|
+
next if Dir["#{dir}/*.faa"].empty?
|
142
|
+
cli.say " > Fixing #{d.name}"
|
143
|
+
cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
|
144
|
+
warn(cmdo) unless cmdo.empty?
|
161
145
|
end
|
162
146
|
end
|
163
147
|
|
@@ -207,4 +191,48 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
207
191
|
#cli.say 'o Checking for taxonomy/distances consistency'
|
208
192
|
# TODO: Find 95%ANI clusters with entries from different species
|
209
193
|
end
|
194
|
+
|
195
|
+
private
|
196
|
+
|
197
|
+
# Scans the gzipped matrix file of the distances result +res+ and inspects
# the dataset names in the second and third tab-delimited columns of every
# line but the first.
#
# For each line where at least one of the two names is not a dataset of
# project +p+, the missing names are recorded in +notok+ and the name that
# does exist (if any) in +fix+. Lines where both datasets exist are ignored.
#
# +cli+ is accepted for symmetry with the other check_dist_* helpers but is
# not used here.
#
# Returns an Array with the two Hashes +[notok, fix]+, keyed by dataset name.
def check_dist_eval(cli, p, res)
  notok = {}
  fix = {}
  Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
    fh.each_with_index do |ln, idx|
      next if idx.zero? # skip the first line (presumably the header row)
      names = ln.split("\t").values_at(1, 2)
      # Look up each dataset only once per row
      missing, present = names.partition { |name| p.dataset(name).nil? }
      next if missing.empty?
      missing.each { |name| notok[name] = true }
      present.each { |name| fix[name] = true }
    end
  end
  [notok, fix]
end
|
217
|
+
|
218
|
+
# Calls +cleanup_distances!+ on every dataset of project +p+ named by a key
# of the Hash +fix+, reporting progress through +cli+. Does nothing when
# +fix+ is empty.
def check_dist_fix(cli, p, fix)
  unless fix.empty?
    cli.say("- Fixing #{fix.size} datasets")
    fix.keys.each do |name|
      cli.say " > Fixing #{name}."
      dataset = p.dataset(name)
      dataset.cleanup_distances!
    end
  end
end
|
226
|
+
|
227
|
+
# Reports the unregistered datasets named by the keys of the Hash +notok+
# through +cli+ and removes the distances result +res+ so that it gets
# recomputed. Up to five names are listed individually; beyond that only
# the count and the first name are shown. Does nothing when +notok+ is empty.
#
# The second parameter is the result object passed by the caller; it is
# named +res+ to match the call to +remove!+ below.
def check_dist_recompute(cli, res, notok)
  return if notok.empty?
  cli.say '- Unregistered datasets detected: '
  if notok.size <= 5
    notok.each_key { |i| cli.say " > #{i}" }
  else
    cli.say " > #{notok.size}, including #{notok.keys.first}"
  end
  cli.say '- Removing tables, recompute'
  res.remove!
end
|
210
238
|
end
|