miga-base 0.4.3.0 → 0.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,97 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
# CLI action for the classification workflow: locates a reference database,
# creates a project from the input assemblies pointing to that reference,
# runs the daemon, and writes the step summaries plus a classification table.
#
# The helpers +default_opts_for_wf+, +opts_for_wf+, +create_project+,
# +run_daemon+, +summarize+, +call_cli+ and +cleanup+ are not defined here;
# presumably they are provided by MiGA::Cli::Action::Wf (confirm there).
class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  # Sets the defaults of this workflow and registers its command-line
  # options. The default local directory is +.miga_db+ resolved against
  # the +MIGA_HOME+ environment variable.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      download: false, summaries: true, pvalue: 0.05,
      local: File.expand_path('.miga_db', ENV['MIGA_HOME'])
    }
    cli.parse do |opt|
      opt.on(
        '--download-db',
        'Attempt to download the reference database (all default options)',
        'It is recommended to use "miga get_db" separately instead'
      ) { |v| cli[:download] = v }
      opt.on(
        '-n', '--database STRING',
        'Name of the reference database to use',
        'By default, the first locally listed database is used'
      ) { |v| cli[:database] = v.to_sym }
      opt.on(
        '-p', '--p-value FLOAT', Float,
        'Maximum p-value to transfer taxonomy',
        "By default: #{cli[:pvalue]}"
      ) { |v| cli[:pvalue] = v }
      opt.on(
        '-l', '--local-dir PATH',
        "Local directory to store the database. By default: #{cli[:local]}"
      ) { |v| cli[:local] = v }
      opt.on(
        '--db-path STRING',
        'Path to the reference database to use, a fully indexed MiGA project',
        'If defined, --local-dir and --database are ignored'
      ) { |v| cli[:db_path] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  # Runs the workflow. The project metadata turns off the project-level
  # steps listed below and records the reference project path and the
  # p-value. The taxonomy summary and +classification.tsv+ are always
  # produced; the cds/assembly/essential_genes summaries only when
  # summaries are enabled.
  def perform
    # Input data
    ref_db = reference_db
    p_metadata =
      %w[project_stats haai_distances aai_distances ani_distances clade_finding]
        .map { |i| ["run_#{i}", false] }.to_h
    p_metadata[:ref_project] = ref_db.path
    p_metadata[:tax_pvalue] = cli[:pvalue]
    create_project(:assembly, p_metadata,
      run_ssu: false, run_mytaxa_scan: false, run_distances: false)
    # Run
    run_daemon
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    summarize(['taxonomy'])
    cli.say "Summary: classification"
    call_cli([
      'ls', '-P', cli[:outdir], '-m', 'tax', '--tab',
      '-o', File.expand_path('classification.tsv', cli[:outdir])
    ])
    cleanup
  end

  private

  # Resolves and loads the reference database as a MiGA::Project.
  #
  # If +--db-path+ was given it is used directly. Otherwise the database is
  # optionally downloaded through the +get_db+ action, and its name is taken
  # from +--database+ or, failing that, from the first entry listed in
  # +_local_manif.json+ under the local directory.
  #
  # Raises a RuntimeError if no database is listed locally or if the
  # resulting path cannot be loaded as a project.
  def reference_db
    cli.say "Locating reference database"
    ref_db_path = cli[:db_path]
    if ref_db_path.nil?
      if cli[:download]
        get_db_call = ['get_db', '-l', cli[:local]]
        get_db_call += ['-n', cli[:database]] unless cli[:database].nil?
        call_cli(get_db_call)
      end
      if cli[:database].nil?
        no_db = 'No locally listed databases, call "miga get_db" first'
        lm_f = File.expand_path('_local_manif.json', cli[:local])
        raise no_db unless File.size? lm_f
        # A manifest may exist without listing any database; fail with the
        # same actionable message instead of continuing with a nil name
        listed = MiGA::Json.parse(lm_f)[:databases] || {}
        raise no_db if listed.empty?
        cli[:database] = listed.keys.first
      end
      ref_db_path = File.expand_path(cli[:database].to_s, cli[:local])
    end
    ref_db = MiGA::Project.load(ref_db_path)
    raise "Cannot locate reference database: #{ref_db_path}" if ref_db.nil?
    cli.say "Reference database: #{ref_db.name}"
    ref_db
  end
end
@@ -18,7 +18,7 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
18
18
  run: 'Start the application and stay on top.',
19
19
  zap: 'Set the application to a stopped state.',
20
20
  status: 'Show status (PID) of application instances.'
21
- }.each{ |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
21
+ }.each { |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
22
22
  opt.separator ''
23
23
 
24
24
  opt.separator 'MiGA options:'
@@ -27,45 +27,49 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
27
27
  '--shutdown-when-done',
28
28
  'Exit the daemon when all processing is done',
29
29
  'Otherwise, it will stay idle awaiting for new data (default)'
30
- ){ |v| cli[:shutdown_when_done] = v }
30
+ ) { |v| cli[:shutdown_when_done] = v }
31
31
  opt.on(
32
32
  '--latency INT',
33
33
  'Number of seconds the daemon will be sleeping'
34
- ){ |v| cli[:latency]=v.to_i }
34
+ ) { |v| cli[:latency] = v.to_i }
35
35
  opt.on(
36
36
  '--max-jobs INT',
37
37
  'Maximum number of jobs to use simultaneously'
38
- ){ |v| cli[:maxjobs]=v.to_i }
38
+ ) { |v| cli[:maxjobs] = v.to_i }
39
39
  opt.on(
40
40
  '--ppn INT',
41
41
  'Maximum number of cores to use in a single job'
42
- ){ |v| cli[:ppn]=v.to_i }
42
+ ) { |v| cli[:ppn] = v.to_i }
43
+ opt.on(
44
+ '--json PATH',
45
+ 'Path to a custom daemon definition in json format'
46
+ ) { |v| cli[:json] = v }
43
47
  cli.opt_common(opt)
44
48
 
45
49
  opt.separator 'Daemon options:'
46
50
  opt.on(
47
51
  '-t', '--ontop',
48
52
  'Stay on top (does not daemonize)'
49
- ){ cli[:daemon_opts] << '-t' }
53
+ ) { cli[:daemon_opts] << '-t' }
50
54
  opt.on(
51
55
  '-f', '--force',
52
56
  'Force operation'
53
- ){ cli[:daemon_opts] << '-f' }
57
+ ) { cli[:daemon_opts] << '-f' }
54
58
  opt.on(
55
59
  '-n', '--no_wait',
56
60
  'Do not wait for processes to stop'
57
- ){ cli[:daemon_opts] << '-n' }
61
+ ) { cli[:daemon_opts] << '-n' }
58
62
  opt.on(
59
63
  '--shush',
60
64
  'Silence the daemon'
61
- ){ cli[:daemon_opts] << '--shush' }
65
+ ) { cli[:daemon_opts] << '--shush' }
62
66
  opt.separator ''
63
67
  end
64
68
  end
65
69
 
66
70
  def perform
67
71
  p = cli.load_project
68
- d = MiGA::Daemon.new(p)
72
+ d = MiGA::Daemon.new(p, cli[:json])
69
73
  [:latency, :maxjobs, :ppn, :shutdown_when_done].each do |k|
70
74
  d.runopts(k, cli[k]) unless cli[k].nil?
71
75
  end
@@ -0,0 +1,95 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
# CLI action for the dereplication workflow: creates a project from the
# input assemblies, runs the daemon, extracts the genomospecies clades and
# their representatives, and optionally copies the representative
# assemblies into a separate directory.
#
# The helpers +default_opts_for_wf+, +opts_for_wf+, +opts_for_wf_distances+,
# +create_project+, +run_daemon+, +summarize+ and +cleanup+ are not defined
# here; presumably they are provided by MiGA::Cli::Action::Wf (confirm there).
class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  # Sets the defaults of this workflow and registers its command-line
  # options.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      metric: :ani, threshold: 95.0, criterion: :quality,
      summaries: true, collection: true
    }
    cli.parse do |opt|
      opt.on(
        '--aai',
        'Use Average Amino Acid Identity (AAI) as genome similarity metric',
        'By default: Use Average Nucleotide Identity (ANI)'
      ) { cli[:metric] = :aai }
      opt.on(
        '--threshold FLOAT', Float,
        "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
      ) { |v| cli[:threshold] = v }
      opt.on(
        '--medoids',
        'Use medoids as clade representatives',
        'By default: Use genome with the highest quality'
      ) { cli[:criterion] = :medoids }
      opt.on(
        '--no-collection',
        'Do not generate a dereplicated collection of assemblies'
      ) { |v| cli[:collection] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf_distances(opt)
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  # Runs the workflow.
  #
  # Raises a RuntimeError if the threshold is outside [0, 100].
  def perform
    # Validate first: the threshold is stored as project metadata by
    # create_project, so reject bad values before that call is made
    unless cli[:threshold].between?(0.0, 100.0)
      raise "The threshold of identity must be in the range [0,100]"
    end
    # Input data
    p = create_project(:assembly,
      { run_project_stats: false, run_clades: false,
        gsp_metric: cli[:metric], :"gsp_#{cli[:metric]}" => cli[:threshold] },
      { run_mytaxa_scan: false, run_ssu: false })
    # Run
    run_daemon
    dereplicate(p)
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    cleanup
  end

  private

  # Writes +genomospecies.tsv+ in the output directory with one row per
  # clade (name, representative, comma-separated members) read from the
  # +:clades_gsp+ file of the +:clade_finding+ result of project +p+. Row
  # +k+ is paired with the +k+-th representative. If the collection is
  # enabled, the +:largecontigs+ file of each representative's assembly
  # result is copied into +representatives/+.
  #
  # Raises a RuntimeError if the result or its clades file is missing.
  def dereplicate(p)
    cli.say "Extracting genomospecies clades"
    r = p.result(:clade_finding)
    raise "Result unavailable: run failed" unless r
    c_f = r.file_path(:clades_gsp)
    raise 'Result incomplete: run failed' unless c_f
    clades = File.readlines(c_f).map { |i| i.chomp.split("\t") }
    rep = representatives(p)
    File.open(File.expand_path('genomospecies.tsv', cli[:outdir]), 'w') do |fh|
      fh.puts "Clade\tRepresentative\tMembers"
      clades.each_with_index do |i, k|
        fh.puts ["gsp_#{k + 1}", rep[k], i.join(',')].join("\t")
      end
    end
    return unless cli[:collection]
    dir = File.expand_path('representatives', cli[:outdir])
    FileUtils.mkdir_p(dir)
    rep.each do |i|
      f = p.dataset(i).result(:assembly).file_path(:largecontigs)
      FileUtils.cp(f, dir)
    end
  end

  # Writes +representatives.txt+ in the output directory and returns its
  # lines (chomped) as an Array. With the +:medoids+ criterion the file is
  # a copy of the +:medoids_gsp+ file of the +:clade_finding+ result;
  # otherwise it holds the second tab-delimited column printed by
  # +utils/representatives.rb+ for project +p+.
  def representatives(p)
    cli.say "Identifying representatives"
    f = File.expand_path('representatives.txt', cli[:outdir])
    if cli[:criterion] == :medoids
      FileUtils.cp(p.result(:clade_finding).file_path(:medoids_gsp), f)
    else
      src = File.expand_path('utils/representatives.rb', MiGA::MiGA.root_path)
      `ruby '#{src}' '#{p.path}' | cut -f 2 > '#{f}'`
    end
    File.readlines(f).map(&:chomp)
  end
end
@@ -7,34 +7,32 @@ require 'sqlite3'
7
7
  class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
8
8
 
9
9
  def parse_cli
10
- @@OPERATIONS.keys.each { |i| cli.defaults = {i => true} }
10
+ @@OPERATIONS.keys.each { |i| cli.defaults = { i => true } }
11
11
  cli.parse do |opt|
12
- operation_n = Hash[@@OPERATIONS.map{ |k,v| [v[0], k] }]
12
+ operation_n = Hash[@@OPERATIONS.map { |k,v| [v[0], k] }]
13
13
  cli.opt_object(opt, [:project])
14
14
  opt.on(
15
15
  '--ignore TASK1,TASK2', Array,
16
16
  'Do not perform the task(s) listed. Available tasks are:',
17
- * @@OPERATIONS.values.map{ |v| "~ #{v[0]}: #{v[1]}" }
18
- ){ |v| v.map{ |i| cli[operation_n[i]] = false } }
17
+ * @@OPERATIONS.values.map { |v| "~ #{v[0]}: #{v[1]}" }
18
+ ) { |v| v.map { |i| cli[operation_n[i]] = false } }
19
19
  opt.on(
20
20
  '--only TASK',
21
21
  'Perform only the specified task (see --ignore)'
22
- ) do |v|
23
- op_k = @@OPERATIONS.find { |_, i| i[0] == v.downcase }.first
24
- @@OPERATIONS.keys.each{ |i| cli[i] = false }
25
- cli[op_k] = true
26
- end
22
+ ) do |v|
23
+ op_k = @@OPERATIONS.find { |_, i| i[0] == v.downcase }.first
24
+ @@OPERATIONS.keys.each { |i| cli[i] = false }
25
+ cli[op_k] = true
26
+ end
27
27
  end
28
28
  end
29
29
 
30
30
  def check_sqlite3_database(db_file, metric)
31
- begin
32
- SQLite3::Database.new(db_file) do |conn|
33
- conn.execute("select count(*) from #{metric}").first
34
- end
35
- rescue SQLite3::SQLException
36
- yield
31
+ SQLite3::Database.new(db_file) do |conn|
32
+ conn.execute("select count(*) from #{metric}").first
37
33
  end
34
+ rescue SQLite3::SQLException
35
+ yield
38
36
  end
39
37
 
40
38
  def perform
@@ -48,6 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
48
46
  db: ['databases', 'Check database files integrity'],
49
47
  dist: ['distances', 'Check distance summary tables'],
50
48
  files: ['files', 'Check for outdated files'],
49
+ cds: ['cds', 'Check for gzipped genes and proteins'],
51
50
  ess: ['essential-genes', 'Check for unarchived essential genes'],
52
51
  mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
53
52
  start: ['start', 'Check for lingering .start files'],
@@ -84,41 +83,9 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
84
83
  res = p.result("#{dist}_distances")
85
84
  next if res.nil?
86
85
  cli.say "Checking #{dist} table for consistent datasets"
87
- notok = {}
88
- fix = {}
89
- Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
90
- lineno = 0
91
- fh.each_line do |ln|
92
- next if (lineno+=1)==1
93
- r = ln.split("\t")
94
- if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
95
- [1,2].each do |i|
96
- if p.dataset(r[i]).nil?
97
- notok[r[i]] = true
98
- else
99
- fix[r[i]] = true
100
- end
101
- end
102
- end
103
- end
104
- end
105
-
106
- cli.say("- Fixing #{fix.size} datasets") unless fix.empty?
107
- fix.keys.each do |d_n|
108
- cli.say " > Fixing #{d_n}."
109
- p.dataset(d_n).cleanup_distances!
110
- end
111
-
112
- unless notok.empty?
113
- cli.say '- Unregistered datasets detected: '
114
- if notok.size <= 5
115
- notok.keys.each { |i| cli.say " > #{i}" }
116
- else
117
- cli.say " > #{notok.size}, including #{notok.keys.first}"
118
- end
119
- cli.say '- Removing tables, recompute'
120
- res.remove!
121
- end
86
+ notok, fix = check_dist_eval(cli, p, res)
87
+ check_dist_fix(cli, p, fix)
88
+ check_dist_recompute(cli, res, notok)
122
89
  end
123
90
  end
124
91
 
@@ -142,6 +109,24 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
142
109
  end
143
110
  end
144
111
 
112
# Gzips any uncompressed file of the +:cds+ result of every dataset in the
# project loaded by +cli+, and re-registers the result of each dataset
# where at least one file was compressed.
#
# A file counts as compressed only if its path ends in +.gz+, so a +.gz+
# elsewhere in the path (such as in a parent directory name) does not cause
# the file to be skipped.
def check_cds(cli)
  cli.say 'Looking for unzipped genes or proteins'
  cli.load_project.each_dataset do |d|
    res = d.result(:cds)
    next unless res
    changed = false
    %i[genes proteins gff3 gff2 tab].each do |f|
      file = res.file_path(f)
      next unless file
      next if file.end_with?('.gz')
      cli.say " > Gzipping #{d.name} #{f}"
      # Anything gzip prints to standard output is relayed as a warning
      cmdo = `gzip -9 '#{file}'`.chomp
      warn(cmdo) unless cmdo.empty?
      changed = true
    end
    d.add_result(:cds, true, force: true) if changed
  end
end
129
+
145
130
  def check_ess(cli)
146
131
  cli.say 'Looking for unarchived essential genes'
147
132
  cli.load_project.each_dataset do |d|
@@ -153,11 +138,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
153
138
  res.remove!
154
139
  next
155
140
  end
156
- unless Dir["#{dir}/*.faa"].empty?
157
- cli.say " > Fixing #{d.name}"
158
- cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
159
- warn(cmdo) unless cmdo.empty?
160
- end
141
+ next if Dir["#{dir}/*.faa"].empty?
142
+ cli.say " > Fixing #{d.name}"
143
+ cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
144
+ warn(cmdo) unless cmdo.empty?
161
145
  end
162
146
  end
163
147
 
@@ -207,4 +191,48 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
207
191
  #cli.say 'o Checking for taxonomy/distances consistency'
208
192
  # TODO: Find 95%ANI clusters with entries from different species
209
193
  end
194
+
195
+ private
196
+
197
# Scans the gzipped, tab-delimited +:matrix+ file of the result +res+ and
# evaluates the dataset names in its second and third columns against the
# project +p+. The first line is skipped (presumably a header row).
#
# For every row in which at least one of the two names is not a dataset of
# +p+, the unknown names are collected in +notok+ and the known ones in
# +fix+. Rows where both names are known are ignored.
#
# +cli+ is accepted for signature parity with the sibling checks but is
# not used here.
#
# Returns the Array +[notok, fix]+, both Hashes of name => true.
def check_dist_eval(cli, p, res)
  notok = {}
  fix = {}
  Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
    fh.each_line.with_index do |ln, idx|
      next if idx.zero?
      names = ln.split("\t").values_at(1, 2)
      # Single lookup per name; the partition is reused for both outcomes
      missing, present = names.partition { |n| p.dataset(n).nil? }
      next if missing.empty?
      missing.each { |n| notok[n] = true }
      present.each { |n| fix[n] = true }
    end
  end
  [notok, fix]
end
217
+
218
# Calls +cleanup_distances!+ on each dataset of project +p+ named by a key
# of the Hash +fix+, reporting progress through +cli+. Does nothing (and
# reports nothing) when +fix+ is empty.
def check_dist_fix(cli, p, fix)
  names = fix.keys
  return if names.empty?
  cli.say("- Fixing #{names.size} datasets")
  names.each do |name|
    cli.say " > Fixing #{name}."
    p.dataset(name).cleanup_distances!
  end
end
226
+
227
# Reports the unregistered dataset names held as keys of the Hash +notok+
# and removes the result +res+ so that it gets recomputed. Up to five names
# are listed individually; beyond that only the count and the first name
# are shown. Does nothing when +notok+ is empty.
#
# The second argument is the result to remove (the caller passes the
# distances result), hence it is named +res+ to match its use in the body.
def check_dist_recompute(cli, res, notok)
  return if notok.empty?
  cli.say '- Unregistered datasets detected: '
  if notok.size <= 5
    notok.keys.each { |i| cli.say " > #{i}" }
  else
    cli.say " > #{notok.size}, including #{notok.keys.first}"
  end
  cli.say '- Removing tables, recompute'
  res.remove!
end
210
238
  end