miga-base 0.4.3.0 → 0.5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,97 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action for the classification workflow: it locates a reference database
# (loaded as a MiGA project), creates a project for the input assemblies that
# points to it, runs the daemon, and writes the summaries plus a
# classification table (classification.tsv) in the output directory.
class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  ##
  # Sets the default option values and declares the command-line options of
  # this workflow, then delegates to +opts_for_wf+ for the remaining options.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      download: false, summaries: true, pvalue: 0.05,
      local: File.expand_path('.miga_db', ENV['MIGA_HOME'])
    }
    cli.parse do |opt|
      opt.on(
        '--download-db',
        'Attempt to download the reference database (all default options)',
        'It is recommended to use "miga get_db" separately instead'
      ) { |v| cli[:download] = v }
      opt.on(
        '-n', '--database STRING',
        'Name of the reference database to use',
        'By default, the first locally listed database is used'
      ) { |v| cli[:database] = v.to_sym }
      opt.on(
        '-p', '--p-value FLOAT', Float,
        'Maximum p-value to transfer taxonomy',
        "By default: #{cli[:pvalue]}"
      ) { |v| cli[:pvalue] = v }
      opt.on(
        '-l', '--local-dir PATH',
        "Local directory to store the database. By default: #{cli[:local]}"
      ) { |v| cli[:local] = v }
      opt.on(
        '--db-path STRING',
        'Path to the reference database to use, a fully indexed MiGA project',
        'If defined, --local-dir and --database are ignored'
      ) { |v| cli[:db_path] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  ##
  # Runs the workflow: resolves the reference database, creates the project
  # with the project-level steps listed below disabled, runs the daemon, and
  # generates the summaries and the classification table.
  def perform
    # Input data
    ref_db = reference_db
    p_metadata = Hash[
      %w[project_stats haai_distances aai_distances ani_distances clade_finding]
        .map { |i| ["run_#{i}", false] }
    ]
    p_metadata[:ref_project] = ref_db.path
    p_metadata[:tax_pvalue] = cli[:pvalue]
    # The value returned by create_project is not used by this workflow
    create_project(:assembly, p_metadata,
                   run_ssu: false, run_mytaxa_scan: false, run_distances: false)
    # Run
    run_daemon
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    summarize(['taxonomy'])
    cli.say "Summary: classification"
    call_cli([
      'ls', '-P', cli[:outdir], '-m', 'tax', '--tab',
      '-o', File.expand_path('classification.tsv', cli[:outdir])
    ])
    cleanup
  end

  private

  ##
  # Loads and returns the reference database as a MiGA::Project. The path is
  # taken from --db-path when given, otherwise it is resolved inside the
  # local directory. Raises if the project cannot be loaded.
  def reference_db
    cli.say "Locating reference database"
    ref_db_path = cli[:db_path]
    ref_db_path = local_reference_db_path if ref_db_path.nil?
    ref_db = MiGA::Project.load(ref_db_path)
    raise "Cannot locate reference database: #{ref_db_path}" if ref_db.nil?
    cli.say "Reference database: #{ref_db.name}"
    ref_db
  end

  ##
  # Path of the reference database inside the local directory. Optionally
  # downloads it first, and falls back to the first database listed in the
  # local manifest when no database name was requested.
  def local_reference_db_path
    download_reference_db if cli[:download]
    cli[:database] = first_listed_database if cli[:database].nil?
    File.expand_path(cli[:database].to_s, cli[:local])
  end

  ##
  # Calls "miga get_db" for the local directory (and the requested database,
  # if any).
  def download_reference_db
    get_db_call = ['get_db', '-l', cli[:local]]
    # The option parser stores the name as a Symbol; argv entries are strings
    get_db_call += ['-n', cli[:database].to_s] unless cli[:database].nil?
    call_cli(get_db_call)
  end

  ##
  # Name of the first database listed in the local manifest. Raises if the
  # manifest is missing, empty, or lists no databases, so that a +nil+ name
  # never turns into a confusing "Cannot locate reference database" error
  # pointing to the local directory itself.
  def first_listed_database
    no_db = 'No locally listed databases, call "miga get_db" first'
    lm_f = File.expand_path('_local_manif.json', cli[:local])
    raise no_db unless File.size? lm_f
    MiGA::Json.parse(lm_f)[:databases].keys.first || raise(no_db)
  end
end
@@ -18,7 +18,7 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
18
18
  run: 'Start the application and stay on top.',
19
19
  zap: 'Set the application to a stopped state.',
20
20
  status: 'Show status (PID) of application instances.'
21
- }.each{ |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
21
+ }.each { |k,v| opt.separator sprintf ' %*s%s', -33, k, v }
22
22
  opt.separator ''
23
23
 
24
24
  opt.separator 'MiGA options:'
@@ -27,45 +27,49 @@ class MiGA::Cli::Action::Daemon < MiGA::Cli::Action
27
27
  '--shutdown-when-done',
28
28
  'Exit the daemon when all processing is done',
29
29
  'Otherwise, it will stay idle awaiting for new data (default)'
30
- ){ |v| cli[:shutdown_when_done] = v }
30
+ ) { |v| cli[:shutdown_when_done] = v }
31
31
  opt.on(
32
32
  '--latency INT',
33
33
  'Number of seconds the daemon will be sleeping'
34
- ){ |v| cli[:latency]=v.to_i }
34
+ ) { |v| cli[:latency] = v.to_i }
35
35
  opt.on(
36
36
  '--max-jobs INT',
37
37
  'Maximum number of jobs to use simultaneously'
38
- ){ |v| cli[:maxjobs]=v.to_i }
38
+ ) { |v| cli[:maxjobs] = v.to_i }
39
39
  opt.on(
40
40
  '--ppn INT',
41
41
  'Maximum number of cores to use in a single job'
42
- ){ |v| cli[:ppn]=v.to_i }
42
+ ) { |v| cli[:ppn] = v.to_i }
43
+ opt.on(
44
+ '--json PATH',
45
+ 'Path to a custom daemon definition in json format'
46
+ ) { |v| cli[:json] = v }
43
47
  cli.opt_common(opt)
44
48
 
45
49
  opt.separator 'Daemon options:'
46
50
  opt.on(
47
51
  '-t', '--ontop',
48
52
  'Stay on top (does not daemonize)'
49
- ){ cli[:daemon_opts] << '-t' }
53
+ ) { cli[:daemon_opts] << '-t' }
50
54
  opt.on(
51
55
  '-f', '--force',
52
56
  'Force operation'
53
- ){ cli[:daemon_opts] << '-f' }
57
+ ) { cli[:daemon_opts] << '-f' }
54
58
  opt.on(
55
59
  '-n', '--no_wait',
56
60
  'Do not wait for processes to stop'
57
- ){ cli[:daemon_opts] << '-n' }
61
+ ) { cli[:daemon_opts] << '-n' }
58
62
  opt.on(
59
63
  '--shush',
60
64
  'Silence the daemon'
61
- ){ cli[:daemon_opts] << '--shush' }
65
+ ) { cli[:daemon_opts] << '--shush' }
62
66
  opt.separator ''
63
67
  end
64
68
  end
65
69
 
66
70
  def perform
67
71
  p = cli.load_project
68
- d = MiGA::Daemon.new(p)
72
+ d = MiGA::Daemon.new(p, cli[:json])
69
73
  [:latency, :maxjobs, :ppn, :shutdown_when_done].each do |k|
70
74
  d.runopts(k, cli[k]) unless cli[k].nil?
71
75
  end
@@ -0,0 +1,95 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action for the dereplication workflow: it creates a project for the
# input assemblies, runs the daemon, extracts the genomospecies clades and
# their representatives, and optionally copies the representative assemblies
# to a "representatives" folder in the output directory.
class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  ##
  # Sets the default option values and declares the command-line options of
  # this workflow, then delegates to +opts_for_wf_distances+ and
  # +opts_for_wf+ for the remaining options.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      metric: :ani, threshold: 95.0, criterion: :quality,
      summaries: true, collection: true
    }
    cli.parse do |opt|
      opt.on(
        '--aai',
        'Use Average Amino Acid Identity (AAI) as genome similarity metric',
        'By default: Use Average Nucleotide Identity (ANI)'
      ) { cli[:metric] = :aai }
      opt.on(
        '--threshold FLOAT', Float,
        "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
      ) { |v| cli[:threshold] = v }
      opt.on(
        '--medoids',
        'Use medoids as clade representatives',
        'By default: Use genome with the highest quality'
      ) { cli[:criterion] = :medoids }
      opt.on(
        '--no-collection',
        'Do not generate a dereplicated collection of assemblies'
      ) { |v| cli[:collection] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf_distances(opt)
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  ##
  # Runs the workflow. Raises if the identity threshold is outside [0,100].
  def perform
    # Validate the input before the project is created, so that an invalid
    # threshold does not leave a project behind
    unless cli[:threshold].between?(0.0, 100.0)
      raise "The threshold of identity must be in the range [0,100]"
    end
    # Input data
    p = create_project(:assembly,
      { run_project_stats: false, run_clades: false,
        gsp_metric: cli[:metric], :"gsp_#{cli[:metric]}" => cli[:threshold] },
      { run_mytaxa_scan: false, run_ssu: false })
    # Run
    run_daemon
    dereplicate(p)
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    cleanup
  end

  private

  ##
  # Writes genomospecies.tsv (clade, representative, members) in the output
  # directory from the clade_finding result of project +p+ and, unless
  # --no-collection was passed, copies the assembly of each representative
  # to the "representatives" folder. Raises if the result or its clades file
  # is unavailable.
  def dereplicate(p)
    cli.say "Extracting genomospecies clades"
    r = p.result(:clade_finding) || raise("Result unavailable: run failed")
    c_f = r.file_path(:clades_gsp) || raise('Result incomplete: run failed')
    clades = File.readlines(c_f).map { |i| i.chomp.split("\t") }
    rep = representatives(p)
    File.open(File.expand_path('genomospecies.tsv', cli[:outdir]), 'w') do |fh|
      fh.puts "Clade\tRepresentative\tMembers"
      # The k-th representative is reported for the k-th clade
      clades.each_with_index do |i, k|
        fh.puts ["gsp_#{k+1}", rep[k], i.join(',')].join("\t")
      end
    end
    return unless cli[:collection]

    dir = File.expand_path('representatives', cli[:outdir])
    FileUtils.mkdir_p(dir)
    rep.each do |i|
      f = p.dataset(i).result(:assembly).file_path(:largecontigs)
      FileUtils.cp(f, dir)
    end
  end

  ##
  # Writes representatives.txt in the output directory and returns its lines
  # as an Array of String. With --medoids the file is a copy of the
  # medoids_gsp file of the clade_finding result; otherwise it is the second
  # column of the output of utils/representatives.rb for project +p+.
  def representatives(p)
    cli.say "Identifying representatives"
    f = File.expand_path('representatives.txt', cli[:outdir])
    if cli[:criterion] == :medoids
      FileUtils.cp(p.result(:clade_finding).file_path(:medoids_gsp), f)
    else
      src = File.expand_path('utils/representatives.rb', MiGA::MiGA.root_path)
      `ruby '#{src}' '#{p.path}' | cut -f 2 > '#{f}'`
    end
    File.readlines(f).map(&:chomp)
  end
end
@@ -7,34 +7,32 @@ require 'sqlite3'
7
7
  class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
8
8
 
9
9
  def parse_cli
10
- @@OPERATIONS.keys.each { |i| cli.defaults = {i => true} }
10
+ @@OPERATIONS.keys.each { |i| cli.defaults = { i => true } }
11
11
  cli.parse do |opt|
12
- operation_n = Hash[@@OPERATIONS.map{ |k,v| [v[0], k] }]
12
+ operation_n = Hash[@@OPERATIONS.map { |k,v| [v[0], k] }]
13
13
  cli.opt_object(opt, [:project])
14
14
  opt.on(
15
15
  '--ignore TASK1,TASK2', Array,
16
16
  'Do not perform the task(s) listed. Available tasks are:',
17
- * @@OPERATIONS.values.map{ |v| "~ #{v[0]}: #{v[1]}" }
18
- ){ |v| v.map{ |i| cli[operation_n[i]] = false } }
17
+ * @@OPERATIONS.values.map { |v| "~ #{v[0]}: #{v[1]}" }
18
+ ) { |v| v.map { |i| cli[operation_n[i]] = false } }
19
19
  opt.on(
20
20
  '--only TASK',
21
21
  'Perform only the specified task (see --ignore)'
22
- ) do |v|
23
- op_k = @@OPERATIONS.find { |_, i| i[0] == v.downcase }.first
24
- @@OPERATIONS.keys.each{ |i| cli[i] = false }
25
- cli[op_k] = true
26
- end
22
+ ) do |v|
23
+ op_k = @@OPERATIONS.find { |_, i| i[0] == v.downcase }.first
24
+ @@OPERATIONS.keys.each { |i| cli[i] = false }
25
+ cli[op_k] = true
26
+ end
27
27
  end
28
28
  end
29
29
 
30
30
  def check_sqlite3_database(db_file, metric)
31
- begin
32
- SQLite3::Database.new(db_file) do |conn|
33
- conn.execute("select count(*) from #{metric}").first
34
- end
35
- rescue SQLite3::SQLException
36
- yield
31
+ SQLite3::Database.new(db_file) do |conn|
32
+ conn.execute("select count(*) from #{metric}").first
37
33
  end
34
+ rescue SQLite3::SQLException
35
+ yield
38
36
  end
39
37
 
40
38
  def perform
@@ -48,6 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
48
46
  db: ['databases', 'Check database files integrity'],
49
47
  dist: ['distances', 'Check distance summary tables'],
50
48
  files: ['files', 'Check for outdated files'],
49
+ cds: ['cds', 'Check for gzipped genes and proteins'],
51
50
  ess: ['essential-genes', 'Check for unarchived essential genes'],
52
51
  mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
53
52
  start: ['start', 'Check for lingering .start files'],
@@ -84,41 +83,9 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
84
83
  res = p.result("#{dist}_distances")
85
84
  next if res.nil?
86
85
  cli.say "Checking #{dist} table for consistent datasets"
87
- notok = {}
88
- fix = {}
89
- Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
90
- lineno = 0
91
- fh.each_line do |ln|
92
- next if (lineno+=1)==1
93
- r = ln.split("\t")
94
- if [1,2].map{ |i| p.dataset(r[i]).nil? }.any?
95
- [1,2].each do |i|
96
- if p.dataset(r[i]).nil?
97
- notok[r[i]] = true
98
- else
99
- fix[r[i]] = true
100
- end
101
- end
102
- end
103
- end
104
- end
105
-
106
- cli.say("- Fixing #{fix.size} datasets") unless fix.empty?
107
- fix.keys.each do |d_n|
108
- cli.say " > Fixing #{d_n}."
109
- p.dataset(d_n).cleanup_distances!
110
- end
111
-
112
- unless notok.empty?
113
- cli.say '- Unregistered datasets detected: '
114
- if notok.size <= 5
115
- notok.keys.each { |i| cli.say " > #{i}" }
116
- else
117
- cli.say " > #{notok.size}, including #{notok.keys.first}"
118
- end
119
- cli.say '- Removing tables, recompute'
120
- res.remove!
121
- end
86
+ notok, fix = check_dist_eval(cli, p, res)
87
+ check_dist_fix(cli, p, fix)
88
+ check_dist_recompute(cli, res, notok)
122
89
  end
123
90
  end
124
91
 
@@ -142,6 +109,24 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
142
109
  end
143
110
  end
144
111
 
112
##
# Gzips any uncompressed file of the cds result (genes, proteins, gff3,
# gff2, tab) in every dataset of the project loaded through +cli+, and
# re-registers the cds result of each dataset where something was compressed.
def check_cds(cli)
  cli.say 'Looking for unzipped genes or proteins'
  cli.load_project.each_dataset do |d|
    res = d.result(:cds)
    next unless res

    changed = false
    [:genes, :proteins, :gff3, :gff2, :tab].each do |f|
      file = res.file_path(f)
      # Only the file extension matters: anchor the test so that a ".gz"
      # elsewhere in the path does not make an uncompressed file look gzipped
      next if file.nil? || file =~ /\.gz\z/

      cli.say " > Gzipping #{d.name} #{f}"
      # Argument-array form: the path is never interpreted by a shell
      cmdo = IO.popen(['gzip', '-9', file], &:read).chomp
      warn(cmdo) unless cmdo.empty?
      changed = true
    end
    d.add_result(:cds, true, force: true) if changed
  end
end
129
+
145
130
  def check_ess(cli)
146
131
  cli.say 'Looking for unarchived essential genes'
147
132
  cli.load_project.each_dataset do |d|
@@ -153,11 +138,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
153
138
  res.remove!
154
139
  next
155
140
  end
156
- unless Dir["#{dir}/*.faa"].empty?
157
- cli.say " > Fixing #{d.name}"
158
- cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
159
- warn(cmdo) unless cmdo.empty?
160
- end
141
+ next if Dir["#{dir}/*.faa"].empty?
142
+ cli.say " > Fixing #{d.name}"
143
+ cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
144
+ warn(cmdo) unless cmdo.empty?
161
145
  end
162
146
  end
163
147
 
@@ -207,4 +191,48 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
207
191
  #cli.say 'o Checking for taxonomy/distances consistency'
208
192
  # TODO: Find 95%ANI clusters with entries from different species
209
193
  end
194
+
195
+ private
196
+
197
##
# Scans the gzipped matrix of the distances result +res+ (skipping its first
# line) looking for rows where at least one of the two names in columns 1
# and 2 (0-based) is not a dataset of project +p+.
#
# Returns a two-element Array of Hashes keyed by name with +true+ values:
# first the names not registered in the project, then the registered
# datasets that share a row with one of those names. +cli+ is not used.
def check_dist_eval(cli, p, res)
  unregistered = {}
  to_fix = {}
  Zlib::GzipReader.open(res.file_path(:matrix)) do |gz|
    gz.each_line.with_index do |row, idx|
      next if idx.zero? # header

      pair = row.split("\t").values_at(1, 2)
      known = pair.map { |name| !p.dataset(name).nil? }
      next if known.all?

      pair.zip(known) do |name, registered|
        (registered ? to_fix : unregistered)[name] = true
      end
    end
  end
  [unregistered, to_fix]
end
217
+
218
##
# Calls +cleanup_distances!+ on every dataset of project +p+ named by a key
# of the Hash +fix+, reporting progress through +cli+. Does nothing (and
# says nothing) when +fix+ is empty.
def check_dist_fix(cli, p, fix)
  unless fix.empty?
    cli.say("- Fixing #{fix.size} datasets")
    fix.keys.each do |dataset_name|
      cli.say " > Fixing #{dataset_name}."
      p.dataset(dataset_name).cleanup_distances!
    end
  end
end
226
+
227
##
# Reports the unregistered names in the Hash +notok+ through +cli+ (listing
# each one when there are at most five, otherwise the count and the first
# one) and removes the distances result +res+ so that it gets recomputed.
# Does nothing when +notok+ is empty.
#
# The second parameter is the result to remove: the caller passes the
# distances result in that position, and the body calls +res.remove!+.
def check_dist_recompute(cli, res, notok)
  return if notok.empty?

  cli.say '- Unregistered datasets detected: '
  if notok.size <= 5
    notok.keys.each { |i| cli.say " > #{i}" }
  else
    cli.say " > #{notok.size}, including #{notok.keys.first}"
  end
  cli.say '- Removing tables, recompute'
  res.remove!
end
210
238
  end