miga-base 0.4.3.0 → 0.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -35,13 +35,16 @@ class MiGA::Daemon < MiGA::MiGA
35
35
 
36
36
  ##
37
37
  # Initialize an inactive daemon for the MiGA::Project +project+. See #daemon
38
- # to wake the daemon.
39
- def initialize(project)
38
+ # to wake the daemon. If passed, +json+ must be the path to a daemon
39
+ # definition in json format. Otherwise, the project-stored daemon definition
40
+ # is used. In either case, missing variables are used as defined in
41
+ # ~/.miga_daemon.json.
42
+ def initialize(project, json = nil)
40
43
  $_MIGA_DAEMON_LAIR << self
41
44
  @project = project
45
+ json ||= File.expand_path('daemon/daemon.json', project.path)
42
46
  @runopts = MiGA::Json.parse(
43
- File.expand_path('daemon/daemon.json', project.path),
44
- default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
47
+ json, default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
45
48
  @jobs_to_run = []
46
49
  @jobs_running = []
47
50
  @loop_i = -1
@@ -35,14 +35,14 @@ module MiGA::Dataset::Base
35
35
  ##
36
36
  # Supported dataset types.
37
37
  @@KNOWN_TYPES = {
38
- genome: {description: 'The genome from an isolate.', multi: false},
39
- scgenome: {description: 'A Single-cell Amplidied Genome (SAG).',
38
+ genome: {description: 'The genome from an isolate', multi: false},
39
+ scgenome: {description: 'A Single-cell Amplified Genome (SAG)',
40
40
  multi: false},
41
- popgenome: {description: 'A Metagenome-Assembled Genome (MAG).',
41
+ popgenome: {description: 'A Metagenome-Assembled Genome (MAG)',
42
42
  :multi=>false},
43
- metagenome: {description: 'A metagenome (excluding viromes).',
43
+ metagenome: {description: 'A metagenome (excluding viromes)',
44
44
  multi: true},
45
- virome: {description: 'A viral metagenome.', multi: true}
45
+ virome: {description: 'A viral metagenome', multi: true}
46
46
  }
47
47
 
48
48
  ##
@@ -76,13 +76,13 @@ module MiGA::Project::Base
76
76
  # Supported types of projects.
77
77
  @@KNOWN_TYPES = {
78
78
  mixed: {
79
- description: "Mixed collection of genomes, metagenomes, and viromes.",
79
+ description: "Mixed collection of genomes, metagenomes, and viromes",
80
80
  single: true, multi: true},
81
- genomes: {description: "Collection of genomes.",
81
+ genomes: {description: "Collection of genomes",
82
82
  single: true, multi: false},
83
- clade: {description: "Collection of closely-related genomes (ANI >= 90%).",
83
+ clade: {description: "Collection of closely-related genomes (ANI >= 90%)",
84
84
  single: true, multi: false},
85
- metagenomes: {description: "Collection of metagenomes and/or viromes.",
85
+ metagenomes: {description: "Collection of metagenomes and/or viromes",
86
86
  single: false, multi: true}
87
87
  }
88
88
 
@@ -110,7 +110,8 @@ module MiGA::Project::Result
110
110
  r.add_file(:proposal, 'miga-project.proposed-clades')
111
111
  r.add_file(:clades_aai90, 'miga-project.aai90-clades')
112
112
  r.add_file(:clades_ani95, 'miga-project.ani95-clades')
113
- r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
113
+ r.add_file(:clades_gsp, 'miga-project.gsp-clades')
114
+ r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
114
115
  r
115
116
  end
116
117
 
@@ -35,9 +35,9 @@ module MiGA::RemoteDataset::Base
35
35
  @@UNIVERSE = {
36
36
  web: {
37
37
  dbs: {
38
- assembly: {stage: :assembly, format: :fasta},
39
- assembly_gz: {stage: :assembly, format: :fasta_gz},
40
- text: {stage: :metadata, format: :text}
38
+ assembly: { stage: :assembly, format: :fasta },
39
+ assembly_gz: { stage: :assembly, format: :fasta_gz },
40
+ text: { stage: :metadata, format: :text }
41
41
  },
42
42
  url: '%2$s',
43
43
  method: :net
@@ -59,8 +59,8 @@ module MiGA::RemoteDataset::Base
59
59
  },
60
60
  ncbi_map: {
61
61
  dbs: {
62
- nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
63
- format: :json},
62
+ nuccore: { stage: :metadata, map_to: [:biosample, :assembly],
63
+ format: :json },
64
64
  biosample: {stage: :metadata, map_to: [:assembly], format: :json}
65
65
  },
66
66
  url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
@@ -84,7 +84,7 @@ class MiGA::RemoteDataset
84
84
  end
85
85
  doc
86
86
  end
87
-
87
+
88
88
  ##
89
89
  # Looks for the entry +id+ in +dbfrom+, and returns the linked
90
90
  # identifier in +db+ (or nil).
@@ -10,15 +10,15 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.4, 3, 0]
13
+ VERSION = [0.5, 0, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
17
- VERSION_NAME = 'aquatint'
17
+ VERSION_NAME = 'collotype'
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2019, 9, 10)
21
+ VERSION_DATE = Date.new(2019, 11, 25)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
@@ -36,7 +36,9 @@ perl -pe 's/ID=([0-9]+_[0-9]+);/ID=gene_$1;/' "$DATASET.gff3" \
36
36
  mv "$DATASET.gff3.t" "$DATASET.gff3"
37
37
 
38
38
  # Gzip
39
- gzip -9 -f "$DATASET.gff3"
39
+ for ext in gff3 faa fna ; do
40
+ [[ -e "$DATASET.$ext" ]] && gzip -9 -f "$DATASET.$ext"
41
+ done
40
42
 
41
43
  # Finalize
42
44
  miga date > "$DATASET.done"
@@ -9,6 +9,7 @@ cd "$PROJECT/data/07.annotation/01.function/01.essential"
9
9
  # Initialize
10
10
  miga date > "${DATASET}.start"
11
11
  FAA="../../../06.cds/${DATASET}.faa"
12
+ [[ -s "$FAA" ]] || FAA="${FAA}.gz"
12
13
 
13
14
  # Check if there are any proteins
14
15
  if [[ ! -s $FAA ]] ; then
@@ -12,7 +12,7 @@ cd "$DIR"
12
12
  miga date > "$DATASET.start"
13
13
 
14
14
  # Calculate statistics
15
- for i in raw_reads trimmed_fasta assembly cds essential_genes distances ; do
15
+ for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
16
16
  echo "# $i"
17
17
  miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
18
18
  done
@@ -13,9 +13,11 @@ miga date > "$DATASET.start"
13
13
 
14
14
  # Gunzip (if necessary)
15
15
  for sis in 1 2 ; do
16
- [[ -e "../02.trimmed_reads/$b.$sis.clipped.fastq.gz" \
17
- && ! -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
18
- && gunzip "../02.trimmed_reads/$b.$sis.clipped.fastq.gz"
16
+ for ext in clipped clipped.single ; do
17
+ [[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
18
+ && ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
19
+ && gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
20
+ done
19
21
  done
20
22
  miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
21
23
 
@@ -23,6 +23,9 @@ class MiGA::DistanceRunner
23
23
  @dataset = project.dataset(dataset_name)
24
24
  @home = File.expand_path('data/09.distances', project.path)
25
25
  # Default opts
26
+ if project.metadata[:aai_save_rbm] == false
27
+ @opts[:aai_save_rbm] ||= 'no-save-rbm'
28
+ end
26
29
  @opts[:aai_save_rbm] ||= ENV.fetch('MIGA_AAI_SAVE_RBM') do
27
30
  project.is_clade? ? 'save-rbm' : 'no-save-rbm'
28
31
  end
@@ -1,5 +1,6 @@
1
1
 
2
2
  require 'tmpdir'
3
+ require 'zlib'
3
4
 
4
5
  module MiGA::DistanceRunner::Temporal
5
6
 
@@ -9,7 +10,15 @@ module MiGA::DistanceRunner::Temporal
9
10
  rf.each do |res, file|
10
11
  r = dataset.result(res)
11
12
  f = r.nil? ? nil : r.file_path(file)
12
- FileUtils.cp(f, tmp_file("#{file}.fa")) unless f.nil?
13
+ unless f.nil?
14
+ if f =~ /\.gz/
15
+ File.open(tmp_file("#{file}.fa"), 'w') do |ofh|
16
+ Zlib::GzipReader.open(f) { |ifh| ofh.print ifh.read }
17
+ end
18
+ else
19
+ FileUtils.cp(f, tmp_file("#{file}.fa"))
20
+ end
21
+ end
13
22
  end
14
23
  end
15
24
 
@@ -610,6 +610,11 @@
610
610
  "opt": "--defline",
611
611
  "description": "Keep the original defline after a space."
612
612
  },
613
+ {
614
+ "opt": "--list",
615
+ "arg": "in_file",
616
+ "description": "Reads a list of IDS."
617
+ },
613
618
  {
614
619
  "opt": "--quiet",
615
620
  "description": "Run quietly (no STDERR output)."
@@ -388,6 +388,13 @@
388
388
  "arg": "out_file",
389
389
  "description": "Output file containing the aligned proteins."
390
390
  },
391
+ {
392
+ "opt": "--components",
393
+ "arg": "out_file",
394
+ "description": ["Output file containing the components of the",
395
+ "estimation. Tab-delimited file with model name, matches, and",
396
+ "columns."]
397
+ },
391
398
  {
392
399
  "opt": "--quiet",
393
400
  "description": "Run quietly (no STDERR output)."
@@ -2,46 +2,46 @@
2
2
 
3
3
  #
4
4
  # @author: Luis M. Rodriguez-R
5
- # @update: Feb-06-2015
6
5
  # @license: artistic license 2.0
7
6
  #
8
7
 
9
8
  require 'optparse'
10
9
 
11
- o = {:subject=>FALSE, :quiet=>FALSE}
12
- ARGV << '-h' if ARGV.size==0
10
+ o = { sbj: false, q: false }
11
+ ARGV << '-h' if ARGV.size == 0
13
12
  OptionParser.new do |opts|
14
- opts.banner = "
15
- Appends an extra column to a BLAST with the length of the query or the subject sequence.
16
- You can pipe two instances to add both:
17
- cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
13
+ opts.banner = "
14
+ Appends an extra column to a BLAST with the length of the query or the subject
15
+ sequence. You can pipe two instances to add both:
16
+ cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
18
17
 
19
18
  Usage: #{$0} [options] < input.blast > output.blast"
20
- opts.separator ""
21
- opts.separator "Mandatory"
22
- opts.on("-f", "--fasta FILE", "Path to the FastA file"){ |v| o[:fasta] = v }
23
- opts.separator ""
24
- opts.separator "Options"
25
- opts.on("-s", "--subject",
26
- "Use the subject column of the BLAST, by default the query column is used"){ o[:subject] = TRUE }
27
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:quiet] = TRUE }
28
- opts.on("-h", "--help", "Display this screen") do
29
- puts opts
30
- exit
31
- end
32
- opts.separator ""
19
+ opts.separator ''
20
+ opts.separator 'Mandatory'
21
+ opts.on('-f', '--fasta FILE', 'Path to the FastA file'){ |v| o[:fasta] = v }
22
+ opts.separator ''
23
+ opts.separator 'Options'
24
+ opts.on('-s', '--subject',
25
+ 'Use the subject column of the BLAST, by default the query column is used'
26
+ ){ o[:sbj] = true }
27
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
28
+ opts.on('-h', '--help', 'Display this screen') do
29
+ puts opts
30
+ exit
31
+ end
32
+ opts.separator ''
33
33
  end.parse!
34
- abort "-f is mandatory" if o[:fasta].nil?
34
+ abort '-f is mandatory' if o[:fasta].nil?
35
35
 
36
36
  len = {}
37
- id = ""
38
- $stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:quiet]
39
- fh = File.open(o[:fasta], "r")
37
+ id = ''
38
+ $stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:q]
39
+ fh = File.open(o[:fasta], 'r')
40
40
  fh.each_line do |ln|
41
41
  defline = /^>(\S+)/.match(ln)
42
42
  if defline.nil?
43
43
  ln.gsub! /[^A-Za-z]/, ''
44
- abort "Error: Unsupported format, expecting FastA" if len[id].nil?
44
+ abort 'Error: Unsupported format, expecting FastA' if len[id].nil?
45
45
  len[id] = len[id] + ln.size
46
46
  else
47
47
  id = defline[1]
@@ -50,12 +50,14 @@ fh.each_line do |ln|
50
50
  end
51
51
  fh.close
52
52
 
53
- $stderr.puts "Appending #{o[:subject]?"subject":"query"} length column" unless o[:quiet]
53
+ unless o[:q]
54
+ $stderr.puts 'Appending %s length column' % (o[:sbj] ? 'subject' : 'query')
55
+ end
54
56
  ARGF.each_line do |ln|
55
- ln.chomp!
56
- row = ln.split /\t/
57
- id = o[:subject] ? row[1] : row[0];
58
- abort "Impossible to find sequence of #{id}" if len[id].nil?
59
- puts "#{ln}\t#{len[id]}"
57
+ ln.chomp!
58
+ row = ln.split /\t/
59
+ id = o[:sbj] ? row[1] : row[0]
60
+ abort "Impossible to find sequence of #{id}" if len[id].nil?
61
+ puts "#{ln}\t#{len[id]}"
60
62
  end
61
63
 
@@ -1,64 +1,65 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
3
  # @author Luis M. Rodriguez-R
5
- # @update Oct-07-2015
6
4
  # @license artistic license 2.0
7
- #
8
5
 
9
6
  require 'optparse'
10
7
 
11
- o = {:q=>FALSE, :p=>"", :s=>"", :d=>FALSE}
8
+ o = {q: false, p: '', s: '', d: false}
12
9
  ARGV << '-h' if ARGV.size==0
13
10
  OptionParser.new do |opts|
14
- opts.banner = "
11
+ opts.banner = "
15
12
  Generates easy-to-parse tagged reads from FastA files.
16
13
 
17
14
  Usage: #{$0} [options]"
18
- opts.separator ""
19
- opts.separator "Mandatory"
20
- opts.on("-i", "--in FILE",
21
- "Path to the FastA file containing the sequences."){ |v| o[:in] = v }
22
- opts.on("-o", "--out FILE",
23
- "Path to the FastA to create."){ |v| o[:out] = v }
24
- opts.separator ""
25
- opts.separator "ID options"
26
- opts.on("-p", "--prefix STR", "Prefix to use in all IDs."){ |v| o[:p] = v }
27
- opts.on("-s", "--suffix STR", "Suffix to use in all IDs."){ |v| o[:s] = v }
28
- opts.on("-d", "--defline",
29
- "Keep the original defline after a space."){ o[:d]=TRUE }
30
- opts.separator ""
31
- opts.separator "Other Options"
32
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
33
- opts.on("-h", "--help", "Display this screen") do
34
- puts opts
35
- exit
36
- end
37
- opts.separator ""
15
+ opts.separator ''
16
+ opts.separator 'Mandatory'
17
+ opts.on('-i', '--in FILE',
18
+ 'Path to the FastA file containing the sequences.'){ |v| o[:in] = v }
19
+ opts.on('-o', '--out FILE',
20
+ 'Path to the FastA to create.'){ |v| o[:out] = v }
21
+ opts.separator ''
22
+ opts.separator 'ID options'
23
+ opts.on('-p', '--prefix STR', 'Prefix to use in all IDs.'){ |v| o[:p] = v }
24
+ opts.on('-s', '--suffix STR', 'Suffix to use in all IDs.'){ |v| o[:s] = v }
25
+ opts.on('-d', '--defline',
26
+ 'Keep the original defline after a space.'){ o[:d] = true }
27
+ opts.on('-l', '--list FILE',
28
+ 'Reads a list of IDS.'){ |v| o[:l] = v }
29
+ opts.separator ''
30
+ opts.separator 'Other Options'
31
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
32
+ opts.on('-h', '--help', 'Display this screen') do
33
+ puts opts
34
+ exit
35
+ end
36
+ opts.separator ''
38
37
  end.parse!
39
- abort "-i is mandatory" if o[:in].nil?
40
- abort "-o is mandatory" if o[:out].nil?
38
+ abort '-i is mandatory' if o[:in].nil?
39
+ abort '-o is mandatory' if o[:out].nil?
41
40
 
42
41
  begin
43
- ifh = File.open(o[:in], 'r');
44
- ofh = File.open(o[:out], 'w');
45
- i=0
46
- while ln=ifh.gets
42
+ list = o[:l].nil? ? nil :
43
+ File.readlines(o[:l]).map{ |i| i.chomp.gsub(/^>/, '') }
44
+ ofh = File.open(o[:out], 'w')
45
+ i = 0
46
+ File.open(o[:in], 'r') do |ifh|
47
+ ifh.each do |ln|
47
48
  ln.chomp!
48
49
  next if ln =~ /^;/
49
50
  unless /^>/.match(ln).nil?
50
- i+=1
51
- ofh.puts ">#{o[:p]}#{i}#{o[:s]}#{ o[:d]?" #{ln[1, ln.size-1]}":"" }"
51
+ i += 1
52
+ new_id = o[:l].nil? ? i : list.shift
53
+ ofh.puts ">#{o[:p]}#{new_id}#{o[:s]}#{o[:d]?" #{ln[1, ln.size-1]}":''}"
52
54
  else
53
- ofh.puts ln
55
+ ofh.puts ln
54
56
  end
55
- end
56
- ifh.close
57
- ofh.close
57
+ end
58
+ end
59
+ ofh.close
58
60
  rescue => err
59
- $stderr.puts "Exception: #{err}\n\n"
60
- err.backtrace.each { |l| $stderr.puts l + "\n" }
61
- err
61
+ $stderr.puts "Exception: #{err}\n\n"
62
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
63
+ err
62
64
  end
63
65
 
64
-
@@ -8,8 +8,10 @@ require 'enveomics_rb/enveomics'
8
8
  use 'tmpdir'
9
9
  use 'zlib'
10
10
 
11
- o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
12
- archaea: false, genomeeq: false, metagenome: false, list: false}
11
+ o = {
12
+ bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
13
+ archaea: false, genomeeq: false, metagenome: false, list: false
14
+ }
13
15
  OptionParser.new do |opts|
14
16
  opts.banner = "
15
17
  Finds and extracts a collection of essential proteins suitable for genome
@@ -26,65 +28,86 @@ Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
26
28
  Usage: #{$0} [options]"
27
29
  opts.separator ''
28
30
  opts.separator 'Mandatory'
29
- opts.on('-i', '--in FILE',
30
- 'Path to the FastA file containing all the proteins in a genome.'
31
- ){ |v| o[:in] = v }
31
+ opts.on(
32
+ '-i', '--in FILE',
33
+ 'Path to the FastA file (.gz allowed) with all the proteins in a genome'
34
+ ) { |v| o[:in] = v }
32
35
  opts.separator ''
33
36
  opts.separator 'Report Options'
34
- opts.on('-o', '--out FILE',
35
- 'Path to the output FastA file with the translated essential genes.',
36
- 'By default the file is not produced.'){ |v| o[:out] = v }
37
- opts.on('-m', '--per-model STR',
37
+ opts.on(
38
+ '-o', '--out FILE',
39
+ 'Path to the output FastA file with the translated essential genes',
40
+ 'By default the file is not produced'
41
+ ) { |v| o[:out] = v }
42
+ opts.on(
43
+ '-m', '--per-model STR',
38
44
  'Prefix of translated genes in independent files with the name of the',
39
- 'model appended. By default files are not produced.'
40
- ){ |v| o[:permodel] = v }
41
- opts.on('-R', '--report FILE',
42
- 'Path to the report file. By default, the report is sent to the STDOUT.'
43
- ){ |v| o[:report] = v }
44
- opts.on('--hmm-out FILE',
45
- 'Save HMMsearch output in this file. By default, not saved.'
46
- ){ |v| o[:hmmout] = v }
47
- opts.on('--alignments FILE',
45
+ 'model appended. By default files are not produced'
46
+ ) { |v| o[:permodel] = v }
47
+ opts.on(
48
+ '-R', '--report FILE',
49
+ 'Path to the report file. By default, the report is sent to the STDOUT'
50
+ ) { |v| o[:report] = v }
51
+ opts.on(
52
+ '--hmm-out FILE',
53
+ 'Save HMMsearch output in this file. By default, not saved'
54
+ ) { |v| o[:hmmout] = v }
55
+ opts.on(
56
+ '--alignments FILE',
48
57
  'Save the aligned proteins in this file. By default, not saved'
49
- ){ |v| o[:alignments] = v }
50
- opts.on('-B', '--bacteria',
51
- 'If set, ignores models typically missing in Bacteria.'
52
- ){ |v| o[:bacteria] = v }
53
- opts.on('-A', '--archaea',
54
- 'If set, ignores models typically missing in Archaea.'
55
- ){ |v| o[:archaea] = v }
56
- opts.on('-G', '--genome-eq',
57
- 'If set, ignores models not suitable for genome-equivalents estimations.',
58
- 'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
59
- ){ |v| o[:genomeeq] = v }
60
- opts.on('-r', '--rename STR',
58
+ ) { |v| o[:alignments] = v }
59
+ opts.on(
60
+ '-B', '--bacteria',
61
+ 'If set, ignores models typically missing in Bacteria'
62
+ ) { |v| o[:bacteria] = v }
63
+ opts.on(
64
+ '-A', '--archaea',
65
+ 'If set, ignores models typically missing in Archaea'
66
+ ) { |v| o[:archaea] = v }
67
+ opts.on(
68
+ '-G', '--genome-eq',
69
+ 'If set, ignores models not suitable for genome-equivalents estimations',
70
+ 'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
71
+ ) { |v| o[:genomeeq] = v }
72
+ opts.on(
73
+ '-r', '--rename STR',
61
74
  'If set, renames the sequences with the string provided and appends it',
62
- 'with pipe and the gene name (except in --per-model files).'
63
- ){ |v| o[:rename]=v }
64
- opts.on('-n', '--no-stats',
65
- 'If set, no statistics are reported on genome evaluation.'
66
- ){ |v| o[:stats] = v }
67
- opts.on('-s', '--no-genes',
68
- 'If set, statistics won\'t include the lists of missing/multi-copy genes.'
69
- ){ |v| o[:genes] = v }
70
- opts.on('-M', '--metagenome',
75
+ 'with pipe and the gene name (except in --per-model files)'
76
+ ) { |v| o[:rename] = v }
77
+ opts.on(
78
+ '-n', '--no-stats',
79
+ 'If set, no statistics are reported on genome evaluation'
80
+ ) { |v| o[:stats] = v }
81
+ opts.on(
82
+ '-s', '--no-genes',
83
+ 'If set, statistics won\'t include the lists of missing/multi-copy genes'
84
+ ) { |v| o[:genes] = v }
85
+ opts.on(
86
+ '-M', '--metagenome',
71
87
  'If set, it allows for multiple copies of each gene and turns on',
72
- 'metagenomic report mode.'){ |v| o[:metagenome] = v }
88
+ 'metagenomic report mode'
89
+ ) { |v| o[:metagenome] = v }
73
90
  opts.separator ''
74
91
  opts.separator 'Other Options'
75
- opts.on('-L', '--list-models',
92
+ opts.on(
93
+ '-L', '--list-models',
76
94
  'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
77
- 'and -q; ignores all other parameters.'){ |v| o[:list] = v }
78
- opts.on('-b', '--bin DIR',
79
- 'Path to the directory containing the binaries of HMMer 3.0+.'
80
- ){ |v| o[:bin] = v }
81
- opts.on('--model-file',
82
- 'External file containing models to search.'){ |v| o[:model_file] = v }
83
- opts.on('-t', '--threads INT',
84
- "Number of parallel threads to be used. By default: #{o[:thr]}."
85
- ){ |v| o[:thr] = v.to_i }
86
- opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
87
- opts.on('-h', '--help', 'Display this screen.') do
95
+ 'and -q; ignores all other parameters'
96
+ ) { |v| o[:list] = v }
97
+ opts.on(
98
+ '-b', '--bin DIR',
99
+ 'Path to the directory containing the binaries of HMMer 3.0+'
100
+ ) { |v| o[:bin] = v }
101
+ opts.on(
102
+ '--model-file',
103
+ 'External file containing models to search'
104
+ ) { |v| o[:model_file] = v }
105
+ opts.on(
106
+ '-t', '--threads INT', Integer,
107
+ "Number of parallel threads to be used. By default: #{o[:thr]}"
108
+ ) { |v| o[:thr] = v }
109
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
110
+ opts.on('-h', '--help', 'Display this screen') do
88
111
  puts opts
89
112
  exit
90
113
  end
@@ -112,6 +135,13 @@ TIGR00389 TIGR00436 tRNA-synth_1d}
112
135
  begin
113
136
  Dir.mktmpdir do |dir|
114
137
  $stderr.puts "Temporal directory: #{dir}." unless o[:q]
138
+ if o[:in] =~ /\.gz/
139
+ tmp_in = File.expand_path('sequences.fa', dir)
140
+ Zlib::GzipReader.open(o[:in]) do |ifh|
141
+ File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
142
+ end
143
+ o[:in] = tmp_in
144
+ end
115
145
 
116
146
  # Create database.
117
147
  $stderr.puts 'Searching models.' unless o[:q]
@@ -144,9 +174,9 @@ begin
144
174
  'This script requires HMMER 3.0+.'
145
175
  end
146
176
  o[:hmmout] ||= "#{dir}/hmmsearch"
147
- `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
148
- -A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
149
- > #{dir}/hmmsearch.log`
177
+ `'#{o[:bin]}hmmsearch' --cpu #{o[:thr]} --tblout '#{o[:hmmout]}' \
178
+ -A '#{dir}/a.sto' --cut_tc --notextw '#{dir}/essential.hmm' '#{o[:in]}' \
179
+ > '#{dir}/hmmsearch.log'`
150
180
 
151
181
  # Parse output
152
182
  $stderr.puts 'Parsing results.' unless o[:q]