miga-base 0.4.3.0 → 0.5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -35,13 +35,16 @@ class MiGA::Daemon < MiGA::MiGA
35
35
 
36
36
  ##
37
37
  # Initialize an unactive daemon for the MiGA::Project +project+. See #daemon
38
- # to wake the daemon.
39
- def initialize(project)
38
+ # to wake the daemon. If passed, +json+ must be the path to a daemon
39
+ # definition in json format. Otherwise, the project-stored daemon definition
40
+ # is used. In either case, missing variables are used as defined in
41
+ # ~/.miga_daemon.json.
42
+ def initialize(project, json = nil)
40
43
  $_MIGA_DAEMON_LAIR << self
41
44
  @project = project
45
+ json ||= File.expand_path('daemon/daemon.json', project.path)
42
46
  @runopts = MiGA::Json.parse(
43
- File.expand_path('daemon/daemon.json', project.path),
44
- default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
47
+ json, default: File.expand_path('.miga_daemon.json', ENV['MIGA_HOME']))
45
48
  @jobs_to_run = []
46
49
  @jobs_running = []
47
50
  @loop_i = -1
@@ -35,14 +35,14 @@ module MiGA::Dataset::Base
35
35
  ##
36
36
  # Supported dataset types.
37
37
  @@KNOWN_TYPES = {
38
- genome: {description: 'The genome from an isolate.', multi: false},
39
- scgenome: {description: 'A Single-cell Amplidied Genome (SAG).',
38
+ genome: {description: 'The genome from an isolate', multi: false},
39
+ scgenome: {description: 'A Single-cell Amplified Genome (SAG)',
40
40
  multi: false},
41
- popgenome: {description: 'A Metagenome-Assembled Genome (MAG).',
41
+ popgenome: {description: 'A Metagenome-Assembled Genome (MAG)',
42
42
  :multi=>false},
43
- metagenome: {description: 'A metagenome (excluding viromes).',
43
+ metagenome: {description: 'A metagenome (excluding viromes)',
44
44
  multi: true},
45
- virome: {description: 'A viral metagenome.', multi: true}
45
+ virome: {description: 'A viral metagenome', multi: true}
46
46
  }
47
47
 
48
48
  ##
@@ -76,13 +76,13 @@ module MiGA::Project::Base
76
76
  # Supported types of projects.
77
77
  @@KNOWN_TYPES = {
78
78
  mixed: {
79
- description: "Mixed collection of genomes, metagenomes, and viromes.",
79
+ description: "Mixed collection of genomes, metagenomes, and viromes",
80
80
  single: true, multi: true},
81
- genomes: {description: "Collection of genomes.",
81
+ genomes: {description: "Collection of genomes",
82
82
  single: true, multi: false},
83
- clade: {description: "Collection of closely-related genomes (ANI >= 90%).",
83
+ clade: {description: "Collection of closely-related genomes (ANI >= 90%)",
84
84
  single: true, multi: false},
85
- metagenomes: {description: "Collection of metagenomes and/or viromes.",
85
+ metagenomes: {description: "Collection of metagenomes and/or viromes",
86
86
  single: false, multi: true}
87
87
  }
88
88
 
@@ -110,7 +110,8 @@ module MiGA::Project::Result
110
110
  r.add_file(:proposal, 'miga-project.proposed-clades')
111
111
  r.add_file(:clades_aai90, 'miga-project.aai90-clades')
112
112
  r.add_file(:clades_ani95, 'miga-project.ani95-clades')
113
- r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
113
+ r.add_file(:clades_gsp, 'miga-project.gsp-clades')
114
+ r.add_file(:medoids_gsp, 'miga-project.gsp-medoids')
114
115
  r
115
116
  end
116
117
 
@@ -35,9 +35,9 @@ module MiGA::RemoteDataset::Base
35
35
  @@UNIVERSE = {
36
36
  web: {
37
37
  dbs: {
38
- assembly: {stage: :assembly, format: :fasta},
39
- assembly_gz: {stage: :assembly, format: :fasta_gz},
40
- text: {stage: :metadata, format: :text}
38
+ assembly: { stage: :assembly, format: :fasta },
39
+ assembly_gz: { stage: :assembly, format: :fasta_gz },
40
+ text: { stage: :metadata, format: :text }
41
41
  },
42
42
  url: '%2$s',
43
43
  method: :net
@@ -59,8 +59,8 @@ module MiGA::RemoteDataset::Base
59
59
  },
60
60
  ncbi_map: {
61
61
  dbs: {
62
- nuccore: {stage: :metadata, map_to: [:biosample, :assembly],
63
- format: :json},
62
+ nuccore: { stage: :metadata, map_to: [:biosample, :assembly],
63
+ format: :json },
64
64
  biosample: {stage: :metadata, map_to: [:assembly], format: :json}
65
65
  },
66
66
  url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
@@ -84,7 +84,7 @@ class MiGA::RemoteDataset
84
84
  end
85
85
  doc
86
86
  end
87
-
87
+
88
88
  ##
89
89
  # Looks for the entry +id+ in +dbfrom+, and returns the linked
90
90
  # identifier in +db+ (or nil).
@@ -10,15 +10,15 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.4, 3, 0]
13
+ VERSION = [0.5, 0, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
17
- VERSION_NAME = 'aquatint'
17
+ VERSION_NAME = 'collotype'
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2019, 9, 10)
21
+ VERSION_DATE = Date.new(2019, 11, 25)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
@@ -36,7 +36,9 @@ perl -pe 's/ID=([0-9]+_[0-9]+);/ID=gene_$1;/' "$DATASET.gff3" \
36
36
  mv "$DATASET.gff3.t" "$DATASET.gff3"
37
37
 
38
38
  # Gzip
39
- gzip -9 -f "$DATASET.gff3"
39
+ for ext in gff3 faa fna ; do
40
+ [[ -e "$DATASET.$ext" ]] && gzip -9 -f "$DATASET.$ext"
41
+ done
40
42
 
41
43
  # Finalize
42
44
  miga date > "$DATASET.done"
@@ -9,6 +9,7 @@ cd "$PROJECT/data/07.annotation/01.function/01.essential"
9
9
  # Initialize
10
10
  miga date > "${DATASET}.start"
11
11
  FAA="../../../06.cds/${DATASET}.faa"
12
+ [[ -s "$FAA" ]] || FAA="${FAA}.gz"
12
13
 
13
14
  # Check if there are any proteins
14
15
  if [[ ! -s $FAA ]] ; then
@@ -12,7 +12,7 @@ cd "$DIR"
12
12
  miga date > "$DATASET.start"
13
13
 
14
14
  # Calculate statistics
15
- for i in raw_reads trimmed_fasta assembly cds essential_genes distances ; do
15
+ for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
16
16
  echo "# $i"
17
17
  miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
18
18
  done
@@ -13,9 +13,11 @@ miga date > "$DATASET.start"
13
13
 
14
14
  # Gunzip (if necessary)
15
15
  for sis in 1 2 ; do
16
- [[ -e "../02.trimmed_reads/$b.$sis.clipped.fastq.gz" \
17
- && ! -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
18
- && gunzip "../02.trimmed_reads/$b.$sis.clipped.fastq.gz"
16
+ for ext in clipped clipped.single ; do
17
+ [[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
18
+ && ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
19
+ && gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
20
+ done
19
21
  done
20
22
  miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
21
23
 
@@ -23,6 +23,9 @@ class MiGA::DistanceRunner
23
23
  @dataset = project.dataset(dataset_name)
24
24
  @home = File.expand_path('data/09.distances', project.path)
25
25
  # Default opts
26
+ if project.metadata[:aai_save_rbm] == false
27
+ @opts[:aai_save_rbm] ||= 'no-save-rbm'
28
+ end
26
29
  @opts[:aai_save_rbm] ||= ENV.fetch('MIGA_AAI_SAVE_RBM') do
27
30
  project.is_clade? ? 'save-rbm' : 'no-save-rbm'
28
31
  end
@@ -1,5 +1,6 @@
1
1
 
2
2
  require 'tmpdir'
3
+ require 'zlib'
3
4
 
4
5
  module MiGA::DistanceRunner::Temporal
5
6
 
@@ -9,7 +10,15 @@ module MiGA::DistanceRunner::Temporal
9
10
  rf.each do |res, file|
10
11
  r = dataset.result(res)
11
12
  f = r.nil? ? nil : r.file_path(file)
12
- FileUtils.cp(f, tmp_file("#{file}.fa")) unless f.nil?
13
+ unless f.nil?
14
+ if f =~ /\.gz/
15
+ File.open(tmp_file("#{file}.fa"), 'w') do |ofh|
16
+ Zlib::GzipReader.open(f) { |ifh| ofh.print ifh.read }
17
+ end
18
+ else
19
+ FileUtils.cp(f, tmp_file("#{file}.fa"))
20
+ end
21
+ end
13
22
  end
14
23
  end
15
24
 
@@ -610,6 +610,11 @@
610
610
  "opt": "--defline",
611
611
  "description": "Keep the original defline after a space."
612
612
  },
613
+ {
614
+ "opt": "--list",
615
+ "arg": "in_file",
616
+ "description": "Reads a list of IDS."
617
+ },
613
618
  {
614
619
  "opt": "--quiet",
615
620
  "description": "Run quietly (no STDERR output)."
@@ -388,6 +388,13 @@
388
388
  "arg": "out_file",
389
389
  "description": "Output file containing the aligned proteins."
390
390
  },
391
+ {
392
+ "opt": "--components",
393
+ "arg": "out_file",
394
+ "description": ["Output file containing the components of the",
395
+ "estimation. Tab-delimited file with model name, matches, and",
396
+ "columns."]
397
+ },
391
398
  {
392
399
  "opt": "--quiet",
393
400
  "description": "Run quietly (no STDERR output)."
@@ -2,46 +2,46 @@
2
2
 
3
3
  #
4
4
  # @author: Luis M. Rodriguez-R
5
- # @update: Feb-06-2015
6
5
  # @license: artistic license 2.0
7
6
  #
8
7
 
9
8
  require 'optparse'
10
9
 
11
- o = {:subject=>FALSE, :quiet=>FALSE}
12
- ARGV << '-h' if ARGV.size==0
10
+ o = { sbj: false, q: false }
11
+ ARGV << '-h' if ARGV.size == 0
13
12
  OptionParser.new do |opts|
14
- opts.banner = "
15
- Appends an extra column to a BLAST with the length of the query or the subject sequence.
16
- You can pipe two instances to add both:
17
- cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
13
+ opts.banner = "
14
+ Appends an extra column to a BLAST with the length of the query or the subject
15
+ sequence. You can pipe two instances to add both:
16
+ cat input.blast | #{$0} -f queries.fa | #{$0} -f subjects.fa -s > output.blast
18
17
 
19
18
  Usage: #{$0} [options] < input.blast > output.blast"
20
- opts.separator ""
21
- opts.separator "Mandatory"
22
- opts.on("-f", "--fasta FILE", "Path to the FastA file"){ |v| o[:fasta] = v }
23
- opts.separator ""
24
- opts.separator "Options"
25
- opts.on("-s", "--subject",
26
- "Use the subject column of the BLAST, by default the query column is used"){ o[:subject] = TRUE }
27
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:quiet] = TRUE }
28
- opts.on("-h", "--help", "Display this screen") do
29
- puts opts
30
- exit
31
- end
32
- opts.separator ""
19
+ opts.separator ''
20
+ opts.separator 'Mandatory'
21
+ opts.on('-f', '--fasta FILE', 'Path to the FastA file'){ |v| o[:fasta] = v }
22
+ opts.separator ''
23
+ opts.separator 'Options'
24
+ opts.on('-s', '--subject',
25
+ 'Use the subject column of the BLAST, by default the query column is used'
26
+ ){ o[:sbj] = true }
27
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
28
+ opts.on('-h', '--help', 'Display this screen') do
29
+ puts opts
30
+ exit
31
+ end
32
+ opts.separator ''
33
33
  end.parse!
34
- abort "-f is mandatory" if o[:fasta].nil?
34
+ abort '-f is mandatory' if o[:fasta].nil?
35
35
 
36
36
  len = {}
37
- id = ""
38
- $stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:quiet]
39
- fh = File.open(o[:fasta], "r")
37
+ id = ''
38
+ $stderr.puts "Reading FastA file: #{o[:fasta]}" unless o[:q]
39
+ fh = File.open(o[:fasta], 'r')
40
40
  fh.each_line do |ln|
41
41
  defline = /^>(\S+)/.match(ln)
42
42
  if defline.nil?
43
43
  ln.gsub! /[^A-Za-z]/, ''
44
- abort "Error: Unsupported format, expecting FastA" if len[id].nil?
44
+ abort 'Error: Unsupported format, expecting FastA' if len[id].nil?
45
45
  len[id] = len[id] + ln.size
46
46
  else
47
47
  id = defline[1]
@@ -50,12 +50,14 @@ fh.each_line do |ln|
50
50
  end
51
51
  fh.close
52
52
 
53
- $stderr.puts "Appending #{o[:subject]?"subject":"query"} length column" unless o[:quiet]
53
+ unless o[:q]
54
+ $stderr.puts 'Appending %s length column' % (o[:sbj] ? 'subject' : 'query')
55
+ end
54
56
  ARGF.each_line do |ln|
55
- ln.chomp!
56
- row = ln.split /\t/
57
- id = o[:subject] ? row[1] : row[0];
58
- abort "Impossible to find sequence of #{id}" if len[id].nil?
59
- puts "#{ln}\t#{len[id]}"
57
+ ln.chomp!
58
+ row = ln.split /\t/
59
+ id = o[:sbj] ? row[1] : row[0]
60
+ abort "Impossible to find sequence of #{id}" if len[id].nil?
61
+ puts "#{ln}\t#{len[id]}"
60
62
  end
61
63
 
@@ -1,64 +1,65 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
3
  # @author Luis M. Rodriguez-R
5
- # @update Oct-07-2015
6
4
  # @license artistic license 2.0
7
- #
8
5
 
9
6
  require 'optparse'
10
7
 
11
- o = {:q=>FALSE, :p=>"", :s=>"", :d=>FALSE}
8
+ o = {q: false, p: '', s: '', d: false}
12
9
  ARGV << '-h' if ARGV.size==0
13
10
  OptionParser.new do |opts|
14
- opts.banner = "
11
+ opts.banner = "
15
12
  Generates easy-to-parse tagged reads from FastA files.
16
13
 
17
14
  Usage: #{$0} [options]"
18
- opts.separator ""
19
- opts.separator "Mandatory"
20
- opts.on("-i", "--in FILE",
21
- "Path to the FastA file containing the sequences."){ |v| o[:in] = v }
22
- opts.on("-o", "--out FILE",
23
- "Path to the FastA to create."){ |v| o[:out] = v }
24
- opts.separator ""
25
- opts.separator "ID options"
26
- opts.on("-p", "--prefix STR", "Prefix to use in all IDs."){ |v| o[:p] = v }
27
- opts.on("-s", "--suffix STR", "Suffix to use in all IDs."){ |v| o[:s] = v }
28
- opts.on("-d", "--defline",
29
- "Keep the original defline after a space."){ o[:d]=TRUE }
30
- opts.separator ""
31
- opts.separator "Other Options"
32
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
33
- opts.on("-h", "--help", "Display this screen") do
34
- puts opts
35
- exit
36
- end
37
- opts.separator ""
15
+ opts.separator ''
16
+ opts.separator 'Mandatory'
17
+ opts.on('-i', '--in FILE',
18
+ 'Path to the FastA file containing the sequences.'){ |v| o[:in] = v }
19
+ opts.on('-o', '--out FILE',
20
+ 'Path to the FastA to create.'){ |v| o[:out] = v }
21
+ opts.separator ''
22
+ opts.separator 'ID options'
23
+ opts.on('-p', '--prefix STR', 'Prefix to use in all IDs.'){ |v| o[:p] = v }
24
+ opts.on('-s', '--suffix STR', 'Suffix to use in all IDs.'){ |v| o[:s] = v }
25
+ opts.on('-d', '--defline',
26
+ 'Keep the original defline after a space.'){ o[:d] = true }
27
+ opts.on('-l', '--list FILE',
28
+ 'Reads a list of IDS.'){ |v| o[:l] = v }
29
+ opts.separator ''
30
+ opts.separator 'Other Options'
31
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
32
+ opts.on('-h', '--help', 'Display this screen') do
33
+ puts opts
34
+ exit
35
+ end
36
+ opts.separator ''
38
37
  end.parse!
39
- abort "-i is mandatory" if o[:in].nil?
40
- abort "-o is mandatory" if o[:out].nil?
38
+ abort '-i is mandatory' if o[:in].nil?
39
+ abort '-o is mandatory' if o[:out].nil?
41
40
 
42
41
  begin
43
- ifh = File.open(o[:in], 'r');
44
- ofh = File.open(o[:out], 'w');
45
- i=0
46
- while ln=ifh.gets
42
+ list = o[:l].nil? ? nil :
43
+ File.readlines(o[:l]).map{ |i| i.chomp.gsub(/^>/, '') }
44
+ ofh = File.open(o[:out], 'w')
45
+ i = 0
46
+ File.open(o[:in], 'r') do |ifh|
47
+ ifh.each do |ln|
47
48
  ln.chomp!
48
49
  next if ln =~ /^;/
49
50
  unless /^>/.match(ln).nil?
50
- i+=1
51
- ofh.puts ">#{o[:p]}#{i}#{o[:s]}#{ o[:d]?" #{ln[1, ln.size-1]}":"" }"
51
+ i += 1
52
+ new_id = o[:l].nil? ? i : list.shift
53
+ ofh.puts ">#{o[:p]}#{new_id}#{o[:s]}#{o[:d]?" #{ln[1, ln.size-1]}":''}"
52
54
  else
53
- ofh.puts ln
55
+ ofh.puts ln
54
56
  end
55
- end
56
- ifh.close
57
- ofh.close
57
+ end
58
+ end
59
+ ofh.close
58
60
  rescue => err
59
- $stderr.puts "Exception: #{err}\n\n"
60
- err.backtrace.each { |l| $stderr.puts l + "\n" }
61
- err
61
+ $stderr.puts "Exception: #{err}\n\n"
62
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
63
+ err
62
64
  end
63
65
 
64
-
@@ -8,8 +8,10 @@ require 'enveomics_rb/enveomics'
8
8
  use 'tmpdir'
9
9
  use 'zlib'
10
10
 
11
- o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
12
- archaea: false, genomeeq: false, metagenome: false, list: false}
11
+ o = {
12
+ bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
13
+ archaea: false, genomeeq: false, metagenome: false, list: false
14
+ }
13
15
  OptionParser.new do |opts|
14
16
  opts.banner = "
15
17
  Finds and extracts a collection of essential proteins suitable for genome
@@ -26,65 +28,86 @@ Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
26
28
  Usage: #{$0} [options]"
27
29
  opts.separator ''
28
30
  opts.separator 'Mandatory'
29
- opts.on('-i', '--in FILE',
30
- 'Path to the FastA file containing all the proteins in a genome.'
31
- ){ |v| o[:in] = v }
31
+ opts.on(
32
+ '-i', '--in FILE',
33
+ 'Path to the FastA file (.gz allowed) with all the proteins in a genome'
34
+ ) { |v| o[:in] = v }
32
35
  opts.separator ''
33
36
  opts.separator 'Report Options'
34
- opts.on('-o', '--out FILE',
35
- 'Path to the output FastA file with the translated essential genes.',
36
- 'By default the file is not produced.'){ |v| o[:out] = v }
37
- opts.on('-m', '--per-model STR',
37
+ opts.on(
38
+ '-o', '--out FILE',
39
+ 'Path to the output FastA file with the translated essential genes',
40
+ 'By default the file is not produced'
41
+ ) { |v| o[:out] = v }
42
+ opts.on(
43
+ '-m', '--per-model STR',
38
44
  'Prefix of translated genes in independent files with the name of the',
39
- 'model appended. By default files are not produced.'
40
- ){ |v| o[:permodel] = v }
41
- opts.on('-R', '--report FILE',
42
- 'Path to the report file. By default, the report is sent to the STDOUT.'
43
- ){ |v| o[:report] = v }
44
- opts.on('--hmm-out FILE',
45
- 'Save HMMsearch output in this file. By default, not saved.'
46
- ){ |v| o[:hmmout] = v }
47
- opts.on('--alignments FILE',
45
+ 'model appended. By default files are not produced'
46
+ ) { |v| o[:permodel] = v }
47
+ opts.on(
48
+ '-R', '--report FILE',
49
+ 'Path to the report file. By default, the report is sent to the STDOUT'
50
+ ) { |v| o[:report] = v }
51
+ opts.on(
52
+ '--hmm-out FILE',
53
+ 'Save HMMsearch output in this file. By default, not saved'
54
+ ) { |v| o[:hmmout] = v }
55
+ opts.on(
56
+ '--alignments FILE',
48
57
  'Save the aligned proteins in this file. By default, not saved'
49
- ){ |v| o[:alignments] = v }
50
- opts.on('-B', '--bacteria',
51
- 'If set, ignores models typically missing in Bacteria.'
52
- ){ |v| o[:bacteria] = v }
53
- opts.on('-A', '--archaea',
54
- 'If set, ignores models typically missing in Archaea.'
55
- ){ |v| o[:archaea] = v }
56
- opts.on('-G', '--genome-eq',
57
- 'If set, ignores models not suitable for genome-equivalents estimations.',
58
- 'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
59
- ){ |v| o[:genomeeq] = v }
60
- opts.on('-r', '--rename STR',
58
+ ) { |v| o[:alignments] = v }
59
+ opts.on(
60
+ '-B', '--bacteria',
61
+ 'If set, ignores models typically missing in Bacteria'
62
+ ) { |v| o[:bacteria] = v }
63
+ opts.on(
64
+ '-A', '--archaea',
65
+ 'If set, ignores models typically missing in Archaea'
66
+ ) { |v| o[:archaea] = v }
67
+ opts.on(
68
+ '-G', '--genome-eq',
69
+ 'If set, ignores models not suitable for genome-equivalents estimations',
70
+ 'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940'
71
+ ) { |v| o[:genomeeq] = v }
72
+ opts.on(
73
+ '-r', '--rename STR',
61
74
  'If set, renames the sequences with the string provided and appends it',
62
- 'with pipe and the gene name (except in --per-model files).'
63
- ){ |v| o[:rename]=v }
64
- opts.on('-n', '--no-stats',
65
- 'If set, no statistics are reported on genome evaluation.'
66
- ){ |v| o[:stats] = v }
67
- opts.on('-s', '--no-genes',
68
- 'If set, statistics won\'t include the lists of missing/multi-copy genes.'
69
- ){ |v| o[:genes] = v }
70
- opts.on('-M', '--metagenome',
75
+ 'with pipe and the gene name (except in --per-model files)'
76
+ ) { |v| o[:rename] = v }
77
+ opts.on(
78
+ '-n', '--no-stats',
79
+ 'If set, no statistics are reported on genome evaluation'
80
+ ) { |v| o[:stats] = v }
81
+ opts.on(
82
+ '-s', '--no-genes',
83
+ 'If set, statistics won\'t include the lists of missing/multi-copy genes'
84
+ ) { |v| o[:genes] = v }
85
+ opts.on(
86
+ '-M', '--metagenome',
71
87
  'If set, it allows for multiple copies of each gene and turns on',
72
- 'metagenomic report mode.'){ |v| o[:metagenome] = v }
88
+ 'metagenomic report mode'
89
+ ) { |v| o[:metagenome] = v }
73
90
  opts.separator ''
74
91
  opts.separator 'Other Options'
75
- opts.on('-L', '--list-models',
92
+ opts.on(
93
+ '-L', '--list-models',
76
94
  'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
77
- 'and -q; ignores all other parameters.'){ |v| o[:list] = v }
78
- opts.on('-b', '--bin DIR',
79
- 'Path to the directory containing the binaries of HMMer 3.0+.'
80
- ){ |v| o[:bin] = v }
81
- opts.on('--model-file',
82
- 'External file containing models to search.'){ |v| o[:model_file] = v }
83
- opts.on('-t', '--threads INT',
84
- "Number of parallel threads to be used. By default: #{o[:thr]}."
85
- ){ |v| o[:thr] = v.to_i }
86
- opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
87
- opts.on('-h', '--help', 'Display this screen.') do
95
+ 'and -q; ignores all other parameters'
96
+ ) { |v| o[:list] = v }
97
+ opts.on(
98
+ '-b', '--bin DIR',
99
+ 'Path to the directory containing the binaries of HMMer 3.0+'
100
+ ) { |v| o[:bin] = v }
101
+ opts.on(
102
+ '--model-file',
103
+ 'External file containing models to search'
104
+ ) { |v| o[:model_file] = v }
105
+ opts.on(
106
+ '-t', '--threads INT', Integer,
107
+ "Number of parallel threads to be used. By default: #{o[:thr]}"
108
+ ) { |v| o[:thr] = v }
109
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)'){ o[:q] = true }
110
+ opts.on('-h', '--help', 'Display this screen') do
88
111
  puts opts
89
112
  exit
90
113
  end
@@ -112,6 +135,13 @@ TIGR00389 TIGR00436 tRNA-synth_1d}
112
135
  begin
113
136
  Dir.mktmpdir do |dir|
114
137
  $stderr.puts "Temporal directory: #{dir}." unless o[:q]
138
+ if o[:in] =~ /\.gz/
139
+ tmp_in = File.expand_path('sequences.fa', dir)
140
+ Zlib::GzipReader.open(o[:in]) do |ifh|
141
+ File.open(tmp_in, 'w') { |ofh| ofh.print ifh.read }
142
+ end
143
+ o[:in] = tmp_in
144
+ end
115
145
 
116
146
  # Create database.
117
147
  $stderr.puts 'Searching models.' unless o[:q]
@@ -144,9 +174,9 @@ begin
144
174
  'This script requires HMMER 3.0+.'
145
175
  end
146
176
  o[:hmmout] ||= "#{dir}/hmmsearch"
147
- `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
148
- -A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
149
- > #{dir}/hmmsearch.log`
177
+ `'#{o[:bin]}hmmsearch' --cpu #{o[:thr]} --tblout '#{o[:hmmout]}' \
178
+ -A '#{dir}/a.sto' --cut_tc --notextw '#{dir}/essential.hmm' '#{o[:in]}' \
179
+ > '#{dir}/hmmsearch.log'`
150
180
 
151
181
  # Parse output
152
182
  $stderr.puts 'Parsing results.' unless o[:q]