miga-base 0.3.11.2 → 0.3.12.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 71f390ca4ceb03f4d0dfeac23f55939aaf91d135bbd22129102468d8e25095e0
4
- data.tar.gz: e139fb696e76345da577f4e8fc4cd4b0bf11efaee23423e5693b5f46b1ee06b6
3
+ metadata.gz: 48d903a383d237f7b236d8ad1706a5fb017b31d320768353a1bc33846ea0d471
4
+ data.tar.gz: 9b448f00992aa4152df34ded6a48105afe5c1daf1e994737f253b519f81c998f
5
5
  SHA512:
6
- metadata.gz: 3cc2ec5d43cc613ab69debed6e623d6793190dae0e6e12954af5d1edf58ced8c2b87fdaa55b9a47b428ed3ed833cf19418875c40b1b8efc9ea84167c94b31fd9
7
- data.tar.gz: 5ec76e8f90c73b274b2d4b67ed3b09bd085126940e2c7a5d0941a7be390a0c79c739956690e31902a70a7b1b148f05d08dc665ec555c5c079dd0ba2655736b98
6
+ metadata.gz: 1579eecdab3c38bda21678baa4c903c85dfbb07e19f993212d89f6b62e1561d4044b41cfc98d80e43780d1fd172ef607a92d5320ff570de87dafda03520f2d59
7
+ data.tar.gz: d083fc8ae10735f647681d3924c8990951fc2a6ba7d501c10b89d32b0f00088a5128dfca2b24df118b2f8a56e3a5853d210933e300d7ab7bd3bd14c4bab90b8a
@@ -226,8 +226,8 @@ module MiGA::Dataset::Result
226
226
  def add_result_essential_genes(base, _opts)
227
227
  return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
228
228
  r = MiGA::Result.new("#{base}.json")
229
- add_files_to_ds_result(r, name, ess_genes: ".ess.faa",
230
- collection: ".ess", report: ".ess/log")
229
+ add_files_to_ds_result(r, name, ess_genes: '.ess.faa',
230
+ collection: '.ess', report: '.ess/log', alignments: '.ess/proteins.aln')
231
231
  end
232
232
 
233
233
  ##
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 11, 2]
13
+ VERSION = [0.3, 12, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2019, 04, 20)
21
+ VERSION_DATE = Date.new(2019, 04, 26)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
@@ -7,31 +7,31 @@ SCRIPT="essential_genes"
7
7
  cd "$PROJECT/data/07.annotation/01.function/01.essential"
8
8
 
9
9
  # Initialize
10
- miga date > "$DATASET.start"
11
- FAA="../../../06.cds/$DATASET.faa"
10
+ miga date > "${DATASET}.start"
11
+ FAA="../../../06.cds/${DATASET}.faa"
12
12
 
13
13
  # Check if there are any proteins
14
14
  if [[ ! -s $FAA ]] ; then
15
15
  echo Empty protein set, bypassing essential genes
16
- rm "$DATASET.start"
17
- miga create_dataset -P "$PROJECT" -D "$DATASET" \
18
- -m run_essential_genes=false --update
16
+ rm "${DATASET}.start"
17
+ miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
19
18
  exit 0
20
19
  fi
21
20
 
22
21
  # Find and extract essential genes
23
- [[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
24
- mkdir "$DATASET.ess"
22
+ [[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
23
+ mkdir "${DATASET}.ess"
25
24
  TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
26
25
  --metadata "type" | awk '{print $2}')
27
26
  if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
28
- HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
29
- -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
30
- > "$DATASET.ess/log"
27
+ HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
28
+ -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
29
+ > "${DATASET}.ess/log"
31
30
  else
32
- HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
33
- -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
34
- > "$DATASET.ess/log"
31
+ HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
32
+ -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
33
+ --alignments "${DATASET}.ess/proteins.aln" \
34
+ > "${DATASET}.ess/log"
35
35
  fi
36
36
 
37
37
  # Reduce files
@@ -42,5 +42,5 @@ if exists "$DATASET".ess/*.faa ; then
42
42
  fi
43
43
 
44
44
  # Finalize
45
- miga date > "$DATASET.done"
45
+ miga date > "${DATASET}.done"
46
46
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
data/test/daemon_test.rb CHANGED
@@ -37,6 +37,7 @@ class DaemonTest < Test::Unit::TestCase
37
37
  File.expand_path("data/01.raw_reads/ds1.1.fastq", p.path))
38
38
  FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
39
39
  File.expand_path("data/01.raw_reads/ds1.done", p.path))
40
+ ds.first_preprocessing(true)
40
41
  out = capture_stdout do
41
42
  d.check_datasets
42
43
  end
@@ -401,6 +401,19 @@
401
401
  "description": ["Path to the report file. By default, the report is",
402
402
  "sent to the STDOUT."]
403
403
  },
404
+ {
405
+ "name": "HMMsearch output",
406
+ "opt": "--hmm-out",
407
+ "arg": "out_file",
408
+ "description": ["Save HMMsearch output in this file. By default,",
409
+ "not saved."]
410
+ },
411
+ {
412
+ "opt": "--alignments",
413
+ "opt": "out_file",
414
+ "description": ["Save the aligned proteins in this file. By default,",
415
+ "not saved."]
416
+ },
404
417
  {
405
418
  "opt": "--bacteria",
406
419
  "description": "If set, ignores models typically missing in Bacteria."
@@ -189,7 +189,7 @@
189
189
  "description": ["Calculates the Average Nucleotide Identity between two",
190
190
  "genomes."],
191
191
  "help_arg": "--help",
192
- "see_also": ["aai.rb","rbm.rb"],
192
+ "see_also": ["aai.rb","rbm.rb","HMM.essential.rb"],
193
193
  "cite": [
194
194
  ["Konstantinidis & Tiedje, 2005, PNAS",
195
195
  "http://dx.doi.org/10.1073%2Fpnas.0409727102"],
@@ -362,6 +362,38 @@
362
362
  }
363
363
  ]
364
364
  },
365
+ {
366
+ "task": "HMM.haai.rb",
367
+ "description": ["Estimates Average Amino Acid Identity (AAI) from the",
368
+ "essential genes extracted and aligned by HMM.essential.rb (see",
369
+ "Alignments)."],
370
+ "help_arg": "--help",
371
+ "see_also": ["HMM.essential.rb","aai.rb"],
372
+ "options": [
373
+ {
374
+ "name": "Alignments 1",
375
+ "opt": "-1",
376
+ "arg": "in_file",
377
+ "description": "Input alignments file for genome 1."
378
+ },
379
+ {
380
+ "name": "Alignments 2",
381
+ "opt": "-2",
382
+ "arg": "in_file",
383
+ "description": "Input alignments file for genome 2."
384
+ },
385
+ {
386
+ "name": "Alignment output",
387
+ "opt": "--aln-out",
388
+ "arg": "out_file",
389
+ "description": "Output file containing the aligned proteins."
390
+ },
391
+ {
392
+ "opt": "--quiet",
393
+ "description": "Run quietly (no STDERR output)."
394
+ }
395
+ ]
396
+ },
365
397
  {
366
398
  "task": "rbm.rb",
367
399
  "description": ["Finds the reciprocal best matches between two sets of",
@@ -29,6 +29,7 @@
29
29
  "Execution": [
30
30
  "aai.rb",
31
31
  "ani.rb",
32
+ "HMM.haai.rb",
32
33
  "rbm.rb"
33
34
  ]
34
35
  },
@@ -101,6 +102,7 @@
101
102
  ],
102
103
  "Search": [
103
104
  "HMM.essential.rb",
105
+ "HMM.haai.rb",
104
106
  "HMMsearch.extractIds.rb",
105
107
  "ogs.annotate.rb",
106
108
  "ogs.core-pan.rb",
@@ -64,15 +64,15 @@
64
64
  "task": "HMM.essential.rb",
65
65
  "description": ["Typical single-copy bacterial genes present in",
66
66
  "Mycoplasma genitalium."],
67
- "values": ["Mgen_M2288.faa",null,null,null,true,null,null,null,null,null,
68
- null,null,null,null,null,null]
67
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
68
+ null,null,null,null,null,null,null,null]
69
69
  },
70
70
  {
71
71
  "task": "HMM.essential.rb",
72
72
  "description": ["Typical single-copy archaeal genes present in",
73
73
  "Nanoarchaeum equitans."],
74
- "values": ["Mgen_M2288.faa",null,null,null,null,true,null,null,null,null,
75
- null,null,null,null,null,null]
74
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
75
+ null,null,null,null,null,null,null,null]
76
76
  },
77
77
  {
78
78
  "task": "Newick.autoprune.R",
@@ -1,20 +1,17 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
3
  # @author Luis M. Rodriguez-R
5
4
  # @license artistic license 2.0
6
- # @update Mar-23-2016
7
- #
8
5
 
9
- $:.push File.expand_path("../lib", __FILE__)
10
- require "enveomics_rb/enveomics"
11
- use "tmpdir"
12
- use "zlib"
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
8
+ use 'tmpdir'
9
+ use 'zlib'
13
10
 
14
- o = {bin:"", thr:2, q:false, stats:true, genes:true, bacteria:false,
15
- archaea:false, genomeeq:false, metagenome:false, list:false}
11
+ o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
12
+ archaea: false, genomeeq: false, metagenome: false, list: false}
16
13
  OptionParser.new do |opts|
17
- opts.banner = "
14
+ opts.banner = "
18
15
  Finds and extracts a collection of essential proteins suitable for genome
19
16
  completeness evaluation and phylogenetic analyses. Important note: most complete
20
17
  bacterial genomes contain only 106/111 genes in this collection, therefore
@@ -27,68 +24,74 @@ completeness (e.g., Nanoarchaeum equitans returns 88.5%).
27
24
  Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
28
25
 
29
26
  Usage: #{$0} [options]"
30
- opts.separator ""
31
- opts.separator "Mandatory"
32
- opts.on("-i", "--in FILE",
33
- "Path to the FastA file containing all the proteins in a genome."
34
- ){ |v| o[:in] = v }
35
- opts.separator ""
36
- opts.separator "Report Options"
37
- opts.on("-o", "--out FILE",
38
- "Path to the output FastA file with the translated essential genes.",
39
- "By default the file is not produced."){ |v| o[:out] = v }
40
- opts.on("-m", "--per-model STR",
41
- "Prefix of translated genes in independent files with the name of the",
42
- "model appended. By default files are not produced."
43
- ){ |v| o[:permodel] = v }
44
- opts.on("-R", "--report FILE",
45
- "Path to the report file. By default, the report is sent to the STDOUT."
46
- ){ |v| o[:report] = v }
47
- opts.on("-B", "--bacteria",
48
- "If set, ignores models typically missing in Bacteria."
49
- ){ |v| o[:bacteria] = v }
50
- opts.on("-A", "--archaea",
51
- "If set, ignores models typically missing in Archaea."
52
- ){ |v| o[:archaea] = v }
53
- opts.on("-G", "--genome-eq",
54
- "If set, ignores models not suitable for genome-equivalents estimations.",
55
- "See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940."
56
- ){ |v| o[:genomeeq] = v }
57
- opts.on("-r", "--rename STR",
58
- "If set, renames the sequences with the string provided and appends it",
59
- "with pipe and the gene name (except in --per-model files)."
60
- ){ |v| o[:rename]=v }
61
- opts.on("-n", "--no-stats",
62
- "If set, no statistics are reported on genome evaluation."
63
- ){ |v| o[:stats] = v }
64
- opts.on("-s", "--no-genes",
65
- "If set, statistics won't include the lists of missing/multi-copy genes."
66
- ){ |v| o[:genes] = v }
67
- opts.on("-M", "--metagenome",
68
- "If set, it allows for multiple copies of each gene and turns on",
69
- "metagenomic report mode."){ |v| o[:metagenome] = v }
70
- opts.separator ""
71
- opts.separator "Other Options"
72
- opts.on("-L", "--list-models",
73
- "If set, it only lists the models and exits. Compatible with -A, -B, -G,",
74
- "and -q; ignores all other parameters."){ |v| o[:list] = v }
75
- opts.on("-b", "--bin DIR",
76
- "Path to the directory containing the binaries of HMMer 3.0+."
77
- ){ |v| o[:bin] = v }
78
- opts.on("--model-file",
79
- "External file containing models to search."){ |v| o[:model_file] = v }
80
- opts.on("-t", "--threads INT",
81
- "Number of parallel threads to be used. By default: #{o[:thr]}."
82
- ){ |v| o[:thr] = v.to_i }
83
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
84
- opts.on("-h", "--help", "Display this screen.") do
85
- puts opts
86
- exit
87
- end
88
- opts.separator ""
27
+ opts.separator ''
28
+ opts.separator 'Mandatory'
29
+ opts.on('-i', '--in FILE',
30
+ 'Path to the FastA file containing all the proteins in a genome.'
31
+ ){ |v| o[:in] = v }
32
+ opts.separator ''
33
+ opts.separator 'Report Options'
34
+ opts.on('-o', '--out FILE',
35
+ 'Path to the output FastA file with the translated essential genes.',
36
+ 'By default the file is not produced.'){ |v| o[:out] = v }
37
+ opts.on('-m', '--per-model STR',
38
+ 'Prefix of translated genes in independent files with the name of the',
39
+ 'model appended. By default files are not produced.'
40
+ ){ |v| o[:permodel] = v }
41
+ opts.on('-R', '--report FILE',
42
+ 'Path to the report file. By default, the report is sent to the STDOUT.'
43
+ ){ |v| o[:report] = v }
44
+ opts.on('--hmm-out FILE',
45
+ 'Save HMMsearch output in this file. By default, not saved.'
46
+ ){ |v| o[:hmmout] = v }
47
+ opts.on('--alignments FILE',
48
+ 'Save the aligned proteins in this file. By default, not saved'
49
+ ){ |v| o[:alignments] = v }
50
+ opts.on('-B', '--bacteria',
51
+ 'If set, ignores models typically missing in Bacteria.'
52
+ ){ |v| o[:bacteria] = v }
53
+ opts.on('-A', '--archaea',
54
+ 'If set, ignores models typically missing in Archaea.'
55
+ ){ |v| o[:archaea] = v }
56
+ opts.on('-G', '--genome-eq',
57
+ 'If set, ignores models not suitable for genome-equivalents estimations.',
58
+ 'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
59
+ ){ |v| o[:genomeeq] = v }
60
+ opts.on('-r', '--rename STR',
61
+ 'If set, renames the sequences with the string provided and appends it',
62
+ 'with pipe and the gene name (except in --per-model files).'
63
+ ){ |v| o[:rename]=v }
64
+ opts.on('-n', '--no-stats',
65
+ 'If set, no statistics are reported on genome evaluation.'
66
+ ){ |v| o[:stats] = v }
67
+ opts.on('-s', '--no-genes',
68
+ 'If set, statistics won\'t include the lists of missing/multi-copy genes.'
69
+ ){ |v| o[:genes] = v }
70
+ opts.on('-M', '--metagenome',
71
+ 'If set, it allows for multiple copies of each gene and turns on',
72
+ 'metagenomic report mode.'){ |v| o[:metagenome] = v }
73
+ opts.separator ''
74
+ opts.separator 'Other Options'
75
+ opts.on('-L', '--list-models',
76
+ 'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
77
+ 'and -q; ignores all other parameters.'){ |v| o[:list] = v }
78
+ opts.on('-b', '--bin DIR',
79
+ 'Path to the directory containing the binaries of HMMer 3.0+.'
80
+ ){ |v| o[:bin] = v }
81
+ opts.on('--model-file',
82
+ 'External file containing models to search.'){ |v| o[:model_file] = v }
83
+ opts.on('-t', '--threads INT',
84
+ "Number of parallel threads to be used. By default: #{o[:thr]}."
85
+ ){ |v| o[:thr] = v.to_i }
86
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
87
+ opts.on('-h', '--help', 'Display this screen.') do
88
+ puts opts
89
+ exit
90
+ end
91
+ opts.separator ''
89
92
  end.parse!
90
- abort "-i is mandatory" if o[:in].nil? and not o[:list]
91
- o[:bin] = o[:bin]+"/" if o[:bin].size > 0
93
+ abort '-i is mandatory' if o[:in].nil? and not o[:list]
94
+ o[:bin] = o[:bin] + '/' if o[:bin].size > 0
92
95
  o[:rename] = nil if o[:metagenome]
93
96
 
94
97
  not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
@@ -107,148 +110,175 @@ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
107
110
  TIGR00389 TIGR00436 tRNA-synth_1d}
108
111
 
109
112
  begin
110
- Dir.mktmpdir do |dir|
111
- $stderr.puts "Temporal directory: #{dir}." unless o[:q]
113
+ Dir.mktmpdir do |dir|
114
+ $stderr.puts "Temporal directory: #{dir}." unless o[:q]
115
+
116
+ # Create database.
117
+ $stderr.puts 'Searching models.' unless o[:q]
118
+ models = {}
119
+ model_id = nil
120
+ dbh = File.open("#{dir}/essential.hmm", 'w')
121
+ o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
122
+ mfh = (File.extname(o[:model_file]) == '.gz') ?
123
+ Zlib::GzipReader.open(o[:model_file]) :
124
+ File.open(o[:model_file], 'r')
125
+ while ln = mfh.gets
126
+ dbh.print ln
127
+ ln.chomp!
128
+ model_id = $1 if ln =~ /^NAME\s+(.+)/
129
+ models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
130
+ end
131
+ dbh.close
132
+ mfh.close
133
+ models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
134
+ models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
135
+ models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
136
+ if o[:list]
137
+ models.each_pair{ |id,desc| puts [id,desc].join("\t") }
138
+ exit
139
+ end
140
+
141
+ # Check HMMer version and run HMMsearch.
142
+ if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
143
+ raise 'You have provided an unsupported version of HMMER. ' +
144
+ 'This script requires HMMER 3.0+.'
145
+ end
146
+ o[:hmmout] ||= "#{dir}/hmmsearch"
147
+ `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
148
+ -A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
149
+ > #{dir}/hmmsearch.log`
112
150
 
113
- # Create database.
114
- $stderr.puts "Searching models." unless o[:q]
115
- models = {}
116
- model_id = nil
117
- dbh = File.open("#{dir}/essential.hmm", "w")
118
- o[:model_file] ||= File.expand_path("../lib/data/essential.hmm.gz",
119
- __FILE__)
120
- mfh = (File.extname(o[:model_file])==".gz") ?
121
- Zlib::GzipReader.open(o[:model_file]) :
122
- File.open(o[:model_file],"r")
123
- while ln = mfh.gets
124
- dbh.print ln
125
- ln.chomp!
126
- model_id = $1 if ln =~ /^NAME\s+(.+)/
127
- models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
151
+ # Parse output
152
+ $stderr.puts 'Parsing results.' unless o[:q]
153
+ trash = []
154
+ genes = {}
155
+ File.open(o[:hmmout], 'r') do |resh|
156
+ while ln = resh.gets
157
+ next if ln =~ /^#/
158
+ r = ln.split /\s+/
159
+ next unless models.include? r[2]
160
+ if o[:metagenome]
161
+ genes[ r[2] ] = [] if genes[ r[2] ].nil?
162
+ genes[ r[2] ] << r[0]
163
+ elsif genes[ r[2] ].nil?
164
+ genes[ r[2] ] = r[0]
165
+ else
166
+ trash << r[2]
167
+ end
128
168
  end
129
- dbh.close
130
- mfh.close
131
- models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
132
- models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
133
- models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
134
- if o[:list]
135
- models.each_pair{ |id,desc| puts [id,desc].join("\t") }
136
- exit
169
+ end
170
+
171
+ # Report statistics
172
+ if o[:stats]
173
+ reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
174
+ if o[:metagenome]
175
+ reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
176
+ gc = [0] * (models.size - genes.size) +
177
+ genes.values.map{ |g| g.length }.sort
178
+ reph.printf "! Mean number of copies per model: %.3f.\n",
179
+ gc.inject(:+).to_f / models.size
180
+ reph.printf "! Median number of copies per model: %.1f.\n",
181
+ gc.size.even? ? gc[gc.size/2, 2].inject(:+).to_f / 2 : gc[gc.size/2]
182
+ if o[:genes] and genes.size != models.size
183
+ reph.printf "! Missing genes: %s\n",
184
+ ([''] + models.keys.select{ |m| not genes.keys.include? m }.
185
+ map{|m| "#{m}: #{models[m]}."}).join("\n! ")
186
+ end
187
+ else
188
+ reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
189
+ reph.printf "! Completeness: %.1f%%.\n",
190
+ 100.0 * genes.size / models.size
191
+ reph.printf "! Contamination: %.1f%%.\n",
192
+ 100.0 * trash.size / models.size
193
+ if o[:genes]
194
+ reph.printf "! Multiple copies: %s\n",
195
+ ([''] + trash.uniq.
196
+ map{ |m| "#{trash.count(m)+1} #{m}: #{models[m]}." }).
197
+ join("\n! ") unless trash.empty?
198
+ reph.printf "! Missing genes: %s\n",
199
+ ([''] + models.keys.select{ |m| not genes.keys.include? m }.
200
+ map{ |m| "#{m}: #{models[m]}." }).
201
+ join("\n! ") unless genes.size == models.size
202
+ end
137
203
  end
138
-
139
- # Check HMMer version and run HMMsearch.
140
- if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
141
- raise "You have provided an unsupported version of HMMER. " +
142
- "This script requires HMMER 3.0+."
204
+ reph.close unless o[:report].nil?
205
+ end
206
+
207
+ # Extract sequences
208
+ unless o[:out].nil? and o[:permodel].nil?
209
+ $stderr.puts 'Extracting sequences.' unless o[:q]
210
+ faah = File.open(o[:in], 'r')
211
+ outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
212
+ geneh = nil
213
+ in_gene = nil
214
+ unless o[:permodel].nil?
215
+ genes.keys.each do |m|
216
+ File.open("#{o[:permodel]}#{m}.faa", 'w').close
217
+ end
143
218
  end
144
- `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{dir}/hmmsearch" \
145
- --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
146
- > #{dir}/hmmsearch.log`
147
-
148
- # Parse output
149
- $stderr.puts "Parsing results." unless o[:q]
150
- resh = File.open("#{dir}/hmmsearch","r")
151
- trash = []
152
- genes = {}
153
- while ln = resh.gets
154
- next if ln =~ /^#/
155
- r = ln.split /\s+/
156
- next unless models.include? r[2]
157
- if o[:metagenome]
158
- genes[ r[2] ] = [] if genes[ r[2] ].nil?
159
- genes[ r[2] ] << r[0]
160
- elsif genes[ r[2] ].nil?
161
- genes[ r[2] ] = r[0]
162
- else
163
- trash << r[2]
164
- end
219
+ while ln = faah.gets
220
+ if ln =~ /^>(\S+)/
221
+ if o[:metagenome]
222
+ in_gene = genes.keys.
223
+ map{ |k| genes[k].include?($1) ? k : nil }.compact.first
224
+ in_gene = [in_gene, $1] unless in_gene.nil?
225
+ else
226
+ in_gene = genes.rassoc($1)
227
+ end
228
+ next if in_gene.nil?
229
+ geneh.close unless geneh.nil?
230
+ geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa", 'a+') unless
231
+ o[:permodel].nil?
232
+ outh.print(o[:rename].nil? ?
233
+ ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
234
+ geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless
235
+ geneh.nil?
236
+ else
237
+ next if in_gene.nil?
238
+ outh.print ln unless outh.nil?
239
+ geneh.print ln unless geneh.nil?
240
+ end
165
241
  end
242
+ geneh.close unless geneh.nil?
243
+ outh.close unless outh.nil?
244
+ faah.close
245
+ end
166
246
 
167
- # Report statistics
168
- if o[:stats]
169
- reph = o[:report].nil? ? $stdout : File.open(o[:report], "w")
170
- if o[:metagenome]
171
- reph.printf "! Essential genes found: %d/%d.\n",
172
- genes.size, models.size
173
- gc = [0]*(models.size - genes.size) +
174
- genes.values.map{|g| g.length}.sort
175
- reph.printf "! Mean number of copies per model: %.3f.\n",
176
- gc.inject(:+).to_f/models.size
177
- reph.printf "! Median number of copies per model: %.1f.\n",
178
- gc.size.even? ? gc[gc.size/2,2].inject(:+).to_f/2 : gc[gc.size/2]
179
- if o[:genes] and genes.size != models.size
180
- reph.printf "! Missing genes: %s\n",
181
- ([""] +
182
- models.keys.select{|m| not genes.keys.include? m
183
- }.map{|m| "#{m}: #{models[m]}."}).join("\n! ")
184
- end
185
- else
186
- reph.printf "! Essential genes found: %d/%d.\n",
187
- genes.size, models.size
188
- reph.printf "! Completeness: %.1f%%.\n",
189
- 100.0*genes.size/models.size
190
- reph.printf "! Contamination: %.1f%%.\n",
191
- 100.0*trash.size/models.size
192
- if o[:genes]
193
- reph.printf "! Multiple copies: %s\n",
194
- ([""] +
195
- trash.uniq.map{|m|
196
- "#{trash.count(m)+1} #{m}: #{models[m]}."}
197
- ).join("\n! ") unless trash.empty?
198
- reph.printf "! Missing genes: %s\n",
199
- ([""] +
200
- models.keys.select{|m| not genes.keys.include? m
201
- }.map{|m| "#{m}: #{models[m]}."}
202
- ).join("\n! ") unless genes.size==models.size
203
- end
204
- end
205
- reph.close unless o[:report].nil?
247
+ unless o[:alignments].nil?
248
+ aln = {}
249
+ File.open("#{dir}/a.sto", 'r') do |fh|
250
+ cur_model = nil
251
+ mask = []
252
+ fh.each_line do |ln|
253
+ case ln.chomp
254
+ when /^# STOCKHOLM/
255
+ cur_model = nil
256
+ mask = []
257
+ when /^#=GS (\S+)\/([\d\-]+)\s+DE/
258
+ cur_model ||= genes.rassoc($1).first
259
+ aln[ cur_model ] ||= [ "# #{cur_model} : #{$1} : #{$2}" ]
260
+ when /^#=GC RF\s+(\S+)/
261
+ aln[ cur_model ][ 1 ] ||= $1.upcase.tap do |i|
262
+ mask.each{ |d| i[d] = '' }
263
+ end
264
+ when /^[^#]\S*\s+(\S+)/
265
+ next if aln[ cur_model ][ 2 ]
266
+ aln[ cur_model ][ 2 ] = $1.upcase
267
+ mask = aln[ cur_model ][ 2 ].split('').each_with_index.
268
+ map{ |v, k| v == '.' ? k : nil }.compact.reverse
269
+ aln[ cur_model ][ 2 ].delete!('.') unless mask.empty?
270
+ end
271
+ end
206
272
  end
207
-
208
- # Extract sequences
209
- unless o[:out].nil? and o[:permodel].nil?
210
- $stderr.puts "Extracting sequences." unless o[:q]
211
- faah = File.open(o[:in], "r")
212
- outh = o[:out].nil? ? nil : File.open(o[:out], "w")
213
- geneh = nil
214
- in_gene = nil
215
- unless o[:permodel].nil?
216
- genes.keys.each do |m|
217
- File.open("#{o[:permodel]}#{m}.faa", "w").close
218
- end
219
- end
220
- while ln = faah.gets
221
- if ln =~ /^>(\S+)/
222
- if o[:metagenome]
223
- in_gene = genes.keys.map{|k| genes[k].include?($1) ? k : nil
224
- }.compact.first
225
- in_gene = [in_gene, $1] unless in_gene.nil?
226
- else
227
- in_gene = genes.rassoc($1)
228
- end
229
- next if in_gene.nil?
230
- geneh.close unless geneh.nil?
231
- geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa",
232
- "a+") unless o[:permodel].nil?
233
- outh.print(o[:rename].nil? ?
234
- ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
235
- geneh.print(
236
- o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless geneh.nil?
237
- else
238
- next if in_gene.nil?
239
- outh.print ln unless outh.nil?
240
- geneh.print ln unless geneh.nil?
241
- end
242
- end
243
- geneh.close unless geneh.nil?
244
- outh.close unless outh.nil?
245
- faah.close
273
+ File.open(o[:alignments], 'w') do |fh|
274
+ aln.each { |k, v| v.each{ |i| fh.puts i } }
246
275
  end
276
+ end
247
277
 
248
- $stderr.puts "Done." unless o[:q]
249
- end # |dir|
278
+ $stderr.puts 'Done.' unless o[:q]
279
+ end # |dir|
250
280
  rescue => err
251
- $stderr.puts "Exception: #{err}\n\n"
252
- err.backtrace.each { |l| $stderr.puts l + "\n" }
253
- err
281
+ $stderr.puts "Exception: #{err}\n\n"
282
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
283
+ err
254
284
  end
@@ -0,0 +1,159 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license Artistic-2.0
5
+
6
+ require 'optparse'
7
+
8
+ o = {q: false}
9
+ ARGV << '-h' if ARGV.size==0
10
+
11
+ OptionParser.new do |opt|
12
+ opt.banner = "
13
+ Estimates Average Amino Acid Identity (AAI) from the essential genes extracted
14
+ and aligned by HMM.essential.rb (see --alignments).
15
+
16
+ Usage: #{$0} [options]"
17
+ opt.separator ''
18
+ opt.separator 'Mandatory'
19
+ opt.on('-1 PATH', 'Input alignments file for genome 1.'){ |v| o[:a] = v }
20
+ opt.on('-2 PATH', 'Input alignments file for genome 2.'){ |v| o[:b] = v }
21
+ opt.separator ''
22
+ opt.separator 'Options'
23
+ opt.on('-a', '--aln-out FILE',
24
+ 'Output file containing the aligned proteins'){ |v| o[:alnout] = v }
25
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
26
+ opt.on('-h', '--help', 'Display this screen.') do
27
+ puts opt
28
+ exit
29
+ end
30
+ opt.separator ''
31
+ end.parse!
32
+ abort '-1 is mandatory.' if o[:a].nil?
33
+ abort '-2 is mandatory.' if o[:b].nil?
34
+
35
+ class HList
36
+ attr_accessor :list
37
+ def initialize(file)
38
+ @list = {}
39
+ r = File.readlines(file)
40
+ while not r.empty?
41
+ e = HElement.new(*r.shift(3))
42
+ @list[ e.model_id ] = e
43
+ end
44
+ end
45
+
46
+ def [](model_id)
47
+ list[model_id]
48
+ end
49
+
50
+ ##
51
+ # Returns an array of HAln objects.
52
+ def align(other)
53
+ list.keys.map do |model_id|
54
+ self[model_id].align(other[model_id]) unless other[model_id].nil?
55
+ end.compact
56
+ end
57
+
58
+ def models
59
+ list.keys
60
+ end
61
+ end
62
+
63
+ class HElement
64
+ attr_accessor :defline, :model_id, :protein_id, :protein_coords
65
+ attr_accessor :model_aln, :protein_aln
66
+ def initialize(defline, model_aln, protein_aln)
67
+ @defline = defline.chomp
68
+ @model_aln = model_aln.chomp
69
+ @protein_aln = protein_aln.chomp
70
+ if defline =~ /^# (.+) : (.+) : (.+)/
71
+ @model_id = $1
72
+ @protein_id = $2
73
+ @protein_coords = $3
74
+ end
75
+ end
76
+
77
+ def dup
78
+ HElement.new(defline, model_aln, protein_aln)
79
+ end
80
+
81
+ ##
82
+ # Returns an HAln object
83
+ def align(other)
84
+ HAln.new(self, other)
85
+ end
86
+
87
+ def mask
88
+ @mask ||= model_aln.chars.
89
+ each_with_index.map{ |v, k| v == '.' ? k : nil }.
90
+ compact.reverse
91
+ end
92
+
93
+ def mask!(template)
94
+ (template - mask).each do |d|
95
+ @model_aln[d] = '-' + @model_aln[d]
96
+ @protein_aln[d] = '-' + @protein_aln[d]
97
+ end
98
+ end
99
+ end
100
+
101
+ class HAln
102
+ attr :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id
103
+ def initialize(a, b)
104
+ a_masked = a.dup
105
+ a_masked.mask! b.mask.reverse
106
+ b_masked = b.dup
107
+ b_masked.mask! b_masked.mask
108
+ @protein_1 = a_masked.protein_aln
109
+ @protein_2 = b_masked.protein_aln
110
+ @model_id = a.model_id
111
+ @protein_1_id = a.protein_id + '/' + a.protein_coords
112
+ @protein_2_id = b.protein_id + '/' + b.protein_coords
113
+ end
114
+
115
+ def stats
116
+ @stats = { len: 0, gaps: 0, matches: 0 }
117
+ return @stats unless @stats[:id].nil?
118
+ protein_1.chars.each_with_index do |v, k|
119
+ next if v == '-' and protein_2[k] == '-'
120
+ @stats[:len] += 1
121
+ if v == protein_2[k]
122
+ @stats[:matches] += 1
123
+ elsif v == '-' or protein_2[k] == '-'
124
+ @stats[:gaps] += 1
125
+ end
126
+ end
127
+ @stats.tap { |i| i[:id] = 100.0 * @stats[:matches] / @stats[:len] }
128
+ end
129
+
130
+ def stats_to_s
131
+ stats.map{ |k,v| "#{k}:#{v}" }.join " "
132
+ end
133
+
134
+ def to_s
135
+ "# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}\n" +
136
+ protein_1 + "\n" + protein_2 + "\n"
137
+ end
138
+ end
139
+
140
+ hlist1 = HList.new(o[:a])
141
+ hlist2 = HList.new(o[:b])
142
+ haln_arr = hlist1.align(hlist2)
143
+
144
+ avg_identity = haln_arr.map{ |i| i.stats[:id] }.inject(:+) / haln_arr.size
145
+ avg2_identity = haln_arr.map{ |i| i.stats[:id] ** 2 }.inject(:+) / haln_arr.size
146
+ sd_identity = Math.sqrt( avg2_identity - avg_identity ** 2 )
147
+ puts "Common models: #{haln_arr.size}"
148
+ puts "All models: #{(hlist1.models | hlist1.models).size}"
149
+ puts "Average identity: #{avg_identity.round(2)}%"
150
+ puts "SD identity: #{sd_identity.round(2)}"
151
+
152
+ if o[:alnout]
153
+ File.open(o[:alnout], 'w') do |fh|
154
+ haln_arr.each do |i|
155
+ fh.puts i
156
+ end
157
+ end
158
+ end
159
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11.2
4
+ version: 0.3.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-04-20 00:00:00.000000000 Z
11
+ date: 2019-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -321,6 +321,7 @@ files:
321
321
  - utils/enveomics/Scripts/GFF.catsbj.pl
322
322
  - utils/enveomics/Scripts/GenBank.add_fields.rb
323
323
  - utils/enveomics/Scripts/HMM.essential.rb
324
+ - utils/enveomics/Scripts/HMM.haai.rb
324
325
  - utils/enveomics/Scripts/HMMsearch.extractIds.rb
325
326
  - utils/enveomics/Scripts/JPlace.distances.rb
326
327
  - utils/enveomics/Scripts/JPlace.to_iToL.rb