miga-base 0.3.11.2 → 0.3.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 71f390ca4ceb03f4d0dfeac23f55939aaf91d135bbd22129102468d8e25095e0
4
- data.tar.gz: e139fb696e76345da577f4e8fc4cd4b0bf11efaee23423e5693b5f46b1ee06b6
3
+ metadata.gz: 48d903a383d237f7b236d8ad1706a5fb017b31d320768353a1bc33846ea0d471
4
+ data.tar.gz: 9b448f00992aa4152df34ded6a48105afe5c1daf1e994737f253b519f81c998f
5
5
  SHA512:
6
- metadata.gz: 3cc2ec5d43cc613ab69debed6e623d6793190dae0e6e12954af5d1edf58ced8c2b87fdaa55b9a47b428ed3ed833cf19418875c40b1b8efc9ea84167c94b31fd9
7
- data.tar.gz: 5ec76e8f90c73b274b2d4b67ed3b09bd085126940e2c7a5d0941a7be390a0c79c739956690e31902a70a7b1b148f05d08dc665ec555c5c079dd0ba2655736b98
6
+ metadata.gz: 1579eecdab3c38bda21678baa4c903c85dfbb07e19f993212d89f6b62e1561d4044b41cfc98d80e43780d1fd172ef607a92d5320ff570de87dafda03520f2d59
7
+ data.tar.gz: d083fc8ae10735f647681d3924c8990951fc2a6ba7d501c10b89d32b0f00088a5128dfca2b24df118b2f8a56e3a5853d210933e300d7ab7bd3bd14c4bab90b8a
@@ -226,8 +226,8 @@ module MiGA::Dataset::Result
226
226
  def add_result_essential_genes(base, _opts)
227
227
  return nil unless result_files_exist?(base, %w[.ess.faa .ess .ess/log])
228
228
  r = MiGA::Result.new("#{base}.json")
229
- add_files_to_ds_result(r, name, ess_genes: ".ess.faa",
230
- collection: ".ess", report: ".ess/log")
229
+ add_files_to_ds_result(r, name, ess_genes: '.ess.faa',
230
+ collection: '.ess', report: '.ess/log', alignments: '.ess/proteins.aln')
231
231
  end
232
232
 
233
233
  ##
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 11, 2]
13
+ VERSION = [0.3, 12, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2019, 04, 20)
21
+ VERSION_DATE = Date.new(2019, 04, 26)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
@@ -7,31 +7,31 @@ SCRIPT="essential_genes"
7
7
  cd "$PROJECT/data/07.annotation/01.function/01.essential"
8
8
 
9
9
  # Initialize
10
- miga date > "$DATASET.start"
11
- FAA="../../../06.cds/$DATASET.faa"
10
+ miga date > "${DATASET}.start"
11
+ FAA="../../../06.cds/${DATASET}.faa"
12
12
 
13
13
  # Check if there are any proteins
14
14
  if [[ ! -s $FAA ]] ; then
15
15
  echo Empty protein set, bypassing essential genes
16
- rm "$DATASET.start"
17
- miga create_dataset -P "$PROJECT" -D "$DATASET" \
18
- -m run_essential_genes=false --update
16
+ rm "${DATASET}.start"
17
+ miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
19
18
  exit 0
20
19
  fi
21
20
 
22
21
  # Find and extract essential genes
23
- [[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
24
- mkdir "$DATASET.ess"
22
+ [[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
23
+ mkdir "${DATASET}.ess"
25
24
  TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
26
25
  --metadata "type" | awk '{print $2}')
27
26
  if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
28
- HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
29
- -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
30
- > "$DATASET.ess/log"
27
+ HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
28
+ -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
29
+ > "${DATASET}.ess/log"
31
30
  else
32
- HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
33
- -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
34
- > "$DATASET.ess/log"
31
+ HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
32
+ -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
33
+ --alignments "${DATASET}.ess/proteins.aln" \
34
+ > "${DATASET}.ess/log"
35
35
  fi
36
36
 
37
37
  # Reduce files
@@ -42,5 +42,5 @@ if exists "$DATASET".ess/*.faa ; then
42
42
  fi
43
43
 
44
44
  # Finalize
45
- miga date > "$DATASET.done"
45
+ miga date > "${DATASET}.done"
46
46
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
data/test/daemon_test.rb CHANGED
@@ -37,6 +37,7 @@ class DaemonTest < Test::Unit::TestCase
37
37
  File.expand_path("data/01.raw_reads/ds1.1.fastq", p.path))
38
38
  FileUtils.cp(File.expand_path("daemon/daemon.json", p.path),
39
39
  File.expand_path("data/01.raw_reads/ds1.done", p.path))
40
+ ds.first_preprocessing(true)
40
41
  out = capture_stdout do
41
42
  d.check_datasets
42
43
  end
@@ -401,6 +401,19 @@
401
401
  "description": ["Path to the report file. By default, the report is",
402
402
  "sent to the STDOUT."]
403
403
  },
404
+ {
405
+ "name": "HMMsearch output",
406
+ "opt": "--hmm-out",
407
+ "arg": "out_file",
408
+ "description": ["Save HMMsearch output in this file. By default,",
409
+ "not saved."]
410
+ },
411
+ {
412
+ "opt": "--alignments",
413
+ "opt": "out_file",
414
+ "description": ["Save the aligned proteins in this file. By default,",
415
+ "not saved."]
416
+ },
404
417
  {
405
418
  "opt": "--bacteria",
406
419
  "description": "If set, ignores models typically missing in Bacteria."
@@ -189,7 +189,7 @@
189
189
  "description": ["Calculates the Average Nucleotide Identity between two",
190
190
  "genomes."],
191
191
  "help_arg": "--help",
192
- "see_also": ["aai.rb","rbm.rb"],
192
+ "see_also": ["aai.rb","rbm.rb","HMM.essential.rb"],
193
193
  "cite": [
194
194
  ["Konstantinidis & Tiedje, 2005, PNAS",
195
195
  "http://dx.doi.org/10.1073%2Fpnas.0409727102"],
@@ -362,6 +362,38 @@
362
362
  }
363
363
  ]
364
364
  },
365
+ {
366
+ "task": "HMM.haai.rb",
367
+ "description": ["Estimates Average Amino Acid Identity (AAI) from the",
368
+ "essential genes extracted and aligned by HMM.essential.rb (see",
369
+ "Alignments)."],
370
+ "help_arg": "--help",
371
+ "see_also": ["HMM.essential.rb","aai.rb"],
372
+ "options": [
373
+ {
374
+ "name": "Alignments 1",
375
+ "opt": "-1",
376
+ "arg": "in_file",
377
+ "description": "Input alignments file for genome 1."
378
+ },
379
+ {
380
+ "name": "Alignments 2",
381
+ "opt": "-2",
382
+ "arg": "in_file",
383
+ "description": "Input alignments file for genome 2."
384
+ },
385
+ {
386
+ "name": "Alignment output",
387
+ "opt": "--aln-out",
388
+ "arg": "out_file",
389
+ "description": "Output file containing the aligned proteins."
390
+ },
391
+ {
392
+ "opt": "--quiet",
393
+ "description": "Run quietly (no STDERR output)."
394
+ }
395
+ ]
396
+ },
365
397
  {
366
398
  "task": "rbm.rb",
367
399
  "description": ["Finds the reciprocal best matches between two sets of",
@@ -29,6 +29,7 @@
29
29
  "Execution": [
30
30
  "aai.rb",
31
31
  "ani.rb",
32
+ "HMM.haai.rb",
32
33
  "rbm.rb"
33
34
  ]
34
35
  },
@@ -101,6 +102,7 @@
101
102
  ],
102
103
  "Search": [
103
104
  "HMM.essential.rb",
105
+ "HMM.haai.rb",
104
106
  "HMMsearch.extractIds.rb",
105
107
  "ogs.annotate.rb",
106
108
  "ogs.core-pan.rb",
@@ -64,15 +64,15 @@
64
64
  "task": "HMM.essential.rb",
65
65
  "description": ["Typical single-copy bacterial genes present in",
66
66
  "Mycoplasma genitalium."],
67
- "values": ["Mgen_M2288.faa",null,null,null,true,null,null,null,null,null,
68
- null,null,null,null,null,null]
67
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
68
+ null,null,null,null,null,null,null,null]
69
69
  },
70
70
  {
71
71
  "task": "HMM.essential.rb",
72
72
  "description": ["Typical single-copy archaeal genes present in",
73
73
  "Nanoarchaeum equitans."],
74
- "values": ["Mgen_M2288.faa",null,null,null,null,true,null,null,null,null,
75
- null,null,null,null,null,null]
74
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
75
+ null,null,null,null,null,null,null,null]
76
76
  },
77
77
  {
78
78
  "task": "Newick.autoprune.R",
@@ -1,20 +1,17 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
3
  # @author Luis M. Rodriguez-R
5
4
  # @license artistic license 2.0
6
- # @update Mar-23-2016
7
- #
8
5
 
9
- $:.push File.expand_path("../lib", __FILE__)
10
- require "enveomics_rb/enveomics"
11
- use "tmpdir"
12
- use "zlib"
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
8
+ use 'tmpdir'
9
+ use 'zlib'
13
10
 
14
- o = {bin:"", thr:2, q:false, stats:true, genes:true, bacteria:false,
15
- archaea:false, genomeeq:false, metagenome:false, list:false}
11
+ o = {bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
12
+ archaea: false, genomeeq: false, metagenome: false, list: false}
16
13
  OptionParser.new do |opts|
17
- opts.banner = "
14
+ opts.banner = "
18
15
  Finds and extracts a collection of essential proteins suitable for genome
19
16
  completeness evaluation and phylogenetic analyses. Important note: most complete
20
17
  bacterial genomes contain only 106/111 genes in this collection, therefore
@@ -27,68 +24,74 @@ completeness (e.g., Nanoarchaeum equitans returns 88.5%).
27
24
  Requires HMMer 3.0+ (http://hmmer.janelia.org/software).
28
25
 
29
26
  Usage: #{$0} [options]"
30
- opts.separator ""
31
- opts.separator "Mandatory"
32
- opts.on("-i", "--in FILE",
33
- "Path to the FastA file containing all the proteins in a genome."
34
- ){ |v| o[:in] = v }
35
- opts.separator ""
36
- opts.separator "Report Options"
37
- opts.on("-o", "--out FILE",
38
- "Path to the output FastA file with the translated essential genes.",
39
- "By default the file is not produced."){ |v| o[:out] = v }
40
- opts.on("-m", "--per-model STR",
41
- "Prefix of translated genes in independent files with the name of the",
42
- "model appended. By default files are not produced."
43
- ){ |v| o[:permodel] = v }
44
- opts.on("-R", "--report FILE",
45
- "Path to the report file. By default, the report is sent to the STDOUT."
46
- ){ |v| o[:report] = v }
47
- opts.on("-B", "--bacteria",
48
- "If set, ignores models typically missing in Bacteria."
49
- ){ |v| o[:bacteria] = v }
50
- opts.on("-A", "--archaea",
51
- "If set, ignores models typically missing in Archaea."
52
- ){ |v| o[:archaea] = v }
53
- opts.on("-G", "--genome-eq",
54
- "If set, ignores models not suitable for genome-equivalents estimations.",
55
- "See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940."
56
- ){ |v| o[:genomeeq] = v }
57
- opts.on("-r", "--rename STR",
58
- "If set, renames the sequences with the string provided and appends it",
59
- "with pipe and the gene name (except in --per-model files)."
60
- ){ |v| o[:rename]=v }
61
- opts.on("-n", "--no-stats",
62
- "If set, no statistics are reported on genome evaluation."
63
- ){ |v| o[:stats] = v }
64
- opts.on("-s", "--no-genes",
65
- "If set, statistics won't include the lists of missing/multi-copy genes."
66
- ){ |v| o[:genes] = v }
67
- opts.on("-M", "--metagenome",
68
- "If set, it allows for multiple copies of each gene and turns on",
69
- "metagenomic report mode."){ |v| o[:metagenome] = v }
70
- opts.separator ""
71
- opts.separator "Other Options"
72
- opts.on("-L", "--list-models",
73
- "If set, it only lists the models and exits. Compatible with -A, -B, -G,",
74
- "and -q; ignores all other parameters."){ |v| o[:list] = v }
75
- opts.on("-b", "--bin DIR",
76
- "Path to the directory containing the binaries of HMMer 3.0+."
77
- ){ |v| o[:bin] = v }
78
- opts.on("--model-file",
79
- "External file containing models to search."){ |v| o[:model_file] = v }
80
- opts.on("-t", "--threads INT",
81
- "Number of parallel threads to be used. By default: #{o[:thr]}."
82
- ){ |v| o[:thr] = v.to_i }
83
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)."){ o[:q] = true }
84
- opts.on("-h", "--help", "Display this screen.") do
85
- puts opts
86
- exit
87
- end
88
- opts.separator ""
27
+ opts.separator ''
28
+ opts.separator 'Mandatory'
29
+ opts.on('-i', '--in FILE',
30
+ 'Path to the FastA file containing all the proteins in a genome.'
31
+ ){ |v| o[:in] = v }
32
+ opts.separator ''
33
+ opts.separator 'Report Options'
34
+ opts.on('-o', '--out FILE',
35
+ 'Path to the output FastA file with the translated essential genes.',
36
+ 'By default the file is not produced.'){ |v| o[:out] = v }
37
+ opts.on('-m', '--per-model STR',
38
+ 'Prefix of translated genes in independent files with the name of the',
39
+ 'model appended. By default files are not produced.'
40
+ ){ |v| o[:permodel] = v }
41
+ opts.on('-R', '--report FILE',
42
+ 'Path to the report file. By default, the report is sent to the STDOUT.'
43
+ ){ |v| o[:report] = v }
44
+ opts.on('--hmm-out FILE',
45
+ 'Save HMMsearch output in this file. By default, not saved.'
46
+ ){ |v| o[:hmmout] = v }
47
+ opts.on('--alignments FILE',
48
+ 'Save the aligned proteins in this file. By default, not saved'
49
+ ){ |v| o[:alignments] = v }
50
+ opts.on('-B', '--bacteria',
51
+ 'If set, ignores models typically missing in Bacteria.'
52
+ ){ |v| o[:bacteria] = v }
53
+ opts.on('-A', '--archaea',
54
+ 'If set, ignores models typically missing in Archaea.'
55
+ ){ |v| o[:archaea] = v }
56
+ opts.on('-G', '--genome-eq',
57
+ 'If set, ignores models not suitable for genome-equivalents estimations.',
58
+ 'See Rodriguez-R et al, 2015, ISME J 9(9):1928-1940.'
59
+ ){ |v| o[:genomeeq] = v }
60
+ opts.on('-r', '--rename STR',
61
+ 'If set, renames the sequences with the string provided and appends it',
62
+ 'with pipe and the gene name (except in --per-model files).'
63
+ ){ |v| o[:rename]=v }
64
+ opts.on('-n', '--no-stats',
65
+ 'If set, no statistics are reported on genome evaluation.'
66
+ ){ |v| o[:stats] = v }
67
+ opts.on('-s', '--no-genes',
68
+ 'If set, statistics won\'t include the lists of missing/multi-copy genes.'
69
+ ){ |v| o[:genes] = v }
70
+ opts.on('-M', '--metagenome',
71
+ 'If set, it allows for multiple copies of each gene and turns on',
72
+ 'metagenomic report mode.'){ |v| o[:metagenome] = v }
73
+ opts.separator ''
74
+ opts.separator 'Other Options'
75
+ opts.on('-L', '--list-models',
76
+ 'If set, it only lists the models and exits. Compatible with -A, -B, -G,',
77
+ 'and -q; ignores all other parameters.'){ |v| o[:list] = v }
78
+ opts.on('-b', '--bin DIR',
79
+ 'Path to the directory containing the binaries of HMMer 3.0+.'
80
+ ){ |v| o[:bin] = v }
81
+ opts.on('--model-file',
82
+ 'External file containing models to search.'){ |v| o[:model_file] = v }
83
+ opts.on('-t', '--threads INT',
84
+ "Number of parallel threads to be used. By default: #{o[:thr]}."
85
+ ){ |v| o[:thr] = v.to_i }
86
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
87
+ opts.on('-h', '--help', 'Display this screen.') do
88
+ puts opts
89
+ exit
90
+ end
91
+ opts.separator ''
89
92
  end.parse!
90
- abort "-i is mandatory" if o[:in].nil? and not o[:list]
91
- o[:bin] = o[:bin]+"/" if o[:bin].size > 0
93
+ abort '-i is mandatory' if o[:in].nil? and not o[:list]
94
+ o[:bin] = o[:bin] + '/' if o[:bin].size > 0
92
95
  o[:rename] = nil if o[:metagenome]
93
96
 
94
97
  not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
@@ -107,148 +110,175 @@ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
107
110
  TIGR00389 TIGR00436 tRNA-synth_1d}
108
111
 
109
112
  begin
110
- Dir.mktmpdir do |dir|
111
- $stderr.puts "Temporal directory: #{dir}." unless o[:q]
113
+ Dir.mktmpdir do |dir|
114
+ $stderr.puts "Temporal directory: #{dir}." unless o[:q]
115
+
116
+ # Create database.
117
+ $stderr.puts 'Searching models.' unless o[:q]
118
+ models = {}
119
+ model_id = nil
120
+ dbh = File.open("#{dir}/essential.hmm", 'w')
121
+ o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
122
+ mfh = (File.extname(o[:model_file]) == '.gz') ?
123
+ Zlib::GzipReader.open(o[:model_file]) :
124
+ File.open(o[:model_file], 'r')
125
+ while ln = mfh.gets
126
+ dbh.print ln
127
+ ln.chomp!
128
+ model_id = $1 if ln =~ /^NAME\s+(.+)/
129
+ models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
130
+ end
131
+ dbh.close
132
+ mfh.close
133
+ models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
134
+ models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
135
+ models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
136
+ if o[:list]
137
+ models.each_pair{ |id,desc| puts [id,desc].join("\t") }
138
+ exit
139
+ end
140
+
141
+ # Check HMMer version and run HMMsearch.
142
+ if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
143
+ raise 'You have provided an unsupported version of HMMER. ' +
144
+ 'This script requires HMMER 3.0+.'
145
+ end
146
+ o[:hmmout] ||= "#{dir}/hmmsearch"
147
+ `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{o[:hmmout]}" \
148
+ -A "#{dir}/a.sto" --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
149
+ > #{dir}/hmmsearch.log`
112
150
 
113
- # Create database.
114
- $stderr.puts "Searching models." unless o[:q]
115
- models = {}
116
- model_id = nil
117
- dbh = File.open("#{dir}/essential.hmm", "w")
118
- o[:model_file] ||= File.expand_path("../lib/data/essential.hmm.gz",
119
- __FILE__)
120
- mfh = (File.extname(o[:model_file])==".gz") ?
121
- Zlib::GzipReader.open(o[:model_file]) :
122
- File.open(o[:model_file],"r")
123
- while ln = mfh.gets
124
- dbh.print ln
125
- ln.chomp!
126
- model_id = $1 if ln =~ /^NAME\s+(.+)/
127
- models[model_id] = $1 if ln =~ /^DESC\s+(.+)/
151
+ # Parse output
152
+ $stderr.puts 'Parsing results.' unless o[:q]
153
+ trash = []
154
+ genes = {}
155
+ File.open(o[:hmmout], 'r') do |resh|
156
+ while ln = resh.gets
157
+ next if ln =~ /^#/
158
+ r = ln.split /\s+/
159
+ next unless models.include? r[2]
160
+ if o[:metagenome]
161
+ genes[ r[2] ] = [] if genes[ r[2] ].nil?
162
+ genes[ r[2] ] << r[0]
163
+ elsif genes[ r[2] ].nil?
164
+ genes[ r[2] ] = r[0]
165
+ else
166
+ trash << r[2]
167
+ end
128
168
  end
129
- dbh.close
130
- mfh.close
131
- models.delete_if { |m| not_in_archaea.include? m } if o[:archaea]
132
- models.delete_if { |m| not_in_bacteria.include? m } if o[:bacteria]
133
- models.delete_if { |m| not_as_genomeeq.include? m } if o[:genomeeq]
134
- if o[:list]
135
- models.each_pair{ |id,desc| puts [id,desc].join("\t") }
136
- exit
169
+ end
170
+
171
+ # Report statistics
172
+ if o[:stats]
173
+ reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
174
+ if o[:metagenome]
175
+ reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
176
+ gc = [0] * (models.size - genes.size) +
177
+ genes.values.map{ |g| g.length }.sort
178
+ reph.printf "! Mean number of copies per model: %.3f.\n",
179
+ gc.inject(:+).to_f / models.size
180
+ reph.printf "! Median number of copies per model: %.1f.\n",
181
+ gc.size.even? ? gc[gc.size/2, 2].inject(:+).to_f / 2 : gc[gc.size/2]
182
+ if o[:genes] and genes.size != models.size
183
+ reph.printf "! Missing genes: %s\n",
184
+ ([''] + models.keys.select{ |m| not genes.keys.include? m }.
185
+ map{|m| "#{m}: #{models[m]}."}).join("\n! ")
186
+ end
187
+ else
188
+ reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
189
+ reph.printf "! Completeness: %.1f%%.\n",
190
+ 100.0 * genes.size / models.size
191
+ reph.printf "! Contamination: %.1f%%.\n",
192
+ 100.0 * trash.size / models.size
193
+ if o[:genes]
194
+ reph.printf "! Multiple copies: %s\n",
195
+ ([''] + trash.uniq.
196
+ map{ |m| "#{trash.count(m)+1} #{m}: #{models[m]}." }).
197
+ join("\n! ") unless trash.empty?
198
+ reph.printf "! Missing genes: %s\n",
199
+ ([''] + models.keys.select{ |m| not genes.keys.include? m }.
200
+ map{ |m| "#{m}: #{models[m]}." }).
201
+ join("\n! ") unless genes.size == models.size
202
+ end
137
203
  end
138
-
139
- # Check HMMer version and run HMMsearch.
140
- if `"#{o[:bin]}hmmsearch" -h`.lines[1] !~ /HMMER 3/
141
- raise "You have provided an unsupported version of HMMER. " +
142
- "This script requires HMMER 3.0+."
204
+ reph.close unless o[:report].nil?
205
+ end
206
+
207
+ # Extract sequences
208
+ unless o[:out].nil? and o[:permodel].nil?
209
+ $stderr.puts 'Extracting sequences.' unless o[:q]
210
+ faah = File.open(o[:in], 'r')
211
+ outh = o[:out].nil? ? nil : File.open(o[:out], 'w')
212
+ geneh = nil
213
+ in_gene = nil
214
+ unless o[:permodel].nil?
215
+ genes.keys.each do |m|
216
+ File.open("#{o[:permodel]}#{m}.faa", 'w').close
217
+ end
143
218
  end
144
- `"#{o[:bin]}hmmsearch" --cpu #{o[:thr]} --tblout "#{dir}/hmmsearch" \
145
- --cut_tc --notextw "#{dir}/essential.hmm" "#{o[:in]}" \
146
- > #{dir}/hmmsearch.log`
147
-
148
- # Parse output
149
- $stderr.puts "Parsing results." unless o[:q]
150
- resh = File.open("#{dir}/hmmsearch","r")
151
- trash = []
152
- genes = {}
153
- while ln = resh.gets
154
- next if ln =~ /^#/
155
- r = ln.split /\s+/
156
- next unless models.include? r[2]
157
- if o[:metagenome]
158
- genes[ r[2] ] = [] if genes[ r[2] ].nil?
159
- genes[ r[2] ] << r[0]
160
- elsif genes[ r[2] ].nil?
161
- genes[ r[2] ] = r[0]
162
- else
163
- trash << r[2]
164
- end
219
+ while ln = faah.gets
220
+ if ln =~ /^>(\S+)/
221
+ if o[:metagenome]
222
+ in_gene = genes.keys.
223
+ map{ |k| genes[k].include?($1) ? k : nil }.compact.first
224
+ in_gene = [in_gene, $1] unless in_gene.nil?
225
+ else
226
+ in_gene = genes.rassoc($1)
227
+ end
228
+ next if in_gene.nil?
229
+ geneh.close unless geneh.nil?
230
+ geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa", 'a+') unless
231
+ o[:permodel].nil?
232
+ outh.print(o[:rename].nil? ?
233
+ ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
234
+ geneh.print(o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless
235
+ geneh.nil?
236
+ else
237
+ next if in_gene.nil?
238
+ outh.print ln unless outh.nil?
239
+ geneh.print ln unless geneh.nil?
240
+ end
165
241
  end
242
+ geneh.close unless geneh.nil?
243
+ outh.close unless outh.nil?
244
+ faah.close
245
+ end
166
246
 
167
- # Report statistics
168
- if o[:stats]
169
- reph = o[:report].nil? ? $stdout : File.open(o[:report], "w")
170
- if o[:metagenome]
171
- reph.printf "! Essential genes found: %d/%d.\n",
172
- genes.size, models.size
173
- gc = [0]*(models.size - genes.size) +
174
- genes.values.map{|g| g.length}.sort
175
- reph.printf "! Mean number of copies per model: %.3f.\n",
176
- gc.inject(:+).to_f/models.size
177
- reph.printf "! Median number of copies per model: %.1f.\n",
178
- gc.size.even? ? gc[gc.size/2,2].inject(:+).to_f/2 : gc[gc.size/2]
179
- if o[:genes] and genes.size != models.size
180
- reph.printf "! Missing genes: %s\n",
181
- ([""] +
182
- models.keys.select{|m| not genes.keys.include? m
183
- }.map{|m| "#{m}: #{models[m]}."}).join("\n! ")
184
- end
185
- else
186
- reph.printf "! Essential genes found: %d/%d.\n",
187
- genes.size, models.size
188
- reph.printf "! Completeness: %.1f%%.\n",
189
- 100.0*genes.size/models.size
190
- reph.printf "! Contamination: %.1f%%.\n",
191
- 100.0*trash.size/models.size
192
- if o[:genes]
193
- reph.printf "! Multiple copies: %s\n",
194
- ([""] +
195
- trash.uniq.map{|m|
196
- "#{trash.count(m)+1} #{m}: #{models[m]}."}
197
- ).join("\n! ") unless trash.empty?
198
- reph.printf "! Missing genes: %s\n",
199
- ([""] +
200
- models.keys.select{|m| not genes.keys.include? m
201
- }.map{|m| "#{m}: #{models[m]}."}
202
- ).join("\n! ") unless genes.size==models.size
203
- end
204
- end
205
- reph.close unless o[:report].nil?
247
+ unless o[:alignments].nil?
248
+ aln = {}
249
+ File.open("#{dir}/a.sto", 'r') do |fh|
250
+ cur_model = nil
251
+ mask = []
252
+ fh.each_line do |ln|
253
+ case ln.chomp
254
+ when /^# STOCKHOLM/
255
+ cur_model = nil
256
+ mask = []
257
+ when /^#=GS (\S+)\/([\d\-]+)\s+DE/
258
+ cur_model ||= genes.rassoc($1).first
259
+ aln[ cur_model ] ||= [ "# #{cur_model} : #{$1} : #{$2}" ]
260
+ when /^#=GC RF\s+(\S+)/
261
+ aln[ cur_model ][ 1 ] ||= $1.upcase.tap do |i|
262
+ mask.each{ |d| i[d] = '' }
263
+ end
264
+ when /^[^#]\S*\s+(\S+)/
265
+ next if aln[ cur_model ][ 2 ]
266
+ aln[ cur_model ][ 2 ] = $1.upcase
267
+ mask = aln[ cur_model ][ 2 ].split('').each_with_index.
268
+ map{ |v, k| v == '.' ? k : nil }.compact.reverse
269
+ aln[ cur_model ][ 2 ].delete!('.') unless mask.empty?
270
+ end
271
+ end
206
272
  end
207
-
208
- # Extract sequences
209
- unless o[:out].nil? and o[:permodel].nil?
210
- $stderr.puts "Extracting sequences." unless o[:q]
211
- faah = File.open(o[:in], "r")
212
- outh = o[:out].nil? ? nil : File.open(o[:out], "w")
213
- geneh = nil
214
- in_gene = nil
215
- unless o[:permodel].nil?
216
- genes.keys.each do |m|
217
- File.open("#{o[:permodel]}#{m}.faa", "w").close
218
- end
219
- end
220
- while ln = faah.gets
221
- if ln =~ /^>(\S+)/
222
- if o[:metagenome]
223
- in_gene = genes.keys.map{|k| genes[k].include?($1) ? k : nil
224
- }.compact.first
225
- in_gene = [in_gene, $1] unless in_gene.nil?
226
- else
227
- in_gene = genes.rassoc($1)
228
- end
229
- next if in_gene.nil?
230
- geneh.close unless geneh.nil?
231
- geneh = File.open("#{o[:permodel]}#{in_gene[0]}.faa",
232
- "a+") unless o[:permodel].nil?
233
- outh.print(o[:rename].nil? ?
234
- ln : ">#{o[:rename]}|#{in_gene[0]}\n") unless outh.nil?
235
- geneh.print(
236
- o[:rename].nil? ? ln : ">#{o[:rename]}\n") unless geneh.nil?
237
- else
238
- next if in_gene.nil?
239
- outh.print ln unless outh.nil?
240
- geneh.print ln unless geneh.nil?
241
- end
242
- end
243
- geneh.close unless geneh.nil?
244
- outh.close unless outh.nil?
245
- faah.close
273
+ File.open(o[:alignments], 'w') do |fh|
274
+ aln.each { |k, v| v.each{ |i| fh.puts i } }
246
275
  end
276
+ end
247
277
 
248
- $stderr.puts "Done." unless o[:q]
249
- end # |dir|
278
+ $stderr.puts 'Done.' unless o[:q]
279
+ end # |dir|
250
280
  rescue => err
251
- $stderr.puts "Exception: #{err}\n\n"
252
- err.backtrace.each { |l| $stderr.puts l + "\n" }
253
- err
281
+ $stderr.puts "Exception: #{err}\n\n"
282
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
283
+ err
254
284
  end
@@ -0,0 +1,159 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
+ # @license Artistic-2.0
5
+
6
+ require 'optparse'
7
+
8
# Command-line options; :q is true when running quietly.
o = { q: false }

# Without arguments there is nothing to do: show the help screen instead.
ARGV.push('-h') if ARGV.empty?

parser = OptionParser.new do |opt|
  opt.banner = "
Estimates Average Amino Acid Identity (AAI) from the essential genes extracted
and aligned by HMM.essential.rb (see --alignments).

Usage: #{$0} [options]"
  opt.separator ''
  opt.separator 'Mandatory'
  opt.on('-1 PATH', 'Input alignments file for genome 1.') do |path|
    o[:a] = path
  end
  opt.on('-2 PATH', 'Input alignments file for genome 2.') do |path|
    o[:b] = path
  end
  opt.separator ''
  opt.separator 'Options'
  opt.on('-a', '--aln-out FILE',
         'Output file containing the aligned proteins') do |path|
    o[:alnout] = path
  end
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).') do
    o[:q] = true
  end
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end
parser.parse!

# Both input files are required; complain about the first one missing.
{ a: '-1', b: '-2' }.each do |key, flag|
  abort "#{flag} is mandatory." if o[key].nil?
end
34
+
35
##
# Collection of HElement records read from an alignments file (as saved by
# HMM.essential.rb --alignments), indexed by model ID.
class HList
  # Hash of HElement objects keyed by their model ID.
  attr_accessor :list

  ##
  # Loads all the records in +file+. Every record spans three consecutive
  # lines, passed in that order to HElement.new: defline, model alignment,
  # and protein alignment. If two records share a model ID, the last one is
  # kept.
  #
  # Raises ArgumentError if the file ends with an incomplete record.
  def initialize(file)
    @list = {}
    File.readlines(file).each_slice(3) do |record|
      unless record.size == 3
        raise ArgumentError,
              "Incomplete record at the end of #{file}: " \
              "expected 3 lines but found #{record.size}"
      end
      element = HElement.new(*record)
      @list[element.model_id] = element
    end
  end

  ##
  # Returns the element registered for +model_id+, or nil if there is none.
  def [](model_id)
    list[model_id]
  end

  ##
  # Aligns each element against the element of +other+ (anything responding
  # to +[]+ with a model ID) with the same model ID, skipping models missing
  # in +other+. Returns an array of HAln objects.
  def align(other)
    list.keys.map do |model_id|
      counterpart = other[model_id]
      self[model_id].align(counterpart) unless counterpart.nil?
    end.compact
  end

  ##
  # Returns the array of model IDs in the list.
  def models
    list.keys
  end
end
62
+
63
##
# One record of an alignments file: a defline of the form
# "# model_id : protein_id : protein_coords" followed by the model alignment
# line and the protein alignment line.
class HElement
  attr_accessor :defline, :model_id, :protein_id, :protein_coords
  attr_accessor :model_aln, :protein_aln

  ##
  # Builds the element from the three raw lines of a record. Trailing line
  # terminators are removed. The IDs and coordinates are only set when
  # +defline+ follows the expected format; otherwise they remain nil.
  def initialize(defline, model_aln, protein_aln)
    @defline = defline.chomp
    @model_aln = model_aln.chomp
    @protein_aln = protein_aln.chomp
    fields = /^# (.+) : (.+) : (.+)/.match(defline)
    return if fields.nil?

    @model_id, @protein_id, @protein_coords = fields.captures
  end

  ##
  # Returns an independent copy: the alignment strings are not shared, so
  # masking the copy leaves the receiver untouched.
  def dup
    HElement.new(defline, model_aln, protein_aln)
  end

  ##
  # Returns an HAln object
  def align(other)
    HAln.new(self, other)
  end

  ##
  # Positions of the '.' characters in the model alignment, in descending
  # order. The result is cached until the next call to #mask!.
  def mask
    @mask ||= model_aln.each_char.with_index.
      map { |char, pos| char == '.' ? pos : nil }.compact.reverse
  end

  ##
  # Inserts a gap ('-') in both the model and the protein alignments before
  # each position of +template+ that is not already in #mask, processing the
  # positions in the order given. A position equal to the current length
  # appends the gap at the end; positions beyond that raise IndexError.
  #
  # Returns the array of positions at which gaps were inserted.
  def mask!(template)
    gaps = template - mask
    gaps.each do |pos|
      @model_aln.insert(pos, '-')
      @protein_aln.insert(pos, '-')
    end
    # The alignment changed, so the cached positions no longer apply.
    @mask = nil
    gaps
  end
end
100
+
101
##
# Pairwise alignment of two proteins matching the same model, built from two
# HElement-like objects.
class HAln
  attr_reader :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id

  ##
  # Builds the alignment between elements +a+ and +b+. Both are duplicated
  # before masking, so the original elements are not modified.
  def initialize(a, b)
    a_masked = a.dup
    a_masked.mask! b.mask.reverse
    b_masked = b.dup
    # NOTE(review): this masks b with its own mask, and HElement#mask! only
    # inserts gaps at template positions absent from the element's own mask,
    # so this call changes nothing. Possibly a's mask was intended; confirm
    # before changing, since it alters the reported identities.
    b_masked.mask! b_masked.mask
    @protein_1 = a_masked.protein_aln
    @protein_2 = b_masked.protein_aln
    @model_id = a.model_id
    @protein_1_id = a.protein_id + '/' + a.protein_coords
    @protein_2_id = b.protein_id + '/' + b.protein_coords
  end

  ##
  # Returns a Hash with the alignment statistics, computed once and cached:
  # - len: columns compared (columns with gaps in both proteins are ignored).
  # - gaps: compared columns with a gap in exactly one protein.
  # - matches: compared columns with identical characters.
  # - id: percentage of matches over len (NaN if len is zero).
  def stats
    @stats ||= compute_stats
  end

  ##
  # Returns the statistics as space-delimited "key:value" pairs.
  def stats_to_s
    stats.map { |k, v| "#{k}:#{v}" }.join ' '
  end

  ##
  # Returns a header line with the model, protein IDs, and statistics,
  # followed by one line per aligned protein.
  def to_s
    "# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}\n" +
      protein_1 + "\n" + protein_2 + "\n"
  end

  private

  ##
  # Walks the columns of protein_1 against protein_2 and tallies the values
  # returned by #stats.
  def compute_stats
    tally = { len: 0, gaps: 0, matches: 0 }
    protein_1.each_char.with_index do |res_1, pos|
      res_2 = protein_2[pos]
      next if res_1 == '-' && res_2 == '-'

      tally[:len] += 1
      if res_1 == res_2
        tally[:matches] += 1
      elsif res_1 == '-' || res_2 == '-'
        tally[:gaps] += 1
      end
    end
    tally[:id] = 100.0 * tally[:matches] / tally[:len]
    tally
  end
end
139
+
140
# Load both alignment sets and align the models present in both.
hlist1 = HList.new(o[:a])
hlist2 = HList.new(o[:b])
haln_arr = hlist1.align(hlist2)

# Without shared models there is nothing to average.
abort 'No models in common between the two inputs.' if haln_arr.empty?

# Mean and standard deviation of the per-model identities.
identities = haln_arr.map { |i| i.stats[:id] }
avg_identity = identities.inject(:+) / identities.size
avg2_identity = identities.map { |i| i**2 }.inject(:+) / identities.size
variance = avg2_identity - avg_identity**2
# Floating-point rounding can leave the variance marginally below zero (e.g.,
# when all identities are equal), which would make Math.sqrt raise
# Math::DomainError.
variance = 0.0 if variance < 0
sd_identity = Math.sqrt(variance)

puts "Common models: #{haln_arr.size}"
puts "All models: #{(hlist1.models | hlist2.models).size}"
puts "Average identity: #{avg_identity.round(2)}%"
puts "SD identity: #{sd_identity.round(2)}"

# Optionally save the pairwise alignments, one HAln#to_s entry per model.
if o[:alnout]
  File.open(o[:alnout], 'w') do |fh|
    haln_arr.each do |i|
      fh.puts i
    end
  end
end
159
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11.2
4
+ version: 0.3.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-04-20 00:00:00.000000000 Z
11
+ date: 2019-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -321,6 +321,7 @@ files:
321
321
  - utils/enveomics/Scripts/GFF.catsbj.pl
322
322
  - utils/enveomics/Scripts/GenBank.add_fields.rb
323
323
  - utils/enveomics/Scripts/HMM.essential.rb
324
+ - utils/enveomics/Scripts/HMM.haai.rb
324
325
  - utils/enveomics/Scripts/HMMsearch.extractIds.rb
325
326
  - utils/enveomics/Scripts/JPlace.distances.rb
326
327
  - utils/enveomics/Scripts/JPlace.to_iToL.rb