miga-base 0.5.0.0 → 0.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor.rb +6 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/cli/action/quality_wf.rb +1 -0
  5. data/lib/miga/cli/action/stats.rb +9 -8
  6. data/lib/miga/cli/action/wf.rb +5 -0
  7. data/lib/miga/cli/objects_helper.rb +1 -0
  8. data/lib/miga/common/format.rb +5 -2
  9. data/lib/miga/daemon.rb +2 -2
  10. data/lib/miga/project/dataset.rb +8 -7
  11. data/lib/miga/version.rb +2 -2
  12. data/scripts/essential_genes.bash +9 -8
  13. data/scripts/mytaxa.bash +3 -1
  14. data/scripts/mytaxa_scan.bash +15 -8
  15. data/utils/domain-ess-genes.rb +63 -0
  16. data/utils/enveomics/Manifest/Tasks/other.json +21 -2
  17. data/utils/enveomics/Manifest/examples.json +4 -4
  18. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
  19. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
  20. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
  21. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
  22. data/utils/enveomics/Scripts/HMM.essential.rb +54 -17
  23. data/utils/enveomics/Scripts/lib/data/{essential.hmm.gz → dupont_2012_essential.hmm.gz} +0 -0
  24. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  25. data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
  26. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  27. data/utils/enveomics/enveomics.R/R/df2dist.R +16 -17
  28. data/utils/enveomics/enveomics.R/R/recplot2.R +20 -15
  29. data/utils/enveomics/enveomics.R/README.md +1 -0
  30. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +5 -4
  31. data/utils/find-medoid.R +6 -1
  32. data/utils/mytaxa_scan.rb +49 -46
  33. data/utils/ref-tree.R +6 -1
  34. data/utils/subclades-nj.R +6 -1
  35. data/utils/subclades.R +6 -1
  36. metadata +6 -6
  37. data/utils/arch-ess-genes.rb +0 -57
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e370d282f1b28480765e1b91fcb7d8921d12baa31d22db1318975a1c2a79e19a
4
- data.tar.gz: e7fb3941fd3381e0e9696a2c577aeb157657335e56434e7c6d6650be7ba45e98
3
+ metadata.gz: f6888c1ce3756b8cc708736c0da052e5a7396277e0c903ebcfc083f17b6915e7
4
+ data.tar.gz: d998f6e087316a81de4aa8897452344c1987ce0cb9807f4a1e11a29f52dfbcf2
5
5
  SHA512:
6
- metadata.gz: 4642a212e1b4021e211fd144b515ff49e9ddb7a9b2292430553307a7ae165e4d8d5e6fd8426757f15ea6e70f4c3efbb055e0439497172cc1f91186d522c82635
7
- data.tar.gz: 8d5d3ded3c03e56505572102110a4bca4b84d06b2e73bcf208856610a4cd6e60092ce6d54d47dcef2c8acf85f1cce5f8461097e8699344df6738ed8493215112
6
+ metadata.gz: c6f7f8af791664b2bb704744535e0e39c4d5fc06521beb8feb57f658d6187a667100fde4312a3a8e5f47f5dd9d4b3c06326584d95e80262dc0e02a91795e192c
7
+ data.tar.gz: 4f633972d8ccc1cc06cc14ca6c48b50759d63af72ba75514a0e877a58af4e1d407fda2c2608c2077dd541e30e86add8359bc2e0838d93a19f7cc1bd5c5f5fff2
@@ -104,6 +104,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
104
104
  unless ok
105
105
  cli.say " > Registering again #{d.name}:#{r_k}"
106
106
  d.add_result(r_k, true, force: true)
107
+ sr = d.result(:stats) and sr.remove!
107
108
  end
108
109
  end
109
110
  end
@@ -123,7 +124,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
123
124
  changed = true
124
125
  end
125
126
  end
126
- d.add_result(:cds, true, force: true) if changed
127
+ if changed
128
+ d.add_result(:cds, true, force: true)
129
+ sr = d.result(:stats) and sr.remove!
130
+ end
127
131
  end
128
132
  end
129
133
 
@@ -136,6 +140,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
136
140
  if dir.nil?
137
141
  cli.say " > Removing #{d.name}:essential_genes"
138
142
  res.remove!
143
+ sr = d.result(:stats) and sr.remove!
139
144
  next
140
145
  end
141
146
  next if Dir["#{dir}/*.faa"].empty?
@@ -220,7 +220,7 @@ BASH
220
220
 
221
221
  def check_r_packages(paths)
222
222
  cli.puts 'Looking for R packages:'
223
- %w(enveomics.R ape cluster vegan).each do |pkg|
223
+ %w(ape cluster vegan).each do |pkg|
224
224
  cli.print "Testing #{pkg}... "
225
225
  if test_r_package(cli, paths, pkg)
226
226
  cli.puts 'yes.'
@@ -25,6 +25,7 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
25
25
  %w[project_stats haai_distances aai_distances ani_distances clade_finding]
26
26
  .map { |i| ["run_#{i}", false] }
27
27
  ]
28
+ p_metadata[:ess_coll] = cli[:ess_coll]
28
29
  d_metadata = { run_distances: false }
29
30
  d_metadata[:run_mytaxa_scan] = false unless cli[:mytaxa]
30
31
  p = create_project(:assembly, p_metadata, d_metadata)
@@ -122,17 +122,18 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
122
122
  end
123
123
  end
124
124
  else
125
- # Fix estimate for Archaea
126
- if !d.metadata[:tax].nil? &&
127
- d.metadata[:tax].in?(Taxonomy.new('d:Archaea')) &&
128
- r.file_path(:bac_report).nil?
129
- scr = "#{MiGA.root_path}/utils/arch-ess-genes.rb"
125
+ # Fix estimate by domain
126
+ if !(tax = d.metadata[:tax]).nil? &&
127
+ %w[Archaea Bacteria].include?(tax[:d]) &&
128
+ r.file_path(:raw_report).nil?
129
+ scr = "#{MiGA.root_path}/utils/domain-ess-genes.rb"
130
130
  rep = r.file_path(:report)
131
131
  rc_p = File.expand_path('.miga_rc', ENV['HOME'])
132
132
  rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
133
- $stderr.print `#{rc} ruby '#{scr}' '#{rep}' '#{rep}.archaea'`
134
- r.add_file(:bac_report, "#{d.name}.ess/log")
135
- r.add_file(:report, "#{d.name}.ess/log.archaea")
133
+ $stderr.print `#{rc} ruby '#{scr}' \
134
+ '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
135
+ r.add_file(:raw_report, "#{d.name}.ess/log")
136
+ r.add_file(:report, "#{d.name}.ess/log.domain")
136
137
  end
137
138
  # Extract/compute quality values
138
139
  stats = {completeness: [0.0, '%'], contamination: [0.0, '%']}
@@ -24,6 +24,11 @@ module MiGA::Cli::Action::Wf
24
24
  opt.separator " FILES...: #{files_desc}"
25
25
  opt.separator ''
26
26
  opt.separator 'Workflow Control Options'
27
+ opt.on(
28
+ '-C', '--collection STRING',
29
+ 'Collection of essential genes to use as reference',
30
+ 'One of: dupont_2012 (default), lee_2019'
31
+ ) { |v| cli[:ess_coll] = v }
27
32
  if params[:ncbi]
28
33
  opt.on(
29
34
  '-T', '--ncbi-taxon STRING',
@@ -66,6 +66,7 @@ module MiGA::Cli::ObjectsHelper
66
66
  end
67
67
 
68
68
  def add_metadata(obj, cli = self)
69
+ raise "Unsupported object: #{obj.class}" unless obj.respond_to? :metadata
69
70
  cli[:metadata].split(',').each do |pair|
70
71
  (k,v) = pair.split('=')
71
72
  case v
@@ -25,10 +25,13 @@ module MiGA::Common::Format
25
25
  # Cleans a FastA file in place.
26
26
  def clean_fasta_file(file)
27
27
  tmp_fh = nil
28
+ tmp_path = nil
28
29
  begin
29
30
  if file =~ /\.gz/
30
31
  tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
31
- tmp_fh = Zlib::GzipWriter.open(tmp_path)
32
+ File.unlink tmp_path
33
+ tmp_path += '.gz'
34
+ tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
32
35
  fh = Zlib::GzipReader.open(file)
33
36
  else
34
37
  tmp_fh = Tempfile.new('MiGA')
@@ -50,7 +53,7 @@ module MiGA::Common::Format
50
53
  tmp_fh.print buffer.wrap_width(80)
51
54
  tmp_fh.close
52
55
  fh.close
53
- FileUtils.cp(tmp_path, file)
56
+ FileUtils.mv(tmp_path, file)
54
57
  ensure
55
58
  begin
56
59
  tmp_fh.close unless tmp_fh.nil?
data/lib/miga/daemon.rb CHANGED
@@ -285,10 +285,10 @@ class MiGA::Daemon < MiGA::MiGA
285
285
  if [nil, '', 0].include? job[:pid]
286
286
  job[:pid] = nil
287
287
  @jobs_to_run << job
288
- say "Unsuccessful #{job[:task_name]}, rescheduling."
288
+ say "Unsuccessful #{job[:task_name]}, rescheduling"
289
289
  else
290
290
  @jobs_running << job
291
- say "Spawned pid:#{job[:pid]} for #{job[:task_name]}."
291
+ say "Spawned pid:#{job[:pid]} for #{job[:task_name]}"
292
292
  end
293
293
  end
294
294
  end
@@ -4,7 +4,7 @@
4
4
  ##
5
5
  # Helper module including specific functions to handle datasets.
6
6
  module MiGA::Project::Dataset
7
-
7
+
8
8
  ##
9
9
  # Returns Array of MiGA::Dataset.
10
10
  def datasets
@@ -23,7 +23,7 @@ module MiGA::Project::Dataset
23
23
  def dataset_names_hash
24
24
  @dataset_names_hash ||= Hash[dataset_names.map{ |i| [i,true] }]
25
25
  end
26
-
26
+
27
27
  ##
28
28
  # Returns MiGA::Dataset.
29
29
  def dataset(name)
@@ -47,18 +47,19 @@ module MiGA::Project::Dataset
47
47
  end
48
48
  end
49
49
  end
50
-
50
+
51
51
  ##
52
52
  # Add dataset identified by +name+ and return MiGA::Dataset.
53
53
  def add_dataset(name)
54
54
  unless metadata[:datasets].include? name
55
55
  MiGA::Dataset.new(self, name)
56
56
  @metadata[:datasets] << name
57
+ @dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
57
58
  save
58
59
  end
59
60
  dataset(name)
60
61
  end
61
-
62
+
62
63
  ##
63
64
  # Unlink dataset identified by +name+ and return MiGA::Dataset.
64
65
  def unlink_dataset(name)
@@ -68,7 +69,7 @@ module MiGA::Project::Dataset
68
69
  save
69
70
  d
70
71
  end
71
-
72
+
72
73
  ##
73
74
  # Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
74
75
  # supported by File#generic_transfer.
@@ -116,7 +117,7 @@ module MiGA::Project::Dataset
116
117
  end
117
118
  datasets.uniq - metadata[:datasets]
118
119
  end
119
-
120
+
120
121
  ##
121
122
  # Are all the datasets in the project preprocessed? Save intermediate results
122
123
  # if +save+ (until the first incomplete dataset is reached).
@@ -149,6 +150,6 @@ module MiGA::Project::Dataset
149
150
  def each_dataset_profile_advance(&blk)
150
151
  each_dataset { |ds| blk.call(ds.profile_advance) }
151
152
  end
152
-
153
+
153
154
  end
154
155
 
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.5, 0, 0]
13
+ VERSION = [0.5, 1, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2019, 11, 25)
21
+ VERSION_DATE = Date.new(2020, 1, 6)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
@@ -22,18 +22,19 @@ fi
22
22
  # Find and extract essential genes
23
23
  [[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
24
24
  mkdir "${DATASET}.ess"
25
- TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
25
+ TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" \
26
26
  --metadata "type" | awk '{print $2}')
27
+ COLL=$(miga about -P "$PROJECT" -m ess_coll)
28
+ [[ "$COLL" == "?" ]] && COLL=dupont_2012
29
+ CMD="HMM.essential.rb \
30
+ -i '$FAA' -o '${DATASET}.ess.faa' -m '${DATASET}.ess/' \
31
+ -t '$CORES' -r '$DATASET' --collection '$COLL'"
27
32
  if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
28
- HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
29
- -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
30
- > "${DATASET}.ess/log"
33
+ CMD="$CMD --metagenome"
31
34
  else
32
- HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
33
- -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
34
- --alignments "${DATASET}.ess/proteins.aln" \
35
- > "${DATASET}.ess/log"
35
+ CMD="$CMD --alignments '${DATASET}.ess/proteins.aln'"
36
36
  fi
37
+ $CMD > "${DATASET}.ess/log"
37
38
 
38
39
  # Reduce files
39
40
  if exists "$DATASET".ess/*.faa ; then
data/scripts/mytaxa.bash CHANGED
@@ -38,7 +38,9 @@ else
38
38
  fi
39
39
 
40
40
  # Execute search
41
- diamond blastp -q "../../../06.cds/$DATASET.faa" -d "$MT/AllGenomes.faa" \
41
+ FAA="../../../06.cds/$DATASET.faa"
42
+ [[ -s "$FAA" ]] || FAA="${FAA}.gz"
43
+ diamond blastp -q "$FAA" -d "$MT/AllGenomes.faa" \
42
44
  -a "$DATASET.daa" -k 5 -p "$CORES" --min-score 60
43
45
  diamond view -a "$DATASET.daa" -o "$DATASET.blast"
44
46
 
@@ -39,12 +39,13 @@ else
39
39
  exit 1
40
40
  fi
41
41
 
42
+ FAA="../../../06.cds/$DATASET.faa"
43
+ [[ -s "$FAA" ]] || FAA="${FAA}.gz"
42
44
  if [[ ! -s "$DATASET.mytaxa" ]] ; then
43
45
  # Execute search
44
46
  if [[ ! -s "$DATASET.blast" ]] ; then
45
- diamond blastp -q "../../../06.cds/$DATASET.faa" \
46
- -d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60 \
47
- -a "$DATASET.daa" -t "$TMPDIR"
47
+ diamond blastp -q "$FAA" -a "$DATASET.daa" -t "$TMPDIR" \
48
+ -d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
48
49
  diamond view -a "$DATASET.daa" -o "$DATASET.blast" -t "$TMPDIR"
49
50
  fi
50
51
 
@@ -53,8 +54,7 @@ else
53
54
  | sort -k 13 > "$DATASET.mytaxain"
54
55
  "$MT/MyTaxa" "$DATASET.mytaxain" "$DATASET.mytaxa" "0.5"
55
56
  fi
56
- ruby "$MIGA/utils/mytaxa_scan.rb" "../../../06.cds/$DATASET.faa" \
57
- "$DATASET.mytaxa" "$DATASET.wintax"
57
+ ruby "$MIGA/utils/mytaxa_scan.rb" "$FAA" "$DATASET.mytaxa" "$DATASET.wintax"
58
58
  echo "
59
59
  source('$MIGA/utils/mytaxa_scan.R');
60
60
  pdf('$DATASET.pdf', 12, 7);
@@ -70,11 +70,18 @@ else
70
70
  let i=$i+1
71
71
  awk "NR==$win" "$DATASET.wintax.genes" | tr "\\t" "\\n" \
72
72
  > "$DATASET.reg/$i.ids"
73
- FastA.filter.pl -q "$DATASET.reg/$i.ids" \
74
- "../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
73
+ if [[ "$FAA" == *.gz ]] ; then
74
+ gzip -c -d "$FAA" \
75
+ | FastA.filter.pl -q "$DATASET.reg/$i.ids" /dev/stdin \
76
+ > "$DATASET.reg/$i.faa"
77
+ else
78
+ FastA.filter.pl -q "$DATASET.reg/$i.ids" "$FAA" \
79
+ > "$DATASET.reg/$i.faa"
80
+ fi
75
81
  done
76
82
  # Archive regions
77
- tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
83
+ tar -cf "$DATASET.reg.tar" "$DATASET.reg"
84
+ gzip -9 "$DATASET.reg.tar"
78
85
  rm -r "$DATASET.reg"
79
86
  fi
80
87
 
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ esslog = ARGV.shift
4
+ outlog = ARGV.shift
5
+ domain = ARGV.shift
6
+
7
+ def quality(hsh)
8
+ q = {}
9
+ q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
10
+ q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
11
+ q[:cmp] = 100.0*q[:found].to_f/hsh.size
12
+ q[:cnt] = 100.0*q[:multi].to_f/hsh.size
13
+ q
14
+ end
15
+
16
+ # Find collection and detected anomalies
17
+ cnt_ref = {}
18
+ at = :header
19
+ collection = 'dupont_2012'
20
+ File.open(esslog, 'r') do |fh|
21
+ fh.each_line do |ln|
22
+ v = ln.chomp.gsub(/^! +/, '')
23
+ if v == 'Multiple copies: '
24
+ at = :multi
25
+ elsif v == 'Missing genes: '
26
+ at = :missing
27
+ elsif v =~ /Collection: (\S+)/
28
+ collection = $1
29
+ elsif at == :multi
30
+ v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
31
+ cnt_ref[$2] = $1.to_i
32
+ elsif at == :missing
33
+ v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
34
+ cnt_ref[$1] = 0
35
+ end
36
+ end
37
+ end
38
+
39
+ # Find expected genes for domain
40
+ n_dom = Hash[
41
+ `HMM.essential.rb -L -q '-#{domain}' -c '#{collection}'`
42
+ .chomp.split("\n").map { |i| i.split("\t") }
43
+ ]
44
+ l_dom = n_dom.keys
45
+ cnt_dom = {}
46
+ l_dom.each { |i| cnt_dom[i] = cnt_ref[i] || 1 }
47
+
48
+ # Correct report
49
+ q = quality(cnt_dom)
50
+ File.open(outlog, 'w') do |ofh|
51
+ ofh.puts "! Collection: #{collection} #{domain}"
52
+ ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_dom.size}."
53
+ ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
54
+ ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
55
+ if q[:multi] > 0
56
+ ofh.puts "! Multiple copies: "
57
+ cnt_dom.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_dom[k]}." if v>1 }
58
+ end
59
+ if q[:found] < cnt_dom.size
60
+ ofh.puts "! Missing genes: "
61
+ cnt_dom.each{ |k,v| ofh.puts "! #{k}: #{n_dom[k]}." if v==0 }
62
+ end
63
+ end
@@ -371,8 +371,18 @@
371
371
  "source_url": "http://hmmer.janelia.org/software"
372
372
  }
373
373
  ],
374
- "cite": [["Eddy, 2011, PLoS CB",
375
- "http://dx.doi.org/10.1371/journal.pcbi.1002195"]],
374
+ "cite": [
375
+ ["Eddy, 2011, PLoS CB",
376
+ "http://dx.doi.org/10.1371/journal.pcbi.1002195"],
377
+ ["Dupont et al, 2012, ISME J",
378
+ "https://doi.org/10.1038/ismej.2011.189"],
379
+ ["Rodriguez-R et al, 2015, ISME J",
380
+ "https://doi.org/10.1038/ismej.2015.5"],
381
+ ["Lee, 2019, Bioinf",
382
+ "https://doi.org/10.1093/bioinformatics/btz188"],
383
+ ["Eren et al, 2015, PeerJ",
384
+ "https://doi.org/10.7717/peerj.1319"]
385
+ ],
376
386
  "options": [
377
387
  {
378
388
  "name": "Input file",
@@ -381,6 +391,15 @@
381
391
  "mandatory": true,
382
392
  "description": "FastA file containing all the proteins in the genome."
383
393
  },
394
+ {
395
+ "opt": "--collection",
396
+ "arg": "string",
397
+ "default": "dupont_2012",
398
+ "description": ["Reference collection of essential proteins to use.",
399
+ "One of: dupont_2012 (default, Dupont et al 2012 modified by",
400
+ "Rodriguez-R et al 2015), or lee_2019 (Lee 2019 modified by Eren",
401
+ "et al 2015)."]
402
+ },
384
403
  {
385
404
  "name": "Output file",
386
405
  "opt": "--out",
@@ -64,15 +64,15 @@
64
64
  "task": "HMM.essential.rb",
65
65
  "description": ["Typical single-copy bacterial genes present in",
66
66
  "Mycoplasma genitalium."],
67
- "values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
68
- null,null,null,null,null,null,null,null]
67
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
68
+ null,null,null,null,null,null,null,null,null]
69
69
  },
70
70
  {
71
71
  "task": "HMM.essential.rb",
72
72
  "description": ["Typical single-copy archaeal genes present in",
73
73
  "Nanoarchaeum equitans."],
74
- "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
75
- null,null,null,null,null,null,null,null]
74
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,null,null,true,null,
75
+ null,null,null,null,null,null,null,null,null]
76
76
  },
77
77
  {
78
78
  "task": "Newick.autoprune.R",
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
1
+ ../../Scripts/FastA.N50.pl
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
1
+ ../../Scripts/FastA.filterN.pl
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
1
+ ../../Scripts/FastA.length.pl
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
1
+ ../../Scripts/FastA.split.pl
@@ -10,7 +10,8 @@ use 'zlib'
10
10
 
11
11
  o = {
12
12
  bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
13
- archaea: false, genomeeq: false, metagenome: false, list: false
13
+ archaea: false, genomeeq: false, metagenome: false, list: false,
14
+ collection: 'dupont_2012'
14
15
  }
15
16
  OptionParser.new do |opts|
16
17
  opts.banner = "
@@ -33,7 +34,15 @@ Usage: #{$0} [options]"
33
34
  'Path to the FastA file (.gz allowed) with all the proteins in a genome'
34
35
  ) { |v| o[:in] = v }
35
36
  opts.separator ''
36
- opts.separator 'Report Options'
37
+ opts.separator 'Options'
38
+ opts.on(
39
+ '-c', '--collection STR',
40
+ 'Reference collection of essential proteins to use. One of:',
41
+ '> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
42
+ ' modified by https://doi.org/10.1038/ismej.2015.5',
43
+ '> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
44
+ ' modified by https://doi.org/10.7717/peerj.1319'
45
+ ) { |v| o[:collection] = v }
37
46
  opts.on(
38
47
  '-o', '--out FILE',
39
48
  'Path to the output FastA file with the translated essential genes',
@@ -117,20 +126,44 @@ abort '-i is mandatory' if o[:in].nil? and not o[:list]
117
126
  o[:bin] = o[:bin] + '/' if o[:bin].size > 0
118
127
  o[:rename] = nil if o[:metagenome]
119
128
 
120
- not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
121
- TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062 TIGR00082 TIGR00086
122
- TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158 TIGR00165 TIGR00166 TIGR00168
123
- TIGR00362 TIGR00388 TIGR00396 TIGR00409 TIGR00418 TIGR00420 TIGR00422 TIGR00436
124
- TIGR00459 TIGR00460 TIGR00472 TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663
125
- TIGR00775 TIGR00810 TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964
126
- TIGR00967 TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
127
- TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059 TIGR01063
128
- TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169 TIGR01171 TIGR01391
129
- TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013 TIGR02027 TIGR02191 TIGR02350
130
- TIGR02386 TIGR02387 TIGR02397 TIGR02432 TIGR02729 TIGR03263 TIGR03594}
131
- not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
132
- not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
133
- TIGR00389 TIGR00436 tRNA-synth_1d}
129
+ case o[:collection]
130
+ when 'dupont_2012'
131
+ not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
132
+ TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
133
+ TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
134
+ TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
135
+ TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
136
+ TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
137
+ TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
138
+ TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
139
+ TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
140
+ TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
141
+ TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
142
+ TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
143
+ TIGR02729 TIGR03263 TIGR03594}
144
+ not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
145
+ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
146
+ TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d}
147
+ when 'lee_2019'
148
+ not_in_archaea = %w{ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
149
+ EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
150
+ Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
151
+ Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
152
+ Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
153
+ Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
154
+ RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC}
155
+ not_in_bacteria = %w{AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
156
+ CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
157
+ DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
158
+ Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
159
+ Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
160
+ Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
161
+ Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
162
+ Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I}
163
+ not_as_genomeeq = not_in_archaea + not_in_bacteria
164
+ else
165
+ raise "Unsupported collection: '#{o[:collection]}'"
166
+ end
134
167
 
135
168
  begin
136
169
  Dir.mktmpdir do |dir|
@@ -148,7 +181,8 @@ begin
148
181
  models = {}
149
182
  model_id = nil
150
183
  dbh = File.open("#{dir}/essential.hmm", 'w')
151
- o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
184
+ o[:model_file] ||= File.expand_path(
185
+ "../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
152
186
  mfh = (File.extname(o[:model_file]) == '.gz') ?
153
187
  Zlib::GzipReader.open(o[:model_file]) :
154
188
  File.open(o[:model_file], 'r')
@@ -201,6 +235,9 @@ begin
201
235
  # Report statistics
202
236
  if o[:stats]
203
237
  reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
238
+ modifiers = [:bacteria, :archaea, :genomeeq]
239
+ .map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
240
+ reph.puts "! Collection: #{o[:collection]} #{modifiers}"
204
241
  if o[:metagenome]
205
242
  reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
206
243
  gc = [0] * (models.size - genes.size) +
@@ -1 +1 @@
1
- utils/enveomics/Scripts/lib/../../enveomics.R
1
+ ../../enveomics.R
@@ -1,5 +1,5 @@
1
1
  Package: enveomics.R
2
- Version: 1.7.0
2
+ Version: 1.7.1
3
3
  Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
4
4
  email="lmrodriguezr@gmail.com"))
5
5
  Title: Various Utilities for Microbial Genomics and Metagenomics
@@ -25,25 +25,24 @@
25
25
 
26
26
  enve.df2dist <- function(
27
27
  x,
28
- obj1.index=1,
29
- obj2.index=2,
30
- dist.index=3,
31
- default.d=NA,
32
- max.sim=0
28
+ obj1.index = 1,
29
+ obj2.index = 2,
30
+ dist.index = 3,
31
+ default.d = NA,
32
+ max.sim = 0
33
33
  ){
34
- x <- as.data.frame(x);
35
- a <- as.character(x[, obj1.index]);
36
- b <- as.character(x[, obj2.index]);
37
- d <- as.double(x[, dist.index]);
38
- if(max.sim!=0) d <- (max.sim - d)/max.sim
39
- ids <- unique(c(a,b));
40
- m <- matrix(default.d, nrow=length(ids), ncol=length(ids), dimnames=list(ids, ids));
34
+ x <- as.data.frame(x)
35
+ a <- as.character(x[, obj1.index])
36
+ b <- as.character(x[, obj2.index])
37
+ d <- as.double(x[, dist.index])
38
+ if(max.sim != 0) d <- (max.sim - d) / max.sim
39
+ ids <- unique(c(a,b))
40
+ m <- matrix(default.d,
41
+ nrow = length(ids), ncol = length(ids), dimnames = list(ids, ids))
41
42
  diag(m) <- 0.0
42
- for(i in 1:nrow(x)){
43
- m[a[i], b[i]] <- d[i];
44
- }
45
- m <- pmin(m, t(m), na.rm=TRUE)
46
- return(as.dist(m));
43
+ m[cbind(a,b)] <- d
44
+ m <- pmin(m, t(m), na.rm = TRUE)
45
+ return(as.dist(m))
47
46
  }
48
47
 
49
48
  #' Enveomics: Data Frame to Dist (Group)
@@ -666,15 +666,16 @@ enve.recplot2.findPeaks <- function(
666
666
  #' A vector of number of components to evaluate.
667
667
  #' @param criterion
668
668
  #' Criterion to use for components selection. Must be one of:
669
- #' \code{aic} (Akaike Information Criterion),
670
- #' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
669
+ #' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
670
+ #' (Bayesian Information Criterion or Schwarz Criterion).
671
671
  #' @param merge.tol
672
672
  #' When attempting to merge peaks with very similar sequencing depth, use
673
673
  #' this number of significant digits (in log-scale).
674
674
  #' @param verbose
675
675
  #' Display (mostly debugging) information.
676
676
  #' @param ...
677
- #' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
677
+ #' Any additional parameters supported by
678
+ #' \code{\link{enve.recplot2.findPeaks.em}}.
678
679
  #'
679
680
  #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
680
681
  #'
@@ -684,10 +685,10 @@ enve.recplot2.findPeaks <- function(
684
685
 
685
686
  enve.recplot2.findPeaks.emauto <- function(
686
687
  x,
687
- components=seq(1,10),
688
- criterion='aic',
689
- merge.tol=2L,
690
- verbose=FALSE,
688
+ components = seq(1, 5),
689
+ criterion = 'aic',
690
+ merge.tol = 2L,
691
+ verbose = FALSE,
691
692
  ...
692
693
  ){
693
694
  best <- list(crit=0, pstore=list())
@@ -758,19 +759,19 @@ enve.recplot2.findPeaks.emauto <- function(
758
759
 
759
760
  enve.recplot2.findPeaks.em <- function(
760
761
  x,
761
- max.iter=1000,
762
- ll.diff.res=1e-8,
763
- components=2,
764
- rm.top=0.05,
765
- verbose=FALSE,
762
+ max.iter = 1000,
763
+ ll.diff.res = 1e-8,
764
+ components = 2,
765
+ rm.top = 0.05,
766
+ verbose = FALSE,
766
767
  init,
767
- log=TRUE
768
+ log = TRUE
768
769
  ){
769
770
 
770
771
  # Essential vars
771
772
  pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
772
773
  lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
773
- lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
774
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
774
775
  if(log) lsd1 <- log(lsd1)
775
776
 
776
777
  # 1. Initialize
@@ -779,7 +780,7 @@ enve.recplot2.findPeaks.em <- function(
779
780
  init <- list(
780
781
  mu = tapply(lsd1, km.clust, mean),
781
782
  sd = tapply(lsd1, km.clust, sd),
782
- alpha = table(km.clust)/length(km.clust)
783
+ alpha = table(km.clust) / length(km.clust)
783
784
  )
784
785
  }
785
786
  m.step <- init
@@ -795,6 +796,7 @@ enve.recplot2.findPeaks.em <- function(
795
796
  ll.diff <- abs(cur.ll - e.step[["ll"]])
796
797
  cur.ll <- e.step[["ll"]]
797
798
  if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
799
+ if(is.na(ll.diff) || ll.diff == Inf) break
798
800
  if(ll.diff <= ll.diff.res) break
799
801
  }
800
802
 
@@ -1431,6 +1433,9 @@ enve.recplot2.findPeaks.__em_e <- function
1431
1433
  theta[['sd']][i])*theta[['alpha']][i]))
1432
1434
  sum.of.components <- rowSums(product)
1433
1435
  posterior <- product / sum.of.components
1436
+ for(i in which(sum.of.components == Inf)) {
1437
+ cat(i,'/',nrow(product), ':', product[i,], '\n')
1438
+ }
1434
1439
 
1435
1440
  return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1436
1441
  }
@@ -52,6 +52,7 @@ For additional information on recruitment plots, see the
52
52
  [Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
53
53
 
54
54
  ## Changelog
55
+ * 1.7.1: Improved efficiency of `enve.df2dist` about five-fold.
55
56
  * 1.7.0: Uniformized output for `enve.recplot2.extractWindows` and
56
57
  `enve.recplot2.coordinates` to ease automation. Thanks to Tomeu Viver and
57
58
  Roth Conrad for troubleshooting.
@@ -4,7 +4,7 @@
4
4
  \alias{enve.recplot2.findPeaks.emauto}
5
5
  \title{Enveomics: Recruitment Plot (2) Emauto Peak Finder}
6
6
  \usage{
7
- enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
7
+ enve.recplot2.findPeaks.emauto(x, components = seq(1, 5),
8
8
  criterion = "aic", merge.tol = 2L, verbose = FALSE, ...)
9
9
  }
10
10
  \arguments{
@@ -13,15 +13,16 @@ enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
13
13
  \item{components}{A vector of number of components to evaluate.}
14
14
 
15
15
  \item{criterion}{Criterion to use for components selection. Must be one of:
16
- \code{aic} (Akaike Information Criterion),
17
- \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).}
16
+ \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
17
+ (Bayesian Information Criterion or Schwarz Criterion).}
18
18
 
19
19
  \item{merge.tol}{When attempting to merge peaks with very similar sequencing depth, use
20
20
  this number of significant digits (in log-scale).}
21
21
 
22
22
  \item{verbose}{Display (mostly debugging) information.}
23
23
 
24
- \item{...}{Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.}
24
+ \item{...}{Any additional parameters supported by
25
+ \code{\link{enve.recplot2.findPeaks.em}}.}
25
26
  }
26
27
  \value{
27
28
  Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
data/utils/find-medoid.R CHANGED
@@ -7,7 +7,12 @@
7
7
  #= Load stuff
8
8
  argv <- commandArgs(trailingOnly = T)
9
9
  suppressPackageStartupMessages(library(ape))
10
- suppressPackageStartupMessages(library(enveomics.R))
10
+ if(Sys.getenv('MIGA') == ''){
11
+ suppressPackageStartupMessages(library(enveomics.R))
12
+ }else{
13
+ source(file.path(Sys.getenv('MIGA'),
14
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
15
+ }
11
16
 
12
17
  find_medoids <- function(ani.df, out, clades) {
13
18
  if(nrow(ani.df) == 0) return(NULL)
data/utils/mytaxa_scan.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'zlib'
4
+
3
5
  abort "
4
6
  Usage:
5
7
  #{$0} {FastA file} {MyTaxa file} {Data output}
@@ -7,52 +9,53 @@ Usage:
7
9
  " if ARGV[2].nil?
8
10
 
9
11
  begin
10
- # Get arguments
11
- faa, mytaxa, outdata = ARGV
12
- winsize = 10
13
-
14
- # Extract gene IDs
15
- ids = File.open(faa).grep(/^>/).map{|dl| dl.chomp.sub(/^>/,"").sub(/\s.*/,"")}
16
- tax = Hash[ids.map{|k| [k, "NA"]}]
17
-
18
- # Get MyTaxa distributions
19
- k, l = nil
20
- File.open(mytaxa).each do |ln|
21
- ln.chomp!
22
- if $.%2 == 1
23
- k, l = ln.split /\t/
24
- else
25
- tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
26
- end
27
- end
28
- all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
29
-
30
- # Estimate Windows and save gene IDs
31
- fh = File.open(outdata + ".genes", "w")
32
- c = []
33
- c << all_tax.map{|t| tax.values.count(t) }
34
- n_wins = (ids.size/winsize).ceil
35
- (0 .. (n_wins-1)).each do |win|
36
- k = ids[win*winsize, winsize]
37
- win_t = tax.values_at(*k)
38
- fh.puts k.join("\t")
39
- c << all_tax.map{|t| win_t.count(t)}
40
- end
41
- p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
42
- fh.close
43
-
44
- # Save window profiles
45
- fh = File.open(outdata, "w")
46
- fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
47
- fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
48
- (0 .. (all_tax.size - 1)).each do |row|
49
- fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
50
- end
51
- fh.close
12
+ # Get arguments
13
+ faa, mytaxa, outdata = ARGV
14
+ winsize = 10
15
+
16
+ # Extract gene IDs
17
+ ifh = faa =~ /\.gz/ ? Zlib::GzipReader.open(faa) : File.open(faa, 'r')
18
+ ids = ifh.each_line.grep(/^>/).map{|dl| dl.chomp.sub(/^>/,'').sub(/\s.*/,'')}
19
+ ifh.close
20
+ tax = Hash[ids.map{|k| [k, "NA"]}]
21
+
22
+ # Get MyTaxa distributions
23
+ k, l = nil
24
+ File.open(mytaxa).each do |ln|
25
+ ln.chomp!
26
+ if $.%2 == 1
27
+ k, l = ln.split /\t/
28
+ else
29
+ tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
30
+ end
31
+ end
32
+ all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
33
+
34
+ # Estimate Windows and save gene IDs
35
+ fh = File.open(outdata + ".genes", "w")
36
+ c = []
37
+ c << all_tax.map{|t| tax.values.count(t) }
38
+ n_wins = (ids.size/winsize).ceil
39
+ (0 .. (n_wins-1)).each do |win|
40
+ k = ids[win*winsize, winsize]
41
+ win_t = tax.values_at(*k)
42
+ fh.puts k.join("\t")
43
+ c << all_tax.map{|t| win_t.count(t)}
44
+ end
45
+ p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
46
+ fh.close
47
+
48
+ # Save window profiles
49
+ fh = File.open(outdata, "w")
50
+ fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
51
+ fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
52
+ (0 .. (all_tax.size - 1)).each do |row|
53
+ fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
54
+ end
55
+ fh.close
52
56
  rescue => err
53
- $stderr.puts "Exception: #{err}\n\n"
54
- err.backtrace.each { |l| $stderr.puts l + "\n" }
55
- err
57
+ $stderr.puts "Exception: #{err}\n\n"
58
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
59
+ err
56
60
  end
57
61
 
58
-
data/utils/ref-tree.R CHANGED
@@ -7,7 +7,12 @@
7
7
  #= Load stuff
8
8
  argv <- commandArgs(trailingOnly=T)
9
9
  suppressPackageStartupMessages(library(ape))
10
- suppressPackageStartupMessages(library(enveomics.R))
10
+ if(Sys.getenv('MIGA') == ''){
11
+ suppressPackageStartupMessages(library(enveomics.R))
12
+ }else{
13
+ source(file.path(Sys.getenv('MIGA'),
14
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
15
+ }
11
16
  inst <- c("phangorn", "phytools") %in% rownames(installed.packages())
12
17
  if(inst[1]){
13
18
  suppressPackageStartupMessages(library(phangorn))
data/utils/subclades-nj.R CHANGED
@@ -12,7 +12,12 @@ suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(phytools))
13
13
  suppressPackageStartupMessages(library(phangorn))
14
14
  suppressPackageStartupMessages(library(parallel))
15
- suppressPackageStartupMessages(library(enveomics.R))
15
+ if(Sys.getenv('MIGA') == ''){
16
+ suppressPackageStartupMessages(library(enveomics.R))
17
+ }else{
18
+ source(file.path(Sys.getenv('MIGA'),
19
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
20
+ }
16
21
 
17
22
  #= Main function
18
23
  subclades <- function(ani_file, out_base, thr=1, ani=c()) {
data/utils/subclades.R CHANGED
@@ -10,7 +10,12 @@ suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(parallel))
13
- suppressPackageStartupMessages(library(enveomics.R))
13
+ if(Sys.getenv('MIGA') == ''){
14
+ suppressPackageStartupMessages(library(enveomics.R))
15
+ }else{
16
+ source(file.path(Sys.getenv('MIGA'),
17
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
18
+ }
14
19
 
15
20
  #= Main function
16
21
  subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0.0
4
+ version: 0.5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-25 00:00:00.000000000 Z
11
+ date: 2020-01-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -197,7 +197,6 @@ files:
197
197
  - test/taxonomy_test.rb
198
198
  - test/test_helper.rb
199
199
  - utils/adapters.fa
200
- - utils/arch-ess-genes.rb
201
200
  - utils/cleanup-databases.rb
202
201
  - utils/core-pan-plot.R
203
202
  - utils/distance/base.rb
@@ -207,6 +206,7 @@ files:
207
206
  - utils/distance/runner.rb
208
207
  - utils/distance/temporal.rb
209
208
  - utils/distances.rb
209
+ - utils/domain-ess-genes.rb
210
210
  - utils/enveomics/Docs/recplot2.md
211
211
  - utils/enveomics/Examples/aai-matrix.bash
212
212
  - utils/enveomics/Examples/ani-matrix.bash
@@ -356,7 +356,8 @@ files:
356
356
  - utils/enveomics/Scripts/clust.rand.rb
357
357
  - utils/enveomics/Scripts/gi2tax.rb
358
358
  - utils/enveomics/Scripts/in_silico_GA_GI.pl
359
- - utils/enveomics/Scripts/lib/data/essential.hmm.gz
359
+ - utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz
360
+ - utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz
360
361
  - utils/enveomics/Scripts/lib/enveomics.R
361
362
  - utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
362
363
  - utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
@@ -514,8 +515,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
514
515
  - !ruby/object:Gem::Version
515
516
  version: '0'
516
517
  requirements: []
517
- rubyforge_project:
518
- rubygems_version: 2.7.6
518
+ rubygems_version: 3.0.3
519
519
  signing_key:
520
520
  specification_version: 4
521
521
  summary: MiGA
@@ -1,57 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- esslog = ARGV.shift
4
- outlog = ARGV.shift
5
- l_all = `HMM.essential.rb -l -q`.chomp.split("\n").map{ |i| i.gsub(/\t.*/,"") }
6
- n_arc = Hash[
7
- `HMM.essential.rb -l -q -A`.chomp.split("\n").map{ |i| i.split("\t") }
8
- ]
9
- l_arc = n_arc.keys
10
-
11
- def quality(hsh)
12
- q = {}
13
- q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
14
- q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
15
- q[:cmp] = 100.0*q[:found].to_f/hsh.size
16
- q[:cnt] = 100.0*q[:multi].to_f/hsh.size
17
- q
18
- end
19
-
20
- cnt_ref = {}
21
- l_all.each{ |i| cnt_ref[i] = 1 }
22
-
23
- at = :header
24
- File.open(esslog, "r") do |fh|
25
- fh.each_line do |ln|
26
- v = ln.chomp.gsub(/^! +/, "")
27
- if v=="Multiple copies: "
28
- at = :multi
29
- elsif v=="Missing genes: "
30
- at = :missing
31
- elsif at==:multi
32
- v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
33
- cnt_ref[$2] = $1.to_i
34
- elsif at==:missing
35
- v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
36
- cnt_ref[$1] = 0
37
- end
38
- end
39
- end
40
-
41
- cnt_arc = {}
42
- l_arc.each{ |i| cnt_arc[i] = cnt_ref[i] }
43
-
44
- q = quality(cnt_arc)
45
- File.open(outlog, "w") do |ofh|
46
- ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_arc.size}."
47
- ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
48
- ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
49
- if q[:multi] > 0
50
- ofh.puts "! Multiple copies: "
51
- cnt_arc.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_arc[k]}." if v>1 }
52
- end
53
- if q[:found] < cnt_arc.size
54
- ofh.puts "! Missing genes: "
55
- cnt_arc.each{ |k,v| ofh.puts "! #{k}: #{n_arc[k]}." if v==0 }
56
- end
57
- end