miga-base 0.5.0.0 → 0.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor.rb +6 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/cli/action/quality_wf.rb +1 -0
  5. data/lib/miga/cli/action/stats.rb +9 -8
  6. data/lib/miga/cli/action/wf.rb +5 -0
  7. data/lib/miga/cli/objects_helper.rb +1 -0
  8. data/lib/miga/common/format.rb +5 -2
  9. data/lib/miga/daemon.rb +2 -2
  10. data/lib/miga/project/dataset.rb +8 -7
  11. data/lib/miga/version.rb +2 -2
  12. data/scripts/essential_genes.bash +9 -8
  13. data/scripts/mytaxa.bash +3 -1
  14. data/scripts/mytaxa_scan.bash +15 -8
  15. data/utils/domain-ess-genes.rb +63 -0
  16. data/utils/enveomics/Manifest/Tasks/other.json +21 -2
  17. data/utils/enveomics/Manifest/examples.json +4 -4
  18. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
  19. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
  20. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
  21. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
  22. data/utils/enveomics/Scripts/HMM.essential.rb +54 -17
  23. data/utils/enveomics/Scripts/lib/data/{essential.hmm.gz → dupont_2012_essential.hmm.gz} +0 -0
  24. data/utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz +0 -0
  25. data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
  26. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  27. data/utils/enveomics/enveomics.R/R/df2dist.R +16 -17
  28. data/utils/enveomics/enveomics.R/R/recplot2.R +20 -15
  29. data/utils/enveomics/enveomics.R/README.md +1 -0
  30. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +5 -4
  31. data/utils/find-medoid.R +6 -1
  32. data/utils/mytaxa_scan.rb +49 -46
  33. data/utils/ref-tree.R +6 -1
  34. data/utils/subclades-nj.R +6 -1
  35. data/utils/subclades.R +6 -1
  36. metadata +6 -6
  37. data/utils/arch-ess-genes.rb +0 -57
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e370d282f1b28480765e1b91fcb7d8921d12baa31d22db1318975a1c2a79e19a
4
- data.tar.gz: e7fb3941fd3381e0e9696a2c577aeb157657335e56434e7c6d6650be7ba45e98
3
+ metadata.gz: f6888c1ce3756b8cc708736c0da052e5a7396277e0c903ebcfc083f17b6915e7
4
+ data.tar.gz: d998f6e087316a81de4aa8897452344c1987ce0cb9807f4a1e11a29f52dfbcf2
5
5
  SHA512:
6
- metadata.gz: 4642a212e1b4021e211fd144b515ff49e9ddb7a9b2292430553307a7ae165e4d8d5e6fd8426757f15ea6e70f4c3efbb055e0439497172cc1f91186d522c82635
7
- data.tar.gz: 8d5d3ded3c03e56505572102110a4bca4b84d06b2e73bcf208856610a4cd6e60092ce6d54d47dcef2c8acf85f1cce5f8461097e8699344df6738ed8493215112
6
+ metadata.gz: c6f7f8af791664b2bb704744535e0e39c4d5fc06521beb8feb57f658d6187a667100fde4312a3a8e5f47f5dd9d4b3c06326584d95e80262dc0e02a91795e192c
7
+ data.tar.gz: 4f633972d8ccc1cc06cc14ca6c48b50759d63af72ba75514a0e877a58af4e1d407fda2c2608c2077dd541e30e86add8359bc2e0838d93a19f7cc1bd5c5f5fff2
@@ -104,6 +104,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
104
104
  unless ok
105
105
  cli.say " > Registering again #{d.name}:#{r_k}"
106
106
  d.add_result(r_k, true, force: true)
107
+ sr = d.result(:stats) and sr.remove!
107
108
  end
108
109
  end
109
110
  end
@@ -123,7 +124,10 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
123
124
  changed = true
124
125
  end
125
126
  end
126
- d.add_result(:cds, true, force: true) if changed
127
+ if changed
128
+ d.add_result(:cds, true, force: true)
129
+ sr = d.result(:stats) and sr.remove!
130
+ end
127
131
  end
128
132
  end
129
133
 
@@ -136,6 +140,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
136
140
  if dir.nil?
137
141
  cli.say " > Removing #{d.name}:essential_genes"
138
142
  res.remove!
143
+ sr = d.result(:stats) and sr.remove!
139
144
  next
140
145
  end
141
146
  next if Dir["#{dir}/*.faa"].empty?
@@ -220,7 +220,7 @@ BASH
220
220
 
221
221
  def check_r_packages(paths)
222
222
  cli.puts 'Looking for R packages:'
223
- %w(enveomics.R ape cluster vegan).each do |pkg|
223
+ %w(ape cluster vegan).each do |pkg|
224
224
  cli.print "Testing #{pkg}... "
225
225
  if test_r_package(cli, paths, pkg)
226
226
  cli.puts 'yes.'
@@ -25,6 +25,7 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
25
25
  %w[project_stats haai_distances aai_distances ani_distances clade_finding]
26
26
  .map { |i| ["run_#{i}", false] }
27
27
  ]
28
+ p_metadata[:ess_coll] = cli[:ess_coll]
28
29
  d_metadata = { run_distances: false }
29
30
  d_metadata[:run_mytaxa_scan] = false unless cli[:mytaxa]
30
31
  p = create_project(:assembly, p_metadata, d_metadata)
@@ -122,17 +122,18 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
122
122
  end
123
123
  end
124
124
  else
125
- # Fix estimate for Archaea
126
- if !d.metadata[:tax].nil? &&
127
- d.metadata[:tax].in?(Taxonomy.new('d:Archaea')) &&
128
- r.file_path(:bac_report).nil?
129
- scr = "#{MiGA.root_path}/utils/arch-ess-genes.rb"
125
+ # Fix estimate by domain
126
+ if !(tax = d.metadata[:tax]).nil? &&
127
+ %w[Archaea Bacteria].include?(tax[:d]) &&
128
+ r.file_path(:raw_report).nil?
129
+ scr = "#{MiGA.root_path}/utils/domain-ess-genes.rb"
130
130
  rep = r.file_path(:report)
131
131
  rc_p = File.expand_path('.miga_rc', ENV['HOME'])
132
132
  rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
133
- $stderr.print `#{rc} ruby '#{scr}' '#{rep}' '#{rep}.archaea'`
134
- r.add_file(:bac_report, "#{d.name}.ess/log")
135
- r.add_file(:report, "#{d.name}.ess/log.archaea")
133
+ $stderr.print `#{rc} ruby '#{scr}' \
134
+ '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
135
+ r.add_file(:raw_report, "#{d.name}.ess/log")
136
+ r.add_file(:report, "#{d.name}.ess/log.domain")
136
137
  end
137
138
  # Extract/compute quality values
138
139
  stats = {completeness: [0.0, '%'], contamination: [0.0, '%']}
@@ -24,6 +24,11 @@ module MiGA::Cli::Action::Wf
24
24
  opt.separator " FILES...: #{files_desc}"
25
25
  opt.separator ''
26
26
  opt.separator 'Workflow Control Options'
27
+ opt.on(
28
+ '-C', '--collection STRING',
29
+ 'Collection of essential genes to use as reference',
30
+ 'One of: dupont_2012 (default), lee_2019'
31
+ ) { |v| cli[:ess_coll] = v }
27
32
  if params[:ncbi]
28
33
  opt.on(
29
34
  '-T', '--ncbi-taxon STRING',
@@ -66,6 +66,7 @@ module MiGA::Cli::ObjectsHelper
66
66
  end
67
67
 
68
68
  def add_metadata(obj, cli = self)
69
+ raise "Unsupported object: #{obj.class}" unless obj.respond_to? :metadata
69
70
  cli[:metadata].split(',').each do |pair|
70
71
  (k,v) = pair.split('=')
71
72
  case v
@@ -25,10 +25,13 @@ module MiGA::Common::Format
25
25
  # Cleans a FastA file in place.
26
26
  def clean_fasta_file(file)
27
27
  tmp_fh = nil
28
+ tmp_path = nil
28
29
  begin
29
30
  if file =~ /\.gz/
30
31
  tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
31
- tmp_fh = Zlib::GzipWriter.open(tmp_path)
32
+ File.unlink tmp_path
33
+ tmp_path += '.gz'
34
+ tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
32
35
  fh = Zlib::GzipReader.open(file)
33
36
  else
34
37
  tmp_fh = Tempfile.new('MiGA')
@@ -50,7 +53,7 @@ module MiGA::Common::Format
50
53
  tmp_fh.print buffer.wrap_width(80)
51
54
  tmp_fh.close
52
55
  fh.close
53
- FileUtils.cp(tmp_path, file)
56
+ FileUtils.mv(tmp_path, file)
54
57
  ensure
55
58
  begin
56
59
  tmp_fh.close unless tmp_fh.nil?
data/lib/miga/daemon.rb CHANGED
@@ -285,10 +285,10 @@ class MiGA::Daemon < MiGA::MiGA
285
285
  if [nil, '', 0].include? job[:pid]
286
286
  job[:pid] = nil
287
287
  @jobs_to_run << job
288
- say "Unsuccessful #{job[:task_name]}, rescheduling."
288
+ say "Unsuccessful #{job[:task_name]}, rescheduling"
289
289
  else
290
290
  @jobs_running << job
291
- say "Spawned pid:#{job[:pid]} for #{job[:task_name]}."
291
+ say "Spawned pid:#{job[:pid]} for #{job[:task_name]}"
292
292
  end
293
293
  end
294
294
  end
@@ -4,7 +4,7 @@
4
4
  ##
5
5
  # Helper module including specific functions handle datasets.
6
6
  module MiGA::Project::Dataset
7
-
7
+
8
8
  ##
9
9
  # Returns Array of MiGA::Dataset.
10
10
  def datasets
@@ -23,7 +23,7 @@ module MiGA::Project::Dataset
23
23
  def dataset_names_hash
24
24
  @dataset_names_hash ||= Hash[dataset_names.map{ |i| [i,true] }]
25
25
  end
26
-
26
+
27
27
  ##
28
28
  # Returns MiGA::Dataset.
29
29
  def dataset(name)
@@ -47,18 +47,19 @@ module MiGA::Project::Dataset
47
47
  end
48
48
  end
49
49
  end
50
-
50
+
51
51
  ##
52
52
  # Add dataset identified by +name+ and return MiGA::Dataset.
53
53
  def add_dataset(name)
54
54
  unless metadata[:datasets].include? name
55
55
  MiGA::Dataset.new(self, name)
56
56
  @metadata[:datasets] << name
57
+ @dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
57
58
  save
58
59
  end
59
60
  dataset(name)
60
61
  end
61
-
62
+
62
63
  ##
63
64
  # Unlink dataset identified by +name+ and return MiGA::Dataset.
64
65
  def unlink_dataset(name)
@@ -68,7 +69,7 @@ module MiGA::Project::Dataset
68
69
  save
69
70
  d
70
71
  end
71
-
72
+
72
73
  ##
73
74
  # Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any method
74
75
  # supported by File#generic_transfer.
@@ -116,7 +117,7 @@ module MiGA::Project::Dataset
116
117
  end
117
118
  datasets.uniq - metadata[:datasets]
118
119
  end
119
-
120
+
120
121
  ##
121
122
  # Are all the datasets in the project preprocessed? Save intermediate results
122
123
  # if +save+ (until the first incomplete dataset is reached).
@@ -149,6 +150,6 @@ module MiGA::Project::Dataset
149
150
  def each_dataset_profile_advance(&blk)
150
151
  each_dataset { |ds| blk.call(ds.profile_advance) }
151
152
  end
152
-
153
+
153
154
  end
154
155
 
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.5, 0, 0]
13
+ VERSION = [0.5, 1, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2019, 11, 25)
21
+ VERSION_DATE = Date.new(2020, 1, 6)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
@@ -22,18 +22,19 @@ fi
22
22
  # Find and extract essential genes
23
23
  [[ -d "${DATASET}.ess" ]] && rm -R "${DATASET}.ess"
24
24
  mkdir "${DATASET}.ess"
25
- TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
25
+ TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" \
26
26
  --metadata "type" | awk '{print $2}')
27
+ COLL=$(miga about -P "$PROJECT" -m ess_coll)
28
+ [[ "$COLL" == "?" ]] && COLL=dupont_2012
29
+ CMD="HMM.essential.rb \
30
+ -i '$FAA' -o '${DATASET}.ess.faa' -m '${DATASET}.ess/' \
31
+ -t '$CORES' -r '$DATASET' --collection '$COLL'"
27
32
  if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
28
- HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
29
- -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" --metagenome \
30
- > "${DATASET}.ess/log"
33
+ CMD="$CMD --metagenome"
31
34
  else
32
- HMM.essential.rb -i "$FAA" -o "${DATASET}.ess.faa" \
33
- -m "${DATASET}.ess/" -t "$CORES" -r "$DATASET" \
34
- --alignments "${DATASET}.ess/proteins.aln" \
35
- > "${DATASET}.ess/log"
35
+ CMD="$CMD --alignments '${DATASET}.ess/proteins.aln'"
36
36
  fi
37
+ $CMD > "${DATASET}.ess/log"
37
38
 
38
39
  # Reduce files
39
40
  if exists "$DATASET".ess/*.faa ; then
data/scripts/mytaxa.bash CHANGED
@@ -38,7 +38,9 @@ else
38
38
  fi
39
39
 
40
40
  # Execute search
41
- diamond blastp -q "../../../06.cds/$DATASET.faa" -d "$MT/AllGenomes.faa" \
41
+ FAA="../../../06.cds/$DATASET.faa"
42
+ [[ -s "$FAA" ]] || FAA="${FAA}.gz"
43
+ diamond blastp -q "$FAA" -d "$MT/AllGenomes.faa" \
42
44
  -a "$DATASET.daa" -k 5 -p "$CORES" --min-score 60
43
45
  diamond view -a "$DATASET.daa" -o "$DATASET.blast"
44
46
 
@@ -39,12 +39,13 @@ else
39
39
  exit 1
40
40
  fi
41
41
 
42
+ FAA="../../../06.cds/$DATASET.faa"
43
+ [[ -s "$FAA" ]] || FAA="${FAA}.gz"
42
44
  if [[ ! -s "$DATASET.mytaxa" ]] ; then
43
45
  # Execute search
44
46
  if [[ ! -s "$DATASET.blast" ]] ; then
45
- diamond blastp -q "../../../06.cds/$DATASET.faa" \
46
- -d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60 \
47
- -a "$DATASET.daa" -t "$TMPDIR"
47
+ diamond blastp -q "$FAA" -a "$DATASET.daa" -t "$TMPDIR" \
48
+ -d "$MT/AllGenomes.faa" -k 5 -p "$CORES" --min-score 60
48
49
  diamond view -a "$DATASET.daa" -o "$DATASET.blast" -t "$TMPDIR"
49
50
  fi
50
51
 
@@ -53,8 +54,7 @@ else
53
54
  | sort -k 13 > "$DATASET.mytaxain"
54
55
  "$MT/MyTaxa" "$DATASET.mytaxain" "$DATASET.mytaxa" "0.5"
55
56
  fi
56
- ruby "$MIGA/utils/mytaxa_scan.rb" "../../../06.cds/$DATASET.faa" \
57
- "$DATASET.mytaxa" "$DATASET.wintax"
57
+ ruby "$MIGA/utils/mytaxa_scan.rb" "$FAA" "$DATASET.mytaxa" "$DATASET.wintax"
58
58
  echo "
59
59
  source('$MIGA/utils/mytaxa_scan.R');
60
60
  pdf('$DATASET.pdf', 12, 7);
@@ -70,11 +70,18 @@ else
70
70
  let i=$i+1
71
71
  awk "NR==$win" "$DATASET.wintax.genes" | tr "\\t" "\\n" \
72
72
  > "$DATASET.reg/$i.ids"
73
- FastA.filter.pl -q "$DATASET.reg/$i.ids" \
74
- "../../../06.cds/$DATASET.faa" > "$DATASET.reg/$i.faa"
73
+ if [[ "$FAA" == *.gz ]] ; then
74
+ gzip -c -d "$FAA" \
75
+ | FastA.filter.pl -q "$DATASET.reg/$i.ids" /dev/stdin \
76
+ > "$DATASET.reg/$i.faa"
77
+ else
78
+ FastA.filter.pl -q "$DATASET.reg/$i.ids" "$FAA" \
79
+ > "$DATASET.reg/$i.faa"
80
+ fi
75
81
  done
76
82
  # Archive regions
77
- tar zcf "$DATASET.reg.tar.gz" "$DATASET.reg"
83
+ tar -cf "$DATASET.reg.tar" "$DATASET.reg"
84
+ gzip -9 "$DATASET.reg.tar"
78
85
  rm -r "$DATASET.reg"
79
86
  fi
80
87
 
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ esslog = ARGV.shift
4
+ outlog = ARGV.shift
5
+ domain = ARGV.shift
6
+
7
+ def quality(hsh)
8
+ q = {}
9
+ q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
10
+ q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
11
+ q[:cmp] = 100.0*q[:found].to_f/hsh.size
12
+ q[:cnt] = 100.0*q[:multi].to_f/hsh.size
13
+ q
14
+ end
15
+
16
+ # Find collection and detected anomalies
17
+ cnt_ref = {}
18
+ at = :header
19
+ collection = 'dupont_2012'
20
+ File.open(esslog, 'r') do |fh|
21
+ fh.each_line do |ln|
22
+ v = ln.chomp.gsub(/^! +/, '')
23
+ if v == 'Multiple copies: '
24
+ at = :multi
25
+ elsif v == 'Missing genes: '
26
+ at = :missing
27
+ elsif v =~ /Collection: (\S+)/
28
+ collection = $1
29
+ elsif at == :multi
30
+ v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
31
+ cnt_ref[$2] = $1.to_i
32
+ elsif at == :missing
33
+ v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
34
+ cnt_ref[$1] = 0
35
+ end
36
+ end
37
+ end
38
+
39
+ # Find expected genes for domain
40
+ n_dom = Hash[
41
+ `HMM.essential.rb -L -q '-#{domain}' -c '#{collection}'`
42
+ .chomp.split("\n").map { |i| i.split("\t") }
43
+ ]
44
+ l_dom = n_dom.keys
45
+ cnt_dom = {}
46
+ l_dom.each { |i| cnt_dom[i] = cnt_ref[i] || 1 }
47
+
48
+ # Correct report
49
+ q = quality(cnt_dom)
50
+ File.open(outlog, 'w') do |ofh|
51
+ ofh.puts "! Collection: #{collection} #{domain}"
52
+ ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_dom.size}."
53
+ ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
54
+ ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
55
+ if q[:multi] > 0
56
+ ofh.puts "! Multiple copies: "
57
+ cnt_dom.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_dom[k]}." if v>1 }
58
+ end
59
+ if q[:found] < cnt_dom.size
60
+ ofh.puts "! Missing genes: "
61
+ cnt_dom.each{ |k,v| ofh.puts "! #{k}: #{n_dom[k]}." if v==0 }
62
+ end
63
+ end
@@ -371,8 +371,18 @@
371
371
  "source_url": "http://hmmer.janelia.org/software"
372
372
  }
373
373
  ],
374
- "cite": [["Eddy, 2011, PLoS CB",
375
- "http://dx.doi.org/10.1371/journal.pcbi.1002195"]],
374
+ "cite": [
375
+ ["Eddy, 2011, PLoS CB",
376
+ "http://dx.doi.org/10.1371/journal.pcbi.1002195"],
377
+ ["Dupont et al, 2012, ISME J",
378
+ "https://doi.org/10.1038/ismej.2011.189"],
379
+ ["Rodriguez-R et al, 2014, ISME J",
380
+ "https://doi.org/10.1038/ismej.2015.5"],
381
+ ["Lee, 2019, Bioinf",
382
+ "https://doi.org/10.1093/bioinformatics/btz188"],
383
+ ["Eren et al, 2015, PeerJ",
384
+ "https://doi.org/10.7717/peerj.1319"]
385
+ ],
376
386
  "options": [
377
387
  {
378
388
  "name": "Input file",
@@ -381,6 +391,15 @@
381
391
  "mandatory": true,
382
392
  "description": "FastA file containing all the proteins in the genome."
383
393
  },
394
+ {
395
+ "opt": "--collection",
396
+ "arg": "string",
397
+ "default": "dupont_2012",
398
+ "description": ["Reference collection of essential proteins to use.",
399
+ "One of: dupont_2012 (default, Dupont et al 2012 modified by",
400
+ "Rodriguez-R et al 2015), or lee_2019 (Lee 2019 modified by Eren",
401
+ "et al 2015)."]
402
+ },
384
403
  {
385
404
  "name": "Output file",
386
405
  "opt": "--out",
@@ -64,15 +64,15 @@
64
64
  "task": "HMM.essential.rb",
65
65
  "description": ["Typical single-copy bacterial genes present in",
66
66
  "Mycoplasma genitalium."],
67
- "values": ["Mgen_M2288.faa",null,null,null,null,null,true,null,null,null,
68
- null,null,null,null,null,null,null,null]
67
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
68
+ null,null,null,null,null,null,null,null,null]
69
69
  },
70
70
  {
71
71
  "task": "HMM.essential.rb",
72
72
  "description": ["Typical single-copy archaeal genes present in",
73
73
  "Nanoarchaeum equitans."],
74
- "values": ["Mgen_M2288.faa",null,null,null,null,null,null,true,null,null,
75
- null,null,null,null,null,null,null,null]
74
+ "values": ["Mgen_M2288.faa",null,null,null,null,null,null,null,true,null,
75
+ null,null,null,null,null,null,null,null,null]
76
76
  },
77
77
  {
78
78
  "task": "Newick.autoprune.R",
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
1
+ ../../Scripts/FastA.N50.pl
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
1
+ ../../Scripts/FastA.filterN.pl
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
1
+ ../../Scripts/FastA.length.pl
@@ -1 +1 @@
1
- utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
1
+ ../../Scripts/FastA.split.pl
@@ -10,7 +10,8 @@ use 'zlib'
10
10
 
11
11
  o = {
12
12
  bin: '', thr: 2, q: false, stats: true, genes: true, bacteria: false,
13
- archaea: false, genomeeq: false, metagenome: false, list: false
13
+ archaea: false, genomeeq: false, metagenome: false, list: false,
14
+ collection: 'dupont_2012'
14
15
  }
15
16
  OptionParser.new do |opts|
16
17
  opts.banner = "
@@ -33,7 +34,15 @@ Usage: #{$0} [options]"
33
34
  'Path to the FastA file (.gz allowed) with all the proteins in a genome'
34
35
  ) { |v| o[:in] = v }
35
36
  opts.separator ''
36
- opts.separator 'Report Options'
37
+ opts.separator 'Options'
38
+ opts.on(
39
+ '-c', '--collection STR',
40
+ 'Reference collection of essential proteins to use. One of:',
41
+ '> dupont_2012 (default): https://doi.org/10.1038/ismej.2011.189',
42
+ ' modified by https://doi.org/10.1038/ismej.2015.5',
43
+ '> lee_2019: https://doi.org/10.1093/bioinformatics/btz188',
44
+ ' modified by https://doi.org/10.7717/peerj.1319'
45
+ ) { |v| o[:collection] = v }
37
46
  opts.on(
38
47
  '-o', '--out FILE',
39
48
  'Path to the output FastA file with the translated essential genes',
@@ -117,20 +126,44 @@ abort '-i is mandatory' if o[:in].nil? and not o[:list]
117
126
  o[:bin] = o[:bin] + '/' if o[:bin].size > 0
118
127
  o[:rename] = nil if o[:metagenome]
119
128
 
120
- not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009 TIGR00019
121
- TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062 TIGR00082 TIGR00086
122
- TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158 TIGR00165 TIGR00166 TIGR00168
123
- TIGR00362 TIGR00388 TIGR00396 TIGR00409 TIGR00418 TIGR00420 TIGR00422 TIGR00436
124
- TIGR00459 TIGR00460 TIGR00472 TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663
125
- TIGR00775 TIGR00810 TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964
126
- TIGR00967 TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
127
- TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059 TIGR01063
128
- TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169 TIGR01171 TIGR01391
129
- TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013 TIGR02027 TIGR02191 TIGR02350
130
- TIGR02386 TIGR02387 TIGR02397 TIGR02432 TIGR02729 TIGR03263 TIGR03594}
131
- not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
132
- not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408 TIGR00409
133
- TIGR00389 TIGR00436 tRNA-synth_1d}
129
+ case o[:collection]
130
+ when 'dupont_2012'
131
+ not_in_archaea = %w{GrpE Methyltransf_5 TIGR00001 TIGR00002 TIGR00009
132
+ TIGR00019 TIGR00029 TIGR00043 TIGR00059 TIGR00060 TIGR00061 TIGR00062
133
+ TIGR00082 TIGR00086 TIGR00092 TIGR00115 TIGR00116 TIGR00152 TIGR00158
134
+ TIGR00165 TIGR00166 TIGR00168 TIGR00362 TIGR00388 TIGR00396 TIGR00409
135
+ TIGR00418 TIGR00420 TIGR00422 TIGR00436 TIGR00459 TIGR00460 TIGR00472
136
+ TIGR00487 TIGR00496 TIGR00575 TIGR00631 TIGR00663 TIGR00775 TIGR00810
137
+ TIGR00855 TIGR00922 TIGR00952 TIGR00959 TIGR00963 TIGR00964 TIGR00967
138
+ TIGR00981 TIGR01009 TIGR01011 TIGR01017 TIGR01021 TIGR01024 TIGR01029
139
+ TIGR01030 TIGR01031 TIGR01032 TIGR01044 TIGR01049 TIGR01050 TIGR01059
140
+ TIGR01063 TIGR01066 TIGR01067 TIGR01071 TIGR01079 TIGR01164 TIGR01169
141
+ TIGR01171 TIGR01391 TIGR01393 TIGR01632 TIGR01953 TIGR02012 TIGR02013
142
+ TIGR02027 TIGR02191 TIGR02350 TIGR02386 TIGR02387 TIGR02397 TIGR02432
143
+ TIGR02729 TIGR03263 TIGR03594}
144
+ not_in_bacteria = %w{TIGR00389 TIGR00408 TIGR00471 TIGR00775 TIGR02387}
145
+ not_as_genomeeq = %w{TIGR02386 TIGR02387 TIGR00471 TIGR00472 TIGR00408
146
+ TIGR00409 TIGR00389 TIGR00436 tRNA-synth_1d}
147
+ when 'lee_2019'
148
+ not_in_archaea = %w{ADK AICARFT_IMPCHas ATP-synt ATP-synt_A Chorismate_synt
149
+ EF_TS eIF-1a Exonuc_VII_L GrpE IPPT OSCP Pept_tRNA_hydro PGK RBFA RecO_C
150
+ Ribonuclease_P Ribosomal_L17 Ribosomal_L18p Ribosomal_L19 Ribosomal_L20
151
+ Ribosomal_L21p ribosomal_L24 Ribosomal_S3_C Ribosomal_L5 Ribosomal_L2
152
+ Ribosomal_L27 Ribosomal_L27A Ribosomal_L28 Ribosomal_L32p Ribosomal_L35p
153
+ Ribosomal_L9_C Ribosomal_S10 Ribosomal_S16 Ribosomal_S20p Ribosomal_S6
154
+ RNA_pol_L RRF RsfS RuvX SecE SecG SmpB tRNA_m1G_MT TsaE UPF0054 YajC}
155
+ not_in_bacteria = %w{AdoHcyase Archease ATP-synt_D ATP-synt_F CarS-like
156
+ CTP-dep_RFKase Diphthamide_syn DNA_primase_lrg dsDNA_bind DUF357 DUF359
157
+ DUF655 eIF-6 FbpA HMG-CoA_red NDK PPS_PS Prefoldin PTH2 PyrI Ribosomal_L15e
158
+ Ribosomal_L21e Ribosomal_L26 Ribosomal_L31e Ribosomal_L32e Ribosomal_L37ae
159
+ Ribosomal_L39 Ribosomal_L44 Ribosomal_L5e Ribosomal_S17e Ribosomal_S19e
160
+ Ribosomal_S24e Ribosomal_S27e Ribosomal_S28e Ribosomal_S3Ae Ribosomal_S8e
161
+ Rib_5-P_isom_A RNase_HII RNA_pol_L_2 RNA_pol_N RNA_pol_Rpb4 RtcB Spt4 TIM
162
+ Trm56 tRNA-synt_1c tRNA-synt_His TruD vATP-synt_AC39 vATP-synt_E V_ATPase_I}
163
+ not_as_genomeeq = not_in_archaea + not_in_bacteria
164
+ else
165
+ raise "Unsupported collection: '#{o[:collection]}'"
166
+ end
134
167
 
135
168
  begin
136
169
  Dir.mktmpdir do |dir|
@@ -148,7 +181,8 @@ begin
148
181
  models = {}
149
182
  model_id = nil
150
183
  dbh = File.open("#{dir}/essential.hmm", 'w')
151
- o[:model_file] ||= File.expand_path('../lib/data/essential.hmm.gz',__FILE__)
184
+ o[:model_file] ||= File.expand_path(
185
+ "../lib/data/#{o[:collection]}_essential.hmm.gz", __FILE__)
152
186
  mfh = (File.extname(o[:model_file]) == '.gz') ?
153
187
  Zlib::GzipReader.open(o[:model_file]) :
154
188
  File.open(o[:model_file], 'r')
@@ -201,6 +235,9 @@ begin
201
235
  # Report statistics
202
236
  if o[:stats]
203
237
  reph = o[:report].nil? ? $stdout : File.open(o[:report], 'w')
238
+ modifiers = [:bacteria, :archaea, :genomeeq]
239
+ .map { |i| o[i] ? i.to_s[0].upcase : '' }.join('')
240
+ reph.puts "! Collection: #{o[:collection]} #{modifiers}"
204
241
  if o[:metagenome]
205
242
  reph.printf "! Essential genes found: %d/%d.\n", genes.size, models.size
206
243
  gc = [0] * (models.size - genes.size) +
@@ -1 +1 @@
1
- utils/enveomics/Scripts/lib/../../enveomics.R
1
+ ../../enveomics.R
@@ -1,5 +1,5 @@
1
1
  Package: enveomics.R
2
- Version: 1.7.0
2
+ Version: 1.7.1
3
3
  Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
4
4
  email="lmrodriguezr@gmail.com"))
5
5
  Title: Various Utilities for Microbial Genomics and Metagenomics
@@ -25,25 +25,24 @@
25
25
 
26
26
  enve.df2dist <- function(
27
27
  x,
28
- obj1.index=1,
29
- obj2.index=2,
30
- dist.index=3,
31
- default.d=NA,
32
- max.sim=0
28
+ obj1.index = 1,
29
+ obj2.index = 2,
30
+ dist.index = 3,
31
+ default.d = NA,
32
+ max.sim = 0
33
33
  ){
34
- x <- as.data.frame(x);
35
- a <- as.character(x[, obj1.index]);
36
- b <- as.character(x[, obj2.index]);
37
- d <- as.double(x[, dist.index]);
38
- if(max.sim!=0) d <- (max.sim - d)/max.sim
39
- ids <- unique(c(a,b));
40
- m <- matrix(default.d, nrow=length(ids), ncol=length(ids), dimnames=list(ids, ids));
34
+ x <- as.data.frame(x)
35
+ a <- as.character(x[, obj1.index])
36
+ b <- as.character(x[, obj2.index])
37
+ d <- as.double(x[, dist.index])
38
+ if(max.sim != 0) d <- (max.sim - d) / max.sim
39
+ ids <- unique(c(a,b))
40
+ m <- matrix(default.d,
41
+ nrow = length(ids), ncol = length(ids), dimnames = list(ids, ids))
41
42
  diag(m) <- 0.0
42
- for(i in 1:nrow(x)){
43
- m[a[i], b[i]] <- d[i];
44
- }
45
- m <- pmin(m, t(m), na.rm=TRUE)
46
- return(as.dist(m));
43
+ m[cbind(a,b)] <- d
44
+ m <- pmin(m, t(m), na.rm = TRUE)
45
+ return(as.dist(m))
47
46
  }
48
47
 
49
48
  #' Enveomics: Data Frame to Dist (Group)
@@ -666,15 +666,16 @@ enve.recplot2.findPeaks <- function(
666
666
  #' A vector of number of components to evaluate.
667
667
  #' @param criterion
668
668
  #' Criterion to use for components selection. Must be one of:
669
- #' \code{aic} (Akaike Information Criterion),
670
- #' \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).
669
+ #' \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
670
+ #' (Bayesian Information Criterion or Schwarz Criterion).
671
671
  #' @param merge.tol
672
672
  #' When attempting to merge peaks with very similar sequencing depth, use
673
673
  #' this number of significant digits (in log-scale).
674
674
  #' @param verbose
675
675
  #' Display (mostly debugging) information.
676
676
  #' @param ...
677
- #' Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.
677
+ #' Any additional parameters supported by
678
+ #' \code{\link{enve.recplot2.findPeaks.em}}.
678
679
  #'
679
680
  #' @return Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
680
681
  #'
@@ -684,10 +685,10 @@ enve.recplot2.findPeaks <- function(
684
685
 
685
686
  enve.recplot2.findPeaks.emauto <- function(
686
687
  x,
687
- components=seq(1,10),
688
- criterion='aic',
689
- merge.tol=2L,
690
- verbose=FALSE,
688
+ components = seq(1, 5),
689
+ criterion = 'aic',
690
+ merge.tol = 2L,
691
+ verbose = FALSE,
691
692
  ...
692
693
  ){
693
694
  best <- list(crit=0, pstore=list())
@@ -758,19 +759,19 @@ enve.recplot2.findPeaks.emauto <- function(
758
759
 
759
760
  enve.recplot2.findPeaks.em <- function(
760
761
  x,
761
- max.iter=1000,
762
- ll.diff.res=1e-8,
763
- components=2,
764
- rm.top=0.05,
765
- verbose=FALSE,
762
+ max.iter = 1000,
763
+ ll.diff.res = 1e-8,
764
+ components = 2,
765
+ rm.top = 0.05,
766
+ verbose = FALSE,
766
767
  init,
767
- log=TRUE
768
+ log = TRUE
768
769
  ){
769
770
 
770
771
  # Essential vars
771
772
  pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
772
773
  lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
773
- lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
774
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names = FALSE) ]
774
775
  if(log) lsd1 <- log(lsd1)
775
776
 
776
777
  # 1. Initialize
@@ -779,7 +780,7 @@ enve.recplot2.findPeaks.em <- function(
779
780
  init <- list(
780
781
  mu = tapply(lsd1, km.clust, mean),
781
782
  sd = tapply(lsd1, km.clust, sd),
782
- alpha = table(km.clust)/length(km.clust)
783
+ alpha = table(km.clust) / length(km.clust)
783
784
  )
784
785
  }
785
786
  m.step <- init
@@ -795,6 +796,7 @@ enve.recplot2.findPeaks.em <- function(
795
796
  ll.diff <- abs(cur.ll - e.step[["ll"]])
796
797
  cur.ll <- e.step[["ll"]]
797
798
  if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
799
+ if(is.na(ll.diff) || ll.diff == Inf) break
798
800
  if(ll.diff <= ll.diff.res) break
799
801
  }
800
802
 
@@ -1431,6 +1433,9 @@ enve.recplot2.findPeaks.__em_e <- function
1431
1433
  theta[['sd']][i])*theta[['alpha']][i]))
1432
1434
  sum.of.components <- rowSums(product)
1433
1435
  posterior <- product / sum.of.components
1436
+ for(i in which(sum.of.components == Inf)) {
1437
+ cat(i,'/',nrow(product), ':', product[i,], '\n')
1438
+ }
1434
1439
 
1435
1440
  return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1436
1441
  }
@@ -52,6 +52,7 @@ For additional information on recruitment plots, see the
52
52
  [Recruitment plots working document](https://github.com/lmrodriguezr/enveomics/blob/master/Docs/recplot2.md).
53
53
 
54
54
  ## Changelog
55
+ * 1.7.1: Improved efficiency of `enve.df2dist` about five-fold.
55
56
  * 1.7.0: Uniformized output for `enve.recplot2.extractWindows` and
56
57
  `enve.recplot2.coordinates` to ease automation. Thanks to Tomeu Viver and
57
58
  Roth Conrad for troubleshooting.
@@ -4,7 +4,7 @@
4
4
  \alias{enve.recplot2.findPeaks.emauto}
5
5
  \title{Enveomics: Recruitment Plot (2) Emauto Peak Finder}
6
6
  \usage{
7
- enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
7
+ enve.recplot2.findPeaks.emauto(x, components = seq(1, 5),
8
8
  criterion = "aic", merge.tol = 2L, verbose = FALSE, ...)
9
9
  }
10
10
  \arguments{
@@ -13,15 +13,16 @@ enve.recplot2.findPeaks.emauto(x, components = seq(1, 10),
13
13
  \item{components}{A vector of number of components to evaluate.}
14
14
 
15
15
  \item{criterion}{Criterion to use for components selection. Must be one of:
16
- \code{aic} (Akaike Information Criterion),
17
- \code{bic} or \code{sbc} (Bayesian Information Criterion or Schwarz Criterion).}
16
+ \code{aic} (Akaike Information Criterion), \code{bic} or \code{sbc}
17
+ (Bayesian Information Criterion or Schwarz Criterion).}
18
18
 
19
19
  \item{merge.tol}{When attempting to merge peaks with very similar sequencing depth, use
20
20
  this number of significant digits (in log-scale).}
21
21
 
22
22
  \item{verbose}{Display (mostly debugging) information.}
23
23
 
24
- \item{...}{Any additional parameters supported by \code{\link{enve.recplot2.findPeaks.em}}.}
24
+ \item{...}{Any additional parameters supported by
25
+ \code{\link{enve.recplot2.findPeaks.em}}.}
25
26
  }
26
27
  \value{
27
28
  Returns a list of \code{\link{enve.RecPlot2.Peak}} objects.
data/utils/find-medoid.R CHANGED
@@ -7,7 +7,12 @@
7
7
  #= Load stuff
8
8
  argv <- commandArgs(trailingOnly = T)
9
9
  suppressPackageStartupMessages(library(ape))
10
- suppressPackageStartupMessages(library(enveomics.R))
10
+ if(Sys.getenv('MIGA') == ''){
11
+ suppressPackageStartupMessages(library(enveomics.R))
12
+ }else{
13
+ source(file.path(Sys.getenv('MIGA'),
14
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
15
+ }
11
16
 
12
17
  find_medoids <- function(ani.df, out, clades) {
13
18
  if(nrow(ani.df) == 0) return(NULL)
data/utils/mytaxa_scan.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'zlib'
4
+
3
5
  abort "
4
6
  Usage:
5
7
  #{$0} {FastA file} {MyTaxa file} {Data output}
@@ -7,52 +9,53 @@ Usage:
7
9
  " if ARGV[2].nil?
8
10
 
9
11
  begin
10
- # Get arguments
11
- faa, mytaxa, outdata = ARGV
12
- winsize = 10
13
-
14
- # Extract gene IDs
15
- ids = File.open(faa).grep(/^>/).map{|dl| dl.chomp.sub(/^>/,"").sub(/\s.*/,"")}
16
- tax = Hash[ids.map{|k| [k, "NA"]}]
17
-
18
- # Get MyTaxa distributions
19
- k, l = nil
20
- File.open(mytaxa).each do |ln|
21
- ln.chomp!
22
- if $.%2 == 1
23
- k, l = ln.split /\t/
24
- else
25
- tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
26
- end
27
- end
28
- all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
29
-
30
- # Estimate Windows and save gene IDs
31
- fh = File.open(outdata + ".genes", "w")
32
- c = []
33
- c << all_tax.map{|t| tax.values.count(t) }
34
- n_wins = (ids.size/winsize).ceil
35
- (0 .. (n_wins-1)).each do |win|
36
- k = ids[win*winsize, winsize]
37
- win_t = tax.values_at(*k)
38
- fh.puts k.join("\t")
39
- c << all_tax.map{|t| win_t.count(t)}
40
- end
41
- p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
42
- fh.close
43
-
44
- # Save window profiles
45
- fh = File.open(outdata, "w")
46
- fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
47
- fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
48
- (0 .. (all_tax.size - 1)).each do |row|
49
- fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
50
- end
51
- fh.close
12
+ # Get arguments
13
+ faa, mytaxa, outdata = ARGV
14
+ winsize = 10
15
+
16
+ # Extract gene IDs
17
+ ifh = faa =~ /\.gz/ ? Zlib::GzipReader.open(faa) : File.open(faa, 'r')
18
+ ids = ifh.each_line.grep(/^>/).map{|dl| dl.chomp.sub(/^>/,'').sub(/\s.*/,'')}
19
+ ifh.close
20
+ tax = Hash[ids.map{|k| [k, "NA"]}]
21
+
22
+ # Get MyTaxa distributions
23
+ k, l = nil
24
+ File.open(mytaxa).each do |ln|
25
+ ln.chomp!
26
+ if $.%2 == 1
27
+ k, l = ln.split /\t/
28
+ else
29
+ tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
30
+ end
31
+ end
32
+ all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
33
+
34
+ # Estimate Windows and save gene IDs
35
+ fh = File.open(outdata + ".genes", "w")
36
+ c = []
37
+ c << all_tax.map{|t| tax.values.count(t) }
38
+ n_wins = (ids.size/winsize).ceil
39
+ (0 .. (n_wins-1)).each do |win|
40
+ k = ids[win*winsize, winsize]
41
+ win_t = tax.values_at(*k)
42
+ fh.puts k.join("\t")
43
+ c << all_tax.map{|t| win_t.count(t)}
44
+ end
45
+ p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
46
+ fh.close
47
+
48
+ # Save window profiles
49
+ fh = File.open(outdata, "w")
50
+ fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
51
+ fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
52
+ (0 .. (all_tax.size - 1)).each do |row|
53
+ fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
54
+ end
55
+ fh.close
52
56
  rescue => err
53
- $stderr.puts "Exception: #{err}\n\n"
54
- err.backtrace.each { |l| $stderr.puts l + "\n" }
55
- err
57
+ $stderr.puts "Exception: #{err}\n\n"
58
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
59
+ err
56
60
  end
57
61
 
58
-
data/utils/ref-tree.R CHANGED
@@ -7,7 +7,12 @@
7
7
  #= Load stuff
8
8
  argv <- commandArgs(trailingOnly=T)
9
9
  suppressPackageStartupMessages(library(ape))
10
- suppressPackageStartupMessages(library(enveomics.R))
10
+ if(Sys.getenv('MIGA') == ''){
11
+ suppressPackageStartupMessages(library(enveomics.R))
12
+ }else{
13
+ source(file.path(Sys.getenv('MIGA'),
14
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
15
+ }
11
16
  inst <- c("phangorn", "phytools") %in% rownames(installed.packages())
12
17
  if(inst[1]){
13
18
  suppressPackageStartupMessages(library(phangorn))
data/utils/subclades-nj.R CHANGED
@@ -12,7 +12,12 @@ suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(phytools))
13
13
  suppressPackageStartupMessages(library(phangorn))
14
14
  suppressPackageStartupMessages(library(parallel))
15
- suppressPackageStartupMessages(library(enveomics.R))
15
+ if(Sys.getenv('MIGA') == ''){
16
+ suppressPackageStartupMessages(library(enveomics.R))
17
+ }else{
18
+ source(file.path(Sys.getenv('MIGA'),
19
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
20
+ }
16
21
 
17
22
  #= Main function
18
23
  subclades <- function(ani_file, out_base, thr=1, ani=c()) {
data/utils/subclades.R CHANGED
@@ -10,7 +10,12 @@ suppressPackageStartupMessages(library(ape))
10
10
  suppressPackageStartupMessages(library(vegan))
11
11
  suppressPackageStartupMessages(library(cluster))
12
12
  suppressPackageStartupMessages(library(parallel))
13
- suppressPackageStartupMessages(library(enveomics.R))
13
+ if(Sys.getenv('MIGA') == ''){
14
+ suppressPackageStartupMessages(library(enveomics.R))
15
+ }else{
16
+ source(file.path(Sys.getenv('MIGA'),
17
+ 'utils', 'enveomics', 'enveomics.R', 'R', 'df2dist.R'))
18
+ }
14
19
 
15
20
  #= Main function
16
21
  subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0.0
4
+ version: 0.5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-25 00:00:00.000000000 Z
11
+ date: 2020-01-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -197,7 +197,6 @@ files:
197
197
  - test/taxonomy_test.rb
198
198
  - test/test_helper.rb
199
199
  - utils/adapters.fa
200
- - utils/arch-ess-genes.rb
201
200
  - utils/cleanup-databases.rb
202
201
  - utils/core-pan-plot.R
203
202
  - utils/distance/base.rb
@@ -207,6 +206,7 @@ files:
207
206
  - utils/distance/runner.rb
208
207
  - utils/distance/temporal.rb
209
208
  - utils/distances.rb
209
+ - utils/domain-ess-genes.rb
210
210
  - utils/enveomics/Docs/recplot2.md
211
211
  - utils/enveomics/Examples/aai-matrix.bash
212
212
  - utils/enveomics/Examples/ani-matrix.bash
@@ -356,7 +356,8 @@ files:
356
356
  - utils/enveomics/Scripts/clust.rand.rb
357
357
  - utils/enveomics/Scripts/gi2tax.rb
358
358
  - utils/enveomics/Scripts/in_silico_GA_GI.pl
359
- - utils/enveomics/Scripts/lib/data/essential.hmm.gz
359
+ - utils/enveomics/Scripts/lib/data/dupont_2012_essential.hmm.gz
360
+ - utils/enveomics/Scripts/lib/data/lee_2019_essential.hmm.gz
360
361
  - utils/enveomics/Scripts/lib/enveomics.R
361
362
  - utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb
362
363
  - utils/enveomics/Scripts/lib/enveomics_rb/jplace.rb
@@ -514,8 +515,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
514
515
  - !ruby/object:Gem::Version
515
516
  version: '0'
516
517
  requirements: []
517
- rubyforge_project:
518
- rubygems_version: 2.7.6
518
+ rubygems_version: 3.0.3
519
519
  signing_key:
520
520
  specification_version: 4
521
521
  summary: MiGA
@@ -1,57 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- esslog = ARGV.shift
4
- outlog = ARGV.shift
5
- l_all = `HMM.essential.rb -l -q`.chomp.split("\n").map{ |i| i.gsub(/\t.*/,"") }
6
- n_arc = Hash[
7
- `HMM.essential.rb -l -q -A`.chomp.split("\n").map{ |i| i.split("\t") }
8
- ]
9
- l_arc = n_arc.keys
10
-
11
- def quality(hsh)
12
- q = {}
13
- q[:found] = hsh.values.map{ |i| i==0 ? 0 : 1 }.inject(:+)
14
- q[:multi] = hsh.values.map{ |i| i==0 ? 0 : i-1 }.inject(:+)
15
- q[:cmp] = 100.0*q[:found].to_f/hsh.size
16
- q[:cnt] = 100.0*q[:multi].to_f/hsh.size
17
- q
18
- end
19
-
20
- cnt_ref = {}
21
- l_all.each{ |i| cnt_ref[i] = 1 }
22
-
23
- at = :header
24
- File.open(esslog, "r") do |fh|
25
- fh.each_line do |ln|
26
- v = ln.chomp.gsub(/^! +/, "")
27
- if v=="Multiple copies: "
28
- at = :multi
29
- elsif v=="Missing genes: "
30
- at = :missing
31
- elsif at==:multi
32
- v =~ /^(\d+) (\S+): .*/ or raise "Unexpected multi-copies format: #{v}"
33
- cnt_ref[$2] = $1.to_i
34
- elsif at==:missing
35
- v =~ /^(\S+): .*/ or raise "Unexpected missing format: #{v}"
36
- cnt_ref[$1] = 0
37
- end
38
- end
39
- end
40
-
41
- cnt_arc = {}
42
- l_arc.each{ |i| cnt_arc[i] = cnt_ref[i] }
43
-
44
- q = quality(cnt_arc)
45
- File.open(outlog, "w") do |ofh|
46
- ofh.puts "! Essential genes found: #{q[:found]}/#{cnt_arc.size}."
47
- ofh.puts "! Completeness: #{q[:cmp].round(1)}%."
48
- ofh.puts "! Contamination: #{q[:cnt].round(1)}%."
49
- if q[:multi] > 0
50
- ofh.puts "! Multiple copies: "
51
- cnt_arc.each{ |k,v| ofh.puts "! #{v} #{k}: #{n_arc[k]}." if v>1 }
52
- end
53
- if q[:found] < cnt_arc.size
54
- ofh.puts "! Missing genes: "
55
- cnt_arc.each{ |k,v| ofh.puts "! #{k}: #{n_arc[k]}." if v==0 }
56
- end
57
- end