miga-base 0.3.1.7 → 0.3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/actions/ncbi_get.rb +8 -0
  3. data/lib/miga/common.rb +9 -215
  4. data/lib/miga/common/base.rb +49 -0
  5. data/lib/miga/common/format.rb +135 -0
  6. data/lib/miga/common/path.rb +49 -0
  7. data/lib/miga/daemon.rb +3 -60
  8. data/lib/miga/daemon/base.rb +69 -0
  9. data/lib/miga/dataset.rb +3 -3
  10. data/lib/miga/dataset/result.rb +5 -5
  11. data/lib/miga/result.rb +5 -0
  12. data/lib/miga/version.rb +7 -5
  13. data/scripts/distances.bash +2 -19
  14. data/scripts/taxonomy.bash +2 -21
  15. data/test/common_test.rb +9 -0
  16. data/utils/distance/base.rb +6 -0
  17. data/utils/distance/commands.rb +82 -0
  18. data/utils/distance/database.rb +86 -0
  19. data/utils/distance/pipeline.rb +98 -0
  20. data/utils/distance/runner.rb +104 -0
  21. data/utils/distance/temporal.rb +37 -0
  22. data/utils/distances.rb +9 -0
  23. data/utils/enveomics/Docs/recplot2.md +233 -0
  24. data/utils/enveomics/Makefile +1 -1
  25. data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
  26. data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
  27. data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
  28. data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
  29. data/utils/enveomics/Manifest/categories.json +11 -1
  30. data/utils/enveomics/Manifest/examples.json +2 -2
  31. data/utils/enveomics/README.md +2 -0
  32. data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
  33. data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
  34. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  35. data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
  36. data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
  37. data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
  38. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  39. data/utils/enveomics/Scripts/SRA.download.bash +28 -21
  40. data/utils/enveomics/Scripts/Table.barplot.R +1 -0
  41. data/utils/enveomics/Scripts/aai.rb +4 -2
  42. data/utils/enveomics/build_enveomics_r.bash +5 -5
  43. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  44. data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
  45. data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
  46. data/utils/enveomics/enveomics.R/README.md +26 -17
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
  50. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
  51. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
  52. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
  53. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
  54. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
  55. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
  56. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
  57. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
  58. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
  59. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
  60. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
  61. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
  64. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
  65. data/utils/requirements.txt +1 -1
  66. metadata +28 -4
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
  68. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -0,0 +1,86 @@
1
+
2
+ require 'sqlite3'
3
+
4
# SQLite3 storage helpers for MiGA::DistanceRunner.
#
# These methods read +dataset+, +project+, +ref_project+, +home+, +dbs+ and
# +tmp_dbs+, and call +tmp_file+; the including class must provide them.
module MiGA::DistanceRunner::Database
  # Check for unreadable files and create empty databases.
  #
  # For each metric (+:haai+, +:aai+, +:ani+) this resets the counter of
  # pending values, resolves the project-side database path (reference path
  # if +for_ref+ is truthy, query path otherwise), removes the file if it
  # cannot be queried, creates the table when the file is missing or empty,
  # and finally copies the database to the (local) temporal folder.
  def initialize_dbs!(for_ref)
    @dbs = {}
    @tmp_dbs = {}
    @db_counts = {}
    # metric => table name (hAAI values live in a table called +aai+)
    { haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
      @db_counts[m] = 0
      @dbs[m] = for_ref ? ref_db(m) : query_db(m)
      # Remove if the file cannot be queried. A missing table raises
      # SQLException, but a truncated or garbled file raises
      # NotADatabaseException or CorruptException instead; all three mean
      # the database must be rebuilt from scratch.
      if File.size?(dbs[m])
        begin
          SQLite3::Database.new(dbs[m]) do |conn|
            conn.execute "select count(*) from #{t};"
          end
        rescue SQLite3::SQLException, SQLite3::NotADatabaseException,
               SQLite3::CorruptException
          FileUtils.rm dbs[m]
        end
      end
      # Initialize if it doesn't exist (or was just removed above)
      unless File.size?(dbs[m])
        SQLite3::Database.new(dbs[m]) do |conn|
          conn.execute "create table if not exists #{t}(" +
            "seq1 varchar(256), seq2 varchar(256), " +
            "#{t} float, sd float, n int, omega int" +
            ")"
        end
      end
      # Copy over to (local) temporals
      @tmp_dbs[m] = tmp_file("#{m}.db")
      FileUtils.cp(dbs[m], tmp_dbs[m])
    end
  end

  # Path to the database +metric+ for +dataset_name+ in +project+
  # (assumes that +dataset_name+ is a reference dataset). If +dataset_name+
  # is nil, the name of the current dataset is used. +metric+ must be one of
  # +:haai+, +:aai+ or +:ani+; any other value makes File.expand_path raise.
  def ref_db(metric, dataset_name=nil)
    dataset_name ||= dataset.name
    b = case metric
        when :haai then "01.haai/#{dataset_name}.db"
        when :aai  then "02.aai/#{dataset_name}.db"
        when :ani  then "03.ani/#{dataset_name}.db"
        end
    File.expand_path(b, home)
  end

  # Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
  # query dataset)
  def query_db(metric)
    File.expand_path("#{dataset.name}.#{metric}.db", home)
  end

  # Get the stored +metric+ value against +target+, or nil if no non-zero
  # value has been stored.
  def stored_value(target, metric)
    # Check if self.dataset -> target is done (previous run)
    y = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
    return y unless y.nil? || y.zero?
    # Check if self.dataset <- target is done (another thread)
    if dataset.is_ref? && project.path == ref_project.path
      y = value_from_db(target.name, dataset.name,
                        ref_db(metric, target.name), metric)
      return y unless y.nil? || y.zero?
    end
    nil
  end

  # Get the value of +metric+ in the +db+ database between +n1+ and +n2+.
  # Returns nil if +db+ is missing or empty, or if the pair is not stored.
  #
  # +metric+ is interpolated as both table and column name (identifiers
  # cannot be bound as parameters), so it must be a trusted symbol.
  # NOTE(review): initialize_dbs! stores hAAI in a table named +aai+, so
  # passing +:haai+ here would not match that schema; confirm callers.
  def value_from_db(n1, n2, db, metric)
    y = nil
    return y unless File.size? db
    SQLite3::Database.new(db) do |conn|
      row = conn.execute(
        "select #{metric} from #{metric} where seq1=? and seq2=?", [n1, n2]
      ).first
      y = row.first unless row.nil?
    end
    y
  end

  # Iterates for each entry in +db+, passing every row of the +metric+ table
  # to the block.
  def foreach_in_db(db, metric, &blk)
    SQLite3::Database.new(db) do |conn|
      conn.execute("select * from #{metric}").each { |r| blk.call(r) }
    end
  end
end
@@ -0,0 +1,98 @@
1
+
2
+ # High-end pipelines for DistanceRunner
3
# High-end pipelines for DistanceRunner
module MiGA::DistanceRunner::Pipeline
  # Recursively classify the dataset, returning an Array with two entries:
  # classification and cluster number.
  #
  # +clades+ is the base directory, +classif+ the classification path
  # relative to it, +metric+ the name of the method used to compare against
  # each medoid, and +result_fh+ an open handle that receives one
  # tab-delimited line per visited level. The recursion stops at the first
  # level without a non-empty +miga-project.medoids+ file.
  def classify(clades, classif, metric, result_fh, val_cls=nil)
    dir = File.expand_path(classif, clades)
    med = File.expand_path("miga-project.medoids", dir)
    return [classif, val_cls] unless File.size? med
    max_val = 0
    val_med = ""
    val_cls = nil
    i_n = 0
    File.foreach(med) do |med_ln|
      i_n += 1
      med_ln = med_ln.chomp
      val = send(metric, ref_project.dataset(med_ln))
      next if val.nil? || val < max_val
      max_val = val
      val_med = med_ln
      val_cls = i_n
      puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
    end
    # Keep +classif+ relative to +clades+. File.expand_path would resolve it
    # against the process working directory and yield an absolute path, which
    # makes the next level ignore +clades+ altogether.
    classif = File.join(classif, "miga-project.sc-#{val_cls}")
    result_fh.puts [val_cls, val_med, max_val, classif].join("\t")
    classify(clades, classif, metric, result_fh, val_cls)
  end

  # Builds a tree with all visited medoids from any classification level.
  #
  # Does nothing if the query database for +metric+ is missing or empty, or
  # if the reference project has no "+metric+_distances" result.
  def build_medoids_tree(metric)
    db = query_db(metric)
    return unless File.size? db
    # Checked before the matrix is created so that an early exit doesn't
    # leave an open handle and a partial file behind.
    ref_r = ref_project.result("#{metric}_distances") or return
    out_base = File.expand_path(dataset.name, home)
    ds_matrix = "#{out_base}.txt"
    File.open(ds_matrix, "w") do |ds_matrix_fh|
      ds_matrix_fh.puts %w[a b value].join("\t")
      # Find all values in the database
      # NOTE(review): the first column of the table is +seq1+; given the
      # variable name this probably meant r[1] (+seq2+) -- confirm.
      seq2 = []
      foreach_in_db(db, metric) do |r|
        seq2 << r[0]
        ds_matrix_fh.puts r[0,3].join("\t")
      end
      # Find all values among visited datasets in ref_project
      Zlib::GzipReader.open(ref_r.file_path(:matrix)) do |fh|
        fh.each_line do |ln|
          r = ln.chomp.split("\t")
          next unless seq2.include?(r[1]) || seq2.include?(r[2])
          ds_matrix_fh.puts r[1,3].join("\t")
        end
      end
    end
    ref_tree = File.expand_path("utils/ref-tree.R", MiGA::MiGA.root_path)
    `"#{ref_tree}" "#{ds_matrix}" "#{out_base}" "#{dataset.name}"`
    File.unlink ds_matrix
  end

  # Tests taxonomy.
  #
  # Writes a "+.done+" marker, registers the result in the dataset, and runs
  # the taxonomy test against the closest relative. Returns an Array of rows
  # (rank, taxon, p-value, significance marks), also saved as a table in
  # "+.intax.txt+"; returns nil if there are no closest relatives.
  def tax_test
    # Get taxonomy of closest relative
    from_ref_project = (project != ref_project)
    res_dir = from_ref_project ?
      File.expand_path("data/09.distances/05.taxonomy", project.path) : home
    Dir.mkdir res_dir unless Dir.exist? res_dir
    File.open(File.expand_path("#{dataset.name}.done", res_dir), "w") do |fh|
      fh.puts Time.now.to_s
    end
    dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
    cr = dataset.closest_relatives(1, from_ref_project)
    return if cr.nil? || cr.empty?
    tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
    # Run the test for each rank
    r = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax).map do |k, v|
      # One asterisk per significance threshold that the p-value beats
      sig = ""
      [0.5, 0.1, 0.05, 0.01].each { |i| sig << "*" if v < i }
      [MiGA::Taxonomy.LONG_RANKS[k], (tax[k] || "?"), v, sig]
    end
    # Save test
    File.open(File.expand_path("#{dataset.name}.intax.txt", home), "w") do |fh|
      fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
      fh.puts ""
      fh.puts "Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01."
    end
    r
  end

  # Transfer the taxonomy to the current dataset.
  #
  # +tax+ is the Array returned by +tax_test+. Only ranks with a known taxon
  # and a p-value at or below the project's +:tax_pvalue+ (0.05 by default)
  # are transferred. Does nothing if +tax+ is nil, which is what +tax_test+
  # returns when no closest relative was found.
  def transfer_taxonomy(tax)
    return if tax.nil?
    pval = (project.metadata[:tax_pvalue] || 0.05).to_f
    tax_a = tax
      .select { |i| i[1] != "?" && i[2] <= pval }
      .map { |i| i[0,2].join(":") }
    dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
    dataset.save
  end
end
@@ -0,0 +1,104 @@
1
+
2
+ require_relative 'base.rb'
3
+ require_relative 'temporal.rb'
4
+ require_relative 'database.rb'
5
+ require_relative 'commands.rb'
6
+ require_relative 'pipeline.rb'
7
+
8
+
9
# Runs the distance (hAAI/AAI/ANI) and taxonomy analyses of one dataset
# within a MiGA project.
class MiGA::DistanceRunner

  include MiGA::DistanceRunner::Temporal
  include MiGA::DistanceRunner::Database
  include MiGA::DistanceRunner::Commands
  include MiGA::DistanceRunner::Pipeline

  attr_reader :project, :ref_project, :dataset, :opts, :home
  attr_reader :tmp, :tmp_dbs, :dbs, :db_counts

  # Loads the project at +project_path+ and its dataset +dataset_name+, and
  # fills in the default options in +opts_hash+ (which is modified in place).
  #
  # Raises a RuntimeError if no project can be loaded from +project_path+.
  def initialize(project_path, dataset_name, opts_hash={})
    @opts = opts_hash
    @project = MiGA::Project.load(project_path)
    raise "No project at #{project_path}" unless @project
    @dataset = project.dataset(dataset_name)
    @home = File.expand_path("data/09.distances", project.path)
    # Default opts
    @opts[:aai_save_rbm] ||= ENV.fetch("MIGA_AAI_SAVE_RBM") {
      project.is_clade? ? "save-rbm" : "no-save-rbm"
    }
    @opts[:thr] ||= ENV.fetch("CORES") { 2 }.to_i
    # Taxonomy jobs compare against the project registered as :ref_project;
    # everything else compares against the project itself
    if opts[:run_taxonomy] && project.metadata[:ref_project]
      @ref_project = MiGA::Project.load(project.metadata[:ref_project])
    end
    @ref_project ||= project
    [:haai_p, :aai_p, :ani_p, :distances_checkpoint].each do |m|
      @opts[m] ||= ref_project.metadata[m]
    end
    @opts[:distances_checkpoint] ||= 10
    @opts[:distances_checkpoint] = @opts[:distances_checkpoint].to_i
  end

  # Launch the appropriate analysis. Multi datasets are skipped. All the work
  # happens inside a temporary directory that is removed at the end.
  def go!
    return if dataset.is_multi?
    Dir.mktmpdir do |tmp_dir|
      @tmp = tmp_dir
      create_temporals
      if opts[:run_taxonomy]
        go_taxonomy!
      elsif dataset.is_ref?
        go_ref!
      else
        go_query!
      end
    end
  end

  # Launch analysis for reference datasets
  def go_ref!
    # Initialize databases
    initialize_dbs! true
    # first-come-first-serve traverse
    ref_project.each_dataset do |ds|
      next unless ds.is_ref?
      next if ds.is_multi? || ds.result(:essential_genes).nil?
      puts "[ #{Time.now} ] #{ds.name}"
      aai_val = aai(ds)
      # ANI is only attempted for pairs with AAI of 90% or above
      ani(ds) unless aai_val.nil? || aai_val < 90.0
    end
    # Finalize
    [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
  end

  # Launch analysis for query datasets. Does nothing if the reference project
  # doesn't have the required clustering result yet.
  def go_query!
    # Check if project is ready
    # v = [required result, metric used to traverse the classification]
    v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
    res = ref_project.result(v[0])
    return if res.nil?
    # Initialize the databases
    initialize_dbs! false
    # Calculate the classification-informed AAI/ANI traverse
    results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
    classif, val_cls = nil, nil
    File.open(results, "w") do |fh|
      classif, val_cls = *classify(res.dir, ".", v[1], fh)
    end
    # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
    par_dir = File.dirname(File.expand_path(classif, res.dir))
    par = File.expand_path("miga-project.classif", par_dir)
    if File.size? par
      File.foreach(par) do |ln|
        r = ln.chomp.split("\t")
        next unless r[1].to_i == val_cls
        target = ref_project.dataset(r[0])
        # When traversing by ANI, the AAI is not computed and is taken as 100
        aai_val = (v[1] == :aai) ? aai(target) : 100.0
        ani(target) unless aai_val.nil? || aai_val < 90.0
      end
    end
    # Finalize
    [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
    build_medoids_tree(v[1])
    transfer_taxonomy(tax_test)
  end

  # Launch analysis for taxonomy jobs. Does nothing unless the project has a
  # +:ref_project+ registered in its metadata.
  def go_taxonomy!
    return unless project.metadata[:ref_project]
    go_query! # <- yeah, it's actually the same, just different ref_project
  end
end
@@ -0,0 +1,37 @@
1
+
2
+ require 'tmpdir'
3
+
4
# Management of the (local) temporal files and databases used by
# MiGA::DistanceRunner.
module MiGA::DistanceRunner::Temporal
  # Copy input files to the (local) temporal folder. Results that the dataset
  # doesn't have, or whose file is not defined, are silently skipped.
  def create_temporals
    inputs = {
      essential_genes: :ess_genes, cds: :proteins, assembly: :largecontigs
    }
    inputs.each do |res, file|
      result = dataset.result(res)
      next if result.nil?
      source = result.file_path(file)
      next if source.nil?
      FileUtils.cp(source, tmp_file("#{file}.fa"))
    end
  end

  # Temporal file with extension +ext+: the dataset name followed by +ext+,
  # inside the temporal folder.
  def tmp_file(ext)
    basename = "#{dataset.name}.#{ext}"
    File.expand_path(basename, tmp)
  end

  # Registers one more stored value for +metric+ and copies the temporal
  # database back to the MiGA Project once the number of uncopied values
  # reaches +@opts[:distances_checkpoint]+.
  def checkpoint(metric)
    @db_counts[metric] += 1
    return unless db_counts[metric] >= @opts[:distances_checkpoint]
    checkpoint!(metric)
  end

  # Copies temporal databases back to the MiGA Project and resets the counter
  # of uncopied values for +metric+.
  def checkpoint!(metric)
    # hAAI values are kept in a table called +aai+
    table = metric == :haai ? :aai : metric
    # Query the temporal database first: if it can't be read, this raises
    # before the project copy is overwritten
    SQLite3::Database.new(tmp_dbs[metric]) do |conn|
      conn.execute("select count(*) from #{table}")
    end
    FileUtils.cp(tmp_dbs[metric], dbs[metric])
    @db_counts[metric] = 0
  end
end
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative 'distance/runner.rb'
4
+
5
# Usage: distances.rb PROJECT_PATH DATASET_NAME [key=value ...]
#
# The first two arguments are handed to MiGA::DistanceRunner.new in the order
# they are given (project path first, then dataset name). Every remaining
# argument is split at its first "=" into a symbol key and a string value.
project_path = ARGV.shift
dataset_name = ARGV.shift
pairs = ARGV.map do |arg|
  fields = arg.split("=", 2)
  fields[0] = fields[0].to_sym
  fields
end
opts = Hash[pairs]
MiGA::DistanceRunner.new(project_path, dataset_name, opts).go!
@@ -0,0 +1,233 @@
1
+ # Recruitment plots
2
+
3
+ ## Aims
4
+
5
+ This document aims to cover the technical aspects of the recruitment plot functions in the
6
+ `enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
7
+
8
+ ## Caveats
9
+
10
+ This is a __*working document*__, describing unstable and/or experimental code. The material
11
+ here is subject to change without warning; pay attention to the modification date and (if
12
+ in doubt) the commit history. The definitions and default parameters of the functions described
13
+ here may change in the near future as a result of further experimentation or more stable
14
+ implementations.
15
+
16
+ The current document was generated and tested with the `enveomics.R` package version 1.3. To
17
+ check your current version in R, use `packageVersion('enveomics.R')`.
18
+
19
+ > **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
20
+ > Carefully evaluate all your results.
21
+
22
+ ---
23
+
24
+ ## Package: `enveomics.R`
25
+
26
+ The functionalities described here are provided by the `enveomics.R` package. Some features
27
+ described here are updated more frequently than the official
28
+ [CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
29
+ updates (package HEAD), download (or update), and install this git repository.
30
+
31
+ ### Quick installation guide
32
+
33
+ :globe_with_meridians: To install the latest stable version available in CRAN, use in R:
34
+
35
+ ```R
36
+ install.packages(c('enveomics.R','optparse'))
37
+ ```
38
+
39
+ :octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
40
+
41
+ ```R
42
+ install.packages('devtools')
43
+ library('devtools')
44
+ install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
45
+ ```
46
+
47
+ ---
48
+
49
+ ## Recruitment plots: `enve.recplot2`
50
+
51
+ The first step in this analysis is the mapping of reads to the genome, processed with
52
+ [BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
53
+ We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
54
+ prefix of the processed files.
55
+
56
+ Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
57
+ For this, you'll have two options.
58
+
59
+ ### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
60
+
61
+ The stand-alone script
62
+ [BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
63
+ is the easiest option to run, and should be the preferred method if you're automating
64
+ this analysis to process several mappings, but it doesn't offer access to advanced options.
65
+
66
+ You can run it like this using two CPUs:
67
+
68
+ ```bash
69
+ BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
70
+ ```
71
+
72
+ > **NOTE 1**: It's NOT recommended to map reads against genes; the recommended strategy is to
73
+ > map against contigs. However, if you did map reads against genes, you may want to use the
74
+ > `--pos-breaks 0` option to use each gene as a recruitment window.
75
+ >
76
+ > **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
77
+ > `--peaks-col darkred` option.
78
+
79
+ Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
80
+ object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
81
+
82
+ ### Option 2: Using the `enve.recplot2` R function
83
+
84
+ If you require access to advanced options, or for some other reason prefer to calculate the
85
+ recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
86
+ an example session in R:
87
+
88
+ ```R
89
+ # Load the package
90
+ library(enveomics.R)
91
+ # Open the PDF
92
+ pdf('my-recplot.pdf')
93
+ # Build and plot the object using two threads and no peak detection
94
+ # (to turn on peak detection, simply remove `peaks.col=NA`)
95
+ rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
96
+ # Close the PDF
97
+ dev.off()
98
+ # Save the object
99
+ save(rp, file='my-recplot.rdata')
100
+ ```
101
+
102
+ > **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
103
+ > before closing the R session.
104
+
105
+ Naturally, you may want to see what other (advanced) options you have. You can access the
106
+ documentation of the function in R using `?enve.recplot2`.
107
+
108
+ ---
109
+
110
+ ## Summary statistics
111
+
112
+ Here we explore some frequently used summary statistics from recruitment plots. First, load the
113
+ package and the `enve.RecPlot2` object you saved previously, in R:
114
+
115
+ ```R
116
+ library(enveomics.R)
117
+ load('my-recplot.rdata')
118
+ ```
119
+
120
+ ### Average and median sequencing depth
121
+
122
+ ```R
123
+ mean(enve.recplot2.seqdepth(rp)) # <- Average
124
+ median(enve.recplot2.seqdepth(rp)) # <- Median
125
+ ```
126
+
127
+ ### Average and median sequencing depth excluding zero-coverage windows
128
+
129
+ ```R
130
+ seqdepth <- enve.recplot2.seqdepth(rp)
131
+ mean(seqdepth[seqdepth>0]) # <- Average
132
+ median(seqdepth[seqdepth>0]) # <- Median
133
+ ```
134
+
135
+ ### Average Nucleotide Identity from reads (ANIr)
136
+
137
+ ```R
138
+ enve.recplot2.ANIr(rp) # <- Complete recruitment plot
139
+ enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
140
+ enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
141
+ enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
142
+ ```
143
+
144
+ ### Coordinates of each sequence window with their respective sequencing depth
145
+
146
+ ```R
147
+ d <- enve.recplot2.coordinates(rp)
148
+ d$seqdepth <- enve.recplot2.seqdepth(rp)
149
+ d
150
+ ```
151
+
152
+ ### Sequencing breadth (upper boundary)
153
+
154
+ This estimate depends on the window size. The smaller the window size, the better the
155
+ estimate. When the window size is 1bp, the estimate is exact; otherwise it's consistently
156
+ biased (overestimate).
157
+
158
+ ```R
159
+ mean(enve.recplot2.seqdepth(rp) > 0)
160
+ ```
161
+
162
+ ---
163
+
164
+ ## Peak-finder: `enve.recplot2.findPeaks`
165
+
166
+ In this step we will try to identify one or multiple population peaks corresponding to different
167
+ sub-populations and/or composites of sub-populations.
168
+
169
+ > **NOTE** This step can be performed together with the step above, but we separate it here for
170
+ > two reasons: **(1)** This step is much more unstable but less computationally demanding than the
171
+ > step before, so it makes sense to re-run only this part with different parameters and/or
172
+ > package updates; and **(2)** We want to save the R objects independently, so the following steps
173
+ > are clearer.
174
+
175
+ In R:
176
+
177
+ ```R
178
+ # Load the package
179
+ library(enveomics.R)
180
+ # Load the `enve.RecPlot2` object you saved previously
181
+ load('my-recplot.rdata')
182
+ # Find the peaks
183
+ peaks <- enve.recplot2.findPeaks(rp)
184
+ # Save the peaks R object (optional)
185
+ save(peaks, file='my-recplot-peaks.rdata')
186
+ # Plot the peaks in a PDF (optional)
187
+ pdf('my-recplot-peaks.pdf')
188
+ p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
189
+ dev.off()
190
+ ```
191
+
192
+ The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
193
+ the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
194
+ of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
195
+
196
+ ---
197
+
198
+ ## Gene-content diversity: `enve.recplot2.extractWindows`
199
+
200
+ In R:
201
+
202
+ ```R
203
+ # Load the package and the objects (unless you're still in the same session from the last step)
204
+ library(enveomics.R)
205
+ load('my-recplot.rdata')
206
+ load('my-recplot-peaks.rdata')
207
+ # Find the peak representing the core genome
208
+ cp <- enve.recplot2.corePeak(peaks)
209
+ #-----
210
+ # The following functions illustrate how to obtain different results. Please explore the resulting
211
+ # objects and the associated documentation
212
+ #-----
213
+ # Find the coordinates of windows significantly below the average sequencing depth
214
+ div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
215
+ # Add sequencing depth
216
+ div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
217
+ # Save the coordinates as a tab-delimited table
218
+ write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
219
+ # Find all the windows with sequencing depth zero
220
+ zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
221
+ ```
222
+
223
+ ---
224
+
225
+ ## To do
226
+
227
+ - [x] Document structure
228
+ - [x] Package: `enveomics.R`
229
+ - [x] Recruitment plots: `enve.recplot2`
230
+ - [x] Summary statistics
231
+ - [x] Peak-finder: `enve.recplot2.findPeaks`
232
+ - [x] Gene-content diversity: `enve.recplot2.extractWindows`
233
+ - [ ] Compare identity profiles: `enve.recplot2.compareIdentities`