miga-base 0.3.1.7 → 0.3.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/actions/ncbi_get.rb +8 -0
  3. data/lib/miga/common.rb +9 -215
  4. data/lib/miga/common/base.rb +49 -0
  5. data/lib/miga/common/format.rb +135 -0
  6. data/lib/miga/common/path.rb +49 -0
  7. data/lib/miga/daemon.rb +3 -60
  8. data/lib/miga/daemon/base.rb +69 -0
  9. data/lib/miga/dataset.rb +3 -3
  10. data/lib/miga/dataset/result.rb +5 -5
  11. data/lib/miga/result.rb +5 -0
  12. data/lib/miga/version.rb +7 -5
  13. data/scripts/distances.bash +2 -19
  14. data/scripts/taxonomy.bash +2 -21
  15. data/test/common_test.rb +9 -0
  16. data/utils/distance/base.rb +6 -0
  17. data/utils/distance/commands.rb +82 -0
  18. data/utils/distance/database.rb +86 -0
  19. data/utils/distance/pipeline.rb +98 -0
  20. data/utils/distance/runner.rb +104 -0
  21. data/utils/distance/temporal.rb +37 -0
  22. data/utils/distances.rb +9 -0
  23. data/utils/enveomics/Docs/recplot2.md +233 -0
  24. data/utils/enveomics/Makefile +1 -1
  25. data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
  26. data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
  27. data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
  28. data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
  29. data/utils/enveomics/Manifest/categories.json +11 -1
  30. data/utils/enveomics/Manifest/examples.json +2 -2
  31. data/utils/enveomics/README.md +2 -0
  32. data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
  33. data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
  34. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  35. data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
  36. data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
  37. data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
  38. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  39. data/utils/enveomics/Scripts/SRA.download.bash +28 -21
  40. data/utils/enveomics/Scripts/Table.barplot.R +1 -0
  41. data/utils/enveomics/Scripts/aai.rb +4 -2
  42. data/utils/enveomics/build_enveomics_r.bash +5 -5
  43. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  44. data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
  45. data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
  46. data/utils/enveomics/enveomics.R/README.md +26 -17
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
  50. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
  51. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
  52. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
  53. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
  54. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
  55. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
  56. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
  57. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
  58. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
  59. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
  60. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
  61. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
  64. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
  65. data/utils/requirements.txt +1 -1
  66. metadata +28 -4
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
  68. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -0,0 +1,86 @@
1
+
2
+ require 'sqlite3'
3
+
4
module MiGA::DistanceRunner::Database
  # Check for corrupt files and create empty databases.
  #
  # For each metric (+:haai+, +:aai+, +:ani+) the project-side database is
  # located with #ref_db (if +for_ref+ is true) or #query_db (otherwise),
  # removed if it cannot be read, created with an empty table if it is missing
  # or empty, and finally copied to a temporal file. Note that +:haai+ values
  # live in a table (and column) called +aai+.
  def initialize_dbs!(for_ref)
    @dbs = {}
    @tmp_dbs = {}
    @db_counts = {}
    { haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
      @db_counts[m] = 0
      @dbs[m] = for_ref ? ref_db(m) : query_db(m)
      # Remove if corrupt. A missing table raises SQLException, but a damaged
      # or non-SQLite file raises CorruptException or NotADatabaseException,
      # which are siblings (not subclasses) of SQLException
      if File.size?(dbs[m])
        begin
          SQLite3::Database.new(dbs[m]) do |conn|
            conn.execute "select count(*) from #{t};"
          end
        rescue SQLite3::SQLException, SQLite3::CorruptException,
               SQLite3::NotADatabaseException
          FileUtils.rm dbs[m]
        end
      end
      # Initialize if it doesn't exist (or if it was just removed)
      unless File.size? dbs[m]
        SQLite3::Database.new(dbs[m]) do |conn|
          conn.execute "create table if not exists #{t}(" +
            "seq1 varchar(256), seq2 varchar(256), " +
            "#{t} float, sd float, n int, omega int" +
            ")"
        end
      end
      # Copy over to (local) temporals
      @tmp_dbs[m] = tmp_file("#{m}.db")
      FileUtils.cp(dbs[m], tmp_dbs[m])
    end
  end

  # Path to the database +metric+ for +dataset_name+ in +project+
  # (assumes that +dataset_name+ is a reference dataset). If +dataset_name+
  # is nil, the name of the current dataset is used. +metric+ must be one of
  # +:haai+, +:aai+, or +:ani+.
  def ref_db(metric, dataset_name=nil)
    dataset_name ||= dataset.name
    b = case metric
        when :haai
          "01.haai/#{dataset_name}.db"
        when :aai
          "02.aai/#{dataset_name}.db"
        when :ani
          "03.ani/#{dataset_name}.db"
        end
    File.expand_path(b, home)
  end

  # Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
  # query dataset)
  def query_db(metric)
    File.expand_path("#{dataset.name}.#{metric}.db", home)
  end

  # Get the stored +metric+ value against +target+. Returns nil if no value
  # is stored; a stored value of zero is treated as missing.
  def stored_value(target, metric)
    # Check if self.dataset -> target is done (previous run)
    y = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
    return y unless y.nil? || y.zero?
    # Check if self.dataset <- target is done (another thread)
    if dataset.is_ref? && project.path == ref_project.path
      y = value_from_db(
        target.name, dataset.name, ref_db(metric, target.name), metric)
      return y unless y.nil? || y.zero?
    end
    nil
  end

  # Get the value of +metric+ in the +db+ database between +n1+ and +n2+.
  # Returns nil if +db+ is missing or empty, or if the pair is not stored.
  def value_from_db(n1, n2, db, metric)
    y = nil
    # hAAI values are stored in the +aai+ table, as set in #initialize_dbs!
    table = metric == :haai ? :aai : metric
    if File.size? db
      SQLite3::Database.new(db) do |conn|
        y = conn.execute(
          "select #{table} from #{table} where seq1=? and seq2=?", [n1, n2]
        ).first
        y = y.first unless y.nil?
      end
    end
    y
  end

  # Iterates for each entry in +db+, passing each row of the +metric+ table
  # to the block
  def foreach_in_db(db, metric, &blk)
    # hAAI values are stored in the +aai+ table, as set in #initialize_dbs!
    table = metric == :haai ? :aai : metric
    SQLite3::Database.new(db) do |conn|
      conn.execute("select * from #{table}").each { |r| blk[r] }
    end
  end
end
@@ -0,0 +1,98 @@
1
+
2
+ # High-end pipelines for DistanceRunner
3
module MiGA::DistanceRunner::Pipeline

  # Recursively classify the dataset, returning an Array with two entries:
  # classification and cluster number.
  #
  # At each level, the medoids listed in +miga-project.medoids+ (under
  # +classif+ in +clades+) are compared against the dataset using the method
  # named +metric+, and the recursion continues into the sub-clade of the
  # closest one. One tab-delimited line per level is written to +result_fh+.
  def classify(clades, classif, metric, result_fh, val_cls=nil)
    dir = File.expand_path(classif, clades)
    med = File.expand_path("miga-project.medoids", dir)
    return [classif, val_cls] unless File.size? med
    max_val = 0
    val_med = ""
    val_cls = nil
    i_n = 0
    File.open(med, "r") do |med_fh|
      med_fh.each_line do |med_ln|
        i_n += 1
        med_ln.chomp!
        val = send(metric, ref_project.dataset(med_ln))
        if !val.nil? && val >= max_val
          max_val = val
          val_med = med_ln
          val_cls = i_n
          puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
        end
      end
    end
    # NOTE(review): File.expand_path resolves a relative +classif+ (such as
    # ".") against the current working directory, not against +clades+, so
    # this looks like it expects to run from within +clades+ -- confirm
    classif = File.expand_path("miga-project.sc-#{val_cls}", classif)
    result_fh.puts [val_cls, val_med, max_val, classif].join("\t")
    classify(clades, classif, metric, result_fh, val_cls)
  end

  # Builds a tree with all visited medoids from any classification level.
  # Does nothing if the query database of +metric+ is missing or empty, or if
  # the reference project has no +metric+ distances result.
  def build_medoids_tree(metric)
    db = query_db(metric)
    return unless File.size? db
    # Resolve the reference result before creating any file, so an early
    # return cannot leave an open handle or a partial matrix behind
    ref_r = ref_project.result("#{metric}_distances")
    return unless ref_r
    out_base = File.expand_path(dataset.name, home)
    ds_matrix = "#{out_base}.txt"
    File.open(ds_matrix, "w") do |ds_matrix_fh|
      ds_matrix_fh.puts %w[a b value].join("\t")
      # Find all values in the database. Rows are (seq1, seq2, value, ...)
      # with the current dataset looked up as seq1, so the visited datasets
      # are in the second column
      seq2 = []
      foreach_in_db(db, metric) do |r|
        seq2 << r[1]
        ds_matrix_fh.puts r[0, 3].join("\t")
      end
      # Find all values among visited datasets in ref_project
      Zlib::GzipReader.open(ref_r.file_path(:matrix)) do |fh|
        fh.each_line do |ln|
          r = ln.chomp.split("\t")
          next unless seq2.include?(r[1]) || seq2.include?(r[2])
          ds_matrix_fh.puts r[1, 3].join("\t")
        end
      end
    end
    ref_tree = File.expand_path("utils/ref-tree.R", MiGA::MiGA.root_path)
    `"#{ref_tree}" "#{ds_matrix}" "#{out_base}" "#{dataset.name}"`
    File.unlink ds_matrix
  end

  # Tests taxonomy against the closest relative of the dataset. Returns an
  # Array of rows (rank, taxonomy, p-value, significance marks), or nil if no
  # closest relative is available. The rows are also saved as a table in
  # +home+.
  def tax_test
    # Get taxonomy of closest relative
    from_ref_project = (project != ref_project)
    res_dir = from_ref_project ?
      File.expand_path("data/09.distances/05.taxonomy", project.path) : home
    Dir.mkdir res_dir unless Dir.exist? res_dir
    File.open(File.expand_path("#{dataset.name}.done", res_dir), "w") do |fh|
      fh.puts Time.now.to_s
    end
    dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
    cr = dataset.closest_relatives(1, from_ref_project)
    return if cr.nil? || cr.empty?
    tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
    # Run the test for each rank
    r = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax).map do |k, v|
      # One asterisk per significance threshold above the p-value
      sig = "*" * [0.5, 0.1, 0.05, 0.01].count { |i| v < i }
      [MiGA::Taxonomy.LONG_RANKS[k], (tax[k] || "?"), v, sig]
    end
    # Save test
    File.open(File.expand_path("#{dataset.name}.intax.txt", home), "w") do |fh|
      fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
      fh.puts ""
      fh.puts "Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01."
    end
    r
  end

  # Transfer the taxonomy to the current dataset. +tax+ is the Array returned
  # by #tax_test; only ranks with known taxonomy and p-value not greater than
  # the project's +:tax_pvalue+ (0.05 by default) are transferred. Does
  # nothing if +tax+ is nil (i.e., #tax_test found no closest relative).
  def transfer_taxonomy(tax)
    return if tax.nil?
    pval = (project.metadata[:tax_pvalue] || 0.05).to_f
    tax_a = tax.
      select { |i| i[1] != "?" && i[2] <= pval }.
      map { |i| i[0, 2].join(":") }
    dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
    dataset.save
  end
end
@@ -0,0 +1,104 @@
1
+
2
+ require_relative 'base.rb'
3
+ require_relative 'temporal.rb'
4
+ require_relative 'database.rb'
5
+ require_relative 'commands.rb'
6
+ require_relative 'pipeline.rb'
7
+
8
+
9
# Runs the distance (hAAI/AAI/ANI) and taxonomy analyses of one dataset
# against the datasets in its project or in a reference project.
class MiGA::DistanceRunner

  include MiGA::DistanceRunner::Temporal
  include MiGA::DistanceRunner::Database
  include MiGA::DistanceRunner::Commands
  include MiGA::DistanceRunner::Pipeline

  attr_reader :project, :ref_project, :dataset, :opts, :home
  attr_reader :tmp, :tmp_dbs, :dbs, :db_counts

  # Initialize the runner for the dataset +dataset_name+ in the project at
  # +project_path+. Raises a RuntimeError if the project cannot be loaded.
  # Values missing in +opts_hash+ are taken from the environment
  # (+MIGA_AAI_SAVE_RBM+, +CORES+) or from the reference project's metadata.
  def initialize(project_path, dataset_name, opts_hash={})
    @opts = opts_hash
    @project = MiGA::Project.load(project_path) or
      raise "No project at #{project_path}"
    @dataset = project.dataset(dataset_name)
    @home = File.expand_path("data/09.distances", project.path)
    # Default opts
    @opts[:aai_save_rbm] ||= ENV.fetch("MIGA_AAI_SAVE_RBM") do
      project.is_clade? ? "save-rbm" : "no-save-rbm"
    end
    @opts[:thr] ||= ENV.fetch("CORES"){ 2 }.to_i
    if opts[:run_taxonomy] && project.metadata[:ref_project]
      @ref_project = MiGA::Project.load(project.metadata[:ref_project])
    end
    # Without a (loadable) reference project, compare within the project
    @ref_project ||= project
    [:haai_p, :aai_p, :ani_p, :distances_checkpoint].each do |m|
      @opts[m] ||= ref_project.metadata[m]
    end
    @opts[:distances_checkpoint] ||= 10
    @opts[:distances_checkpoint] = @opts[:distances_checkpoint].to_i
  end

  # Launch the appropriate analysis inside a temporal directory that is
  # removed at the end. Does nothing for multi datasets.
  def go!
    return if dataset.is_multi?
    Dir.mktmpdir do |tmp_dir|
      @tmp = tmp_dir
      create_temporals
      if opts[:run_taxonomy]
        go_taxonomy!
      elsif dataset.is_ref?
        go_ref!
      else
        go_query!
      end
    end
  end

  # Launch analysis for reference datasets
  def go_ref!
    # Initialize databases
    initialize_dbs! true
    # first-come-first-serve traverse
    ref_project.each_dataset do |ds|
      next if !ds.is_ref? || ds.is_multi? || ds.result(:essential_genes).nil?
      puts "[ #{Time.now} ] #{ds.name}"
      aai_val = aai(ds)
      # ANI is only estimated for pairs with AAI of 90% or above
      ani(ds) unless aai_val.nil? || aai_val < 90.0
    end
    # Finalize
    [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
  end

  # Launch analysis for query datasets. Does nothing if the reference project
  # doesn't have the clustering result yet.
  def go_query!
    # Check if project is ready: v is [clustering result, metric]
    v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
    res = ref_project.result(v[0])
    return if res.nil?
    # Initialize the databases
    initialize_dbs! false
    # Calculate the classification-informed AAI/ANI traverse
    results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
    classif, val_cls = File.open(results, "w") do |fh|
      classify(res.dir, ".", v[1], fh)
    end
    # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
    par_dir = File.dirname(File.expand_path(classif, res.dir))
    par = File.expand_path("miga-project.classif", par_dir)
    if File.size? par
      File.open(par, "r") do |par_fh|
        par_fh.each_line do |ln|
          r = ln.chomp.split("\t")
          next unless r[1].to_i == val_cls
          target = ref_project.dataset(r[0])
          # When the metric is ANI, AAI is not estimated and ANI always runs
          aai_val = (v[1] == :aai) ? aai(target) : 100.0
          ani(target) unless aai_val.nil? || aai_val < 90.0
        end
      end
    end
    # Finalize
    [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
    build_medoids_tree(v[1])
    # tax_test returns nil when there are no closest relatives
    tax = tax_test
    transfer_taxonomy(tax) unless tax.nil?
  end

  # Launch analysis for taxonomy jobs. Does nothing if the project has no
  # +:ref_project+ in its metadata.
  def go_taxonomy!
    return unless project.metadata[:ref_project]
    go_query! # <- yeah, it's actually the same, just different ref_project
  end
end
@@ -0,0 +1,37 @@
1
+
2
+ require 'tmpdir'
3
+
4
module MiGA::DistanceRunner::Temporal

  # Copy the input files of the dataset (essential genes, proteins, and large
  # contigs) to the (local) temporal folder, skipping those not available
  def create_temporals
    { essential_genes: :ess_genes, cds: :proteins, assembly: :largecontigs }.
      each do |res, file|
        result = dataset.result(res)
        next if result.nil?
        source = result.file_path(file)
        next if source.nil?
        FileUtils.cp(source, tmp_file("#{file}.fa"))
      end
  end

  # Path to the temporal file of the dataset with extension +ext+
  def tmp_file(ext)
    basename = "#{dataset.name}.#{ext}"
    File.expand_path(basename, tmp)
  end

  # Registers one newly stored +metric+ value, and copies the temporal
  # database back to the MiGA Project once the number of values stored without
  # copying reaches +@opts[:distances_checkpoint]+ (10 by default)
  def checkpoint(metric)
    @db_counts[metric] += 1
    return if db_counts[metric] < @opts[:distances_checkpoint]
    checkpoint! metric
  end

  # Copies the temporal database of +metric+ back to the MiGA Project and
  # resets the counter of pending values
  def checkpoint!(metric)
    # hAAI values are stored in the +aai+ table
    table = (metric == :haai) ? :aai : metric
    # Query the temporal database before copying it over
    SQLite3::Database.new(tmp_dbs[metric]) do |conn|
      conn.execute("select count(*) from #{table}")
    end
    FileUtils.cp(tmp_dbs[metric], dbs[metric])
    @db_counts[metric] = 0
  end
end
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby

# Usage: distances.rb <project_path> <dataset_name> [key=value ...]
#
# The first two arguments are passed positionally to
# MiGA::DistanceRunner.new(project_path, dataset_name, opts). Any remaining
# arguments must be in the form key=value, and are passed as options with
# Symbol keys and String values.

require_relative 'distance/runner.rb'

project_path = ARGV.shift
dataset_name = ARGV.shift
opts = Hash[
  ARGV.map { |arg| arg.split("=", 2).tap { |pair| pair[0] = pair[0].to_sym } }
]
MiGA::DistanceRunner.new(project_path, dataset_name, opts).go!
@@ -0,0 +1,233 @@
1
+ # Recruitment plots
2
+
3
+ ## Aims
4
+
5
+ This document aims to cover the technical aspects of the recruitment plot functions in the
6
+ `enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
7
+
8
+ ## Caveats
9
+
10
+ This is a __*working document*__, describing unstable and/or experimental code. The material
11
+ here is subject to change without warning; pay attention to the modification date and (if
12
+ in doubt) the commit history. The definitions and default parameters of the functions described
13
+ here may change in the near future as a result of further experimentation or more stable
14
+ implementations.
15
+
16
+ The current document was generated and tested with the `enveomics.R` package version 1.3. To
17
+ check your current version in R, use `packageVersion('enveomics.R')`.
18
+
19
+ > **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
20
+ > Carefully evaluate all your results.
21
+
22
+ ---
23
+
24
+ ## Package: `enveomics.R`
25
+
26
+ The functionalities described here are provided by the `enveomics.R` package. Some features
27
+ described here are updated more frequently than the official
28
+ [CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
29
+ updates (package HEAD), download (or update), and install this git repository.
30
+
31
+ ### Quick installation guide
32
+
33
+ :globe_with_meridians: To install the latest stable version available in CRAN, use in R:
34
+
35
+ ```R
36
+ install.packages(c('enveomics.R','optparse'))
37
+ ```
38
+
39
+ :octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
40
+
41
+ ```R
42
+ install.packages('devtools')
43
+ library('devtools')
44
+ install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
45
+ ```
46
+
47
+ ---
48
+
49
+ ## Recruitment plots: `enve.recplot2`
50
+
51
+ The first step in this analysis is the mapping of reads to the genome, processed with
52
+ [BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
53
+ We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
54
+ prefix of the processed files.
55
+
56
+ Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
57
+ For this, you'll have two options.
58
+
59
+ ### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
60
+
61
+ The stand-alone script
62
+ [BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
63
+ is the easiest option to run, and should be the preferred method if you're automating
64
+ this analysis to process several mappings, but it doesn't offer access to advanced options.
65
+
66
+ You can run it like this using two CPUs:
67
+
68
+ ```bash
69
+ BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
70
+ ```
71
+
72
+ > **NOTE 1**: It's NOT recommended to map reads against genes; the recommended strategy is to
73
+ > map against contigs. However, if you did map reads against genes, you may want to use the
74
+ > `--pos-breaks 0` option to use each gene as a recruitment window.
75
+ >
76
+ > **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
77
+ > `--peaks-col darkred` option.
78
+
79
+ Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
80
+ object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
81
+
82
+ ### Option 2: Using the `enve.recplot2` R function
83
+
84
+ If you require access to advanced options, or for some other reason prefer to calculate the
85
+ recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
86
+ an example session in R:
87
+
88
+ ```R
89
+ # Load the package
90
+ library(enveomics.R)
91
+ # Open the PDF
92
+ pdf('my-recplot.pdf')
93
+ # Build and plot the object using two threads and no peak detection
94
+ # (to turn on peak detection, simply remove `peaks.col=NA`)
95
+ rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
96
+ # Close the PDF
97
+ dev.off()
98
+ # Save the object
99
+ save(rp, file='my-recplot.rdata')
100
+ ```
101
+
102
+ > **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
103
+ > before closing the R session.
104
+
105
+ Naturally, you may want to see what other (advanced) options you have. You can access the
106
+ documentation of the function in R using `?enve.recplot2`.
107
+
108
+ ---
109
+
110
+ ## Summary statistics
111
+
112
+ Here we explore some frequently used summary statistics from recruitment plots. First, load the
113
+ package and the `enve.RecPlot2` object you saved previously, in R:
114
+
115
+ ```R
116
+ library(enveomics.R)
117
+ load('my-recplot.rdata')
118
+ ```
119
+
120
+ ### Average and median sequencing depth
121
+
122
+ ```R
123
+ mean(enve.recplot2.seqdepth(rp)) # <- Average
124
+ median(enve.recplot2.seqdepth(rp)) # <- Median
125
+ ```
126
+
127
+ ### Average and median sequencing depth excluding zero-coverage windows
128
+
129
+ ```R
130
+ seqdepth <- enve.recplot2.seqdepth(rp)
131
+ mean(seqdepth[seqdepth>0]) # <- Average
132
+ median(seqdepth[seqdepth>0]) # <- Median
133
+ ```
134
+
135
+ ### Average Nucleotide Identity from reads (ANIr)
136
+
137
+ ```R
138
+ enve.recplot2.ANIr(rp) # <- Complete recruitment plot
139
+ enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
140
+ enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
141
+ enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
142
+ ```
143
+
144
+ ### Coordinates of each sequence window with their respective sequencing depth
145
+
146
+ ```R
147
+ d <- enve.recplot2.coordinates(rp)
148
+ d$seqdepth <- enve.recplot2.seqdepth(rp)
149
+ d
150
+ ```
151
+
152
+ ### Sequencing breadth (upper boundary)
153
+
154
+ This estimate depends on the window size. The smaller the window size, the better the
155
+ estimate. When the window size is 1bp, the estimate is exact; otherwise it's consistently
156
+ biased (overestimate).
157
+
158
+ ```R
159
+ mean(enve.recplot2.seqdepth(rp) > 0)
160
+ ```
161
+
162
+ ---
163
+
164
+ ## Peak-finder: `enve.recplot2.findPeaks`
165
+
166
+ In this step we will try to identify one or multiple population peaks corresponding to different
167
+ sub-populations and/or composites of sub-populations.
168
+
169
+ > **NOTE** This step can be performed together with the step above, but we separate it here for
170
+ > two reasons: **(1)** This step is much more unstable but less computationally demanding than the
171
+ > step before, so it makes sense to re-run only this part with different parameters and/or
172
+ > package updates; and **(2)** We want to save the R objects independently, so the following steps
173
+ > are clearer.
174
+
175
+ In R:
176
+
177
+ ```R
178
+ # Load the package
179
+ library(enveomics.R)
180
+ # Load the `enve.RecPlot2` object you saved previously
181
+ load('my-recplot.rdata')
182
+ # Find the peaks
183
+ peaks <- enve.recplot2.findPeaks(rp)
184
+ # Save the peaks R object (optional)
185
+ save(peaks, file='my-recplot-peaks.rdata')
186
+ # Plot the peaks in a PDF (optional)
187
+ pdf('my-recplot-peaks.pdf')
188
+ p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
189
+ dev.off()
190
+ ```
191
+
192
+ The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
193
+ the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
194
+ of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
195
+
196
+ ---
197
+
198
+ ## Gene-content diversity: `enve.recplot2.extractWindows`
199
+
200
+ In R:
201
+
202
+ ```R
203
+ # Load the package and the objects (unless you're still in the same session from the last step)
204
+ library(enveomics.R)
205
+ load('my-recplot.rdata')
206
+ load('my-recplot-peaks.rdata')
207
+ # Find the peak representing the core genome
208
+ cp <- enve.recplot2.corePeak(peaks)
209
+ #-----
210
+ # The following functions illustrate how to obtain different results. Please explore the resulting
211
+ # objects and the associated documentation
212
+ #-----
213
+ # Find the coordinates of windows significantly below the average sequencing depth
214
+ div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
215
+ # Add sequencing depth
216
+ div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
217
+ # Save the coordinates as a tab-delimited table
218
+ write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
219
+ # Find all the windows with sequencing depth zero
220
+ zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
221
+ ```
222
+
223
+ ---
224
+
225
+ ## To do
226
+
227
+ - [x] Document structure
228
+ - [x] Package: `enveomics.R`
229
+ - [x] Recruitment plots: `enve.recplot2`
230
+ - [x] Summary statistics
231
+ - [x] Peak-finder: `enve.recplot2.findPeaks`
232
+ - [x] Gene-content diversity: `enve.recplot2.extractWindows`
233
+ - [ ] Compare identity profiles: `enve.recplot2.compareIdentities`