miga-base 0.3.1.7 → 0.3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +8 -0
- data/lib/miga/common.rb +9 -215
- data/lib/miga/common/base.rb +49 -0
- data/lib/miga/common/format.rb +135 -0
- data/lib/miga/common/path.rb +49 -0
- data/lib/miga/daemon.rb +3 -60
- data/lib/miga/daemon/base.rb +69 -0
- data/lib/miga/dataset.rb +3 -3
- data/lib/miga/dataset/result.rb +5 -5
- data/lib/miga/result.rb +5 -0
- data/lib/miga/version.rb +7 -5
- data/scripts/distances.bash +2 -19
- data/scripts/taxonomy.bash +2 -21
- data/test/common_test.rb +9 -0
- data/utils/distance/base.rb +6 -0
- data/utils/distance/commands.rb +82 -0
- data/utils/distance/database.rb +86 -0
- data/utils/distance/pipeline.rb +98 -0
- data/utils/distance/runner.rb +104 -0
- data/utils/distance/temporal.rb +37 -0
- data/utils/distances.rb +9 -0
- data/utils/enveomics/Docs/recplot2.md +233 -0
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
- data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
- data/utils/enveomics/Manifest/categories.json +11 -1
- data/utils/enveomics/Manifest/examples.json +2 -2
- data/utils/enveomics/README.md +2 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
- data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
- data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/SRA.download.bash +28 -21
- data/utils/enveomics/Scripts/Table.barplot.R +1 -0
- data/utils/enveomics/Scripts/aai.rb +4 -2
- data/utils/enveomics/build_enveomics_r.bash +5 -5
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
- data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
- data/utils/enveomics/enveomics.R/README.md +26 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
- data/utils/requirements.txt +1 -1
- metadata +28 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -0,0 +1,86 @@
|
|
1
|
+
|
2
|
+
require 'sqlite3'
|
3
|
+
|
4
|
+
module MiGA::DistanceRunner::Database
|
5
|
+
# Check for corrupt files and create empty databases.
#
# For each metric (+:haai+, +:aai+, +:ani+) this resets the counter of
# pending values, resolves the project-side database (reference layout if
# +for_ref+ is true, query layout otherwise), removes it if it cannot be
# read, creates the table if no non-empty file is left, and finally copies
# the database over to the (local) temporal folder. Note that hAAI values
# are stored in a table named +aai+.
def initialize_dbs!(for_ref)
  @dbs = {}
  @tmp_dbs = {}
  @db_counts = {}
  { haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
    @db_counts[m] = 0
    @dbs[m] = for_ref ? ref_db(m) : query_db(m)
    # Remove if corrupt
    if File.size?(dbs[m])
      begin
        SQLite3::Database.new(dbs[m]) do |conn|
          conn.execute "select count(*) from #{t};"
        end
      rescue SQLite3::SQLException, SQLite3::NotADatabaseException,
             SQLite3::CorruptException
        # A missing table raises SQLException, but a truncated or garbage
        # file raises NotADatabaseException/CorruptException instead; all
        # of them mean the file must be rebuilt from scratch.
        FileUtils.rm dbs[m]
      end
    end
    # Initialize if it doesn't exist
    unless File.size? dbs[m]
      SQLite3::Database.new(dbs[m]) do |conn|
        conn.execute "create table if not exists #{t}(" +
          "seq1 varchar(256), seq2 varchar(256), " +
          "#{t} float, sd float, n int, omega int" +
          ")"
      end
    end
    # Copy over to (local) temporals
    @tmp_dbs[m] = tmp_file("#{m}.db")
    FileUtils.cp(dbs[m], tmp_dbs[m])
  end
end
|
35
|
+
|
36
|
+
# Location of the +metric+ database (+:haai+, +:aai+, or +:ani+) belonging
# to the reference dataset +dataset_name+. When +dataset_name+ is omitted,
# the name of the current dataset is used. The path is resolved against
# +home+.
def ref_db(metric, dataset_name=nil)
  dataset_name ||= dataset.name
  folder = { haai: "01.haai", aai: "02.aai", ani: "03.ani" }[metric]
  # Unknown metrics yield nil here, exactly like a case without else
  b = folder.nil? ? nil : "#{folder}/#{dataset_name}.db"
  File.expand_path(b, home)
end
|
50
|
+
|
51
|
+
# Location of the +metric+ database of the current +dataset+, using the
# layout for query datasets: a single file named after the dataset and the
# metric, resolved against +home+.
def query_db(metric)
  file_name = "#{dataset.name}.#{metric}.db"
  File.expand_path(file_name, home)
end
|
56
|
+
|
57
|
+
# Look up a previously stored +metric+ value between the current dataset
# and +target+. Returns the value, or nil when nothing usable (nil or zero)
# is found.
def stored_value(target, metric)
  # Direction self.dataset -> target (left by a previous run)
  forward = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
  return forward unless forward.nil? || forward.zero?
  # Direction self.dataset <- target (possibly written by another thread);
  # only consulted for reference datasets within the same project
  return nil unless dataset.is_ref? && project.path == ref_project.path
  reverse = value_from_db(target.name, dataset.name,
                          ref_db(metric, target.name), metric)
  reverse.nil? || reverse.zero? ? nil : reverse
end
|
69
|
+
|
70
|
+
# Get the value of +metric+ in the +db+ database between +n1+ and +n2+.
#
# Returns nil if +db+ is missing or empty, or if the pair is not stored.
# hAAI values (+metric+ == +:haai+) are read from the +aai+ table, which is
# the table name used when the hAAI database is created.
def value_from_db(n1, n2, db, metric)
  return nil unless File.size? db
  table = metric == :haai ? :aai : metric
  row = nil
  SQLite3::Database.new(db) do |conn|
    row = conn.execute(
      "select #{table} from #{table} where seq1=? and seq2=?", [n1, n2]
    ).first
  end
  row.nil? ? nil : row.first
end
|
79
|
+
|
80
|
+
# Iterates for each entry in +db+, passing every row of the +metric+ table
# to the block. hAAI values (+metric+ == +:haai+) are read from the +aai+
# table, which is the table name used when the hAAI database is created.
def foreach_in_db(db, metric, &blk)
  table = metric == :haai ? :aai : metric
  SQLite3::Database.new(db) do |conn|
    conn.execute("select * from #{table}").each { |r| blk[r] }
  end
end
|
86
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
|
2
|
+
# High-end pipelines for DistanceRunner
|
3
|
+
module MiGA::DistanceRunner::Pipeline
|
4
|
+
|
5
|
+
# Recursively classify the dataset, returning an Array with two entries:
# classification and cluster number.
#
# +clades+ is the base folder and +classif+ the classification reached so
# far, as a path relative to +clades+. At each level the medoids listed in
# +miga-project.medoids+ are compared against the dataset by calling the
# method named +metric+, the best hit is reported to +result_fh+ (one
# tab-delimited line per level), and the search descends into the
# corresponding +miga-project.sc-N+ folder until no medoids file is found.
def classify(clades, classif, metric, result_fh, val_cls=nil)
  dir = File.expand_path(classif, clades)
  med = File.expand_path("miga-project.medoids", dir)
  return [classif, val_cls] unless File.size? med
  max_val = 0
  val_med = ""
  val_cls = nil
  File.foreach(med).with_index(1) do |med_ln, i_n|
    name = med_ln.chomp
    val = send(metric, ref_project.dataset(name))
    next if val.nil? || val < max_val
    max_val = val
    val_med = name
    val_cls = i_n
    puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
  end
  # Keep the classification relative to +clades+: expanding it here would
  # anchor it at the working directory, and the next level (as well as the
  # caller) would stop resolving it under +clades+.
  classif = File.join(classif, "miga-project.sc-#{val_cls}")
  result_fh.puts [val_cls, val_med, max_val, classif].join("\t")
  classify(clades, classif, metric, result_fh, val_cls)
end
|
32
|
+
|
33
|
+
# Builds a tree with all visited medoids from any classification level.
#
# Writes a temporary tab-delimited matrix (+a+, +b+, +value+) with every
# entry of the query database for +metric+, plus every entry of the
# reference project's +metric+ distances matrix that involves at least one
# of the visited datasets, and passes it to +utils/ref-tree.R+. Does nothing
# if the query database is missing or empty.
def build_medoids_tree(metric)
  db = query_db(metric)
  return unless File.size? db
  out_base = File.expand_path(dataset.name, home)
  ds_matrix = "#{out_base}.txt"
  # The block form guarantees the handle is closed, including on the early
  # return below
  File.open(ds_matrix, "w") do |ds_matrix_fh|
    ds_matrix_fh.puts %w[a b value].join("\t")
    # Find all values in the database
    visited = {}
    foreach_in_db(db, metric) do |r|
      # Column 0 (seq1) is the query itself; column 1 (seq2) is the dataset
      # it was compared against, i.e., the visited one
      visited[r[1]] = true
      ds_matrix_fh.puts r[0,3].join("\t")
    end
    # Find all values among visited datasets in ref_project
    ref_r = ref_project.result("#{metric}_distances") or return
    Zlib::GzipReader.open(ref_r.file_path(:matrix)) do |fh|
      fh.each_line do |ln|
        r = ln.chomp.split("\t")
        next unless visited.key?(r[1]) or visited.key?(r[2])
        ds_matrix_fh.puts r[1,3].join("\t")
      end
    end
  end
  ref_tree = File.expand_path("utils/ref-tree.R", MiGA::MiGA.root_path)
  `"#{ref_tree}" "#{ds_matrix}" "#{out_base}" "#{dataset.name}"`
  File.unlink ds_matrix
end
|
61
|
+
|
62
|
+
# Tests taxonomy: marks the analysis as done, registers the result, and
# evaluates the taxonomy of the closest relative at each rank.
#
# Returns an Array with one entry per rank (rank name, taxon or "?",
# p-value, and significance stars), which is also saved as a table in
# +home+. Returns nil if no closest relative is available.
def tax_test
  # Get taxonomy of closest relative
  from_ref_project = (project != ref_project)
  if from_ref_project
    res_dir = File.expand_path("data/09.distances/05.taxonomy", project.path)
  else
    res_dir = home
  end
  Dir.mkdir res_dir unless Dir.exist? res_dir
  done_file = File.expand_path("#{dataset.name}.done", res_dir)
  File.open(done_file, "w") { |fh| fh.puts Time.now.to_s }
  dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
  cr = dataset.closest_relatives(1, from_ref_project)
  return if cr.nil? or cr.empty?
  closest_name, closest_value = cr[0][0], cr[0][1]
  tax = ref_project.dataset(closest_name).metadata[:tax] || {}
  # Run the test for each rank
  thresholds = [0.5, 0.1, 0.05, 0.01]
  r = MiGA::TaxDist.aai_pvalues(closest_value, :intax).map do |k, v|
    # One star per threshold that the p-value falls below
    sig = "*" * thresholds.count { |i| v < i }
    [MiGA::Taxonomy.LONG_RANKS[k], (tax[k] || "?"), v, sig]
  end
  # Save test
  intax_file = File.expand_path("#{dataset.name}.intax.txt", home)
  File.open(intax_file, "w") do |fh|
    fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
    fh.puts ""
    fh.puts "Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01."
  end
  r
end
|
90
|
+
|
91
|
+
# Transfer the taxonomy to the current dataset.
#
# +tax+ is an Array of entries where position 0 is the rank, position 1 the
# taxon ("?" if unknown), and position 2 the p-value. Only entries with a
# known taxon and a p-value at or below the project's +:tax_pvalue+ (0.05
# by default) are transferred. Does nothing if +tax+ is nil, which is what
# the taxonomy test returns when there is no closest relative.
def transfer_taxonomy(tax)
  return if tax.nil?
  pval = (project.metadata[:tax_pvalue] || 0.05).to_f
  tax_a = tax.select { |i| i[1] != "?" && i[2] <= pval }
             .map { |i| i[0,2].join(":") }
  dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
  dataset.save
end
|
98
|
+
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
|
2
|
+
require_relative 'base.rb'
|
3
|
+
require_relative 'temporal.rb'
|
4
|
+
require_relative 'database.rb'
|
5
|
+
require_relative 'commands.rb'
|
6
|
+
require_relative 'pipeline.rb'
|
7
|
+
|
8
|
+
|
9
|
+
class MiGA::DistanceRunner
|
10
|
+
|
11
|
+
include MiGA::DistanceRunner::Temporal
|
12
|
+
include MiGA::DistanceRunner::Database
|
13
|
+
include MiGA::DistanceRunner::Commands
|
14
|
+
include MiGA::DistanceRunner::Pipeline
|
15
|
+
|
16
|
+
attr_reader :project, :ref_project, :dataset, :opts, :home
|
17
|
+
attr_reader :tmp, :tmp_dbs, :dbs, :db_counts
|
18
|
+
|
19
|
+
# Set up a runner for the dataset +dataset_name+ in the project located at
# +project_path+. +opts_hash+ is stored as the runner's options (the same
# Hash object, which is completed in place with defaults). Raises an error
# if the project cannot be loaded.
def initialize(project_path, dataset_name, opts_hash={})
  @opts = opts_hash
  @project = MiGA::Project.load(project_path)
  raise "No project at #{project_path}" unless @project
  @dataset = project.dataset(dataset_name)
  @home = File.expand_path("data/09.distances", project.path)
  # Default opts, taken from the environment when available
  @opts[:aai_save_rbm] ||= ENV.fetch("MIGA_AAI_SAVE_RBM") {
    project.is_clade? ? "save-rbm" : "no-save-rbm"
  }
  @opts[:thr] ||= ENV.fetch("CORES") { 2 }.to_i
  # Taxonomy runs may use a separate reference project; everything else
  # uses the project itself as reference
  ref_path = opts[:run_taxonomy] && project.metadata[:ref_project]
  @ref_project = MiGA::Project.load(ref_path) if ref_path
  @ref_project ||= project
  # Remaining options default to the reference project's metadata
  [:haai_p, :aai_p, :ani_p, :distances_checkpoint].each do |key|
    @opts[key] ||= ref_project.metadata[key]
  end
  @opts[:distances_checkpoint] = (@opts[:distances_checkpoint] || 10).to_i
end
|
40
|
+
|
41
|
+
# Launch the appropriate analysis inside a fresh temporal folder: taxonomy
# if +opts[:run_taxonomy]+ is set, otherwise the reference or query
# analysis depending on the dataset. Multi datasets are skipped.
def go!
  return if dataset.is_multi?
  Dir.mktmpdir do |tmp_dir|
    @tmp = tmp_dir
    create_temporals
    if opts[:run_taxonomy]
      go_taxonomy!
    elsif dataset.is_ref?
      go_ref!
    else
      go_query!
    end
  end
end
|
50
|
+
|
51
|
+
# Launch analysis for reference datasets: compare the dataset against every
# eligible dataset of the reference project, then flush pending values.
def go_ref!
  # Initialize databases
  initialize_dbs! true
  # first-come-first-serve traverse
  ref_project.each_dataset do |ds|
    # Only single reference datasets with essential genes are compared
    next unless ds.is_ref?
    next if ds.is_multi?
    next if ds.result(:essential_genes).nil?
    puts "[ #{Time.now} ] #{ds.name}"
    identity = aai(ds)
    # ANI is only attempted when AAI is available and at least 90
    ani(ds) unless identity.nil? || identity < 90.0
  end
  # Finalize
  [:haai, :aai, :ani].each do |m|
    checkpoint! m if db_counts[m] > 0
  end
end
|
65
|
+
|
66
|
+
# Launch analysis for query datasets.
#
# Classifies the dataset against the medoids of the reference project
# (using ANI for clade projects and AAI otherwise), compares it against all
# the members of the lowest subclade reached, flushes pending values, and
# then builds the medoids tree and transfers the taxonomy. Does nothing if
# the reference project doesn't have the required clustering result yet.
def go_query!
  # Check if project is ready
  v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
  res = ref_project.result(v[0])
  return if res.nil?
  # Initialize the databases
  initialize_dbs! false
  # Calculate the classification-informed AAI/ANI traverse; the block form
  # closes the report even if the classification raises
  results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
  classif, val_cls = File.open(results, "w") do |fh|
    classify(res.dir, ".", v[1], fh)
  end
  # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
  par_dir = File.dirname(File.expand_path(classif, res.dir))
  par = File.expand_path("miga-project.classif", par_dir)
  if File.size? par
    File.foreach(par) do |ln|
      r = ln.chomp.split("\t")
      next unless r[1].to_i == val_cls
      target = ref_project.dataset(r[0])
      # In AAI-based traverses, ANI is only attempted for close relatives;
      # in ANI-based traverses it's always attempted
      aai_val = v[1] == :aai ? aai(target) : 100.0
      ani(target) unless aai_val.nil? || aai_val < 90.0
    end
  end
  # Finalize
  [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
  build_medoids_tree(v[1])
  transfer_taxonomy(tax_test)
end
|
98
|
+
|
99
|
+
# Launch analysis for taxonomy jobs. Only runs when the project defines a
# +:ref_project+ in its metadata.
def go_taxonomy!
  # The traverse is the same as for queries, just with a different
  # ref_project
  go_query! if project.metadata[:ref_project]
end
|
104
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
module MiGA::DistanceRunner::Temporal
|
5
|
+
|
6
|
+
# Copy input files to the (local) temporal folder. For each of the results
# +essential_genes+, +cds+, and +assembly+ of the dataset, the corresponding
# file (if the result and the file are registered) is copied as a +.fa+
# temporal file.
def create_temporals
  rf = {essential_genes: :ess_genes, cds: :proteins, assembly: :largecontigs}
  rf.each do |res, file|
    r = dataset.result(res)
    next if r.nil?
    f = r.file_path(file)
    next if f.nil?
    FileUtils.cp(f, tmp_file("#{file}.fa"))
  end
end
|
15
|
+
|
16
|
+
# Path to the temporal file of the current dataset with extension +ext+,
# located in the temporal folder.
def tmp_file(ext)
  base_name = [dataset.name, ext].join(".")
  File.expand_path(base_name, tmp)
end
|
20
|
+
|
21
|
+
# Registers one more stored value for +metric+ and, once the number of
# values stored without copying reaches the period set in
# +@opts[:distances_checkpoint]+, copies the temporal database back to the
# MiGA Project.
def checkpoint(metric)
  @db_counts[metric] = @db_counts[metric] + 1
  return unless db_counts[metric] >= @opts[:distances_checkpoint]
  checkpoint!(metric)
end
|
28
|
+
|
29
|
+
# Copies the temporal database of +metric+ back to the MiGA Project and
# resets the counter of pending values.
def checkpoint!(metric)
  # hAAI values live in the +aai+ table
  table = metric == :haai ? :aai : metric
  # Query the temporal database before copying it (presumably a sanity
  # check, so an unreadable copy raises before reaching the project)
  SQLite3::Database.new(tmp_dbs[metric]) { |conn|
    conn.execute("select count(*) from #{table}")
  }
  FileUtils.cp(tmp_dbs[metric], dbs[metric])
  @db_counts[metric] = 0
end
|
37
|
+
end
|
data/utils/distances.rb
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
# Recruitment plots
|
2
|
+
|
3
|
+
## Aims
|
4
|
+
|
5
|
+
This document aims to cover the technical aspects of the recruitment plot functions in the
|
6
|
+
`enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
|
7
|
+
|
8
|
+
## Caveats
|
9
|
+
|
10
|
+
This is a __*working document*__, describing unstable and/or experimental code. The material
|
11
|
+
here is subject to change without warning; pay attention to the modification date and (if
|
12
|
+
in doubt) the commit history. The definitions and default parameters of the functions described
|
13
|
+
here may change in the near future as a result of further experimentation or more stable
|
14
|
+
implementations.
|
15
|
+
|
16
|
+
The current document was generated and tested with the `enveomics.R` package version 1.3. To
|
17
|
+
check your current version in R, use `packageVersion('enveomics.R')`.
|
18
|
+
|
19
|
+
> **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
|
20
|
+
> Carefully evaluate all your results.
|
21
|
+
|
22
|
+
---
|
23
|
+
|
24
|
+
## Package: `enveomics.R`
|
25
|
+
|
26
|
+
The functionalities described here are provided by the `enveomics.R` package. Some features
|
27
|
+
described here are updated more frequently than the official
|
28
|
+
[CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
|
29
|
+
updates (package HEAD), download (or update), and install this git repository.
|
30
|
+
|
31
|
+
### Quick installation guide
|
32
|
+
|
33
|
+
:globe_with_meridians: To install the latest stable version available in CRAN, use in R:
|
34
|
+
|
35
|
+
```R
|
36
|
+
install.packages(c('enveomics.R','optparse'))
|
37
|
+
```
|
38
|
+
|
39
|
+
:octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
|
40
|
+
|
41
|
+
```R
|
42
|
+
install.packages('devtools')
|
43
|
+
library('devtools')
|
44
|
+
install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
|
45
|
+
```
|
46
|
+
|
47
|
+
---
|
48
|
+
|
49
|
+
## Recruitment plots: `enve.recplot2`
|
50
|
+
|
51
|
+
The first step in this analysis is the mapping of reads to the genome, processed with
|
52
|
+
[BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
|
53
|
+
We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
|
54
|
+
prefix of the processed files.
|
55
|
+
|
56
|
+
Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
|
57
|
+
For this, you'll have two options.
|
58
|
+
|
59
|
+
### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
|
60
|
+
|
61
|
+
The stand-alone script
|
62
|
+
[BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
|
63
|
+
is the easiest option to run, and should be the preferred method if you're automating
|
64
|
+
this analysis to process several mappings, but it doesn't offer access to advanced options.
|
65
|
+
|
66
|
+
You can run it like this using two CPUs:
|
67
|
+
|
68
|
+
```bash
|
69
|
+
BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
|
70
|
+
```
|
71
|
+
|
72
|
+
> **NOTE 1**: It's NOT recommended to map reads against genes; the recommended strategy is to
|
73
|
+
> map against contigs. However, if you did map reads against genes, you may want to use the
|
74
|
+
> `--pos-breaks 0` option to use each gene as a recruitment window.
|
75
|
+
>
|
76
|
+
> **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
|
77
|
+
> `--peaks-col darkred` option.
|
78
|
+
|
79
|
+
Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
|
80
|
+
object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
|
81
|
+
|
82
|
+
### Option 2: Using the `enve.recplot2` R function
|
83
|
+
|
84
|
+
If you require access to advanced options, or for some other reason prefer to calculate the
|
85
|
+
recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
|
86
|
+
an example session in R:
|
87
|
+
|
88
|
+
```R
|
89
|
+
# Load the package
|
90
|
+
library(enveomics.R)
|
91
|
+
# Open the PDF
|
92
|
+
pdf('my-recplot.pdf')
|
93
|
+
# Build and plot the object using two threads and no peak detection
|
94
|
+
# (to turn on peak detection, simply remove `peaks.col=NA`)
|
95
|
+
rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
|
96
|
+
# Close the PDF
|
97
|
+
dev.off()
|
98
|
+
# Save the object
|
99
|
+
save(rp, file='my-recplot.rdata')
|
100
|
+
```
|
101
|
+
|
102
|
+
> **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
|
103
|
+
> before closing the R session.
|
104
|
+
|
105
|
+
Naturally, you may want to see what other (advanced) options you have. You can access the
|
106
|
+
documentation of the function in R using `?enve.recplot2`.
|
107
|
+
|
108
|
+
---
|
109
|
+
|
110
|
+
## Summary statistics
|
111
|
+
|
112
|
+
Here we explore some frequently used summary statistics from recruitment plots. First, load the
|
113
|
+
package and the `enve.RecPlot2` object you saved previously, in R:
|
114
|
+
|
115
|
+
```R
|
116
|
+
library(enveomics.R)
|
117
|
+
load('my-recplot.rdata')
|
118
|
+
```
|
119
|
+
|
120
|
+
### Average and median sequencing depth
|
121
|
+
|
122
|
+
```R
|
123
|
+
mean(enve.recplot2.seqdepth(rp)) # <- Average
|
124
|
+
median(enve.recplot2.seqdepth(rp)) # <- Median
|
125
|
+
```
|
126
|
+
|
127
|
+
### Average and median sequencing depth excluding zero-coverage windows
|
128
|
+
|
129
|
+
```R
|
130
|
+
seqdepth <- enve.recplot2.seqdepth(rp)
|
131
|
+
mean(seqdepth[seqdepth>0]) # <- Average
|
132
|
+
median(seqdepth[seqdepth>0]) # <- Median
|
133
|
+
```
|
134
|
+
|
135
|
+
### Average Nucleotide Identity from reads (ANIr)
|
136
|
+
|
137
|
+
```R
|
138
|
+
enve.recplot2.ANIr(rp) # <- Complete recruitment plot
|
139
|
+
enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
|
140
|
+
enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
|
141
|
+
enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
|
142
|
+
```
|
143
|
+
|
144
|
+
### Coordinates of each sequence window with their respective sequencing depth
|
145
|
+
|
146
|
+
```R
|
147
|
+
d <- enve.recplot2.coordinates(rp)
|
148
|
+
d$seqdepth <- enve.recplot2.seqdepth(rp)
|
149
|
+
d
|
150
|
+
```
|
151
|
+
|
152
|
+
### Sequencing breadth (upper boundary)
|
153
|
+
|
154
|
+
This estimate depends on the window size. The smaller the window size, the better the
|
155
|
+
estimate. When the window size is 1bp, the estimate is exact, otherwise it's consistently
|
156
|
+
biased (overestimate).
|
157
|
+
|
158
|
+
```R
|
159
|
+
mean(enve.recplot2.seqdepth(rp) > 0)
|
160
|
+
```
|
161
|
+
|
162
|
+
---
|
163
|
+
|
164
|
+
## Peak-finder: `enve.recplot2.findPeaks`
|
165
|
+
|
166
|
+
In this step we will try to identify one or multiple population peaks corresponding to different
|
167
|
+
sub-populations and/or composites of sub-populations.
|
168
|
+
|
169
|
+
> **NOTE** This step can be performed together with the step above, but we separate it here for
|
170
|
+
> two reasons: **(1)** This step is much more unstable but less computationally demanding than the
|
171
|
+
> step before, so it makes sense to re-run only this part with different parameters and/or
|
172
|
+
> package updates; and **(2)** We want to save the R objects independently, so the following steps
|
173
|
+
> are more clear.
|
174
|
+
|
175
|
+
In R:
|
176
|
+
|
177
|
+
```R
|
178
|
+
# Load the package
|
179
|
+
library(enveomics.R)
|
180
|
+
# Load the `enve.RecPlot2` object you saved previously
|
181
|
+
load('my-recplot.rdata')
|
182
|
+
# Find the peaks
|
183
|
+
peaks <- enve.recplot2.findPeaks(rp)
|
184
|
+
# Save the peaks R object (optional)
|
185
|
+
save(peaks, file='my-recplot-peaks.rdata')
|
186
|
+
# Plot the peaks in a PDF (optional)
|
187
|
+
pdf('my-recplot-peaks.pdf')
|
188
|
+
p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
|
189
|
+
dev.off()
|
190
|
+
```
|
191
|
+
|
192
|
+
The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
|
193
|
+
the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
|
194
|
+
of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
|
195
|
+
|
196
|
+
---
|
197
|
+
|
198
|
+
## Gene-content diversity: `enve.recplot2.extractWindows`
|
199
|
+
|
200
|
+
In R:
|
201
|
+
|
202
|
+
```R
|
203
|
+
# Load the package and the objects (unless you're still in the same session from the last step)
|
204
|
+
library(enveomics.R)
|
205
|
+
load('my-recplot.rdata')
|
206
|
+
load('my-recplot-peaks.rdata')
|
207
|
+
# Find the peak representing the core genome
|
208
|
+
cp <- enve.recplot2.corePeak(peaks)
|
209
|
+
#-----
|
210
|
+
# The following functions illustrate how to obtain different results. Please explore the resulting
|
211
|
+
# objects and the associated documentation
|
212
|
+
#-----
|
213
|
+
# Find the coordinates of windows significantly below the average sequencing depth
|
214
|
+
div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
|
215
|
+
# Add sequencing depth
|
216
|
+
div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
|
217
|
+
# Save the coordinates as a tab-delimited table
|
218
|
+
write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
|
219
|
+
# Find all the windows with sequencing depth zero
|
220
|
+
zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
|
221
|
+
```
|
222
|
+
|
223
|
+
---
|
224
|
+
|
225
|
+
## To do
|
226
|
+
|
227
|
+
- [x] Document structure
|
228
|
+
- [x] Package: `enveomics.R`
|
229
|
+
- [x] Recruitment plots: `enve.recplot2`
|
230
|
+
- [x] Summary statistics
|
231
|
+
- [x] Peak-finder: `enve.recplot2.findPeaks`
|
232
|
+
- [x] Gene-content diversity: `enve.recplot2.extractWindows`
|
233
|
+
- [ ] Compare identity profiles: `enve.recplot2.compareIdentities`
|