miga-base 0.3.1.7 → 0.3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/ncbi_get.rb +8 -0
- data/lib/miga/common.rb +9 -215
- data/lib/miga/common/base.rb +49 -0
- data/lib/miga/common/format.rb +135 -0
- data/lib/miga/common/path.rb +49 -0
- data/lib/miga/daemon.rb +3 -60
- data/lib/miga/daemon/base.rb +69 -0
- data/lib/miga/dataset.rb +3 -3
- data/lib/miga/dataset/result.rb +5 -5
- data/lib/miga/result.rb +5 -0
- data/lib/miga/version.rb +7 -5
- data/scripts/distances.bash +2 -19
- data/scripts/taxonomy.bash +2 -21
- data/test/common_test.rb +9 -0
- data/utils/distance/base.rb +6 -0
- data/utils/distance/commands.rb +82 -0
- data/utils/distance/database.rb +86 -0
- data/utils/distance/pipeline.rb +98 -0
- data/utils/distance/runner.rb +104 -0
- data/utils/distance/temporal.rb +37 -0
- data/utils/distances.rb +9 -0
- data/utils/enveomics/Docs/recplot2.md +233 -0
- data/utils/enveomics/Makefile +1 -1
- data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
- data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
- data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
- data/utils/enveomics/Manifest/categories.json +11 -1
- data/utils/enveomics/Manifest/examples.json +2 -2
- data/utils/enveomics/README.md +2 -0
- data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
- data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
- data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
- data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
- data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
- data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
- data/utils/enveomics/Scripts/SRA.download.bash +28 -21
- data/utils/enveomics/Scripts/Table.barplot.R +1 -0
- data/utils/enveomics/Scripts/aai.rb +4 -2
- data/utils/enveomics/build_enveomics_r.bash +5 -5
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
- data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
- data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
- data/utils/enveomics/enveomics.R/README.md +26 -17
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
- data/utils/requirements.txt +1 -1
- metadata +28 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -0,0 +1,86 @@
|
|
1
|
+
|
2
|
+
require 'sqlite3'
|
3
|
+
|
4
|
+
module MiGA::DistanceRunner::Database
|
5
|
+
# Check for corrupt files and create empty databases.
#
# For each of the +:haai+, +:aai+, and +:ani+ metrics this resets the pending
# checkpoint counter, resolves the project database path (+ref_db+ when
# +for_ref+ is truthy, +query_db+ otherwise), removes the file if it cannot be
# queried for the expected table, creates the table in a new database when no
# non-empty file is left, and copies the database to the temporal folder.
def initialize_dbs!(for_ref)
  @dbs = {}
  @tmp_dbs = {}
  @db_counts = {}
  # Keys are metrics, values are table names (hAAI is stored in table +aai+)
  {haai: :aai, aai: :aai, ani: :ani}.each do |m, t|
    @db_counts[m] = 0
    @dbs[m] = for_ref ? ref_db(m) : query_db(m)
    # Remove if corrupt
    if File.size?(dbs[m])
      begin
        SQLite3::Database.new(dbs[m]) do |conn|
          conn.execute "select count(*) from #{t};"
        end
      rescue SQLite3::SQLException, SQLite3::NotADatabaseException,
             SQLite3::CorruptException
        # SQLException covers a readable database lacking the table; the
        # other two cover files that are not (or no longer) valid databases.
        # Busy/locked errors are deliberately not rescued, so a database that
        # is merely in use is never deleted.
        FileUtils.rm dbs[m]
      end
    end
    # Initialize if it doesn't exist
    SQLite3::Database.new(dbs[m]) do |conn|
      conn.execute "create table if not exists #{t}(" +
        "seq1 varchar(256), seq2 varchar(256), " +
        "#{t} float, sd float, n int, omega int" +
        ")"
    end unless File.size? dbs[m]
    # Copy over to (local) temporals
    @tmp_dbs[m] = tmp_file("#{m}.db")
    FileUtils.cp(dbs[m], tmp_dbs[m])
  end
end
|
35
|
+
|
36
|
+
# Path to the database +metric+ for +dataset_name+ in +project+
# (assumes that +dataset_name+ is a reference dataset). The name of the
# current dataset is used when +dataset_name+ is not given. Supported metrics
# are +:haai+, +:aai+, and +:ani+, each stored in its own numbered subfolder
# of +home+.
def ref_db(metric, dataset_name=nil)
  name = dataset_name || dataset.name
  folder = {haai: "01.haai", aai: "02.aai", ani: "03.ani"}[metric]
  # An unsupported metric leaves the relative path as nil
  relative = folder.nil? ? nil : "#{folder}/#{name}.db"
  File.expand_path(relative, home)
end
|
50
|
+
|
51
|
+
# Path to the database +metric+ for +dataset+ (assumes that +dataset+ is a
# query dataset). The file sits directly in +home+ and is named after the
# dataset and the metric.
def query_db(metric)
  file_name = [dataset.name, metric, "db"].join(".")
  File.expand_path(file_name, home)
end
|
56
|
+
|
57
|
+
# Get the stored +metric+ value against +target+, or nil if there is none.
# Values that are nil or zero are treated as not stored.
def stored_value(target, metric)
  # Look for self.dataset -> target in the temporal database (previous run)
  forward = value_from_db(dataset.name, target.name, tmp_dbs[metric], metric)
  return forward if !forward.nil? && !forward.zero?
  # The reverse direction (self.dataset <- target, another thread) is only
  # consulted for reference datasets compared within their own project
  return nil unless dataset.is_ref? && project.path == ref_project.path
  reverse = value_from_db(
    target.name, dataset.name, ref_db(metric, target.name), metric)
  return nil if reverse.nil? || reverse.zero?
  reverse
end
|
69
|
+
|
70
|
+
# Get the value of +metric+ in the +db+ database between +n1+ and +n2+.
#
# Returns the first column of the first matching row, or nil when +db+ is
# missing or empty or when no row matches. hAAI values are read from table
# +aai+ (column +aai+), the same mapping used when the databases are created
# and checkpointed; every other metric uses its own name as table and column.
def value_from_db(n1, n2, db, metric)
  return nil unless File.size? db
  table = (metric == :haai) ? :aai : metric
  y = nil
  SQLite3::Database.new(db) do |conn|
    row = conn.execute(
      "select #{table} from #{table} where seq1=? and seq2=?", [n1, n2]).first
    y = row.first unless row.nil?
  end
  y
end
|
79
|
+
|
80
|
+
# Iterates for each entry in +db+, passing every row of the table named
# +metric+ to the block
def foreach_in_db(db, metric, &blk)
  SQLite3::Database.new(db) do |conn|
    rows = conn.execute("select * from #{metric}")
    rows.each { |row| blk.call(row) }
  end
end
|
86
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
|
2
|
+
# High-end pipelines for DistanceRunner
|
3
|
+
module MiGA::DistanceRunner::Pipeline
|
4
|
+
|
5
|
+
# Recursively classify the dataset, returning an Array with two entries:
# classification and cluster number.
#
# +clades+ is the base directory and +classif+ the classification path
# relative to it. At each level, the medoids listed (one per line) in the
# +miga-project.medoids+ file are looked up in +ref_project+ and passed to
# the method named +metric+. The recursion then continues into the
# +miga-project.sc-N+ folder, where N is the 1-based line number of the medoid
# with the highest value. One tab-delimited line (cluster number, medoid,
# value, new classification) is written to +result_fh+ per level. The
# recursion stops at the first level without a non-empty medoids file,
# returning the classification reached and +val_cls+ (the cluster number
# selected at the previous level).
def classify(clades, classif, metric, result_fh, val_cls=nil)
  dir = File.expand_path(classif, clades)
  med = File.expand_path("miga-project.medoids", dir)
  return [classif, val_cls] unless File.size? med
  max_val = 0
  val_med = ""
  val_cls = nil
  i_n = 0
  File.open(med, "r") do |med_fh|
    med_fh.each_line do |med_ln|
      i_n += 1
      med_ln.chomp!
      val = send(metric, ref_project.dataset(med_ln))
      if !val.nil? && val >= max_val
        max_val = val
        val_med = med_ln
        val_cls = i_n
        puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
      end
    end
  end
  # Keep the classification relative to +clades+. File.expand_path would
  # anchor it to the current working directory instead, and the next level
  # would then be searched outside +clades+.
  classif = File.join(classif, "miga-project.sc-#{val_cls}")
  result_fh.puts [val_cls, val_med, max_val, classif].join("\t")
  classify(clades, classif, metric, result_fh, val_cls)
end
|
32
|
+
|
33
|
+
# Builds a tree with all visited medoids from any classification level.
#
# Writes a temporary tab-delimited matrix next to the query results with
# every comparison stored in the query database for +metric+, plus every row
# of the reference project's distance matrix involving one of the visited
# datasets. It then runs +utils/ref-tree.R+ on it and removes the matrix.
# Nothing is done if the query database is missing or empty, and no tree is
# built if the reference project lacks the +<metric>_distances+ result.
def build_medoids_tree(metric)
  db = query_db(metric)
  return unless File.size? db
  out_base = File.expand_path(dataset.name, home)
  ds_matrix = "#{out_base}.txt"
  ref_r = nil
  # The block form closes the matrix on every exit path
  File.open(ds_matrix, "w") do |ds_matrix_fh|
    ds_matrix_fh.puts %w[a b value].join("\t")
    # Find all values in the database
    visited = {}
    foreach_in_db(db, metric) do |r|
      # Columns are seq1, seq2, value, ...: seq2 is the visited dataset
      visited[r[1]] = true
      ds_matrix_fh.puts r[0,3].join("\t")
    end
    # Find all values among visited datasets in ref_project
    ref_r = ref_project.result("#{metric}_distances")
    next if ref_r.nil?
    Zlib::GzipReader.open(ref_r.file_path(:matrix)) do |fh|
      fh.each_line do |ln|
        r = ln.chomp.split("\t")
        next unless visited.key?(r[1]) || visited.key?(r[2])
        ds_matrix_fh.puts r[1,3].join("\t")
      end
    end
  end
  if ref_r.nil?
    # Without a reference matrix no tree is built; drop the temporary file
    File.unlink ds_matrix
    return
  end
  ref_tree = File.expand_path("utils/ref-tree.R", MiGA::MiGA.root_path)
  `"#{ref_tree}" "#{ds_matrix}" "#{out_base}" "#{dataset.name}"`
  File.unlink ds_matrix
end
|
61
|
+
|
62
|
+
# Tests taxonomy.
#
# Marks the dataset as processed (writes a +.done+ file and registers the
# +:taxonomy+ or +:distances+ result), looks up its closest relative, and
# builds one row per entry returned by MiGA::TaxDist.aai_pvalues with the
# rank, the relative's taxon at that rank ("?" if unknown), the p-value, and
# one asterisk per significance threshold above the p-value. The rows are
# saved as a table in +<dataset>.intax.txt+ and returned. Returns nil if no
# closest relative is available.
def tax_test
  # The comparison is against an external project when both projects differ
  external_ref = (project != ref_project)
  if external_ref
    res_dir = File.expand_path("data/09.distances/05.taxonomy", project.path)
  else
    res_dir = home
  end
  Dir.mkdir res_dir unless Dir.exist? res_dir
  done_file = File.expand_path("#{dataset.name}.done", res_dir)
  File.open(done_file, "w") { |fh| fh.puts Time.now.to_s }
  dataset.add_result(external_ref ? :taxonomy : :distances, true)
  # Get taxonomy of closest relative
  cr = dataset.closest_relatives(1, external_ref)
  return if cr.nil? || cr.empty?
  tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
  # Run the test for each rank
  thresholds = [0.5, 0.1, 0.05, 0.01]
  r = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax).map do |rank, pvalue|
    stars = "*" * thresholds.count { |limit| pvalue < limit }
    [MiGA::Taxonomy.LONG_RANKS[rank], (tax[rank] || "?"), pvalue, stars]
  end
  # Save test
  intax_file = File.expand_path("#{dataset.name}.intax.txt", home)
  File.open(intax_file, "w") do |fh|
    fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
    fh.puts ""
    fh.puts "Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01."
  end
  r
end
|
90
|
+
|
91
|
+
# Transfer the taxonomy to the current dataset.
#
# +tax+ is an Array of rows (rank, taxon, p-value, ...) as returned by
# +tax_test+, or nil when no test could be run, in which case nothing is
# transferred. Only ranks with a known taxon (not "?") and a p-value at or
# below the project's +:tax_pvalue+ metadata (0.05 by default) are kept. The
# resulting taxonomy is stored in the dataset metadata and the dataset saved.
def transfer_taxonomy(tax)
  # tax_test returns nil when the dataset has no closest relatives
  return if tax.nil?
  pval = (project.metadata[:tax_pvalue] || 0.05).to_f
  tax_a = tax.
    select { |i| i[1] != "?" && i[2] <= pval }.
    map { |i| i[0,2].join(":") }
  dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
  dataset.save
end
|
98
|
+
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
|
2
|
+
require_relative 'base.rb'
|
3
|
+
require_relative 'temporal.rb'
|
4
|
+
require_relative 'database.rb'
|
5
|
+
require_relative 'commands.rb'
|
6
|
+
require_relative 'pipeline.rb'
|
7
|
+
|
8
|
+
|
9
|
+
class MiGA::DistanceRunner
|
10
|
+
|
11
|
+
include MiGA::DistanceRunner::Temporal
|
12
|
+
include MiGA::DistanceRunner::Database
|
13
|
+
include MiGA::DistanceRunner::Commands
|
14
|
+
include MiGA::DistanceRunner::Pipeline
|
15
|
+
|
16
|
+
attr_reader :project, :ref_project, :dataset, :opts, :home
|
17
|
+
attr_reader :tmp, :tmp_dbs, :dbs, :db_counts
|
18
|
+
|
19
|
+
# Prepares a runner for the dataset +dataset_name+ in the project located at
# +project_path+, raising an error if the project cannot be loaded.
#
# +opts_hash+ is stored as the runner options and completed in place with
# defaults: +:aai_save_rbm+ (from the MIGA_AAI_SAVE_RBM environment variable,
# or depending on whether the project is a clade), +:thr+ (from the CORES
# environment variable, or 2), the +:haai_p+, +:aai_p+, +:ani_p+, and
# +:distances_checkpoint+ values from the reference project metadata, and
# finally 10 as the fallback for +:distances_checkpoint+.
def initialize(project_path, dataset_name, opts_hash={})
  @opts = opts_hash
  @project = MiGA::Project.load(project_path)
  raise "No project at #{project_path}" unless @project
  @dataset = project.dataset(dataset_name)
  @home = File.expand_path("data/09.distances", project.path)
  # Default opts
  unless @opts[:aai_save_rbm]
    @opts[:aai_save_rbm] = ENV.fetch("MIGA_AAI_SAVE_RBM") do
      project.is_clade? ? "save-rbm" : "no-save-rbm"
    end
  end
  @opts[:thr] = ENV.fetch("CORES"){ 2 }.to_i unless @opts[:thr]
  # An external reference project is only used for taxonomy runs; in any
  # other case the project is its own reference
  if opts[:run_taxonomy] && project.metadata[:ref_project]
    @ref_project = MiGA::Project.load(project.metadata[:ref_project])
  end
  @ref_project = project unless @ref_project
  [:haai_p, :aai_p, :ani_p, :distances_checkpoint].each do |key|
    @opts[key] = ref_project.metadata[key] unless @opts[key]
  end
  @opts[:distances_checkpoint] = (@opts[:distances_checkpoint] || 10).to_i
end
|
40
|
+
|
41
|
+
# Launch the appropriate analysis: taxonomy if +opts[:run_taxonomy]+ is set,
# otherwise the reference or query analysis depending on the dataset. Multi
# datasets are skipped. The analysis runs with +tmp+ set to a temporary
# directory that only exists for the duration of the call.
def go!
  return if dataset.is_multi?
  Dir.mktmpdir do |tmp_dir|
    @tmp = tmp_dir
    create_temporals
    if opts[:run_taxonomy]
      go_taxonomy!
    elsif dataset.is_ref?
      go_ref!
    else
      go_query!
    end
  end
end
|
50
|
+
|
51
|
+
# Launch analysis for reference datasets: compares the dataset against every
# reference, non-multi dataset of the reference project with essential genes,
# running ANI only when AAI is available and at least 90
def go_ref!
  # Initialize databases
  initialize_dbs! true
  # first-come-first-serve traverse
  ref_project.each_dataset do |ds|
    next unless ds.is_ref?
    next if ds.is_multi?
    next if ds.result(:essential_genes).nil?
    puts "[ #{Time.now} ] #{ds.name}"
    aai_value = aai(ds)
    next if aai_value.nil? || aai_value < 90.0
    ani(ds)
  end
  # Finalize: flush every database with values pending to be copied
  [:haai, :aai, :ani].each do |m|
    checkpoint!(m) if db_counts[m] > 0
  end
end
|
65
|
+
|
66
|
+
# Launch analysis for query datasets.
#
# Does nothing unless the reference project already has the required result
# (+:subclades+ for clade projects, +:clade_finding+ otherwise). It first
# classifies the dataset by traversing the medoids with ANI (clade projects)
# or AAI, saving the traverse in +<dataset>.<metric>-medoids.tsv+. It then
# compares the dataset against every member of the lowest cluster reached
# (if any), and finally flushes the databases, builds the medoids tree, and
# transfers the taxonomy of the closest relative.
def go_query!
  # Check if project is ready
  v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
  res = ref_project.result(v[0])
  return if res.nil?
  # Initialize the databases
  initialize_dbs! false
  # Calculate the classification-informed AAI/ANI traverse
  results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
  # The block form closes the results file even if the traverse fails
  classif, val_cls = File.open(results, "w") do |fh|
    classify(res.dir, ".", v[1], fh)
  end
  # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
  par_dir = File.dirname(File.expand_path(classif, res.dir))
  par = File.expand_path("miga-project.classif", par_dir)
  if File.size? par
    File.foreach(par) do |ln|
      r = ln.chomp.split("\t")
      next unless r[1].to_i == val_cls
      target = ref_project.dataset(r[0])
      # v[1] is the metric selected above: AAI gates ANI only in AAI mode
      aai_value = (v[1] == :aai) ? aai(target) : 100.0
      # A nil AAI is skipped, like in the reference traverse
      ani(target) unless aai_value.nil? || aai_value < 90.0
    end
  end
  # Finalize
  [:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
  build_medoids_tree(v[1])
  transfer_taxonomy(tax_test)
end
|
98
|
+
|
99
|
+
# Launch analysis for taxonomy jobs. Requires the project metadata to define
# +:ref_project+; the analysis itself is the query one, only against a
# different reference project
def go_taxonomy!
  go_query! if project.metadata[:ref_project]
end
|
104
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
module MiGA::DistanceRunner::Temporal
|
5
|
+
|
6
|
+
# Copy input files to the (local) temporal folder. Each available file is
# copied as +<file key>.fa+ under the temporal name of the dataset; results
# or files that are not available are skipped
def create_temporals
  # Result key => file key within that result
  sources = {essential_genes: :ess_genes, cds: :proteins, assembly: :largecontigs}
  sources.each do |res, file|
    result = dataset.result(res)
    next if result.nil?
    path = result.file_path(file)
    next if path.nil?
    FileUtils.cp(path, tmp_file("#{file}.fa"))
  end
end
|
15
|
+
|
16
|
+
# Temporal file with extension +ext+: the path in the +tmp+ folder named
# after the dataset followed by a dot and +ext+
def tmp_file(ext)
  File.expand_path([dataset.name, ext].join("."), tmp)
end
|
20
|
+
|
21
|
+
# Registers one more stored value for +metric+ and copies the temporal
# database back to the MiGA Project once the number of values stored without
# copying reaches +@opts[:distances_checkpoint]+
def checkpoint(metric)
  @db_counts[metric] += 1
  return if db_counts[metric] < @opts[:distances_checkpoint]
  checkpoint! metric
end
|
28
|
+
|
29
|
+
# Copies temporal databases back to the MiGA Project and resets the count of
# values pending to be copied for +metric+
def checkpoint!(metric)
  # hAAI values live in the +aai+ table
  table = (metric == :haai) ? :aai : metric
  # Query the temporal database before copying it over the project database
  SQLite3::Database.new(tmp_dbs[metric]) do |conn|
    conn.execute("select count(*) from #{table}")
  end
  FileUtils.cp(tmp_dbs[metric], dbs[metric])
  @db_counts[metric] = 0
end
|
37
|
+
end
|
data/utils/distances.rb
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
# Recruitment plots
|
2
|
+
|
3
|
+
## Aims
|
4
|
+
|
5
|
+
This document aims to cover the technical aspects of the recruitment plot functions in the
|
6
|
+
`enveomics.R` package, focusing on the peak finder and gene-content diversity analyses.
|
7
|
+
|
8
|
+
## Caveats
|
9
|
+
|
10
|
+
This is a __*working document*__, describing unstable and/or experimental code. The material
|
11
|
+
here is subject to change without warning; pay attention to the modification date and (if
|
12
|
+
in doubt) the commit history. The definitions and default parameters of the functions described
|
13
|
+
here may change in the near future as a result of further experimentation or more stable
|
14
|
+
implementations.
|
15
|
+
|
16
|
+
The current document was generated and tested with the `enveomics.R` package version 1.3. To
|
17
|
+
check your current version in R, use `packageVersion('enveomics.R')`.
|
18
|
+
|
19
|
+
> **IMPORTANT**: Some of the functions described here may return unexpected results with your data.
|
20
|
+
> Carefully evaluate all your results.
|
21
|
+
|
22
|
+
---
|
23
|
+
|
24
|
+
## Package: `enveomics.R`
|
25
|
+
|
26
|
+
The functionalities described here are provided by the `enveomics.R` package. Some features
|
27
|
+
described here are updated more frequently than the official
|
28
|
+
[CRAN releases](https://CRAN.R-project.org/package=enveomics.R). In order to have the latest
|
29
|
+
updates (package HEAD), download (or update), and install this git repository.
|
30
|
+
|
31
|
+
### Quick installation guide
|
32
|
+
|
33
|
+
:globe_with_meridians: To install the latest stable version available in CRAN, use in R:
|
34
|
+
|
35
|
+
```R
|
36
|
+
install.packages(c('enveomics.R','optparse'))
|
37
|
+
```
|
38
|
+
|
39
|
+
:octocat: To install the latest HEAD version (potentially unstable) available in GitHub, use in R:
|
40
|
+
|
41
|
+
```R
|
42
|
+
install.packages('devtools')
|
43
|
+
library('devtools')
|
44
|
+
install_github('lmrodriguezr/enveomics', subdir='enveomics.R')
|
45
|
+
```
|
46
|
+
|
47
|
+
---
|
48
|
+
|
49
|
+
## Recruitment plots: `enve.recplot2`
|
50
|
+
|
51
|
+
The first step in this analysis is the mapping of reads to the genome, processed with
|
52
|
+
[BlastTab.catsbj.pl](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.catsbj.pl).
|
53
|
+
We'll assume the mapping is saved in the file `my-mapping.tab` and this is also the
|
54
|
+
prefix of the processed files.
|
55
|
+
|
56
|
+
Once you have these input files (`.rec` and `.lim`), you can build the recruitment plot.
|
57
|
+
For this, you'll have two options.
|
58
|
+
|
59
|
+
### Option 1: Using the `BlastTab.recplot2.R` stand-alone script
|
60
|
+
|
61
|
+
The stand-alone script
|
62
|
+
[BlastTab.recplot2.R](http://enve-omics.ce.gatech.edu/enveomics/docs?t=BlastTab.recplot2.R)
|
63
|
+
is the easiest option to run, and should be the preferred method if you're automating
|
64
|
+
this analysis to process several mappings, but it doesn't offer access to advanced options.
|
65
|
+
|
66
|
+
You can run it like this using two CPUs:
|
67
|
+
|
68
|
+
```bash
|
69
|
+
BlastTab.recplot2.R --prefix my-mapping.tab --threads 2 my-recplot.rdata my-recplot.pdf
|
70
|
+
```
|
71
|
+
|
72
|
+
> **NOTE 1**: It's NOT recommended to map reads against genes, the recommended strategy is to
|
73
|
+
> map against contigs. However, if you did map reads against genes, you may want to use the
|
74
|
+
> `--pos-breaks 0` option to use each gene as a recruitment window.
|
75
|
+
>
|
76
|
+
> **NOTE 2**: If you want to plot the population peaks at this step, simply pass the
|
77
|
+
> `--peaks-col darkred` option.
|
78
|
+
|
79
|
+
Now you should have two output files: `my-recplot.rdata`, containing your `enve.RecPlot2` R
|
80
|
+
object, and `my-recplot.pdf` with the graphical output of the recruitment plot.
|
81
|
+
|
82
|
+
### Option 2: Using the `enve.recplot2` R function
|
83
|
+
|
84
|
+
If you require access to advanced options, or for some other reason prefer to calculate the
|
85
|
+
recruitment plot interactively, you can directly use the `enve.recplot2` R function. This is
|
86
|
+
an example session in R:
|
87
|
+
|
88
|
+
```R
|
89
|
+
# Load the package
|
90
|
+
library(enveomics.R)
|
91
|
+
# Open the PDF
|
92
|
+
pdf('my-recplot.pdf')
|
93
|
+
# Build and plot the object using two threads and no peak detection
|
94
|
+
# (to turn on peak detection, simply remove `peaks.col=NA`)
|
95
|
+
rp <- enve.recplot2('my-mapping.tab', threads=2, peaks.col=NA)
|
96
|
+
# Close the PDF
|
97
|
+
dev.off()
|
98
|
+
# Save the object
|
99
|
+
save(rp, file='my-recplot.rdata')
|
100
|
+
```
|
101
|
+
|
102
|
+
> **IMPORTANT**: Remember to save the `enve.RecPlot2` R object (that's the last line above)
|
103
|
+
> before closing the R session.
|
104
|
+
|
105
|
+
Naturally, you may want to see what other (advanced) options you have. You can access the
|
106
|
+
documentation of the function in R using `?enve.recplot2`.
|
107
|
+
|
108
|
+
---
|
109
|
+
|
110
|
+
## Summary statistics
|
111
|
+
|
112
|
+
Here we explore some frequently used summary statistics from recruitment plots. First, load the
|
113
|
+
package and the `enve.RecPlot2` object you saved previously, in R:
|
114
|
+
|
115
|
+
```R
|
116
|
+
library(enveomics.R)
|
117
|
+
load('my-recplot.rdata')
|
118
|
+
```
|
119
|
+
|
120
|
+
### Average and median sequencing depth
|
121
|
+
|
122
|
+
```R
|
123
|
+
mean(enve.recplot2.seqdepth(rp)) # <- Average
|
124
|
+
median(enve.recplot2.seqdepth(rp)) # <- Median
|
125
|
+
```
|
126
|
+
|
127
|
+
### Average and median sequencing depth excluding zero-coverage windows
|
128
|
+
|
129
|
+
```R
|
130
|
+
seqdepth <- enve.recplot2.seqdepth(rp)
|
131
|
+
mean(seqdepth[seqdepth>0]) # <- Average
|
132
|
+
median(seqdepth[seqdepth>0]) # <- Median
|
133
|
+
```
|
134
|
+
|
135
|
+
### Average Nucleotide Identity from reads (ANIr)
|
136
|
+
|
137
|
+
```R
|
138
|
+
enve.recplot2.ANIr(rp) # <- Complete recruitment plot
|
139
|
+
enve.recplot2.ANIr(rp, c(90,100)) # <- All reads above 90% (recommended for intra-population)
|
140
|
+
enve.recplot2.ANIr(rp, c(95,100)) # <- Reads above 95%
|
141
|
+
enve.recplot2.ANIr(rp, c( 0, 90)) # <- Between populations (other species)
|
142
|
+
```
|
143
|
+
|
144
|
+
### Coordinates of each sequence window with their respective sequencing depth
|
145
|
+
|
146
|
+
```R
|
147
|
+
d <- enve.recplot2.coordinates(rp)
|
148
|
+
d$seqdepth <- enve.recplot2.seqdepth(rp)
|
149
|
+
d
|
150
|
+
```
|
151
|
+
|
152
|
+
### Sequencing breadth (upper boundary)
|
153
|
+
|
154
|
+
This estimate depends on the window size. The smaller the window size, the better the
|
155
|
+
estimate. When the window size is 1bp, the estimate is exact, otherwise it's consistently
|
156
|
+
biased (overestimate).
|
157
|
+
|
158
|
+
```R
|
159
|
+
mean(enve.recplot2.seqdepth(rp) > 0)
|
160
|
+
```
|
161
|
+
|
162
|
+
---
|
163
|
+
|
164
|
+
## Peak-finder: `enve.recplot2.findPeaks`
|
165
|
+
|
166
|
+
In this step we will try to identify one or multiple population peaks corresponding to different
|
167
|
+
sub-populations and/or composites of sub-populations.
|
168
|
+
|
169
|
+
> **NOTE** This step can be performed together with the step above, but we separate it here for
|
170
|
+
> two reasons: **(1)** This step is much more unstable but less computationally demanding than the
|
171
|
+
> step before, so it makes sense to re-run only this part with different parameters and/or
|
172
|
+
> package updates; and **(2)** We want to save the R objects independently, so the following steps
|
173
|
+
> are more clear.
|
174
|
+
|
175
|
+
In R:
|
176
|
+
|
177
|
+
```R
|
178
|
+
# Load the package
|
179
|
+
library(enveomics.R)
|
180
|
+
# Load the `enve.RecPlot2` object you saved previously
|
181
|
+
load('my-recplot.rdata')
|
182
|
+
# Find the peaks
|
183
|
+
peaks <- enve.recplot2.findPeaks(rp)
|
184
|
+
# Save the peaks R object (optional)
|
185
|
+
save(peaks, file='my-recplot-peaks.rdata')
|
186
|
+
# Plot the peaks in a PDF (optional)
|
187
|
+
pdf('my-recplot-peaks.pdf')
|
188
|
+
p <- plot(rp, use.peaks=peaks, layout=4) # <- Remove `layout=4` for the full plot
|
189
|
+
dev.off()
|
190
|
+
```
|
191
|
+
|
192
|
+
The key function here is `enve.recplot2.findPeaks`. This function has several parameters, depending on
|
193
|
+
the method used. To see all supported methods, use `?enve.recplot2.findPeaks`. To see all the options
|
194
|
+
of the default method (`'emauto'`) use `?enve.recplot2.findPeaks.emauto`.
|
195
|
+
|
196
|
+
---
|
197
|
+
|
198
|
+
## Gene-content diversity: `enve.recplot2.extractWindows`
|
199
|
+
|
200
|
+
In R:
|
201
|
+
|
202
|
+
```R
|
203
|
+
# Load the package and the objects (unless you're still in the same session from the last step)
|
204
|
+
library(enveomics.R)
|
205
|
+
load('my-recplot.rdata')
|
206
|
+
load('my-recplot-peaks.rdata')
|
207
|
+
# Find the peak representing the core genome
|
208
|
+
cp <- enve.recplot2.corePeak(peaks)
|
209
|
+
#-----
|
210
|
+
# The following functions illustrate how to obtain different results. Please explore the resulting
|
211
|
+
# objects and the associated documentation
|
212
|
+
#-----
|
213
|
+
# Find the coordinates of windows significantly below the average sequencing depth
|
214
|
+
div <- enve.recplot2.extractWindows(rp, cp, seq.names=TRUE)
|
215
|
+
# Add sequencing depth
|
216
|
+
div$seqdepth <- enve.recplot2.seqdepth(rp, as.numeric(rownames(div)))
|
217
|
+
# Save the coordinates as a tab-delimited table
|
218
|
+
write.table(div, 'my-low-seqdepth.tsv', quote=FALSE, sep='\t', row.names=FALSE)
|
219
|
+
# Find all the windows with sequencing depth zero
|
220
|
+
zero <- enve.recplot2.coordinates(rp, enve.recplot2.seqdepth(rp)==0)
|
221
|
+
```
|
222
|
+
|
223
|
+
---
|
224
|
+
|
225
|
+
## To do
|
226
|
+
|
227
|
+
- [x] Document structure
|
228
|
+
- [x] Package: `enveomics.R`
|
229
|
+
- [x] Recruitment plots: `enve.recplot2`
|
230
|
+
- [x] Summary statistics
|
231
|
+
- [x] Peak-finder: `enve.recplot2.findPeaks`
|
232
|
+
- [x] Gene-content diversity: `enve.recplot2.extractWindows`
|
233
|
+
- [ ] Compare identity profiles: `enve.recplot2.compareIdentities`
|