miga-base 0.3.1.7 → 0.3.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/actions/ncbi_get.rb +8 -0
  3. data/lib/miga/common.rb +9 -215
  4. data/lib/miga/common/base.rb +49 -0
  5. data/lib/miga/common/format.rb +135 -0
  6. data/lib/miga/common/path.rb +49 -0
  7. data/lib/miga/daemon.rb +3 -60
  8. data/lib/miga/daemon/base.rb +69 -0
  9. data/lib/miga/dataset.rb +3 -3
  10. data/lib/miga/dataset/result.rb +5 -5
  11. data/lib/miga/result.rb +5 -0
  12. data/lib/miga/version.rb +7 -5
  13. data/scripts/distances.bash +2 -19
  14. data/scripts/taxonomy.bash +2 -21
  15. data/test/common_test.rb +9 -0
  16. data/utils/distance/base.rb +6 -0
  17. data/utils/distance/commands.rb +82 -0
  18. data/utils/distance/database.rb +86 -0
  19. data/utils/distance/pipeline.rb +98 -0
  20. data/utils/distance/runner.rb +104 -0
  21. data/utils/distance/temporal.rb +37 -0
  22. data/utils/distances.rb +9 -0
  23. data/utils/enveomics/Docs/recplot2.md +233 -0
  24. data/utils/enveomics/Makefile +1 -1
  25. data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
  26. data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
  27. data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
  28. data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
  29. data/utils/enveomics/Manifest/categories.json +11 -1
  30. data/utils/enveomics/Manifest/examples.json +2 -2
  31. data/utils/enveomics/README.md +2 -0
  32. data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
  33. data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
  34. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  35. data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
  36. data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
  37. data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
  38. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  39. data/utils/enveomics/Scripts/SRA.download.bash +28 -21
  40. data/utils/enveomics/Scripts/Table.barplot.R +1 -0
  41. data/utils/enveomics/Scripts/aai.rb +4 -2
  42. data/utils/enveomics/build_enveomics_r.bash +5 -5
  43. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  44. data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
  45. data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
  46. data/utils/enveomics/enveomics.R/README.md +26 -17
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
  50. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
  51. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
  52. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
  53. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
  54. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
  55. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
  56. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
  57. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
  58. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
  59. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
  60. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
  61. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
  64. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
  65. data/utils/requirements.txt +1 -1
  66. metadata +28 -4
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
  68. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -1,43 +1,49 @@
1
1
  #!/usr/bin/env perl
2
2
 
3
- # Interpose sequences in FastA format from two files into one output file. If more than two files are
4
- # provided, the script will interpose all the input files.
5
- # Please note that this script will check for the consistency of the names (assuming a pair of related reads
6
- # contains the same name varying only in a trailing slash (/) followed by a digit. If you want to turn this
7
- # feature off just set the $eval_T variable to zero. If you want to decrease the sampling period (to speed
8
- # the script up) or increase it (to make it more sensitive to errors) just change $eval_T accordingly.
9
- #
10
3
  # @author Luis M. Rodriguez-R
11
- # @version 1.0
12
- # @created Nov-27-2012
13
- # @update Mar-23-2015
14
4
  # @license artistic license 2.0
15
- #
16
- # Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
17
5
 
18
6
  use strict;
19
7
  use warnings;
20
8
  use Symbol;
21
9
 
22
10
  my $HELP = <<HELP
11
+
12
+ Description:
13
+ Interposes sequences in FastA format from two files into one output file.
14
+ If more than two files are provided, the script will interpose all the input
15
+ files.
16
+ Note that this script will check for the consistency of the names (assuming
17
+ a pair of related reads contains the same name varying only in a trailing
18
+ slash (/) followed by a digit. If you want to turn this feature off just
19
+ set the -T option to zero. If you want to decrease the sampling period (to
20
+ speed the script up) or increase it (to make it more sensitive to errors)
21
+ just change the -T option accordingly.
22
+
23
23
  Usage:
24
- $0 <output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]
24
+ $0 [-T <int> ]<output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]
25
25
 
26
26
  Where,
27
- output_fasta : Output file
28
- input_fasta_1 : First FastA file
29
- input_fasta_2 : Second FastA file
30
- ... : Any additional FastA files (or none)
27
+ -T <int> : Optional. Integer indicating the sampling period for
28
+ names evaluation (see Description above).
29
+ By default: 1000.
30
+ output_fasta : Output file
31
+ input_fasta_1 : First FastA file
32
+ input_fasta_2 : Second FastA file
33
+ ... : Any additional FastA files (or none)
31
34
 
32
35
  HELP
33
36
  ;
34
- my $eval_T = 1000; # Period (in number of entries) of evaluation for consistency of the names.
35
- # To turn off evaluation set to 0 (zero).
37
+ my $eval_T = 1000;
38
+ if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
39
+ $eval_T = $ARGV[1]+0;
40
+ shift @ARGV;
41
+ shift @ARGV;
42
+ }
36
43
  my $out = shift @ARGV;
37
44
  my @in = @ARGV;
38
45
  $/ = "\n>";
39
46
 
40
-
41
47
  die $HELP unless $out and $#in >= 1;
42
48
  open OUT, ">", $out or die "Unable to write on $out: $!\n";
43
49
  print "Output file: $out\n";
@@ -1,11 +1,7 @@
1
1
  #!/usr/bin/env perl
2
- #
2
+
3
3
  # @author Luis M. Rodriguez-R
4
- # @version 2.0
5
- # @update: Mar-23-2015
6
4
  # @license artistic license 2.0
7
- #
8
- # Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
9
5
 
10
6
  use strict;
11
7
  use warnings;
@@ -14,24 +10,27 @@ use Symbol;
14
10
  my $HELP = <<HELP
15
11
 
16
12
  Description:
17
- Interposes sequences in FastQ format from two files into one output file. If more than two files are
18
- provided, the script will interpose all the input files.
19
- Note that this script will check for the consistency of the names (assuming a pair of related reads
20
- contains the same name varying only in a trailing slash (/) followed by a digit. If you want to turn
21
- this feature off just set the -T option to zero. If you want to decrease the sampling period
22
- (to speed the script up) or increase it (to make it more sensitive to errors) just change -T option
23
- accordingly.
24
-
13
+ Interposes sequences in FastQ format from two files into one output file.
14
+ If more than two files are provided, the script will interpose all the input
15
+ files.
16
+ Note that this script will check for the consistency of the names (assuming
17
+ a pair of related reads contains the same name varying only in a trailing
18
+ slash (/) followed by a digit. If you want to turn this feature off just
19
+ set the -T option to zero. If you want to decrease the sampling period (to
20
+ speed the script up) or increase it (to make it more sensitive to errors)
21
+ just change the -T option accordingly.
22
+
25
23
  Usage:
26
- $0 [-T <int> ]<output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
24
+ $0 [-T <int> ]<output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
27
25
 
28
26
  Where,
29
- -T <int> : Optional. Integer indicating the sampling period for names evaluation (see
30
- Description above). By default: 1000.
31
- output_fastq : Output file
32
- input_fastq_1 : First FastQ file
33
- input_fastq_2 : Second FastQ file
34
- ... : Any additional FastQ files (or none)
27
+ -T <int> : Optional. Integer indicating the sampling period for
28
+ names evaluation (see Description above).
29
+ By default: 1000.
30
+ output_fastq : Output file
31
+ input_fastq_1 : First FastQ file
32
+ input_fastq_2 : Second FastQ file
33
+ ... : Any additional FastQ files (or none)
35
34
 
36
35
  HELP
37
36
  ;
@@ -44,6 +43,7 @@ if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
44
43
  my $out = shift @ARGV;
45
44
  my @in = @ARGV;
46
45
 
46
+
47
47
  die $HELP unless $out and $#in >= 1;
48
48
  open OUT, ">", $out or die "Unable to write on $out: $!\n";
49
49
  print "Output file: $out\n";
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Jan-04-2016
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ #= Load stuff
9
+ args <- commandArgs(trailingOnly = F)
10
+ enveomics_R <- file.path(dirname(
11
+ sub("^--file=", "", args[grep("^--file=", args)])),
12
+ "lib", "enveomics.R")
13
+ library(methods)
14
+ source(file.path(enveomics_R, "R", "cliopts.R"))
15
+ source(file.path(enveomics_R, "R", "recplot2.R"))
16
+
17
+ #= Generate interface
18
+ opt <- enve.cliopts(enve.recplot2.compareIdentities,
19
+ file.path(enveomics_R, "man", "enve.recplot2.compareIdentities.Rd"),
20
+ positional_arguments=2,
21
+ usage="usage: %prog [options] recplot-A.Rdata recplot-B.Rdata",
22
+ number=c("pseudocounts", "max.deviation"), ignore=c("x", "y"),
23
+ p_desc="Calculates the difference between identity distributions of two recruitment plots.")
24
+
25
+ #= Run it!
26
+ load(opt$args[1])
27
+ opt$options[['x']] <- rp
28
+ load(opt$args[2])
29
+ opt$options[['y']] <- rp
30
+ dist <- do.call("enve.recplot2.compareIdentities", opt$options)
31
+ cat(dist, '\n')
32
+
@@ -2,14 +2,13 @@
2
2
 
3
3
  #
4
4
  # @author Luis M. Rodriguez-R
5
- # @update Nov-27-2015
6
5
  # @license artistic license 2.0
7
6
  #
8
7
 
9
- DATA_LINK="http://www.ebi.ac.uk/ena/data/warehouse/filereport"
8
+ DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
10
9
  DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
11
10
  SRX=$1
12
- DIR=${1:-$SRX}
11
+ DIR=${2:-$SRX}
13
12
 
14
13
  if [[ "$SRX" == "" ]] ; then
15
14
  echo "
@@ -27,24 +26,32 @@ fi
27
26
 
28
27
  [[ -d "$DIR" ]] || mkdir "$DIR"
29
28
 
29
+ function md5value {
30
+ local file=$1
31
+ o=$(md5 "$file" | perl -pe 's/.* //')
32
+ [[ -n $o ]] || o=$(md5sum-lite "$file" | awk '{print $1}')
33
+ [[ -n $o ]] || o=$(md5sum "$file" | awk '{print $1}')
34
+ echo "$o"
35
+ }
36
+
30
37
  curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
31
38
  tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
32
- srr=$(echo "$ln"|cut -f 1)
33
- ftp=$(echo "$ln"|cut -f 2)
34
- md5=$(echo "$ln"|cut -f 3)
35
- dir="$DIR/$srr"
36
- [[ -d "$dir" ]] || mkdir "$dir"
37
- echo "o $srr" >&2
38
- for uri in $(echo "$ftp" | tr ";" " ") ; do
39
- file="$dir/$(basename $uri)"
40
- curl "$uri" -o "$file"
41
- md5obs=$(md5sum "$file" | awk '{print $1}')
42
- if [[ "$md5" == "$md5obs"* ]] ; then
43
- md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
44
- else
45
- echo "Corrupt file: $file" >&2
46
- echo " MD5 mismatch: $md5obs not in $md5" >&2
47
- exit 1;
48
- fi
49
- done
39
+ srr=$(echo "$ln"|cut -f 1)
40
+ ftp=$(echo "$ln"|cut -f 2)
41
+ md5=$(echo "$ln"|cut -f 3)
42
+ dir="$DIR/$srr"
43
+ [[ -d "$dir" ]] || mkdir "$dir"
44
+ echo "o $srr" >&2
45
+ for uri in $(echo "$ftp" | tr ";" " ") ; do
46
+ file="$dir/$(basename $uri)"
47
+ curl "$uri" -o "$file"
48
+ md5obs=$(md5value "$file")
49
+ if [[ "$md5" == "$md5obs"* ]] ; then
50
+ md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
51
+ else
52
+ echo "Corrupt file: $file" >&2
53
+ echo " MD5 mismatch: $md5obs not in $md5" >&2
54
+ exit 1;
55
+ fi
56
+ done
50
57
  done
@@ -11,6 +11,7 @@ enveomics_R <- file.path(dirname(
11
11
  sub("^--file=", "", args[grep("^--file=", args)])),
12
12
  "lib", "enveomics.R")
13
13
  source(file.path(enveomics_R, "R", "cliopts.R"))
14
+ source(file.path(enveomics_R, "R", "utils.R"))
14
15
  source(file.path(enveomics_R, "R", "barplot.R"))
15
16
 
16
17
  #= Generate interface
@@ -296,8 +296,10 @@ Dir.mktmpdir do |dir|
296
296
  "#{dir}/#{i}.tab.uns"`
297
297
  `sort -k 1 "#{dir}/#{i}.tab.uns" > "#{dir}/#{i}.tab"`
298
298
  when "diamond"
299
- `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --outfmt 6 \
300
- --db "#{s}.dmnd" --query "#{q}" --out "#{dir}/#{i}.tab" --more-sensitive`
299
+ `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
300
+ --query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" \
301
+ && "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
302
+ --out "#{dir}/#{i}.tab"`
301
303
  else
302
304
  abort "Unsupported program: #{o[:program]}."
303
305
  end
@@ -34,11 +34,11 @@ library(inlinedocs)
34
34
  package.skeleton.dx('./');
35
35
  " | R --vanilla
36
36
  cat man/enveomics.R-package.Rd | tr -d '\r' \
37
- | grep -v '^}$' | grep -v '^\\author{' \
38
- | grep -v '^Maintainer' \
39
- | perl -pe 's/^\\keyword/}\n\\author{Luis M. Rodriguez-R <lmrodriguezr\@gmail.com> [aut, cre]}\n\n\\keyword/' \
40
- | perl -lwe '$/=\0; $_=<>; s/^\\details{\n+([^}].*\n+)*}\n+//mg; print' \
41
- > o && mv o man/enveomics.R-package.Rd
37
+ | grep -v '^}$' | grep -v '^\\author{' \
38
+ | grep -v '^Maintainer' \
39
+ | perl -pe 's/^\\keyword/}\n\\author{Luis M. Rodriguez-R <lmrodriguezr\@gmail.com> [aut, cre]}\n\n\\keyword/' \
40
+ | perl -lwe '$/=\0; $_=<>; s/^\\details{\n+([^}].*\n+)*}\n+//mg; print' \
41
+ > o && mv o man/enveomics.R-package.Rd
42
42
  #[[ ! -d inst/doc ]] && mkdir -p inst/doc
43
43
  #pandoc -o inst/doc/enveomics.R.pdf -f markdown_github README.md
44
44
 
@@ -1,5 +1,5 @@
1
1
  Package: enveomics.R
2
- Version: 1.1.6
2
+ Version: 1.3
3
3
  Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
4
4
  email="lmrodriguezr@gmail.com"))
5
5
  Title: Various Utilities for Microbial Genomics and Metagenomics
@@ -10,7 +10,8 @@ importFrom("graphics", "abline", "axis", "barplot", "hist", "image",
10
10
  "layout", "legend", "lines", "par", "points", "polygon",
11
11
  "rect", "text")
12
12
  importFrom("stats", "as.dist", "cophenetic", "cor", "median",
13
- "quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm")
13
+ "quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm",
14
+ "dnorm", "kmeans")
14
15
  importFrom("utils", "head", "read.table", "setTxtProgressBar", "tail",
15
16
  "txtProgressBar")
16
17
  exportClasses(enve.RecPlot2, enve.RecPlot2.Peak, enve.TRIBS, enve.TRIBStest,
@@ -29,7 +30,10 @@ export(
29
30
  plot.enve.TRIBStest, summary.enve.TRIBStest,
30
31
  enve.df2dist, enve.df2dist.group, enve.df2dist.list,
31
32
  enve.recplot2, plot.enve.RecPlot2, enve.recplot2.findPeaks,
33
+ enve.recplot2.findPeaks.emauto, enve.recplot2.findPeaks.em,
34
+ enve.recplot2.findPeaks.mower,
32
35
  enve.recplot2.corePeak, enve.recplot2.changeCutoff,
33
- enve.recplot2.extractWindows,
36
+ enve.recplot2.extractWindows, enve.recplot2.compareIdentities,
37
+ enve.recplot2.coordinates, enve.recplot2.seqdepth, enve.recplot2.ANIr,
34
38
  enve.growthcurve, plot.enve.GrowthCurve, summary.enve.GrowthCurve,
35
39
  enve.col2alpha)
@@ -9,7 +9,7 @@ setClass("enve.RecPlot2",
9
9
  id.counts='numeric', ##<< Counts per ID bin.
10
10
  id.breaks='numeric', ##<< Breaks of identity bins.
11
11
  pos.breaks='numeric', ##<< Breaks of position bins.
12
- seq.breaks='numeric',
12
+ seq.breaks='numeric', ##<< Breaks of input sequences.
13
13
  peaks='list', ##<< Peaks identified in the recplot.
14
14
  ### Limits of the subject sequences after concatenation.
15
15
  seq.names='character', ##<< Names of the subject sequences.
@@ -45,9 +45,13 @@ setClass("enve.RecPlot2.Peak",
45
45
  ### number of position bins with non-zero sequencing depth in the recruitment
46
46
  ### plot (regardless of peak count).
47
47
  err.res='numeric',
48
- ### Error left after adding the peak.
49
- merge.logdist='numeric'
48
+ ### Error left after adding the peak (mower) or log-likelihood (em or emauto).
49
+ merge.logdist='numeric',
50
50
  ### Attempted `merge.logdist` parameter.
51
+ seq.depth='numeric',
52
+ ### Best estimate available for the sequencing depth of the peak (centrality).
53
+ log='logical'
54
+ ### Indicates if the estimation was performed in natural logarithm space
51
55
  ));
52
56
  setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
53
57
  setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
@@ -83,6 +87,9 @@ plot.enve.RecPlot2 <- function
83
87
  peaks.col='darkred',
84
88
  ### If not NA, it attempts to represent peaks in the population histogram
85
89
  ### in the specified color. Set to NA to avoid peak-finding.
90
+ use.peaks,
91
+ ### A list of `enve.RecPlot2.Peak` objects, as returned by
92
+ ### `enve.recplot2.findPeaks`. If passed, `peaks.opts` is ignored.
86
93
  id.lim=range(x$id.breaks),
87
94
  ### Limits of identities to represent.
88
95
  pos.lim=range(x$pos.breaks),
@@ -98,15 +105,17 @@ plot.enve.RecPlot2 <- function
98
105
  ### the number of the panel as index (see `layout`).
99
106
  pos.splines=0,
100
107
  ### Smoothing parameter for the splines in the position histogram. Zero
101
- ### (0) for no splines. If non-zero, requires the stats package.
108
+ ### (0) for no splines. Use NULL to automatically detect by leave-one-out
109
+ ### cross-validation.
102
110
  id.splines=1/2,
103
111
  ### Smoothing parameter for the splines in the identity histogram. Zero
104
- ### (0) for no splines. If non-zero, requires the stats package.
105
- in.lwd=ifelse(pos.splines>0, 1/2, 2),
112
+ ### (0) for no splines. Use NULL to automatically detect by leave-one-out
113
+ ### cross-validation.
114
+ in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
106
115
  ### Line width for the sequencing depth of in-group matches.
107
- out.lwd=ifelse(pos.splines>0, 1/2, 2),
116
+ out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
108
117
  ### Line width for the sequencing depth of out-group matches.
109
- id.lwd=ifelse(id.splines>0, 1/2, 2),
118
+ id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
110
119
  ### Line width for the identity histogram.
111
120
  in.col='darkblue',
112
121
  ### Color associated to in-group matches.
@@ -193,7 +202,7 @@ plot.enve.RecPlot2 <- function
193
202
  pos.f <- rep(seqdepth.in,each=2)
194
203
  lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
195
204
  lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
196
- if(pos.splines > 0){
205
+ if(is.null(pos.splines) || pos.splines > 0){
197
206
  pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
198
207
  spar=pos.splines)
199
208
  lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
@@ -230,7 +239,7 @@ plot.enve.RecPlot2 <- function
230
239
  id.f <- rep(id.counts,each=2)
231
240
  id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
232
241
  lines(id.f, id.x, lwd=id.lwd, col=id.col);
233
- if(id.splines > 0){
242
+ if(is.null(id.splines) || id.splines > 0){
234
243
  id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
235
244
  spar=id.splines)
236
245
  lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
@@ -267,8 +276,12 @@ plot.enve.RecPlot2 <- function
267
276
  polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
268
277
  y.tmp.in, border=NA, col=in.col)
269
278
  if(!is.na(peaks.col)){
270
- o <- peaks.opts; o$x = x;
271
- peaks <- do.call(enve.recplot2.findPeaks, o);
279
+ o <- peaks.opts; o$x = x;
280
+ if(missing(use.peaks)){
281
+ peaks <- do.call(enve.recplot2.findPeaks, o)
282
+ }else{
283
+ peaks <- use.peaks
284
+ }
272
285
  h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
273
286
  if(!is.null(peaks) & length(peaks)>0){
274
287
  pf <- h.mids*0;
@@ -276,18 +289,23 @@ plot.enve.RecPlot2 <- function
276
289
  cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
277
290
  lines(cnt, h.mids, col='red');
278
291
  pf <- pf+cnt;
279
- axis(4, at=peaks[[i]]$param.hat[[length(peaks[[i]]$param.hat)]],
280
- letters[i], las=1, hadj=1/2)
292
+ axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
281
293
  }
282
294
  lines(pf, h.mids, col='red',lwd=1.5);
283
- legend('bottomright', legend=paste(
284
- letters[1:length(peaks)],'. ',
285
- signif(as.numeric(lapply(peaks,
286
- function(x) tail(as.numeric(x$param.hat),n=1))),3),'X (',
287
- signif(100*as.numeric(lapply(peaks,
288
- function(x) (length(x$values)/x$n.total))), 3), '%, err: ',
289
- signif(as.numeric(lapply(peaks, function(x) x$err.res)), 3), ')',
290
- sep=''), bty='n');
295
+ dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
296
+ frx <- signif(100*as.numeric(
297
+ lapply(peaks,
298
+ function(x) ifelse(length(x$values)==0, x$n.hat,
299
+ length(x$values))/x$n.total)), 2)
300
+ if(peaks[[1]]$err.res < 0){
301
+ err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
302
+ }else{
303
+ err <- paste(', err:',
304
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
305
+ }
306
+ legend('topright', bty='n', cex=1/2,
307
+ legend=paste(letters[1:length(peaks)],'. ',
308
+ dpt,'X (', frx, '%', err, ')', sep=''))
291
309
  }
292
310
  }
293
311
  }
@@ -334,6 +352,10 @@ enve.recplot2 <- function(
334
352
  id.breaks=300,
335
353
  ### Breaks in the identity histogram. It can also be a vector of break
336
354
  ### points, and values outside the range are ignored.
355
+ id.free.range=FALSE,
356
+ ### Indicates that the range should be freely set from the observed
357
+ ### values. Otherwise, 70-100% is included in the identity histogram
358
+ ### (default).
337
359
  id.metric=c('identity', 'corrected identity', 'bit score'),
338
360
  ### Metric of identity to be used (Y-axis). Corrected identity is only
339
361
  ### supported if the original BLAST file included sequence lengths.
@@ -373,12 +395,14 @@ enve.recplot2 <- function(
373
395
  if(pos.breaks>0){
374
396
  pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
375
397
  }else{
376
- pos.breaks <- c(lim[,2], tail(lim[,3], n=1))
398
+ pos.breaks <- c(lim[1,2], lim[,3])
377
399
  }
378
400
  }
379
401
  if(length(id.breaks)==1){
380
- id.breaks <- seq(min(rec[,rec.idcol]), max(rec[,rec.idcol]),
381
- length.out=id.breaks+1);
402
+ id.range.v <- rec[,rec.idcol]
403
+ if(!id.free.range) id.range.v <- c(id.range.v,70,100)
404
+ id.range.v <- range(id.range.v)
405
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
382
406
  }
383
407
 
384
408
  # Run in parallel
@@ -423,8 +447,159 @@ enve.recplot2 <- function(
423
447
  }
424
448
 
425
449
  enve.recplot2.findPeaks <- function(
450
+ ### Identifies peaks in the population histogram potentially indicating
451
+ ### sub-population mixtures
452
+ x,
453
+ ### An `enve.RecPlot2` object.
454
+ method="emauto",
455
+ ### Peak-finder method. This should be one of:
456
+ ### "emauto" (Expectation-Maximization with auto-selection of components),
457
+ ### "em" (Expectation-Maximization),
458
+ ### "mower" (Custom distribution-mowing method).
459
+ ...
460
+ ### Any additional parameters supported by
461
+ ### `enve.recplot2.findPeaks.<method>`.
462
+ ){
463
+ if(method == "emauto"){
464
+ peaks <- enve.recplot2.findPeaks.emauto(x, ...)
465
+ }else if(method == "em"){
466
+ peaks <- enve.recplot2.findPeaks.em(x, ...)
467
+ }else if(method == "mower"){
468
+ peaks <- enve.recplot2.findPeaks.mower(x, ...)
469
+ }else{
470
+ stop("Invalid peak-finder method ", method)
471
+ }
472
+ return(peaks)
473
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
474
+ }
475
+
476
+ enve.recplot2.findPeaks.emauto <- function(
477
+ ### Identifies peaks in the population histogram using a Gaussian Mixture
478
+ ### Model Expectation Maximization (GMM-EM) method with number of components
479
+ ### automatically detected.
480
+ x,
481
+ ### An `enve.RecPlot2` object.
482
+ components=seq(1,10),
483
+ ### A vector of number of components to evaluate.
484
+ criterion='aic',
485
+ ### Criterion to use for components selection. Must be one of:
486
+ ### 'aic' (Akaike Information Criterion),
487
+ ### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
488
+ merge.tol=2L,
489
+ ### When attempting to merge peaks with very similar sequencing depth, use
490
+ ### this number of significant digits (in log-scale).
491
+ verbose=FALSE,
492
+ ### Display (mostly debugging) information.
493
+ ...
494
+ ### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
495
+ ){
496
+ best <- list(crit=0, pstore=list())
497
+ if(criterion == 'aic'){
498
+ do_crit <- function(ll, k, n) 2*k - 2*ll
499
+ }else if(criterion %in% c('bic', 'sbc')){
500
+ do_crit <- function(ll, k, n) log(n)*k - 2*ll
501
+ }else{
502
+ stop('Invalid criterion ', criterion)
503
+ }
504
+ for(comp in components){
505
+ best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
506
+ verbose, ...)
507
+ }
508
+
509
+ seqdepths.r <- signif(log(sapply(best[['peaks']],
510
+ function(x) x$seq.depth)), merge.tol)
511
+ distinct <- length(unique(seqdepths.r))
512
+ if(distinct < length(best[['peaks']])){
513
+ if(verbose) cat('Attempting merge to', distinct, 'components\n')
514
+ init <- apply(sapply(best[['peaks']],
515
+ function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
516
+ init <- init[!duplicated(seqdepths.r),]
517
+ init <- list(mu=init[,'mean'], sd=init[,'sd'],
518
+ alpha=init[,'alpha']/sum(init[,'alpha']))
519
+ best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
520
+ verbose, ...)
521
+ }
522
+ return(best[['peaks']])
523
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
524
+ }
525
+
526
+ enve.recplot2.findPeaks.em <- function(
527
+ ### Identifies peaks in the population histogram using a Gaussian Mixture
528
+ ### Model Expectation Maximization (GMM-EM) method.
529
+ x,
530
+ ### An `enve.RecPlot2` object.
531
+ max.iter=1000,
532
+ ### Maximum number of EM iterations.
533
+ ll.diff.res=1e-8,
534
+ ### Maximum Log-Likelihood difference to be considered as convergent.
535
+ components=2,
536
+ ### Number of distributions assumed in the mixture.
537
+ rm.top=0.05,
538
+ ### Top-values to remove before finding peaks, as a quantile probability.
539
+ ### This step is useful to remove highly conserved regions, but can be
540
+ ### turned off by setting rm.top=0. The quantile is determined *after*
541
+ ### removing zero-coverage windows.
542
+ verbose=FALSE,
543
+ ### Display (mostly debugging) information.
544
+ init,
545
+ ### Initialization parameters. By default, these are derived from k-means
546
+ ### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
547
+ ### of length `components`.
548
+ log=TRUE
549
+ ### Logical value indicating if the estimations should be performed in
550
+ ### natural logarithm units. Do not change unless you know what you're
551
+ ### doing.
552
+ ){
553
+
554
+ # Essential vars
555
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
556
+ lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
557
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
558
+ if(log) lsd1 <- log(lsd1)
559
+
560
+ # 1. Initialize
561
+ if(missing(init)){
562
+ km.clust <- kmeans(lsd1, components)$cluster
563
+ init <- list(
564
+ mu = tapply(lsd1, km.clust, mean),
565
+ sd = tapply(lsd1, km.clust, sd),
566
+ alpha = table(km.clust)/length(km.clust)
567
+ )
568
+ }
569
+ m.step <- init
570
+ ll <- c()
571
+ cur.ll <- -Inf
572
+
573
+ for(i in 1:max.iter){
574
+ # 2/3. EM
575
+ e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
576
+ m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
577
+ # 4. Convergence
578
+ ll <- c(ll, e.step[["ll"]])
579
+ ll.diff <- abs(cur.ll - e.step[["ll"]])
580
+ cur.ll <- e.step[["ll"]]
581
+ if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
582
+ if(ll.diff <= ll.diff.res) break
583
+ }
584
+
585
+ # Return
586
+ peaks <- list()
587
+ for(i in 1:components){
588
+ n.hat <- m.step[['alpha']][i]*length(lsd1)
589
+ peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
590
+ values.res=0, mode=m.step[['mu']][i],
591
+ param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
592
+ n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
593
+ merge.logdist=as.numeric(), log=log,
594
+ seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
595
+ }
596
+ return(peaks)
597
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
598
+ }
599
+
600
+ enve.recplot2.findPeaks.mower <- function(
426
601
  ### Identifies peaks in the population histogram potentially indicating
427
- ### sub-population mixtures.
602
+ ### sub-population mixtures, using a custom distribution-mowing method.
428
603
  x,
429
604
  ### An `enve.RecPlot2` object.
430
605
  min.points=10,
@@ -436,13 +611,12 @@ enve.recplot2.findPeaks <- function(
436
611
  mlv.opts=list(method='parzen'),
437
612
  ### Options passed to `mlv` to estimate the mode.
438
613
  fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
439
- start=list(omega=1, alpha=-1), lower=c(1e-6, -Inf, 0),
440
- upper=c(Inf, 0, Inf)),
614
+ start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
441
615
  ### Options passed to `fitdist` to estimate the standard deviation if
442
616
  ### with.skewness=TRUE. Note that the `start` parameter will be ammended
443
617
  ### with xi=estimated mode for each peak.
444
- fitdist.opts.norm=list(distr='norm', method='qme', probs=c(.4,.6),
445
- start=list(sd=1), lower=c(1e-8, 0)),
618
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
619
+ start=list(sd=1), lower=c(0, -Inf)),
446
620
  ### Options passed to `fitdist` to estimate the standard deviation if
447
621
  ### with.skewness=FALSE. Note that the `start` parameter will be ammended
448
622
  ### with mean=estimated mode for each peak.
@@ -460,7 +634,7 @@ enve.recplot2.findPeaks <- function(
460
634
  ### "tail distribution".
461
635
  optim.rounds=200,
462
636
  ### Maximum rounds of peak optimization.
463
- optim.epsilon=1e-8,
637
+ optim.epsilon=1e-4,
464
638
  ### Trace change at which optimization stops (unless `optim.rounds` is
465
639
  ### reached first). The trace change is estimated as the sum of square
466
640
  ### differences between parameters in one round and those from two rounds
@@ -469,8 +643,12 @@ enve.recplot2.findPeaks <- function(
469
643
  ### Maximum value of |log-ratio| between centrality parameters in peaks to
470
644
  ### attempt merging. The default of ~0.22 corresponds to a maximum
471
645
  ### difference of 25%.
472
- verbose=FALSE
646
+ verbose=FALSE,
473
647
  ### Display (mostly debugging) information.
648
+ log=TRUE
649
+ ### Logical value indicating if the estimations should be performed in
650
+ ### natural logarithm units. Do not change unless you know what you're
651
+ ### doing.
474
652
  ){
475
653
 
476
654
  # Essential vars
@@ -478,6 +656,7 @@ enve.recplot2.findPeaks <- function(
478
656
  seqdepth.in <- x$pos.counts.in/pos.binsize;
479
657
  lsd1 <- seqdepth.in[seqdepth.in>0];
480
658
  lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
659
+ if(log) lsd1 <- log(lsd1)
481
660
  if(with.skewness){
482
661
  fitdist.opts <- fitdist.opts.sn
483
662
  }else{
@@ -486,11 +665,11 @@ enve.recplot2.findPeaks <- function(
486
665
  peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
487
666
  mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
488
667
  optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
489
- n.total=length(lsd1), merge.logdist=merge.logdist)
668
+ n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
490
669
 
491
670
  # Find seed peaks
492
671
  if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
493
- peaks <- enve.recplot2.__findPeaks(peaks.opts);
672
+ peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
494
673
 
495
674
  # Merge overlapping peaks
496
675
  if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
@@ -511,7 +690,7 @@ enve.recplot2.findPeaks <- function(
511
690
  p$param.hat[[ length(p$param.hat) ]],'&',
512
691
  p2$param.hat[[ length(p2$param.hat) ]],'X\n');
513
692
  peaks.opts$lsd1 <- c(p$values, p2$values)
514
- p.new <- enve.recplot2.__findPeaks(peaks.opts)
693
+ p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
515
694
  if(length(p.new)==1){
516
695
  peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
517
696
  ignore <- c(ignore, j)
@@ -542,16 +721,19 @@ enve.recplot2.corePeak <- function
542
721
  function(y) y$param.hat[[ length(y$param.hat) ]])))
543
722
  ]]
544
723
  # If a "larger" peak (a peak explaining more bins of the genome) is within
545
- # the "merge.logdist" distance, take that one instead.
724
+ # the default "merge.logdist" distance, take that one instead.
546
725
  corePeak <- maxPeak
547
726
  for(p in x){
548
- sz.d = log(length(p$values)/length(corePeak$values))
549
- if(sz.d < 0)
550
- next;
551
- sq.d.a <- p$param.hat[[ length(p$param.hat) ]]
552
- sq.d.b <- maxPeak$param.hat[[ length(maxPeak$param.hat) ]]
553
- if(abs(log(sq.d.a/sq.d.b )) < maxPeak$merge.logdist+sz.d/5)
554
- corePeak <- p
727
+ p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
728
+ corePeak.len <- ifelse(
729
+ length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
730
+ sz.d <- log(p.len/corePeak.len)
731
+ if(is.nan(sz.d) || sz.d < 0) next
732
+ sq.d.a <- as.numeric(tail(p$param.hat, n=1))
733
+ sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
734
+ if(p$log) sq.d.a <- exp(sq.d.a)
735
+ if(corePeak$log) sq.d.b <- exp(sq.d.b)
736
+ if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
555
737
  }
556
738
  return(corePeak)
557
739
  }
@@ -580,43 +762,204 @@ enve.recplot2.extractWindows <- function
580
762
  ### Extract windows significantly below (or above) the peak in sequencing
581
763
  ### depth.
582
764
  (rp,
583
- ### Recruitment plot, a enve.Recplot2 object.
765
+ ### Recruitment plot, a enve.RecPlot2 object.
584
766
  peak,
585
- ### Peak, a enve.RecPlot2.Peak object. If list, it is assumed to be a list
586
- ### of enve.RecPlot2.Peak objects, in which case the core peak is used
587
- ### (see enve.recplot2.corePeak).
767
+ ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
768
+ ### list of enve.RecPlot2.Peak objects, in which case the core peak is
769
+ ### used (see `enve.recplot2.corePeak`).
588
770
  lower.tail=TRUE,
589
771
  ### If FALSE, it returns windows significantly above the peak in
590
772
  ### sequencing depth.
591
773
  significance=0.05,
592
774
  ### Significance threshold (alpha) to select windows.
593
775
  seq.names=FALSE
594
- ### Returns subject sequence names instead of a vector of Booleans. It
595
- ### assumes that the recruitment plot was generated with pos.breaks=0.
776
+ ### Returns subject sequence names instead of a vector of Booleans. If
777
+ ### the recruitment plot was generated with pos.breaks=0 it returns a
778
+ ### vector of characters (the sequence identifiers), otherwise it returns
779
+ ### a data.frame with a name column and two columns of coordinates.
596
780
  ){
597
781
  # Determine the threshold
598
782
  if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
599
783
  par <- peak$param.hat
600
784
  par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
601
785
  thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
786
+ if(peak$log) thr <- exp(thr)
602
787
 
603
- # Estimate sequencing depths per window
604
- pos.cnts.in <- rp$pos.counts.in
605
- pos.breaks <- rp$pos.breaks
606
- pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
607
- seqdepth.in <- pos.cnts.in/pos.binsize
608
-
609
788
  # Select windows past the threshold
789
+ seqdepth.in <- enve.recplot2.seqdepth(rp)
610
790
  if(lower.tail){
611
791
  sel <- seqdepth.in < thr
612
792
  }else{
613
793
  sel <- seqdepth.in > thr
614
794
  }
795
+
796
+ # seq.names=FALSE
615
797
  if(!seq.names) return(sel)
616
- if(length(seqdepth.in) != length(rp$seq.names))
617
- stop(paste("Requesting subject sequence names, but the recruitment plot",
618
- "was not generated with pos.breaks=0."))
619
- return(rp$seq.names[sel])
798
+ # seq.names=TRUE and pos.breaks=0
799
+ if(length(rp$pos.breaks)==length(rp$seq.breaks) &&
800
+ rp$pos.breaks==rp$seq.breaks)
801
+ return(rp$seq.names[sel])
802
+ # seq.names=TRUE and pos.breaks!=0
803
+ return(enve.recplot2.coordinates(rp,sel))
804
+ ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
805
+ ### it returns a vector of characters if the object was built with
806
+ ### `pos.breaks=0` or a data.frame with four columns otherwise: name.from,
807
+ ### name.to, pos.from, and pos.to (see `enve.recplot2.coordinates`).
808
+ }
809
+
810
+ enve.recplot2.compareIdentities <- function
811
+ ### Compare the distribution of identities between two enve.RecPlot2 objects.
812
+ (x,
813
+ ### First enve.RecPlot2 object.
814
+ y,
815
+ ### Second enve.RecPlot2 object.
816
+ method="hellinger",
817
+ ### Distance method to use. This should be (an unambiguous abbreviation of)
818
+ ### one of:
819
+ ### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
820
+ ### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
821
+ ### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
822
+ ### doi:10.1214/aoms/1177729694), or "euclidean".
823
+ smooth.par=NULL,
824
+ ### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
825
+ ### Use NULL to automatically determine this value using leave-one-out
826
+ ### cross-validation (see `smooth.spline` parameter `spar`).
827
+ pseudocounts=0,
828
+ ### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
829
+ ### 1 for add-one smoothing.
830
+ max.deviation=0.75
831
+ ### Maximum mean deviation between identity breaks tolerated (as percent
832
+ ### identity). Difference in number of id.breaks is never tolerated.
833
+ ){
834
+ METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
835
+ i.meth <- pmatch(method, METHODS)
836
+ if (is.na(i.meth)) stop("Invalid distance ", method)
837
+ if(!inherits(x, "enve.RecPlot2"))
838
+ stop("'x' must inherit from class `enve.RecPlot2`")
839
+ if(!inherits(y, "enve.RecPlot2"))
840
+ stop("'y' must inherit from class `enve.RecPlot2`")
841
+ if(length(x$id.breaks) != length(y$id.breaks))
842
+ stop("'x' and 'y' must have the same number of `id.breaks`")
843
+ dev <- mean(abs(x$id.breaks - y$id.breaks))
844
+ if(dev > max.deviation)
845
+ stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
846
+ dev)
847
+ x.cnt <- x$id.counts
848
+ y.cnt <- y$id.counts
849
+ if(is.null(smooth.par) || smooth.par > 0){
850
+ x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
851
+ y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
852
+ p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
853
+ q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
854
+ x.cnt <- pmax(p.spline$y, 0)
855
+ y.cnt <- pmax(q.spline$y, 0)
856
+ }
857
+ a <- as.numeric(pseudocounts)
858
+ p <- (x.cnt + a) / sum(x.cnt + a)
859
+ q <- (y.cnt + a) / sum(y.cnt + a)
860
+ d <- NA
861
+ if(i.meth %in% c(1L, 2L)){
862
+ d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
863
+ if(i.meth==2L) d <- 1 - d**2
864
+ }else if(i.meth %in% c(3L, 4L)){
865
+ sel <- p>0
866
+ if(any(q[sel]==0))
867
+ stop("Undefined distance without absolute continuity, use pseudocounts")
868
+ d <- -sum(p[sel]*log(q[sel]/p[sel]))
869
+ }else if(i.meth == 5L){
870
+ d <- sqrt(sum((q-p)**2))
871
+ }
872
+ return(d)
873
+ }
874
+
875
+ enve.recplot2.coordinates <- function
876
+ ### Returns the sequence name and coordinates of the requested position bins.
877
+ (x,
878
+ ### `enve.RecPlot2` object.
879
+ bins
880
+ ### Vector of selected bins to return. It can be a vector of logical values
881
+ ### with the same length as `x$pos.breaks`-1 or a vector of integers. If
882
+ ### missing, returns the coordinates of all windows.
883
+ ){
884
+ if(!inherits(x, "enve.RecPlot2"))
885
+ stop("'x' must inherit from class `enve.RecPlot2`")
886
+ if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
887
+ if(!is.vector(bins)) stop("'bins' must be a vector")
888
+ if(inherits(bins, "logical")) bins <- which(bins)
889
+
890
+ y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
891
+
892
+ for(i in 1:length(bins)){
893
+ j <- bins[i]
894
+ # Concatenated coordinates
895
+ cc <- x$pos.breaks[c(j, j+1)]
896
+ # Find the corresponding `seq.breaks`
897
+ sb.from <- which(
898
+ cc[1] >=x$seq.breaks[-length(x$seq.breaks)] &
899
+ cc[1] < x$seq.breaks[-1])
900
+ sb.to <- which(
901
+ cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
902
+ cc[2] <=x$seq.breaks[-1])
903
+ # Translate coordinates
904
+ if(length(sb.from)==1 & length(sb.to)==1){
905
+ y[i, 'name.from'] <- x$seq.names[sb.from]
906
+ y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
907
+ y[i, 'name.to'] <- x$seq.names[sb.to]
908
+ y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
909
+ }
910
+ }
911
+
912
+ return(y)
913
+ ### Returns a data.frame with four columns: name.from (character), pos.from
914
+ ### (numeric) name.to (character), and pos.to (numeric). The first two
915
+ ### correspond to sequence and position of the start point of the bin, the
916
+ ### last two correspond to the sequence and position of the end point of the
917
+ ### bin.
918
+ }
919
+
920
+ enve.recplot2.seqdepth <- function
921
+ ### Calculate the sequencing depth of the given window(s)
922
+ (x,
923
+ ### `enve.RecPlot2` object.
924
+ sel,
925
+ ### Window(s) for which the sequencing depth is to be calculated. If not
926
+ ### passed, it returns the sequencing depth of all windows
927
+ low.identity=FALSE
928
+ ### A logical indicating if the sequencing depth is to be estimated only
929
+ ### with low-identity matches. By default, only high-identity matches are
930
+ ### used.
931
+ ){
932
+ if(!inherits(x, "enve.RecPlot2"))
933
+ stop("'x' must inherit from class `enve.RecPlot2`")
934
+ pos.cnts.in <- x$pos.counts.in
935
+ pos.breaks <- x$pos.breaks
936
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
937
+ seqdepth.in <- pos.cnts.in/pos.binsize
938
+ if(missing(sel)) return(seqdepth.in)
939
+ return(seqdepth.in[sel])
940
+ ### Returns a numeric vector of sequencing depths (in bp/bp).
941
+ }
942
+
943
+ enve.recplot2.ANIr <- function
944
+ ### Estimate the Average Nucleotide Identity from reads (ANIr) from a
945
+ ### recruitment plot
946
+ (x,
947
+ ### `enve.RecPlot2` object.
948
+ range=c(0,Inf)
949
+ ### Range of identities to be considered. By default, the full range
950
+ ### is used (note that the upper boundary is `Inf` and not 100 because
951
+ ### recruitment plots can also be built with bit-scores). To use only
952
+ ### intra-population matches (with identities), use c(95,100). To use only
953
+ ### inter-population values, use c(0,95).
954
+ ){
955
+ if(!inherits(x, "enve.RecPlot2"))
956
+ stop("'x' must inherit from class `enve.RecPlot2`")
957
+ id.b <- x$id.breaks
958
+ id <- (id.b[-1]+id.b[-length(id.b)])/2
959
+ cnt <- x$id.counts
960
+ cnt[id < range[1]] <- 0
961
+ cnt[id > range[2]] <- 0
962
+ return(sum(id*cnt/sum(cnt)))
620
963
  }
621
964
 
622
965
  #==============> Define internal functions
@@ -640,21 +983,67 @@ enve.recplot2.__counts <- function
640
983
  return(counts);
641
984
  }
642
985
 
986
+ enve.recplot2.findPeaks.__emauto_one <- function
987
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.emauto).
988
+ (x, comp, do_crit, best, verbose, ...){
989
+ peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
990
+ k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
991
+ crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
992
+ if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
993
+ ifelse(crit > best[['crit']], '*', ''), '\n')
994
+ if(crit > best[['crit']]){
995
+ best[['crit']] <- crit
996
+ best[['peaks']] <- peaks
997
+ }
998
+ best[['pstore']][[comp]] <- peaks
999
+ return(best)
1000
+ }
1001
+ enve.recplot2.findPeaks.__em_e <- function
1002
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`).
1003
+ (x, theta){
1004
+ components <- length(theta[['mu']])
1005
+ product <- do.call(cbind,
1006
+ lapply(1:components,
1007
+ function(i) dnorm(x, theta[['mu']][i],
1008
+ theta[['sd']][i])*theta[['alpha']][i]))
1009
+ sum.of.components <- rowSums(product)
1010
+ posterior <- product / sum.of.components
1011
+
1012
+ return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1013
+ }
1014
+
1015
+ enve.recplot2.findPeaks.__em_m <- function
1016
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`
1017
+ (x, posterior){
1018
+ components <- ncol(posterior)
1019
+ n <- colSums(posterior)
1020
+ mu <- colSums(posterior * x) / n
1021
+ sd <- sqrt( colSums(
1022
+ posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1023
+ alpha <- n/length(x)
1024
+ return(list(mu=mu, sd=sd, alpha=alpha))
1025
+ }
1026
+
643
1027
  enve.recplot2.__peakHist <- function
644
1028
  ### Internal ancilliary function (see `enve.RecPlot2.Peak`).
645
1029
  (x, mids, counts=TRUE){
646
1030
  d.o <- x$param.hat
647
- d.o$x <- mids
1031
+ if(length(x$log)==0) x$log <- FALSE
1032
+ if(x$log){
1033
+ d.o$x <- log(mids)
1034
+ }else{
1035
+ d.o$x <- mids
1036
+ }
648
1037
  prob <- do.call(paste('d', x$dist, sep=''), d.o)
649
1038
  if(!counts) return(prob)
650
1039
  if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
651
1040
  return(prob*x$n.hat/sum(prob))
652
1041
  }
653
1042
 
654
- enve.recplot2.__findPeak <- function
655
- ### Internall ancilliary function (see `enve.recplot2.findPeaks`).
1043
+ enve.recplot2.findPeaks.__mow_one <- function
1044
+ ### Internall ancilliary function (see `enve.recplot2.findPeaks.mower`).
656
1045
  (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
657
- optim.rounds, optim.epsilon, n.total, merge.logdist, verbose
1046
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
658
1047
  ){
659
1048
  dist <- ifelse(with.skewness, 'sn', 'norm');
660
1049
 
@@ -683,8 +1072,14 @@ enve.recplot2.__findPeak <- function
683
1072
  if(round>1) param.hat <- last.hat;
684
1073
  break;
685
1074
  }
686
- epsilon <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
687
- if(round>2) if(epsilon < optim.epsilon) break;
1075
+ if(round > 1){
1076
+ epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1077
+ if(epsilon1 < optim.epsilon) break;
1078
+ if(round > 2){
1079
+ epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1080
+ if(epsilon2 < optim.epsilon) break;
1081
+ }
1082
+ }
688
1083
  }
689
1084
  if(verbose) cat('\n')
690
1085
  if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
@@ -695,14 +1090,14 @@ enve.recplot2.__findPeak <- function
695
1090
  n.hat <- length(lsd1.pop)/diff(quant.est)
696
1091
  peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
697
1092
  param.hat=param.hat, n.hat=n.hat, n.total=n.total,
698
- merge.logdist=merge.logdist)
1093
+ merge.logdist=merge.logdist, log=log)
699
1094
  peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
700
1095
  peak.cnt <- enve.recplot2.__peakHist(peak,
701
1096
  (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
702
1097
  for(i in 2:length(peak.breaks)){
703
1098
  values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
704
1099
  n.exp <- peak.cnt[i-1]
705
- if(n.exp==0) n.exp=0.1
1100
+ if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
706
1101
  if(length(values)==0) next
707
1102
  in.peak <- runif(length(values)) <= n.exp/length(values)
708
1103
  lsd2 <- c(lsd2, values[!in.peak])
@@ -716,17 +1111,19 @@ enve.recplot2.__findPeak <- function
716
1111
  attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
717
1112
  plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
718
1113
  plot=FALSE)$counts)+1)/2
1114
+ mu <- tail(param.hat, n=1)
1115
+ attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
719
1116
  if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
720
1117
  'with expected n =',n.hat,'\n')
721
1118
  return(peak)
722
1119
  }
723
1120
 
724
- enve.recplot2.__findPeaks <- function
725
- ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
1121
+ enve.recplot2.findPeaks.__mower <- function
1122
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.mower`).
726
1123
  (peaks.opts){
727
1124
  peaks <- list()
728
1125
  while(length(peaks.opts$lsd1) > peaks.opts$min.points){
729
- peak <- do.call(enve.recplot2.__findPeak, peaks.opts)
1126
+ peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
730
1127
  if(is.null(peak)) break
731
1128
  peaks[[ length(peaks)+1 ]] <- peak
732
1129
  peaks.opts$lsd1 <- peak$values.res
@@ -738,7 +1135,10 @@ enve.recplot2.__findPeaks <- function
738
1135
  enve.recplot2.__whichClosestPeak <- function
739
1136
  ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
740
1137
  (peak, peaks){
741
- dist <- as.numeric(lapply(peaks, function(x) abs(log(x$param.hat[[ length(x$param.hat) ]]/peak$param.hat[[ length(peak$param.hat) ]] ))))
1138
+ dist <- as.numeric(lapply(peaks,
1139
+ function(x)
1140
+ abs(log(x$param.hat[[ length(x$param.hat) ]] /
1141
+ peak$param.hat[[ length(peak$param.hat) ]] ))))
742
1142
  dist[ dist==0 ] <- Inf
743
1143
  return(which.min(dist))
744
1144
  }