miga-base 0.3.1.7 → 0.3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/actions/ncbi_get.rb +8 -0
  3. data/lib/miga/common.rb +9 -215
  4. data/lib/miga/common/base.rb +49 -0
  5. data/lib/miga/common/format.rb +135 -0
  6. data/lib/miga/common/path.rb +49 -0
  7. data/lib/miga/daemon.rb +3 -60
  8. data/lib/miga/daemon/base.rb +69 -0
  9. data/lib/miga/dataset.rb +3 -3
  10. data/lib/miga/dataset/result.rb +5 -5
  11. data/lib/miga/result.rb +5 -0
  12. data/lib/miga/version.rb +7 -5
  13. data/scripts/distances.bash +2 -19
  14. data/scripts/taxonomy.bash +2 -21
  15. data/test/common_test.rb +9 -0
  16. data/utils/distance/base.rb +6 -0
  17. data/utils/distance/commands.rb +82 -0
  18. data/utils/distance/database.rb +86 -0
  19. data/utils/distance/pipeline.rb +98 -0
  20. data/utils/distance/runner.rb +104 -0
  21. data/utils/distance/temporal.rb +37 -0
  22. data/utils/distances.rb +9 -0
  23. data/utils/enveomics/Docs/recplot2.md +233 -0
  24. data/utils/enveomics/Makefile +1 -1
  25. data/utils/enveomics/Manifest/Tasks/blasttab.json +66 -0
  26. data/utils/enveomics/Manifest/Tasks/fasta.json +10 -3
  27. data/utils/enveomics/Manifest/Tasks/fastq.json +4 -4
  28. data/utils/enveomics/Manifest/Tasks/mapping.json +38 -1
  29. data/utils/enveomics/Manifest/categories.json +11 -1
  30. data/utils/enveomics/Manifest/examples.json +2 -2
  31. data/utils/enveomics/README.md +2 -0
  32. data/utils/enveomics/Scripts/Aln.cat.rb +1 -0
  33. data/utils/enveomics/Scripts/BedGraph.tad.rb +52 -30
  34. data/utils/enveomics/Scripts/BedGraph.window.rb +71 -0
  35. data/utils/enveomics/Scripts/BlastTab.recplot2.R +7 -2
  36. data/utils/enveomics/Scripts/FastA.interpose.pl +26 -20
  37. data/utils/enveomics/Scripts/FastQ.interpose.pl +20 -20
  38. data/utils/enveomics/Scripts/RecPlot2.compareIdentities.R +32 -0
  39. data/utils/enveomics/Scripts/SRA.download.bash +28 -21
  40. data/utils/enveomics/Scripts/Table.barplot.R +1 -0
  41. data/utils/enveomics/Scripts/aai.rb +4 -2
  42. data/utils/enveomics/build_enveomics_r.bash +5 -5
  43. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -1
  44. data/utils/enveomics/enveomics.R/NAMESPACE +6 -2
  45. data/utils/enveomics/enveomics.R/R/recplot2.R +471 -71
  46. data/utils/enveomics/enveomics.R/README.md +26 -17
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -1
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +23 -0
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +6 -3
  50. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +32 -0
  51. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +24 -0
  52. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +12 -7
  53. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +8 -37
  54. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +20 -0
  55. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +20 -0
  56. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +29 -0
  57. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +42 -0
  58. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +18 -0
  59. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +33 -0
  60. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +28 -0
  61. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +56 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +3 -1
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +22 -0
  64. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +20 -14
  65. data/utils/requirements.txt +1 -1
  66. metadata +28 -4
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeak.Rd +0 -40
  68. data/utils/enveomics/enveomics.R/man/enve.recplot2.__findPeaks.Rd +0 -18
@@ -1,43 +1,49 @@
1
1
  #!/usr/bin/env perl
2
2
 
3
- # Interpose sequences in FastA format from two files into one output file. If more than two files are
4
- # provided, the script will interpose all the input files.
5
- # Please note that this script will check for the consistency of the names (assuming a pair of related reads
6
- # contains the same name varying only in a trailing slash (/) followed by a digit. If you want to turn this
7
- # feature off just set the $eval_T variable to zero. If you want to decrease the sampling period (to speed
8
- # the script up) or increase it (to make it more sensitive to errors) just change $eval_T accordingly.
9
- #
10
3
  # @author Luis M. Rodriguez-R
11
- # @version 1.0
12
- # @created Nov-27-2012
13
- # @update Mar-23-2015
14
4
  # @license artistic license 2.0
15
- #
16
- # Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
17
5
 
18
6
  use strict;
19
7
  use warnings;
20
8
  use Symbol;
21
9
 
22
10
  my $HELP = <<HELP
11
+
12
+ Description:
13
+ Interposes sequences in FastA format from two files into one output file.
14
+ If more than two files are provided, the script will interpose all the input
15
+ files.
16
+ Note that this script will check for the consistency of the names (assuming
17
+ a pair of related reads contains the same name varying only in a trailing
18
+ slash (/) followed by a digit. If you want to turn this feature off just
19
+ set the -T option to zero. If you want to decrease the sampling period (to
20
+ speed the script up) or increase it (to make it more sensitive to errors)
21
+ just change the -T option accordingly.
22
+
23
23
  Usage:
24
- $0 <output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]
24
+ $0 [-T <int> ]<output_fasta> <input_fasta_1> <input_fasta_2> [additional input files...]
25
25
 
26
26
  Where,
27
- output_fasta : Output file
28
- input_fasta_1 : First FastA file
29
- input_fasta_2 : Second FastA file
30
- ... : Any additional FastA files (or none)
27
+ -T <int> : Optional. Integer indicating the sampling period for
28
+ names evaluation (see Description above).
29
+ By default: 1000.
30
+ output_fasta : Output file
31
+ input_fasta_1 : First FastA file
32
+ input_fasta_2 : Second FastA file
33
+ ... : Any additional FastA files (or none)
31
34
 
32
35
  HELP
33
36
  ;
34
- my $eval_T = 1000; # Period (in number of entries) of evaluation for consistency of the names.
35
- # To turn off evaluation set to 0 (zero).
37
+ my $eval_T = 1000;
38
+ if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
39
+ $eval_T = $ARGV[1]+0;
40
+ shift @ARGV;
41
+ shift @ARGV;
42
+ }
36
43
  my $out = shift @ARGV;
37
44
  my @in = @ARGV;
38
45
  $/ = "\n>";
39
46
 
40
-
41
47
  die $HELP unless $out and $#in >= 1;
42
48
  open OUT, ">", $out or die "Unable to write on $out: $!\n";
43
49
  print "Output file: $out\n";
@@ -1,11 +1,7 @@
1
1
  #!/usr/bin/env perl
2
- #
2
+
3
3
  # @author Luis M. Rodriguez-R
4
- # @version 2.0
5
- # @update: Mar-23-2015
6
4
  # @license artistic license 2.0
7
- #
8
- # Usage: FastQ.interpose.pl <output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
9
5
 
10
6
  use strict;
11
7
  use warnings;
@@ -14,24 +10,27 @@ use Symbol;
14
10
  my $HELP = <<HELP
15
11
 
16
12
  Description:
17
- Interposes sequences in FastQ format from two files into one output file. If more than two files are
18
- provided, the script will interpose all the input files.
19
- Note that this script will check for the consistency of the names (assuming a pair of related reads
20
- contains the same name varying only in a trailing slash (/) followed by a digit. If you want to turn
21
- this feature off just set the -T option to zero. If you want to decrease the sampling period
22
- (to speed the script up) or increase it (to make it more sensitive to errors) just change -T option
23
- accordingly.
24
-
13
+ Interposes sequences in FastQ format from two files into one output file.
14
+ If more than two files are provided, the script will interpose all the input
15
+ files.
16
+ Note that this script will check for the consistency of the names (assuming
17
+ a pair of related reads contains the same name varying only in a trailing
18
+ slash (/) followed by a digit. If you want to turn this feature off just
19
+ set the -T option to zero. If you want to decrease the sampling period (to
20
+ speed the script up) or increase it (to make it more sensitive to errors)
21
+ just change the -T option accordingly.
22
+
25
23
  Usage:
26
- $0 [-T <int> ]<output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
24
+ $0 [-T <int> ]<output_fastq> <input_fastq_1> <input_fastq_2> [additional input files...]
27
25
 
28
26
  Where,
29
- -T <int> : Optional. Integer indicating the sampling period for names evaluation (see
30
- Description above). By default: 1000.
31
- output_fastq : Output file
32
- input_fastq_1 : First FastQ file
33
- input_fastq_2 : Second FastQ file
34
- ... : Any additional FastQ files (or none)
27
+ -T <int> : Optional. Integer indicating the sampling period for
28
+ names evaluation (see Description above).
29
+ By default: 1000.
30
+ output_fastq : Output file
31
+ input_fastq_1 : First FastQ file
32
+ input_fastq_2 : Second FastQ file
33
+ ... : Any additional FastQ files (or none)
35
34
 
36
35
  HELP
37
36
  ;
@@ -44,6 +43,7 @@ if(exists $ARGV[0] and exists $ARGV[1] and $ARGV[0] eq '-T'){
44
43
  my $out = shift @ARGV;
45
44
  my @in = @ARGV;
46
45
 
46
+
47
47
  die $HELP unless $out and $#in >= 1;
48
48
  open OUT, ">", $out or die "Unable to write on $out: $!\n";
49
49
  print "Output file: $out\n";
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Jan-04-2016
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ #= Load stuff
9
+ args <- commandArgs(trailingOnly = F)
10
+ enveomics_R <- file.path(dirname(
11
+ sub("^--file=", "", args[grep("^--file=", args)])),
12
+ "lib", "enveomics.R")
13
+ library(methods)
14
+ source(file.path(enveomics_R, "R", "cliopts.R"))
15
+ source(file.path(enveomics_R, "R", "recplot2.R"))
16
+
17
+ #= Generate interface
18
+ opt <- enve.cliopts(enve.recplot2.compareIdentities,
19
+ file.path(enveomics_R, "man", "enve.recplot2.compareIdentities.Rd"),
20
+ positional_arguments=2,
21
+ usage="usage: %prog [options] recplot-A.Rdata recplot-B.Rdata",
22
+ number=c("pseudocounts", "max.deviation"), ignore=c("x", "y"),
23
+ p_desc="Calculates the difference between identity distributions of two recruitment plots.")
24
+
25
+ #= Run it!
26
+ load(opt$args[1])
27
+ opt$options[['x']] <- rp
28
+ load(opt$args[2])
29
+ opt$options[['y']] <- rp
30
+ dist <- do.call("enve.recplot2.compareIdentities", opt$options)
31
+ cat(dist, '\n')
32
+
@@ -2,14 +2,13 @@
2
2
 
3
3
  #
4
4
  # @author Luis M. Rodriguez-R
5
- # @update Nov-27-2015
6
5
  # @license artistic license 2.0
7
6
  #
8
7
 
9
- DATA_LINK="http://www.ebi.ac.uk/ena/data/warehouse/filereport"
8
+ DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
10
9
  DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
11
10
  SRX=$1
12
- DIR=${1:-$SRX}
11
+ DIR=${2:-$SRX}
13
12
 
14
13
  if [[ "$SRX" == "" ]] ; then
15
14
  echo "
@@ -27,24 +26,32 @@ fi
27
26
 
28
27
  [[ -d "$DIR" ]] || mkdir "$DIR"
29
28
 
29
+ function md5value {
30
+ local file=$1
31
+ o=$(md5 "$file" | perl -pe 's/.* //')
32
+ [[ -n $o ]] || o=$(md5sum-lite "$file" | awk '{print $1}')
33
+ [[ -n $o ]] || o=$(md5sum "$file" | awk '{print $1}')
34
+ echo "$o"
35
+ }
36
+
30
37
  curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
31
38
  tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
32
- srr=$(echo "$ln"|cut -f 1)
33
- ftp=$(echo "$ln"|cut -f 2)
34
- md5=$(echo "$ln"|cut -f 3)
35
- dir="$DIR/$srr"
36
- [[ -d "$dir" ]] || mkdir "$dir"
37
- echo "o $srr" >&2
38
- for uri in $(echo "$ftp" | tr ";" " ") ; do
39
- file="$dir/$(basename $uri)"
40
- curl "$uri" -o "$file"
41
- md5obs=$(md5sum "$file" | awk '{print $1}')
42
- if [[ "$md5" == "$md5obs"* ]] ; then
43
- md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
44
- else
45
- echo "Corrupt file: $file" >&2
46
- echo " MD5 mismatch: $md5obs not in $md5" >&2
47
- exit 1;
48
- fi
49
- done
39
+ srr=$(echo "$ln"|cut -f 1)
40
+ ftp=$(echo "$ln"|cut -f 2)
41
+ md5=$(echo "$ln"|cut -f 3)
42
+ dir="$DIR/$srr"
43
+ [[ -d "$dir" ]] || mkdir "$dir"
44
+ echo "o $srr" >&2
45
+ for uri in $(echo "$ftp" | tr ";" " ") ; do
46
+ file="$dir/$(basename $uri)"
47
+ curl "$uri" -o "$file"
48
+ md5obs=$(md5value "$file")
49
+ if [[ "$md5" == "$md5obs"* ]] ; then
50
+ md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
51
+ else
52
+ echo "Corrupt file: $file" >&2
53
+ echo " MD5 mismatch: $md5obs not in $md5" >&2
54
+ exit 1;
55
+ fi
56
+ done
50
57
  done
@@ -11,6 +11,7 @@ enveomics_R <- file.path(dirname(
11
11
  sub("^--file=", "", args[grep("^--file=", args)])),
12
12
  "lib", "enveomics.R")
13
13
  source(file.path(enveomics_R, "R", "cliopts.R"))
14
+ source(file.path(enveomics_R, "R", "utils.R"))
14
15
  source(file.path(enveomics_R, "R", "barplot.R"))
15
16
 
16
17
  #= Generate interface
@@ -296,8 +296,10 @@ Dir.mktmpdir do |dir|
296
296
  "#{dir}/#{i}.tab.uns"`
297
297
  `sort -k 1 "#{dir}/#{i}.tab.uns" > "#{dir}/#{i}.tab"`
298
298
  when "diamond"
299
- `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --outfmt 6 \
300
- --db "#{s}.dmnd" --query "#{q}" --out "#{dir}/#{i}.tab" --more-sensitive`
299
+ `"#{o[:bin]}diamond" blastp --threads "#{o[:thr]}" --db "#{s}.dmnd" \
300
+ --query "#{q}" --sensitive --daa "#{dir}/#{i}.daa" \
301
+ && "#{o[:bin]}diamond" view --daa "#{dir}/#{i}.daa" --outfmt 6 \
302
+ --out "#{dir}/#{i}.tab"`
301
303
  else
302
304
  abort "Unsupported program: #{o[:program]}."
303
305
  end
@@ -34,11 +34,11 @@ library(inlinedocs)
34
34
  package.skeleton.dx('./');
35
35
  " | R --vanilla
36
36
  cat man/enveomics.R-package.Rd | tr -d '\r' \
37
- | grep -v '^}$' | grep -v '^\\author{' \
38
- | grep -v '^Maintainer' \
39
- | perl -pe 's/^\\keyword/}\n\\author{Luis M. Rodriguez-R <lmrodriguezr\@gmail.com> [aut, cre]}\n\n\\keyword/' \
40
- | perl -lwe '$/=\0; $_=<>; s/^\\details{\n+([^}].*\n+)*}\n+//mg; print' \
41
- > o && mv o man/enveomics.R-package.Rd
37
+ | grep -v '^}$' | grep -v '^\\author{' \
38
+ | grep -v '^Maintainer' \
39
+ | perl -pe 's/^\\keyword/}\n\\author{Luis M. Rodriguez-R <lmrodriguezr\@gmail.com> [aut, cre]}\n\n\\keyword/' \
40
+ | perl -lwe '$/=\0; $_=<>; s/^\\details{\n+([^}].*\n+)*}\n+//mg; print' \
41
+ > o && mv o man/enveomics.R-package.Rd
42
42
  #[[ ! -d inst/doc ]] && mkdir -p inst/doc
43
43
  #pandoc -o inst/doc/enveomics.R.pdf -f markdown_github README.md
44
44
 
@@ -1,5 +1,5 @@
1
1
  Package: enveomics.R
2
- Version: 1.1.6
2
+ Version: 1.3
3
3
  Authors@R: c(person("Luis M.","Rodriguez-R",role=c("aut","cre"),
4
4
  email="lmrodriguezr@gmail.com"))
5
5
  Title: Various Utilities for Microbial Genomics and Metagenomics
@@ -10,7 +10,8 @@ importFrom("graphics", "abline", "axis", "barplot", "hist", "image",
10
10
  "layout", "legend", "lines", "par", "points", "polygon",
11
11
  "rect", "text")
12
12
  importFrom("stats", "as.dist", "cophenetic", "cor", "median",
13
- "quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm")
13
+ "quantile", "runif", "smooth.spline", "nls", "nls.control", "qnorm",
14
+ "dnorm", "kmeans")
14
15
  importFrom("utils", "head", "read.table", "setTxtProgressBar", "tail",
15
16
  "txtProgressBar")
16
17
  exportClasses(enve.RecPlot2, enve.RecPlot2.Peak, enve.TRIBS, enve.TRIBStest,
@@ -29,7 +30,10 @@ export(
29
30
  plot.enve.TRIBStest, summary.enve.TRIBStest,
30
31
  enve.df2dist, enve.df2dist.group, enve.df2dist.list,
31
32
  enve.recplot2, plot.enve.RecPlot2, enve.recplot2.findPeaks,
33
+ enve.recplot2.findPeaks.emauto, enve.recplot2.findPeaks.em,
34
+ enve.recplot2.findPeaks.mower,
32
35
  enve.recplot2.corePeak, enve.recplot2.changeCutoff,
33
- enve.recplot2.extractWindows,
36
+ enve.recplot2.extractWindows, enve.recplot2.compareIdentities,
37
+ enve.recplot2.coordinates, enve.recplot2.seqdepth, enve.recplot2.ANIr,
34
38
  enve.growthcurve, plot.enve.GrowthCurve, summary.enve.GrowthCurve,
35
39
  enve.col2alpha)
@@ -9,7 +9,7 @@ setClass("enve.RecPlot2",
9
9
  id.counts='numeric', ##<< Counts per ID bin.
10
10
  id.breaks='numeric', ##<< Breaks of identity bins.
11
11
  pos.breaks='numeric', ##<< Breaks of position bins.
12
- seq.breaks='numeric',
12
+ seq.breaks='numeric', ##<< Breaks of input sequences.
13
13
  peaks='list', ##<< Peaks identified in the recplot.
14
14
  ### Limits of the subject sequences after concatenation.
15
15
  seq.names='character', ##<< Names of the subject sequences.
@@ -45,9 +45,13 @@ setClass("enve.RecPlot2.Peak",
45
45
  ### number of position bins with non-zero sequencing depth in the recruitment
46
46
  ### plot (regardless of peak count).
47
47
  err.res='numeric',
48
- ### Error left after adding the peak.
49
- merge.logdist='numeric'
48
+ ### Error left after adding the peak (mower) or log-likelihood (em or emauto).
49
+ merge.logdist='numeric',
50
50
  ### Attempted `merge.logdist` parameter.
51
+ seq.depth='numeric',
52
+ ### Best estimate available for the sequencing depth of the peak (centrality).
53
+ log='logical'
54
+ ### Indicates if the estimation was performed in natural logarithm space
51
55
  ));
52
56
  setMethod("$", "enve.RecPlot2", function(x, name) attr(x, name))
53
57
  setMethod("$", "enve.RecPlot2.Peak", function(x, name) attr(x, name))
@@ -83,6 +87,9 @@ plot.enve.RecPlot2 <- function
83
87
  peaks.col='darkred',
84
88
  ### If not NA, it attempts to represent peaks in the population histogram
85
89
  ### in the specified color. Set to NA to avoid peak-finding.
90
+ use.peaks,
91
+ ### A list of `enve.RecPlot2.Peak` objects, as returned by
92
+ ### `enve.recplot2.findPeaks`. If passed, `peaks.opts` is ignored.
86
93
  id.lim=range(x$id.breaks),
87
94
  ### Limits of identities to represent.
88
95
  pos.lim=range(x$pos.breaks),
@@ -98,15 +105,17 @@ plot.enve.RecPlot2 <- function
98
105
  ### the number of the panel as index (see `layout`).
99
106
  pos.splines=0,
100
107
  ### Smoothing parameter for the splines in the position histogram. Zero
101
- ### (0) for no splines. If non-zero, requires the stats package.
108
+ ### (0) for no splines. Use NULL to automatically detect by leave-one-out
109
+ ### cross-validation.
102
110
  id.splines=1/2,
103
111
  ### Smoothing parameter for the splines in the identity histogram. Zero
104
- ### (0) for no splines. If non-zero, requires the stats package.
105
- in.lwd=ifelse(pos.splines>0, 1/2, 2),
112
+ ### (0) for no splines. Use NULL to automatically detect by leave-one-out
113
+ ### cross-validation.
114
+ in.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
106
115
  ### Line width for the sequencing depth of in-group matches.
107
- out.lwd=ifelse(pos.splines>0, 1/2, 2),
116
+ out.lwd=ifelse(is.null(pos.splines) || pos.splines>0, 1/2, 2),
108
117
  ### Line width for the sequencing depth of out-group matches.
109
- id.lwd=ifelse(id.splines>0, 1/2, 2),
118
+ id.lwd=ifelse(is.null(id.splines) || id.splines>0, 1/2, 2),
110
119
  ### Line width for the identity histogram.
111
120
  in.col='darkblue',
112
121
  ### Color associated to in-group matches.
@@ -193,7 +202,7 @@ plot.enve.RecPlot2 <- function
193
202
  pos.f <- rep(seqdepth.in,each=2)
194
203
  lines(pos.x, rep(seqdepth.out,each=2), lwd=out.lwd, col=out.col);
195
204
  lines(pos.x, pos.f, lwd=in.lwd, col=in.col);
196
- if(pos.splines > 0){
205
+ if(is.null(pos.splines) || pos.splines > 0){
197
206
  pos.spline <- smooth.spline(pos.x[pos.f>0], log(pos.f[pos.f>0]),
198
207
  spar=pos.splines)
199
208
  lines(pos.spline$x, exp(pos.spline$y), lwd=2, col=in.col)
@@ -230,7 +239,7 @@ plot.enve.RecPlot2 <- function
230
239
  id.f <- rep(id.counts,each=2)
231
240
  id.x <- rep(id.breaks,each=2)[-c(1,2*length(id.breaks))]
232
241
  lines(id.f, id.x, lwd=id.lwd, col=id.col);
233
- if(id.splines > 0){
242
+ if(is.null(id.splines) || id.splines > 0){
234
243
  id.spline <- smooth.spline(id.x[id.f>0], log(id.f[id.f>0]),
235
244
  spar=id.splines)
236
245
  lines(exp(id.spline$y), id.spline$x, lwd=2, col=id.col)
@@ -267,8 +276,12 @@ plot.enve.RecPlot2 <- function
267
276
  polygon(c(0,rep(h.in$counts,each=2),0,0,rep(sum(pos.counts.in==0),2),0),
268
277
  y.tmp.in, border=NA, col=in.col)
269
278
  if(!is.na(peaks.col)){
270
- o <- peaks.opts; o$x = x;
271
- peaks <- do.call(enve.recplot2.findPeaks, o);
279
+ o <- peaks.opts; o$x = x;
280
+ if(missing(use.peaks)){
281
+ peaks <- do.call(enve.recplot2.findPeaks, o)
282
+ }else{
283
+ peaks <- use.peaks
284
+ }
272
285
  h.mids <- (10^h.breaks[-1] + 10^h.breaks[-length(h.breaks)])/2
273
286
  if(!is.null(peaks) & length(peaks)>0){
274
287
  pf <- h.mids*0;
@@ -276,18 +289,23 @@ plot.enve.RecPlot2 <- function
276
289
  cnt <- enve.recplot2.__peakHist(peaks[[i]], h.mids)
277
290
  lines(cnt, h.mids, col='red');
278
291
  pf <- pf+cnt;
279
- axis(4, at=peaks[[i]]$param.hat[[length(peaks[[i]]$param.hat)]],
280
- letters[i], las=1, hadj=1/2)
292
+ axis(4, at=peaks[[i]]$seq.depth, letters[i], las=1, hadj=1/2)
281
293
  }
282
294
  lines(pf, h.mids, col='red',lwd=1.5);
283
- legend('bottomright', legend=paste(
284
- letters[1:length(peaks)],'. ',
285
- signif(as.numeric(lapply(peaks,
286
- function(x) tail(as.numeric(x$param.hat),n=1))),3),'X (',
287
- signif(100*as.numeric(lapply(peaks,
288
- function(x) (length(x$values)/x$n.total))), 3), '%, err: ',
289
- signif(as.numeric(lapply(peaks, function(x) x$err.res)), 3), ')',
290
- sep=''), bty='n');
295
+ dpt <- signif(as.numeric(lapply(peaks, function(x) x$seq.depth)),2)
296
+ frx <- signif(100*as.numeric(
297
+ lapply(peaks,
298
+ function(x) ifelse(length(x$values)==0, x$n.hat,
299
+ length(x$values))/x$n.total)), 2)
300
+ if(peaks[[1]]$err.res < 0){
301
+ err <- paste(', LL:', signif(peaks[[1]]$err.res, 3))
302
+ }else{
303
+ err <- paste(', err:',
304
+ signif(as.numeric(lapply(peaks, function(x) x$err.res)), 2))
305
+ }
306
+ legend('topright', bty='n', cex=1/2,
307
+ legend=paste(letters[1:length(peaks)],'. ',
308
+ dpt,'X (', frx, '%', err, ')', sep=''))
291
309
  }
292
310
  }
293
311
  }
@@ -334,6 +352,10 @@ enve.recplot2 <- function(
334
352
  id.breaks=300,
335
353
  ### Breaks in the identity histogram. It can also be a vector of break
336
354
  ### points, and values outside the range are ignored.
355
+ id.free.range=FALSE,
356
+ ### Indicates that the range should be freely set from the observed
357
+ ### values. Otherwise, 70-100% is included in the identity histogram
358
+ ### (default).
337
359
  id.metric=c('identity', 'corrected identity', 'bit score'),
338
360
  ### Metric of identity to be used (Y-axis). Corrected identity is only
339
361
  ### supported if the original BLAST file included sequence lengths.
@@ -373,12 +395,14 @@ enve.recplot2 <- function(
373
395
  if(pos.breaks>0){
374
396
  pos.breaks <- seq(min(lim[,2]), max(lim[,3]), length.out=pos.breaks+1);
375
397
  }else{
376
- pos.breaks <- c(lim[,2], tail(lim[,3], n=1))
398
+ pos.breaks <- c(lim[1,2], lim[,3])
377
399
  }
378
400
  }
379
401
  if(length(id.breaks)==1){
380
- id.breaks <- seq(min(rec[,rec.idcol]), max(rec[,rec.idcol]),
381
- length.out=id.breaks+1);
402
+ id.range.v <- rec[,rec.idcol]
403
+ if(!id.free.range) id.range.v <- c(id.range.v,70,100)
404
+ id.range.v <- range(id.range.v)
405
+ id.breaks <- seq(id.range.v[1], id.range.v[2], length.out=id.breaks+1);
382
406
  }
383
407
 
384
408
  # Run in parallel
@@ -423,8 +447,159 @@ enve.recplot2 <- function(
423
447
  }
424
448
 
425
449
  enve.recplot2.findPeaks <- function(
450
+ ### Identifies peaks in the population histogram potentially indicating
451
+ ### sub-population mixtures
452
+ x,
453
+ ### An `enve.RecPlot2` object.
454
+ method="emauto",
455
+ ### Peak-finder method. This should be one of:
456
+ ### "emauto" (Expectation-Maximization with auto-selection of components),
457
+ ### "em" (Expectation-Maximization),
458
+ ### "mower" (Custom distribution-mowing method).
459
+ ...
460
+ ### Any additional parameters supported by
461
+ ### `enve.recplot2.findPeaks.<method>`.
462
+ ){
463
+ if(method == "emauto"){
464
+ peaks <- enve.recplot2.findPeaks.emauto(x, ...)
465
+ }else if(method == "em"){
466
+ peaks <- enve.recplot2.findPeaks.em(x, ...)
467
+ }else if(method == "mower"){
468
+ peaks <- enve.recplot2.findPeaks.mower(x, ...)
469
+ }else{
470
+ stop("Invalid peak-finder method ", method)
471
+ }
472
+ return(peaks)
473
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
474
+ }
475
+
476
+ enve.recplot2.findPeaks.emauto <- function(
477
+ ### Identifies peaks in the population histogram using a Gaussian Mixture
478
+ ### Model Expectation Maximization (GMM-EM) method with number of components
479
+ ### automatically detected.
480
+ x,
481
+ ### An `enve.RecPlot2` object.
482
+ components=seq(1,10),
483
+ ### A vector of number of components to evaluate.
484
+ criterion='aic',
485
+ ### Criterion to use for components selection. Must be one of:
486
+ ### 'aic' (Akaike Information Criterion),
487
+ ### 'bic' or 'sbc' (Bayesian Information Criterion or Schwarz Criterion).
488
+ merge.tol=2L,
489
+ ### When attempting to merge peaks with very similar sequencing depth, use
490
+ ### this number of significant digits (in log-scale).
491
+ verbose=FALSE,
492
+ ### Display (mostly debugging) information.
493
+ ...
494
+ ### Any additional parameters supported by `enve.recplot2.findPeaks.em`.
495
+ ){
496
+ best <- list(crit=0, pstore=list())
497
+ if(criterion == 'aic'){
498
+ do_crit <- function(ll, k, n) 2*k - 2*ll
499
+ }else if(criterion %in% c('bic', 'sbc')){
500
+ do_crit <- function(ll, k, n) log(n)*k - 2*ll
501
+ }else{
502
+ stop('Invalid criterion ', criterion)
503
+ }
504
+ for(comp in components){
505
+ best <- enve.recplot2.findPeaks.__emauto_one(x, comp, do_crit, best,
506
+ verbose, ...)
507
+ }
508
+
509
+ seqdepths.r <- signif(log(sapply(best[['peaks']],
510
+ function(x) x$seq.depth)), merge.tol)
511
+ distinct <- length(unique(seqdepths.r))
512
+ if(distinct < length(best[['peaks']])){
513
+ if(verbose) cat('Attempting merge to', distinct, 'components\n')
514
+ init <- apply(sapply(best[['peaks']],
515
+ function(x) c(x$param.hat, alpha=x$n.hat/x$n.total)), 1, as.numeric)
516
+ init <- init[!duplicated(seqdepths.r),]
517
+ init <- list(mu=init[,'mean'], sd=init[,'sd'],
518
+ alpha=init[,'alpha']/sum(init[,'alpha']))
519
+ best <- enve.recplot2.findPeaks.__emauto_one(x, distinct, do_crit, best,
520
+ verbose, ...)
521
+ }
522
+ return(best[['peaks']])
523
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
524
+ }
525
+
526
+ enve.recplot2.findPeaks.em <- function(
527
+ ### Identifies peaks in the population histogram using a Gaussian Mixture
528
+ ### Model Expectation Maximization (GMM-EM) method.
529
+ x,
530
+ ### An `enve.RecPlot2` object.
531
+ max.iter=1000,
532
+ ### Maximum number of EM iterations.
533
+ ll.diff.res=1e-8,
534
+ ### Maximum Log-Likelihood difference to be considered as convergent.
535
+ components=2,
536
+ ### Number of distributions assumed in the mixture.
537
+ rm.top=0.05,
538
+ ### Top-values to remove before finding peaks, as a quantile probability.
539
+ ### This step is useful to remove highly conserved regions, but can be
540
+ ### turned off by setting rm.top=0. The quantile is determined *after*
541
+ ### removing zero-coverage windows.
542
+ verbose=FALSE,
543
+ ### Display (mostly debugging) information.
544
+ init,
545
+ ### Initialization parameters. By default, these are derived from k-means
546
+ ### clustering. A named list with vectors for 'mu', 'sd', and 'alpha', each
547
+ ### of length `components`.
548
+ log=TRUE
549
+ ### Logical value indicating if the estimations should be performed in
550
+ ### natural logarithm units. Do not change unless you know what you're
551
+ ### doing.
552
+ ){
553
+
554
+ # Essential vars
555
+ pos.binsize <- x$pos.breaks[-1] - x$pos.breaks[-length(x$pos.breaks)]
556
+ lsd1 <- (x$pos.counts.in/pos.binsize)[ x$pos.counts.in > 0 ]
557
+ lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
558
+ if(log) lsd1 <- log(lsd1)
559
+
560
+ # 1. Initialize
561
+ if(missing(init)){
562
+ km.clust <- kmeans(lsd1, components)$cluster
563
+ init <- list(
564
+ mu = tapply(lsd1, km.clust, mean),
565
+ sd = tapply(lsd1, km.clust, sd),
566
+ alpha = table(km.clust)/length(km.clust)
567
+ )
568
+ }
569
+ m.step <- init
570
+ ll <- c()
571
+ cur.ll <- -Inf
572
+
573
+ for(i in 1:max.iter){
574
+ # 2/3. EM
575
+ e.step <- enve.recplot2.findPeaks.__em_e(lsd1, m.step)
576
+ m.step <- enve.recplot2.findPeaks.__em_m(lsd1, e.step[['posterior']])
577
+ # 4. Convergence
578
+ ll <- c(ll, e.step[["ll"]])
579
+ ll.diff <- abs(cur.ll - e.step[["ll"]])
580
+ cur.ll <- e.step[["ll"]]
581
+ if(verbose) cat(i, '\t| LL =', cur.ll, '\t| LL.diff =', ll.diff, '\n')
582
+ if(ll.diff <= ll.diff.res) break
583
+ }
584
+
585
+ # Return
586
+ peaks <- list()
587
+ for(i in 1:components){
588
+ n.hat <- m.step[['alpha']][i]*length(lsd1)
589
+ peaks[[i]] <- new('enve.RecPlot2.Peak', dist='norm', values=as.numeric(),
590
+ values.res=0, mode=m.step[['mu']][i],
591
+ param.hat=list(sd=m.step[['sd']][i], mean=m.step[['mu']][i]),
592
+ n.hat=n.hat, n.total=length(lsd1), err.res=cur.ll,
593
+ merge.logdist=as.numeric(), log=log,
594
+ seq.depth=ifelse(log, exp(m.step[['mu']][i]), m.step[['mu']][i]))
595
+ }
596
+ return(peaks)
597
+ ### Returns a list of `enve.RecPlot2.Peak` objects.
598
+ }
599
+
600
+ enve.recplot2.findPeaks.mower <- function(
426
601
  ### Identifies peaks in the population histogram potentially indicating
427
- ### sub-population mixtures.
602
+ ### sub-population mixtures, using a custom distribution-mowing method.
428
603
  x,
429
604
  ### An `enve.RecPlot2` object.
430
605
  min.points=10,
@@ -436,13 +611,12 @@ enve.recplot2.findPeaks <- function(
436
611
  mlv.opts=list(method='parzen'),
437
612
  ### Options passed to `mlv` to estimate the mode.
438
613
  fitdist.opts.sn=list(distr='sn', method='qme', probs=c(0.1,0.5,0.8),
439
- start=list(omega=1, alpha=-1), lower=c(1e-6, -Inf, 0),
440
- upper=c(Inf, 0, Inf)),
614
+ start=list(omega=1, alpha=-1), lower=c(0, -Inf, -Inf)),
441
615
  ### Options passed to `fitdist` to estimate the standard deviation if
442
616
  ### with.skewness=TRUE. Note that the `start` parameter will be ammended
443
617
  ### with xi=estimated mode for each peak.
444
- fitdist.opts.norm=list(distr='norm', method='qme', probs=c(.4,.6),
445
- start=list(sd=1), lower=c(1e-8, 0)),
618
+ fitdist.opts.norm=list(distr='norm', method='qme', probs=c(0.4,0.6),
619
+ start=list(sd=1), lower=c(0, -Inf)),
446
620
  ### Options passed to `fitdist` to estimate the standard deviation if
447
621
  ### with.skewness=FALSE. Note that the `start` parameter will be ammended
448
622
  ### with mean=estimated mode for each peak.
@@ -460,7 +634,7 @@ enve.recplot2.findPeaks <- function(
460
634
  ### "tail distribution".
461
635
  optim.rounds=200,
462
636
  ### Maximum rounds of peak optimization.
463
- optim.epsilon=1e-8,
637
+ optim.epsilon=1e-4,
464
638
  ### Trace change at which optimization stops (unless `optim.rounds` is
465
639
  ### reached first). The trace change is estimated as the sum of square
466
640
  ### differences between parameters in one round and those from two rounds
@@ -469,8 +643,12 @@ enve.recplot2.findPeaks <- function(
469
643
  ### Maximum value of |log-ratio| between centrality parameters in peaks to
470
644
  ### attempt merging. The default of ~0.22 corresponds to a maximum
471
645
  ### difference of 25%.
472
- verbose=FALSE
646
+ verbose=FALSE,
473
647
  ### Display (mostly debugging) information.
648
+ log=TRUE
649
+ ### Logical value indicating if the estimations should be performed in
650
+ ### natural logarithm units. Do not change unless you know what you're
651
+ ### doing.
474
652
  ){
475
653
 
476
654
  # Essential vars
@@ -478,6 +656,7 @@ enve.recplot2.findPeaks <- function(
478
656
  seqdepth.in <- x$pos.counts.in/pos.binsize;
479
657
  lsd1 <- seqdepth.in[seqdepth.in>0];
480
658
  lsd1 <- lsd1[ lsd1 < quantile(lsd1, 1-rm.top, names=FALSE) ]
659
+ if(log) lsd1 <- log(lsd1)
481
660
  if(with.skewness){
482
661
  fitdist.opts <- fitdist.opts.sn
483
662
  }else{
@@ -486,11 +665,11 @@ enve.recplot2.findPeaks <- function(
486
665
  peaks.opts <- list(lsd1=lsd1, min.points=min.points, quant.est=quant.est,
487
666
  mlv.opts=mlv.opts, fitdist.opts=fitdist.opts, with.skewness=with.skewness,
488
667
  optim.rounds=optim.rounds, optim.epsilon=optim.epsilon, verbose=verbose,
489
- n.total=length(lsd1), merge.logdist=merge.logdist)
668
+ n.total=length(lsd1), merge.logdist=merge.logdist, log=log)
490
669
 
491
670
  # Find seed peaks
492
671
  if(verbose) cat('Mowing peaks for n =',length(lsd1),'\n')
493
- peaks <- enve.recplot2.__findPeaks(peaks.opts);
672
+ peaks <- enve.recplot2.findPeaks.__mower(peaks.opts);
494
673
 
495
674
  # Merge overlapping peaks
496
675
  if(verbose) cat('Trying to merge',length(peaks),'peaks\n')
@@ -511,7 +690,7 @@ enve.recplot2.findPeaks <- function(
511
690
  p$param.hat[[ length(p$param.hat) ]],'&',
512
691
  p2$param.hat[[ length(p2$param.hat) ]],'X\n');
513
692
  peaks.opts$lsd1 <- c(p$values, p2$values)
514
- p.new <- enve.recplot2.__findPeaks(peaks.opts)
693
+ p.new <- enve.recplot2.findPeaks.__mower(peaks.opts)
515
694
  if(length(p.new)==1){
516
695
  peaks2[[ length(peaks2)+1 ]] <- p.new[[ 1 ]]
517
696
  ignore <- c(ignore, j)
@@ -542,16 +721,19 @@ enve.recplot2.corePeak <- function
542
721
  function(y) y$param.hat[[ length(y$param.hat) ]])))
543
722
  ]]
544
723
  # If a "larger" peak (a peak explaining more bins of the genome) is within
545
- # the "merge.logdist" distance, take that one instead.
724
+ # the default "merge.logdist" distance, take that one instead.
546
725
  corePeak <- maxPeak
547
726
  for(p in x){
548
- sz.d = log(length(p$values)/length(corePeak$values))
549
- if(sz.d < 0)
550
- next;
551
- sq.d.a <- p$param.hat[[ length(p$param.hat) ]]
552
- sq.d.b <- maxPeak$param.hat[[ length(maxPeak$param.hat) ]]
553
- if(abs(log(sq.d.a/sq.d.b )) < maxPeak$merge.logdist+sz.d/5)
554
- corePeak <- p
727
+ p.len <- ifelse(length(p$values)==0, p$n.hat, length(p$values))
728
+ corePeak.len <- ifelse(
729
+ length(corePeak$values)==0, corePeak$n.hat, length(corePeak$values))
730
+ sz.d <- log(p.len/corePeak.len)
731
+ if(is.nan(sz.d) || sz.d < 0) next
732
+ sq.d.a <- as.numeric(tail(p$param.hat, n=1))
733
+ sq.d.b <- as.numeric(tail(maxPeak$param.hat, n=1))
734
+ if(p$log) sq.d.a <- exp(sq.d.a)
735
+ if(corePeak$log) sq.d.b <- exp(sq.d.b)
736
+ if(abs(log(sq.d.a/sq.d.b)) < log(1.75)+sz.d/5) corePeak <- p
555
737
  }
556
738
  return(corePeak)
557
739
  }
@@ -580,43 +762,204 @@ enve.recplot2.extractWindows <- function
580
762
  ### Extract windows significantly below (or above) the peak in sequencing
581
763
  ### depth.
582
764
  (rp,
583
- ### Recruitment plot, a enve.Recplot2 object.
765
+ ### Recruitment plot, a enve.RecPlot2 object.
584
766
  peak,
585
- ### Peak, a enve.RecPlot2.Peak object. If list, it is assumed to be a list
586
- ### of enve.RecPlot2.Peak objects, in which case the core peak is used
587
- ### (see enve.recplot2.corePeak).
767
+ ### Peak, an `enve.RecPlot2.Peak` object. If list, it is assumed to be a
768
+ ### list of enve.RecPlot2.Peak objects, in which case the core peak is
769
+ ### used (see `enve.recplot2.corePeak`).
588
770
  lower.tail=TRUE,
589
771
  ### If FALSE, it returns windows significantly above the peak in
590
772
  ### sequencing depth.
591
773
  significance=0.05,
592
774
  ### Significance threshold (alpha) to select windows.
593
775
  seq.names=FALSE
594
- ### Returns subject sequence names instead of a vector of Booleans. It
595
- ### assumes that the recruitment plot was generated with pos.breaks=0.
776
+ ### Returns subject sequence names instead of a vector of Booleans. If
777
+ ### the recruitment plot was generated with pos.breaks=0 it returns a
778
+ ### vector of characters (the sequence identifiers), otherwise it returns
779
+ ### a data.frame with a name column and two columns of coordinates.
596
780
  ){
597
781
  # Determine the threshold
598
782
  if(is.list(peak)) peak <- enve.recplot2.corePeak(peak)
599
783
  par <- peak$param.hat
600
784
  par[["p"]] <- ifelse(lower.tail, significance, 1-significance)
601
785
  thr <- do.call(ifelse(length(par)==4, qsn, qnorm), par)
786
+ if(peak$log) thr <- exp(thr)
602
787
 
603
- # Estimate sequencing depths per window
604
- pos.cnts.in <- rp$pos.counts.in
605
- pos.breaks <- rp$pos.breaks
606
- pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
607
- seqdepth.in <- pos.cnts.in/pos.binsize
608
-
609
788
  # Select windows past the threshold
789
+ seqdepth.in <- enve.recplot2.seqdepth(rp)
610
790
  if(lower.tail){
611
791
  sel <- seqdepth.in < thr
612
792
  }else{
613
793
  sel <- seqdepth.in > thr
614
794
  }
795
+
796
+ # seq.names=FALSE
615
797
  if(!seq.names) return(sel)
616
- if(length(seqdepth.in) != length(rp$seq.names))
617
- stop(paste("Requesting subject sequence names, but the recruitment plot",
618
- "was not generated with pos.breaks=0."))
619
- return(rp$seq.names[sel])
798
+ # seq.names=TRUE and pos.breaks=0
799
+ if(length(rp$pos.breaks)==length(rp$seq.breaks) &&
800
+ rp$pos.breaks==rp$seq.breaks)
801
+ return(rp$seq.names[sel])
802
+ # seq.names=TRUE and pos.breaks!=0
803
+ return(enve.recplot2.coordinates(rp,sel))
804
+ ### Returns a vector of logicals if `seq.names=FALSE`. If `seq.names=TRUE`,
805
+ ### it returns a vector of characters if the object was built with
806
+ ### `pos.breaks=0` or a data.frame with four columns otherwise: name.from,
807
+ ### name.to, pos.from, and pos.to (see `enve.recplot2.coordinates`).
808
+ }
809
+
810
+ enve.recplot2.compareIdentities <- function
811
+ ### Compare the distribution of identities between two enve.RecPlot2 objects.
812
+ (x,
813
+ ### First enve.RecPlot2 object.
814
+ y,
815
+ ### Second enve.RecPlot2 object.
816
+ method="hellinger",
817
+ ### Distance method to use. This should be (an unambiguous abbreviation of)
818
+ ### one of:
819
+ ### "hellinger" (Hellinger, 1090, doi:10.1515/crll.1909.136.210),
820
+ ### "bhattacharyya" (Bhattacharyya, 1943, Bull. Calcutta Math. Soc. 35),
821
+ ### "kl" or "kullback-leibler" (Kullback & Leibler, 1951,
822
+ ### doi:10.1214/aoms/1177729694), or "euclidean".
823
+ smooth.par=NULL,
824
+ ### Smoothing parameter for cubic spline smoothing. Use 0 for no smoothing.
825
+ ### Use NULL to automatically determine this value using leave-one-out
826
+ ### cross-validation (see `smooth.spline` parameter `spar`).
827
+ pseudocounts=0,
828
+ ### Smoothing parameter for Laplace smoothing. Use 0 for no smoothing, or
829
+ ### 1 for add-one smoothing.
830
+ max.deviation=0.75
831
+ ### Maximum mean deviation between identity breaks tolerated (as percent
832
+ ### identity). Difference in number of id.breaks is never tolerated.
833
+ ){
834
+ METHODS <- c("hellinger","bhattacharyya","kullback-leibler","kl","euclidean")
835
+ i.meth <- pmatch(method, METHODS)
836
+ if (is.na(i.meth)) stop("Invalid distance ", method)
837
+ if(!inherits(x, "enve.RecPlot2"))
838
+ stop("'x' must inherit from class `enve.RecPlot2`")
839
+ if(!inherits(y, "enve.RecPlot2"))
840
+ stop("'y' must inherit from class `enve.RecPlot2`")
841
+ if(length(x$id.breaks) != length(y$id.breaks))
842
+ stop("'x' and 'y' must have the same number of `id.breaks`")
843
+ dev <- mean(abs(x$id.breaks - y$id.breaks))
844
+ if(dev > max.deviation)
845
+ stop("'x' and 'y' must have similar `id.breaks`; exceeding max.deviation: ",
846
+ dev)
847
+ x.cnt <- x$id.counts
848
+ y.cnt <- y$id.counts
849
+ if(is.null(smooth.par) || smooth.par > 0){
850
+ x.mids <- (x$id.breaks[-1] + x$id.breaks[-length(x$id.breaks)])/2
851
+ y.mids <- (y$id.breaks[-1] + y$id.breaks[-length(y$id.breaks)])/2
852
+ p.spline <- smooth.spline(x.mids, x.cnt, spar=smooth.par)
853
+ q.spline <- smooth.spline(y.mids, y.cnt, spar=smooth.par)
854
+ x.cnt <- pmax(p.spline$y, 0)
855
+ y.cnt <- pmax(q.spline$y, 0)
856
+ }
857
+ a <- as.numeric(pseudocounts)
858
+ p <- (x.cnt + a) / sum(x.cnt + a)
859
+ q <- (y.cnt + a) / sum(y.cnt + a)
860
+ d <- NA
861
+ if(i.meth %in% c(1L, 2L)){
862
+ d <- sqrt(sum((sqrt(p) - sqrt(q))**2))/sqrt(2)
863
+ if(i.meth==2L) d <- 1 - d**2
864
+ }else if(i.meth %in% c(3L, 4L)){
865
+ sel <- p>0
866
+ if(any(q[sel]==0))
867
+ stop("Undefined distance without absolute continuity, use pseudocounts")
868
+ d <- -sum(p[sel]*log(q[sel]/p[sel]))
869
+ }else if(i.meth == 5L){
870
+ d <- sqrt(sum((q-p)**2))
871
+ }
872
+ return(d)
873
+ }
874
+
875
+ enve.recplot2.coordinates <- function
876
+ ### Returns the sequence name and coordinates of the requested position bins.
877
+ (x,
878
+ ### `enve.RecPlot2` object.
879
+ bins
880
+ ### Vector of selected bins to return. It can be a vector of logical values
881
+ ### with the same length as `x$pos.breaks`-1 or a vector of integers. If
882
+ ### missing, returns the coordinates of all windows.
883
+ ){
884
+ if(!inherits(x, "enve.RecPlot2"))
885
+ stop("'x' must inherit from class `enve.RecPlot2`")
886
+ if(missing(bins)) bins <- rep(TRUE, length(x$pos.breaks)-1)
887
+ if(!is.vector(bins)) stop("'bins' must be a vector")
888
+ if(inherits(bins, "logical")) bins <- which(bins)
889
+
890
+ y <- data.frame(stringsAsFactors=FALSE, row.names=bins)
891
+
892
+ for(i in 1:length(bins)){
893
+ j <- bins[i]
894
+ # Concatenated coordinates
895
+ cc <- x$pos.breaks[c(j, j+1)]
896
+ # Find the corresponding `seq.breaks`
897
+ sb.from <- which(
898
+ cc[1] >=x$seq.breaks[-length(x$seq.breaks)] &
899
+ cc[1] < x$seq.breaks[-1])
900
+ sb.to <- which(
901
+ cc[2] > x$seq.breaks[-length(x$seq.breaks)] &
902
+ cc[2] <=x$seq.breaks[-1])
903
+ # Translate coordinates
904
+ if(length(sb.from)==1 & length(sb.to)==1){
905
+ y[i, 'name.from'] <- x$seq.names[sb.from]
906
+ y[i, 'pos.from'] <- floor(x$seq.breaks[sb.from] + cc[1] - 1)
907
+ y[i, 'name.to'] <- x$seq.names[sb.to]
908
+ y[i, 'pos.to'] <- ceiling(x$seq.breaks[sb.to] + cc[2] - 1)
909
+ }
910
+ }
911
+
912
+ return(y)
913
+ ### Returns a data.frame with four columns: name.from (character), pos.from
914
+ ### (numeric) name.to (character), and pos.to (numeric). The first two
915
+ ### correspond to sequence and position of the start point of the bin, the
916
+ ### last two correspond to the sequence and position of the end point of the
917
+ ### bin.
918
+ }
919
+
920
+ enve.recplot2.seqdepth <- function
921
+ ### Calculate the sequencing depth of the given window(s)
922
+ (x,
923
+ ### `enve.RecPlot2` object.
924
+ sel,
925
+ ### Window(s) for which the sequencing depth is to be calculated. If not
926
+ ### passed, it returns the sequencing depth of all windows
927
+ low.identity=FALSE
928
+ ### A logical indicating if the sequencing depth is to be estimated only
929
+ ### with low-identity matches. By default, only high-identity matches are
930
+ ### used.
931
+ ){
932
+ if(!inherits(x, "enve.RecPlot2"))
933
+ stop("'x' must inherit from class `enve.RecPlot2`")
934
+ pos.cnts.in <- x$pos.counts.in
935
+ pos.breaks <- x$pos.breaks
936
+ pos.binsize <- (pos.breaks[-1] - pos.breaks[-length(pos.breaks)])
937
+ seqdepth.in <- pos.cnts.in/pos.binsize
938
+ if(missing(sel)) return(seqdepth.in)
939
+ return(seqdepth.in[sel])
940
+ ### Returns a numeric vector of sequencing depths (in bp/bp).
941
+ }
942
+
943
+ enve.recplot2.ANIr <- function
944
+ ### Estimate the Average Nucleotide Identity from reads (ANIr) from a
945
+ ### recruitment plot
946
+ (x,
947
+ ### `enve.RecPlot2` object.
948
+ range=c(0,Inf)
949
+ ### Range of identities to be considered. By default, the full range
950
+ ### is used (note that the upper boundary is `Inf` and not 100 because
951
+ ### recruitment plots can also be built with bit-scores). To use only
952
+ ### intra-population matches (with identities), use c(95,100). To use only
953
+ ### inter-population values, use c(0,95).
954
+ ){
955
+ if(!inherits(x, "enve.RecPlot2"))
956
+ stop("'x' must inherit from class `enve.RecPlot2`")
957
+ id.b <- x$id.breaks
958
+ id <- (id.b[-1]+id.b[-length(id.b)])/2
959
+ cnt <- x$id.counts
960
+ cnt[id < range[1]] <- 0
961
+ cnt[id > range[2]] <- 0
962
+ return(sum(id*cnt/sum(cnt)))
620
963
  }
621
964
 
622
965
  #==============> Define internal functions
@@ -640,21 +983,67 @@ enve.recplot2.__counts <- function
640
983
  return(counts);
641
984
  }
642
985
 
986
+ enve.recplot2.findPeaks.__emauto_one <- function
987
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.emauto).
988
+ (x, comp, do_crit, best, verbose, ...){
989
+ peaks <- enve.recplot2.findPeaks.em(x=x, components=comp, ...)
990
+ k <- comp*3 - 1 # mean & sd for each component, and n-1 free alpha parameters
991
+ crit <- do_crit(peaks[[1]]$err.res, k, peaks[[1]]$n.total)
992
+ if(verbose) cat(comp,'\t| LL =', peaks[[1]]$err.res, '\t| Estimate =', crit,
993
+ ifelse(crit > best[['crit']], '*', ''), '\n')
994
+ if(crit > best[['crit']]){
995
+ best[['crit']] <- crit
996
+ best[['peaks']] <- peaks
997
+ }
998
+ best[['pstore']][[comp]] <- peaks
999
+ return(best)
1000
+ }
1001
+ enve.recplot2.findPeaks.__em_e <- function
1002
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`).
1003
+ (x, theta){
1004
+ components <- length(theta[['mu']])
1005
+ product <- do.call(cbind,
1006
+ lapply(1:components,
1007
+ function(i) dnorm(x, theta[['mu']][i],
1008
+ theta[['sd']][i])*theta[['alpha']][i]))
1009
+ sum.of.components <- rowSums(product)
1010
+ posterior <- product / sum.of.components
1011
+
1012
+ return(list(ll=sum(log(sum.of.components)), posterior=posterior))
1013
+ }
1014
+
1015
+ enve.recplot2.findPeaks.__em_m <- function
1016
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.em`
1017
+ (x, posterior){
1018
+ components <- ncol(posterior)
1019
+ n <- colSums(posterior)
1020
+ mu <- colSums(posterior * x) / n
1021
+ sd <- sqrt( colSums(
1022
+ posterior * (matrix(rep(x,components), ncol=components) - mu)^2) / n )
1023
+ alpha <- n/length(x)
1024
+ return(list(mu=mu, sd=sd, alpha=alpha))
1025
+ }
1026
+
643
1027
  enve.recplot2.__peakHist <- function
644
1028
  ### Internal ancilliary function (see `enve.RecPlot2.Peak`).
645
1029
  (x, mids, counts=TRUE){
646
1030
  d.o <- x$param.hat
647
- d.o$x <- mids
1031
+ if(length(x$log)==0) x$log <- FALSE
1032
+ if(x$log){
1033
+ d.o$x <- log(mids)
1034
+ }else{
1035
+ d.o$x <- mids
1036
+ }
648
1037
  prob <- do.call(paste('d', x$dist, sep=''), d.o)
649
1038
  if(!counts) return(prob)
650
1039
  if(length(x$values)>0) return(prob*length(x$values)/sum(prob))
651
1040
  return(prob*x$n.hat/sum(prob))
652
1041
  }
653
1042
 
654
- enve.recplot2.__findPeak <- function
655
- ### Internall ancilliary function (see `enve.recplot2.findPeaks`).
1043
+ enve.recplot2.findPeaks.__mow_one <- function
1044
+ ### Internall ancilliary function (see `enve.recplot2.findPeaks.mower`).
656
1045
  (lsd1, min.points, quant.est, mlv.opts, fitdist.opts, with.skewness,
657
- optim.rounds, optim.epsilon, n.total, merge.logdist, verbose
1046
+ optim.rounds, optim.epsilon, n.total, merge.logdist, verbose, log
658
1047
  ){
659
1048
  dist <- ifelse(with.skewness, 'sn', 'norm');
660
1049
 
@@ -683,8 +1072,14 @@ enve.recplot2.__findPeak <- function
683
1072
  if(round>1) param.hat <- last.hat;
684
1073
  break;
685
1074
  }
686
- epsilon <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
687
- if(round>2) if(epsilon < optim.epsilon) break;
1075
+ if(round > 1){
1076
+ epsilon1 <- sum((as.numeric(last.hat)-as.numeric(param.hat))^2)
1077
+ if(epsilon1 < optim.epsilon) break;
1078
+ if(round > 2){
1079
+ epsilon2 <- sum((as.numeric(last.last.hat)-as.numeric(param.hat))^2)
1080
+ if(epsilon2 < optim.epsilon) break;
1081
+ }
1082
+ }
688
1083
  }
689
1084
  if(verbose) cat('\n')
690
1085
  if(is.na(param.hat[1]) | is.na(lim[1])) return(NULL);
@@ -695,14 +1090,14 @@ enve.recplot2.__findPeak <- function
695
1090
  n.hat <- length(lsd1.pop)/diff(quant.est)
696
1091
  peak <- new('enve.RecPlot2.Peak', dist=dist, values=as.numeric(), mode=mode1,
697
1092
  param.hat=param.hat, n.hat=n.hat, n.total=n.total,
698
- merge.logdist=merge.logdist)
1093
+ merge.logdist=merge.logdist, log=log)
699
1094
  peak.breaks <- seq(min(lsd1), max(lsd1), length=20)
700
1095
  peak.cnt <- enve.recplot2.__peakHist(peak,
701
1096
  (peak.breaks[-length(peak.breaks)]+peak.breaks[-1])/2)
702
1097
  for(i in 2:length(peak.breaks)){
703
1098
  values <- lsd1[ (lsd1 >= peak.breaks[i-1]) & (lsd1 < peak.breaks[i]) ]
704
1099
  n.exp <- peak.cnt[i-1]
705
- if(n.exp==0) n.exp=0.1
1100
+ if(is.na(n.exp) | n.exp==0) n.exp <- 0.1
706
1101
  if(length(values)==0) next
707
1102
  in.peak <- runif(length(values)) <= n.exp/length(values)
708
1103
  lsd2 <- c(lsd2, values[!in.peak])
@@ -716,17 +1111,19 @@ enve.recplot2.__findPeak <- function
716
1111
  attr(peak, 'err.res') <- 1-(cor(hist(lsd.pop, breaks=peak.breaks,
717
1112
  plot=FALSE)$counts, hist(lsd1, breaks=peak.breaks,
718
1113
  plot=FALSE)$counts)+1)/2
1114
+ mu <- tail(param.hat, n=1)
1115
+ attr(peak, 'seq.depth') <- ifelse(log, exp(mu), mu)
719
1116
  if(verbose) cat(' Extracted peak with n =',length(lsd.pop),
720
1117
  'with expected n =',n.hat,'\n')
721
1118
  return(peak)
722
1119
  }
723
1120
 
724
- enve.recplot2.__findPeaks <- function
725
- ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
1121
+ enve.recplot2.findPeaks.__mower <- function
1122
+ ### Internal ancilliary function (see `enve.recplot2.findPeaks.mower`).
726
1123
  (peaks.opts){
727
1124
  peaks <- list()
728
1125
  while(length(peaks.opts$lsd1) > peaks.opts$min.points){
729
- peak <- do.call(enve.recplot2.__findPeak, peaks.opts)
1126
+ peak <- do.call(enve.recplot2.findPeaks.__mow_one, peaks.opts)
730
1127
  if(is.null(peak)) break
731
1128
  peaks[[ length(peaks)+1 ]] <- peak
732
1129
  peaks.opts$lsd1 <- peak$values.res
@@ -738,7 +1135,10 @@ enve.recplot2.__findPeaks <- function
738
1135
  enve.recplot2.__whichClosestPeak <- function
739
1136
  ### Internal ancilliary function (see `enve.recplot2.findPeaks`).
740
1137
  (peak, peaks){
741
- dist <- as.numeric(lapply(peaks, function(x) abs(log(x$param.hat[[ length(x$param.hat) ]]/peak$param.hat[[ length(peak$param.hat) ]] ))))
1138
+ dist <- as.numeric(lapply(peaks,
1139
+ function(x)
1140
+ abs(log(x$param.hat[[ length(x$param.hat) ]] /
1141
+ peak$param.hat[[ length(peak$param.hat) ]] ))))
742
1142
  dist[ dist==0 ] <- Inf
743
1143
  return(which.min(dist))
744
1144
  }