feldtruby 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/R/diffusion_kde.R ADDED
@@ -0,0 +1,116 @@
1
+ # Gaussian KDE downloaded from
2
+ # http://www-etud.iro.umontreal.ca/~botev/kde.R
3
+ # on February 9th 2013.
4
+ # Our changes:
5
+ # - Renamed function from kde to diffusion.kde
6
+ # - Return an object instead of a matrix
7
+ # - Added the sum of the densities
8
+ # - Added the mesh interval
9
+ # - Added the probabilities for each interval
10
+ # No license was specified on this piece of code. For questions about it please contact
11
+ # Prof. Botev on the email: botev@maths.uq.edu.au
12
+ diffusion.kde <- function(data,n,MIN,MAX){
13
+ # State-of-the-art gaussian kernel density estimator for one-dimensional data;
14
+ # The estimator does not use the commonly employed 'gaussian rule of thumb'.
15
+ # As a result it outperforms many plug-in methods on multimodal densities
16
+ # with widely separated modes (see example).
17
+ # INPUTS:
18
+ # data - a vector of data from which the density estimate is constructed;
19
+ # n - the number of mesh points used in the uniform discretization of the
20
+ # interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
21
+ # n is rounded up to the next power of two; the default value of n is n=2^14;
22
+ # MIN, MAX - defines the interval [MIN,MAX] on which the density estimate is constructed;
23
+ # the default values of MIN and MAX are:
24
+ # MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
25
+ # OUTPUT:
26
+ # a list with the elements probabilities, densities, mesh, sum_density, mesh_interval, min and max;
27
+ # 'densities' are the density values on the grid 'mesh', both of length 'n';
28
+ # EXAMPLE:
29
+ ##Save this file in your directory as diffusion_kde.R and copy and paste the commands:
30
+ # rm(list=ls())
31
+ # source(file='diffusion_kde.R')
32
+ # data=c(rnorm(10^3),rnorm(10^3)*2+30);
33
+ # d=diffusion.kde(data)
34
+ # plot(d$mesh,d$densities,type='l',xlab='x',ylab='density f(x)')
35
+
36
+ # REFERENCE:
37
+ # Z. I. Botev, J. F. Grotowski and D. P. Kroese
38
+ # "Kernel Density Estimation Via Diffusion"
39
+ # Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
40
+ # for questions email: botev@maths.uq.edu.au
41
+
42
+ nargin=length(as.list(match.call()))-1;
43
+ if (nargin<2) n=2^14
44
+ n=2^ceiling(log2(n)); # round up n to the next power of 2;
45
+ if (nargin<4)
46
+ {# define the default interval [MIN,MAX]
47
+ minimum=min(data); maximum=max(data);
48
+ Range=maximum-minimum;
49
+ MIN=minimum-Range/10; MAX=maximum+Range/10;
50
+ }
51
+ # set up the grid over which the density estimate is computed;
52
+ R=MAX-MIN; dx=R/n; xmesh=MIN+seq(0,R,dx); N=length(data);
53
+ # if data has repeated observations use the N below
54
+ # N=length(as.numeric(names(table(data))));
55
+ # bin the data uniformly using the grid defined above;
56
+ w=hist(data,xmesh,plot=FALSE);initial_data=(w$counts)/N;
57
+ initial_data=initial_data/sum(initial_data);
58
+
59
+ dct1d <- function(data){
60
+ # computes the discrete cosine transform of the column vector data
61
+ n= length(data);
62
+ # Compute weights to multiply DFT coefficients
63
+ weight = c(1,2*exp(-1i*(1:(n-1))*pi/(2*n)));
64
+ # Re-order the elements of the columns of x
65
+ data = c(data[seq(1,n-1,2)], data[seq(n,2,-2)]);
66
+ # Multiply FFT by weights:
67
+ data= Re(weight* fft(data));
68
+ data}
69
+
70
+ a=dct1d(initial_data); # discrete cosine transform of initial data
71
+ # now compute the optimal bandwidth^2 using the referenced method
72
+ I=(1:(n-1))^2; a2=(a[2:n]/2)^2;
73
+ # use fzero to solve the equation t=zeta*gamma^[5](t)
74
+
75
+ fixed_point <- function(t,N,I,a2){
76
+ # this implements the function t-zeta*gamma^[l](t)
77
+ l=7;
78
+ f=2*(pi^(2*l))*sum((I^l)*a2*exp(-I*(pi^2)*t));
79
+ for (s in (l-1):2){
80
+
81
+ K0=prod(seq(1,2*s-1,2))/sqrt(2*pi); const=(1+(1/2)^(s+1/2))/3;
82
+ time=(2*const*K0/N/f)^(2/(3+2*s));
83
+ f=2*pi^(2*s)*sum(I^s*a2*exp(-I*pi^2*time));
84
+ }
85
+ out=t-(2*N*sqrt(pi)*f)^(-2/5);
86
+ }
87
+
88
+ t_star=tryCatch(uniroot(fixed_point,c(0,.1),N=N,I=I,a2=a2,tol=10^(-14))$root,error=function(e) .28*N^(-2/5));
89
+ # smooth the discrete cosine transform of initial data using t_star
90
+ a_t=a*exp(-(0:(n-1))^2*pi^2*t_star/2);
91
+ # now apply the inverse discrete cosine transform
92
+
93
+ idct1d <- function(data){
94
+ # computes the inverse discrete cosine transform
95
+ n=length(data);
96
+ # Compute weights
97
+ weights = n*exp(1i*(0:(n-1))*pi/(2*n));
98
+ # Compute x tilde using equation (5.93) in Jain
99
+ data = Re(fft(weights*data,inverse=TRUE))/n;
100
+ # Re-order elements of each column according to equations (5.93) and
101
+ # (5.94) in Jain
102
+ out = rep(0,n);
103
+ out[seq(1,n,2)] = data[1:(n/2)];
104
+ out[seq(2,n,2)] = data[n:(n/2+1)];
105
+ out;
106
+ }
107
+
108
+ density=idct1d(a_t)/R;
109
+ # take the rescaling of the data into account
110
+ bandwidth=sqrt(t_star)*R;
111
+ xmesh=seq(MIN,MAX,R/(n-1));
112
+ # out=matrix(c(xmesh,density),nrow=2,byrow=TRUE);
113
+ posd = density + abs(min(0.0, min(density))); # Ensure least density is 0.0 before calcing probabilities
114
+ list(probabilities = (posd / sum(posd)), densities = density, mesh = xmesh,
115
+ sum_density = sum(density), mesh_interval = (R/(n-1)), min = MIN, max = MAX)
116
+ }
@@ -21,6 +21,46 @@ module BasicStatistics
21
21
  end
22
22
  end
23
23
 
24
+ # Calculate the values that cuts the data into 0%, 25%, 50%, 75% and 100%.
25
+ # This corresponds to the min, 1st quartile, 2nd quartile, 3rd quartile and the max.
26
+ def quantiles
27
+ return [nil, nil, nil, nil, nil] if length == 0
28
+ sorted = self.sort
29
+ q1 = sorted.quantile_at_ratio(0.25)
30
+ q2 = sorted.quantile_at_ratio(0.50)
31
+ q3 = sorted.quantile_at_ratio(0.75)
32
+ return sorted.first, q1, q2, q3, sorted.last
33
+ end
34
+
35
+ # Calculate the quantile at a given ratio (must be between 0.0 and 1.0) assuming self
36
+ # is a sorted array. This is based on the type 7 quantile function in R.
37
+ def quantile_at_ratio(p)
38
+ n = self.length
39
+ h = (n - 1) * p + 1
40
+ hfloor = h.floor
41
+ if h == hfloor
42
+ self[hfloor-1]
43
+ else
44
+ x_hfloor = self[hfloor-1]
45
+ x_hfloor + (h - hfloor)*(self[hfloor] - x_hfloor)
46
+ end
47
+ end
48
+
49
+ # Calculate the three quartiles of the array.
50
+ def quartiles
51
+ return [nil, nil, nil] if length == 0
52
+ sorted = self.sort
53
+ q1 = sorted.quantile_at_ratio(0.25)
54
+ q2 = sorted.quantile_at_ratio(0.50)
55
+ q3 = sorted.quantile_at_ratio(0.75)
56
+ return q1, q2, q3
57
+ end
58
+
59
+ def inter_quartile_range
60
+ q1, q2, q3 = quartiles
61
+ q3 - q1
62
+ end
63
+
24
64
  def variance
25
65
  return 0 if self.length == 0
26
66
  avg = self.mean
@@ -31,6 +71,17 @@ module BasicStatistics
31
71
  Math.sqrt( self.variance )
32
72
  end
33
73
 
74
+ # Same as R's var, i.e. uses N-1 in denominator.
75
+ def var
76
+ n = self.length.to_f
77
+ (variance * n) / (n-1)
78
+ end
79
+
80
+ # Same as R's sd, i.e. uses N-1 in denominator.
81
+ def sd
82
+ Math.sqrt( self.var )
83
+ end
84
+
34
85
  def root_mean_square
35
86
  Math.sqrt( self.map {|v| v**2}.mean )
36
87
  end
@@ -44,4 +44,8 @@ class Array
44
44
  self.each {|element| count_hash[element] += 1}
45
45
  count_hash
46
46
  end
47
+
48
+ def sample
49
+ self[rand(self.length)]
50
+ end
47
51
  end
@@ -0,0 +1,49 @@
1
+ require 'zlib'
2
+
3
+ module FeldtRuby::Statistics
4
+
5
+ class StringDistance
6
+ def compress(s)
7
+ Zlib::Deflate.deflate(s, 9)
8
+ end
9
+
10
+ def compressed_length(s)
11
+ compress(s).length
12
+ end
13
+
14
+ def distance(string1, string2)
15
+ raise NotImplementedError
16
+ end
17
+ end
18
+
19
+ # Cilibrasi and Vitanyi's NCD.
20
+ class NormalizedCompressionDistance < StringDistance
21
+ def distance(string1, string2)
22
+ return 0.0 if string1 == string2
23
+ c1 = compressed_length(string1)
24
+ c2 = compressed_length(string2)
25
+ c_1_2 = compressed_length(string1 + string2)
26
+ (c_1_2 - [c1, c2].min).to_f / ([c1, c2].max)
27
+ end
28
+ end
29
+
30
+ def ncd(string1, string2)
31
+ (@ncd ||= NormalizedCompressionDistance.new).distance(string1, string2)
32
+ end
33
+
34
+ # Keogh et al's CDM.
35
+ class CompressionBasedDissimilarityMeasure < StringDistance
36
+ def distance(string1, string2)
37
+ return 0.0 if string1 == string2
38
+ c1 = compressed_length(string1)
39
+ c2 = compressed_length(string2)
40
+ c_1_2 = compressed_length(string1 + string2)
41
+ c_1_2.to_f / (c1 + c2)
42
+ end
43
+ end
44
+
45
+ def cdm(string1, string2)
46
+ (@cdm ||= CompressionBasedDissimilarityMeasure.new).distance(string1, string2)
47
+ end
48
+
49
+ end
@@ -0,0 +1,14 @@
1
+ module FeldtRuby
2
+
3
+ class EuclideanDistance
4
+ def calc(o1, o2)
5
+ sum = 0.0
6
+ o1.length.times do |i|
7
+ d = (o1[i] - o2[i])
8
+ sum += (d*d)
9
+ end
10
+ Math.sqrt(sum)
11
+ end
12
+ end
13
+
14
+ end
@@ -0,0 +1,106 @@
1
+ require 'feldtruby/array'
2
+
3
+ module FeldtRuby
4
+
5
+ class FastMap
6
+ # A PivotNode has two pivot objects, a map from each object to its
7
+ # coordinate on the line for these pivots, a distance function and
8
+ # a child pointing to the next dimension.
9
+ # It maps a multi-variate object to a k-dimensional coordinate.
10
+ class PivotNode
11
+ attr_writer :map, :child
12
+
13
+ def initialize(distance, pivot1, pivot2, map = nil, child = nil)
14
+ @distance, @pivot1, @pivot2, @map, @child = distance, pivot1, pivot2, map, child
15
+ @d_1_2 = distance.calc(pivot1, pivot2)
16
+ @d_1_2_squared, @d_1_2_doubled = @d_1_2 * @d_1_2, 2 * @d_1_2
17
+ end
18
+
19
+ # The number of coordinates that will be returned for an object.
20
+ def k; depth; end
21
+ def depth
22
+ @depth ||= 1 + (@child ? @child.depth : 0)
23
+ end
24
+
25
+ # Map an object to its coordinate in the dimension represented by this node.
26
+ def fastmap_coordinate(o)
27
+ ( @distance.calc(o, @pivot1) + @d_1_2_squared - @distance.calc(o, @pivot2) ) / @d_1_2_doubled
28
+ end
29
+
30
+ def coordinate(o)
31
+ [map_object_to_coordinate(o)] + (@child ? @child.coordinate(o) : [])
32
+ end
33
+
34
+ def [](object)
35
+ coordinate(object)
36
+ end
37
+
38
+ def map_object_to_coordinate(o)
39
+ @map[o] || fastmap_coordinate(o)
40
+ end
41
+ end
42
+
43
+ def initialize(distance, k = 2, choiceDepth = 1)
44
+ @distance, @k, @choice_depth = distance, k, choiceDepth
45
+ end
46
+
47
+ def run(objects)
48
+ @objects = objects
49
+ create_map(@k, @distance)
50
+ end
51
+
52
+ def create_map(k, distance)
53
+ return nil if k == 0
54
+ o1, o2 = choose_distant_objects(@objects, @distance)
55
+ node = PivotNode.new(distance, o1, o2)
56
+ coordinate_map = {}
57
+ if distance.calc(o1, o2) == 0.0
58
+ @objects.each {|o| coordinate_map[o] = 0.0}
59
+ else
60
+ @objects.each {|o| coordinate_map[o] = node.fastmap_coordinate(o)}
61
+ end
62
+ node.map = coordinate_map
63
+ node.child = create_map k-1, next_distance(distance, o1, o2, coordinate_map)
64
+ node
65
+ end
66
+
67
+ def choose_distant_objects(objects, distance)
68
+ o1 = nil
69
+ o2 = objects.sample
70
+ # Not sure if there is any benefit to doing this more than once. Test later.
71
+ @choice_depth.times do
72
+ o1 = find_most_distant_object(objects, o2, distance)
73
+ o2 = find_most_distant_object(objects, o1, distance)
74
+ end
75
+ return o1, o2
76
+ end
77
+
78
+ # Find the object in objects that is farthest from o, given a distance function.
79
+ def find_most_distant_object(objects, o, distance)
80
+ objects.sort_by {|oi| distance.calc(oi, o)}.last
81
+ end
82
+
83
+ class DistanceFunction
84
+ def initialize(&func)
85
+ @func = func
86
+ end
87
+ def calc(o1, o2)
88
+ @func.call(o1, o2)
89
+ end
90
+ end
91
+
92
+ # Create the next distance function from a given distance func.
93
+ def next_distance(distance, o1, o2, coordinates)
94
+ DistanceFunction.new do |oi, oj|
95
+ Math.sqrt( distance.calc(oi, oj)**2 - (coordinates[oi] - coordinates[oj])**2 )
96
+ end
97
+ end
98
+ end
99
+
100
+ # Recursively map n-dimensional objects (given as an Array) into a k-dimensional
101
+ # space while preserving the distances between the objects as well as possible.
102
+ def self.fastmap(objects, distance, k = 2)
103
+ FastMap.new(distance, k).run(objects)
104
+ end
105
+
106
+ end
@@ -0,0 +1,26 @@
1
+ require 'feldtruby/array/basic_stats'
2
+
3
+ # The normalization methods assume the existence of basic statistics
4
+ # on the class they are included in:
5
+ # z_normalize: requires mean and sd
6
+ module FeldtRuby::Normalization
7
+ def normalize(&transform)
8
+ self.map {|v| transform.call(v)}
9
+ end
10
+
11
+ def z_normalize
12
+ mean, stdev = self.mean, self.sd
13
+ self.map {|e| (e-mean)/stdev}
14
+ end
15
+
16
+ def min_max_normalize
17
+ return [] if self.length == 0
18
+ min = self.min.to_f
19
+ range = self.max - min
20
+ self.map {|e| (e-min)/range}
21
+ end
22
+ end
23
+
24
+ class Array
25
+ include FeldtRuby::Normalization
26
+ end
@@ -0,0 +1,99 @@
1
+ require 'feldtruby/statistics/normalization'
2
+
3
+ # Implements the basic SAX (Symbolic Aggregate approXimation) from the paper:
4
+ # Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu,
5
+ # "A Symbolic Representation of Time Series, with Implications for Streaming Algorithms", IDMKD 2003.
6
+ # available from: http://www.cs.ucr.edu/~eamonn/SAX.pdf
7
+ module FeldtRuby::Statistics
8
+
9
+ # A SAX processor transforms any numeric stream of data (often a time series)
10
+ # of arbitrary length n to a string (symbolic stream) of arbitrary length w,
11
+ # where w<n, and typically w<<n. The alphabet size (symbols in the string) is
12
+ # also an arbitrary integer _a_, 2<=a<=20. Compared to the SAX described by Keogh et
13
+ # al we state the number of data elements, _elementsPerWord_, that should go
14
+ # into each word, i.e. w = n/elementsPerWord.
15
+ # This allows for many powerful data mining algorithms to be applied and sped up.
16
+ class SAX
17
+ # Create a SAX processor with the given number of data elements per word and alphabet size _a_ (2 to 20).
18
+ def initialize(elementsPerWord, alphabetSize = 6)
19
+ raise ArgumentError if alphabetSize > 20 || alphabetSize < 2
20
+ @elements_per_word, @alphabet_size = elementsPerWord, alphabetSize
21
+ end
22
+
23
+ # A mapper maps the values in a subsequence into a symbol. The standard
24
+ # mapper is state-less and normalizes each subsequence and then assumes
25
+ # a normal distribution and thus uses a fixed selection of bins.
26
+ class SymbolMapper
27
+ def initialize(data = nil)
28
+ # This standard mapper does not utilize the whole data sequence to precalc mapping values. But subclasses might.
29
+ end
30
+
31
+ # Cut points based on a Normal/Gaussian distribution...
32
+ NormalDistCutPoints = {
33
+ 2 => [-Float::INFINITY, 0.00],
34
+ 3 => [-Float::INFINITY, -0.43, 0.43],
35
+ 4 => [-Float::INFINITY, -0.67, 0.00, 0.67],
36
+ 5 => [-Float::INFINITY, -0.84, -0.25, 0.25, 0.84],
37
+ 6 => [-Float::INFINITY, -0.97, -0.43, 0.00, 0.43, 0.97],
38
+ 7 => [-Float::INFINITY, -1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
39
+ 8 => [-Float::INFINITY, -1.15, -0.67, -0.32, 0.00, 0.32, 0.67, 1.15],
40
+ 9 => [-Float::INFINITY, -1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
41
+ 10 => [-Float::INFINITY, -1.28, -0.84, -0.52, -0.25, 0.00, 0.25, 0.52, 0.84, 1.28],
42
+ 11 => [-Float::INFINITY, -1.34, -0.91, -0.60, -0.35, -0.11, 0.11, 0.35, 0.60, 0.91, 1.34],
43
+ 12 => [-Float::INFINITY, -1.38, -0.97, -0.67, -0.43, -0.21, 0.00, 0.21, 0.43, 0.67, 0.97, 1.38],
44
+ 13 => [-Float::INFINITY, -1.43, -1.02, -0.74, -0.50, -0.29, -0.10, 0.10, 0.29, 0.50, 0.74, 1.02, 1.43],
45
+ 14 => [-Float::INFINITY, -1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0.00, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
46
+ 15 => [-Float::INFINITY, -1.5 , -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.50],
47
+ 16 => [-Float::INFINITY, -1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0.00, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
48
+ 17 => [-Float::INFINITY, -1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
49
+ 18 => [-Float::INFINITY, -1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0.00, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
50
+ 19 => [-Float::INFINITY, -1.62, -1.25, -1.00, -0.80, -0.63, -0.48, -0.34, -0.20, -0.07, 0.07, 0.20, 0.34, 0.48, 0.63, 0.80, 1.0, 1.25, 1.62],
51
+ 20 => [-Float::INFINITY, -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0.00, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
52
+ }
53
+
54
+ def supports_alphabet_size?(size)
55
+ NormalDistCutPoints.keys.include? size
56
+ end
57
+
58
+ def map_sequence_to_symbol(sequence, alphabet_size)
59
+ symbol_for_value(sequence.mean, alphabet_size)
60
+ end
61
+
62
+ def symbol_for_value(value, alphabet_size)
63
+ NormalDistCutPoints[alphabet_size].inject(0) do |symbol, cutpoint|
64
+ return symbol if cutpoint > value
65
+ symbol + 1
66
+ end
67
+ end
68
+ end
69
+
70
+ def setup_for_processing_data(data, mapper = nil)
71
+ @mapper ||= SymbolMapper.new(data)
72
+ unless @mapper.supports_alphabet_size?(@alphabet_size)
73
+ raise ArgumentError.new("Mapper does not support the alphabet size (#{@alphabet_size}): #{@mapper}")
74
+ end
75
+ end
76
+
77
+ def process_subsequence(subsequence)
78
+ normalized_ss = subsequence.z_normalize
79
+ len, rem = normalized_ss.length.divmod @elements_per_word
80
+ # Note that if the lengths are not evenly divisible the last word will be based on fewer elements.
81
+ # This is different than the orig SAX as specified in their paper.
82
+ symbols = (0...len).map do |wordindex|
83
+ @mapper.map_sequence_to_symbol(normalized_ss[wordindex * @elements_per_word, @elements_per_word], @alphabet_size)
84
+ end
85
+ symbols << @mapper.map_sequence_to_symbol(normalized_ss[len, @elements_per_word], @alphabet_size) if rem > 0
86
+ symbols
87
+ end
88
+
89
+ def process(data, windowSize = data.length, mapper = nil)
90
+ setup_for_processing_data(data, mapper)
91
+ res = (0..(data.length - windowSize)).map do |i|
92
+ process_subsequence(data[i, windowSize])
93
+ end
94
+ res = res.flatten if windowSize == data.length
95
+ res
96
+ end
97
+ end
98
+
99
+ end
@@ -26,6 +26,15 @@ class RCommunicator
26
26
  @r.eval "if(!library(#{lib}, logical.return=TRUE)) {install.packages(\"#{lib}\"); library(#{lib});}"
27
27
  end
28
28
 
29
+ # Load R scripts in the feldtruby/R directory.
30
+ def load_feldtruby_r_script(scriptName, reload = false)
31
+ @loaded_scripts ||= Array.new # Ensure there is an empty array for loaded script names, if this is first call here.
32
+ return if reload == false && @loaded_scripts.include?(scriptName)
33
+ @loaded_scripts << scriptName
34
+ path = File.join(FeldtRuby::TopDirectory, "R", scriptName)
35
+ @r.eval "source(\"#{path}\")"
36
+ end
37
+
29
38
  def eval(str)
30
39
  @r.eval str
31
40
  end
@@ -103,6 +112,45 @@ module Statistics
103
112
  res = RC.call("chisq.test", vs)
104
113
  res.p_value
105
114
  end
115
+
116
+ class DiffusionKDE
117
+ attr_reader :densities, :mesh
118
+
119
+ # Given an R object with the sub-values named probabilities, densities, mesh, mesh_interval, min and max
120
+ # we can calculate the probability of new values.
121
+ def initialize(rvalue)
122
+ @probabilities = rvalue.probabilities
123
+ @densities = rvalue.densities
124
+ @mesh = rvalue.mesh
125
+ @mesh_interval = rvalue.mesh_interval.to_f
126
+ @min, @max = rvalue.min.to_f, rvalue.max.to_f
127
+ end
128
+
129
+ def density_of(value)
130
+ return 0.0 if value < @min || value > @max
131
+ bin_index = ((value - @min) / @mesh_interval).floor
132
+ @densities[bin_index]
133
+ end
134
+
135
+ def probability_of(value)
136
+ return 0.0 if value < @min || value > @max
137
+ bin_index = ((value - @min) / @mesh_interval).floor
138
+ @probabilities[bin_index]
139
+ end
140
+ end
141
+
142
+ # Do a kernel density estimation based on the sampled _values_, with n bins (rounded up to the next power of 2)
143
+ # and optional min and max values.
144
+ def density_estimation(values, n = 2**9, min = nil, max = nil)
145
+ # Ensure we have loaded the diffusion.kde code
146
+ RC.load_feldtruby_r_script("diffusion_kde.R")
147
+ args = [values, n]
148
+ if min && max
149
+ args << min
150
+ args << max
151
+ end
152
+ DiffusionKDE.new RC.call("diffusion.kde", *args)
153
+ end
106
154
  end
107
155
 
108
156
  # Make them available at top level
@@ -1,3 +1,3 @@
1
1
  module FeldtRuby
2
- VERSION = "0.3.6"
2
+ VERSION = "0.3.8"
3
3
  end
data/lib/feldtruby.rb CHANGED
@@ -4,4 +4,6 @@ if RUBY_VERSION < "1.9"
4
4
  end
5
5
 
6
6
  # This is the namespace under which we put things...
7
- module FeldtRuby; end
7
+ module FeldtRuby
8
+ TopDirectory = File.dirname(__FILE__).split("/")[0...-1].join("/")
9
+ end
data/test/test_array.rb CHANGED
@@ -100,4 +100,11 @@ describe "Array extensions" do
100
100
  counts[5].must_equal 5
101
101
  end
102
102
  end
103
- end
103
+
104
+ describe "sample" do
105
+ it "only samples within the array" do
106
+ d = (1..100).to_a
107
+ 100.times { d.include?(d.sample).must_equal(true) }
108
+ end
109
+ end
110
+ end
@@ -60,6 +60,14 @@ class TestArrayBasicStats < MiniTest::Unit::TestCase
60
60
  end
61
61
  end
62
62
 
63
+ describe "mean and stdev" do
64
+ it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
65
+ data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
66
+ data.mean.must_be_close_to 4.606667
67
+ data.sd.must_be_close_to 2.640316
68
+ end
69
+ end
70
+
63
71
  describe "Basic statistics" do
64
72
  describe "sum of abs" do
65
73
  it "works for simple example" do
@@ -131,4 +139,28 @@ describe "Basic statistics" do
131
139
  [1,2,3,4].summary_stats.must_equal "2.500 (min = 1.0, max = 4.0, median = 2.5, stdev = 1.12)"
132
140
  end
133
141
  end
142
+
143
+ describe "quantile- and quartile-related functionality" do
144
+ it "can calc quantiles, quartiles and IQR for the set used as example for even-numbered sequence for quantiles on Wikipedia" do
145
+ seq = [3, 6, 7, 8, 8, 10, 13, 15, 16, 20]
146
+ seq.quartiles.must_equal [7.25, 9, 14.5]
147
+ seq.quantiles.must_equal [3, 7.25, 9, 14.5, 20]
148
+ seq.inter_quartile_range.must_equal (14.5-7.25)
149
+ end
150
+
151
+ it "can calc quantiles, quartiles and IQR for the set used as example for odd-numbered sequence for quantiles on Wikipedia" do
152
+ seq = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20]
153
+ seq.quartiles.must_equal [7.5, 9, 14]
154
+ seq.quantiles.must_equal [3, 7.5, 9, 14, 20]
155
+ seq.inter_quartile_range.must_equal 6.5
156
+ end
157
+
158
+ it "can calc quantiles, quartiles and IQR for the set used as example for quartiles on Wikipedia" do
159
+ seq = [6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]
160
+
161
+ seq.quartiles.must_equal [25.5, 40, 42.5]
162
+ seq.quantiles.must_equal [6, 25.5, 40.0, 42.5, 49]
163
+ seq.inter_quartile_range.must_equal 17.0
164
+ end
165
+ end
134
166
  end
@@ -0,0 +1,22 @@
1
+ require 'feldtruby/statistics/fastmap'
2
+ require 'feldtruby/statistics/euclidean_distance'
3
+
4
+ describe "Fastmap" do
5
+ it "works for simple data, and different values of k" do
6
+ d = [
7
+ [0, 0, 0, 0],
8
+ [1, 1, 1, 1],
9
+ [2, 2, 2, 2],
10
+ [3, 3, 3, 3]
11
+ ]
12
+ 1.upto(d.first.length-1) do |k|
13
+ m = FeldtRuby.fastmap(d, FeldtRuby::EuclideanDistance.new, k)
14
+ m.depth.must_equal k
15
+ d.each do |datum|
16
+ c = m[datum]
17
+ c.length.must_equal k
18
+ c.must_equal m[datum]
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,36 @@
1
+ require 'feldtruby/statistics/normalization'
2
+
3
+ class Array
4
+ def must_be_close_to(other)
5
+ self.zip(other).map {|a,b| a.must_be_close_to(b)}
6
+ end
7
+ end
8
+
9
+ describe "Z normalization" do
10
+ it "handles empty arrays" do
11
+ [].z_normalize.must_equal []
12
+ end
13
+
14
+ it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
15
+ data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
16
+ expected = [-0.9796808, -0.8622706, -0.6123005, 0.8496459, 1.739691, 1.588194, 1.095829, 0.5277147, 0.4709033, -0.2865819, 0.0921607, -0.2865819, -0.9039323, -1.195564, -1.237226]
17
+ data.z_normalize.must_be_close_to expected
18
+ end
19
+
20
+ it "works for Time series 2 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
21
+ data = [0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]
22
+ expected = [-1.289433, -0.9992189, -0.5253246, -0.06612478, -0.2791935, 0.08816637, -0.06612478, 0.595123, 0.8926845, 0.8228861, 1.741286, 1.770675, -0.2791935, -1.197593, -1.208614]
23
+ data.z_normalize.must_be_close_to expected
24
+ end
25
+ end
26
+
27
+ describe "Min-Max normalization" do
28
+ it "handles empty arrays" do
29
+ [].min_max_normalize.must_equal []
30
+ end
31
+
32
+ it "works for example from http://wiki.answers.com/Q/What_is_min-max_normalization" do
33
+ data = [20, 24, 26, 27, 30]
34
+ data.min_max_normalize.must_be_close_to [0.0, 0.4, 0.6, 0.7, 1.0]
35
+ end
36
+ end
data/test/test_sax.rb ADDED
@@ -0,0 +1,48 @@
1
+ require 'feldtruby/statistics/time_series/sax'
2
+ include FeldtRuby::Statistics
3
+
4
+ describe 'Symbolic Adaptive approXimation - SAX' do
5
+ describe "The standard SAX SymbolMapper, that uses cut points based on Normal/Gaussian distribution" do
6
+ it "accepts alphabet sizes between 2 and 20" do
7
+ sm = SAX::SymbolMapper.new
8
+ sm.supports_alphabet_size?(-1).must_equal false
9
+ sm.supports_alphabet_size?(0).must_equal false
10
+ sm.supports_alphabet_size?(1).must_equal false
11
+ sm.supports_alphabet_size?(2).must_equal true
12
+ sm.supports_alphabet_size?(20).must_equal true
13
+ sm.supports_alphabet_size?(21).must_equal false
14
+ end
15
+
16
+ it "maps correctly to symbols for alphabet of size 2" do
17
+ sm = SAX::SymbolMapper.new
18
+ sm.symbol_for_value(-10, 2).must_equal 1
19
+ sm.symbol_for_value(-1, 2).must_equal 1
20
+ sm.symbol_for_value(1, 2).must_equal 2
21
+ sm.symbol_for_value(10, 2).must_equal 2
22
+ end
23
+
24
+ it "maps correctly to symbols for alphabet of size 4" do
25
+ sm = SAX::SymbolMapper.new
26
+ sm.symbol_for_value(-0.7, 4).must_equal 1
27
+ sm.symbol_for_value(-0.5, 4).must_equal 2
28
+ sm.symbol_for_value(-0.01, 4).must_equal 2
29
+ sm.symbol_for_value(0, 4).must_equal 3
30
+ sm.symbol_for_value(0.01, 4).must_equal 3
31
+ sm.symbol_for_value(0.5, 4).must_equal 3
32
+ sm.symbol_for_value(0.7, 4).must_equal 4
33
+ sm.symbol_for_value(17, 4).must_equal 4
34
+ end
35
+ end
36
+
37
+ it "does not accept alphabet sizes larger than 20 or smaller than 2" do
38
+ proc {SAX.new(10, 21)}.must_raise ArgumentError
39
+ proc {SAX.new(3, 1)}.must_raise ArgumentError
40
+ end
41
+
42
+ it "maps some simple time series to symbols when directly mapping" do
43
+ sax = SAX.new(1, 4)
44
+ sax.process([-1, 0, 1]).must_equal [1,3,4]
45
+ sax.process([-1, -0.5, 0, 0.5, 1]).must_equal [1,2,3,3,4]
46
+ sax.process([-1, -0.5, 0, 0.5, 1].reverse).must_equal [1,2,3,3,4].reverse
47
+ end
48
+ end
@@ -69,6 +69,19 @@ describe "Statistics" do
69
69
  probability_of_same_proportions(([:a] * 570) + ([:b] * 430)).must_be_close_to 5.091e-10
70
70
  end
71
71
  end
72
+
73
+ describe "Diffusions Kernel Density Estimation based on R code loaded from the feldtruby R directory" do
74
+ it "works for simple examples" do
75
+ data = [1]
76
+ kde = density_estimation(data, 4, 0.0, 3.0)
77
+ kde.mesh.must_equal [0.0, 1.0, 2.0, 3.0]
78
+ kde.densities.length.must_equal 4
79
+ kde.densities[0].must_be_close_to 0.3912
80
+ kde.densities[1].must_be_close_to 0.3591
81
+ kde.densities[2].must_be_close_to 0.3101
82
+ kde.densities[3].must_be_close_to 0.2728
83
+ end
84
+ end
72
85
  end
73
86
 
74
87
  require 'feldtruby/minitest_extensions'
@@ -0,0 +1,24 @@
1
+ require 'feldtruby/statistics/distance/string_distance'
2
+ include FeldtRuby::Statistics
3
+
4
+ describe "ncd" do
5
+ it "gives no distance if the strings are the same" do
6
+ ncd("aaa", "aaa").must_equal 0.0
7
+ end
8
+
9
+ it "gives distance > 0.0 if strings are not the same" do
10
+ ncd("a", "b").must_be :>, 0.0
11
+ ncd("aa", "ab").must_be :>, 0.0
12
+ end
13
+ end
14
+
15
+ describe "cdm" do
16
+ it "gives no distance if the strings are the same" do
17
+ cdm("aaa", "aaa").must_equal 0.0
18
+ end
19
+
20
+ it "gives distance > 0.0 if strings are not the same" do
21
+ cdm("a", "b").must_be :>, 0.0
22
+ cdm("aa", "ab").must_be :>, 0.0
23
+ end
24
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feldtruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-16 00:00:00.000000000 Z
12
+ date: 2013-02-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rinruby
@@ -54,6 +54,7 @@ files:
54
54
  - Gemfile
55
55
  - Gemfile.lock
56
56
  - History.txt
57
+ - R/diffusion_kde.R
57
58
  - README.md
58
59
  - Rakefile
59
60
  - TODO
@@ -77,6 +78,11 @@ files:
77
78
  - lib/feldtruby/optimize/search_space.rb
78
79
  - lib/feldtruby/optimize/stdout_logger.rb
79
80
  - lib/feldtruby/statistics.rb
81
+ - lib/feldtruby/statistics/distance/string_distance.rb
82
+ - lib/feldtruby/statistics/euclidean_distance.rb
83
+ - lib/feldtruby/statistics/fastmap.rb
84
+ - lib/feldtruby/statistics/normalization.rb
85
+ - lib/feldtruby/statistics/time_series/sax.rb
80
86
  - lib/feldtruby/string/to_iso.rb
81
87
  - lib/feldtruby/time.rb
82
88
  - lib/feldtruby/vector.rb
@@ -87,15 +93,19 @@ files:
87
93
  - test/test_array.rb
88
94
  - test/test_array_basic_stats.rb
89
95
  - test/test_array_count_by.rb
96
+ - test/test_fastmap.rb
90
97
  - test/test_float.rb
91
98
  - test/test_html_doc_getter.rb
99
+ - test/test_normalization.rb
92
100
  - test/test_optimize.rb
93
101
  - test/test_optimize_differential_evolution.rb
94
102
  - test/test_optimize_objective.rb
95
103
  - test/test_optimize_populationbasedoptimizer.rb
96
104
  - test/test_optimize_random_search.rb
97
105
  - test/test_optimize_search_space.rb
106
+ - test/test_sax.rb
98
107
  - test/test_statistics.rb
108
+ - test/test_string_distance.rb
99
109
  - test/test_time.rb
100
110
  - test/test_vector.rb
101
111
  - test/test_word_counter.rb
@@ -119,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
129
  version: '0'
120
130
  requirements: []
121
131
  rubyforge_project:
122
- rubygems_version: 1.8.24
132
+ rubygems_version: 1.8.25
123
133
  signing_key:
124
134
  specification_version: 3
125
135
  summary: Robert Feldt's Common Ruby Code lib
@@ -128,15 +138,19 @@ test_files:
128
138
  - test/test_array.rb
129
139
  - test/test_array_basic_stats.rb
130
140
  - test/test_array_count_by.rb
141
+ - test/test_fastmap.rb
131
142
  - test/test_float.rb
132
143
  - test/test_html_doc_getter.rb
144
+ - test/test_normalization.rb
133
145
  - test/test_optimize.rb
134
146
  - test/test_optimize_differential_evolution.rb
135
147
  - test/test_optimize_objective.rb
136
148
  - test/test_optimize_populationbasedoptimizer.rb
137
149
  - test/test_optimize_random_search.rb
138
150
  - test/test_optimize_search_space.rb
151
+ - test/test_sax.rb
139
152
  - test/test_statistics.rb
153
+ - test/test_string_distance.rb
140
154
  - test/test_time.rb
141
155
  - test/test_vector.rb
142
156
  - test/test_word_counter.rb