feldtruby 0.3.6 → 0.3.8

Sign up to get free protection for your applications and to get access to all the features.
data/R/diffusion_kde.R ADDED
@@ -0,0 +1,116 @@
1
+ # Gaussian KDE downloaded from
2
+ # http://www-etud.iro.umontreal.ca/~botev/kde.R
3
+ # on February 9th 2013.
4
+ # Our changes:
5
+ # - Renamed function from kde to diffusion.kde
6
+ # - Return an object instead of a matrix
7
+ # - Added the sum of the densities
8
+ # - Added the mesh interval
9
+ # - Added the probabilities for each interval
10
+ # No license was specified on this piece of code. For questions about it please contact
11
+ # Prof. Botev on the email: botev@maths.uq.edu.au
12
diffusion.kde <- function(data, n, MIN, MAX) {
  # Gaussian kernel density estimator for one-dimensional data where the
  # bandwidth is selected by the diffusion method of Botev et al. instead of
  # the commonly employed 'gaussian rule of thumb'.
  #
  # INPUTS:
  #   data     - a numeric vector from which the density estimate is built;
  #   n        - the number of mesh points used in the uniform discretization
  #              of [MIN, MAX]; rounded up to the next power of two (and to at
  #              least 2); defaults to 2^14 when omitted;
  #   MIN, MAX - the interval [MIN, MAX] on which the estimate is constructed;
  #              if either one is omitted BOTH are replaced by the defaults
  #              MIN = min(data) - Range/10 and MAX = max(data) + Range/10,
  #              where Range = max(data) - min(data).
  # OUTPUT:
  #   a list with the elements
  #     probabilities - densities shifted to be non-negative and scaled to sum to 1,
  #     densities     - the density values on the mesh,
  #     mesh          - the n mesh points from MIN to MAX,
  #     sum_density   - sum(densities),
  #     mesh_interval - distance between two neighbouring mesh points,
  #     min, max      - the interval actually used.
  # EXAMPLE:
  #   data <- c(rnorm(10^3), rnorm(10^3) * 2 + 30)
  #   d <- diffusion.kde(data)
  #   plot(d$mesh, d$densities, type = 'l', xlab = 'x', ylab = 'density f(x)')
  #
  # REFERENCE:
  #   Z. I. Botev, J. F. Grotowski and D. P. Kroese
  #   "Kernel Density Estimation Via Diffusion"
  #   Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
  #   for questions email: botev@maths.uq.edu.au

  # missing() is used instead of counting the arguments of match.call() so
  # that named calls such as diffusion.kde(x, MIN = 0, MAX = 1) work as well.
  if (missing(n)) n <- 2^14
  # round n up to the next power of 2; fewer than 2 points cannot be transformed
  n <- max(2, 2^ceiling(log2(n)))
  if (missing(MIN) || missing(MAX)) {
    # define the default interval [MIN, MAX]
    minimum <- min(data)
    maximum <- max(data)
    Range <- maximum - minimum
    MIN <- minimum - Range / 10
    MAX <- maximum + Range / 10
  }

  # set up the grid over which the density estimate is computed
  R <- MAX - MIN
  if (!is.finite(R) || R <= 0) {
    stop("diffusion.kde requires MAX > MIN; the data may contain a single ",
         "distinct value or the given interval is empty", call. = FALSE)
  }
  dx <- R / n
  xmesh <- MIN + seq(0, R, dx)
  N <- length(data)
  # if data has repeated observations use the N below
  # N <- length(as.numeric(names(table(data))))

  # bin the data uniformly using the grid defined above
  w <- hist(data, xmesh, plot = FALSE)
  initial_data <- w$counts / N
  initial_data <- initial_data / sum(initial_data)

  dct1d <- function(data) {
    # computes the discrete cosine transform of the column vector data
    n <- length(data)
    # weights to multiply the DFT coefficients with
    weight <- c(1, 2 * exp(-1i * (1:(n - 1)) * pi / (2 * n)))
    # re-order the elements: odd positions ascending, even positions descending
    data <- c(data[seq(1, n - 1, 2)], data[seq(n, 2, -2)])
    # multiply the FFT by the weights
    Re(weight * fft(data))
  }

  a <- dct1d(initial_data) # discrete cosine transform of initial data
  # now compute the optimal bandwidth^2 using the referenced method
  I <- (1:(n - 1))^2
  a2 <- (a[2:n] / 2)^2

  fixed_point <- function(t, N, I, a2) {
    # this implements the function t - zeta * gamma^[l](t)
    l <- 7
    f <- 2 * (pi^(2 * l)) * sum((I^l) * a2 * exp(-I * (pi^2) * t))
    for (s in (l - 1):2) {
      K0 <- prod(seq(1, 2 * s - 1, 2)) / sqrt(2 * pi)
      const <- (1 + (1 / 2)^(s + 1 / 2)) / 3
      time <- (2 * const * K0 / N / f)^(2 / (3 + 2 * s))
      f <- 2 * pi^(2 * s) * sum(I^s * a2 * exp(-I * pi^2 * time))
    }
    t - (2 * N * sqrt(pi) * f)^(-2 / 5)
  }

  # solve t = zeta * gamma^[l](t); fall back to a fixed rule when uniroot fails
  t_star <- tryCatch(
    uniroot(fixed_point, c(0, .1), N = N, I = I, a2 = a2, tol = 10^(-14))$root,
    error = function(e) .28 * N^(-2 / 5)
  )
  # smooth the discrete cosine transform of initial data using t_star
  a_t <- a * exp(-(0:(n - 1))^2 * pi^2 * t_star / 2)

  idct1d <- function(data) {
    # computes the inverse discrete cosine transform
    n <- length(data)
    weights <- n * exp(1i * (0:(n - 1)) * pi / (2 * n))
    # compute x tilde using equation (5.93) in Jain
    data <- Re(fft(weights * data, inverse = TRUE)) / n
    # re-order elements according to equations (5.93) and (5.94) in Jain
    out <- rep(0, n)
    out[seq(1, n, 2)] <- data[1:(n / 2)]
    out[seq(2, n, 2)] <- data[n:(n / 2 + 1)]
    out
  }

  # apply the inverse transform and take the rescaling of the data into account
  density <- idct1d(a_t) / R
  xmesh <- seq(MIN, MAX, R / (n - 1))
  # Ensure the least density is 0.0 before calculating probabilities
  posd <- density + abs(min(0.0, min(density)))
  list(probabilities = posd / sum(posd), densities = density, mesh = xmesh,
       sum_density = sum(density), mesh_interval = R / (n - 1),
       min = MIN, max = MAX)
}
@@ -21,6 +21,46 @@ module BasicStatistics
21
21
  end
22
22
  end
23
23
 
24
+ # Calculate the values that cuts the data into 0%, 25%, 50%, 75% and 100%.
25
+ # This corresponds to the min, 1st quartile, 2nd quartile, 3rd quartile and the max.
26
def quantiles
  # Five nils for an empty array, otherwise min, Q1, Q2, Q3, max.
  return [nil, nil, nil, nil, nil] if length == 0
  q1, q2, q3 = quartiles
  return self.min, q1, q2, q3, self.max
end

# Calculate the quantile at a given ratio assuming self is a sorted array.
# This is based on the type 7 quantile function in R.
# Returns nil for an empty array and raises ArgumentError if the ratio is
# not a number between 0.0 and 1.0 (a negative ratio would otherwise index
# from the end of the array and silently return a wrong value).
def quantile_at_ratio(p)
  unless p.is_a?(Numeric) && p >= 0.0 && p <= 1.0
    raise ArgumentError, "ratio must be between 0.0 and 1.0, got #{p.inspect}"
  end
  n = self.length
  return nil if n == 0
  h = (n - 1) * p + 1
  hfloor = h.floor
  x_hfloor = self[hfloor-1]
  return x_hfloor if h == hfloor
  # Interpolate linearly between the two neighbouring elements.
  x_hfloor + (h - hfloor)*(self[hfloor] - x_hfloor)
end

# Calculate the three quartiles of the array (three nils if it is empty).
def quartiles
  return [nil, nil, nil] if length == 0
  sorted = self.sort
  return [0.25, 0.50, 0.75].map {|ratio| sorted.quantile_at_ratio(ratio)}
end

# Distance between the third and the first quartile, nil for an empty array.
def inter_quartile_range
  q1, _, q3 = quartiles
  return nil if q1.nil? || q3.nil?
  q3 - q1
end
63
+
24
64
  def variance
25
65
  return 0 if self.length == 0
26
66
  avg = self.mean
@@ -31,6 +71,17 @@ module BasicStatistics
31
71
  Math.sqrt( self.variance )
32
72
  end
33
73
 
74
+ # Same as R's var, i.e. uses N-1 in denominator.
75
+ def var
76
+ n = self.length.to_f
77
+ (variance * n) / (n-1)
78
+ end
79
+
80
+ # Same as R's sd, i.e. uses N-1 in denominator.
81
+ def sd
82
+ Math.sqrt( self.var )
83
+ end
84
+
34
85
  def root_mean_square
35
86
  Math.sqrt( self.map {|v| v**2}.mean )
36
87
  end
@@ -44,4 +44,8 @@ class Array
44
44
  self.each {|element| count_hash[element] += 1}
45
45
  count_hash
46
46
  end
47
+
48
# Ruby 1.9 and later ship Array#sample, which also supports sample(n).
# Only add this single-element fallback when the built-in is missing, so
# the richer core method is never replaced by a narrower one.
unless Array.method_defined?(:sample)
  # Return one randomly chosen element (nil for an empty array).
  def sample
    self[rand(self.length)]
  end
end
47
51
  end
@@ -0,0 +1,49 @@
1
+ require 'zlib'
2
+
3
module FeldtRuby::Statistics

  # Common base for distances that compare strings via their zlib-compressed
  # sizes. Subclasses implement distance.
  class StringDistance
    # Deflate the string with compression level 9.
    def compress(s)
      Zlib::Deflate.deflate(s, 9)
    end

    # Length of the deflated string.
    def compressed_length(s)
      compress(s).length
    end

    def distance(string1, string2)
      raise NotImplementedError
    end
  end

  # Cilibrasi and Vitanyi's NCD.
  class NormalizedCompressionDistance < StringDistance
    def distance(string1, string2)
      if string1 == string2
        0.0
      else
        smallest, largest = [compressed_length(string1), compressed_length(string2)].minmax
        joined = compressed_length(string1 + string2)
        (joined - smallest).to_f / largest
      end
    end
  end

  # NCD between two strings, using one memoized distance object.
  def ncd(string1, string2)
    @ncd ||= NormalizedCompressionDistance.new
    @ncd.distance(string1, string2)
  end

  # Keogh et al's CDM.
  class CompressionBasedDissimilarityMeasure < StringDistance
    def distance(string1, string2)
      if string1 == string2
        0.0
      else
        separate = compressed_length(string1) + compressed_length(string2)
        compressed_length(string1 + string2).to_f / separate
      end
    end
  end

  # CDM between two strings, using one memoized distance object.
  def cdm(string1, string2)
    @cdm ||= CompressionBasedDissimilarityMeasure.new
    @cdm.distance(string1, string2)
  end

end
@@ -0,0 +1,14 @@
1
module FeldtRuby

  # Euclidean (L2) distance between two indexable objects of equal length.
  class EuclideanDistance
    # Returns the square root of the summed squared coordinate differences.
    # Raises ArgumentError when the two objects differ in length; without
    # the check the extra coordinates of a longer o2 were silently ignored.
    def calc(o1, o2)
      unless o1.length == o2.length
        raise ArgumentError, "objects must have the same length (#{o1.length} != #{o2.length})"
      end
      sum = 0.0
      o1.length.times do |i|
        d = (o1[i] - o2[i])
        sum += (d*d)
      end
      Math.sqrt(sum)
    end
  end

end
@@ -0,0 +1,106 @@
1
+ require 'feldtruby/array'
2
+
3
module FeldtRuby

  # FastMap: maps objects, for which only a distance function is known, to
  # points in a k-dimensional space. Each dimension is defined by a pair of
  # distant pivot objects; an object is projected onto the line through them.
  class FastMap
    # A PivotNode has two pivot objects, a map from each object to its
    # coordinate on the line for these pivots, a distance function and
    # a child pointing to the next dimension.
    # It maps a multi-variate object to a k-dimensional coordinate.
    class PivotNode
      attr_writer :map

      def initialize(distance, pivot1, pivot2, map = nil, child = nil)
        @distance, @pivot1, @pivot2, @map, @child = distance, pivot1, pivot2, map, child
        # to_f so that integer-valued distance functions do not cause integer division.
        @d_1_2 = distance.calc(pivot1, pivot2).to_f
        @d_1_2_squared, @d_1_2_doubled = @d_1_2 * @d_1_2, 2 * @d_1_2
      end

      # Set the node for the next dimension; the memoized depth is reset
      # since it depends on the child chain.
      def child=(node)
        @depth = nil
        @child = node
      end

      # The number of coordinates that will be returned for an object.
      def k; depth; end
      def depth
        @depth ||= 1 + (@child ? @child.depth : 0)
      end

      # Map an object to its coordinate in the dimension represented by this
      # node, using the cosine-law projection
      #   x = (d(o,p1)^2 + d(p1,p2)^2 - d(o,p2)^2) / (2 * d(p1,p2)).
      # When the two pivots coincide there is no line to project on and all
      # objects get the coordinate 0.0.
      def fastmap_coordinate(o)
        return 0.0 if @d_1_2 == 0.0
        d1 = @distance.calc(o, @pivot1)
        d2 = @distance.calc(o, @pivot2)
        (d1 * d1 + @d_1_2_squared - d2 * d2) / @d_1_2_doubled
      end

      # All coordinates of the object: this dimension followed by the children's.
      def coordinate(o)
        [map_object_to_coordinate(o)] + (@child ? @child.coordinate(o) : [])
      end

      def [](object)
        coordinate(object)
      end

      # Use the precomputed coordinate if there is one, otherwise project the object.
      def map_object_to_coordinate(o)
        (@map && @map[o]) || fastmap_coordinate(o)
      end
    end

    def initialize(distance, k = 2, choiceDepth = 1)
      @distance, @k, @choice_depth = distance, k, choiceDepth
    end

    # Build the map for the given objects. Returns the PivotNode of the first
    # dimension, or nil if k is not positive or there are no objects.
    def run(objects)
      @objects = objects
      create_map(@k, @distance)
    end

    def create_map(k, distance)
      return nil if k <= 0 || @objects.empty?
      # The pivots must be distant in the distance of THIS dimension, i.e. in
      # what is left after the previous dimensions have been projected away.
      o1, o2 = choose_distant_objects(@objects, distance)
      node = PivotNode.new(distance, o1, o2)
      coordinate_map = {}
      @objects.each {|o| coordinate_map[o] = node.fastmap_coordinate(o)}
      node.map = coordinate_map
      node.child = create_map(k-1, next_distance(distance, o1, o2, coordinate_map, node))
      node
    end

    def choose_distant_objects(objects, distance)
      o1 = nil
      o2 = objects.sample
      # Not sure if there is any benefit to doing this more than once. Test later.
      # At least one round is needed, otherwise o1 would stay nil.
      [@choice_depth, 1].max.times do
        o1 = find_most_distant_object(objects, o2, distance)
        o2 = find_most_distant_object(objects, o1, distance)
      end
      return o1, o2
    end

    # Find the object in objects that is farthest from o, given a distance function.
    def find_most_distant_object(objects, o, distance)
      objects.max_by {|oi| distance.calc(oi, o)}
    end

    class DistanceFunction
      def initialize(&func)
        @func = func
      end
      def calc(o1, o2)
        @func.call(o1, o2)
      end
    end

    # Create the next distance function from a given distance func: the
    # distance that remains once the given coordinates are accounted for.
    # If a node is given, objects missing from coordinates (i.e. objects that
    # were not part of the mapped set) are projected with it on demand.
    def next_distance(distance, o1, o2, coordinates, node = nil)
      lookup = if node
        lambda {|o| coordinates.fetch(o) { node.fastmap_coordinate(o) }}
      else
        lambda {|o| coordinates[o]}
      end
      DistanceFunction.new do |oi, oj|
        squared = distance.calc(oi, oj)**2 - (lookup.call(oi) - lookup.call(oj))**2
        # Rounding errors (or a non-Euclidean distance) can make this slightly
        # negative, which Math.sqrt rejects; treat that as no remaining distance.
        squared > 0.0 ? Math.sqrt(squared) : 0.0
      end
    end
  end

  # Recursively map n-dimensional objects (given as an Array) into a k-dimensional
  # space while preserving the distances between the objects as well as possible.
  def self.fastmap(objects, distance, k = 2)
    FastMap.new(distance, k).run(objects)
  end

end
@@ -0,0 +1,26 @@
1
+ require 'feldtruby/array/basic_stats'
2
+
3
+ # The normalization methods assume the existence of basic statistics
4
+ # on the class they are included in:
5
+ # z_normalize: requires mean and sd
6
module FeldtRuby::Normalization
  # Map every value through the given block.
  def normalize(&transform)
    self.map {|v| transform.call(v)}
  end

  # Subtract the mean from every value and divide by the standard
  # deviation (as given by sd). If the standard deviation is zero or not a
  # number (constant or single-element input) every value is mapped to 0.0
  # instead of NaN.
  def z_normalize
    mean, stdev = self.mean, self.sd
    return self.map { 0.0 } if stdev.nil? || stdev == 0 || (stdev.respond_to?(:nan?) && stdev.nan?)
    self.map {|e| (e-mean)/stdev}
  end

  # Rescale the values linearly so that the smallest becomes 0.0 and the
  # largest 1.0. If all values are equal every value is mapped to 0.0
  # instead of NaN.
  def min_max_normalize
    return [] if self.length == 0
    min = self.min.to_f
    range = self.max - min
    return self.map { 0.0 } if range == 0
    self.map {|e| (e-min)/range}
  end
end

class Array
  include FeldtRuby::Normalization
end
@@ -0,0 +1,99 @@
1
+ require 'feldtruby/statistics/normalization'
2
+
3
+ # Implements the basic SAX (Symbolic Aggregate approXimation) from the paper:
4
+ # Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu,
5
+ # "A Symbolic Representation of Time Series, with Implications for Streaming Algorithms", IDMKD 2003.
6
+ # available from: http://www.cs.ucr.edu/~eamonn/SAX.pdf
7
module FeldtRuby::Statistics

  # A SAX processor transforms any numeric stream of data (often a time series)
  # of arbitrary length n to a string (symbolic stream) of arbitrary length w,
  # where w<n, and typically w<<n. The alphabet size (symbols in the string) is
  # also an arbitrary integer _a_, 2 <= a <= 20. Compared to the SAX described by
  # Keogh et al we state the number of data elements, _elementsPerWord_, that
  # should go into each word, i.e. w = n/elementsPerWord.
  # This allows for many powerful data mining algorithms to be applied and sped up.
  class SAX
    # Create a SAX processor that puts _elementsPerWord_ data elements into
    # each word and uses an alphabet of size _alphabetSize_ (between 2 and 20).
    def initialize(elementsPerWord, alphabetSize = 6)
      raise ArgumentError if alphabetSize > 20 || alphabetSize < 2
      @elements_per_word, @alphabet_size = elementsPerWord, alphabetSize
    end

    # A mapper maps the values in a subsequence into a symbol. The standard
    # mapper is state-less and normalizes each subsequence and then assumes
    # a normal distribution and thus uses a fixed selection of bins.
    class SymbolMapper
      def initialize(data = nil)
        # This standard mapper does not utilize the whole data sequence to precalc mapping values. But subclasses might.
      end

      # Cut points based on a Normal/Gaussian distribution...
      NormalDistCutPoints = {
        2 => [-Float::INFINITY, 0.00],
        3 => [-Float::INFINITY, -0.43, 0.43],
        4 => [-Float::INFINITY, -0.67, 0.00, 0.67],
        5 => [-Float::INFINITY, -0.84, -0.25, 0.25, 0.84],
        6 => [-Float::INFINITY, -0.97, -0.43, 0.00, 0.43, 0.97],
        7 => [-Float::INFINITY, -1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
        8 => [-Float::INFINITY, -1.15, -0.67, -0.32, 0.00, 0.32, 0.67, 1.15],
        9 => [-Float::INFINITY, -1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
        10 => [-Float::INFINITY, -1.28, -0.84, -0.52, -0.25, 0.00, 0.25, 0.52, 0.84, 1.28],
        11 => [-Float::INFINITY, -1.34, -0.91, -0.60, -0.35, -0.11, 0.11, 0.35, 0.60, 0.91, 1.34],
        12 => [-Float::INFINITY, -1.38, -0.97, -0.67, -0.43, -0.21, 0.00, 0.21, 0.43, 0.67, 0.97, 1.38],
        13 => [-Float::INFINITY, -1.43, -1.02, -0.74, -0.50, -0.29, -0.10, 0.10, 0.29, 0.50, 0.74, 1.02, 1.43],
        14 => [-Float::INFINITY, -1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0.00, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
        15 => [-Float::INFINITY, -1.5 , -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.50],
        16 => [-Float::INFINITY, -1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0.00, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
        17 => [-Float::INFINITY, -1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
        18 => [-Float::INFINITY, -1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0.00, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
        19 => [-Float::INFINITY, -1.62, -1.25, -1.00, -0.80, -0.63, -0.48, -0.34, -0.20, -0.07, 0.07, 0.20, 0.34, 0.48, 0.63, 0.80, 1.0, 1.25, 1.62],
        20 => [-Float::INFINITY, -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0.00, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
      }

      def supports_alphabet_size?(size)
        NormalDistCutPoints.keys.include? size
      end

      # The symbol for a sequence is the symbol for its mean value.
      def map_sequence_to_symbol(sequence, alphabet_size)
        symbol_for_value(sequence.mean, alphabet_size)
      end

      # Symbols are numbered from 1; the symbol is the number of cut points
      # that are less than or equal to the value.
      def symbol_for_value(value, alphabet_size)
        NormalDistCutPoints[alphabet_size].inject(0) do |symbol, cutpoint|
          return symbol if cutpoint > value
          symbol + 1
        end
      end
    end

    # Select the mapper to use: an explicitly given mapper wins, otherwise
    # a previously set up mapper is kept, otherwise a standard SymbolMapper
    # is created. Raises ArgumentError if the mapper cannot handle the
    # alphabet size.
    def setup_for_processing_data(data, mapper = nil)
      @mapper = mapper || @mapper || SymbolMapper.new(data)
      unless @mapper.supports_alphabet_size?(@alphabet_size)
        raise ArgumentError.new("Mapper does not support the alphabet size (#{@alphabet_size}): #{@mapper}")
      end
    end

    # Z-normalize the subsequence and map each run of @elements_per_word
    # consecutive values to one symbol.
    def process_subsequence(subsequence)
      normalized_ss = subsequence.z_normalize
      # Note that if the lengths are not evenly divisible the last word will be based on fewer elements.
      # This is different than the orig SAX as specified in their paper.
      normalized_ss.each_slice(@elements_per_word).map do |word|
        @mapper.map_sequence_to_symbol(word, @alphabet_size)
      end
    end

    # Slide a window of windowSize elements over the data and process each
    # window. The result is one array of symbols per window, flattened to a
    # single array when the window covers all of the data.
    def process(data, windowSize = data.length, mapper = nil)
      setup_for_processing_data(data, mapper)
      res = (0..(data.length - windowSize)).map do |i|
        process_subsequence(data[i, windowSize])
      end
      res = res.flatten if windowSize == data.length
      res
    end
  end

end
@@ -26,6 +26,15 @@ class RCommunicator
26
26
  @r.eval "if(!library(#{lib}, logical.return=TRUE)) {install.packages(\"#{lib}\"); library(#{lib});}"
27
27
  end
28
28
 
29
+ # Load R scripts in the feldtruby/R directory.
30
def load_feldtruby_r_script(scriptName, reload = false)
  # Sources the script named scriptName from the "R" directory below
  # FeldtRuby::TopDirectory. A script that is already loaded is skipped
  # unless reload is given.
  @loaded_scripts ||= Array.new # Ensure there is an empty array for loaded script names, if this is first call here.
  already_loaded = @loaded_scripts.include?(scriptName)
  return if reload == false && already_loaded
  path = File.join(FeldtRuby::TopDirectory, "R", scriptName)
  result = @r.eval "source(\"#{path}\")"
  # Record the script only after it was sourced without raising, and only
  # once, so a failed load is retried and reloads do not add duplicates.
  @loaded_scripts << scriptName unless already_loaded
  result
end
37
+
29
38
  def eval(str)
30
39
  @r.eval str
31
40
  end
@@ -103,6 +112,45 @@ module Statistics
103
112
  res = RC.call("chisq.test", vs)
104
113
  res.p_value
105
114
  end
115
+
116
# Ruby-side view of the result of the diffusion.kde R function.
class DiffusionKDE
  attr_reader :densities, :mesh, :probabilities

  # Given an R object with the sub-values named probabilities, densities,
  # mesh, mesh_interval, min and max we can look up the density and the
  # probability of new values.
  def initialize(rvalue)
    @probabilities = rvalue.probabilities
    @densities = rvalue.densities
    @mesh = rvalue.mesh
    @mesh_interval = rvalue.mesh_interval.to_f
    @min, @max = rvalue.min.to_f, rvalue.max.to_f
  end

  # Density in the bin that value falls in; 0.0 outside of [min, max].
  def density_of(value)
    value_in_bin(@densities, value)
  end

  # Probability of the bin that value falls in; 0.0 outside of [min, max].
  def probability_of(value)
    value_in_bin(@probabilities, value)
  end

  private

  # Look up the entry of per_bin_values for the bin that value falls in.
  def value_in_bin(per_bin_values, value)
    return 0.0 if value < @min || value > @max
    # A zero-width mesh has a single bin; dividing by 0.0 would give NaN,
    # for which floor raises FloatDomainError.
    return (per_bin_values[0] || 0.0) if @mesh_interval == 0.0
    bin_index = ((value - @min) / @mesh_interval).floor
    # Guard against rounding pushing a value at the upper edge past the last bin.
    bin_index = [bin_index, per_bin_values.length - 1].min
    per_bin_values[bin_index] || 0.0
  end
end
141
+
142
+ # Do a kernel density estimation based on the sampled _values_, with n bins (rounded up to the nearest power of 2)
143
+ # and optional min and max values.
144
def density_estimation(values, n = 2**9, min = nil, max = nil)
  # The diffusion.kde R function must be loaded before it can be called.
  RC.load_feldtruby_r_script("diffusion_kde.R")
  # The interval is only passed on when both of its ends are given.
  interval = (min && max) ? [min, max] : []
  r_result = RC.call("diffusion.kde", values, n, *interval)
  DiffusionKDE.new r_result
end
106
154
  end
107
155
 
108
156
  # Make them available at top level
@@ -1,3 +1,3 @@
1
1
  module FeldtRuby
2
- VERSION = "0.3.6"
2
+ VERSION = "0.3.8"
3
3
  end
data/lib/feldtruby.rb CHANGED
@@ -4,4 +4,6 @@ if RUBY_VERSION < "1.9"
4
4
  end
5
5
 
6
6
  # This is the namespace under which we put things...
7
- module FeldtRuby; end
7
module FeldtRuby
  # Absolute path of the directory above the one holding this file.
  # expand_path is used since splitting a relative __FILE__ such as
  # "lib/feldtruby.rb" on "/" gives an empty string, which File.join then
  # turns into a path below the file system root.
  TopDirectory = File.expand_path("..", File.dirname(__FILE__))
end
data/test/test_array.rb CHANGED
@@ -100,4 +100,11 @@ describe "Array extensions" do
100
100
  counts[5].must_equal 5
101
101
  end
102
102
  end
103
- end
103
+
104
+ describe "sample" do
105
+ it "only samples within the array" do
106
+ d = (1..100).to_a
107
+ 100.times { d.include?(d.sample).must_equal(true) }
108
+ end
109
+ end
110
+ end
@@ -60,6 +60,14 @@ class TestArrayBasicStats < MiniTest::Unit::TestCase
60
60
  end
61
61
  end
62
62
 
63
+ describe "mean and stdev" do
64
+ it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
65
+ data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
66
+ data.mean.must_be_close_to 4.606667
67
+ data.sd.must_be_close_to 2.640316
68
+ end
69
+ end
70
+
63
71
  describe "Basic statistics" do
64
72
  describe "sum of abs" do
65
73
  it "works for simple example" do
@@ -131,4 +139,28 @@ describe "Basic statistics" do
131
139
  [1,2,3,4].summary_stats.must_equal "2.500 (min = 1.0, max = 4.0, median = 2.5, stdev = 1.12)"
132
140
  end
133
141
  end
142
+
143
+ describe "quantile- and quartile-related functionality" do
144
+ it "can calc quantiles, quartiles and IQR for the set used as example for even-numbered sequence for quantiles on Wikipedia" do
145
+ seq = [3, 6, 7, 8, 8, 10, 13, 15, 16, 20]
146
+ seq.quartiles.must_equal [7.25, 9, 14.5]
147
+ seq.quantiles.must_equal [3, 7.25, 9, 14.5, 20]
148
+ seq.inter_quartile_range.must_equal (14.5-7.25)
149
+ end
150
+
151
+ it "can calc quantiles, quartiles and IQR for the set used as example for odd-numbered sequence for quantiles on Wikipedia" do
152
+ seq = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20]
153
+ seq.quartiles.must_equal [7.5, 9, 14]
154
+ seq.quantiles.must_equal [3, 7.5, 9, 14, 20]
155
+ seq.inter_quartile_range.must_equal 6.5
156
+ end
157
+
158
+ it "can calc quantiles, quartiles and IQR for the set used as example for quartiles on Wikipedia" do
159
+ seq = [6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]
160
+
161
+ seq.quartiles.must_equal [25.5, 40, 42.5]
162
+ seq.quantiles.must_equal [6, 25.5, 40.0, 42.5, 49]
163
+ seq.inter_quartile_range.must_equal 17.0
164
+ end
165
+ end
134
166
  end
@@ -0,0 +1,22 @@
1
+ require 'feldtruby/statistics/fastmap'
2
+ require 'feldtruby/statistics/euclidean_distance'
3
+
4
+ describe "Fastmap" do
5
+ it "works for simple data, and different values of k" do
6
+ d = [
7
+ [0, 0, 0, 0],
8
+ [1, 1, 1, 1],
9
+ [2, 2, 2, 2],
10
+ [3, 3, 3, 3]
11
+ ]
12
+ 1.upto(d.first.length-1) do |k|
13
+ m = FeldtRuby.fastmap(d, FeldtRuby::EuclideanDistance.new, k)
14
+ m.depth.must_equal k
15
+ d.each do |datum|
16
+ c = m[datum]
17
+ c.length.must_equal k
18
+ c.must_equal m[datum]
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,36 @@
1
+ require 'feldtruby/statistics/normalization'
2
+
3
+ class Array
4
+ def must_be_close_to(other)
5
+ self.zip(other).map {|a,b| a.must_be_close_to(b)}
6
+ end
7
+ end
8
+
9
+ describe "Z normalization" do
10
+ it "handles empty arrays" do
11
+ [].z_normalize.must_equal []
12
+ end
13
+
14
+ it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
15
+ data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
16
+ expected = [-0.9796808, -0.8622706, -0.6123005, 0.8496459, 1.739691, 1.588194, 1.095829, 0.5277147, 0.4709033, -0.2865819, 0.0921607, -0.2865819, -0.9039323, -1.195564, -1.237226]
17
+ data.z_normalize.must_be_close_to expected
18
+ end
19
+
20
+ it "works for Time series 2 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
21
+ data = [0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]
22
+ expected = [-1.289433, -0.9992189, -0.5253246, -0.06612478, -0.2791935, 0.08816637, -0.06612478, 0.595123, 0.8926845, 0.8228861, 1.741286, 1.770675, -0.2791935, -1.197593, -1.208614]
23
+ data.z_normalize.must_be_close_to expected
24
+ end
25
+ end
26
+
27
+ describe "Min-Max normalization" do
28
+ it "handles empty arrays" do
29
+ [].min_max_normalize.must_equal []
30
+ end
31
+
32
+ it "works for example from http://wiki.answers.com/Q/What_is_min-max_normalization" do
33
+ data = [20, 24, 26, 27, 30]
34
+ data.min_max_normalize.must_be_close_to [0.0, 0.4, 0.6, 0.7, 1.0]
35
+ end
36
+ end
data/test/test_sax.rb ADDED
@@ -0,0 +1,48 @@
1
+ require 'feldtruby/statistics/time_series/sax'
2
+ include FeldtRuby::Statistics
3
+
4
+ describe 'Symbolic Adaptive approXimation - SAX' do
5
+ describe "The standard SAX SymbolMapper, that uses cut points based on Normal/Gaussian distribution" do
6
+ it "accepts alphabet sizes between 2 and 20" do
7
+ sm = SAX::SymbolMapper.new
8
+ sm.supports_alphabet_size?(-1).must_equal false
9
+ sm.supports_alphabet_size?(0).must_equal false
10
+ sm.supports_alphabet_size?(1).must_equal false
11
+ sm.supports_alphabet_size?(2).must_equal true
12
+ sm.supports_alphabet_size?(20).must_equal true
13
+ sm.supports_alphabet_size?(21).must_equal false
14
+ end
15
+
16
+ it "maps correctly to symbols for alphabet of size 2" do
17
+ sm = SAX::SymbolMapper.new
18
+ sm.symbol_for_value(-10, 2).must_equal 1
19
+ sm.symbol_for_value(-1, 2).must_equal 1
20
+ sm.symbol_for_value(1, 2).must_equal 2
21
+ sm.symbol_for_value(10, 2).must_equal 2
22
+ end
23
+
24
+ it "maps correctly to symbols for alphabet of size 4" do
25
+ sm = SAX::SymbolMapper.new
26
+ sm.symbol_for_value(-0.7, 4).must_equal 1
27
+ sm.symbol_for_value(-0.5, 4).must_equal 2
28
+ sm.symbol_for_value(-0.01, 4).must_equal 2
29
+ sm.symbol_for_value(0, 4).must_equal 3
30
+ sm.symbol_for_value(0.01, 4).must_equal 3
31
+ sm.symbol_for_value(0.5, 4).must_equal 3
32
+ sm.symbol_for_value(0.7, 4).must_equal 4
33
+ sm.symbol_for_value(17, 4).must_equal 4
34
+ end
35
+ end
36
+
37
+ it "does not accept alphabet sizes larger than 20 or smaller than 2" do
38
+ proc {SAX.new(10, 21)}.must_raise ArgumentError
39
+ proc {SAX.new(3, 1)}.must_raise ArgumentError
40
+ end
41
+
42
+ it "maps some simple time series to symbols when directly mapping" do
43
+ sax = SAX.new(1, 4)
44
+ sax.process([-1, 0, 1]).must_equal [1,3,4]
45
+ sax.process([-1, -0.5, 0, 0.5, 1]).must_equal [1,2,3,3,4]
46
+ sax.process([-1, -0.5, 0, 0.5, 1].reverse).must_equal [1,2,3,3,4].reverse
47
+ end
48
+ end
@@ -69,6 +69,19 @@ describe "Statistics" do
69
69
  probability_of_same_proportions(([:a] * 570) + ([:b] * 430)).must_be_close_to 5.091e-10
70
70
  end
71
71
  end
72
+
73
+ describe "Diffusions Kernel Density Estimation based on R code loaded from the feldtruby R directory" do
74
+ it "works for simple examples" do
75
+ data = [1]
76
+ kde = density_estimation(data, 4, 0.0, 3.0)
77
+ kde.mesh.must_equal [0.0, 1.0, 2.0, 3.0]
78
+ kde.densities.length.must_equal 4
79
+ kde.densities[0].must_be_close_to 0.3912
80
+ kde.densities[1].must_be_close_to 0.3591
81
+ kde.densities[2].must_be_close_to 0.3101
82
+ kde.densities[3].must_be_close_to 0.2728
83
+ end
84
+ end
72
85
  end
73
86
 
74
87
  require 'feldtruby/minitest_extensions'
@@ -0,0 +1,24 @@
1
+ require 'feldtruby/statistics/distance/string_distance'
2
+ include FeldtRuby::Statistics
3
+
4
+ describe "ncd" do
5
+ it "gives no distance if the strings are the same" do
6
+ ncd("aaa", "aaa").must_equal 0.0
7
+ end
8
+
9
+ it "gives distance > 0.0 if strings are not the same" do
10
+ ncd("a", "b").must_be :>, 0.0
11
+ ncd("aa", "ab").must_be :>, 0.0
12
+ end
13
+ end
14
+
15
+ describe "cdm" do
16
+ it "gives no distance if the strings are the same" do
17
+ cdm("aaa", "aaa").must_equal 0.0
18
+ end
19
+
20
+ it "gives distance > 0.0 if strings are not the same" do
21
+ cdm("a", "b").must_be :>, 0.0
22
+ cdm("aa", "ab").must_be :>, 0.0
23
+ end
24
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feldtruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-16 00:00:00.000000000 Z
12
+ date: 2013-02-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rinruby
@@ -54,6 +54,7 @@ files:
54
54
  - Gemfile
55
55
  - Gemfile.lock
56
56
  - History.txt
57
+ - R/diffusion_kde.R
57
58
  - README.md
58
59
  - Rakefile
59
60
  - TODO
@@ -77,6 +78,11 @@ files:
77
78
  - lib/feldtruby/optimize/search_space.rb
78
79
  - lib/feldtruby/optimize/stdout_logger.rb
79
80
  - lib/feldtruby/statistics.rb
81
+ - lib/feldtruby/statistics/distance/string_distance.rb
82
+ - lib/feldtruby/statistics/euclidean_distance.rb
83
+ - lib/feldtruby/statistics/fastmap.rb
84
+ - lib/feldtruby/statistics/normalization.rb
85
+ - lib/feldtruby/statistics/time_series/sax.rb
80
86
  - lib/feldtruby/string/to_iso.rb
81
87
  - lib/feldtruby/time.rb
82
88
  - lib/feldtruby/vector.rb
@@ -87,15 +93,19 @@ files:
87
93
  - test/test_array.rb
88
94
  - test/test_array_basic_stats.rb
89
95
  - test/test_array_count_by.rb
96
+ - test/test_fastmap.rb
90
97
  - test/test_float.rb
91
98
  - test/test_html_doc_getter.rb
99
+ - test/test_normalization.rb
92
100
  - test/test_optimize.rb
93
101
  - test/test_optimize_differential_evolution.rb
94
102
  - test/test_optimize_objective.rb
95
103
  - test/test_optimize_populationbasedoptimizer.rb
96
104
  - test/test_optimize_random_search.rb
97
105
  - test/test_optimize_search_space.rb
106
+ - test/test_sax.rb
98
107
  - test/test_statistics.rb
108
+ - test/test_string_distance.rb
99
109
  - test/test_time.rb
100
110
  - test/test_vector.rb
101
111
  - test/test_word_counter.rb
@@ -119,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
129
  version: '0'
120
130
  requirements: []
121
131
  rubyforge_project:
122
- rubygems_version: 1.8.24
132
+ rubygems_version: 1.8.25
123
133
  signing_key:
124
134
  specification_version: 3
125
135
  summary: Robert Feldt's Common Ruby Code lib
@@ -128,15 +138,19 @@ test_files:
128
138
  - test/test_array.rb
129
139
  - test/test_array_basic_stats.rb
130
140
  - test/test_array_count_by.rb
141
+ - test/test_fastmap.rb
131
142
  - test/test_float.rb
132
143
  - test/test_html_doc_getter.rb
144
+ - test/test_normalization.rb
133
145
  - test/test_optimize.rb
134
146
  - test/test_optimize_differential_evolution.rb
135
147
  - test/test_optimize_objective.rb
136
148
  - test/test_optimize_populationbasedoptimizer.rb
137
149
  - test/test_optimize_random_search.rb
138
150
  - test/test_optimize_search_space.rb
151
+ - test/test_sax.rb
139
152
  - test/test_statistics.rb
153
+ - test/test_string_distance.rb
140
154
  - test/test_time.rb
141
155
  - test/test_vector.rb
142
156
  - test/test_word_counter.rb