feldtruby 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/R/diffusion_kde.R +116 -0
- data/lib/feldtruby/array/basic_stats.rb +51 -0
- data/lib/feldtruby/array.rb +4 -0
- data/lib/feldtruby/statistics/distance/string_distance.rb +49 -0
- data/lib/feldtruby/statistics/euclidean_distance.rb +14 -0
- data/lib/feldtruby/statistics/fastmap.rb +106 -0
- data/lib/feldtruby/statistics/normalization.rb +26 -0
- data/lib/feldtruby/statistics/time_series/sax.rb +99 -0
- data/lib/feldtruby/statistics.rb +48 -0
- data/lib/feldtruby/version.rb +1 -1
- data/lib/feldtruby.rb +3 -1
- data/test/test_array.rb +8 -1
- data/test/test_array_basic_stats.rb +32 -0
- data/test/test_fastmap.rb +22 -0
- data/test/test_normalization.rb +36 -0
- data/test/test_sax.rb +48 -0
- data/test/test_statistics.rb +13 -0
- data/test/test_string_distance.rb +24 -0
- metadata +17 -3
data/R/diffusion_kde.R
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# Gaussian KDE downloaded from
|
2
|
+
# http://www-etud.iro.umontreal.ca/~botev/kde.R
|
3
|
+
# on February 9th 2013.
|
4
|
+
# Our changes:
|
5
|
+
# - Renamed function from kde to diffusion.kde
|
6
|
+
# - Return an object instead of a matrix
|
7
|
+
# - Added the sum of the densities
|
8
|
+
# - Added the mesh interval
|
9
|
+
# - Added the probabilities for each interval
|
10
|
+
# No license was specified on this piece of code. For questions about it please contact
|
11
|
+
# Prof. Botev on the email: botev@maths.uq.edu.au
|
12
|
+
diffusion.kde <- function(data,n,MIN,MAX){
|
13
|
+
# State-of-the-art gaussian kernel density estimator for one-dimensional data;
|
14
|
+
# The estimator does not use the commonly employed 'gaussian rule of thumb'.
|
15
|
+
# As a result it outperforms many plug-in methods on multimodal densities
|
16
|
+
# with widely separated modes (see example).
|
17
|
+
# INPUTS:
|
18
|
+
# data - a vector of data from which the density estimate is constructed;
|
19
|
+
# n - the number of mesh points used in the uniform discretization of the
|
20
|
+
# interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
|
21
|
+
# n is rounded up to the next power of two; the default value of n is n=2^14;
|
22
|
+
# MIN, MAX - defines the interval [MIN,MAX] on which the density estimate is constructed;
|
23
|
+
# the default values of MIN and MAX are:
|
24
|
+
# MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
|
25
|
+
# OUTPUT:
|
26
|
+
# matrix 'out' with two rows of length 'n', where out[2,]
|
27
|
+
# are the density values on the mesh out[1,];
|
28
|
+
# EXAMPLE:
|
29
|
+
##Save this file in your directory as kde.R and copy and paste the commands:
|
30
|
+
# rm(list=ls())
|
31
|
+
# source(file='kde.r')
|
32
|
+
# data=c(rnorm(10^3),rnorm(10^3)*2+30);
|
33
|
+
# d=kde(data)
|
34
|
+
# plot(d[1,],d[2,],type='l',xlab='x',ylab='density f(x)')
|
35
|
+
|
36
|
+
# REFERENCE:
|
37
|
+
# Z. I. Botev, J. F. Grotowski and D. P. Kroese
|
38
|
+
# "Kernel Density Estimation Via Diffusion"
|
39
|
+
# Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
|
40
|
+
# for questions email: botev@maths.uq.edu.au
|
41
|
+
|
42
|
+
nargin=length(as.list(match.call()))-1;
|
43
|
+
if (nargin<2) n=2^14
|
44
|
+
n=2^ceiling(log2(n)); # round up n to the next power of 2;
|
45
|
+
if (nargin<4)
|
46
|
+
{# define the default interval [MIN,MAX]
|
47
|
+
minimum=min(data); maximum=max(data);
|
48
|
+
Range=maximum-minimum;
|
49
|
+
MIN=minimum-Range/10; MAX=maximum+Range/10;
|
50
|
+
}
|
51
|
+
# set up the grid over which the density estimate is computed;
|
52
|
+
R=MAX-MIN; dx=R/n; xmesh=MIN+seq(0,R,dx); N=length(data);
|
53
|
+
# if data has repeated observations use the N below
|
54
|
+
# N=length(as.numeric(names(table(data))));
|
55
|
+
# bin the data uniformly using the grid defined above;
|
56
|
+
w=hist(data,xmesh,plot=FALSE);initial_data=(w$counts)/N;
|
57
|
+
initial_data=initial_data/sum(initial_data);
|
58
|
+
|
59
|
+
dct1d <- function(data){
|
60
|
+
# computes the discrete cosine transform of the column vector data
|
61
|
+
n= length(data);
|
62
|
+
# Compute weights to multiply DFT coefficients
|
63
|
+
weight = c(1,2*exp(-1i*(1:(n-1))*pi/(2*n)));
|
64
|
+
# Re-order the elements of the columns of x
|
65
|
+
data = c(data[seq(1,n-1,2)], data[seq(n,2,-2)]);
|
66
|
+
# Multiply FFT by weights:
|
67
|
+
data= Re(weight* fft(data));
|
68
|
+
data}
|
69
|
+
|
70
|
+
a=dct1d(initial_data); # discrete cosine transform of initial data
|
71
|
+
# now compute the optimal bandwidth^2 using the referenced method
|
72
|
+
I=(1:(n-1))^2; a2=(a[2:n]/2)^2;
|
73
|
+
# use fzero to solve the equation t=zeta*gamma^[5](t)
|
74
|
+
|
75
|
+
fixed_point <- function(t,N,I,a2){
|
76
|
+
# this implements the function t-zeta*gamma^[l](t)
|
77
|
+
l=7;
|
78
|
+
f=2*(pi^(2*l))*sum((I^l)*a2*exp(-I*(pi^2)*t));
|
79
|
+
for (s in (l-1):2){
|
80
|
+
|
81
|
+
K0=prod(seq(1,2*s-1,2))/sqrt(2*pi); const=(1+(1/2)^(s+1/2))/3;
|
82
|
+
time=(2*const*K0/N/f)^(2/(3+2*s));
|
83
|
+
f=2*pi^(2*s)*sum(I^s*a2*exp(-I*pi^2*time));
|
84
|
+
}
|
85
|
+
out=t-(2*N*sqrt(pi)*f)^(-2/5);
|
86
|
+
}
|
87
|
+
|
88
|
+
t_star=tryCatch(uniroot(fixed_point,c(0,.1),N=N,I=I,a2=a2,tol=10^(-14))$root,error=function(e) .28*N^(-2/5));
|
89
|
+
# smooth the discrete cosine transform of initial data using t_star
|
90
|
+
a_t=a*exp(-(0:(n-1))^2*pi^2*t_star/2);
|
91
|
+
# now apply the inverse discrete cosine transform
|
92
|
+
|
93
|
+
idct1d <- function(data){
|
94
|
+
# computes the inverse discrete cosine transform
|
95
|
+
n=length(data);
|
96
|
+
# Compute weights
|
97
|
+
weights = n*exp(1i*(0:(n-1))*pi/(2*n));
|
98
|
+
# Compute x tilde using equation (5.93) in Jain
|
99
|
+
data = Re(fft(weights*data,inverse=TRUE))/n;
|
100
|
+
# Re-order elements of each column according to equations (5.93) and
|
101
|
+
# (5.94) in Jain
|
102
|
+
out = rep(0,n);
|
103
|
+
out[seq(1,n,2)] = data[1:(n/2)];
|
104
|
+
out[seq(2,n,2)] = data[n:(n/2+1)];
|
105
|
+
out;
|
106
|
+
}
|
107
|
+
|
108
|
+
density=idct1d(a_t)/R;
|
109
|
+
# take the rescaling of the data into account
|
110
|
+
bandwidth=sqrt(t_star)*R;
|
111
|
+
xmesh=seq(MIN,MAX,R/(n-1));
|
112
|
+
# out=matrix(c(xmesh,density),nrow=2,byrow=TRUE);
|
113
|
+
posd = density + abs(min(0.0, min(density))); # Ensure least density is 0.0 before calcing probabilities
|
114
|
+
list(probabilities = (posd / sum(posd)), densities = density, mesh = xmesh,
|
115
|
+
sum_density = sum(density), mesh_interval = (R/(n-1)), min = MIN, max = MAX)
|
116
|
+
}
|
@@ -21,6 +21,46 @@ module BasicStatistics
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
+
# Calculate the values that cut the data into 0%, 25%, 50%, 75% and 100%.
|
25
|
+
# This corresponds to the min, 1st quartile, 2nd quartile, 3rd quartile and the max.
|
26
|
+
def quantiles
|
27
|
+
return [nil, nil, nil, nil, nil] if length == 0
|
28
|
+
sorted = self.sort
|
29
|
+
q1 = sorted.quantile_at_ratio(0.25)
|
30
|
+
q2 = sorted.quantile_at_ratio(0.50)
|
31
|
+
q3 = sorted.quantile_at_ratio(0.75)
|
32
|
+
return sorted.first, q1, q2, q3, sorted.last
|
33
|
+
end
|
34
|
+
|
35
|
+
# Calculate the quantile at a given ratio (must be between 0.0 and 1.0) assuming self
|
36
|
+
# is a sorted array. This is based on the type 7 quantile function in R.
|
37
|
+
def quantile_at_ratio(p)
|
38
|
+
n = self.length
|
39
|
+
h = (n - 1) * p + 1
|
40
|
+
hfloor = h.floor
|
41
|
+
if h == hfloor
|
42
|
+
self[hfloor-1]
|
43
|
+
else
|
44
|
+
x_hfloor = self[hfloor-1]
|
45
|
+
x_hfloor + (h - hfloor)*(self[hfloor] - x_hfloor)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Calculate the three quartiles of the array.
|
50
|
+
def quartiles
|
51
|
+
return [nil, nil, nil] if length == 0
|
52
|
+
sorted = self.sort
|
53
|
+
q1 = sorted.quantile_at_ratio(0.25)
|
54
|
+
q2 = sorted.quantile_at_ratio(0.50)
|
55
|
+
q3 = sorted.quantile_at_ratio(0.75)
|
56
|
+
return q1, q2, q3
|
57
|
+
end
|
58
|
+
|
59
|
+
def inter_quartile_range
|
60
|
+
q1, q2, q3 = quartiles
|
61
|
+
q3 - q1
|
62
|
+
end
|
63
|
+
|
24
64
|
def variance
|
25
65
|
return 0 if self.length == 0
|
26
66
|
avg = self.mean
|
@@ -31,6 +71,17 @@ module BasicStatistics
|
|
31
71
|
Math.sqrt( self.variance )
|
32
72
|
end
|
33
73
|
|
74
|
+
# Same as R's var, i.e. uses N-1 in denominator.
|
75
|
+
def var
|
76
|
+
n = self.length.to_f
|
77
|
+
(variance * n) / (n-1)
|
78
|
+
end
|
79
|
+
|
80
|
+
# Same as R's sd, i.e. uses N-1 in denominator.
|
81
|
+
def sd
|
82
|
+
Math.sqrt( self.var )
|
83
|
+
end
|
84
|
+
|
34
85
|
def root_mean_square
|
35
86
|
Math.sqrt( self.map {|v| v**2}.mean )
|
36
87
|
end
|
data/lib/feldtruby/array.rb
CHANGED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module FeldtRuby::Statistics
|
4
|
+
|
5
|
+
class StringDistance
|
6
|
+
def compress(s)
|
7
|
+
Zlib::Deflate.deflate(s, 9)
|
8
|
+
end
|
9
|
+
|
10
|
+
def compressed_length(s)
|
11
|
+
compress(s).length
|
12
|
+
end
|
13
|
+
|
14
|
+
def distance(string1, string2)
|
15
|
+
raise NotImplementedError
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
# Cilibrasi and Vitanyi's NCD.
|
20
|
+
class NormalizedCompressionDistance < StringDistance
|
21
|
+
def distance(string1, string2)
|
22
|
+
return 0.0 if string1 == string2
|
23
|
+
c1 = compressed_length(string1)
|
24
|
+
c2 = compressed_length(string2)
|
25
|
+
c_1_2 = compressed_length(string1 + string2)
|
26
|
+
(c_1_2 - [c1, c2].min).to_f / ([c1, c2].max)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def ncd(string1, string2)
|
31
|
+
(@ncd ||= NormalizedCompressionDistance.new).distance(string1, string2)
|
32
|
+
end
|
33
|
+
|
34
|
+
# Keogh et al's CDM.
|
35
|
+
class CompressionBasedDissimilarityMeasure < StringDistance
|
36
|
+
def distance(string1, string2)
|
37
|
+
return 0.0 if string1 == string2
|
38
|
+
c1 = compressed_length(string1)
|
39
|
+
c2 = compressed_length(string2)
|
40
|
+
c_1_2 = compressed_length(string1 + string2)
|
41
|
+
c_1_2.to_f / (c1 + c2)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def cdm(string1, string2)
|
46
|
+
(@cdm ||= CompressionBasedDissimilarityMeasure.new).distance(string1, string2)
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'feldtruby/array'
|
2
|
+
|
3
|
+
module FeldtRuby
|
4
|
+
|
5
|
+
class FastMap
|
6
|
+
# A PivotNode has two pivot objects, a map from each object to its
|
7
|
+
# coordinate on the line for these pivots, a distance function and
|
8
|
+
# a child pointing to the next dimension.
|
9
|
+
# It maps a multi-variate object to a k-dimensional coordinate.
|
10
|
+
class PivotNode
|
11
|
+
attr_writer :map, :child
|
12
|
+
|
13
|
+
def initialize(distance, pivot1, pivot2, map = nil, child = nil)
|
14
|
+
@distance, @pivot1, @pivot2, @map, @child = distance, pivot1, pivot2, map, child
|
15
|
+
@d_1_2 = distance.calc(pivot1, pivot2)
|
16
|
+
@d_1_2_squared, @d_1_2_doubled = @d_1_2 * @d_1_2, 2 * @d_1_2
|
17
|
+
end
|
18
|
+
|
19
|
+
# The number of coordinates that will be returned for an object.
|
20
|
+
def k; depth; end
|
21
|
+
def depth
|
22
|
+
@depth ||= 1 + (@child ? @child.depth : 0)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Map an object to its coordinate in the dimension represented by this node.
|
26
|
+
def fastmap_coordinate(o)
|
27
|
+
( @distance.calc(o, @pivot1) + @d_1_2_squared - @distance.calc(o, @pivot2) ) / @d_1_2_doubled
|
28
|
+
end
|
29
|
+
|
30
|
+
def coordinate(o)
|
31
|
+
[map_object_to_coordinate(o)] + (@child ? @child.coordinate(o) : [])
|
32
|
+
end
|
33
|
+
|
34
|
+
def [](object)
|
35
|
+
coordinate(object)
|
36
|
+
end
|
37
|
+
|
38
|
+
def map_object_to_coordinate(o)
|
39
|
+
@map[o] || fastmap_coordinate(o)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def initialize(distance, k = 2, choiceDepth = 1)
|
44
|
+
@distance, @k, @choice_depth = distance, k, choiceDepth
|
45
|
+
end
|
46
|
+
|
47
|
+
def run(objects)
|
48
|
+
@objects = objects
|
49
|
+
create_map(@k, @distance)
|
50
|
+
end
|
51
|
+
|
52
|
+
def create_map(k, distance)
|
53
|
+
return nil if k == 0
|
54
|
+
o1, o2 = choose_distant_objects(@objects, @distance)
|
55
|
+
node = PivotNode.new(distance, o1, o2)
|
56
|
+
coordinate_map = {}
|
57
|
+
if distance.calc(o1, o2) == 0.0
|
58
|
+
@objects.each {|o| coordinate_map[o] = 0.0}
|
59
|
+
else
|
60
|
+
@objects.each {|o| coordinate_map[o] = node.fastmap_coordinate(o)}
|
61
|
+
end
|
62
|
+
node.map = coordinate_map
|
63
|
+
node.child = create_map k-1, next_distance(distance, o1, o2, coordinate_map)
|
64
|
+
node
|
65
|
+
end
|
66
|
+
|
67
|
+
def choose_distant_objects(objects, distance)
|
68
|
+
o1 = nil
|
69
|
+
o2 = objects.sample
|
70
|
+
# Not sure if there is any benefit to doing this more than once. Test later.
|
71
|
+
@choice_depth.times do
|
72
|
+
o1 = find_most_distant_object(objects, o2, distance)
|
73
|
+
o2 = find_most_distant_object(objects, o1, distance)
|
74
|
+
end
|
75
|
+
return o1, o2
|
76
|
+
end
|
77
|
+
|
78
|
+
# Find the object in objects that is farthest from o, given a distance function.
|
79
|
+
def find_most_distant_object(objects, o, distance)
|
80
|
+
objects.sort_by {|oi| distance.calc(oi, o)}.last
|
81
|
+
end
|
82
|
+
|
83
|
+
class DistanceFunction
|
84
|
+
def initialize(&func)
|
85
|
+
@func = func
|
86
|
+
end
|
87
|
+
def calc(o1, o2)
|
88
|
+
@func.call(o1, o2)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Create the next distance function from a given distance func.
|
93
|
+
def next_distance(distance, o1, o2, coordinates)
|
94
|
+
DistanceFunction.new do |oi, oj|
|
95
|
+
Math.sqrt( distance.calc(oi, oj)**2 - (coordinates[oi] - coordinates[oj])**2 )
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
# Recursively map n-dimensional objects (given as an Array) into a k-dimensional
|
101
|
+
# space while preserving the distances between the objects as well as possible.
|
102
|
+
def self.fastmap(objects, distance, k = 2)
|
103
|
+
FastMap.new(distance, k).run(objects)
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'feldtruby/array/basic_stats'
|
2
|
+
|
3
|
+
# The normalization methods assume the existence of basic statistics
|
4
|
+
# on the class they are included in:
|
5
|
+
# z_normalize: requires mean and sd
|
6
|
+
module FeldtRuby::Normalization
|
7
|
+
def normalize(&transform)
|
8
|
+
self.map {|v| transform.call(v)}
|
9
|
+
end
|
10
|
+
|
11
|
+
def z_normalize
|
12
|
+
mean, stdev = self.mean, self.sd
|
13
|
+
self.map {|e| (e-mean)/stdev}
|
14
|
+
end
|
15
|
+
|
16
|
+
def min_max_normalize
|
17
|
+
return [] if self.length == 0
|
18
|
+
min = self.min.to_f
|
19
|
+
range = self.max - min
|
20
|
+
self.map {|e| (e-min)/range}
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
class Array
|
25
|
+
include FeldtRuby::Normalization
|
26
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'feldtruby/statistics/normalization'
|
2
|
+
|
3
|
+
# Implements the basic SAX (Symbolic Aggregate approXimation) from the paper:
|
4
|
+
# Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu,
|
5
|
+
# "A Symbolic Representation of Time Series, with Implications for Streaming Algorithms", IDMKD 2003.
|
6
|
+
# available from: http://www.cs.ucr.edu/~eamonn/SAX.pdf
|
7
|
+
module FeldtRuby::Statistics
|
8
|
+
|
9
|
+
# A SAX processor transforms any numeric stream of data (often a time series)
|
10
|
+
# of arbitrary length n to a string (symbolic stream) of arbitrary length w,
|
11
|
+
# where w<n, and typically w<<n. The alphabet size (symbols in the string) is
|
12
|
+
# also an integer _a_ with 2 <= a <= 20. Compared to the SAX described by Keogh et
|
13
|
+
# al we state the number of data elements, _elementsPerWord_, that should go
|
14
|
+
# into each word, i.e. w = n/elementsPerWord.
|
15
|
+
# This allows for many powerful data mining algorithms to be applied and sped up.
|
16
|
+
class SAX
|
17
|
+
# Create a SAX processor with the given number of data elements per word and alphabet size _a_.
|
18
|
+
def initialize(elementsPerWord, alphabetSize = 6)
|
19
|
+
raise ArgumentError if alphabetSize > 20 || alphabetSize < 2
|
20
|
+
@elements_per_word, @alphabet_size = elementsPerWord, alphabetSize
|
21
|
+
end
|
22
|
+
|
23
|
+
# A mapper maps the values in a subsequence into a symbol. The standard
|
24
|
+
# mapper is state-less and normalizes each subsequence and then assumes
|
25
|
+
# a normal distribution and thus uses a fixed selection of bins.
|
26
|
+
class SymbolMapper
|
27
|
+
def initialize(data = nil)
|
28
|
+
# This standard mapper does not utilize the whole data sequence to precalc mapping values. But subclasses might.
|
29
|
+
end
|
30
|
+
|
31
|
+
# Cut points based on a Normal/Gaussian distribution...
|
32
|
+
NormalDistCutPoints = {
|
33
|
+
2 => [-Float::INFINITY, 0.00],
|
34
|
+
3 => [-Float::INFINITY, -0.43, 0.43],
|
35
|
+
4 => [-Float::INFINITY, -0.67, 0.00, 0.67],
|
36
|
+
5 => [-Float::INFINITY, -0.84, -0.25, 0.25, 0.84],
|
37
|
+
6 => [-Float::INFINITY, -0.97, -0.43, 0.00, 0.43, 0.97],
|
38
|
+
7 => [-Float::INFINITY, -1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
|
39
|
+
8 => [-Float::INFINITY, -1.15, -0.67, -0.32, 0.00, 0.32, 0.67, 1.15],
|
40
|
+
9 => [-Float::INFINITY, -1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
|
41
|
+
10 => [-Float::INFINITY, -1.28, -0.84, -0.52, -0.25, 0.00, 0.25, 0.52, 0.84, 1.28],
|
42
|
+
11 => [-Float::INFINITY, -1.34, -0.91, -0.60, -0.35, -0.11, 0.11, 0.35, 0.60, 0.91, 1.34],
|
43
|
+
12 => [-Float::INFINITY, -1.38, -0.97, -0.67, -0.43, -0.21, 0.00, 0.21, 0.43, 0.67, 0.97, 1.38],
|
44
|
+
13 => [-Float::INFINITY, -1.43, -1.02, -0.74, -0.50, -0.29, -0.10, 0.10, 0.29, 0.50, 0.74, 1.02, 1.43],
|
45
|
+
14 => [-Float::INFINITY, -1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0.00, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
|
46
|
+
15 => [-Float::INFINITY, -1.5 , -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.50],
|
47
|
+
16 => [-Float::INFINITY, -1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0.00, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
|
48
|
+
17 => [-Float::INFINITY, -1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
|
49
|
+
18 => [-Float::INFINITY, -1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0.00, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
|
50
|
+
19 => [-Float::INFINITY, -1.62, -1.25, -1.00, -0.80, -0.63, -0.48, -0.34, -0.20, -0.07, 0.07, 0.20, 0.34, 0.48, 0.63, 0.80, 1.0, 1.25, 1.62],
|
51
|
+
20 => [-Float::INFINITY, -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0.00, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
|
52
|
+
}
|
53
|
+
|
54
|
+
def supports_alphabet_size?(size)
|
55
|
+
NormalDistCutPoints.keys.include? size
|
56
|
+
end
|
57
|
+
|
58
|
+
def map_sequence_to_symbol(sequence, alphabet_size)
|
59
|
+
symbol_for_value(sequence.mean, alphabet_size)
|
60
|
+
end
|
61
|
+
|
62
|
+
def symbol_for_value(value, alphabet_size)
|
63
|
+
NormalDistCutPoints[alphabet_size].inject(0) do |symbol, cutpoint|
|
64
|
+
return symbol if cutpoint > value
|
65
|
+
symbol + 1
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def setup_for_processing_data(data, mapper = nil)
|
71
|
+
@mapper ||= SymbolMapper.new(data)
|
72
|
+
unless @mapper.supports_alphabet_size?(@alphabet_size)
|
73
|
+
raise ArgumentError.new("Mapper does not support the alphabet size (#{@alphabet_size}): #{@mapper}")
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def process_subsequence(subsequence)
|
78
|
+
normalized_ss = subsequence.z_normalize
|
79
|
+
len, rem = normalized_ss.length.divmod @elements_per_word
|
80
|
+
# Note that if the lengths are not evenly divisible the last word will be based on fewer elements.
|
81
|
+
# This is different than the orig SAX as specified in their paper.
|
82
|
+
symbols = (0...len).map do |wordindex|
|
83
|
+
@mapper.map_sequence_to_symbol(normalized_ss[wordindex * @elements_per_word, @elements_per_word], @alphabet_size)
|
84
|
+
end
|
85
|
+
symbols << @mapper.map_sequence_to_symbol(normalized_ss[len, @elements_per_word], @alphabet_size) if rem > 0
|
86
|
+
symbols
|
87
|
+
end
|
88
|
+
|
89
|
+
def process(data, windowSize = data.length, mapper = nil)
|
90
|
+
setup_for_processing_data(data, mapper)
|
91
|
+
res = (0..(data.length - windowSize)).map do |i|
|
92
|
+
process_subsequence(data[i, windowSize])
|
93
|
+
end
|
94
|
+
res = res.flatten if windowSize == data.length
|
95
|
+
res
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
data/lib/feldtruby/statistics.rb
CHANGED
@@ -26,6 +26,15 @@ class RCommunicator
|
|
26
26
|
@r.eval "if(!library(#{lib}, logical.return=TRUE)) {install.packages(\"#{lib}\"); library(#{lib});}"
|
27
27
|
end
|
28
28
|
|
29
|
+
# Load R scripts in the feldtruby/R directory.
|
30
|
+
def load_feldtruby_r_script(scriptName, reload = false)
|
31
|
+
@loaded_scripts ||= Array.new # Ensure there is an empty array for loaded script names, if this is first call here.
|
32
|
+
return if reload == false && @loaded_scripts.include?(scriptName)
|
33
|
+
@loaded_scripts << scriptName
|
34
|
+
path = File.join(FeldtRuby::TopDirectory, "R", scriptName)
|
35
|
+
@r.eval "source(\"#{path}\")"
|
36
|
+
end
|
37
|
+
|
29
38
|
def eval(str)
|
30
39
|
@r.eval str
|
31
40
|
end
|
@@ -103,6 +112,45 @@ module Statistics
|
|
103
112
|
res = RC.call("chisq.test", vs)
|
104
113
|
res.p_value
|
105
114
|
end
|
115
|
+
|
116
|
+
class DiffusionKDE
|
117
|
+
attr_reader :densities, :mesh
|
118
|
+
|
119
|
+
# Given an R object with the sub-values named probabilities, densities, mesh, sum_density, mesh_interval, min, max
|
120
|
+
# we can calculate the probability of new values.
|
121
|
+
def initialize(rvalue)
|
122
|
+
@probabilities = rvalue.probabilities
|
123
|
+
@densities = rvalue.densities
|
124
|
+
@mesh = rvalue.mesh
|
125
|
+
@mesh_interval = rvalue.mesh_interval.to_f
|
126
|
+
@min, @max = rvalue.min.to_f, rvalue.max.to_f
|
127
|
+
end
|
128
|
+
|
129
|
+
def density_of(value)
|
130
|
+
return 0.0 if value < @min || value > @max
|
131
|
+
bin_index = ((value - @min) / @mesh_interval).floor
|
132
|
+
@densities[bin_index]
|
133
|
+
end
|
134
|
+
|
135
|
+
def probability_of(value)
|
136
|
+
return 0.0 if value < @min || value > @max
|
137
|
+
bin_index = ((value - @min) / @mesh_interval).floor
|
138
|
+
@probabilities[bin_index]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Do a kernel density estimation based on the sampled _values_, with n bins (rounded up to nearest power of 2)
|
143
|
+
# and optional min and max values.
|
144
|
+
def density_estimation(values, n = 2**9, min = nil, max = nil)
|
145
|
+
# Ensure we have loaded the diffusion.kde code
|
146
|
+
RC.load_feldtruby_r_script("diffusion_kde.R")
|
147
|
+
args = [values, n]
|
148
|
+
if min && max
|
149
|
+
args << min
|
150
|
+
args << max
|
151
|
+
end
|
152
|
+
DiffusionKDE.new RC.call("diffusion.kde", *args)
|
153
|
+
end
|
106
154
|
end
|
107
155
|
|
108
156
|
# Make them available at top level
|
data/lib/feldtruby/version.rb
CHANGED
data/lib/feldtruby.rb
CHANGED
data/test/test_array.rb
CHANGED
@@ -100,4 +100,11 @@ describe "Array extensions" do
|
|
100
100
|
counts[5].must_equal 5
|
101
101
|
end
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
|
+
describe "sample" do
|
105
|
+
it "only samples within the array" do
|
106
|
+
d = (1..100).to_a
|
107
|
+
100.times { d.include?(d.sample).must_equal(true) }
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -60,6 +60,14 @@ class TestArrayBasicStats < MiniTest::Unit::TestCase
|
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
|
+
describe "mean and stdev" do
|
64
|
+
it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
|
65
|
+
data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
|
66
|
+
data.mean.must_be_close_to 4.606667
|
67
|
+
data.sd.must_be_close_to 2.640316
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
63
71
|
describe "Basic statistics" do
|
64
72
|
describe "sum of abs" do
|
65
73
|
it "works for simple example" do
|
@@ -131,4 +139,28 @@ describe "Basic statistics" do
|
|
131
139
|
[1,2,3,4].summary_stats.must_equal "2.500 (min = 1.0, max = 4.0, median = 2.5, stdev = 1.12)"
|
132
140
|
end
|
133
141
|
end
|
142
|
+
|
143
|
+
describe "quantile- and quartile-related functionality" do
|
144
|
+
it "can calc quantiles, quartiles and IQR for the set used as example for even-numbered sequence for quantiles on Wikipedia" do
|
145
|
+
seq = [3, 6, 7, 8, 8, 10, 13, 15, 16, 20]
|
146
|
+
seq.quartiles.must_equal [7.25, 9, 14.5]
|
147
|
+
seq.quantiles.must_equal [3, 7.25, 9, 14.5, 20]
|
148
|
+
seq.inter_quartile_range.must_equal (14.5-7.25)
|
149
|
+
end
|
150
|
+
|
151
|
+
it "can calc quantiles, quartiles and IQR for the set used as example for odd-numbered sequence for quantiles on Wikipedia" do
|
152
|
+
seq = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20]
|
153
|
+
seq.quartiles.must_equal [7.5, 9, 14]
|
154
|
+
seq.quantiles.must_equal [3, 7.5, 9, 14, 20]
|
155
|
+
seq.inter_quartile_range.must_equal 6.5
|
156
|
+
end
|
157
|
+
|
158
|
+
it "can calc quantiles, quartiles and IQR for the set used as example for quartiles on Wikipedia" do
|
159
|
+
seq = [6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]
|
160
|
+
|
161
|
+
seq.quartiles.must_equal [25.5, 40, 42.5]
|
162
|
+
seq.quantiles.must_equal [6, 25.5, 40.0, 42.5, 49]
|
163
|
+
seq.inter_quartile_range.must_equal 17.0
|
164
|
+
end
|
165
|
+
end
|
134
166
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'feldtruby/statistics/fastmap'
|
2
|
+
require 'feldtruby/statistics/euclidean_distance'
|
3
|
+
|
4
|
+
describe "Fastmap" do
|
5
|
+
it "works for simple data, and different values of k" do
|
6
|
+
d = [
|
7
|
+
[0, 0, 0, 0],
|
8
|
+
[1, 1, 1, 1],
|
9
|
+
[2, 2, 2, 2],
|
10
|
+
[3, 3, 3, 3]
|
11
|
+
]
|
12
|
+
1.upto(d.first.length-1) do |k|
|
13
|
+
m = FeldtRuby.fastmap(d, FeldtRuby::EuclideanDistance.new, k)
|
14
|
+
m.depth.must_equal k
|
15
|
+
d.each do |datum|
|
16
|
+
c = m[datum]
|
17
|
+
c.length.must_equal k
|
18
|
+
c.must_equal m[datum]
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'feldtruby/statistics/normalization'
|
2
|
+
|
3
|
+
class Array
|
4
|
+
def must_be_close_to(other)
|
5
|
+
self.zip(other).map {|a,b| a.must_be_close_to(b)}
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "Z normalization" do
|
10
|
+
it "handles empty arrays" do
|
11
|
+
[].z_normalize.must_equal []
|
12
|
+
end
|
13
|
+
|
14
|
+
it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
|
15
|
+
data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
|
16
|
+
expected = [-0.9796808, -0.8622706, -0.6123005, 0.8496459, 1.739691, 1.588194, 1.095829, 0.5277147, 0.4709033, -0.2865819, 0.0921607, -0.2865819, -0.9039323, -1.195564, -1.237226]
|
17
|
+
data.z_normalize.must_be_close_to expected
|
18
|
+
end
|
19
|
+
|
20
|
+
it "works for Time series 2 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
|
21
|
+
data = [0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]
|
22
|
+
expected = [-1.289433, -0.9992189, -0.5253246, -0.06612478, -0.2791935, 0.08816637, -0.06612478, 0.595123, 0.8926845, 0.8228861, 1.741286, 1.770675, -0.2791935, -1.197593, -1.208614]
|
23
|
+
data.z_normalize.must_be_close_to expected
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "Min-Max normalization" do
|
28
|
+
it "handles empty arrays" do
|
29
|
+
[].min_max_normalize.must_equal []
|
30
|
+
end
|
31
|
+
|
32
|
+
it "works for example from http://wiki.answers.com/Q/What_is_min-max_normalization" do
|
33
|
+
data = [20, 24, 26, 27, 30]
|
34
|
+
data.min_max_normalize.must_be_close_to [0.0, 0.4, 0.6, 0.7, 1.0]
|
35
|
+
end
|
36
|
+
end
|
data/test/test_sax.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'feldtruby/statistics/time_series/sax'
|
2
|
+
include FeldtRuby::Statistics
|
3
|
+
|
4
|
+
describe 'Symbolic Adaptive approXimation - SAX' do
|
5
|
+
describe "The standard SAX SymbolMapper, that uses cut points based on Normal/Gaussian distribution" do
|
6
|
+
it "accepts alphabet sizes between 2 and 20" do
|
7
|
+
sm = SAX::SymbolMapper.new
|
8
|
+
sm.supports_alphabet_size?(-1).must_equal false
|
9
|
+
sm.supports_alphabet_size?(0).must_equal false
|
10
|
+
sm.supports_alphabet_size?(1).must_equal false
|
11
|
+
sm.supports_alphabet_size?(2).must_equal true
|
12
|
+
sm.supports_alphabet_size?(20).must_equal true
|
13
|
+
sm.supports_alphabet_size?(21).must_equal false
|
14
|
+
end
|
15
|
+
|
16
|
+
it "maps correctly to symbols for alphabet of size 2" do
|
17
|
+
sm = SAX::SymbolMapper.new
|
18
|
+
sm.symbol_for_value(-10, 2).must_equal 1
|
19
|
+
sm.symbol_for_value(-1, 2).must_equal 1
|
20
|
+
sm.symbol_for_value(1, 2).must_equal 2
|
21
|
+
sm.symbol_for_value(10, 2).must_equal 2
|
22
|
+
end
|
23
|
+
|
24
|
+
it "maps correctly to symbols for alphabet of size 4" do
|
25
|
+
sm = SAX::SymbolMapper.new
|
26
|
+
sm.symbol_for_value(-0.7, 4).must_equal 1
|
27
|
+
sm.symbol_for_value(-0.5, 4).must_equal 2
|
28
|
+
sm.symbol_for_value(-0.01, 4).must_equal 2
|
29
|
+
sm.symbol_for_value(0, 4).must_equal 3
|
30
|
+
sm.symbol_for_value(0.01, 4).must_equal 3
|
31
|
+
sm.symbol_for_value(0.5, 4).must_equal 3
|
32
|
+
sm.symbol_for_value(0.7, 4).must_equal 4
|
33
|
+
sm.symbol_for_value(17, 4).must_equal 4
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
it "does not accept alphabet sizes larger than 20 or smaller than 2" do
|
38
|
+
proc {SAX.new(10, 21)}.must_raise ArgumentError
|
39
|
+
proc {SAX.new(3, 1)}.must_raise ArgumentError
|
40
|
+
end
|
41
|
+
|
42
|
+
it "maps some simple time series to symbols when directly mapping" do
|
43
|
+
sax = SAX.new(1, 4)
|
44
|
+
sax.process([-1, 0, 1]).must_equal [1,3,4]
|
45
|
+
sax.process([-1, -0.5, 0, 0.5, 1]).must_equal [1,2,3,3,4]
|
46
|
+
sax.process([-1, -0.5, 0, 0.5, 1].reverse).must_equal [1,2,3,3,4].reverse
|
47
|
+
end
|
48
|
+
end
|
data/test/test_statistics.rb
CHANGED
@@ -69,6 +69,19 @@ describe "Statistics" do
|
|
69
69
|
probability_of_same_proportions(([:a] * 570) + ([:b] * 430)).must_be_close_to 5.091e-10
|
70
70
|
end
|
71
71
|
end
|
72
|
+
|
73
|
+
describe "Diffusions Kernel Density Estimation based on R code loaded from the feldtruby R directory" do
|
74
|
+
it "works for simple examples" do
|
75
|
+
data = [1]
|
76
|
+
kde = density_estimation(data, 4, 0.0, 3.0)
|
77
|
+
kde.mesh.must_equal [0.0, 1.0, 2.0, 3.0]
|
78
|
+
kde.densities.length.must_equal 4
|
79
|
+
kde.densities[0].must_be_close_to 0.3912
|
80
|
+
kde.densities[1].must_be_close_to 0.3591
|
81
|
+
kde.densities[2].must_be_close_to 0.3101
|
82
|
+
kde.densities[3].must_be_close_to 0.2728
|
83
|
+
end
|
84
|
+
end
|
72
85
|
end
|
73
86
|
|
74
87
|
require 'feldtruby/minitest_extensions'
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'feldtruby/statistics/distance/string_distance'
|
2
|
+
include FeldtRuby::Statistics
|
3
|
+
|
4
|
+
describe "ncd" do
|
5
|
+
it "gives no distance if the strings are the same" do
|
6
|
+
ncd("aaa", "aaa").must_equal 0.0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "gives distance > 0.0 if strings are not the same" do
|
10
|
+
ncd("a", "b").must_be :>, 0.0
|
11
|
+
ncd("aa", "ab").must_be :>, 0.0
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "cdm" do
|
16
|
+
it "gives no distance if the strings are the same" do
|
17
|
+
cdm("aaa", "aaa").must_equal 0.0
|
18
|
+
end
|
19
|
+
|
20
|
+
it "gives distance > 0.0 if strings are not the same" do
|
21
|
+
cdm("a", "b").must_be :>, 0.0
|
22
|
+
cdm("aa", "ab").must_be :>, 0.0
|
23
|
+
end
|
24
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feldtruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- Gemfile
|
55
55
|
- Gemfile.lock
|
56
56
|
- History.txt
|
57
|
+
- R/diffusion_kde.R
|
57
58
|
- README.md
|
58
59
|
- Rakefile
|
59
60
|
- TODO
|
@@ -77,6 +78,11 @@ files:
|
|
77
78
|
- lib/feldtruby/optimize/search_space.rb
|
78
79
|
- lib/feldtruby/optimize/stdout_logger.rb
|
79
80
|
- lib/feldtruby/statistics.rb
|
81
|
+
- lib/feldtruby/statistics/distance/string_distance.rb
|
82
|
+
- lib/feldtruby/statistics/euclidean_distance.rb
|
83
|
+
- lib/feldtruby/statistics/fastmap.rb
|
84
|
+
- lib/feldtruby/statistics/normalization.rb
|
85
|
+
- lib/feldtruby/statistics/time_series/sax.rb
|
80
86
|
- lib/feldtruby/string/to_iso.rb
|
81
87
|
- lib/feldtruby/time.rb
|
82
88
|
- lib/feldtruby/vector.rb
|
@@ -87,15 +93,19 @@ files:
|
|
87
93
|
- test/test_array.rb
|
88
94
|
- test/test_array_basic_stats.rb
|
89
95
|
- test/test_array_count_by.rb
|
96
|
+
- test/test_fastmap.rb
|
90
97
|
- test/test_float.rb
|
91
98
|
- test/test_html_doc_getter.rb
|
99
|
+
- test/test_normalization.rb
|
92
100
|
- test/test_optimize.rb
|
93
101
|
- test/test_optimize_differential_evolution.rb
|
94
102
|
- test/test_optimize_objective.rb
|
95
103
|
- test/test_optimize_populationbasedoptimizer.rb
|
96
104
|
- test/test_optimize_random_search.rb
|
97
105
|
- test/test_optimize_search_space.rb
|
106
|
+
- test/test_sax.rb
|
98
107
|
- test/test_statistics.rb
|
108
|
+
- test/test_string_distance.rb
|
99
109
|
- test/test_time.rb
|
100
110
|
- test/test_vector.rb
|
101
111
|
- test/test_word_counter.rb
|
@@ -119,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
129
|
version: '0'
|
120
130
|
requirements: []
|
121
131
|
rubyforge_project:
|
122
|
-
rubygems_version: 1.8.
|
132
|
+
rubygems_version: 1.8.25
|
123
133
|
signing_key:
|
124
134
|
specification_version: 3
|
125
135
|
summary: Robert Feldt's Common Ruby Code lib
|
@@ -128,15 +138,19 @@ test_files:
|
|
128
138
|
- test/test_array.rb
|
129
139
|
- test/test_array_basic_stats.rb
|
130
140
|
- test/test_array_count_by.rb
|
141
|
+
- test/test_fastmap.rb
|
131
142
|
- test/test_float.rb
|
132
143
|
- test/test_html_doc_getter.rb
|
144
|
+
- test/test_normalization.rb
|
133
145
|
- test/test_optimize.rb
|
134
146
|
- test/test_optimize_differential_evolution.rb
|
135
147
|
- test/test_optimize_objective.rb
|
136
148
|
- test/test_optimize_populationbasedoptimizer.rb
|
137
149
|
- test/test_optimize_random_search.rb
|
138
150
|
- test/test_optimize_search_space.rb
|
151
|
+
- test/test_sax.rb
|
139
152
|
- test/test_statistics.rb
|
153
|
+
- test/test_string_distance.rb
|
140
154
|
- test/test_time.rb
|
141
155
|
- test/test_vector.rb
|
142
156
|
- test/test_word_counter.rb
|