feldtruby 0.3.6 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- data/R/diffusion_kde.R +116 -0
- data/lib/feldtruby/array/basic_stats.rb +51 -0
- data/lib/feldtruby/array.rb +4 -0
- data/lib/feldtruby/statistics/distance/string_distance.rb +49 -0
- data/lib/feldtruby/statistics/euclidean_distance.rb +14 -0
- data/lib/feldtruby/statistics/fastmap.rb +106 -0
- data/lib/feldtruby/statistics/normalization.rb +26 -0
- data/lib/feldtruby/statistics/time_series/sax.rb +99 -0
- data/lib/feldtruby/statistics.rb +48 -0
- data/lib/feldtruby/version.rb +1 -1
- data/lib/feldtruby.rb +3 -1
- data/test/test_array.rb +8 -1
- data/test/test_array_basic_stats.rb +32 -0
- data/test/test_fastmap.rb +22 -0
- data/test/test_normalization.rb +36 -0
- data/test/test_sax.rb +48 -0
- data/test/test_statistics.rb +13 -0
- data/test/test_string_distance.rb +24 -0
- metadata +17 -3
data/R/diffusion_kde.R
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# Gaussian KDE downloaded from
|
2
|
+
# http://www-etud.iro.umontreal.ca/~botev/kde.R
|
3
|
+
# on February 9th 2013.
|
4
|
+
# Our changes:
|
5
|
+
# - Renamed function from kde to diffusion.kde
|
6
|
+
# - Return an object instead of a matrix
|
7
|
+
# - Added the sum of the densities
|
8
|
+
# - Added the mesh interval
|
9
|
+
# - Added the probabilities for each interval
|
10
|
+
# No license was specified on this piece of code. For questions about it please contact
|
11
|
+
# Prof. Botev on the email: botev@maths.uq.edu.au
|
12
|
+
diffusion.kde <- function(data,n,MIN,MAX){
|
13
|
+
# State-of-the-art gaussian kernel density estimator for one-dimensional data;
|
14
|
+
# The estimator does not use the commonly employed 'gaussian rule of thumb'.
|
15
|
+
# As a result it outperforms many plug-in methods on multimodal densities
|
16
|
+
# with widely separated modes (see example).
|
17
|
+
# INPUTS:
|
18
|
+
# data - a vector of data from which the density estimate is constructed;
|
19
|
+
# n - the number of mesh points used in the uniform discretization of the
|
20
|
+
# interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
|
21
|
+
# n is rounded up to the next power of two; the default value of n is n=2^14;
|
22
|
+
# MIN, MAX - defines the interval [MIN,MAX] on which the density estimate is constructed;
|
23
|
+
# the default values of MIN and MAX are:
|
24
|
+
# MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
|
25
|
+
# OUTPUT:
|
26
|
+
# a list (built at the end of the function); the original kde returned a matrix 'out' with two rows of length 'n', where out[2,]
|
27
|
+
# are the density values on the mesh out[1,];
|
28
|
+
# EXAMPLE:
|
29
|
+
##Save this file in your directory as kde.R and copy and paste the commands:
|
30
|
+
# rm(list=ls())
|
31
|
+
# source(file='kde.r')
|
32
|
+
# data=c(rnorm(10^3),rnorm(10^3)*2+30);
|
33
|
+
# d=kde(data)
|
34
|
+
# plot(d[1,],d[2,],type='l',xlab='x',ylab='density f(x)')
|
35
|
+
|
36
|
+
# REFERENCE:
|
37
|
+
# Z. I. Botev, J. F. Grotowski and D. P. Kroese
|
38
|
+
# "Kernel Density Estimation Via Diffusion"
|
39
|
+
# Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
|
40
|
+
# for questions email: botev@maths.uq.edu.au
|
41
|
+
|
42
|
+
nargin=length(as.list(match.call()))-1;
|
43
|
+
if (nargin<2) n=2^14
|
44
|
+
n=2^ceiling(log2(n)); # round up n to the next power of 2;
|
45
|
+
if (nargin<4)
|
46
|
+
{# define the default interval [MIN,MAX]
|
47
|
+
minimum=min(data); maximum=max(data);
|
48
|
+
Range=maximum-minimum;
|
49
|
+
MIN=minimum-Range/10; MAX=maximum+Range/10;
|
50
|
+
}
|
51
|
+
# set up the grid over which the density estimate is computed;
|
52
|
+
R=MAX-MIN; dx=R/n; xmesh=MIN+seq(0,R,dx); N=length(data);
|
53
|
+
# if data has repeated observations use the N below
|
54
|
+
# N=length(as.numeric(names(table(data))));
|
55
|
+
# bin the data uniformly using the grid defined above;
|
56
|
+
w=hist(data,xmesh,plot=FALSE);initial_data=(w$counts)/N;
|
57
|
+
initial_data=initial_data/sum(initial_data);
|
58
|
+
|
59
|
+
dct1d <- function(data){
|
60
|
+
# computes the discrete cosine transform of the column vector data
|
61
|
+
n= length(data);
|
62
|
+
# Compute weights to multiply DFT coefficients
|
63
|
+
weight = c(1,2*exp(-1i*(1:(n-1))*pi/(2*n)));
|
64
|
+
# Re-order the elements of the columns of x
|
65
|
+
data = c(data[seq(1,n-1,2)], data[seq(n,2,-2)]);
|
66
|
+
# Multiply FFT by weights:
|
67
|
+
data= Re(weight* fft(data));
|
68
|
+
data}
|
69
|
+
|
70
|
+
a=dct1d(initial_data); # discrete cosine transform of initial data
|
71
|
+
# now compute the optimal bandwidth^2 using the referenced method
|
72
|
+
I=(1:(n-1))^2; a2=(a[2:n]/2)^2;
|
73
|
+
# use uniroot to solve the equation t=zeta*gamma^[5](t)
|
74
|
+
|
75
|
+
fixed_point <- function(t,N,I,a2){
|
76
|
+
# this implements the function t-zeta*gamma^[l](t)
|
77
|
+
l=7;
|
78
|
+
f=2*(pi^(2*l))*sum((I^l)*a2*exp(-I*(pi^2)*t));
|
79
|
+
for (s in (l-1):2){
|
80
|
+
|
81
|
+
K0=prod(seq(1,2*s-1,2))/sqrt(2*pi); const=(1+(1/2)^(s+1/2))/3;
|
82
|
+
time=(2*const*K0/N/f)^(2/(3+2*s));
|
83
|
+
f=2*pi^(2*s)*sum(I^s*a2*exp(-I*pi^2*time));
|
84
|
+
}
|
85
|
+
out=t-(2*N*sqrt(pi)*f)^(-2/5);
|
86
|
+
}
|
87
|
+
|
88
|
+
t_star=tryCatch(uniroot(fixed_point,c(0,.1),N=N,I=I,a2=a2,tol=10^(-14))$root,error=function(e) .28*N^(-2/5));
|
89
|
+
# smooth the discrete cosine transform of initial data using t_star
|
90
|
+
a_t=a*exp(-(0:(n-1))^2*pi^2*t_star/2);
|
91
|
+
# now apply the inverse discrete cosine transform
|
92
|
+
|
93
|
+
idct1d <- function(data){
|
94
|
+
# computes the inverse discrete cosine transform
|
95
|
+
n=length(data);
|
96
|
+
# Compute weights
|
97
|
+
weights = n*exp(1i*(0:(n-1))*pi/(2*n));
|
98
|
+
# Compute x tilde using equation (5.93) in Jain
|
99
|
+
data = Re(fft(weights*data,inverse=TRUE))/n;
|
100
|
+
# Re-order elements of each column according to equations (5.93) and
|
101
|
+
# (5.94) in Jain
|
102
|
+
out = rep(0,n);
|
103
|
+
out[seq(1,n,2)] = data[1:(n/2)];
|
104
|
+
out[seq(2,n,2)] = data[n:(n/2+1)];
|
105
|
+
out;
|
106
|
+
}
|
107
|
+
|
108
|
+
density=idct1d(a_t)/R;
|
109
|
+
# take the rescaling of the data into account
|
110
|
+
bandwidth=sqrt(t_star)*R;
|
111
|
+
xmesh=seq(MIN,MAX,R/(n-1));
|
112
|
+
# out=matrix(c(xmesh,density),nrow=2,byrow=TRUE);
|
113
|
+
posd = density + abs(min(0.0, min(density))); # Ensure least density is 0.0 before calcing probabilities
|
114
|
+
list(probabilities = (posd / sum(posd)), densities = density, mesh = xmesh,
|
115
|
+
sum_density = sum(density), mesh_interval = (R/(n-1)), min = MIN, max = MAX)
|
116
|
+
}
|
@@ -21,6 +21,46 @@ module BasicStatistics
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
+
# Calculate the values that cut the data into 0%, 25%, 50%, 75% and 100%.
|
25
|
+
# This corresponds to the min, 1st quartile, 2nd quartile, 3rd quartile and the max.
|
26
|
+
def quantiles
|
27
|
+
return [nil, nil, nil, nil, nil] if length == 0
|
28
|
+
sorted = self.sort
|
29
|
+
q1 = sorted.quantile_at_ratio(0.25)
|
30
|
+
q2 = sorted.quantile_at_ratio(0.50)
|
31
|
+
q3 = sorted.quantile_at_ratio(0.75)
|
32
|
+
return sorted.first, q1, q2, q3, sorted.last
|
33
|
+
end
|
34
|
+
|
35
|
+
# Calculate the quantile at a given ratio (must be between 0.0 and 1.0) assuming self
|
36
|
+
# is a sorted array. This is based on the type 7 quantile function in R.
|
37
|
+
def quantile_at_ratio(p)
|
38
|
+
n = self.length
|
39
|
+
h = (n - 1) * p + 1
|
40
|
+
hfloor = h.floor
|
41
|
+
if h == hfloor
|
42
|
+
self[hfloor-1]
|
43
|
+
else
|
44
|
+
x_hfloor = self[hfloor-1]
|
45
|
+
x_hfloor + (h - hfloor)*(self[hfloor] - x_hfloor)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Calculate the three quartiles of the array.
|
50
|
+
def quartiles
|
51
|
+
return [nil, nil, nil] if length == 0
|
52
|
+
sorted = self.sort
|
53
|
+
q1 = sorted.quantile_at_ratio(0.25)
|
54
|
+
q2 = sorted.quantile_at_ratio(0.50)
|
55
|
+
q3 = sorted.quantile_at_ratio(0.75)
|
56
|
+
return q1, q2, q3
|
57
|
+
end
|
58
|
+
|
59
|
+
def inter_quartile_range
|
60
|
+
q1, q2, q3 = quartiles
|
61
|
+
q3 - q1
|
62
|
+
end
|
63
|
+
|
24
64
|
def variance
|
25
65
|
return 0 if self.length == 0
|
26
66
|
avg = self.mean
|
@@ -31,6 +71,17 @@ module BasicStatistics
|
|
31
71
|
Math.sqrt( self.variance )
|
32
72
|
end
|
33
73
|
|
74
|
+
# Same as R's var, i.e. uses N-1 in denominator.
|
75
|
+
def var
|
76
|
+
n = self.length.to_f
|
77
|
+
(variance * n) / (n-1)
|
78
|
+
end
|
79
|
+
|
80
|
+
# Same as R's sd, i.e. uses N-1 in denominator.
|
81
|
+
def sd
|
82
|
+
Math.sqrt( self.var )
|
83
|
+
end
|
84
|
+
|
34
85
|
def root_mean_square
|
35
86
|
Math.sqrt( self.map {|v| v**2}.mean )
|
36
87
|
end
|
data/lib/feldtruby/array.rb
CHANGED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module FeldtRuby::Statistics
|
4
|
+
|
5
|
+
class StringDistance
|
6
|
+
def compress(s)
|
7
|
+
Zlib::Deflate.deflate(s, 9)
|
8
|
+
end
|
9
|
+
|
10
|
+
def compressed_length(s)
|
11
|
+
compress(s).length
|
12
|
+
end
|
13
|
+
|
14
|
+
def distance(string1, string2)
|
15
|
+
raise NotImplementedError
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
# Cilibrasi and Vitanyi's NCD.
|
20
|
+
class NormalizedCompressionDistance < StringDistance
|
21
|
+
def distance(string1, string2)
|
22
|
+
return 0.0 if string1 == string2
|
23
|
+
c1 = compressed_length(string1)
|
24
|
+
c2 = compressed_length(string2)
|
25
|
+
c_1_2 = compressed_length(string1 + string2)
|
26
|
+
(c_1_2 - [c1, c2].min).to_f / ([c1, c2].max)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def ncd(string1, string2)
|
31
|
+
(@ncd ||= NormalizedCompressionDistance.new).distance(string1, string2)
|
32
|
+
end
|
33
|
+
|
34
|
+
# Keogh et al's CDM.
|
35
|
+
class CompressionBasedDissimilarityMeasure < StringDistance
|
36
|
+
def distance(string1, string2)
|
37
|
+
return 0.0 if string1 == string2
|
38
|
+
c1 = compressed_length(string1)
|
39
|
+
c2 = compressed_length(string2)
|
40
|
+
c_1_2 = compressed_length(string1 + string2)
|
41
|
+
c_1_2.to_f / (c1 + c2)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def cdm(string1, string2)
|
46
|
+
(@cdm ||= CompressionBasedDissimilarityMeasure.new).distance(string1, string2)
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'feldtruby/array'
|
2
|
+
|
3
|
+
module FeldtRuby
|
4
|
+
|
5
|
+
class FastMap
|
6
|
+
# A PivotNode has two pivot objects, a map from each object to its
|
7
|
+
# coordinate on the line for these pivots, a distance function and
|
8
|
+
# a child pointing to the next dimension.
|
9
|
+
# It maps a multi-variate object to a k-dimensional coordinate.
|
10
|
+
class PivotNode
|
11
|
+
attr_writer :map, :child
|
12
|
+
|
13
|
+
def initialize(distance, pivot1, pivot2, map = nil, child = nil)
|
14
|
+
@distance, @pivot1, @pivot2, @map, @child = distance, pivot1, pivot2, map, child
|
15
|
+
@d_1_2 = distance.calc(pivot1, pivot2)
|
16
|
+
@d_1_2_squared, @d_1_2_doubled = @d_1_2 * @d_1_2, 2 * @d_1_2
|
17
|
+
end
|
18
|
+
|
19
|
+
# The number of coordinates that will be returned for an object.
|
20
|
+
def k; depth; end
|
21
|
+
def depth
|
22
|
+
@depth ||= 1 + (@child ? @child.depth : 0)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Map an object to its coordinate in the dimension represented by this node.
|
26
|
+
def fastmap_coordinate(o)
|
27
|
+
( @distance.calc(o, @pivot1) + @d_1_2_squared - @distance.calc(o, @pivot2) ) / @d_1_2_doubled
|
28
|
+
end
|
29
|
+
|
30
|
+
def coordinate(o)
|
31
|
+
[map_object_to_coordinate(o)] + (@child ? @child.coordinate(o) : [])
|
32
|
+
end
|
33
|
+
|
34
|
+
def [](object)
|
35
|
+
coordinate(object)
|
36
|
+
end
|
37
|
+
|
38
|
+
def map_object_to_coordinate(o)
|
39
|
+
@map[o] || fastmap_coordinate(o)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def initialize(distance, k = 2, choiceDepth = 1)
|
44
|
+
@distance, @k, @choice_depth = distance, k, choiceDepth
|
45
|
+
end
|
46
|
+
|
47
|
+
def run(objects)
|
48
|
+
@objects = objects
|
49
|
+
create_map(@k, @distance)
|
50
|
+
end
|
51
|
+
|
52
|
+
def create_map(k, distance)
|
53
|
+
return nil if k == 0
|
54
|
+
o1, o2 = choose_distant_objects(@objects, @distance)
|
55
|
+
node = PivotNode.new(distance, o1, o2)
|
56
|
+
coordinate_map = {}
|
57
|
+
if distance.calc(o1, o2) == 0.0
|
58
|
+
@objects.each {|o| coordinate_map[o] = 0.0}
|
59
|
+
else
|
60
|
+
@objects.each {|o| coordinate_map[o] = node.fastmap_coordinate(o)}
|
61
|
+
end
|
62
|
+
node.map = coordinate_map
|
63
|
+
node.child = create_map k-1, next_distance(distance, o1, o2, coordinate_map)
|
64
|
+
node
|
65
|
+
end
|
66
|
+
|
67
|
+
def choose_distant_objects(objects, distance)
|
68
|
+
o1 = nil
|
69
|
+
o2 = objects.sample
|
70
|
+
# Not sure if there is any benefit to doing this more than once. Test later.
|
71
|
+
@choice_depth.times do
|
72
|
+
o1 = find_most_distant_object(objects, o2, distance)
|
73
|
+
o2 = find_most_distant_object(objects, o1, distance)
|
74
|
+
end
|
75
|
+
return o1, o2
|
76
|
+
end
|
77
|
+
|
78
|
+
# Find the object in objects that is farthest from o, given a distance function.
|
79
|
+
def find_most_distant_object(objects, o, distance)
|
80
|
+
objects.sort_by {|oi| distance.calc(oi, o)}.last
|
81
|
+
end
|
82
|
+
|
83
|
+
class DistanceFunction
|
84
|
+
def initialize(&func)
|
85
|
+
@func = func
|
86
|
+
end
|
87
|
+
def calc(o1, o2)
|
88
|
+
@func.call(o1, o2)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Create the next distance function from a given distance func.
|
93
|
+
def next_distance(distance, o1, o2, coordinates)
|
94
|
+
DistanceFunction.new do |oi, oj|
|
95
|
+
Math.sqrt( distance.calc(oi, oj)**2 - (coordinates[oi] - coordinates[oj])**2 )
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
# Recursively map n-dimensional objects (given as an Array) into a k-dimensional
|
101
|
+
# space while preserving the distances between the objects as well as possible.
|
102
|
+
def self.fastmap(objects, distance, k = 2)
|
103
|
+
FastMap.new(distance, k).run(objects)
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'feldtruby/array/basic_stats'
|
2
|
+
|
3
|
+
# The normalization methods assume the existence of basic statistics
|
4
|
+
# on the class they are included in:
|
5
|
+
# z_normalize: requires mean and sd
|
6
|
+
module FeldtRuby::Normalization
|
7
|
+
def normalize(&transform)
|
8
|
+
self.map {|v| transform.call(v)}
|
9
|
+
end
|
10
|
+
|
11
|
+
def z_normalize
|
12
|
+
mean, stdev = self.mean, self.sd
|
13
|
+
self.map {|e| (e-mean)/stdev}
|
14
|
+
end
|
15
|
+
|
16
|
+
def min_max_normalize
|
17
|
+
return [] if self.length == 0
|
18
|
+
min = self.min.to_f
|
19
|
+
range = self.max - min
|
20
|
+
self.map {|e| (e-min)/range}
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
class Array
|
25
|
+
include FeldtRuby::Normalization
|
26
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'feldtruby/statistics/normalization'
|
2
|
+
|
3
|
+
# Implements the basic SAX (Symbolic Aggregate approXimation) from the paper:
|
4
|
+
# Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu,
|
5
|
+
# "A Symbolic Representation of Time Series, with Implications for Streaming Algorithms", IDMKD 2003.
|
6
|
+
# available from: http://www.cs.ucr.edu/~eamonn/SAX.pdf
|
7
|
+
module FeldtRuby::Statistics
|
8
|
+
|
9
|
+
# A SAX processor transforms any numeric stream of data (often a time series)
|
10
|
+
# of arbitrary length n to a string (symbolic stream) of arbitrary length w,
|
11
|
+
# where w<n, and typically w<<n. The alphabet size (symbols in the string) is
|
12
|
+
# also an arbitrary integer _a_, a>=2. Compared to the SAX described by Keogh et
|
13
|
+
# al we state the number of data elements, _elementsPerWord_, that should go
|
14
|
+
# into each word, i.e. w = n/elementsPerWord.
|
15
|
+
# This allows for many powerful data mining algorithms to be applied and sped up.
|
16
|
+
class SAX
|
17
|
+
# Create a SAX processor with the given number of elements per word and alphabet size _a_ (2 to 20).
|
18
|
+
def initialize(elementsPerWord, alphabetSize = 6)
|
19
|
+
raise ArgumentError if alphabetSize > 20 || alphabetSize < 2
|
20
|
+
@elements_per_word, @alphabet_size = elementsPerWord, alphabetSize
|
21
|
+
end
|
22
|
+
|
23
|
+
# A mapper maps the values in a subsequence into a symbol. The standard
|
24
|
+
# mapper is state-less and normalizes each subsequence and then assumes
|
25
|
+
# a normal distribution and thus uses a fixed selection of bins.
|
26
|
+
class SymbolMapper
|
27
|
+
def initialize(data = nil)
|
28
|
+
# This standard mapper does not utilize the whole data sequence to precalc mapping values. But subclasses might.
|
29
|
+
end
|
30
|
+
|
31
|
+
# Cut points based on a Normal/Gaussian distribution...
|
32
|
+
NormalDistCutPoints = {
|
33
|
+
2 => [-Float::INFINITY, 0.00],
|
34
|
+
3 => [-Float::INFINITY, -0.43, 0.43],
|
35
|
+
4 => [-Float::INFINITY, -0.67, 0.00, 0.67],
|
36
|
+
5 => [-Float::INFINITY, -0.84, -0.25, 0.25, 0.84],
|
37
|
+
6 => [-Float::INFINITY, -0.97, -0.43, 0.00, 0.43, 0.97],
|
38
|
+
7 => [-Float::INFINITY, -1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
|
39
|
+
8 => [-Float::INFINITY, -1.15, -0.67, -0.32, 0.00, 0.32, 0.67, 1.15],
|
40
|
+
9 => [-Float::INFINITY, -1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
|
41
|
+
10 => [-Float::INFINITY, -1.28, -0.84, -0.52, -0.25, 0.00, 0.25, 0.52, 0.84, 1.28],
|
42
|
+
11 => [-Float::INFINITY, -1.34, -0.91, -0.60, -0.35, -0.11, 0.11, 0.35, 0.60, 0.91, 1.34],
|
43
|
+
12 => [-Float::INFINITY, -1.38, -0.97, -0.67, -0.43, -0.21, 0.00, 0.21, 0.43, 0.67, 0.97, 1.38],
|
44
|
+
13 => [-Float::INFINITY, -1.43, -1.02, -0.74, -0.50, -0.29, -0.10, 0.10, 0.29, 0.50, 0.74, 1.02, 1.43],
|
45
|
+
14 => [-Float::INFINITY, -1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0.00, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
|
46
|
+
15 => [-Float::INFINITY, -1.5 , -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.50],
|
47
|
+
16 => [-Float::INFINITY, -1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0.00, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
|
48
|
+
17 => [-Float::INFINITY, -1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
|
49
|
+
18 => [-Float::INFINITY, -1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0.00, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
|
50
|
+
19 => [-Float::INFINITY, -1.62, -1.25, -1.00, -0.80, -0.63, -0.48, -0.34, -0.20, -0.07, 0.07, 0.20, 0.34, 0.48, 0.63, 0.80, 1.0, 1.25, 1.62],
|
51
|
+
20 => [-Float::INFINITY, -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0.00, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
|
52
|
+
}
|
53
|
+
|
54
|
+
def supports_alphabet_size?(size)
|
55
|
+
NormalDistCutPoints.keys.include? size
|
56
|
+
end
|
57
|
+
|
58
|
+
def map_sequence_to_symbol(sequence, alphabet_size)
|
59
|
+
symbol_for_value(sequence.mean, alphabet_size)
|
60
|
+
end
|
61
|
+
|
62
|
+
def symbol_for_value(value, alphabet_size)
|
63
|
+
NormalDistCutPoints[alphabet_size].inject(0) do |symbol, cutpoint|
|
64
|
+
return symbol if cutpoint > value
|
65
|
+
symbol + 1
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def setup_for_processing_data(data, mapper = nil)
|
71
|
+
@mapper ||= SymbolMapper.new(data)
|
72
|
+
unless @mapper.supports_alphabet_size?(@alphabet_size)
|
73
|
+
raise ArgumentError.new("Mapper does not support the alphabet size (#{@alphabet_size}): #{@mapper}")
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def process_subsequence(subsequence)
|
78
|
+
normalized_ss = subsequence.z_normalize
|
79
|
+
len, rem = normalized_ss.length.divmod @elements_per_word
|
80
|
+
# Note that if the lengths are not evenly divisible the last word will be based on fewer elements.
|
81
|
+
# This is different than the orig SAX as specified in their paper.
|
82
|
+
symbols = (0...len).map do |wordindex|
|
83
|
+
@mapper.map_sequence_to_symbol(normalized_ss[wordindex * @elements_per_word, @elements_per_word], @alphabet_size)
|
84
|
+
end
|
85
|
+
symbols << @mapper.map_sequence_to_symbol(normalized_ss[len, @elements_per_word], @alphabet_size) if rem > 0
|
86
|
+
symbols
|
87
|
+
end
|
88
|
+
|
89
|
+
def process(data, windowSize = data.length, mapper = nil)
|
90
|
+
setup_for_processing_data(data, mapper)
|
91
|
+
res = (0..(data.length - windowSize)).map do |i|
|
92
|
+
process_subsequence(data[i, windowSize])
|
93
|
+
end
|
94
|
+
res = res.flatten if windowSize == data.length
|
95
|
+
res
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
data/lib/feldtruby/statistics.rb
CHANGED
@@ -26,6 +26,15 @@ class RCommunicator
|
|
26
26
|
@r.eval "if(!library(#{lib}, logical.return=TRUE)) {install.packages(\"#{lib}\"); library(#{lib});}"
|
27
27
|
end
|
28
28
|
|
29
|
+
# Load R scripts in the feldtruby/R directory.
|
30
|
+
def load_feldtruby_r_script(scriptName, reload = false)
|
31
|
+
@loaded_scripts ||= Array.new # Ensure there is an empty array for loaded script names, if this is first call here.
|
32
|
+
return if reload == false && @loaded_scripts.include?(scriptName)
|
33
|
+
@loaded_scripts << scriptName
|
34
|
+
path = File.join(FeldtRuby::TopDirectory, "R", scriptName)
|
35
|
+
@r.eval "source(\"#{path}\")"
|
36
|
+
end
|
37
|
+
|
29
38
|
def eval(str)
|
30
39
|
@r.eval str
|
31
40
|
end
|
@@ -103,6 +112,45 @@ module Statistics
|
|
103
112
|
res = RC.call("chisq.test", vs)
|
104
113
|
res.p_value
|
105
114
|
end
|
115
|
+
|
116
|
+
class DiffusionKDE
|
117
|
+
attr_reader :densities, :mesh
|
118
|
+
|
119
|
+
# Given an R object with the sub-values named probabilities, densities, mesh, sum_density, mesh_interval, min, max
|
120
|
+
# we can calculate the probability of new values.
|
121
|
+
def initialize(rvalue)
|
122
|
+
@probabilities = rvalue.probabilities
|
123
|
+
@densities = rvalue.densities
|
124
|
+
@mesh = rvalue.mesh
|
125
|
+
@mesh_interval = rvalue.mesh_interval.to_f
|
126
|
+
@min, @max = rvalue.min.to_f, rvalue.max.to_f
|
127
|
+
end
|
128
|
+
|
129
|
+
def density_of(value)
|
130
|
+
return 0.0 if value < @min || value > @max
|
131
|
+
bin_index = ((value - @min) / @mesh_interval).floor
|
132
|
+
@densities[bin_index]
|
133
|
+
end
|
134
|
+
|
135
|
+
def probability_of(value)
|
136
|
+
return 0.0 if value < @min || value > @max
|
137
|
+
bin_index = ((value - @min) / @mesh_interval).floor
|
138
|
+
@probabilities[bin_index]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Do a kernel density estimation based on the sampled _values_, with n bins (rounded up to the nearest power of 2)
|
143
|
+
# and optional min and max values.
|
144
|
+
def density_estimation(values, n = 2**9, min = nil, max = nil)
|
145
|
+
# Ensure we have loaded the diffusion.kde code
|
146
|
+
RC.load_feldtruby_r_script("diffusion_kde.R")
|
147
|
+
args = [values, n]
|
148
|
+
if min && max
|
149
|
+
args << min
|
150
|
+
args << max
|
151
|
+
end
|
152
|
+
DiffusionKDE.new RC.call("diffusion.kde", *args)
|
153
|
+
end
|
106
154
|
end
|
107
155
|
|
108
156
|
# Make them available at top level
|
data/lib/feldtruby/version.rb
CHANGED
data/lib/feldtruby.rb
CHANGED
data/test/test_array.rb
CHANGED
@@ -100,4 +100,11 @@ describe "Array extensions" do
|
|
100
100
|
counts[5].must_equal 5
|
101
101
|
end
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
|
+
describe "sample" do
|
105
|
+
it "only samples within the array" do
|
106
|
+
d = (1..100).to_a
|
107
|
+
100.times { d.include?(d.sample).must_equal(true) }
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -60,6 +60,14 @@ class TestArrayBasicStats < MiniTest::Unit::TestCase
|
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
|
+
describe "mean and stdev" do
|
64
|
+
it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
|
65
|
+
data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
|
66
|
+
data.mean.must_be_close_to 4.606667
|
67
|
+
data.sd.must_be_close_to 2.640316
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
63
71
|
describe "Basic statistics" do
|
64
72
|
describe "sum of abs" do
|
65
73
|
it "works for simple example" do
|
@@ -131,4 +139,28 @@ describe "Basic statistics" do
|
|
131
139
|
[1,2,3,4].summary_stats.must_equal "2.500 (min = 1.0, max = 4.0, median = 2.5, stdev = 1.12)"
|
132
140
|
end
|
133
141
|
end
|
142
|
+
|
143
|
+
describe "quantile- and quartile-related functionality" do
|
144
|
+
it "can calc quantiles, quartiles and IQR for the set used as example for even-numbered sequence for quantiles on Wikipedia" do
|
145
|
+
seq = [3, 6, 7, 8, 8, 10, 13, 15, 16, 20]
|
146
|
+
seq.quartiles.must_equal [7.25, 9, 14.5]
|
147
|
+
seq.quantiles.must_equal [3, 7.25, 9, 14.5, 20]
|
148
|
+
seq.inter_quartile_range.must_equal (14.5-7.25)
|
149
|
+
end
|
150
|
+
|
151
|
+
it "can calc quantiles, quartiles and IQR for the set used as example for odd-numbered sequence for quantiles on Wikipedia" do
|
152
|
+
seq = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20]
|
153
|
+
seq.quartiles.must_equal [7.5, 9, 14]
|
154
|
+
seq.quantiles.must_equal [3, 7.5, 9, 14, 20]
|
155
|
+
seq.inter_quartile_range.must_equal 6.5
|
156
|
+
end
|
157
|
+
|
158
|
+
it "can calc quantiles, quartiles and IQR for the set used as example for quartiles on Wikipedia" do
|
159
|
+
seq = [6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]
|
160
|
+
|
161
|
+
seq.quartiles.must_equal [25.5, 40, 42.5]
|
162
|
+
seq.quantiles.must_equal [6, 25.5, 40.0, 42.5, 49]
|
163
|
+
seq.inter_quartile_range.must_equal 17.0
|
164
|
+
end
|
165
|
+
end
|
134
166
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'feldtruby/statistics/fastmap'
|
2
|
+
require 'feldtruby/statistics/euclidean_distance'
|
3
|
+
|
4
|
+
describe "Fastmap" do
|
5
|
+
it "works for simple data, and different values of k" do
|
6
|
+
d = [
|
7
|
+
[0, 0, 0, 0],
|
8
|
+
[1, 1, 1, 1],
|
9
|
+
[2, 2, 2, 2],
|
10
|
+
[3, 3, 3, 3]
|
11
|
+
]
|
12
|
+
1.upto(d.first.length-1) do |k|
|
13
|
+
m = FeldtRuby.fastmap(d, FeldtRuby::EuclideanDistance.new, k)
|
14
|
+
m.depth.must_equal k
|
15
|
+
d.each do |datum|
|
16
|
+
c = m[datum]
|
17
|
+
c.length.must_equal k
|
18
|
+
c.must_equal m[datum]
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'feldtruby/statistics/normalization'
|
2
|
+
|
3
|
+
class Array
|
4
|
+
def must_be_close_to(other)
|
5
|
+
self.zip(other).map {|a,b| a.must_be_close_to(b)}
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "Z normalization" do
|
10
|
+
it "handles empty arrays" do
|
11
|
+
[].z_normalize.must_equal []
|
12
|
+
end
|
13
|
+
|
14
|
+
it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
|
15
|
+
data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
|
16
|
+
expected = [-0.9796808, -0.8622706, -0.6123005, 0.8496459, 1.739691, 1.588194, 1.095829, 0.5277147, 0.4709033, -0.2865819, 0.0921607, -0.2865819, -0.9039323, -1.195564, -1.237226]
|
17
|
+
data.z_normalize.must_be_close_to expected
|
18
|
+
end
|
19
|
+
|
20
|
+
it "works for Time series 2 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
|
21
|
+
data = [0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]
|
22
|
+
expected = [-1.289433, -0.9992189, -0.5253246, -0.06612478, -0.2791935, 0.08816637, -0.06612478, 0.595123, 0.8926845, 0.8228861, 1.741286, 1.770675, -0.2791935, -1.197593, -1.208614]
|
23
|
+
data.z_normalize.must_be_close_to expected
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "Min-Max normalization" do
|
28
|
+
it "handles empty arrays" do
|
29
|
+
[].min_max_normalize.must_equal []
|
30
|
+
end
|
31
|
+
|
32
|
+
it "works for example from http://wiki.answers.com/Q/What_is_min-max_normalization" do
|
33
|
+
data = [20, 24, 26, 27, 30]
|
34
|
+
data.min_max_normalize.must_be_close_to [0.0, 0.4, 0.6, 0.7, 1.0]
|
35
|
+
end
|
36
|
+
end
|
data/test/test_sax.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'feldtruby/statistics/time_series/sax'
|
2
|
+
include FeldtRuby::Statistics
|
3
|
+
|
4
|
+
describe 'Symbolic Adaptive approXimation - SAX' do
|
5
|
+
describe "The standard SAX SymbolMapper, that uses cut points based on Normal/Gaussian distribution" do
|
6
|
+
it "accepts alphabet sizes between 2 and 20" do
|
7
|
+
sm = SAX::SymbolMapper.new
|
8
|
+
sm.supports_alphabet_size?(-1).must_equal false
|
9
|
+
sm.supports_alphabet_size?(0).must_equal false
|
10
|
+
sm.supports_alphabet_size?(1).must_equal false
|
11
|
+
sm.supports_alphabet_size?(2).must_equal true
|
12
|
+
sm.supports_alphabet_size?(20).must_equal true
|
13
|
+
sm.supports_alphabet_size?(21).must_equal false
|
14
|
+
end
|
15
|
+
|
16
|
+
it "maps correctly to symbols for alphabet of size 2" do
|
17
|
+
sm = SAX::SymbolMapper.new
|
18
|
+
sm.symbol_for_value(-10, 2).must_equal 1
|
19
|
+
sm.symbol_for_value(-1, 2).must_equal 1
|
20
|
+
sm.symbol_for_value(1, 2).must_equal 2
|
21
|
+
sm.symbol_for_value(10, 2).must_equal 2
|
22
|
+
end
|
23
|
+
|
24
|
+
it "maps correctly to symbols for alphabet of size 4" do
|
25
|
+
sm = SAX::SymbolMapper.new
|
26
|
+
sm.symbol_for_value(-0.7, 4).must_equal 1
|
27
|
+
sm.symbol_for_value(-0.5, 4).must_equal 2
|
28
|
+
sm.symbol_for_value(-0.01, 4).must_equal 2
|
29
|
+
sm.symbol_for_value(0, 4).must_equal 3
|
30
|
+
sm.symbol_for_value(0.01, 4).must_equal 3
|
31
|
+
sm.symbol_for_value(0.5, 4).must_equal 3
|
32
|
+
sm.symbol_for_value(0.7, 4).must_equal 4
|
33
|
+
sm.symbol_for_value(17, 4).must_equal 4
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
it "does not accept alphabet sizes larger than 20 or smaller than 2" do
|
38
|
+
proc {SAX.new(10, 21)}.must_raise ArgumentError
|
39
|
+
proc {SAX.new(3, 1)}.must_raise ArgumentError
|
40
|
+
end
|
41
|
+
|
42
|
+
it "maps some simple time series to symbols when directly mapping" do
|
43
|
+
sax = SAX.new(1, 4)
|
44
|
+
sax.process([-1, 0, 1]).must_equal [1,3,4]
|
45
|
+
sax.process([-1, -0.5, 0, 0.5, 1]).must_equal [1,2,3,3,4]
|
46
|
+
sax.process([-1, -0.5, 0, 0.5, 1].reverse).must_equal [1,2,3,3,4].reverse
|
47
|
+
end
|
48
|
+
end
|
data/test/test_statistics.rb
CHANGED
@@ -69,6 +69,19 @@ describe "Statistics" do
|
|
69
69
|
probability_of_same_proportions(([:a] * 570) + ([:b] * 430)).must_be_close_to 5.091e-10
|
70
70
|
end
|
71
71
|
end
|
72
|
+
|
73
|
+
describe "Diffusions Kernel Density Estimation based on R code loaded from the feldtruby R directory" do
|
74
|
+
it "works for simple examples" do
|
75
|
+
data = [1]
|
76
|
+
kde = density_estimation(data, 4, 0.0, 3.0)
|
77
|
+
kde.mesh.must_equal [0.0, 1.0, 2.0, 3.0]
|
78
|
+
kde.densities.length.must_equal 4
|
79
|
+
kde.densities[0].must_be_close_to 0.3912
|
80
|
+
kde.densities[1].must_be_close_to 0.3591
|
81
|
+
kde.densities[2].must_be_close_to 0.3101
|
82
|
+
kde.densities[3].must_be_close_to 0.2728
|
83
|
+
end
|
84
|
+
end
|
72
85
|
end
|
73
86
|
|
74
87
|
require 'feldtruby/minitest_extensions'
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'feldtruby/statistics/distance/string_distance'
|
2
|
+
include FeldtRuby::Statistics
|
3
|
+
|
4
|
+
describe "ncd" do
|
5
|
+
it "gives no distance if the strings are the same" do
|
6
|
+
ncd("aaa", "aaa").must_equal 0.0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "gives distance > 0.0 if strings are not the same" do
|
10
|
+
ncd("a", "b").must_be :>, 0.0
|
11
|
+
ncd("aa", "ab").must_be :>, 0.0
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "cdm" do
|
16
|
+
it "gives no distance if the strings are the same" do
|
17
|
+
cdm("aaa", "aaa").must_equal 0.0
|
18
|
+
end
|
19
|
+
|
20
|
+
it "gives distance > 0.0 if strings are not the same" do
|
21
|
+
cdm("a", "b").must_be :>, 0.0
|
22
|
+
cdm("aa", "ab").must_be :>, 0.0
|
23
|
+
end
|
24
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feldtruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.6
|
4
|
+
version: 0.3.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rinruby
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- Gemfile
|
55
55
|
- Gemfile.lock
|
56
56
|
- History.txt
|
57
|
+
- R/diffusion_kde.R
|
57
58
|
- README.md
|
58
59
|
- Rakefile
|
59
60
|
- TODO
|
@@ -77,6 +78,11 @@ files:
|
|
77
78
|
- lib/feldtruby/optimize/search_space.rb
|
78
79
|
- lib/feldtruby/optimize/stdout_logger.rb
|
79
80
|
- lib/feldtruby/statistics.rb
|
81
|
+
- lib/feldtruby/statistics/distance/string_distance.rb
|
82
|
+
- lib/feldtruby/statistics/euclidean_distance.rb
|
83
|
+
- lib/feldtruby/statistics/fastmap.rb
|
84
|
+
- lib/feldtruby/statistics/normalization.rb
|
85
|
+
- lib/feldtruby/statistics/time_series/sax.rb
|
80
86
|
- lib/feldtruby/string/to_iso.rb
|
81
87
|
- lib/feldtruby/time.rb
|
82
88
|
- lib/feldtruby/vector.rb
|
@@ -87,15 +93,19 @@ files:
|
|
87
93
|
- test/test_array.rb
|
88
94
|
- test/test_array_basic_stats.rb
|
89
95
|
- test/test_array_count_by.rb
|
96
|
+
- test/test_fastmap.rb
|
90
97
|
- test/test_float.rb
|
91
98
|
- test/test_html_doc_getter.rb
|
99
|
+
- test/test_normalization.rb
|
92
100
|
- test/test_optimize.rb
|
93
101
|
- test/test_optimize_differential_evolution.rb
|
94
102
|
- test/test_optimize_objective.rb
|
95
103
|
- test/test_optimize_populationbasedoptimizer.rb
|
96
104
|
- test/test_optimize_random_search.rb
|
97
105
|
- test/test_optimize_search_space.rb
|
106
|
+
- test/test_sax.rb
|
98
107
|
- test/test_statistics.rb
|
108
|
+
- test/test_string_distance.rb
|
99
109
|
- test/test_time.rb
|
100
110
|
- test/test_vector.rb
|
101
111
|
- test/test_word_counter.rb
|
@@ -119,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
129
|
version: '0'
|
120
130
|
requirements: []
|
121
131
|
rubyforge_project:
|
122
|
-
rubygems_version: 1.8.
|
132
|
+
rubygems_version: 1.8.25
|
123
133
|
signing_key:
|
124
134
|
specification_version: 3
|
125
135
|
summary: Robert Feldt's Common Ruby Code lib
|
@@ -128,15 +138,19 @@ test_files:
|
|
128
138
|
- test/test_array.rb
|
129
139
|
- test/test_array_basic_stats.rb
|
130
140
|
- test/test_array_count_by.rb
|
141
|
+
- test/test_fastmap.rb
|
131
142
|
- test/test_float.rb
|
132
143
|
- test/test_html_doc_getter.rb
|
144
|
+
- test/test_normalization.rb
|
133
145
|
- test/test_optimize.rb
|
134
146
|
- test/test_optimize_differential_evolution.rb
|
135
147
|
- test/test_optimize_objective.rb
|
136
148
|
- test/test_optimize_populationbasedoptimizer.rb
|
137
149
|
- test/test_optimize_random_search.rb
|
138
150
|
- test/test_optimize_search_space.rb
|
151
|
+
- test/test_sax.rb
|
139
152
|
- test/test_statistics.rb
|
153
|
+
- test/test_string_distance.rb
|
140
154
|
- test/test_time.rb
|
141
155
|
- test/test_vector.rb
|
142
156
|
- test/test_word_counter.rb
|