RubyGems - feldtruby - Versions diffs - 0.3.6 → 0.3.8 - Mend

feldtruby 0.3.6 → 0.3.8

Files changed (19) hide show

data/R/diffusion_kde.R +116 -0
data/lib/feldtruby/array/basic_stats.rb +51 -0
data/lib/feldtruby/array.rb +4 -0
data/lib/feldtruby/statistics/distance/string_distance.rb +49 -0
data/lib/feldtruby/statistics/euclidean_distance.rb +14 -0
data/lib/feldtruby/statistics/fastmap.rb +106 -0
data/lib/feldtruby/statistics/normalization.rb +26 -0
data/lib/feldtruby/statistics/time_series/sax.rb +99 -0
data/lib/feldtruby/statistics.rb +48 -0
data/lib/feldtruby/version.rb +1 -1
data/lib/feldtruby.rb +3 -1
data/test/test_array.rb +8 -1
data/test/test_array_basic_stats.rb +32 -0
data/test/test_fastmap.rb +22 -0
data/test/test_normalization.rb +36 -0
data/test/test_sax.rb +48 -0
data/test/test_statistics.rb +13 -0
data/test/test_string_distance.rb +24 -0
metadata +17 -3

data/R/diffusion_kde.R ADDED Viewed

@@ -0,0 +1,116 @@
+# Gaussian KDE downloaded from
+#   http://www-etud.iro.umontreal.ca/~botev/kde.R
+# on February 9th 2013.
+# Our changes:
+#   - Renamed function from kde to diffusion.kde
+#   - Return an object instead of a matrix
+#   - Added the sum of the densities
+#   - Added the mesh interval
+#   - Added the probabilities for each interval
+# No license was specified on this piece of code. For questions about it please contact
+# Prof. Botev on the email: botev@maths.uq.edu.au
+diffusion.kde <- function(data,n,MIN,MAX){
+#       State-of-the-art gaussian kernel density estimator for one-dimensional data;
+#       The estimator does not use the commonly employed 'gaussian rule of thumb'.
+#       As a result it outperforms many plug-in methods on multimodal densities
+#       with widely separated modes (see example).
+# INPUTS:
+#     data    - a vector of data from which the density estimate is constructed;
+#          n  - the number of mesh points used in the uniform discretization of the
+#               interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
+#               n is rounded up to the next power of two; the default value of n is n=2^14;
+#   MIN, MAX  - defines the interval [MIN,MAX] on which the density estimate is constructed;
+#               unless all four arguments are supplied, the default values of MIN and MAX are used:
+#               MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
+# OUTPUT:
+#       a list with the elements probabilities, densities, mesh, sum_density,
+#       mesh_interval, min and max; densities[i] is the density value at mesh[i];
+# EXAMPLE:
+##Save this file in your directory as diffusion_kde.R and copy and paste the commands:
+# rm(list=ls())
+# source(file='diffusion_kde.R')
+# data=c(rnorm(10^3),rnorm(10^3)*2+30);
+# d=diffusion.kde(data)
+# plot(d$mesh,d$densities,type='l',xlab='x',ylab='density f(x)')
+# REFERENCE:
+# Z. I. Botev, J. F. Grotowski and D. P. Kroese
+# "Kernel Density Estimation Via Diffusion"
+# Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
+# for questions email: botev@maths.uq.edu.au
+nargin=length(as.list(match.call()))-1;
+if (nargin<2) n=2^14 # default number of mesh points when n is not supplied
+n=2^ceiling(log2(n)); # round up n to the next power of 2;
+if (nargin<4)
+{# define the default  interval [MIN,MAX]
+minimum=min(data); maximum=max(data);
+Range=maximum-minimum;
+MIN=minimum-Range/10; MAX=maximum+Range/10;
+}
+# set up the grid over which the density estimate is computed;
+R=MAX-MIN; dx=R/n; xmesh=MIN+seq(0,R,dx); N=length(data);
+# if data has repeated observations use the N below
+# N=length(as.numeric(names(table(data))));
+# bin the data uniformly using the grid defined above;
+w=hist(data,xmesh,plot=FALSE);initial_data=(w$counts)/N;
+initial_data=initial_data/sum(initial_data);
+dct1d <- function(data){
+# computes the discrete cosine transform of the column vector data
+n= length(data);
+# Compute weights to multiply DFT coefficients
+weight = c(1,2*exp(-1i*(1:(n-1))*pi/(2*n)));
+# Re-order the elements of the columns of x
+data = c(data[seq(1,n-1,2)], data[seq(n,2,-2)]);
+# Multiply FFT by weights:
+data= Re(weight* fft(data));
+data}
+a=dct1d(initial_data); # discrete cosine transform of initial data
+# now compute the optimal bandwidth^2 using the referenced method
+I=(1:(n-1))^2; a2=(a[2:n]/2)^2;
+# use uniroot to solve the equation t=zeta*gamma^[l](t), where l=7 is set inside fixed_point
+fixed_point <-  function(t,N,I,a2){
+# this implements the function t-zeta*gamma^[l](t)
+l=7;
+f=2*(pi^(2*l))*sum((I^l)*a2*exp(-I*(pi^2)*t));
+for (s in (l-1):2){
+    K0=prod(seq(1,2*s-1,2))/sqrt(2*pi);  const=(1+(1/2)^(s+1/2))/3;
+    time=(2*const*K0/N/f)^(2/(3+2*s));
+    f=2*pi^(2*s)*sum(I^s*a2*exp(-I*pi^2*time));
+}
+out=t-(2*N*sqrt(pi)*f)^(-2/5);
+}
+t_star=tryCatch(uniroot(fixed_point,c(0,.1),N=N,I=I,a2=a2,tol=10^(-14))$root,error=function(e) .28*N^(-2/5)); # falls back to .28*N^(-2/5) when uniroot signals an error
+# smooth the discrete cosine transform of initial data using t_star
+a_t=a*exp(-(0:(n-1))^2*pi^2*t_star/2);
+# now apply the inverse discrete cosine transform
+idct1d <-  function(data){
+# computes the inverse discrete cosine transform
+n=length(data);
+# Compute weights
+weights = n*exp(1i*(0:(n-1))*pi/(2*n));
+# Compute x tilde using equation (5.93) in Jain
+data = Re(fft(weights*data,inverse=TRUE))/n;
+# Re-order elements of each column according to equations (5.93) and
+# (5.94) in Jain
+out = rep(0,n);
+out[seq(1,n,2)] = data[1:(n/2)];
+out[seq(2,n,2)] = data[n:(n/2+1)];
+out;
+}
+density=idct1d(a_t)/R;
+# take the rescaling of the data into account
+bandwidth=sqrt(t_star)*R; # NOTE: computed but not included in the returned list
+xmesh=seq(MIN,MAX,R/(n-1));
+# out=matrix(c(xmesh,density),nrow=2,byrow=TRUE);
+posd = density + abs(min(0.0, min(density))); # Ensure least density is 0.0 before calcing probabilities
+list(probabilities = (posd / sum(posd)), densities = density, mesh = xmesh,
+  sum_density = sum(density), mesh_interval = (R/(n-1)), min = MIN, max = MAX)
+}

data/lib/feldtruby/array/basic_stats.rb CHANGED Viewed

@@ -21,6 +21,46 @@ module BasicStatistics
 		end
 	end
+	# Calculate the values that cuts the data into 0%, 25%, 50%, 75% and 100%.
+	# This corresponds to the min, 1st quartile, 2nd quartile, 3rd quartile and the max.
+	def quantiles
+		return [nil, nil, nil, nil, nil] if length == 0
+		sorted = self.sort
+		q1 = sorted.quantile_at_ratio(0.25)
+		q2 = sorted.quantile_at_ratio(0.50)
+		q3 = sorted.quantile_at_ratio(0.75)
+		return sorted.first, q1, q2, q3, sorted.last
+	end
+	# Calculate the quantile at a given ratio (must be between 0.0 and 1.0) assuming self
+	# is a sorted array. This is based on the type 7 quantile function in R.
+	def quantile_at_ratio(p)
+		n = self.length
+		h = (n - 1) * p + 1
+		hfloor = h.floor
+		if h == hfloor
+			self[hfloor-1]
+		else
+			x_hfloor = self[hfloor-1]
+			x_hfloor + (h - hfloor)*(self[hfloor] - x_hfloor)
+		end
+	end
+	# Calculate the three quartiles of the array.
+	def quartiles
+		return [nil, nil, nil] if length == 0
+		sorted = self.sort
+		q1 = sorted.quantile_at_ratio(0.25)
+		q2 = sorted.quantile_at_ratio(0.50)
+		q3 = sorted.quantile_at_ratio(0.75)
+		return q1, q2, q3
+	end
+	def inter_quartile_range
+		q1, q2, q3 = quartiles
+		q3 - q1
+	end
 	def variance
 		return 0 if self.length == 0
 		avg = self.mean
@@ -31,6 +71,17 @@ module BasicStatistics
 		Math.sqrt( self.variance )
 	end
+	# Same as R's var, i.e. uses N-1 in denominator.
+	def var
+		n = self.length.to_f
+		(variance * n) / (n-1)
+	end
+	# Same as R's sd, i.e. uses N-1 in denominator.
+	def sd
+		Math.sqrt( self.var )
+	end
 	def root_mean_square
 		Math.sqrt( self.map {|v| v**2}.mean )
 	end

data/lib/feldtruby/array.rb CHANGED Viewed

@@ -44,4 +44,8 @@ class Array
 		self.each {|element| count_hash[element] += 1}
 		count_hash
 	end
+	def sample
+		self[rand(self.length)]
+	end
 end

data/lib/feldtruby/statistics/distance/string_distance.rb ADDED Viewed

@@ -0,0 +1,49 @@
+require 'zlib'
+module FeldtRuby::Statistics
+class StringDistance
+  def compress(s)
+    Zlib::Deflate.deflate(s, 9)
+  end
+  def compressed_length(s)
+    compress(s).length
+  end
+  def distance(string1, string2)
+    raise NotImplementedError
+  end
+end
+# Cilibrasi and Vitanyi's NCD.
+class NormalizedCompressionDistance < StringDistance
+  def distance(string1, string2)
+    return 0.0 if string1 == string2
+    c1 = compressed_length(string1)
+    c2 = compressed_length(string2)
+    c_1_2 = compressed_length(string1 + string2)
+    (c_1_2 - [c1, c2].min).to_f / ([c1, c2].max)
+  end
+end
+def ncd(string1, string2)
+  (@ncd ||= NormalizedCompressionDistance.new).distance(string1, string2)
+end
+# Keogh et al's CDM.
+class CompressionBasedDissimilarityMeasure < StringDistance
+  def distance(string1, string2)
+    return 0.0 if string1 == string2
+    c1 = compressed_length(string1)
+    c2 = compressed_length(string2)
+    c_1_2 = compressed_length(string1 + string2)
+    c_1_2.to_f / (c1 + c2)
+  end
+end
+def cdm(string1, string2)
+  (@cdm ||= CompressionBasedDissimilarityMeasure.new).distance(string1, string2)
+end
+end

data/lib/feldtruby/statistics/euclidean_distance.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module FeldtRuby
+class EuclideanDistance
+  def calc(o1, o2)
+    sum = 0.0
+    o1.length.times do |i|
+      d = (o1[i] - o2[i])
+      sum += (d*d)
+    end
+    Math.sqrt(sum)
+  end
+end
+end

data/lib/feldtruby/statistics/fastmap.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require 'feldtruby/array'
+module FeldtRuby
+class FastMap
+  # A PivotNode has two pivot objects, a map from each object to its
+  # coordinate on the line for these pivots, a distance function and
+  # a child pointing to the next dimension.
+  # It maps a multi-variate object to a k-dimensional coordinate.
+  class PivotNode
+    attr_writer :map, :child
+    def initialize(distance, pivot1, pivot2, map = nil, child = nil)
+      @distance, @pivot1, @pivot2, @map, @child = distance, pivot1, pivot2, map, child
+      @d_1_2 = distance.calc(pivot1, pivot2)
+      @d_1_2_squared, @d_1_2_doubled = @d_1_2 * @d_1_2, 2 * @d_1_2
+    end
+    # The number of coordinates that will be returned for an object.
+    def k; depth; end
+    def depth
+      @depth ||= 1 + (@child ? @child.depth : 0)
+    end
+    # Map an object to its coordinate in the dimension represented by this node.
+    def fastmap_coordinate(o)
+      ( @distance.calc(o, @pivot1) + @d_1_2_squared - @distance.calc(o, @pivot2) ) / @d_1_2_doubled
+    end
+    def coordinate(o)
+      [map_object_to_coordinate(o)] + (@child ? @child.coordinate(o) : [])
+    end
+    def [](object)
+      coordinate(object)
+    end
+    def map_object_to_coordinate(o)
+      @map[o] || fastmap_coordinate(o)
+    end
+  end
+  def initialize(distance, k = 2, choiceDepth = 1)
+    @distance, @k, @choice_depth = distance, k, choiceDepth
+  end
+  def run(objects)
+    @objects = objects
+    create_map(@k, @distance)
+  end
+  def create_map(k, distance)
+    return nil if k == 0
+    o1, o2 = choose_distant_objects(@objects, @distance)
+    node = PivotNode.new(distance, o1, o2)
+    coordinate_map = {}
+    if distance.calc(o1, o2) == 0.0
+      @objects.each {|o| coordinate_map[o] = 0.0}
+    else
+      @objects.each {|o| coordinate_map[o] = node.fastmap_coordinate(o)}
+    end
+    node.map = coordinate_map
+    node.child = create_map k-1, next_distance(distance, o1, o2, coordinate_map)
+    node
+  end
+  def choose_distant_objects(objects, distance)
+    o1 = nil
+    o2 = objects.sample
+    # Not sure if there is any benefit to doing this more than once. Test later.
+    @choice_depth.times do
+      o1 = find_most_distant_object(objects, o2, distance)
+      o2 = find_most_distant_object(objects, o1, distance)
+    end
+    return o1, o2
+  end
+  # Find the object in objects that is farthest from o, given a distance function.
+  def find_most_distant_object(objects, o, distance)
+    objects.sort_by {|oi| distance.calc(oi, o)}.last
+  end
+  class DistanceFunction
+    def initialize(&func)
+      @func = func
+    end
+    def calc(o1, o2)
+      @func.call(o1, o2)
+    end
+  end
+  # Create the next distance function from a given distance func.
+  def next_distance(distance, o1, o2, coordinates)
+    DistanceFunction.new do |oi, oj|
+      Math.sqrt( distance.calc(oi, oj)**2 - (coordinates[oi] - coordinates[oj])**2 )
+    end
+  end
+end
+# Recursively map n-dimensional objects (given as an Array) into a k-dimensional
+# space while preserving the distances between the objects as well as possible.
+def self.fastmap(objects, distance, k = 2)
+  FastMap.new(distance, k).run(objects)
+end
+end

data/lib/feldtruby/statistics/normalization.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'feldtruby/array/basic_stats'
+# The normalization methods assume the existence of basic statistics
+# on the class they are included in:
+#   z_normalize: requires mean and sd
+module FeldtRuby::Normalization
+  def normalize(&transform)
+    self.map {|v| transform.call(v)}
+  end
+  def z_normalize
+    mean, stdev = self.mean, self.sd
+    self.map {|e| (e-mean)/stdev}
+  end
+  def min_max_normalize
+    return [] if self.length == 0
+    min = self.min.to_f
+    range = self.max - min
+    self.map {|e| (e-min)/range}
+  end
+end
+class Array
+  include FeldtRuby::Normalization
+end

data/lib/feldtruby/statistics/time_series/sax.rb ADDED Viewed

@@ -0,0 +1,99 @@
+require 'feldtruby/statistics/normalization'
+# Implements the basic SAX (Symbolic Aggregate approXimation) from the paper:
+#  Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu,
+#  "A Symbolic Representation of Time Series, with Implications for Streaming Algorithms", IDMKD 2003.
+# available from: http://www.cs.ucr.edu/~eamonn/SAX.pdf
+module FeldtRuby::Statistics
+# A SAX processor transforms any numeric stream of data (often a time series)
+# of arbitrary length n to a string (symbolic stream) of arbitrary length w,
+# where w<n, and typically w<<n. The alphabet size (symbols in the string) is
+# also an arbitrary integer _a_, 2<=a<=20. Compared to the SAX described by Keogh et
+# al we state the number of data elements, _elementsPerWord_, that should go
+# into each word, i.e. w = n/elementsPerWord.
+# This allows for many powerful data mining algorithms to be applied and sped up.
+class SAX
+  # Create a SAX processor with given output length _w_ and alphabet size _a_.
+  def initialize(elementsPerWord, alphabetSize = 6)
+    raise ArgumentError if alphabetSize > 20 || alphabetSize < 2
+    @elements_per_word, @alphabet_size = elementsPerWord, alphabetSize
+  end
+  # A mapper maps the values in a subsequence into a symbol. The standard
+  # mapper is state-less and normalizes each subsequence and then assumes
+  # a normal distribution and thus uses a fixed selection of bins.
+  class SymbolMapper
+    def initialize(data = nil)
+      # This standard mapper does not utilize the whole data sequence to precalc mapping values. But subclasses might.
+    end
+    # Cut points based on a Normal/Gaussian distribution...
+    NormalDistCutPoints = {
+        2 => [-Float::INFINITY, 0.00],
+        3 => [-Float::INFINITY, -0.43, 0.43],
+        4 => [-Float::INFINITY, -0.67, 0.00, 0.67],
+        5 => [-Float::INFINITY, -0.84, -0.25, 0.25, 0.84],
+        6 => [-Float::INFINITY, -0.97, -0.43, 0.00, 0.43, 0.97],
+        7 => [-Float::INFINITY, -1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
+        8 => [-Float::INFINITY, -1.15, -0.67, -0.32, 0.00, 0.32, 0.67, 1.15],
+        9 => [-Float::INFINITY, -1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
+        10 => [-Float::INFINITY, -1.28, -0.84, -0.52, -0.25, 0.00, 0.25, 0.52, 0.84, 1.28],
+        11 => [-Float::INFINITY, -1.34, -0.91, -0.60, -0.35, -0.11, 0.11, 0.35, 0.60, 0.91, 1.34],
+        12 => [-Float::INFINITY, -1.38, -0.97, -0.67, -0.43, -0.21, 0.00, 0.21, 0.43, 0.67, 0.97, 1.38],
+        13 => [-Float::INFINITY, -1.43, -1.02, -0.74, -0.50, -0.29, -0.10, 0.10, 0.29, 0.50, 0.74, 1.02, 1.43],
+        14 => [-Float::INFINITY, -1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0.00, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
+        15 => [-Float::INFINITY, -1.5 , -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.50],
+        16 => [-Float::INFINITY, -1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0.00, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
+        17 => [-Float::INFINITY, -1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
+        18 => [-Float::INFINITY, -1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0.00, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
+        19 => [-Float::INFINITY, -1.62, -1.25, -1.00, -0.80, -0.63, -0.48, -0.34, -0.20, -0.07, 0.07, 0.20, 0.34, 0.48, 0.63, 0.80, 1.0, 1.25, 1.62],
+        20 => [-Float::INFINITY, -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0.00, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
+    }
+    def supports_alphabet_size?(size)
+      NormalDistCutPoints.keys.include? size
+    end
+    def map_sequence_to_symbol(sequence, alphabet_size)
+      symbol_for_value(sequence.mean, alphabet_size)
+    end
+    def symbol_for_value(value, alphabet_size)
+      NormalDistCutPoints[alphabet_size].inject(0) do |symbol, cutpoint|
+        return symbol if cutpoint > value
+        symbol + 1
+      end
+    end
+  end
+  def setup_for_processing_data(data, mapper = nil)
+    @mapper ||= SymbolMapper.new(data)
+    unless @mapper.supports_alphabet_size?(@alphabet_size)
+      raise ArgumentError.new("Mapper does not support the alphabet size (#{@alphabet_size}): #{@mapper}")
+    end
+  end
+  def process_subsequence(subsequence)
+    normalized_ss = subsequence.z_normalize
+    len, rem = normalized_ss.length.divmod @elements_per_word
+    # Note that if the lengths are not evenly divisible the last word will be based on fewer elements.
+    # This is different than the orig SAX as specified in their paper.
+    symbols = (0...len).map do |wordindex|
+      @mapper.map_sequence_to_symbol(normalized_ss[wordindex * @elements_per_word, @elements_per_word], @alphabet_size)
+    end
+    symbols << @mapper.map_sequence_to_symbol(normalized_ss[len, @elements_per_word], @alphabet_size) if rem > 0
+    symbols
+  end
+  def process(data, windowSize = data.length, mapper = nil)
+    setup_for_processing_data(data, mapper)
+    res = (0..(data.length - windowSize)).map do |i|
+      process_subsequence(data[i, windowSize])
+    end
+    res = res.flatten if windowSize == data.length
+    res
+  end
+end
+end

data/lib/feldtruby/statistics.rb CHANGED Viewed

@@ -26,6 +26,15 @@ class RCommunicator
     @r.eval "if(!library(#{lib}, logical.return=TRUE)) {install.packages(\"#{lib}\"); library(#{lib});}"
   end
+  # Load R scripts in the feldtruby/R directory.
+  def load_feldtruby_r_script(scriptName, reload = false)
+    @loaded_scripts ||= Array.new # Ensure there is an empty array for loaded script names, if this is first call here.
+    return if reload == false && @loaded_scripts.include?(scriptName)
+    @loaded_scripts << scriptName
+    path = File.join(FeldtRuby::TopDirectory, "R", scriptName)
+    @r.eval "source(\"#{path}\")"
+  end
   def eval(str)
     @r.eval str
   end
@@ -103,6 +112,45 @@ module Statistics
     res = RC.call("chisq.test", vs)
     res.p_value
   end
+  class DiffusionKDE
+    attr_reader :densities, :mesh
+    # Given a R object with the four sub-values named densities, mesh, sum_density, mesh_interval, min, max
+    # we can calculate the probability of new values.
+    def initialize(rvalue)
+      @probabilities = rvalue.probabilities
+      @densities = rvalue.densities
+      @mesh = rvalue.mesh
+      @mesh_interval = rvalue.mesh_interval.to_f
+      @min, @max = rvalue.min.to_f, rvalue.max.to_f
+    end
+    def density_of(value)
+      return 0.0 if value < @min || value > @max
+      bin_index = ((value - @min) / @mesh_interval).floor
+      @densities[bin_index]
+    end
+    def probability_of(value)
+      return 0.0 if value < @min || value > @max
+      bin_index = ((value - @min) / @mesh_interval).floor
+      @probabilities[bin_index]
+    end
+  end
+  # Do a kernel density estimation based on the sampled _values_, with n bins (rounded up to the nearest power of 2)
+  # and optional min and max values (they are only used when both are given).
+  def density_estimation(values, n = 2**9, min = nil, max = nil)
+    # Ensure we have loaded the diffusion.kde code
+    RC.load_feldtruby_r_script("diffusion_kde.R")
+    args = [values, n]
+    if min && max
+      args << min
+      args << max
+    end
+    DiffusionKDE.new RC.call("diffusion.kde", *args)
+  end
 end
 # Make them available at top level

data/lib/feldtruby/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module FeldtRuby
-  VERSION = "0.3.6"
+  VERSION = "0.3.8"
 end

data/lib/feldtruby.rb CHANGED Viewed

@@ -4,4 +4,6 @@ if RUBY_VERSION < "1.9"
 end
 # This is the namespace under which we put things...
-module FeldtRuby; end
+module FeldtRuby
+  TopDirectory = File.dirname(__FILE__).split("/")[0...-1].join("/")
+end

data/test/test_array.rb CHANGED Viewed

@@ -100,4 +100,11 @@ describe "Array extensions" do
 			counts[5].must_equal 5
 		end
 	end
-end
+	describe "sample" do
+		it "only samples within the array" do
+			d = (1..100).to_a
+			100.times { d.include?(d.sample).must_equal(true) }
+		end
+	end
+end

data/test/test_array_basic_stats.rb CHANGED Viewed

@@ -60,6 +60,14 @@ class TestArrayBasicStats < MiniTest::Unit::TestCase
 	end
 end
+describe "mean and stdev" do
+	it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
+    data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
+    data.mean.must_be_close_to 4.606667
+    data.sd.must_be_close_to 2.640316
+  end
+end
 describe "Basic statistics" do
 	describe "sum of abs" do
 		it "works for simple example" do
@@ -131,4 +139,28 @@ describe "Basic statistics" do
 			[1,2,3,4].summary_stats.must_equal "2.500 (min = 1.0, max = 4.0, median = 2.5, stdev = 1.12)"
 		end
 	end
+	describe "quantile- and quartile-related functionality" do
+		it "can calc quantiles, quartiles and IQR for the set used as example for even-numbered sequence for quantiles on Wikipedia" do
+			seq = [3, 6, 7, 8, 8, 10, 13, 15, 16, 20]
+			seq.quartiles.must_equal [7.25, 9, 14.5]
+			seq.quantiles.must_equal [3, 7.25, 9, 14.5, 20]
+			seq.inter_quartile_range.must_equal (14.5-7.25)
+		end
+		it "can calc quantiles, quartiles and IQR for the set used as example for odd-numbered sequence for quantiles on Wikipedia" do
+			seq = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20]
+			seq.quartiles.must_equal [7.5, 9, 14]
+			seq.quantiles.must_equal [3, 7.5, 9, 14, 20]
+			seq.inter_quartile_range.must_equal 6.5
+		end
+		it "can calc quantiles, quartiles and IQR for the set used as example for quartiles on Wikipedia" do
+			seq = [6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]
+			seq.quartiles.must_equal [25.5, 40, 42.5]
+			seq.quantiles.must_equal [6, 25.5, 40.0, 42.5, 49]
+			seq.inter_quartile_range.must_equal 17.0
+		end
+	end
 end

data/test/test_fastmap.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'feldtruby/statistics/fastmap'
+require 'feldtruby/statistics/euclidean_distance'
+describe "Fastmap" do
+  it "works for simple data, and different values of k" do
+    d = [
+      [0, 0, 0, 0],
+      [1, 1, 1, 1],
+      [2, 2, 2, 2],
+      [3, 3, 3, 3]
+    ]
+    1.upto(d.first.length-1) do |k|
+      m = FeldtRuby.fastmap(d, FeldtRuby::EuclideanDistance.new, k)
+      m.depth.must_equal k
+      d.each do |datum|
+        c = m[datum]
+        c.length.must_equal k
+        c.must_equal m[datum]
+      end
+    end
+  end
+end

data/test/test_normalization.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require 'feldtruby/statistics/normalization'
+class Array
+  def must_be_close_to(other)
+    self.zip(other).map {|a,b| a.must_be_close_to(b)}
+  end
+end
+describe "Z normalization" do
+  it "handles empty arrays" do
+    [].z_normalize.must_equal []
+  end
+  it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
+    data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
+    expected = [-0.9796808, -0.8622706, -0.6123005, 0.8496459, 1.739691, 1.588194, 1.095829, 0.5277147, 0.4709033, -0.2865819, 0.0921607, -0.2865819, -0.9039323, -1.195564, -1.237226]
+    data.z_normalize.must_be_close_to expected
+  end
+  it "works for Time series 2 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
+    data = [0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]
+    expected = [-1.289433, -0.9992189, -0.5253246, -0.06612478, -0.2791935, 0.08816637, -0.06612478, 0.595123, 0.8926845, 0.8228861, 1.741286, 1.770675, -0.2791935, -1.197593, -1.208614]
+    data.z_normalize.must_be_close_to expected
+  end
+end
+describe "Min-Max normalization" do
+  it "handles empty arrays" do
+    [].min_max_normalize.must_equal []
+  end
+  it "works for example from http://wiki.answers.com/Q/What_is_min-max_normalization" do
+    data = [20, 24, 26, 27, 30]
+    data.min_max_normalize.must_be_close_to [0.0, 0.4, 0.6, 0.7, 1.0]
+  end
+end

data/test/test_sax.rb ADDED Viewed

@@ -0,0 +1,48 @@
+require 'feldtruby/statistics/time_series/sax'
+include FeldtRuby::Statistics
+describe 'Symbolic Adaptive approXimation - SAX' do
+  describe "The standard SAX SymbolMapper, that uses cut points based on Normal/Gaussian distribution" do
+    it "accepts alphabet sizes between 2 and 20" do
+      sm = SAX::SymbolMapper.new
+      sm.supports_alphabet_size?(-1).must_equal false
+      sm.supports_alphabet_size?(0).must_equal false
+      sm.supports_alphabet_size?(1).must_equal false
+      sm.supports_alphabet_size?(2).must_equal true
+      sm.supports_alphabet_size?(20).must_equal true
+      sm.supports_alphabet_size?(21).must_equal false
+    end
+    it "maps correctly to symbols for alphabet of size 2" do
+      sm = SAX::SymbolMapper.new
+      sm.symbol_for_value(-10, 2).must_equal 1
+      sm.symbol_for_value(-1, 2).must_equal 1
+      sm.symbol_for_value(1, 2).must_equal 2
+      sm.symbol_for_value(10, 2).must_equal 2
+    end
+    it "maps correctly to symbols for alphabet of size 4" do
+      sm = SAX::SymbolMapper.new
+      sm.symbol_for_value(-0.7, 4).must_equal 1
+      sm.symbol_for_value(-0.5, 4).must_equal 2
+      sm.symbol_for_value(-0.01, 4).must_equal 2
+      sm.symbol_for_value(0, 4).must_equal 3
+      sm.symbol_for_value(0.01, 4).must_equal 3
+      sm.symbol_for_value(0.5, 4).must_equal 3
+      sm.symbol_for_value(0.7, 4).must_equal 4
+      sm.symbol_for_value(17, 4).must_equal 4
+    end
+  end
+  it "does not accept alphabet sizes larger than 20 or smaller than 2" do
+    proc {SAX.new(10, 21)}.must_raise ArgumentError
+    proc {SAX.new(3, 1)}.must_raise ArgumentError
+  end
+  it "maps some simple time series to symbols when directly mapping" do
+    sax = SAX.new(1, 4)
+    sax.process([-1, 0, 1]).must_equal [1,3,4]
+    sax.process([-1, -0.5, 0, 0.5, 1]).must_equal [1,2,3,3,4]
+    sax.process([-1, -0.5, 0, 0.5, 1].reverse).must_equal [1,2,3,3,4].reverse
+  end
+end

data/test/test_statistics.rb CHANGED Viewed

@@ -69,6 +69,19 @@ describe "Statistics" do
       probability_of_same_proportions(([:a] * 570) + ([:b] * 430)).must_be_close_to 5.091e-10
     end
   end
+  describe "Diffusions Kernel Density Estimation based on R code loaded from the feldtruby R directory" do
+    it "works for simple examples" do
+      data = [1]
+      kde = density_estimation(data, 4, 0.0, 3.0)
+      kde.mesh.must_equal [0.0, 1.0, 2.0, 3.0]
+      kde.densities.length.must_equal 4
+      kde.densities[0].must_be_close_to 0.3912
+      kde.densities[1].must_be_close_to 0.3591
+      kde.densities[2].must_be_close_to 0.3101
+      kde.densities[3].must_be_close_to 0.2728
+    end
+  end
 end
 require 'feldtruby/minitest_extensions'

data/test/test_string_distance.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'feldtruby/statistics/distance/string_distance'
+include FeldtRuby::Statistics
+describe "ncd" do
+  it "gives no distance if the strings are the same" do
+    ncd("aaa", "aaa").must_equal 0.0
+  end
+  it "gives distance > 0.0 if strings are not the same" do
+    ncd("a", "b").must_be :>, 0.0
+    ncd("aa", "ab").must_be :>, 0.0
+  end
+end
+describe "cdm" do
+  it "gives no distance if the strings are the same" do
+    cdm("aaa", "aaa").must_equal 0.0
+  end
+  it "gives distance > 0.0 if strings are not the same" do
+    cdm("a", "b").must_be :>, 0.0
+    cdm("aa", "ab").must_be :>, 0.0
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: feldtruby
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 0.3.8
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-16 00:00:00.000000000 Z
+date: 2013-02-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rinruby
@@ -54,6 +54,7 @@ files:
 - Gemfile
 - Gemfile.lock
 - History.txt
+- R/diffusion_kde.R
 - README.md
 - Rakefile
 - TODO
@@ -77,6 +78,11 @@ files:
 - lib/feldtruby/optimize/search_space.rb
 - lib/feldtruby/optimize/stdout_logger.rb
 - lib/feldtruby/statistics.rb
+- lib/feldtruby/statistics/distance/string_distance.rb
+- lib/feldtruby/statistics/euclidean_distance.rb
+- lib/feldtruby/statistics/fastmap.rb
+- lib/feldtruby/statistics/normalization.rb
+- lib/feldtruby/statistics/time_series/sax.rb
 - lib/feldtruby/string/to_iso.rb
 - lib/feldtruby/time.rb
 - lib/feldtruby/vector.rb
@@ -87,15 +93,19 @@ files:
 - test/test_array.rb
 - test/test_array_basic_stats.rb
 - test/test_array_count_by.rb
+- test/test_fastmap.rb
 - test/test_float.rb
 - test/test_html_doc_getter.rb
+- test/test_normalization.rb
 - test/test_optimize.rb
 - test/test_optimize_differential_evolution.rb
 - test/test_optimize_objective.rb
 - test/test_optimize_populationbasedoptimizer.rb
 - test/test_optimize_random_search.rb
 - test/test_optimize_search_space.rb
+- test/test_sax.rb
 - test/test_statistics.rb
+- test/test_string_distance.rb
 - test/test_time.rb
 - test/test_vector.rb
 - test/test_word_counter.rb
@@ -119,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 1.8.25
 signing_key:
 specification_version: 3
 summary: Robert Feldt's Common Ruby Code lib
@@ -128,15 +138,19 @@ test_files:
 - test/test_array.rb
 - test/test_array_basic_stats.rb
 - test/test_array_count_by.rb
+- test/test_fastmap.rb
 - test/test_float.rb
 - test/test_html_doc_getter.rb
+- test/test_normalization.rb
 - test/test_optimize.rb
 - test/test_optimize_differential_evolution.rb
 - test/test_optimize_objective.rb
 - test/test_optimize_populationbasedoptimizer.rb
 - test/test_optimize_random_search.rb
 - test/test_optimize_search_space.rb
+- test/test_sax.rb
 - test/test_statistics.rb
+- test/test_string_distance.rb
 - test/test_time.rb
 - test/test_vector.rb
 - test/test_word_counter.rb