RubyGems - feldtruby - Versions diffs - 0.3.6 → 0.3.8 - Mend

feldtruby 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/R/diffusion_kde.R +116 -0
data/lib/feldtruby/array/basic_stats.rb +51 -0
data/lib/feldtruby/array.rb +4 -0
data/lib/feldtruby/statistics/distance/string_distance.rb +49 -0
data/lib/feldtruby/statistics/euclidean_distance.rb +14 -0
data/lib/feldtruby/statistics/fastmap.rb +106 -0
data/lib/feldtruby/statistics/normalization.rb +26 -0
data/lib/feldtruby/statistics/time_series/sax.rb +99 -0
data/lib/feldtruby/statistics.rb +48 -0
data/lib/feldtruby/version.rb +1 -1
data/lib/feldtruby.rb +3 -1
data/test/test_array.rb +8 -1
data/test/test_array_basic_stats.rb +32 -0
data/test/test_fastmap.rb +22 -0
data/test/test_normalization.rb +36 -0
data/test/test_sax.rb +48 -0
data/test/test_statistics.rb +13 -0
data/test/test_string_distance.rb +24 -0
metadata +17 -3

data/R/diffusion_kde.R ADDED Viewed

@@ -0,0 +1,116 @@
+# Gaussian KDE downloaded from
+#   http://www-etud.iro.umontreal.ca/~botev/kde.R
+# on February 9th 2013.
+# Our changes:
+#   - Renamed function from kde to diffusion.kde
+#   - Return an object instead of a matrix
+#   - Added the sum of the densities
+#   - Added the mesh interval
+#   - Added the probabilities for each interval
+# No license was specified on this piece of code. For questions about it please contact
+# Prof. Botev on the email: botev@maths.uq.edu.au
+diffusion.kde <- function(data,n,MIN,MAX){
+#       State-of-the-art gaussian kernel density estimator for one-dimensional data;
+#       The estimator does not use the commonly employed 'gaussian rule of thumb'.
+#       As a result it outperforms many plug-in methods on multimodal densities
+#       with widely separated modes (see example).
+# INPUTS:
+#     data    - a vector of data from which the density estimate is constructed;
+#          n  - the number of mesh points used in the uniform discretization of the
+#               interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
+#               n is rounded up to the next power of two; the default value of n is n=2^14;
+#   MIN, MAX  - defines the interval [MIN,MAX] on which the density estimate is constructed;
+#               the default values of MIN and MAX are:
+#               MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
+# OUTPUT:
+#       a list with the elements probabilities, densities, mesh, sum_density,
+#       mesh_interval, min and max; densities holds the density values on the mesh;
+# EXAMPLE:
+##Save this file in your directory as diffusion_kde.R and copy and paste the commands:
+# rm(list=ls())
+# source(file='diffusion_kde.R')
+# data=c(rnorm(10^3),rnorm(10^3)*2+30);
+# d=diffusion.kde(data)
+# plot(d$mesh,d$densities,type='l',xlab='x',ylab='density f(x)')
+# REFERENCE:
+# Z. I. Botev, J. F. Grotowski and D. P. Kroese
+# "Kernel Density Estimation Via Diffusion"
+# Annals of Statistics, 2010, Volume 38, Number 5, Pages 2916-2957
+# for questions email: botev@maths.uq.edu.au
+nargin=length(as.list(match.call()))-1;
+if (nargin<2) n=2^14
+n=2^ceiling(log2(n)); # round up n to the next power of 2;
+if (nargin<4)
+{# define the default interval [MIN,MAX]; used whenever fewer than four arguments are given
+minimum=min(data); maximum=max(data);
+Range=maximum-minimum;
+MIN=minimum-Range/10; MAX=maximum+Range/10;
+}
+# set up the grid over which the density estimate is computed;
+R=MAX-MIN; dx=R/n; xmesh=MIN+seq(0,R,dx); N=length(data);
+# if data has repeated observations use the N below
+# N=length(as.numeric(names(table(data))));
+# bin the data uniformly using the grid defined above;
+w=hist(data,xmesh,plot=FALSE);initial_data=(w$counts)/N;
+initial_data=initial_data/sum(initial_data);
+dct1d <- function(data){
+# computes the discrete cosine transform of the column vector data
+n= length(data);
+# Compute weights to multiply DFT coefficients
+weight = c(1,2*exp(-1i*(1:(n-1))*pi/(2*n)));
+# Re-order the elements of the columns of x
+data = c(data[seq(1,n-1,2)], data[seq(n,2,-2)]);
+# Multiply FFT by weights:
+data= Re(weight* fft(data));
+data}
+a=dct1d(initial_data); # discrete cosine transform of initial data
+# now compute the optimal bandwidth^2 using the referenced method
+I=(1:(n-1))^2; a2=(a[2:n]/2)^2;
+# use uniroot to solve the equation t=zeta*gamma^[l](t), with l=7 set in fixed_point
+fixed_point <-  function(t,N,I,a2){
+# this implements the function t-zeta*gamma^[l](t)
+l=7;
+f=2*(pi^(2*l))*sum((I^l)*a2*exp(-I*(pi^2)*t));
+for (s in (l-1):2){
+    K0=prod(seq(1,2*s-1,2))/sqrt(2*pi);  const=(1+(1/2)^(s+1/2))/3;
+    time=(2*const*K0/N/f)^(2/(3+2*s));
+    f=2*pi^(2*s)*sum(I^s*a2*exp(-I*pi^2*time));
+}
+out=t-(2*N*sqrt(pi)*f)^(-2/5);
+}
+t_star=tryCatch(uniroot(fixed_point,c(0,.1),N=N,I=I,a2=a2,tol=10^(-14))$root,error=function(e) .28*N^(-2/5));
+# smooth the discrete cosine transform of initial data using t_star
+a_t=a*exp(-(0:(n-1))^2*pi^2*t_star/2);
+# now apply the inverse discrete cosine transform
+idct1d <-  function(data){
+# computes the inverse discrete cosine transform
+n=length(data);
+# Compute weights
+weights = n*exp(1i*(0:(n-1))*pi/(2*n));
+# Compute x tilde using equation (5.93) in Jain
+data = Re(fft(weights*data,inverse=TRUE))/n;
+# Re-order elements of each column according to equations (5.93) and
+# (5.94) in Jain
+out = rep(0,n);
+out[seq(1,n,2)] = data[1:(n/2)];
+out[seq(2,n,2)] = data[n:(n/2+1)];
+out;
+}
+density=idct1d(a_t)/R;
+# take the rescaling of the data into account
+bandwidth=sqrt(t_star)*R;
+xmesh=seq(MIN,MAX,R/(n-1));
+# out=matrix(c(xmesh,density),nrow=2,byrow=TRUE);
+posd = density + abs(min(0.0, min(density))); # Shift so the least density is 0.0 before calculating probabilities
+list(probabilities = (posd / sum(posd)), densities = density, mesh = xmesh,
+  sum_density = sum(density), mesh_interval = (R/(n-1)), min = MIN, max = MAX)
+}

data/lib/feldtruby/array/basic_stats.rb CHANGED Viewed

@@ -21,6 +21,46 @@ module BasicStatistics
 		end
 	end
+	# Calculate the values that cuts the data into 0%, 25%, 50%, 75% and 100%.
+	# This corresponds to the min, 1st quartile, 2nd quartile, 3rd quartile and the max.
+	def quantiles
+		return [nil, nil, nil, nil, nil] if length == 0
+		sorted = self.sort
+		q1 = sorted.quantile_at_ratio(0.25)
+		q2 = sorted.quantile_at_ratio(0.50)
+		q3 = sorted.quantile_at_ratio(0.75)
+		return sorted.first, q1, q2, q3, sorted.last
+	end
+	# Calculate the quantile at a given ratio (must be between 0.0 and 1.0) assuming self
+	# is a sorted array. This is based on the type 7 quantile function in R.
+	def quantile_at_ratio(p)
+		n = self.length
+		h = (n - 1) * p + 1
+		hfloor = h.floor
+		if h == hfloor
+			self[hfloor-1]
+		else
+			x_hfloor = self[hfloor-1]
+			x_hfloor + (h - hfloor)*(self[hfloor] - x_hfloor)
+		end
+	end
+	# Calculate the three quartiles of the array.
+	def quartiles
+		return [nil, nil, nil] if length == 0
+		sorted = self.sort
+		q1 = sorted.quantile_at_ratio(0.25)
+		q2 = sorted.quantile_at_ratio(0.50)
+		q3 = sorted.quantile_at_ratio(0.75)
+		return q1, q2, q3
+	end
+	def inter_quartile_range
+		q1, q2, q3 = quartiles
+		q3 - q1
+	end
 	def variance
 		return 0 if self.length == 0
 		avg = self.mean
@@ -31,6 +71,17 @@ module BasicStatistics
 		Math.sqrt( self.variance )
 	end
+	# Same as R's var, i.e. uses N-1 in denominator.
+	def var
+		n = self.length.to_f
+		(variance * n) / (n-1)
+	end
+	# Save as R's sd, i.e. uses N-1 in denominator.
+	def sd
+		Math.sqrt( self.var )
+	end
 	def root_mean_square
 		Math.sqrt( self.map {|v| v**2}.mean )
 	end

data/lib/feldtruby/array.rb CHANGED Viewed

@@ -44,4 +44,8 @@ class Array
 		self.each {|element| count_hash[element] += 1}
 		count_hash
 	end
+	def sample
+		self[rand(self.length)]
+	end
 end

data/lib/feldtruby/statistics/distance/string_distance.rb ADDED Viewed

@@ -0,0 +1,49 @@
+require 'zlib'
+module FeldtRuby::Statistics
+class StringDistance
+  def compress(s)
+    Zlib::Deflate.deflate(s, 9)
+  end
+  def compressed_length(s)
+    compress(s).length
+  end
+  def distance(string1, string2)
+    raise NotImplementedError
+  end
+end
+# Cilibrasi and Vitanyi's NCD.
+class NormalizedCompressionDistance < StringDistance
+  def distance(string1, string2)
+    return 0.0 if string1 == string2
+    c1 = compressed_length(string1)
+    c2 = compressed_length(string2)
+    c_1_2 = compressed_length(string1 + string2)
+    (c_1_2 - [c1, c2].min).to_f / ([c1, c2].max)
+  end
+end
+def ncd(string1, string2)
+  (@ncd ||= NormalizedCompressionDistance.new).distance(string1, string2)
+end
+# Keogh et al's CDM.
+class CompressionBasedDissimilarityMeasure < StringDistance
+  def distance(string1, string2)
+    return 0.0 if string1 == string2
+    c1 = compressed_length(string1)
+    c2 = compressed_length(string2)
+    c_1_2 = compressed_length(string1 + string2)
+    c_1_2.to_f / (c1 + c2)
+  end
+end
+def cdm(string1, string2)
+  (@cdm ||= CompressionBasedDissimilarityMeasure.new).distance(string1, string2)
+end
+end

data/lib/feldtruby/statistics/euclidean_distance.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module FeldtRuby
+class EuclideanDistance
+  def calc(o1, o2)
+    sum = 0.0
+    o1.length.times do |i|
+      d = (o1[i] - o2[i])
+      sum += (d*d)
+    end
+    Math.sqrt(sum)
+  end
+end
+end

data/lib/feldtruby/statistics/fastmap.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require 'feldtruby/array'
+module FeldtRuby
+class FastMap
+  # A PivotNode has two pivot objects, a map from each object to its
+  # coordinate on the line for these pivots, a distance function and
+  # a child pointing to the next dimension.
+  # It maps a multi-variate object to a k-dimensional coordinate.
+  class PivotNode
+    attr_writer :map, :child
+    def initialize(distance, pivot1, pivot2, map = nil, child = nil)
+      @distance, @pivot1, @pivot2, @map, @child = distance, pivot1, pivot2, map, child
+      @d_1_2 = distance.calc(pivot1, pivot2)
+      @d_1_2_squared, @d_1_2_doubled = @d_1_2 * @d_1_2, 2 * @d_1_2
+    end
+    # The number of coordinates that will be returned for an object.
+    def k; depth; end
+    def depth
+      @depth ||= 1 + (@child ? @child.depth : 0)
+    end
+    # Map an object to its coordinate in the dimension represented by this node.
+    def fastmap_coordinate(o)
+      ( @distance.calc(o, @pivot1) + @d_1_2_squared - @distance.calc(o, @pivot2) ) / @d_1_2_doubled
+    end
+    def coordinate(o)
+      [map_object_to_coordinate(o)] + (@child ? @child.coordinate(o) : [])
+    end
+    def [](object)
+      coordinate(object)
+    end
+    def map_object_to_coordinate(o)
+      @map[o] || fastmap_coordinate(o)
+    end
+  end
+  def initialize(distance, k = 2, choiceDepth = 1)
+    @distance, @k, @choice_depth = distance, k, choiceDepth
+  end
+  def run(objects)
+    @objects = objects
+    create_map(@k, @distance)
+  end
+  def create_map(k, distance)
+    return nil if k == 0
+    o1, o2 = choose_distant_objects(@objects, @distance)
+    node = PivotNode.new(distance, o1, o2)
+    coordinate_map = {}
+    if distance.calc(o1, o2) == 0.0
+      @objects.each {|o| coordinate_map[o] = 0.0}
+    else
+      @objects.each {|o| coordinate_map[o] = node.fastmap_coordinate(o)}
+    end
+    node.map = coordinate_map
+    node.child = create_map k-1, next_distance(distance, o1, o2, coordinate_map)
+    node
+  end
+  def choose_distant_objects(objects, distance)
+    o1 = nil
+    o2 = objects.sample
+    # Not sure if there is any benefit to doing this more than once. Test later.
+    @choice_depth.times do
+      o1 = find_most_distant_object(objects, o2, distance)
+      o2 = find_most_distant_object(objects, o1, distance)
+    end
+    return o1, o2
+  end
+  # Find the object in objects that is farthest from o, given a distance function.
+  def find_most_distant_object(objects, o, distance)
+    objects.sort_by {|oi| distance.calc(oi, o)}.last
+  end
+  class DistanceFunction
+    def initialize(&func)
+      @func = func
+    end
+    def calc(o1, o2)
+      @func.call(o1, o2)
+    end
+  end
+  # Create the next distance function from a given distance func.
+  def next_distance(distance, o1, o2, coordinates)
+    DistanceFunction.new do |oi, oj|
+      Math.sqrt( distance.calc(oi, oj)**2 - (coordinates[oi] - coordinates[oj])**2 )
+    end
+  end
+end
+# Recursively map n-dimensional objects (given as an Array) into a k-dimensional
+# space while preserving the distances between the objects as well as possible.
+def self.fastmap(objects, distance, k = 2)
+  FastMap.new(distance, k).run(objects)
+end
+end

data/lib/feldtruby/statistics/normalization.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'feldtruby/array/basic_stats'
+# The normalization methods assume the existence of basic statistics
+# on the class they are included in:
+#   z_normalize: requires mean and sd
+module FeldtRuby::Normalization
+  def normalize(&transform)
+    self.map {|v| transform.call(v)}
+  end
+  def z_normalize
+    mean, stdev = self.mean, self.sd
+    self.map {|e| (e-mean)/stdev}
+  end
+  def min_max_normalize
+    return [] if self.length == 0
+    min = self.min.to_f
+    range = self.max - min
+    self.map {|e| (e-min)/range}
+  end
+end
+class Array
+  include FeldtRuby::Normalization
+end

data/lib/feldtruby/statistics/time_series/sax.rb ADDED Viewed

@@ -0,0 +1,99 @@
+require 'feldtruby/statistics/normalization'
+# Implements the basic SAX (Symbolic Aggregate approXimation) from the paper:
+#  Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu,
+#  "A Symbolic Representation of Time Series, with Implications for Streaming Algorithms", IDMKD 2003.
+# available from: http://www.cs.ucr.edu/~eamonn/SAX.pdf
+module FeldtRuby::Statistics
+# A SAX processor transforms any numeric stream of data (often a time series)
+# of arbitrary length n to a string (symbolic stream) of arbitrary length w,
+# where w<n, and typically w<<n. The alphabet size (symbols in the string) is
+# also an arbitrary integer _a_, a>2. Compared to the SAX described by Keogh et
+# al we state the number of data elements, _elementsPerWord_, that should go
+# into each word, i.e. w = n/elementsPerWord.
+# This allows for many powerful data mining algorithms to be applied and sped up.
+class SAX
+  # Create a SAX processor with given output length _w_ and alphabet size _a_.
+  def initialize(elementsPerWord, alphabetSize = 6)
+    raise ArgumentError if alphabetSize > 20 || alphabetSize < 2
+    @elements_per_word, @alphabet_size = elementsPerWord, alphabetSize
+  end
+  # A mapper maps the values in a subsequence into a symbol. The standard
+  # mapper is state-less and normalizes each subsequence and then assumes
+  # a normal distribution and thus uses a fixed selection of bins.
+  class SymbolMapper
+    def initialize(data = nil)
+      # This standard mapper does not utilize the whole data sequence to precalc mapping values. But subclasses might.
+    end
+    # Cut points based on a Normal/Gaussian distribution...
+    NormalDistCutPoints = {
+        2 => [-Float::INFINITY, 0.00],
+        3 => [-Float::INFINITY, -0.43, 0.43],
+        4 => [-Float::INFINITY, -0.67, 0.00, 0.67],
+        5 => [-Float::INFINITY, -0.84, -0.25, 0.25, 0.84],
+        6 => [-Float::INFINITY, -0.97, -0.43, 0.00, 0.43, 0.97],
+        7 => [-Float::INFINITY, -1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
+        8 => [-Float::INFINITY, -1.15, -0.67, -0.32, 0.00, 0.32, 0.67, 1.15],
+        9 => [-Float::INFINITY, -1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
+        10 => [-Float::INFINITY, -1.28, -0.84, -0.52, -0.25, 0.00, 0.25, 0.52, 0.84, 1.28],
+        11 => [-Float::INFINITY, -1.34, -0.91, -0.60, -0.35, -0.11, 0.11, 0.35, 0.60, 0.91, 1.34],
+        12 => [-Float::INFINITY, -1.38, -0.97, -0.67, -0.43, -0.21, 0.00, 0.21, 0.43, 0.67, 0.97, 1.38],
+        13 => [-Float::INFINITY, -1.43, -1.02, -0.74, -0.50, -0.29, -0.10, 0.10, 0.29, 0.50, 0.74, 1.02, 1.43],
+        14 => [-Float::INFINITY, -1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0.00, 0.18, 0.37, 0.57, 0.79, 1.07, 1.47],
+        15 => [-Float::INFINITY, -1.5 , -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08, 0.25, 0.43, 0.62, 0.84, 1.11, 1.50],
+        16 => [-Float::INFINITY, -1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0.00, 0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
+        17 => [-Float::INFINITY, -1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22, -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19, 1.56],
+        18 => [-Float::INFINITY, -1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28, -0.14, 0.00, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22, 1.59],
+        19 => [-Float::INFINITY, -1.62, -1.25, -1.00, -0.80, -0.63, -0.48, -0.34, -0.20, -0.07, 0.07, 0.20, 0.34, 0.48, 0.63, 0.80, 1.0, 1.25, 1.62],
+        20 => [-Float::INFINITY, -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0.00, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64]
+    }
+    def supports_alphabet_size?(size)
+      NormalDistCutPoints.keys.include? size
+    end
+    def map_sequence_to_symbol(sequence, alphabet_size)
+      symbol_for_value(sequence.mean, alphabet_size)
+    end
+    def symbol_for_value(value, alphabet_size)
+      NormalDistCutPoints[alphabet_size].inject(0) do |symbol, cutpoint|
+        return symbol if cutpoint > value
+        symbol + 1
+      end
+    end
+  end
+  def setup_for_processing_data(data, mapper = nil)
+    @mapper ||= SymbolMapper.new(data)
+    unless @mapper.supports_alphabet_size?(@alphabet_size)
+      raise ArgumentError.new("Mapper does not support the alphabet size (#{@alphabet_size}): #{@mapper}")
+    end
+  end
+  def process_subsequence(subsequence)
+    normalized_ss = subsequence.z_normalize
+    len, rem = normalized_ss.length.divmod @elements_per_word
+    # Note that if the lengths are not evenly divisible the last word will be based on fewer elements.
+    # This is different than the orig SAX as specified in their paper.
+    symbols = (0...len).map do |wordindex|
+      @mapper.map_sequence_to_symbol(normalized_ss[wordindex * @elements_per_word, @elements_per_word], @alphabet_size)
+    end
+    symbols << @mapper.map_sequence_to_symbol(normalized_ss[len, @elements_per_word], @alphabet_size) if rem > 0
+    symbols
+  end
+  def process(data, windowSize = data.length, mapper = nil)
+    setup_for_processing_data(data, mapper)
+    res = (0..(data.length - windowSize)).map do |i|
+      process_subsequence(data[i, windowSize])
+    end
+    res = res.flatten if windowSize == data.length
+    res
+  end
+end
+end

data/lib/feldtruby/statistics.rb CHANGED Viewed

@@ -26,6 +26,15 @@ class RCommunicator
     @r.eval "if(!library(#{lib}, logical.return=TRUE)) {install.packages(\"#{lib}\"); library(#{lib});}"
   end
+  # Load R scripts in the feldtruby/R directory.
+  def load_feldtruby_r_script(scriptName, reload = false)
+    @loaded_scripts ||= Array.new # Ensure there is an empty array for loaded script names, if this is first call here.
+    return if reload == false && @loaded_scripts.include?(scriptName)
+    @loaded_scripts << scriptName
+    path = File.join(FeldtRuby::TopDirectory, "R", scriptName)
+    @r.eval "source(\"#{path}\")"
+  end
   def eval(str)
     @r.eval str
   end
@@ -103,6 +112,45 @@ module Statistics
     res = RC.call("chisq.test", vs)
     res.p_value
   end
+  class DiffusionKDE
+    attr_reader :densities, :mesh
+    # Given a R object with the four sub-values named densities, mesh, sum_density, mesh_interval, min, max
+    # we can calculate the probability of new values.
+    def initialize(rvalue)
+      @probabilities = rvalue.probabilities
+      @densities = rvalue.densities
+      @mesh = rvalue.mesh
+      @mesh_interval = rvalue.mesh_interval.to_f
+      @min, @max = rvalue.min.to_f, rvalue.max.to_f
+    end
+    def density_of(value)
+      return 0.0 if value < @min || value > @max
+      bin_index = ((value - @min) / @mesh_interval).floor
+      @densities[bin_index]
+    end
+    def probability_of(value)
+      return 0.0 if value < @min || value > @max
+      bin_index = ((value - @min) / @mesh_interval).floor
+      @probabilities[bin_index]
+    end
+  end
+  # Do a kernel density estimation based on the sampled _values_, with n bins (rounded up to nearest exponent of 2)
+  # and optional min and max values.
+  def density_estimation(values, n = 2**9, min = nil, max = nil)
+    # Ensure we have loaded the diffusion.kde code
+    RC.load_feldtruby_r_script("diffusion_kde.R")
+    args = [values, n]
+    if min && max
+      args << min
+      args << max
+    end
+    DiffusionKDE.new RC.call("diffusion.kde", *args)
+  end
 end
 # Make them available at top level

data/lib/feldtruby/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module FeldtRuby
-  VERSION = "0.3.6"
+  VERSION = "0.3.8"
 end

data/lib/feldtruby.rb CHANGED Viewed

@@ -4,4 +4,6 @@ if RUBY_VERSION < "1.9"
 end
 # This is the namespace under which we put things...
-module FeldtRuby; end
+module FeldtRuby
+  TopDirectory = File.dirname(__FILE__).split("/")[0...-1].join("/")
+end

data/test/test_array.rb CHANGED Viewed

@@ -100,4 +100,11 @@ describe "Array extensions" do
 			counts[5].must_equal 5
 		end
 	end
-end
+	describe "sample" do
+		it "only samples within the array" do
+			d = (1..100).to_a
+			100.times { d.include?(d.sample).must_equal(true) }
+		end
+	end
+end

data/test/test_array_basic_stats.rb CHANGED Viewed

@@ -60,6 +60,14 @@ class TestArrayBasicStats < MiniTest::Unit::TestCase
 	end
 end
+describe "mean and stdev" do
+	it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
+    data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
+    data.mean.must_be_close_to 4.606667
+    data.sd.must_be_close_to 2.640316
+  end
+end
 describe "Basic statistics" do
 	describe "sum of abs" do
 		it "works for simple example" do
@@ -131,4 +139,28 @@ describe "Basic statistics" do
 			[1,2,3,4].summary_stats.must_equal "2.500 (min = 1.0, max = 4.0, median = 2.5, stdev = 1.12)"
 		end
 	end
+	describe "quantile- and quartile-related functionality" do
+		it "can calc quantiles, quartiles and IQR for the set used as example for even-numbered sequence for quantiles on Wikipedia" do
+			seq = [3, 6, 7, 8, 8, 10, 13, 15, 16, 20]
+			seq.quartiles.must_equal [7.25, 9, 14.5]
+			seq.quantiles.must_equal [3, 7.25, 9, 14.5, 20]
+			seq.inter_quartile_range.must_equal (14.5-7.25)
+		end
+		it "can calc quantiles, quartiles and IQR for the set used as example for odd-numbered sequence for quantiles on Wikipedia" do
+			seq = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20]
+			seq.quartiles.must_equal [7.5, 9, 14]
+			seq.quantiles.must_equal [3, 7.5, 9, 14, 20]
+			seq.inter_quartile_range.must_equal 6.5
+		end
+		it "can calc quantiles, quartiles and IQR for the set used as example for quartiles on Wikipedia" do
+			seq = [6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]
+			seq.quartiles.must_equal [25.5, 40, 42.5]
+			seq.quantiles.must_equal [6, 25.5, 40.0, 42.5, 49]
+			seq.inter_quartile_range.must_equal 17.0
+		end
+	end
 end

data/test/test_fastmap.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'feldtruby/statistics/fastmap'
+require 'feldtruby/statistics/euclidean_distance'
+describe "Fastmap" do
+  it "works for simple data, and different values of k" do
+    d = [
+      [0, 0, 0, 0],
+      [1, 1, 1, 1],
+      [2, 2, 2, 2],
+      [3, 3, 3, 3]
+    ]
+    1.upto(d.first.length-1) do |k|
+      m = FeldtRuby.fastmap(d, FeldtRuby::EuclideanDistance.new, k)
+      m.depth.must_equal k
+      d.each do |datum|
+        c = m[datum]
+        c.length.must_equal k
+        c.must_equal m[datum]
+      end
+    end
+  end
+end

data/test/test_normalization.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require 'feldtruby/statistics/normalization'
+class Array
+  def must_be_close_to(other)
+    self.zip(other).map {|a,b| a.must_be_close_to(b)}
+  end
+end
+describe "Z normalization" do
+  it "handles empty arrays" do
+    [].z_normalize.must_equal []
+  end
+  it "works for Time series 1 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
+    data = [2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34]
+    expected = [-0.9796808, -0.8622706, -0.6123005, 0.8496459, 1.739691, 1.588194, 1.095829, 0.5277147, 0.4709033, -0.2865819, 0.0921607, -0.2865819, -0.9039323, -1.195564, -1.237226]
+    data.z_normalize.must_be_close_to expected
+  end
+  it "works for Time series 2 from http://code.google.com/p/jmotif/wiki/ZNormalization" do
+    data = [0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]
+    expected = [-1.289433, -0.9992189, -0.5253246, -0.06612478, -0.2791935, 0.08816637, -0.06612478, 0.595123, 0.8926845, 0.8228861, 1.741286, 1.770675, -0.2791935, -1.197593, -1.208614]
+    data.z_normalize.must_be_close_to expected
+  end
+end
+describe "Min-Max normalization" do
+  it "handles empty arrays" do
+    [].min_max_normalize.must_equal []
+  end
+  it "works for example from http://wiki.answers.com/Q/What_is_min-max_normalization" do
+    data = [20, 24, 26, 27, 30]
+    data.min_max_normalize.must_be_close_to [0.0, 0.4, 0.6, 0.7, 1.0]
+  end
+end

data/test/test_sax.rb ADDED Viewed

@@ -0,0 +1,48 @@
+require 'feldtruby/statistics/time_series/sax'
+include FeldtRuby::Statistics
+describe 'Symbolic Adaptive approXimation - SAX' do
+  describe "The standard SAX SymbolMapper, that uses cut points based on Normal/Gaussian distribution" do
+    it "accepts alphabet sizes between 2 and 20" do
+      sm = SAX::SymbolMapper.new
+      sm.supports_alphabet_size?(-1).must_equal false
+      sm.supports_alphabet_size?(0).must_equal false
+      sm.supports_alphabet_size?(1).must_equal false
+      sm.supports_alphabet_size?(2).must_equal true
+      sm.supports_alphabet_size?(20).must_equal true
+      sm.supports_alphabet_size?(21).must_equal false
+    end
+    it "maps correctly to symbols for alphabet of size 2" do
+      sm = SAX::SymbolMapper.new
+      sm.symbol_for_value(-10, 2).must_equal 1
+      sm.symbol_for_value(-1, 2).must_equal 1
+      sm.symbol_for_value(1, 2).must_equal 2
+      sm.symbol_for_value(10, 2).must_equal 2
+    end
+    it "maps correctly to symbols for alphabet of size 4" do
+      sm = SAX::SymbolMapper.new
+      sm.symbol_for_value(-0.7, 4).must_equal 1
+      sm.symbol_for_value(-0.5, 4).must_equal 2
+      sm.symbol_for_value(-0.01, 4).must_equal 2
+      sm.symbol_for_value(0, 4).must_equal 3
+      sm.symbol_for_value(0.01, 4).must_equal 3
+      sm.symbol_for_value(0.5, 4).must_equal 3
+      sm.symbol_for_value(0.7, 4).must_equal 4
+      sm.symbol_for_value(17, 4).must_equal 4
+    end
+  end
+  it "does not accept alphabet sizes larger than 20 or smaller than 2" do
+    proc {SAX.new(10, 21)}.must_raise ArgumentError
+    proc {SAX.new(3, 1)}.must_raise ArgumentError
+  end
+  it "maps some simple time series to symbols when directly mapping" do
+    sax = SAX.new(1, 4)
+    sax.process([-1, 0, 1]).must_equal [1,3,4]
+    sax.process([-1, -0.5, 0, 0.5, 1]).must_equal [1,2,3,3,4]
+    sax.process([-1, -0.5, 0, 0.5, 1].reverse).must_equal [1,2,3,3,4].reverse
+  end
+end

data/test/test_statistics.rb CHANGED Viewed

@@ -69,6 +69,19 @@ describe "Statistics" do
       probability_of_same_proportions(([:a] * 570) + ([:b] * 430)).must_be_close_to 5.091e-10
     end
   end
+  describe "Diffusions Kernel Density Estimation based on R code loaded from the feldtruby R directory" do
+    it "works for simple examples" do
+      data = [1]
+      kde = density_estimation(data, 4, 0.0, 3.0)
+      kde.mesh.must_equal [0.0, 1.0, 2.0, 3.0]
+      kde.densities.length.must_equal 4
+      kde.densities[0].must_be_close_to 0.3912
+      kde.densities[1].must_be_close_to 0.3591
+      kde.densities[2].must_be_close_to 0.3101
+      kde.densities[3].must_be_close_to 0.2728
+    end
+  end
 end
 require 'feldtruby/minitest_extensions'

data/test/test_string_distance.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'feldtruby/statistics/distance/string_distance'
+include FeldtRuby::Statistics
+describe "ncd" do
+  it "gives no distance if the strings are the same" do
+    ncd("aaa", "aaa").must_equal 0.0
+  end
+  it "gives distance > 0.0 if strings are not the same" do
+    ncd("a", "b").must_be :>, 0.0
+    ncd("aa", "ab").must_be :>, 0.0
+  end
+end
+describe "cdm" do
+  it "gives no distance if the strings are the same" do
+    cdm("aaa", "aaa").must_equal 0.0
+  end
+  it "gives distance > 0.0 if strings are not the same" do
+    cdm("a", "b").must_be :>, 0.0
+    cdm("aa", "ab").must_be :>, 0.0
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: feldtruby
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 0.3.8
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-16 00:00:00.000000000 Z
+date: 2013-02-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rinruby
@@ -54,6 +54,7 @@ files:
 - Gemfile
 - Gemfile.lock
 - History.txt
+- R/diffusion_kde.R
 - README.md
 - Rakefile
 - TODO
@@ -77,6 +78,11 @@ files:
 - lib/feldtruby/optimize/search_space.rb
 - lib/feldtruby/optimize/stdout_logger.rb
 - lib/feldtruby/statistics.rb
+- lib/feldtruby/statistics/distance/string_distance.rb
+- lib/feldtruby/statistics/euclidean_distance.rb
+- lib/feldtruby/statistics/fastmap.rb
+- lib/feldtruby/statistics/normalization.rb
+- lib/feldtruby/statistics/time_series/sax.rb
 - lib/feldtruby/string/to_iso.rb
 - lib/feldtruby/time.rb
 - lib/feldtruby/vector.rb
@@ -87,15 +93,19 @@ files:
 - test/test_array.rb
 - test/test_array_basic_stats.rb
 - test/test_array_count_by.rb
+- test/test_fastmap.rb
 - test/test_float.rb
 - test/test_html_doc_getter.rb
+- test/test_normalization.rb
 - test/test_optimize.rb
 - test/test_optimize_differential_evolution.rb
 - test/test_optimize_objective.rb
 - test/test_optimize_populationbasedoptimizer.rb
 - test/test_optimize_random_search.rb
 - test/test_optimize_search_space.rb
+- test/test_sax.rb
 - test/test_statistics.rb
+- test/test_string_distance.rb
 - test/test_time.rb
 - test/test_vector.rb
 - test/test_word_counter.rb
@@ -119,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 1.8.25
 signing_key:
 specification_version: 3
 summary: Robert Feldt's Common Ruby Code lib
@@ -128,15 +138,19 @@ test_files:
 - test/test_array.rb
 - test/test_array_basic_stats.rb
 - test/test_array_count_by.rb
+- test/test_fastmap.rb
 - test/test_float.rb
 - test/test_html_doc_getter.rb
+- test/test_normalization.rb
 - test/test_optimize.rb
 - test/test_optimize_differential_evolution.rb
 - test/test_optimize_objective.rb
 - test/test_optimize_populationbasedoptimizer.rb
 - test/test_optimize_random_search.rb
 - test/test_optimize_search_space.rb
+- test/test_sax.rb
 - test/test_statistics.rb
+- test/test_string_distance.rb
 - test/test_time.rb
 - test/test_vector.rb
 - test/test_word_counter.rb