statsample-timeseries 0.0.2

@@ -0,0 +1,148 @@
+ require 'statsample-timeseries/arima/likelihood'
+ module Statsample
+ module TimeSeries
+ module Arima
+
+ class KalmanFilter
+ include Statsample::TimeSeries
+ include GSL::MultiMin
+
+ #Timeseries object
+ attr_accessor :ts
+ #Autoregressive order
+ attr_accessor :p
+ #Integrated part order
+ attr_accessor :i
+ #Moving average order
+ attr_accessor :q
+
+ # Autoregressive coefficients
+ attr_reader :ar
+ # Moving average coefficients
+ attr_reader :ma
+
+ #Creates a new KalmanFilter object and computes the likelihood
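+ #==Usage (illustrative; estimates depend on the data)
+ #  series = (1..100).map { rand }.to_ts
+ #  kf = Statsample::TimeSeries::Arima::KalmanFilter.new(series, 1, 0, 1)
+ #  kf.ar #=> estimated AR coefficients
+ #  kf.ma #=> estimated MA coefficients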
+ def initialize(ts=[].to_ts, p=0, i=0, q=0)
+ @ts = ts
+ @p = p
+ @i = i
+ @q = q
+ ks #call the filter
+ end
+
+ def to_s
+ sprintf("ARIMA model(p = %d, i = %d, q = %d) on series(%d elements) - [%s]",
+ @p, @i, @q, @ts.size, @ts.to_a.join(','))
+ end
+
+ # = Kalman Filter
+ # Iteratively minimizes the negative log likelihood (Arima::KF::LogLikelihood) over the
+ # AR and MA coefficients using GSL's Nelder-Mead simplex, starting from zero-valued initial parameters.
+ # == Usage
+ # @s = [-1.16025577,0.64758021,0.77158601,0.14989543,2.31358162,3.49213868,1.14826956,0.58169457,-0.30813868,-0.34741084,-1.41175595,0.06040081, -0.78230232,0.86734837,0.95015787,-0.49781397,0.53247330,1.56495187,0.30936619,0.09750217,1.09698829,-0.81315490,-0.79425607,-0.64568547,-1.06460320,1.24647894,0.66695937,1.50284551,1.17631218,1.64082872,1.61462736,0.06443761,-0.17583741,0.83918339,0.46610988,-0.54915270,-0.56417108,-1.27696654,0.89460084,1.49970338,0.24520493,0.26249138,-1.33744834,-0.57725961,1.55819543,1.62143157,0.44421891,-0.74000084 ,0.57866347,3.51189333,2.39135077,1.73046244,1.81783890,0.21454040,0.43520890,-1.42443856,-2.72124685,-2.51313877,-1.20243091,-1.44268002 ,-0.16777305,0.05780661,2.03533992,0.39187242,0.54987983,0.57865693,-0.96592469,-0.93278473,-0.75962671,-0.63216906,1.06776183, 0.17476059 ,0.06635860,0.94906227,2.44498583,-1.04990407,-0.88440073,-1.99838258,-1.12955558,-0.62654882,-1.36589161,-2.67456821,-0.97187696, -0.84431782 ,-0.10051809,0.54239549,1.34622861,1.25598105,0.19707759,3.29286114,3.52423499,1.69146333,-0.10150024,0.45222903,-0.01730516, -0.49828727, -1.18484684,-1.09531773,-1.17190808,0.30207662].to_ts
+ # @kf = Statsample::TimeSeries::ARIMA.ks(@s, 1, 0, 0)
+ # #=> ks is implicitly called in the above operation
+ # @kf.ar
+ # #=> AR coefficients
+ def ks
+ initial = Array.new((@p+@q), 0.0)
+
+ my_f = Proc.new{ |x, params|
+ #In rb-gsl the params stay fixed while x is varied by the minimizer
+ #(in the reference R code the initial parameters are varied on each iteration)
+ #my_func.set_params([(1..100).to_a.to_ts, p_value, q_value])
+ timeseries = params[0]
+ p, q = params[1], params[2]
+ params = x
+ #puts x
+ #negative log likelihood, since the simplex routine minimizes
+ -Arima::KF::LogLikelihood.new(x.to_a, timeseries, p, q).ll
+ #KalmanFilter.ll(x.to_a, timeseries, p, q)
+ }
+ np = @p + @q
+ my_func = Function.alloc(my_f, np)
+ my_func.set_params([@ts, @p, @q])
+ x = GSL::Vector.alloc(initial)
+ ss = GSL::Vector.alloc(np)
+ ss.set_all(0.1)
+
+ minimizer = FMinimizer.alloc("nmsimplex", np)
+ minimizer.set(my_func, x, ss)
+ status = GSL::CONTINUE
+ iter = 0
+ while status == GSL::CONTINUE && iter < 100
+ iter += 1
+ begin
+ status = minimizer.iterate()
+ status = minimizer.test_size(1e-2)
+ x = minimizer.x
+ rescue
+ break
+ end
+ # printf("%5d ", iter)
+ # for i in 0...np do
+ # puts "#{x[i]}.to_f"
+ # #printf("%10.3e ", x[i].to_f)
+ # end
+ # printf("f() = %7.3f size = %.3f\n", minimizer.fval, minimizer.size)
+ end
+
+ @ar = (@p > 0) ? x.to_a[0...@p] : []
+ @ma = (@q > 0) ? x.to_a[@p...(@p+@q)] : []
+ x.to_a
+ end
+
+
+ #=Log Likelihood
+ #Computes the log likelihood for the given parameters, ARMA order and timeseries
+ #==Parameters
+ #* *params*: array of floats, contains phi/theta parameters
+ #* *timeseries*: timeseries object
+ #* *p*: integer, AR(p) order
+ #* *q*: integer, MA(q) order
+ #==Returns
+ #LogLikelihood object
+ #==Usage
+ # s = (1..100).map { rand }.to_ts
+ # p, q = 1, 0
+ # ll = KalmanFilter.log_likelihood([0.2], s, p, q)
+ # ll.log_likelihood
+ # #=> -22.66
+ # ll.sigma
+ # #=> 0.232
+ def self.log_likelihood(params, timeseries, p, q)
+ Arima::KF::LogLikelihood.new(params, timeseries, p, q)
+ end
+
+ #=T
+ #The coefficient matrix for the state vector in the state equation.
+ # Its dimensions are (r+k) x (r+k).
+ #==Parameters
+ #* *r*: integer, r is max(p, q+1), where p and q are the AR and MA orders respectively
+ #* *k*: integer, number of exogenous variables in the ARMA model
+ #* *p*: integer, order of the AR part of the ARMA model
+
+ #==References Statsmodels tsa, Durbin and Koopman Section 4.7
+ #def self.T(r, k, p)
+ # arr = Matrix.zero(r)
+ # params_padded = Statsample::Vector.new(Array.new(r, 0), :scale)
+ #
+ # params_padded[0...p] = params[k...(p+k)]
+ # intermediate_matrix = (r-1).times.map { Array.new(r, 0) }
+ # #appending an array filled with padded values in the beginning
+ # intermediate_matrix[0,0] = [params_padded]
+ #
+ # #now generating a column matrix for that:
+ # arr = Matrix.columns(intermediate_matrix)
+ # arr_00 = arr[0,0]
+ #
+ # #identity matrix substitution in the matrix, except row[0] and column[0]
+ # r.times do |i|
+ # arr[r,r] = 1
+ # end
+ # arr[0,0] = arr_00
+ # arr
+ #end
+
+ end
+ end
+ end
+ end
@@ -0,0 +1,101 @@
+ module Statsample
+ module TimeSeries
+ module Arima
+ module KF
+ class LogLikelihood
+
+ #Gives the log likelihood value of an ARMA(p, q) process for the given parameters
+ attr_reader :log_likelihood
+
+ #Gives the estimated innovation variance (sigma^2) of an ARMA(p, q) process for the given parameters
+ attr_reader :sigma
+
+ #Gives the AIC (Akaike Information Criterion)
+ #https://www.scss.tcd.ie/Rozenn.Dahyot/ST7005/13AICBIC.pdf
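+ #Computed as -2 * log_likelihood + 2 * (p + q + 1)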
+ attr_reader :aic
+
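+ #==Usage (illustrative; values depend on the series)
+ #  series = (1..100).map { rand }.to_ts
+ #  ll = Statsample::TimeSeries::Arima::KF::LogLikelihood.new([0.2, 0.1], series, 1, 1)
+ #  ll.log_likelihood #=> concentrated log likelihood
+ #  ll.sigma          #=> estimated innovation variance
+ #  ll.aic            #=> Akaike Information Criterion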
+ def initialize(params, timeseries, p, q)
+ @params = params
+ @timeseries = timeseries
+ @p = p
+ @q = q
+ ll
+ end
+
+ #===Log likelihood link function
+ #Iteratively minimized by the simplex algorithm via KalmanFilter#ks.
+ #Not meant to be used directly; it will be made private later.
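+ #The returned value is the concentrated Gaussian log likelihood built from the
+ #one-step prediction errors v_t and their scaled variances f_t (see the recursion below):
+ #  sigma_2 = (1/n) * sum(v_t**2 / f_t)
+ #  ll      = -0.5 * (n*log(2*PI) + n*log(sigma_2) + sum(log(f_t)) + n)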
+ def ll
+ params, timeseries = @params, @timeseries
+ p, q = @p, @q
+
+ phi = []
+ theta = []
+ phi = params[0...p] if p > 0
+ theta = params[p...(p + q)] if q > 0
+
+ #guard against explosive parameter values during minimization
+ [phi, theta].each do |v|
+ if v.size > 0 and v.map(&:abs).inject(:+) > 1
+ return
+ end
+ end
+
+ m = [p, q].max
+ h = Matrix.column_vector(Array.new(m, 0))
+ m.times do |i|
+ h[i,0] = phi[i] if i < p
+ h[i,0] = h[i,0] + theta[i] if i < q
+ end
+
+ t = Matrix.zero(m)
+ #set_column is available in utility.rb
+ t = t.set_column(0, phi)
+ if(m > 1)
+ t[0...(m-1), 1...m] = Matrix.I(m-1)
+ #chances of extra constant 0 values as unbalanced columns, so:
+ t = Matrix.columns(t.column_vectors)
+ end
+
+ g = Matrix[[1]]
+ a_t = Matrix.column_vector(Array.new(m, 0))
+ n = timeseries.size
+ z = Matrix.row_vector(Array.new(m, 0))
+ z[0,0] = 1
+ p_t = Matrix.I(m)
+ v_t, f_t = Array.new(n, 0), Array.new(n, 0)
+
+ n.times do |i|
+ #one-step prediction error and its scaled variance
+ v_t[i] = (z * a_t).map { |x| timeseries[i] - x }[0,0]
+
+ f_t[i] = (z * p_t * (z.transpose)).map { |x| x + 1 }[0,0]
+
+ #Kalman gain and state/covariance updates
+ k_t = ((t * p_t * z.transpose) + h).map { |x| x / f_t[i] }
+
+ a_t = (t * a_t) + (k_t * v_t[i])
+ l_t = t - k_t * z
+ j_t = h - k_t
+
+ p_t = (t * p_t * (l_t.transpose)) + (h * (j_t.transpose))
+ end
+
+ pot = v_t.map(&:square).zip(f_t).map { |x, y| x / y }.inject(:+)
+ sigma_2 = pot.to_f / n.to_f
+
+ f_t_log_sum = f_t.map { |x| Math.log(x) }.inject(:+)
+ @log_likelihood = -0.5 * (n*Math.log(2*Math::PI) + n*Math.log(sigma_2) + f_t_log_sum + n)
+
+ @sigma = sigma_2
+ @aic = -(2 * @log_likelihood - 2*(p+q+1))
+ #puts ("ll = #{-ll}")
+ return @log_likelihood
+ end
+
+ def to_s
+ sprintf("LogLikelihood(p = %d, q = %d) on params: [%s]",
+ @p, @q, @params.join(', '))
+ end
+ end
+ end
+ end
+ end
+ end
@@ -0,0 +1,291 @@
+ require 'statsample-timeseries/timeseries/pacf'
+ module Statsample::TimeSeriesShorthands
+ # Creates a new Statsample::TimeSeries::Series object.
+ # Extra arguments are passed on to Series.new.
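+ # Example:
+ #   [1.2, 0.7, 3.1].to_ts # same as calling to_time_series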
+ def to_time_series(*args)
+ Statsample::TimeSeries::Series.new(self, :scale, *args)
+ end
+
+ alias :to_ts :to_time_series
+ end
+
+ class Array
+ include Statsample::TimeSeriesShorthands
+ end
+
+ module Statsample
+ module TimeSeries
+ # Collection of data indexed by time.
+ # The order goes from earliest to latest.
+ class Series < Statsample::Vector
+ include Statsample::TimeSeries::Pacf
+ # Calculates the autocorrelation coefficients of the series.
+ #
+ # The first element is always 1, since that is the correlation
+ # of the series with itself.
+ #
+ # Usage:
+ #
+ # ts = (1..100).map { rand }.to_time_series
+ #
+ # ts.acf # => array with first 21 autocorrelations
+ # ts.acf 3 # => array with first 3 autocorrelations
+ #
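+ # Each lag-k coefficient is computed as
+ #   sum((x_t - mean) * (x_{t-k} - mean)) / ((n - 1) * sample_variance)
+ # which scales the lag-k autocovariance by the overall sum of squared deviations.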
+ def acf(max_lags = nil)
+ max_lags ||= (10 * Math.log10(size)).to_i
+
+ (0..max_lags).map do |i|
+ if i == 0
+ 1.0
+ else
+ m = self.mean
+
+ # can't use the Pearson coefficient since the mean for the lagged series should
+ # be the same as for the regular series
+ ((self - m) * (self.lag(i) - m)).sum / self.variance_sample / (self.size - 1)
+ end
+ end
+ end
+
+ #=Partial Autocorrelation
+ #Generates the partial autocorrelation series for a timeseries
+ #==Parameters
+ #* *max_lags*: integer, optional - number of lags
+ #* *method*: symbol or string. Default: :yw.
+ # * *yw*: for the Yule-Walker unbiased approach
+ # * *mle*: for the maximum likelihood approach
+ # * *ld*: for the Levinson-Durbin recursive approach
+ #==Returns
+ # array of pacf
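+ #==Usage (illustrative)
+ #  ts = (1..100).map { rand }.to_ts
+ #  ts.pacf           # partial autocorrelations, Yule-Walker default
+ #  ts.pacf(10, :mle) # pacf up to lag 10, maximum likelihood variant
+ #  ts.pacf(10, :ld)  # pacf up to lag 10, Levinson-Durbin recursion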
+ def pacf(max_lags = nil, method = :yw)
+ method = method.downcase.to_sym
+ max_lags ||= (10 * Math.log10(size)).to_i
+ if method.eql? :yw or method.eql? :mle
+ Pacf::Pacf.pacf_yw(self, max_lags, method.to_s)
+ elsif method == :ld
+ series = self.acvf
+ Pacf::Pacf.levinson_durbin(series, max_lags, true)[2]
+ else
+ raise "Available methods for pacf are 'yw', 'mle' and 'ld'"
+ end
+ end
+
+ #=Autoregressive estimation
+ #Generates an AR(k) series for the calling timeseries by Yule-Walker.
+ #==Parameters
+ #* *n*: integer, (default = 1500) number of observations for AR.
+ #* *k*: integer, (default = 1) order of the AR process.
+ #==Returns
+ #Array constituting the estimated AR series.
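+ #==Usage (illustrative)
+ #  ts = (1..500).map { rand }.to_ts
+ #  ts.ar(ts.size, 2) # AR(2) estimate via Yule-Walker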
+ def ar(n = 1500, k = 1)
+ series = Statsample::TimeSeries.arima
+ #series = Statsample::TimeSeries::ARIMA.new
+ series.yule_walker(self, n, k)
+ end
+
+ #=AutoCovariance
+ #Provides the autocovariance of a timeseries.
+ #==Parameters
+ #* *demean* = true; optional. Supply false if the series is not to be demeaned
+ #* *unbiased* = true; optional. true/false for the unbiased/biased form of autocovariance
+ #==Returns
+ # Array of autocovariances
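+ #==Usage (illustrative)
+ #  ts = (1..100).map { rand }.to_ts
+ #  ts.acvf               # autocovariances of the demeaned series, unbiased form
+ #  ts.acvf(false, false) # biased form, without demeaning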
+ def acvf(demean = true, unbiased = true)
+ #TODO: change parameters list in opts.merge as suggested by John
+ #functionality: computes autocovariance of timeseries data
+ #returns: array of autocovariances
+
+ if demean
+ demeaned_series = self - self.mean
+ else
+ demeaned_series = self
+ end
+ n = (10 * Math.log10(size)).to_i + 1
+ m = self.mean
+ if unbiased
+ d = Array.new(self.size, self.size)
+ else
+ d = ((1..self.size).to_a.reverse)[0..n]
+ end
+
+ 0.upto(n - 1).map do |i|
+ (demeaned_series * (self.lag(i) - m)).sum / d[i]
+ end
+ end
+
+ #=Correlation
+ #Gives the cross-correlation of two timeseries.
+ def correlate(a, v, mode = 'full')
+ #performs cross-correlation of two series
+ #multiarray.correlate2(a, v, 'full')
+ if a.size < v.size
+ raise("Size of a must be at least the size of v!")
+ end
+ ps = a.size + v.size - 1
+ a_padded = Array.new(ps, 0)
+ a_padded[0...a.size] = a
+
+ out = (mode.downcase.eql? 'full') ? Array.new(ps) : Array.new(a.size)
+ #ongoing
+ end
+ # Lags the series by k periods.
+ #
+ # The convention is to set the oldest observations (the first ones
+ # in the series) to nil so that the size of the lagged series is the
+ # same as the original.
+ #
+ # Usage:
+ #
+ # ts = (1..10).map { rand }.to_time_series
+ # # => [0.69, 0.23, 0.44, 0.71, ...]
+ #
+ # ts.lag # => [nil, 0.69, 0.23, 0.44, ...]
+ # ts.lag 2 # => [nil, nil, 0.69, 0.23, ...]
+ #
+ def lag(k = 1)
+ return self if k == 0
+
+ dup.tap do |lagged|
+ (lagged.size - 1).downto k do |i|
+ lagged[i] = lagged[i - k]
+ end
+
+ (0...k).each do |i|
+ lagged[i] = nil
+ end
+ lagged.set_valid_data
+ end
+ end
+
+ #=Diff
+ # Performs differencing of the series.
+ # Note: the first difference of the series is X(t) - X(t-1),
+ # but the second difference is NOT X(t) - X(t-2);
+ # it is the first difference of the first difference:
+ # => (X(t) - X(t-1)) - (X(t-1) - X(t-2))
+ #==Params
+ #* *max_lags*: integer, (default: 1), number of differences required.
+ #==Usage
+ #
+ # ts = (1..10).map { rand }.to_ts
+ # # => [0.69, 0.23, 0.44, 0.71, ...]
+ #
+ # ts.diff # => [nil, -0.46, 0.21, 0.27, ...]
+ #==Returns
+ # Timeseries object
+ def diff(max_lags = 1)
+ ts = self
+ difference = []
+ max_lags.times do
+ difference = ts - ts.lag
+ ts = difference
+ end
+ difference
+ end
+
+ #=Moving Average
+ # Calculates the moving average of the series using the provided
+ # lookback argument. The lookback defaults to 10 periods.
+ #==Parameters
+ #* *n*: integer, (default = 10) - lookback argument
+ #
+ #==Usage
+ #
+ # ts = (1..100).map { rand }.to_ts
+ # # => [0.69, 0.23, 0.44, 0.71, ...]
+ #
+ # # first 9 observations are nil
+ # ts.ma # => [ ... nil, 0.484... , 0.445... , 0.513 ... , ... ]
+ #
+ #==Returns
+ #Resulting moving average timeseries object
+ def ma(n = 10)
+ return mean if n >= size
+
+ ([nil] * (n - 1) + (0..(size - n)).map do |i|
+ self[i...(i + n)].inject(&:+) / n
+ end).to_time_series
+ end
+
+ #=Exponential Moving Average
+ # Calculates an exponential moving average of the series using a
+ # specified parameter. If wilder is false (the default) then the EMA
+ # uses a smoothing value of 2 / (n + 1); if it is true then it uses the
+ # Welles Wilder smoother of 1 / n.
+ #
+ # Warning for EMA usage: EMAs are unstable for small series, as they
+ # use a lot more than n observations to calculate. The series is stable
+ # if the size of the series is >= 3.45 * (n + 1).
+ #
+ #==Parameters
+ #* *n*: integer, (default = 10)
+ #* *wilder*: boolean, (default = false), if true, the 1/n value is used for smoothing; if false, 2/(n+1) is used
+ #
+ #==Usage
+ # ts = (1..100).map { rand }.to_ts
+ # # => [0.69, 0.23, 0.44, 0.71, ...]
+ #
+ # # first 9 observations are nil
+ # ts.ema # => [ ... nil, 0.509... , 0.433..., ... ]
+ #
+ #==Returns
+ #EMA timeseries
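+ #
+ # For the Welles Wilder variant (illustrative):
+ # ts.ema(10, true) # uses the 1/n smoothing factor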
+ def ema(n = 10, wilder = false)
+ smoother = wilder ? 1.0 / n : 2.0 / (n + 1)
+
+ # need to start everything from the first non-nil observation
+ start = self.data.index { |i| i != nil }
+
+ # first n - 1 observations are nil
+ base = [nil] * (start + n - 1)
+
+ # nth observation is just a moving average
+ base << self[start...(start + n)].inject(0.0) { |s, a| a.nil? ? s : s + a } / n
+
+ (start + n).upto size - 1 do |i|
+ base << self[i] * smoother + (1 - smoother) * base.last
+ end
+
+ base.to_time_series
+ end
+
+ #=Moving Average Convergence-Divergence
+ # Calculates the MACD (moving average convergence-divergence) of the time
+ # series - this is a comparison of a fast EMA with a slow EMA.
+ #
+ #==Parameters
+ #* *fast*: integer, (default = 12) - fast component of MACD
+ #* *slow*: integer, (default = 26) - slow component of MACD
+ #* *signal*: integer, (default = 9) - signal component of MACD
+ #
+ #==Usage
+ # ts = (1..100).map { rand }.to_ts
+ # # => [0.69, 0.23, 0.44, 0.71, ...]
+ # ts.macd(13)
+ #
+ #==Returns
+ # Array of two timeseries: the MACD series (fast EMA minus slow EMA) and its signal line (EMA of the MACD series)
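+ #
+ # For example (illustrative):
+ # macd_series, signal = ts.macd
+ # # macd_series is ema(12) - ema(26); signal is its 9-period EMA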
+ def macd(fast = 12, slow = 26, signal = 9)
+ series = ema(fast) - ema(slow)
+ [series, series.ema(signal)]
+ end
+
+ # Borrow the operations from Vector, but convert to time series
+ def + series
+ super.to_a.to_ts
+ end
+
+ def - series
+ super.to_a.to_ts
+ end
+
+ def to_s
+ sprintf("Time Series(type:%s, n:%d)[%s]", @type.to_s, @data.size,
+ @data.collect{|d| d.nil? ? "nil":d}.join(","))
+ end
+ end
+ end
+ end