RubyGems - prophet-rb - Versions diffs - 0.1.0 - Mend

prophet-rb 0.1.0

Files changed (17) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/LICENSE.txt +23 -0
data/README.md +202 -0
data/data-raw/generated_holidays.csv +96474 -0
data/ext/prophet/Makefile +5 -0
data/ext/prophet/extconf.rb +18 -0
data/lib/prophet-rb.rb +1 -0
data/lib/prophet.rb +23 -0
data/lib/prophet/forecaster.rb +986 -0
data/lib/prophet/holidays.rb +27 -0
data/lib/prophet/plot.rb +269 -0
data/lib/prophet/stan_backend.rb +136 -0
data/lib/prophet/version.rb +3 -0
data/stan/unix/prophet.stan +131 -0
data/stan/win/prophet.stan +162 -0
metadata +170 -0

@@ -0,0 +1,27 @@
+module Prophet
+  module Holidays
+    def get_holiday_names(country)
+      years = (1995..2045).to_a
+      make_holidays_df(years, country)["holiday"].uniq
+    end
+    def make_holidays_df(year_list, country)
+      holidays_df.where(holidays_df["country"].eq(country) & holidays_df["year"].in(year_list))["ds", "holiday"]
+    end
+    # TODO marshal on installation
+    def holidays_df
+      @holidays_df ||= begin
+        holidays = {"ds" => [], "holiday" => [], "country" => [], "year" => []}
+        holidays_file = File.expand_path("../../data-raw/generated_holidays.csv", __dir__)
+        CSV.foreach(holidays_file, headers: true, converters: [:date, :numeric]) do |row|
+          holidays["ds"] << row["ds"]
+          holidays["holiday"] << row["holiday"]
+          holidays["country"] << row["country"]
+          holidays["year"] << row["year"]
+        end
+        Daru::DataFrame.new(holidays)
+      end
+    end
+  end
+end

data/lib/prophet/plot.rb ADDED

@@ -0,0 +1,269 @@
+module Prophet
+  module Plot
+    def plot(fcst, ax: nil, uncertainty: true, plot_cap: true, xlabel: "ds", ylabel: "y", figsize: [10, 6])
+      if ax.nil?
+        fig = plt.figure(facecolor: "w", figsize: figsize)
+        ax = fig.add_subplot(111)
+      else
+        fig = ax.get_figure
+      end
+      fcst_t = to_pydatetime(fcst["ds"])
+      ax.plot(to_pydatetime(@history["ds"]), @history["y"].map(&:to_f), "k.")
+      ax.plot(fcst_t, fcst["yhat"].map(&:to_f), ls: "-", c: "#0072B2")
+      if fcst.vectors.include?("cap") && plot_cap
+        ax.plot(fcst_t, fcst["cap"].map(&:to_f), ls: "--", c: "k")
+      end
+      if @logistic_floor && fcst.vectors.include?("floor") && plot_cap
+        ax.plot(fcst_t, fcst["floor"].map(&:to_f), ls: "--", c: "k")
+      end
+      if uncertainty && @uncertainty_samples
+        ax.fill_between(fcst_t, fcst["yhat_lower"].map(&:to_f), fcst["yhat_upper"].map(&:to_f), color: "#0072B2", alpha: 0.2)
+      end
+      # Specify formatting to workaround matplotlib issue #12925
+      locator = dates.AutoDateLocator.new(interval_multiples: false)
+      formatter = dates.AutoDateFormatter.new(locator)
+      ax.xaxis.set_major_locator(locator)
+      ax.xaxis.set_major_formatter(formatter)
+      ax.grid(true, which: "major", c: "gray", ls: "-", lw: 1, alpha: 0.2)
+      ax.set_xlabel(xlabel)
+      ax.set_ylabel(ylabel)
+      fig.tight_layout
+      fig
+    end
+    def plot_components(fcst, uncertainty: true, plot_cap: true, weekly_start: 0, yearly_start: 0, figsize: nil)
+      components = ["trend"]
+      if @train_holiday_names && fcst.vectors.include?("holidays")
+        components << "holidays"
+      end
+      # Plot weekly seasonality, if present
+      if @seasonalities["weekly"] && fcst.vectors.include?("weekly")
+        components << "weekly"
+      end
+      # Yearly if present
+      if @seasonalities["yearly"] && fcst.vectors.include?("yearly")
+        components << "yearly"
+      end
+      # Other seasonalities
+      components.concat(@seasonalities.keys.select { |name| fcst.vectors.include?(name) && !["weekly", "yearly"].include?(name) }.sort)
+      regressors = {"additive" => false, "multiplicative" => false}
+      @extra_regressors.each do |name, props|
+        regressors[props[:mode]] = true
+      end
+      ["additive", "multiplicative"].each do |mode|
+        if regressors[mode] && fcst.vectors.include?("extra_regressors_#{mode}")
+          components << "extra_regressors_#{mode}"
+        end
+      end
+      npanel = components.size
+      figsize = figsize || [9, 3 * npanel]
+      fig, axes = plt.subplots(npanel, 1, facecolor: "w", figsize: figsize)
+      if npanel == 1
+        axes = [axes]
+      end
+      multiplicative_axes = []
+      axes.tolist.zip(components) do |ax, plot_name|
+        if plot_name == "trend"
+          plot_forecast_component(fcst, "trend", ax: ax, uncertainty: uncertainty, plot_cap: plot_cap)
+        elsif @seasonalities[plot_name]
+          if plot_name == "weekly" || @seasonalities[plot_name][:period] == 7
+            plot_weekly(name: plot_name, ax: ax, uncertainty: uncertainty, weekly_start: weekly_start)
+          elsif plot_name == "yearly" || @seasonalities[plot_name][:period] == 365.25
+            plot_yearly(name: plot_name, ax: ax, uncertainty: uncertainty, yearly_start: yearly_start)
+          else
+            plot_seasonality(name: plot_name, ax: ax, uncertainty: uncertainty)
+          end
+        elsif ["holidays", "extra_regressors_additive", "extra_regressors_multiplicative"].include?(plot_name)
+          plot_forecast_component(fcst, plot_name, ax: ax, uncertainty: uncertainty, plot_cap: false)
+        end
+        if @component_modes["multiplicative"].include?(plot_name)
+          multiplicative_axes << ax
+        end
+      end
+      fig.tight_layout
+      # Reset multiplicative axes labels after tight_layout adjustment
+      multiplicative_axes.each do |ax|
+        ax = set_y_as_percent(ax)
+      end
+      fig
+    end
+    private
+    # Plots one forecast column (name) against "ds" and returns the collected
+    # matplotlib artists.
+    #
+    # fcst        - forecast data frame; reads "ds", name and, when drawn,
+    #               "cap", "floor", "<name>_lower" and "<name>_upper".
+    # name        - column to plot; also used as the y-axis label.
+    # ax          - axes to draw on; a new figure/axes is created when falsy.
+    # uncertainty - draw the lower/upper band when @uncertainty_samples is set.
+    # plot_cap    - draw the "cap" line (and "floor" when @logistic_floor) if present.
+    # figsize     - size of the figure created when no axes is given.
+    def plot_forecast_component(fcst, name, ax: nil, uncertainty: true, plot_cap: false, figsize: [10, 6])
+      artists = []
+      if !ax
+        fig = plt.figure(facecolor: "w", figsize: figsize)
+        ax = fig.add_subplot(111)
+      end
+      fcst_t = to_pydatetime(fcst["ds"])
+      artists += ax.plot(fcst_t, fcst[name].map(&:to_f), ls: "-", c: "#0072B2")
+      if fcst.vectors.include?("cap") && plot_cap
+        artists += ax.plot(fcst_t, fcst["cap"].map(&:to_f), ls: "--", c: "k")
+      end
+      # NOTE(review): unlike the cap line, the floor line is drawn but not
+      # appended to artists.
+      if @logistic_floor && fcst.vectors.include?("floor") && plot_cap
+        ax.plot(fcst_t, fcst["floor"].map(&:to_f), ls: "--", c: "k")
+      end
+      if uncertainty && @uncertainty_samples
+        artists += [ax.fill_between(fcst_t, fcst[name + "_lower"].map(&:to_f), fcst[name + "_upper"].map(&:to_f), color: "#0072B2", alpha: 0.2)]
+      end
+      # Specify formatting to workaround matplotlib issue #12925
+      locator = dates.AutoDateLocator.new(interval_multiples: false)
+      formatter = dates.AutoDateFormatter.new(locator)
+      ax.xaxis.set_major_locator(locator)
+      ax.xaxis.set_major_formatter(formatter)
+      ax.grid(true, which: "major", c: "gray", ls: "-", lw: 1, alpha: 0.2)
+      ax.set_xlabel("ds")
+      ax.set_ylabel(name)
+      # Components registered as multiplicative get percent tick labels.
+      if @component_modes["multiplicative"].include?(name)
+        ax = set_y_as_percent(ax)
+      end
+      artists
+    end
+    def seasonality_plot_df(ds)
+      df_dict = {"ds" => ds, "cap" => [1.0] * ds.size, "floor" => [0.0] * ds.size}
+      @extra_regressors.each do |name|
+        df_dict[name] = [0.0] * ds.size
+      end
+      # Activate all conditional seasonality columns
+      @seasonalities.values.each do |props|
+        if props[:condition_name]
+          df_dict[props[:condition_name]] = [true] * ds.size
+        end
+      end
+      df = Daru::DataFrame.new(df_dict)
+      df = setup_dataframe(df)
+      df
+    end
+    def plot_weekly(ax: nil, uncertainty: true, weekly_start: 0, figsize: [10, 6], name: "weekly")
+      artists = []
+      if !ax
+        fig = plt.figure(facecolor: "w", figsize: figsize)
+        ax = fig.add_subplot(111)
+      end
+      # Compute weekly seasonality for a Sun-Sat sequence of dates.
+      start = Date.parse("2017-01-01")
+      days = 7.times.map { |i| start + i + weekly_start }
+      df_w = seasonality_plot_df(days)
+      seas = predict_seasonal_components(df_w)
+      days = days.map { |v| v.strftime("%A") }
+      artists += ax.plot(days.size.times.to_a, seas[name].map(&:to_f), ls: "-", c: "#0072B2")
+      if uncertainty && @uncertainty_samples
+        artists += [ax.fill_between(days.size.times.to_a, seas[name + "_lower"].map(&:to_f), seas[name + "_upper"].map(&:to_f), color: "#0072B2", alpha: 0.2)]
+      end
+      ax.grid(true, which: "major", c: "gray", ls: "-", lw: 1, alpha: 0.2)
+      ax.set_xticks(days.size.times.to_a)
+      ax.set_xticklabels(days)
+      ax.set_xlabel("Day of week")
+      ax.set_ylabel(name)
+      if @seasonalities[name]["mode"] == "multiplicative"
+        ax = set_y_as_percent(ax)
+      end
+      artists
+    end
+    # Plots the seasonality +name+ over 365 consecutive days and returns the
+    # matplotlib artists that were drawn.
+    #
+    # ax           - axes to draw on; a new figure/axes is created when falsy.
+    # uncertainty  - draw the lower/upper band when @uncertainty_samples is set.
+    # yearly_start - day offset added to the 2017-01-01 start date.
+    # figsize      - size of the figure created when no axes is given.
+    # name         - seasonality column to plot; also the y-axis label.
+    def plot_yearly(ax: nil, uncertainty: true, yearly_start: 0, figsize: [10, 6], name: "yearly")
+      artists = []
+      if !ax
+        fig = plt.figure(facecolor: "w", figsize: figsize)
+        ax = fig.add_subplot(111)
+      end
+      # Compute yearly seasonality for a Jan 1 - Dec 31 sequence of dates.
+      start = Date.parse("2017-01-01")
+      days = 365.times.map { |i| start + i + yearly_start }
+      df_y = seasonality_plot_df(days)
+      seas = predict_seasonal_components(df_y)
+      artists += ax.plot(to_pydatetime(df_y["ds"]), seas[name].map(&:to_f), ls: "-", c: "#0072B2")
+      if uncertainty && @uncertainty_samples
+        artists += [ax.fill_between(to_pydatetime(df_y["ds"]), seas[name + "_lower"].map(&:to_f), seas[name + "_upper"].map(&:to_f), color: "#0072B2", alpha: 0.2)]
+      end
+      ax.grid(true, which: "major", c: "gray", ls: "-", lw: 1, alpha: 0.2)
+      # Month-based major ticks, labelled as "<month name> <day>".
+      months = dates.MonthLocator.new((1..12).to_a, bymonthday: 1, interval: 2)
+      ax.xaxis.set_major_formatter(ticker.FuncFormatter.new(lambda { |x, pos=nil| dates.num2date(x).strftime("%B %-e") }))
+      ax.xaxis.set_major_locator(months)
+      ax.set_xlabel("Day of year")
+      ax.set_ylabel(name)
+      if @seasonalities[name][:mode] == "multiplicative"
+        ax = set_y_as_percent(ax)
+      end
+      artists
+    end
+    # Plots the seasonality +name+ over a single period starting at
+    # 2017-01-01 UTC and returns the matplotlib artists that were drawn.
+    #
+    # name        - seasonality to plot; its :period and :mode are read from
+    #               @seasonalities, and it is also the y-axis label.
+    # ax          - axes to draw on; a new figure/axes is created when falsy.
+    # uncertainty - draw the lower/upper band when @uncertainty_samples is set.
+    # figsize     - size of the figure created when no axes is given.
+    def plot_seasonality(name:, ax: nil, uncertainty: true, figsize: [10, 6])
+      artists = []
+      if !ax
+        fig = plt.figure(facecolor: "w", figsize: figsize)
+        ax = fig.add_subplot(111)
+      end
+      # Compute seasonality from Jan 1 through a single period.
+      start = Time.utc(2017)
+      period = @seasonalities[name][:period]
+      # period is multiplied by 86400 seconds, i.e. it is treated as days.
+      finish = start + period * 86400
+      plot_points = 200
+      start = start.to_i
+      finish = finish.to_i
+      # Evenly spaced sample times (in epoch seconds) covering start..finish.
+      step = (finish - start) / (plot_points - 1).to_f
+      days = plot_points.times.map { |i| Time.at(start + i * step).utc }
+      df_y = seasonality_plot_df(days)
+      seas = predict_seasonal_components(df_y)
+      artists += ax.plot(to_pydatetime(df_y["ds"]), seas[name].map(&:to_f), ls: "-", c: "#0072B2")
+      if uncertainty && @uncertainty_samples
+        artists += [ax.fill_between(to_pydatetime(df_y["ds"]), seas[name + "_lower"].map(&:to_f), seas[name + "_upper"].map(&:to_f), color: "#0072B2", alpha: 0.2)]
+      end
+      ax.grid(true, which: "major", c: "gray", ls: "-", lw: 1, alpha: 0.2)
+      # Seven evenly spaced ticks across the same start..finish range.
+      step = (finish - start) / (7 - 1).to_f
+      xticks = to_pydatetime(7.times.map { |i| Time.at(start + i * step).utc })
+      ax.set_xticks(xticks)
+      # Pick the tick label format from the period length: time only for
+      # periods up to 2, date plus time below 14, date only otherwise.
+      if period <= 2
+        fmt_str = "%T"
+      elsif period < 14
+        fmt_str = "%m/%d %R"
+      else
+        fmt_str = "%m/%d"
+      end
+      ax.xaxis.set_major_formatter(ticker.FuncFormatter.new(lambda { |x, pos=nil| dates.num2date(x).strftime(fmt_str) }))
+      ax.set_xlabel("ds")
+      ax.set_ylabel(name)
+      if @seasonalities[name][:mode] == "multiplicative"
+        ax = set_y_as_percent(ax)
+      end
+      artists
+    end
+    def set_y_as_percent(ax)
+      yticks = 100 * ax.get_yticks
+      yticklabels = yticks.tolist.map { |y| "%.4g%%" % y }
+      ax.set_yticklabels(yticklabels)
+      ax
+    end
+    # Lazily requires matplotlib/pyplot and returns Matplotlib::Pyplot.
+    # Raises Error with an install hint when the require fails with LoadError.
+    def plt
+      begin
+        require "matplotlib/pyplot"
+      rescue LoadError
+        raise Error, "Install the matplotlib gem for plots"
+      end
+      Matplotlib::Pyplot
+    end
+    # Returns the Python matplotlib.dates module imported through PyCall.
+    def dates
+      PyCall.import_module("matplotlib.dates")
+    end
+    # Returns the Python matplotlib.ticker module imported through PyCall.
+    def ticker
+      PyCall.import_module("matplotlib.ticker")
+    end
+    def to_pydatetime(v)
+      datetime = PyCall.import_module("datetime")
+      v.map { |v| datetime.datetime.utcfromtimestamp(v.to_i) }
+    end
+  end
+end

data/lib/prophet/stan_backend.rb ADDED

@@ -0,0 +1,136 @@
+module Prophet
+  class StanBackend
+    # Loads the compiled model and stores +logger+, which fit uses to warn
+    # when it falls back to the Newton algorithm.
+    def initialize(logger)
+      @model = load_model
+      @logger = logger
+    end
+    # Returns a CmdStan::Model wrapping the executable at
+    # stan_model/prophet_model.bin, resolved two directories above this file.
+    # NOTE(review): presumably produced at install time by ext/prophet — confirm.
+    def load_model
+      model_file = File.expand_path("../../stan_model/prophet_model.bin", __dir__)
+      CmdStan::Model.new(exe_file: model_file)
+    end
+    def fit(stan_init, stan_data, **kwargs)
+      stan_init, stan_data = prepare_data(stan_init, stan_data)
+      kwargs[:algorithm] ||= stan_data["T"] < 100 ? "Newton" : "LBFGS"
+      iterations = 10000
+      stan_fit = nil
+      begin
+        stan_fit = @model.optimize(
+          data: stan_data,
+          inits: stan_init,
+          iter: iterations,
+          **kwargs
+        )
+      rescue => e
+        if kwargs[:algorithm] != "Newton"
+          @logger.warn "Optimization terminated abnormally. Falling back to Newton."
+          kwargs[:algorithm] = "Newton"
+          stan_fit = @model.optimize(
+            data: stan_data,
+            inits: stan_init,
+            iter: iterations,
+            **kwargs
+          )
+        else
+          raise e
+        end
+      end
+      params = stan_to_numo(stan_fit.column_names, Numo::NArray.asarray(stan_fit.optimized_params.values))
+      params.each_key do |par|
+        params[par] = params[par].reshape(1, *params[par].shape)
+      end
+      params
+    end
+    # Runs the model's sampler and returns a hash of parameter name => Numo
+    # array of draws.
+    #
+    # stan_init / stan_data - converted to plain arrays by prepare_data.
+    # samples               - passed as sampling_iters; warmup_iters defaults
+    #                         to samples / 2 and chains defaults to 4.
+    # kwargs                - forwarded to @model.sample.
+    def sampling(stan_init, stan_data, samples, **kwargs)
+      stan_init, stan_data = prepare_data(stan_init, stan_data)
+      kwargs[:chains] ||= 4
+      kwargs[:warmup_iters] ||= samples / 2
+      stan_fit = @model.sample(
+        data: stan_data,
+        inits: stan_init,
+        sampling_iters: samples,
+        **kwargs
+      )
+      res = Numo::NArray.asarray(stan_fit.sample)
+      # The 3-D result is flattened to 2-D by merging its first two axes.
+      # Note that this reassigns the samples argument.
+      # NOTE(review): presumably the axes are (draws, chains, columns) — confirm.
+      samples, c, columns = res.shape
+      res = res.reshape(samples * c, columns)
+      params = stan_to_numo(stan_fit.column_names, res)
+      params.each_key do |par|
+        s = params[par].shape
+        # Single-column parameters are collapsed to one dimension.
+        if s[1] == 1
+          params[par] = params[par].reshape(s[0])
+        end
+        # Checked against the shape captured before the collapse above.
+        if ["delta", "beta"].include?(par) && s.size < 2
+          params[par] = params[par].reshape(-1, 1)
+        end
+      end
+      params
+    end
+    private
+    # Splits +data+ into a hash of parameter name => Numo array by grouping
+    # consecutive column names that share the same prefix before the first ".".
+    #
+    # column_names - column names; the part before "." is the parameter name.
+    # data         - Numo array; the columns are its last axis when it has more
+    #                than one dimension, otherwise its only axis.
+    #
+    # Raises Error when a parameter name appears in two separate runs.
+    def stan_to_numo(column_names, data)
+      output = {}
+      prev = nil
+      # start...finish is the column range of the run currently being scanned.
+      start = 0
+      finish = 0
+      two_dims = data.shape.size > 1
+      column_names.each do |cname|
+        parsed = cname.split(".")
+        curr = parsed[0]
+        prev = curr if prev.nil?
+        if curr != prev
+          # A new parameter begins: store the finished run under prev.
+          raise Error, "Found repeated column name" if output[prev]
+          if two_dims
+            output[prev] = Numo::NArray.asarray(data[true, start...finish])
+          else
+            output[prev] = Numo::NArray.asarray(data[start...finish])
+          end
+          prev = curr
+          start = finish
+          finish += 1
+        else
+          finish += 1
+        end
+      end
+      # Store the last run, which the loop never closes.
+      raise Error, "Found repeated column name" if output[prev]
+      if two_dims
+        output[prev] = Numo::NArray.asarray(data[true, start...finish])
+      else
+        output[prev] = Numo::NArray.asarray(data[start...finish])
+      end
+      output
+    end
+    def prepare_data(stan_init, stan_data)
+      stan_data["y"] = stan_data["y"].to_a
+      stan_data["t"] = stan_data["t"].to_a
+      stan_data["cap"] = stan_data["cap"].to_a
+      stan_data["t_change"] = stan_data["t_change"].to_a
+      stan_data["s_a"] = stan_data["s_a"].to_a
+      stan_data["s_m"] = stan_data["s_m"].to_a
+      stan_data["X"] = stan_data["X"].to_matrix.to_a
+      stan_init["delta"] = stan_init["delta"].to_a
+      stan_init["beta"] = stan_init["beta"].to_a
+      [stan_init, stan_data]
+    end
+  end
+end

data/lib/prophet/version.rb ADDED

@@ -0,0 +1,3 @@
+module Prophet
+  # Gem version string.
+  VERSION = "0.1.0"
+end

data/stan/unix/prophet.stan ADDED

@@ -0,0 +1,131 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+functions {
+  // Builds the T x S changepoint indicator matrix A. Row i has a 1 in every
+  // column j whose changepoint has been reached (t[i] >= t_change[j]) and 0
+  // elsewhere; this relies on both inputs being sorted.
+  matrix get_changepoint_matrix(vector t, vector t_change, int T, int S) {
+    // Assumes t and t_change are sorted.
+    matrix[T, S] A;
+    row_vector[S] a_row;
+    int cp_idx;
+    // Start with an empty matrix.
+    A = rep_matrix(0, T, S);
+    a_row = rep_row_vector(0, S);
+    cp_idx = 1;
+    // Fill in each row of A.
+    // a_row and cp_idx carry over between rows, so indicators only accumulate.
+    for (i in 1:T) {
+      while ((cp_idx <= S) && (t[i] >= t_change[cp_idx])) {
+        a_row[cp_idx] = 1;
+        cp_idx = cp_idx + 1;
+      }
+      A[i] = a_row;
+    }
+    return A;
+  }
+  // Logistic trend functions
+  // Returns the S offset adjustments gamma, one per changepoint, computed
+  // from the base rate k, base offset m and the rate adjustments delta.
+  vector logistic_gamma(real k, real m, vector delta, vector t_change, int S) {
+    vector[S] gamma;  // adjusted offsets, for piecewise continuity
+    vector[S + 1] k_s;  // actual rate in each segment
+    real m_pr;
+    // Compute the rate in each segment
+    k_s = append_row(k, k + cumulative_sum(delta));
+    // Piecewise offsets
+    m_pr = m; // The offset in the previous segment
+    for (i in 1:S) {
+      gamma[i] = (t_change[i] - m_pr) * (1 - k_s[i] / k_s[i + 1]);
+      m_pr = m_pr + gamma[i];  // update for the next segment
+    }
+    return gamma;
+  }
+  // Logistic trend: cap .* inv_logit(rate .* (t - offset)), where
+  // rate = k + A * delta and offset = m + A * gamma, with gamma taken from
+  // logistic_gamma.
+  vector logistic_trend(
+    real k,
+    real m,
+    vector delta,
+    vector t,
+    vector cap,
+    matrix A,
+    vector t_change,
+    int S
+  ) {
+    vector[S] gamma;
+    gamma = logistic_gamma(k, m, delta, t_change, S);
+    return cap .* inv_logit((k + A * delta) .* (t - (m + A * gamma)));
+  }
+  // Linear trend function
+  // Piecewise-linear trend: slope (k + A * delta) applied to t, plus
+  // intercept m + A * (-t_change .* delta).
+  vector linear_trend(
+    real k,
+    real m,
+    vector delta,
+    vector t,
+    matrix A,
+    vector t_change
+  ) {
+    return (k + A * delta) .* t + (m + A * (-t_change .* delta));
+  }
+}
+// Inputs: the series, its time index, changepoint times, the regressor
+// matrix, prior scales and the trend selector.
+data {
+  int T;                // Number of time periods
+  int<lower=1> K;       // Number of regressors
+  vector[T] t;          // Time
+  vector[T] cap;        // Capacities for logistic trend
+  vector[T] y;          // Time series
+  int S;                // Number of changepoints
+  vector[S] t_change;   // Times of trend changepoints
+  matrix[T,K] X;        // Regressors
+  vector[K] sigmas;     // Scale on seasonality prior
+  real<lower=0> tau;    // Scale on changepoints prior
+  int trend_indicator;  // 0 for linear, 1 for logistic
+  vector[K] s_a;        // Indicator of additive features
+  vector[K] s_m;        // Indicator of multiplicative features
+}
+// Builds the changepoint indicator matrix A from t and t_change.
+transformed data {
+  matrix[T, S] A;
+  A = get_changepoint_matrix(t, t_change, T, S);
+}
+// Trend, noise and regressor parameters of the model.
+parameters {
+  real k;                   // Base trend growth rate
+  real m;                   // Trend offset
+  vector[S] delta;          // Trend rate adjustments
+  real<lower=0> sigma_obs;  // Observation noise
+  vector[K] beta;           // Regressor coefficients
+}
+// Priors plus a normal likelihood whose mean is
+// trend .* (1 + X * (beta .* s_m)) + X * (beta .* s_a).
+// No likelihood term is added when trend_indicator is neither 0 nor 1.
+model {
+  //priors
+  k ~ normal(0, 5);
+  m ~ normal(0, 5);
+  delta ~ double_exponential(0, tau);
+  sigma_obs ~ normal(0, 0.5);
+  beta ~ normal(0, sigmas);
+  // Likelihood
+  if (trend_indicator == 0) {
+    y ~ normal(
+      linear_trend(k, m, delta, t, A, t_change)
+      .* (1 + X * (beta .* s_m))
+      + X * (beta .* s_a),
+      sigma_obs
+    );
+  } else if (trend_indicator == 1) {
+    y ~ normal(
+      logistic_trend(k, m, delta, t, cap, A, t_change, S)
+      .* (1 + X * (beta .* s_m))
+      + X * (beta .* s_a),
+      sigma_obs
+    );
+  }
+}