prophet-rb 0.1.0

@@ -0,0 +1,5 @@
+ install:
+ 	@echo "Skipping"
+
+ clean:
+ 	@echo "Skipping"
@@ -0,0 +1,18 @@
+ require "cmdstan"
+ require "fileutils"
+ require "tmpdir"
+
+ platform = Gem.win_platform? ? "win" : "unix"
+ stan_file = File.expand_path("../../stan/#{platform}/prophet.stan", __dir__)
+
+ # copy to avoid temp file in repo
+ temp_file = "#{Dir.tmpdir}/prophet.stan"
+ FileUtils.cp(stan_file, temp_file)
+
+ # compile
+ sm = CmdStan::Model.new(stan_file: temp_file)
+
+ # save
+ target_dir = File.expand_path("../../stan_model", __dir__)
+ FileUtils.mkdir_p(target_dir)
+ FileUtils.cp(sm.exe_file, "#{target_dir}/prophet_model.bin")
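CmdStan::Model compiles the Stan program when it is instantiated, so by the time exe_file is read the binary already exists; copying it into stan_model/ lets the gem ship a precompiled model. Presumably the runtime backend (Prophet::StanBackend, not shown in this diff) then loads the cached binary instead of recompiling, along the lines of this sketch (assuming the constructor also accepts an exe_file: option, mirroring cmdstanpy):

    model = CmdStan::Model.new(exe_file: "stan_model/prophet_model.bin")
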
@@ -0,0 +1 @@
+ require "prophet"
@@ -0,0 +1,23 @@
+ # dependencies
+ require "cmdstan"
+ require "daru"
+ require "numo/narray"
+
+ # stdlib
+ require "logger"
+ require "set"
+
+ # modules
+ require "prophet/holidays"
+ require "prophet/plot"
+ require "prophet/forecaster"
+ require "prophet/stan_backend"
+ require "prophet/version"
+
+ module Prophet
+   class Error < StandardError; end
+
+   def self.new(**kwargs)
+     Forecaster.new(**kwargs)
+   end
+ end
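Since Prophet.new simply forwards its keyword arguments, these two calls are equivalent:

    m = Prophet.new(growth: "linear")
    m = Prophet::Forecaster.new(growth: "linear")
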
@@ -0,0 +1,986 @@
+ module Prophet
+   class Forecaster
+     include Holidays
+     include Plot
+
+     attr_reader :logger, :params, :train_holiday_names
+
+     def initialize(
+       growth: "linear",
+       changepoints: nil,
+       n_changepoints: 25,
+       changepoint_range: 0.8,
+       yearly_seasonality: "auto",
+       weekly_seasonality: "auto",
+       daily_seasonality: "auto",
+       holidays: nil,
+       seasonality_mode: "additive",
+       seasonality_prior_scale: 10.0,
+       holidays_prior_scale: 10.0,
+       changepoint_prior_scale: 0.05,
+       mcmc_samples: 0,
+       interval_width: 0.80,
+       uncertainty_samples: 1000
+     )
+       @growth = growth
+
+       @changepoints = to_datetime(changepoints)
+       if !@changepoints.nil?
+         @n_changepoints = @changepoints.size
+         @specified_changepoints = true
+       else
+         @n_changepoints = n_changepoints
+         @specified_changepoints = false
+       end
+
+       @changepoint_range = changepoint_range
+       @yearly_seasonality = yearly_seasonality
+       @weekly_seasonality = weekly_seasonality
+       @daily_seasonality = daily_seasonality
+       @holidays = holidays
+
+       @seasonality_mode = seasonality_mode
+       @seasonality_prior_scale = seasonality_prior_scale.to_f
+       @changepoint_prior_scale = changepoint_prior_scale.to_f
+       @holidays_prior_scale = holidays_prior_scale.to_f
+
+       @mcmc_samples = mcmc_samples
+       @interval_width = interval_width
+       @uncertainty_samples = uncertainty_samples
+
+       # Set during fitting or by other methods
+       @start = nil
+       @y_scale = nil
+       @logistic_floor = false
+       @t_scale = nil
+       @changepoints_t = nil
+       @seasonalities = {}
+       @extra_regressors = {}
+       @country_holidays = nil
+       @stan_fit = nil
+       @params = {}
+       @history = nil
+       @history_dates = nil
+       @train_component_cols = nil
+       @component_modes = nil
+       @train_holiday_names = nil
+       @fit_kwargs = {}
+       validate_inputs
+
+       @logger = ::Logger.new($stderr)
+       @logger.formatter = proc do |severity, datetime, progname, msg|
+         "[prophet] #{msg}\n"
+       end
+       @stan_backend = StanBackend.new(@logger)
+     end
+
+     def validate_inputs
+       if !["linear", "logistic"].include?(@growth)
+         raise ArgumentError, "Parameter \"growth\" should be \"linear\" or \"logistic\"."
+       end
+       if @changepoint_range < 0 || @changepoint_range > 1
+         raise ArgumentError, "Parameter \"changepoint_range\" must be in [0, 1]"
+       end
+       if @holidays
+         if !(@holidays.is_a?(Daru::DataFrame) && @holidays.vectors.include?("ds") && @holidays.vectors.include?("holiday"))
+           raise ArgumentError, "holidays must be a DataFrame with \"ds\" and \"holiday\" columns."
+         end
+         @holidays["ds"] = to_datetime(@holidays["ds"])
+         has_lower = @holidays.vectors.include?("lower_window")
+         has_upper = @holidays.vectors.include?("upper_window")
+         if has_lower ^ has_upper # xor
+           raise ArgumentError, "Holidays must have both lower_window and upper_window, or neither"
+         end
+         if has_lower
+           if @holidays["lower_window"].max > 0
+             raise ArgumentError, "Holiday lower_window should be <= 0"
+           end
+           if @holidays["upper_window"].min < 0
+             raise ArgumentError, "Holiday upper_window should be >= 0"
+           end
+         end
+         @holidays["holiday"].uniq.each do |h|
+           validate_column_name(h, check_holidays: false)
+         end
+       end
+
+       if !["additive", "multiplicative"].include?(@seasonality_mode)
+         raise ArgumentError, "seasonality_mode must be \"additive\" or \"multiplicative\""
+       end
+     end
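+
+     # Example of a holidays frame accepted above (the window columns are
+     # optional, but must be supplied as a pair):
+     #
+     #   holidays = Daru::DataFrame.new(
+     #     "holiday" => ["playoff", "playoff"],
+     #     "ds" => ["2020-01-12", "2021-01-10"],
+     #     "lower_window" => [0, 0],
+     #     "upper_window" => [1, 1]
+     #   )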
+
+     def validate_column_name(name, check_holidays: true, check_seasonalities: true, check_regressors: true)
+       if name.include?("_delim_")
+         raise ArgumentError, "Name cannot contain \"_delim_\""
+       end
+       reserved_names = [
+         "trend", "additive_terms", "daily", "weekly", "yearly",
+         "holidays", "zeros", "extra_regressors_additive", "yhat",
+         "extra_regressors_multiplicative", "multiplicative_terms",
+       ]
+       rn_l = reserved_names.map { |n| n + "_lower" }
+       rn_u = reserved_names.map { |n| n + "_upper" }
+       reserved_names.concat(rn_l)
+       reserved_names.concat(rn_u)
+       reserved_names.concat(["ds", "y", "cap", "floor", "y_scaled", "cap_scaled"])
+       if reserved_names.include?(name)
+         raise ArgumentError, "Name #{name.inspect} is reserved."
+       end
+       if check_holidays && @holidays && @holidays["holiday"].uniq.include?(name)
+         raise ArgumentError, "Name #{name.inspect} already used for a holiday."
+       end
+       if check_holidays && @country_holidays && get_holiday_names(@country_holidays).include?(name)
+         raise ArgumentError, "Name #{name.inspect} is a holiday name in #{@country_holidays.inspect}."
+       end
+       if check_seasonalities && @seasonalities[name]
+         raise ArgumentError, "Name #{name.inspect} already used for a seasonality."
+       end
+       if check_regressors && @extra_regressors[name]
+         raise ArgumentError, "Name #{name.inspect} already used for an added regressor."
+       end
+     end
+
+     def setup_dataframe(df, initialize_scales: false)
+       if df.vectors.include?("y")
+         df["y"] = df["y"].map(&:to_f)
+         raise ArgumentError, "Found infinity in column y." unless df["y"].all?(&:finite?)
+       end
+       # TODO support integers
+
+       df["ds"] = to_datetime(df["ds"])
+
+       raise ArgumentError, "Found NaN in column ds." if df["ds"].any?(&:nil?)
+
+       @extra_regressors.each_key do |name|
+         if !df.vectors.include?(name)
+           raise ArgumentError, "Regressor #{name.inspect} missing from dataframe"
+         end
+         df[name] = df[name].map(&:to_f)
+         if df[name].any?(&:nil?)
+           raise ArgumentError, "Found NaN in column #{name.inspect}"
+         end
+       end
+       @seasonalities.values.each do |props|
+         condition_name = props[:condition_name]
+         if condition_name
+           if !df.vectors.include?(condition_name)
+             raise ArgumentError, "Condition #{condition_name.inspect} missing from dataframe"
+           end
+           if df.where(!df[condition_name].in([true, false])).size > 0
+             raise ArgumentError, "Found non-boolean in column #{condition_name.inspect}"
+           end
+         end
+       end
+
+       if df.index.name == "ds"
+         df.index.name = nil
+       end
+       df = df.sort(["ds"])
+
+       initialize_scales(initialize_scales, df)
+
+       if @logistic_floor
+         unless df.vectors.include?("floor")
+           raise ArgumentError, "Expected column \"floor\"."
+         end
+       else
+         df["floor"] = 0
+       end
+
+       if @growth == "logistic"
+         unless df.vectors.include?("cap")
+           raise ArgumentError, "Capacities must be supplied for logistic growth in column \"cap\""
+         end
+         if df.where(df["cap"] <= df["floor"]).size > 0
+           raise ArgumentError, "cap must be greater than floor (which defaults to 0)."
+         end
+         df["cap_scaled"] = (df["cap"] - df["floor"]) / @y_scale
+       end
+
+       df["t"] = (df["ds"] - @start) / @t_scale.to_f
+       if df.vectors.include?("y")
+         df["y_scaled"] = (df["y"] - df["floor"]) / @y_scale
+       end
+
+       @extra_regressors.each do |name, props|
+         df[name] = (df[name] - props[:mu]) / props[:std]
+       end
+
+       df
+     end
+
+     def initialize_scales(initialize_scales, df)
+       return unless initialize_scales
+
+       floor = 0
+       @y_scale = (df["y"] - floor).abs.max
+       @y_scale = 1 if @y_scale == 0
+       @start = df["ds"].min
+       @t_scale = df["ds"].max - @start
+     end
+
+     def set_changepoints
+       hist_size = (@history.shape[0] * @changepoint_range).floor
+
+       if @n_changepoints + 1 > hist_size
+         @n_changepoints = hist_size - 1
+         logger.info "n_changepoints greater than number of observations. Using #{@n_changepoints}"
+       end
+
+       if @n_changepoints > 0
+         step = (hist_size - 1) / @n_changepoints.to_f
+         cp_indexes = (@n_changepoints + 1).times.map { |i| (i * step).round }
+         @changepoints = @history["ds"][*cp_indexes][1..-1]
+       else
+         @changepoints = []
+       end
+
+       if @changepoints.size > 0
+         @changepoints_t = Numo::NArray.asarray(((@changepoints - @start) / @t_scale.to_f).to_a).sort
+       else
+         @changepoints_t = Numo::NArray.asarray([0])
+       end
+     end
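+
+     # For example, 100 history rows with the default changepoint_range of
+     # 0.8 and n_changepoints of 25 give hist_size 80: 26 evenly spaced
+     # indexes are taken over those rows and the first is dropped by the
+     # [1..-1] slice, leaving 25 candidate changepoints.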
+
+     def fourier_series(dates, period, series_order)
+       start = Time.utc(1970).to_i
+       # uses to_datetime first so we get UTC
+       t = Numo::DFloat.asarray(dates.map { |v| v.to_i - start }) / (3600 * 24.0)
+
+       # no need for column_stack
+       series_order.times.flat_map do |i|
+         [Numo::DFloat::Math.method(:sin), Numo::DFloat::Math.method(:cos)].map do |fun|
+           fun.call(2.0 * (i + 1) * Math::PI * t / period)
+         end
+       end
+     end
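+
+     # For example, period: 7 and series_order: 3 produce six vectors:
+     # sin(2*PI*t/7), cos(2*PI*t/7), ..., sin(6*PI*t/7), cos(6*PI*t/7),
+     # where t is the time in days since the Unix epoch.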
+
+     def make_seasonality_features(dates, period, series_order, prefix)
+       features = fourier_series(dates, period, series_order)
+       Daru::DataFrame.new(features.map.with_index { |v, i| ["#{prefix}_delim_#{i + 1}", v] }.to_h)
+     end
+
+     def construct_holiday_dataframe(dates)
+       all_holidays = Daru::DataFrame.new
+       if @holidays
+         all_holidays = @holidays.dup
+       end
+       if @country_holidays
+         year_list = dates.map(&:year)
+         country_holidays_df = make_holidays_df(year_list, @country_holidays)
+         all_holidays = all_holidays.concat(country_holidays_df)
+       end
+       # Drop future holidays not previously seen in training data
+       if @train_holiday_names
+         # Remove holiday names that didn't show up in fit
+         all_holidays = all_holidays.where(all_holidays["holiday"].in(@train_holiday_names))
+
+         # Add holiday names seen in fit but not in predict, with ds as NA
+         holidays_to_add = Daru::DataFrame.new(
+           "holiday" => @train_holiday_names.where(!@train_holiday_names.in(all_holidays["holiday"]))
+         )
+         all_holidays = all_holidays.concat(holidays_to_add)
+       end
+
+       all_holidays
+     end
+
+     def make_holiday_features(dates, holidays)
+       expanded_holidays = Hash.new { |hash, key| hash[key] = Numo::DFloat.zeros(dates.size) }
+       prior_scales = {}
+       # Makes an index so we can perform `get_loc` below.
+       # Strip to just dates.
+       row_index = dates.map(&:to_date)
+
+       holidays.each_row do |row|
+         dt = row["ds"]
+         lw = nil
+         uw = nil
+         begin
+           lw = row["lower_window"].to_i
+           uw = row["upper_window"].to_i
+         rescue IndexError
+           lw = 0
+           uw = 0
+         end
+         ps = @holidays_prior_scale
+         if prior_scales[row["holiday"]] && prior_scales[row["holiday"]] != ps
+           raise ArgumentError, "Holiday #{row["holiday"].inspect} does not have consistent prior scale specification."
+         end
+         raise ArgumentError, "Prior scale must be > 0" if ps <= 0
+         prior_scales[row["holiday"]] = ps
+
+         lw.upto(uw).each do |offset|
+           # offset is in days, so match against the date-only index
+           occurrence = dt ? dt.to_date + offset : nil
+           loc = occurrence ? row_index.index(occurrence) : nil
+           key = "#{row["holiday"]}_delim_#{offset >= 0 ? "+" : "-"}#{offset.abs}"
+           if loc
+             expanded_holidays[key][loc] = 1.0
+           else
+             expanded_holidays[key] # Access key to generate value
+           end
+         end
+       end
+       holiday_features = Daru::DataFrame.new(expanded_holidays)
+       # Make sure column order is consistent
+       holiday_features = holiday_features[*holiday_features.vectors.sort]
+       prior_scale_list = holiday_features.vectors.map { |h| prior_scales[h.split("_delim_")[0]] }
+       holiday_names = prior_scales.keys
+       # Store holiday names used in fit
+       if !@train_holiday_names
+         @train_holiday_names = Daru::Vector.new(holiday_names)
+       end
+       [holiday_features, prior_scale_list, holiday_names]
+     end
+
+     def add_regressor(name, prior_scale: nil, standardize: "auto", mode: nil)
+       raise Error, "Regressors must be added prior to model fitting." if @history
+       validate_column_name(name, check_regressors: false)
+       prior_scale ||= @holidays_prior_scale.to_f
+       mode ||= @seasonality_mode
+       raise ArgumentError, "Prior scale must be > 0" if prior_scale <= 0
+       if !["additive", "multiplicative"].include?(mode)
+         raise ArgumentError, "mode must be \"additive\" or \"multiplicative\""
+       end
+       @extra_regressors[name] = {
+         prior_scale: prior_scale,
+         standardize: standardize,
+         mu: 0.0,
+         std: 1.0,
+         mode: mode
+       }
+       self
+     end
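+
+     # Example (the named column must then be present in the frames passed
+     # to fit and predict):
+     #
+     #   m.add_regressor("promo", mode: "additive")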
+
+     def add_seasonality(name:, period:, fourier_order:, prior_scale: nil, mode: nil, condition_name: nil)
+       raise Error, "Seasonality must be added prior to model fitting." if @history
+
+       if !["daily", "weekly", "yearly"].include?(name)
+         # Allow overwriting built-in seasonalities
+         validate_column_name(name, check_seasonalities: false)
+       end
+       if prior_scale.nil?
+         ps = @seasonality_prior_scale
+       else
+         ps = prior_scale.to_f
+       end
+       raise ArgumentError, "Prior scale must be > 0" if ps <= 0
+       raise ArgumentError, "Fourier Order must be > 0" if fourier_order <= 0
+       mode ||= @seasonality_mode
+       if !["additive", "multiplicative"].include?(mode)
+         raise ArgumentError, "mode must be \"additive\" or \"multiplicative\""
+       end
+       validate_column_name(condition_name) if condition_name
+       @seasonalities[name] = {
+         period: period,
+         fourier_order: fourier_order,
+         prior_scale: ps,
+         mode: mode,
+         condition_name: condition_name
+       }
+       self
+     end
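+
+     # Example:
+     #
+     #   m.add_seasonality(name: "monthly", period: 30.5, fourier_order: 5)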
+
+     def add_country_holidays(country_name)
+       raise Error, "Country holidays must be added prior to model fitting." if @history
+       # Validate names.
+       get_holiday_names(country_name).each do |name|
+         # Allow merging with existing holidays
+         validate_column_name(name, check_holidays: false)
+       end
+       # Set the holidays.
+       if @country_holidays
+         logger.warn "Changing country holidays from #{@country_holidays.inspect} to #{country_name.inspect}."
+       end
+       @country_holidays = country_name
+       self
+     end
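+
+     # Example:
+     #
+     #   m.add_country_holidays("US")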
+
+     def make_all_seasonality_features(df)
+       seasonal_features = []
+       prior_scales = []
+       modes = {"additive" => [], "multiplicative" => []}
+
+       # Seasonality features
+       @seasonalities.each do |name, props|
+         features = make_seasonality_features(
+           df["ds"],
+           props[:period],
+           props[:fourier_order],
+           name
+         )
+         if props[:condition_name]
+           # Zero out the features on rows where the condition is false
+           df[props[:condition_name]].to_a.each_with_index do |cond, i|
+             features.row[i] = [0.0] * features.shape[1] unless cond
+           end
+         end
+         seasonal_features << features
+         prior_scales.concat([props[:prior_scale]] * features.shape[1])
+         modes[props[:mode]] << name
+       end
+
+       # Holiday features
+       holidays = construct_holiday_dataframe(df["ds"])
+       if holidays.size > 0
+         features, holiday_priors, holiday_names = make_holiday_features(df["ds"], holidays)
+         seasonal_features << features
+         prior_scales.concat(holiday_priors)
+         modes[@seasonality_mode].concat(holiday_names)
+       end
+
+       # Additional regressors
+       @extra_regressors.each do |name, props|
+         seasonal_features << df[name].to_df
+         prior_scales << props[:prior_scale]
+         modes[props[:mode]] << name
+       end
+
+       # Dummy to prevent empty X
+       if seasonal_features.size == 0
+         seasonal_features << Daru::DataFrame.new("zeros" => [0] * df.shape[0])
+         prior_scales << 1.0
+       end
+
+       seasonal_features = df_concat_axis_one(seasonal_features)
+
+       component_cols, modes = regressor_column_matrix(seasonal_features, modes)
+
+       [seasonal_features, prior_scales, component_cols, modes]
+     end
+
+     def regressor_column_matrix(seasonal_features, modes)
+       components = Daru::DataFrame.new(
+         "col" => seasonal_features.shape[1].times.to_a,
+         "component" => seasonal_features.vectors.map { |x| x.split("_delim_")[0] }
+       )
+
+       # Add total for holidays
+       if @train_holiday_names
+         components = add_group_component(components, "holidays", @train_holiday_names.uniq)
+       end
+       # Add totals for additive and multiplicative components, and regressors
+       ["additive", "multiplicative"].each do |mode|
+         components = add_group_component(components, mode + "_terms", modes[mode])
+         regressors_by_mode = @extra_regressors.select { |r, props| props[:mode] == mode }
+           .map { |r, props| r }
+         components = add_group_component(components, "extra_regressors_" + mode, regressors_by_mode)
+
+         # Add combination components to modes
+         modes[mode] << mode + "_terms"
+         modes[mode] << "extra_regressors_" + mode
+       end
+       # After all of the additive/multiplicative groups have been added,
+       # count holidays towards the overall seasonality mode
+       modes[@seasonality_mode] << "holidays"
+       # Convert to a binary matrix
+       component_cols = Daru::DataFrame.crosstab_by_assignation(
+         components["col"], components["component"], [1] * components.size
+       )
+       component_cols.each_vector do |v|
+         v.map! { |vi| vi.nil? ? 0 : vi }
+       end
+       component_cols.rename_vectors(:_id => "col")
+
+       # Add columns for additive and multiplicative terms, if missing
+       ["additive_terms", "multiplicative_terms"].each do |name|
+         component_cols[name] = 0 unless component_cols.vectors.include?(name)
+       end
+
+       # TODO validation
+
+       [component_cols, modes]
+     end
+
+     def add_group_component(components, name, group)
+       new_comp = components.where(components["component"].in(group)).dup
+       group_cols = new_comp["col"].uniq
+       if group_cols.size > 0
+         new_comp = Daru::DataFrame.new("col" => group_cols, "component" => [name] * group_cols.size)
+         components = components.concat(new_comp)
+       end
+       components
+     end
+
+     def parse_seasonality_args(name, arg, auto_disable, default_order)
+       case arg
+       when "auto"
+         fourier_order = 0
+         if @seasonalities.include?(name)
+           logger.info "Found custom seasonality named #{name.inspect}, disabling built-in #{name.inspect} seasonality."
+         elsif auto_disable
+           logger.info "Disabling #{name} seasonality. Run prophet with #{name}_seasonality: true to override this."
+         else
+           fourier_order = default_order
+         end
+       when true
+         fourier_order = default_order
+       when false
+         fourier_order = 0
+       else
+         fourier_order = arg.to_i
+       end
+       fourier_order
+     end
+
+     def set_auto_seasonalities
+       first = @history["ds"].min
+       last = @history["ds"].max
+       dt = @history["ds"].diff
+       min_dt = dt.min
+
+       days = 86400
+
+       # Yearly seasonality
+       yearly_disable = last - first < 370 * days
+       fourier_order = parse_seasonality_args("yearly", @yearly_seasonality, yearly_disable, 10)
+       if fourier_order > 0
+         @seasonalities["yearly"] = {
+           period: 365.25,
+           fourier_order: fourier_order,
+           prior_scale: @seasonality_prior_scale,
+           mode: @seasonality_mode,
+           condition_name: nil
+         }
+       end
+
+       # Weekly seasonality
+       weekly_disable = last - first < 14 * days || min_dt >= 7 * days
+       fourier_order = parse_seasonality_args("weekly", @weekly_seasonality, weekly_disable, 3)
+       if fourier_order > 0
+         @seasonalities["weekly"] = {
+           period: 7,
+           fourier_order: fourier_order,
+           prior_scale: @seasonality_prior_scale,
+           mode: @seasonality_mode,
+           condition_name: nil
+         }
+       end
+
+       # Daily seasonality
+       daily_disable = last - first < 2 * days || min_dt >= 1 * days
+       fourier_order = parse_seasonality_args("daily", @daily_seasonality, daily_disable, 4)
+       if fourier_order > 0
+         @seasonalities["daily"] = {
+           period: 1,
+           fourier_order: fourier_order,
+           prior_scale: @seasonality_prior_scale,
+           mode: @seasonality_mode,
+           condition_name: nil
+         }
+       end
+     end
+
+     def linear_growth_init(df)
+       i0 = df["ds"].index.min
+       i1 = df["ds"].index.max
+       t = df["t"][i1] - df["t"][i0]
+       k = (df["y_scaled"][i1] - df["y_scaled"][i0]) / t
+       m = df["y_scaled"][i0] - k * df["t"][i0]
+       [k, m]
+     end
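+
+     # k is the slope of the line through the first and last scaled
+     # observations; m shifts it so the line passes through the first point.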
+
+     def logistic_growth_init(df)
+       i0 = df["ds"].index.min
+       i1 = df["ds"].index.max
+       t = df["t"][i1] - df["t"][i0]
+
+       # Force valid values, in case y > cap or y < 0
+       c0 = df["cap_scaled"][i0]
+       c1 = df["cap_scaled"][i1]
+       y0 = [0.01 * c0, [0.99 * c0, df["y_scaled"][i0]].min].max
+       y1 = [0.01 * c1, [0.99 * c1, df["y_scaled"][i1]].min].max
+
+       r0 = c0 / y0
+       r1 = c1 / y1
+
+       if (r0 - r1).abs <= 0.01
+         r0 = 1.05 * r0
+       end
+
+       l0 = Math.log(r0 - 1)
+       l1 = Math.log(r1 - 1)
+
+       # Initialize the offset
+       m = l0 * t / (l0 - l1)
+       # And the rate
+       k = (l0 - l1) / t
+       [k, m]
+     end
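+
+     # Derivation: for the logistic curve cap / (1 + exp(-k * (t - m))),
+     # let r = cap / y and l = log(r - 1) at the two endpoints; then
+     # k = (l0 - l1) / t and m = l0 * t / (l0 - l1) fit both points exactly.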
+
+     def fit(df, **kwargs)
+       raise Error, "Prophet object can only be fit once" if @history
+
+       history = df.where(!df["y"].in([nil, Float::NAN]))
+       raise Error, "Data has less than 2 non-nil rows" if history.shape[0] < 2
+
+       @history_dates = to_datetime(df["ds"]).sort
+       history = setup_dataframe(history, initialize_scales: true)
+       @history = history
+       set_auto_seasonalities
+       seasonal_features, prior_scales, component_cols, modes = make_all_seasonality_features(history)
+       @train_component_cols = component_cols
+       @component_modes = modes
+       @fit_kwargs = kwargs.dup # TODO deep dup?
+
+       set_changepoints
+
+       dat = {
+         "T" => history.shape[0],
+         "K" => seasonal_features.shape[1],
+         "S" => @changepoints_t.size,
+         "y" => history["y_scaled"],
+         "t" => history["t"],
+         "t_change" => @changepoints_t,
+         "X" => seasonal_features,
+         "sigmas" => prior_scales,
+         "tau" => @changepoint_prior_scale,
+         "trend_indicator" => @growth == "logistic" ? 1 : 0,
+         "s_a" => component_cols["additive_terms"],
+         "s_m" => component_cols["multiplicative_terms"]
+       }
+
+       if @growth == "linear"
+         dat["cap"] = Numo::DFloat.zeros(@history.shape[0])
+         kinit = linear_growth_init(history)
+       else
+         dat["cap"] = history["cap_scaled"]
+         kinit = logistic_growth_init(history)
+       end
+
+       stan_init = {
+         "k" => kinit[0],
+         "m" => kinit[1],
+         "delta" => Numo::DFloat.zeros(@changepoints_t.size),
+         "beta" => Numo::DFloat.zeros(seasonal_features.shape[1]),
+         "sigma_obs" => 1
+       }
+
+       if history["y"].min == history["y"].max && @growth == "linear"
+         # Nothing to fit.
+         @params = stan_init
+         @params["sigma_obs"] = 1e-9
+         @params.each_key do |par|
+           @params[par] = Numo::NArray.asarray(@params[par])
+         end
+       elsif @mcmc_samples > 0
+         @params = @stan_backend.sampling(stan_init, dat, @mcmc_samples, **kwargs)
+       else
+         @params = @stan_backend.fit(stan_init, dat, **kwargs)
+       end
+
+       # If no changepoints were requested, replace delta with 0s
+       if @changepoints.size == 0
+         # Fold delta into the base rate k
+         @params["k"] = @params["k"] + @params["delta"].reshape(-1)
+         @params["delta"] = Numo::DFloat.zeros(@params["delta"].shape).reshape(-1, 1)
+       end
+
+       self
+     end
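+
+     # Example (df is a Daru::DataFrame with "ds" and "y" columns):
+     #
+     #   m = Prophet.new
+     #   m.fit(df)
+     #   future = m.make_future_dataframe(periods: 30)
+     #   forecast = m.predict(future)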
+
+     def predict(df = nil)
+       raise Error, "Model has not been fit." unless @history
+
+       if df.nil?
+         df = @history.dup
+       else
+         raise ArgumentError, "Dataframe has no rows." if df.shape[0] == 0
+         df = setup_dataframe(df.dup)
+       end
+
+       df["trend"] = predict_trend(df)
+       seasonal_components = predict_seasonal_components(df)
+       if @uncertainty_samples
+         intervals = predict_uncertainty(df)
+       else
+         intervals = nil
+       end
+
+       # Drop columns except ds, cap, floor, and trend
+       cols = ["ds", "trend"]
+       cols << "cap" if df.vectors.include?("cap")
+       cols << "floor" if @logistic_floor
+       # Add in forecast components
+       df2 = df_concat_axis_one([df[*cols], intervals, seasonal_components])
+       df2["yhat"] = df2["trend"] * (df2["multiplicative_terms"] + 1) + df2["additive_terms"]
+       df2
+     end
+
+     def piecewise_linear(t, deltas, k, m, changepoint_ts)
+       # Intercept changes
+       gammas = -changepoint_ts * deltas
+       # Get cumulative slope and intercept at each t
+       k_t = t.new_ones * k
+       m_t = t.new_ones * m
+       changepoint_ts.each_with_index do |t_s, s|
+         indx = t >= t_s
+         k_t[indx] += deltas[s]
+         m_t[indx] += gammas[s]
+       end
+       k_t * t + m_t
+     end
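+
+     # Each changepoint s adds deltas[s] to the slope for t >= t_s;
+     # gammas[s] = -t_s * deltas[s] offsets the intercept so the trend
+     # stays continuous at the changepoint.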
+
+     def piecewise_logistic(t, cap, deltas, k, m, changepoint_ts)
+       k_1d = Numo::NArray.asarray(k)
+       k_1d = k_1d.reshape(1) if k_1d.ndim < 1
+       k_cum = k_1d.concatenate(deltas.cumsum + k)
+       gammas = Numo::DFloat.zeros(changepoint_ts.size)
+       changepoint_ts.each_with_index do |t_s, i|
+         gammas[i] = (t_s - m - gammas.sum) * (1 - k_cum[i] / k_cum[i + 1])
+       end
+       # Get cumulative rate and offset at each t
+       k_t = t.new_ones * k
+       m_t = t.new_ones * m
+       changepoint_ts.each_with_index do |t_s, s|
+         indx = t >= t_s
+         k_t[indx] += deltas[s]
+         m_t[indx] += gammas[s]
+       end
+       # need df_values to prevent memory from blowing up
+       df_values(cap) / (1 + Numo::NMath.exp(-k_t * (t - m_t)))
+     end
+
+     def predict_trend(df)
+       k = @params["k"].mean(nan: true)
+       m = @params["m"].mean(nan: true)
+       deltas = @params["delta"].mean(axis: 0, nan: true)
+
+       t = Numo::NArray.asarray(df["t"].to_a)
+       if @growth == "linear"
+         trend = piecewise_linear(t, deltas, k, m, @changepoints_t)
+       else
+         cap = df["cap_scaled"]
+         trend = piecewise_logistic(t, cap, deltas, k, m, @changepoints_t)
+       end
+
+       trend * @y_scale + Numo::NArray.asarray(df["floor"].to_a)
+     end
+
+     def predict_seasonal_components(df)
+       seasonal_features, _, component_cols, _ = make_all_seasonality_features(df)
+       if @uncertainty_samples
+         lower_p = 100 * (1.0 - @interval_width) / 2
+         upper_p = 100 * (1.0 + @interval_width) / 2
+       end
+
+       x = df_values(seasonal_features)
+       data = {}
+       component_cols.vectors.each do |component|
+         beta_c = @params["beta"] * Numo::NArray.asarray(component_cols[component].to_a)
+
+         comp = x.dot(beta_c.transpose)
+         if @component_modes["additive"].include?(component)
+           comp *= @y_scale
+         end
+         data[component] = comp.mean(axis: 1, nan: true)
+         if @uncertainty_samples
+           data[component + "_lower"] = percentile(comp, lower_p, axis: 1)
+           data[component + "_upper"] = percentile(comp, upper_p, axis: 1)
+         end
+       end
+       Daru::DataFrame.new(data)
+     end
+
+     def sample_posterior_predictive(df)
+       n_iterations = @params["k"].shape[0]
+       samp_per_iter = [1, (@uncertainty_samples / n_iterations.to_f).ceil].max
+
+       # Generate seasonality features once so we can re-use them.
+       seasonal_features, _, component_cols, _ = make_all_seasonality_features(df)
+
+       # convert to Numo for performance
+       seasonal_features = df_values(seasonal_features)
+       additive_terms = df_values(component_cols["additive_terms"])
+       multiplicative_terms = df_values(component_cols["multiplicative_terms"])
+
+       sim_values = {"yhat" => [], "trend" => []}
+       n_iterations.times do |i|
+         samp_per_iter.times do
+           sim = sample_model(
+             df,
+             seasonal_features,
+             i,
+             additive_terms,
+             multiplicative_terms
+           )
+           sim_values.each_key do |key|
+             sim_values[key] << sim[key]
+           end
+         end
+       end
+       sim_values.each do |k, v|
+         sim_values[k] = Numo::NArray.column_stack(v)
+       end
+       sim_values
+     end
+
+     def predictive_samples(df)
+       df = setup_dataframe(df.dup)
+       sim_values = sample_posterior_predictive(df)
+       sim_values
+     end
+
+     def predict_uncertainty(df)
+       sim_values = sample_posterior_predictive(df)
+
+       lower_p = 100 * (1.0 - @interval_width) / 2
+       upper_p = 100 * (1.0 + @interval_width) / 2
+
+       series = {}
+       ["yhat", "trend"].each do |key|
+         series["#{key}_lower"] = percentile(sim_values[key], lower_p, axis: 1)
+         series["#{key}_upper"] = percentile(sim_values[key], upper_p, axis: 1)
+       end
+
+       Daru::DataFrame.new(series)
+     end
+
+     def sample_model(df, seasonal_features, iteration, s_a, s_m)
+       trend = sample_predictive_trend(df, iteration)
+
+       beta = @params["beta"][iteration, true]
+       xb_a = seasonal_features.dot(beta * s_a) * @y_scale
+       xb_m = seasonal_features.dot(beta * s_m)
+
+       sigma = @params["sigma_obs"][iteration]
+       noise = Numo::DFloat.new(df.shape[0]).rand_norm(0, sigma) * @y_scale
+
+       # skip data frame for performance
+       {
+         "yhat" => trend * (1 + xb_m) + xb_a + noise,
+         "trend" => trend
+       }
+     end
+
+     def sample_predictive_trend(df, iteration)
+       k = @params["k"][iteration, true]
+       m = @params["m"][iteration, true]
+       deltas = @params["delta"][iteration, true]
+
+       t = Numo::NArray.asarray(df["t"].to_a)
+       upper_t = t.max
+
+       # New changepoints from a Poisson process with rate S on [1, T]
+       if upper_t > 1
+         s = @changepoints_t.size
+         n_changes = poisson(s * (upper_t - 1))
+       else
+         n_changes = 0
+       end
+       if n_changes > 0
+         changepoint_ts_new = 1 + Numo::DFloat.new(n_changes).rand * (upper_t - 1)
+         changepoint_ts_new = changepoint_ts_new.sort
+       else
+         changepoint_ts_new = []
+       end
+
+       # Get the empirical scale of the deltas, plus epsilon to avoid NaNs.
+       lambda_ = deltas.abs.mean + 1e-8
+
+       # Sample deltas
+       deltas_new = laplace(0, lambda_, n_changes)
+
+       # Prepend the times and deltas from the history
+       changepoint_ts = @changepoints_t.concatenate(changepoint_ts_new)
+       deltas = deltas.concatenate(deltas_new)
+
+       if @growth == "linear"
+         trend = piecewise_linear(t, deltas, k, m, changepoint_ts)
+       else
+         cap = df["cap_scaled"]
+         trend = piecewise_logistic(t, cap, deltas, k, m, changepoint_ts)
+       end
+
+       trend * @y_scale + Numo::NArray.asarray(df["floor"].to_a)
+     end
+
+     def percentile(a, percentile, axis:)
+       raise Error, "Axis must be 1" if axis != 1
+
+       sorted = a.sort(axis: axis)
+       x = percentile / 100.0 * (sorted.shape[axis] - 1)
+       r = x % 1
+       i = x.floor
+       # this should use axis, but we only need axis: 1
+       if i == sorted.shape[axis] - 1
+         sorted[true, -1]
+       else
+         sorted[true, i] + r * (sorted[true, i + 1] - sorted[true, i])
+       end
+     end
+
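+     # For example, percentile over rows of [1, 2, 3, 4] at 50 gives
+     # x = 1.5, interpolating halfway between the 2nd and 3rd values: 2.5.
+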
+     def make_future_dataframe(periods:, freq: "D", include_history: true)
+       raise Error, "Model has not been fit" unless @history_dates
+       last_date = @history_dates.max
+       case freq
+       when "D"
+         # days have constant length with UTC (no DST or leap seconds)
+         dates = (periods + 1).times.map { |i| last_date + i * 86400 }
+       when "H"
+         dates = (periods + 1).times.map { |i| last_date + i * 3600 }
+       when "MS"
+         dates = [last_date]
+         periods.times do
+           dates << dates.last.to_datetime.next_month.to_time.utc
+         end
+       else
+         raise ArgumentError, "Unknown freq: #{freq}"
+       end
+       dates.select! { |d| d > last_date }
+       dates = dates.last(periods)
+       dates = @history_dates + dates if include_history
+       Daru::DataFrame.new("ds" => dates)
+     end
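+
+     # Example:
+     #
+     #   future = m.make_future_dataframe(periods: 12, freq: "MS")
+     #   # freq: "D" (daily), "H" (hourly), or "MS" (month start)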
+
+     private
+
+     # Time is preferred over DateTime in Ruby;
+     # use UTC to be consistent with Python
+     # and so days have equal length (no DST)
+     def to_datetime(vec)
+       return if vec.nil?
+       vec.map do |v|
+         case v
+         when Time
+           v.utc
+         when Date
+           v.to_datetime.to_time.utc
+         else
+           DateTime.parse(v.to_s).to_time.utc
+         end
+       end
+     end
+
+     # okay to do in-place
+     def df_concat_axis_one(dfs)
+       # skip nil frames (e.g. when uncertainty intervals are disabled)
+       dfs = dfs.compact
+       dfs[1..-1].each do |df|
+         df.each_vector_with_index do |v, k|
+           dfs[0][k] = v
+         end
+       end
+       dfs[0]
+     end
+
+     def df_values(df)
+       if df.is_a?(Daru::Vector)
+         Numo::NArray.asarray(df.to_a)
+       else
+         # TODO make more performant
+         Numo::NArray.asarray(df.to_matrix.to_a)
+       end
+     end
+
+     # https://en.wikipedia.org/wiki/Poisson_distribution#Generating_Poisson-distributed_random_variables
+     def poisson(lam)
+       l = Math.exp(-lam)
+       k = 0
+       p = 1
+       while p > l
+         k += 1
+         p *= rand
+       end
+       k - 1
+     end
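+
+     # poisson above is Knuth's method: count how many uniform draws it
+     # takes for their product to fall below e^(-lam); that count minus
+     # one is Poisson(lam)-distributed.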
+
+     # https://en.wikipedia.org/wiki/Laplace_distribution#Generating_values_from_the_Laplace_distribution
+     def laplace(loc, scale, size)
+       u = Numo::DFloat.new(size).rand - 0.5
+       loc - scale * u.sign * Numo::NMath.log(1 - 2 * u.abs)
+     end
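+
+     # laplace above is inverse-CDF sampling: u is uniform on (-0.5, 0.5)
+     # and is mapped through the Laplace quantile function with the given
+     # location and scale.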
+   end
+ end