prophet-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ install:
+ 	@echo "Skipping"
+
+ clean:
+ 	@echo "Skipping"
@@ -0,0 +1,18 @@
+ require "cmdstan"
+ require "fileutils"
+ require "tmpdir"
+
+ platform = Gem.win_platform? ? "win" : "unix"
+ stan_file = File.expand_path("../../stan/#{platform}/prophet.stan", __dir__)
+
+ # copy to avoid temp file in repo
+ temp_file = "#{Dir.tmpdir}/prophet.stan"
+ FileUtils.cp(stan_file, temp_file)
+
+ # compile
+ sm = CmdStan::Model.new(stan_file: temp_file)
+
+ # save
+ target_dir = File.expand_path("../../stan_model", __dir__)
+ FileUtils.mkdir_p(target_dir)
+ FileUtils.cp(sm.exe_file, "#{target_dir}/prophet_model.bin")
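The compile step relies on the cmdstan gem doing its work in the constructor: CmdStan::Model.new builds the model and exposes the resulting binary via exe_file, which is copied into stan_model/ so the shipped gem never needs a Stan toolchain at install time (hence the no-op Makefile above). A minimal sketch of the same workflow, with a placeholder model borrowed from the cmdstan-ruby README (the file name and data are assumptions, not part of this package):

  require "cmdstan"

  model = CmdStan::Model.new(stan_file: "bernoulli.stan") # compilation happens here
  fit = model.sample(data: {"N" => 2, "y" => [0, 1]}, chains: 4)
  fit.summary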
@@ -0,0 +1 @@
+ require "prophet"
@@ -0,0 +1,23 @@
+ # dependencies
+ require "cmdstan"
+ require "daru"
+ require "numo/narray"
+
+ # stdlib
+ require "logger"
+ require "set"
+
+ # modules
+ require "prophet/holidays"
+ require "prophet/plot"
+ require "prophet/forecaster"
+ require "prophet/stan_backend"
+ require "prophet/version"
+
+ module Prophet
+   class Error < StandardError; end
+
+   def self.new(**kwargs)
+     Forecaster.new(**kwargs)
+   end
+ end
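Prophet.new is shorthand for Prophet::Forecaster.new, so any keyword the forecaster accepts passes straight through:

  m = Prophet.new(growth: "linear", interval_width: 0.95)
  m.is_a?(Prophet::Forecaster) # => true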
@@ -0,0 +1,986 @@
+ module Prophet
+   class Forecaster
+     include Holidays
+     include Plot
+
+     attr_reader :logger, :params, :train_holiday_names
+
+     def initialize(
+       growth: "linear",
+       changepoints: nil,
+       n_changepoints: 25,
+       changepoint_range: 0.8,
+       yearly_seasonality: "auto",
+       weekly_seasonality: "auto",
+       daily_seasonality: "auto",
+       holidays: nil,
+       seasonality_mode: "additive",
+       seasonality_prior_scale: 10.0,
+       holidays_prior_scale: 10.0,
+       changepoint_prior_scale: 0.05,
+       mcmc_samples: 0,
+       interval_width: 0.80,
+       uncertainty_samples: 1000
+     )
+       @growth = growth
+
+       @changepoints = to_datetime(changepoints)
+       if !@changepoints.nil?
+         @n_changepoints = @changepoints.size
+         @specified_changepoints = true
+       else
+         @n_changepoints = n_changepoints
+         @specified_changepoints = false
+       end
+
+       @changepoint_range = changepoint_range
+       @yearly_seasonality = yearly_seasonality
+       @weekly_seasonality = weekly_seasonality
+       @daily_seasonality = daily_seasonality
+       @holidays = holidays
+
+       @seasonality_mode = seasonality_mode
+       @seasonality_prior_scale = seasonality_prior_scale.to_f
+       @changepoint_prior_scale = changepoint_prior_scale.to_f
+       @holidays_prior_scale = holidays_prior_scale.to_f
+
+       @mcmc_samples = mcmc_samples
+       @interval_width = interval_width
+       @uncertainty_samples = uncertainty_samples
+
+       # Set during fitting or by other methods
+       @start = nil
+       @y_scale = nil
+       @logistic_floor = false
+       @t_scale = nil
+       @changepoints_t = nil
+       @seasonalities = {}
+       @extra_regressors = {}
+       @country_holidays = nil
+       @stan_fit = nil
+       @params = {}
+       @history = nil
+       @history_dates = nil
+       @train_component_cols = nil
+       @component_modes = nil
+       @train_holiday_names = nil
+       @fit_kwargs = {}
+       validate_inputs
+
+       @logger = ::Logger.new($stderr)
+       @logger.formatter = proc do |severity, datetime, progname, msg|
+         "[prophet] #{msg}\n"
+       end
+       @stan_backend = StanBackend.new(@logger)
+     end
+
+     def validate_inputs
+       if !["linear", "logistic"].include?(@growth)
+         raise ArgumentError, "Parameter \"growth\" should be \"linear\" or \"logistic\"."
+       end
+       if @changepoint_range < 0 || @changepoint_range > 1
+         raise ArgumentError, "Parameter \"changepoint_range\" must be in [0, 1]"
+       end
+       if @holidays
+         if !(@holidays.is_a?(Daru::DataFrame) && @holidays.vectors.include?("ds") && @holidays.vectors.include?("holiday"))
+           raise ArgumentError, "holidays must be a DataFrame with \"ds\" and \"holiday\" columns."
+         end
+         @holidays["ds"] = to_datetime(@holidays["ds"])
+         has_lower = @holidays.vectors.include?("lower_window")
+         has_upper = @holidays.vectors.include?("upper_window")
+         if has_lower ^ has_upper # xor
+           raise ArgumentError, "Holidays must have both lower_window and upper_window, or neither"
+         end
+         if has_lower
+           if @holidays["lower_window"].max > 0
+             raise ArgumentError, "Holiday lower_window should be <= 0"
+           end
+           if @holidays["upper_window"].min < 0
+             raise ArgumentError, "Holiday upper_window should be >= 0"
+           end
+         end
+         @holidays["holiday"].uniq.each do |h|
+           validate_column_name(h, check_holidays: false)
+         end
+       end
+
+       if !["additive", "multiplicative"].include?(@seasonality_mode)
+         raise ArgumentError, "seasonality_mode must be \"additive\" or \"multiplicative\""
+       end
+     end
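Per these checks, a holidays frame needs "ds" and "holiday" columns, and the window columns must come as a pair with lower_window <= 0 <= upper_window. A sketch of a frame that validates (names and dates invented):

  playoffs = Daru::DataFrame.new(
    "holiday" => ["playoff", "playoff"],
    "ds" => ["2014-01-12", "2015-01-11"],
    "lower_window" => [0, 0],
    "upper_window" => [1, 1]
  )
  m = Prophet.new(holidays: playoffs)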
+
+     def validate_column_name(name, check_holidays: true, check_seasonalities: true, check_regressors: true)
+       if name.include?("_delim_")
+         raise ArgumentError, "Name cannot contain \"_delim_\""
+       end
+       reserved_names = [
+         "trend", "additive_terms", "daily", "weekly", "yearly",
+         "holidays", "zeros", "extra_regressors_additive", "yhat",
+         "extra_regressors_multiplicative", "multiplicative_terms",
+       ]
+       rn_l = reserved_names.map { |n| n + "_lower" }
+       rn_u = reserved_names.map { |n| n + "_upper" }
+       reserved_names.concat(rn_l)
+       reserved_names.concat(rn_u)
+       reserved_names.concat(["ds", "y", "cap", "floor", "y_scaled", "cap_scaled"])
+       if reserved_names.include?(name)
+         raise ArgumentError, "Name #{name.inspect} is reserved."
+       end
+       if check_holidays && @holidays && @holidays["holiday"].uniq.include?(name)
+         raise ArgumentError, "Name #{name.inspect} already used for a holiday."
+       end
+       if check_holidays && @country_holidays && get_holiday_names(@country_holidays).include?(name)
+         raise ArgumentError, "Name #{name.inspect} is a holiday name in #{@country_holidays.inspect}."
+       end
+       if check_seasonalities && @seasonalities[name]
+         raise ArgumentError, "Name #{name.inspect} already used for a seasonality."
+       end
+       if check_regressors && @extra_regressors[name]
+         raise ArgumentError, "Name #{name.inspect} already used for an added regressor."
+       end
+     end
+
+     def setup_dataframe(df, initialize_scales: false)
+       if df.vectors.include?("y")
+         df["y"] = df["y"].map(&:to_f)
+         raise ArgumentError, "Found infinity in column y." unless df["y"].all?(&:finite?)
+       end
+       # TODO support integers
+
+       df["ds"] = to_datetime(df["ds"])
+
+       raise ArgumentError, "Found NaN in column ds." if df["ds"].any?(&:nil?)
+
+       @extra_regressors.each_key do |name|
+         if !df.vectors.include?(name)
+           raise ArgumentError, "Regressor #{name.inspect} missing from dataframe"
+         end
+         df[name] = df[name].map(&:to_f)
+         if df[name].any?(&:nil?)
+           raise ArgumentError, "Found NaN in column #{name.inspect}"
+         end
+       end
+       @seasonalities.values.each do |props|
+         condition_name = props[:condition_name]
+         if condition_name
+           if !df.vectors.include?(condition_name)
+             raise ArgumentError, "Condition #{condition_name.inspect} missing from dataframe"
+           end
+           if df.where(!df[condition_name].in([true, false])).any?
+             raise ArgumentError, "Found non-boolean in column #{condition_name.inspect}"
+           end
+         end
+       end
+
+       if df.index.name == "ds"
+         df.index.name = nil
+       end
+       df = df.sort(["ds"])
+
+       initialize_scales(initialize_scales, df)
+
+       if @logistic_floor
+         unless df.vectors.include?("floor")
+           raise ArgumentError, "Expected column \"floor\"."
+         end
+       else
+         df["floor"] = 0
+       end
+
+       if @growth == "logistic"
+         unless df.vectors.include?("cap")
+           raise ArgumentError, "Capacities must be supplied for logistic growth in column \"cap\""
+         end
+         if df.where(df["cap"] <= df["floor"]).size > 0
+           raise ArgumentError, "cap must be greater than floor (which defaults to 0)."
+         end
+         df["cap_scaled"] = (df["cap"] - df["floor"]) / @y_scale
+       end
+
+       df["t"] = (df["ds"] - @start) / @t_scale.to_f
+       if df.vectors.include?("y")
+         df["y_scaled"] = (df["y"] - df["floor"]) / @y_scale
+       end
+
+       @extra_regressors.each do |name, props|
+         df[name] = ((df[name] - props[:mu]) / props[:std])
+       end
+
+       df
+     end
+
+     def initialize_scales(initialize_scales, df)
+       return unless initialize_scales
+
+       floor = 0
+       @y_scale = (df["y"] - floor).abs.max
+       @y_scale = 1 if @y_scale == 0
+       @start = df["ds"].min
+       @t_scale = df["ds"].max - @start
+     end
+
+     def set_changepoints
+       hist_size = (@history.shape[0] * @changepoint_range).floor
+
+       if @n_changepoints + 1 > hist_size
+         @n_changepoints = hist_size - 1
+         logger.info "n_changepoints greater than number of observations. Using #{@n_changepoints}"
+       end
+
+       if @n_changepoints > 0
+         step = (hist_size - 1) / @n_changepoints.to_f
+         cp_indexes = (@n_changepoints + 1).times.map { |i| (i * step).round }
+         @changepoints = @history["ds"][*cp_indexes][1..-1]
+       else
+         @changepoints = []
+       end
+
+       if @changepoints.size > 0
+         @changepoints_t = Numo::NArray.asarray(((@changepoints - @start) / @t_scale.to_f).to_a).sort
+       else
+         @changepoints_t = Numo::NArray.asarray([0])
+       end
+     end
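set_changepoints places candidates uniformly over the first changepoint_range fraction of the history and drops index 0. Checking the index arithmetic with illustrative numbers:

  hist_size = (100 * 0.8).floor            # 100 rows, default range => 80
  step = (hist_size - 1) / 25.0            # default n_changepoints
  cp_indexes = 26.times.map { |i| (i * step).round }
  cp_indexes.first(4) # => [0, 3, 6, 9]; the leading 0 is dropped by [1..-1]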
+
+     def fourier_series(dates, period, series_order)
+       start = Time.utc(1970).to_i
+       # uses to_datetime first so we get UTC
+       t = Numo::DFloat.asarray(dates.map { |v| v.to_i - start }) / (3600 * 24.0)
+
+       # no need for column_stack
+       series_order.times.flat_map do |i|
+         [Numo::DFloat::Math.method(:sin), Numo::DFloat::Math.method(:cos)].map do |fun|
+           fun.call(2.0 * (i + 1) * Math::PI * t / period)
+         end
+       end
+     end
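For the default weekly seasonality (period 7, fourier order 3) this yields six vectors: sin and cos of 2*PI*k*t/7 for k = 1..3, with t measured in days since the Unix epoch. The same expansion, standalone:

  require "numo/narray"

  t = Numo::DFloat[0, 1, 2, 3] # days, illustrative
  features = 3.times.flat_map do |i|
    x = 2.0 * (i + 1) * Math::PI * t / 7.0
    [Numo::NMath.sin(x), Numo::NMath.cos(x)]
  end
  features.size # => 6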
+
+     def make_seasonality_features(dates, period, series_order, prefix)
+       features = fourier_series(dates, period, series_order)
+       Daru::DataFrame.new(features.map.with_index { |v, i| ["#{prefix}_delim_#{i + 1}", v] }.to_h)
+     end
+
+     def construct_holiday_dataframe(dates)
+       all_holidays = Daru::DataFrame.new
+       if @holidays
+         all_holidays = @holidays.dup
+       end
+       if @country_holidays
+         year_list = dates.map(&:year)
+         country_holidays_df = make_holidays_df(year_list, @country_holidays)
+         all_holidays = all_holidays.concat(country_holidays_df)
+       end
+       # Drop future holidays not previously seen in training data
+       if @train_holiday_names
+         # Remove holiday names that didn't show up in fit
+         all_holidays = all_holidays.where(all_holidays["holiday"].in(@train_holiday_names))
+
+         # Add holiday names in fit but not in predict with ds as NA
+         holidays_to_add = Daru::DataFrame.new(
+           "holiday" => @train_holiday_names.where(!@train_holiday_names.in(all_holidays["holiday"]))
+         )
+         all_holidays = all_holidays.concat(holidays_to_add)
+       end
+
+       all_holidays
+     end
+
+     def make_holiday_features(dates, holidays)
+       expanded_holidays = Hash.new { |hash, key| hash[key] = Numo::DFloat.zeros(dates.size) }
+       prior_scales = {}
+       # Makes an index so we can perform `get_loc` below.
+       # Strip to just dates.
+       row_index = dates.map(&:to_date)
+
+       holidays.each_row do |row|
+         dt = row["ds"]
+         lw = nil
+         uw = nil
+         begin
+           lw = row["lower_window"].to_i
+           uw = row["upper_window"].to_i
+         rescue IndexError
+           lw = 0
+           uw = 0
+         end
+         ps = @holidays_prior_scale
+         if prior_scales[row["holiday"]] && prior_scales[row["holiday"]] != ps
+           raise ArgumentError, "Holiday #{row["holiday"].inspect} does not have consistent prior scale specification."
+         end
+         raise ArgumentError, "Prior scale must be > 0" if ps <= 0
+         prior_scales[row["holiday"]] = ps
+
+         lw.upto(uw).each do |offset|
+           # compare by calendar date, matching row_index above
+           occurrence = dt ? dt.to_date + offset : nil
+           loc = occurrence ? row_index.index(occurrence) : nil
+           key = "#{row["holiday"]}_delim_#{offset >= 0 ? "+" : "-"}#{offset.abs}"
+           if loc
+             expanded_holidays[key][loc] = 1.0
+           else
+             expanded_holidays[key] # Access key to generate value
+           end
+         end
+       end
+       holiday_features = Daru::DataFrame.new(expanded_holidays)
+       # Make sure column order is consistent
+       holiday_features = holiday_features[*holiday_features.vectors.sort]
+       prior_scale_list = holiday_features.vectors.map { |h| prior_scales[h.split("_delim_")[0]] }
+       holiday_names = prior_scales.keys
+       # Store holiday names used in fit
+       if !@train_holiday_names
+         @train_holiday_names = Daru::Vector.new(holiday_names)
+       end
+       [holiday_features, prior_scale_list, holiday_names]
+     end
+
+     def add_regressor(name, prior_scale: nil, standardize: "auto", mode: nil)
+       raise Error, "Regressors must be added prior to model fitting." if @history
+       validate_column_name(name, check_regressors: false)
+       prior_scale ||= @holidays_prior_scale.to_f
+       mode ||= @seasonality_mode
+       raise ArgumentError, "Prior scale must be > 0" if prior_scale <= 0
+       if !["additive", "multiplicative"].include?(mode)
+         raise ArgumentError, "mode must be \"additive\" or \"multiplicative\""
+       end
+       @extra_regressors[name] = {
+         prior_scale: prior_scale,
+         standardize: standardize,
+         mu: 0.0,
+         std: 1.0,
+         mode: mode
+       }
+       self
+     end
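add_regressor only records the column name, prior, and mode; the column itself must then be present in the frame given to fit and in any future frame given to predict. A sketch (the regressor name is invented):

  m = Prophet.new
  m.add_regressor("temperature", mode: "additive")
  # df passed to fit (and later predict) must now carry a "temperature" column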
+
+     def add_seasonality(name:, period:, fourier_order:, prior_scale: nil, mode: nil, condition_name: nil)
+       raise Error, "Seasonality must be added prior to model fitting." if @history
+
+       if !["daily", "weekly", "yearly"].include?(name)
+         # Allow overwriting built-in seasonalities
+         validate_column_name(name, check_seasonalities: false)
+       end
+       if prior_scale.nil?
+         ps = @seasonality_prior_scale
+       else
+         ps = prior_scale.to_f
+       end
+       raise ArgumentError, "Prior scale must be > 0" if ps <= 0
+       raise ArgumentError, "Fourier Order must be > 0" if fourier_order <= 0
+       mode ||= @seasonality_mode
+       if !["additive", "multiplicative"].include?(mode)
+         raise ArgumentError, "mode must be \"additive\" or \"multiplicative\""
+       end
+       validate_column_name(condition_name) if condition_name
+       @seasonalities[name] = {
+         period: period,
+         fourier_order: fourier_order,
+         prior_scale: ps,
+         mode: mode,
+         condition_name: condition_name
+       }
+       self
+     end
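Custom cycles go through add_seasonality with the period in days; a sketch mirroring the monthly example from the upstream Prophet docs:

  m = Prophet.new(weekly_seasonality: false)
  m.add_seasonality(name: "monthly", period: 30.5, fourier_order: 5)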
+
+     def add_country_holidays(country_name)
+       raise Error, "Country holidays must be added prior to model fitting." if @history
+       # Validate names.
+       get_holiday_names(country_name).each do |name|
+         # Allow merging with existing holidays
+         validate_column_name(name, check_holidays: false)
+       end
+       # Set the holidays.
+       if @country_holidays
+         logger.warn "Changing country holidays from #{@country_holidays.inspect} to #{country_name.inspect}."
+       end
+       @country_holidays = country_name
+       self
+     end
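The country names come from the Holidays module mixed in above; assuming it follows upstream Prophet's country naming, usage looks like:

  m = Prophet.new
  m.add_country_holidays("US")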
+
+     def make_all_seasonality_features(df)
+       seasonal_features = []
+       prior_scales = []
+       modes = {"additive" => [], "multiplicative" => []}
+
+       # Seasonality features
+       @seasonalities.each do |name, props|
+         features = make_seasonality_features(
+           df["ds"],
+           props[:period],
+           props[:fourier_order],
+           name
+         )
+         if props[:condition_name]
+           features[!df.where(props[:condition_name])] = 0
+         end
+         seasonal_features << features
+         prior_scales.concat([props[:prior_scale]] * features.shape[1])
+         modes[props[:mode]] << name
+       end
+
+       # Holiday features
+       holidays = construct_holiday_dataframe(df["ds"])
+       if holidays.size > 0
+         features, holiday_priors, holiday_names = make_holiday_features(df["ds"], holidays)
+         seasonal_features << features
+         prior_scales.concat(holiday_priors)
+         modes[@seasonality_mode].concat(holiday_names)
+       end
+
+       # Additional regressors
+       @extra_regressors.each do |name, props|
+         seasonal_features << df[name].to_df
+         prior_scales << props[:prior_scale]
+         modes[props[:mode]] << name
+       end
+
+       # Dummy to prevent empty X
+       if seasonal_features.size == 0
+         seasonal_features << Daru::DataFrame.new("zeros" => [0] * df.shape[0])
+         prior_scales << 1.0
+       end
+
+       seasonal_features = df_concat_axis_one(seasonal_features)
+
+       component_cols, modes = regressor_column_matrix(seasonal_features, modes)
+
+       [seasonal_features, prior_scales, component_cols, modes]
+     end
+
+     def regressor_column_matrix(seasonal_features, modes)
+       components = Daru::DataFrame.new(
+         "col" => seasonal_features.shape[1].times.to_a,
+         "component" => seasonal_features.vectors.map { |x| x.split("_delim_")[0] }
+       )
+
+       # Add total for holidays
+       if @train_holiday_names
+         components = add_group_component(components, "holidays", @train_holiday_names.uniq)
+       end
+       # Add totals for additive and multiplicative components, and regressors
+       ["additive", "multiplicative"].each do |mode|
+         components = add_group_component(components, mode + "_terms", modes[mode])
+         regressors_by_mode = @extra_regressors.select { |r, props| props[:mode] == mode }
+           .map { |r, props| r }
+         components = add_group_component(components, "extra_regressors_" + mode, regressors_by_mode)
+
+         # Add combination components to modes
+         modes[mode] << mode + "_terms"
+         modes[mode] << "extra_regressors_" + mode
+       end
+       # After all of the additive/multiplicative groups have been added,
+       # the holidays group belongs to the model's seasonality mode
+       modes[@seasonality_mode] << "holidays"
+       # Convert to a binary matrix
+       component_cols = Daru::DataFrame.crosstab_by_assignation(
+         components["col"], components["component"], [1] * components.size
+       )
+       component_cols.each_vector do |v|
+         v.map! { |vi| vi.nil? ? 0 : vi }
+       end
+       component_cols.rename_vectors(:_id => "col")
+
+       # Add columns for additive and multiplicative terms, if missing
+       ["additive_terms", "multiplicative_terms"].each do |name|
+         component_cols[name] = 0 unless component_cols.vectors.include?(name)
+       end
+
+       # TODO validation
+
+       [component_cols, modes]
+     end
+
+     def add_group_component(components, name, group)
+       new_comp = components.where(components["component"].in(group)).dup
+       group_cols = new_comp["col"].uniq
+       if group_cols.size > 0
+         new_comp = Daru::DataFrame.new("col" => group_cols, "component" => [name] * group_cols.size)
+         components = components.concat(new_comp)
+       end
+       components
+     end
+
+     def parse_seasonality_args(name, arg, auto_disable, default_order)
+       case arg
+       when "auto"
+         fourier_order = 0
+         if @seasonalities.include?(name)
+           logger.info "Found custom seasonality named #{name.inspect}, disabling built-in #{name.inspect} seasonality."
+         elsif auto_disable
+           logger.info "Disabling #{name} seasonality. Run prophet with #{name}_seasonality: true to override this."
+         else
+           fourier_order = default_order
+         end
+       when true
+         fourier_order = default_order
+       when false
+         fourier_order = 0
+       else
+         fourier_order = arg.to_i
+       end
+       fourier_order
+     end
+
+     def set_auto_seasonalities
+       first = @history["ds"].min
+       last = @history["ds"].max
+       dt = @history["ds"].diff
+       min_dt = dt.min
+
+       days = 86400
+
+       # Yearly seasonality
+       yearly_disable = last - first < 370 * days
+       fourier_order = parse_seasonality_args("yearly", @yearly_seasonality, yearly_disable, 10)
+       if fourier_order > 0
+         @seasonalities["yearly"] = {
+           period: 365.25,
+           fourier_order: fourier_order,
+           prior_scale: @seasonality_prior_scale,
+           mode: @seasonality_mode,
+           condition_name: nil
+         }
+       end
+
+       # Weekly seasonality
+       weekly_disable = last - first < 14 * days || min_dt >= 7 * days
+       fourier_order = parse_seasonality_args("weekly", @weekly_seasonality, weekly_disable, 3)
+       if fourier_order > 0
+         @seasonalities["weekly"] = {
+           period: 7,
+           fourier_order: fourier_order,
+           prior_scale: @seasonality_prior_scale,
+           mode: @seasonality_mode,
+           condition_name: nil
+         }
+       end
+
+       # Daily seasonality
+       daily_disable = last - first < 2 * days || min_dt >= 1 * days
+       fourier_order = parse_seasonality_args("daily", @daily_seasonality, daily_disable, 4)
+       if fourier_order > 0
+         @seasonalities["daily"] = {
+           period: 1,
+           fourier_order: fourier_order,
+           prior_scale: @seasonality_prior_scale,
+           mode: @seasonality_mode,
+           condition_name: nil
+         }
+       end
+     end
+
+     def linear_growth_init(df)
+       i0 = df["ds"].index.min
+       i1 = df["ds"].index.max
+       t = df["t"][i1] - df["t"][i0]
+       k = (df["y_scaled"][i1] - df["y_scaled"][i0]) / t
+       m = df["y_scaled"][i0] - k * df["t"][i0]
+       [k, m]
+     end
+
+     def logistic_growth_init(df)
+       i0 = df["ds"].index.min
+       i1 = df["ds"].index.max
+       t = df["t"][i1] - df["t"][i0]
+
+       # Force valid values, in case y > cap or y < 0
+       c0 = df["cap_scaled"][i0]
+       c1 = df["cap_scaled"][i1]
+       y0 = [0.01 * c0, [0.99 * c0, df["y_scaled"][i0]].min].max
+       y1 = [0.01 * c1, [0.99 * c1, df["y_scaled"][i1]].min].max
+
+       r0 = c0 / y0
+       r1 = c1 / y1
+
+       if (r0 - r1).abs <= 0.01
+         r0 = 1.05 * r0
+       end
+
+       l0 = Math.log(r0 - 1)
+       l1 = Math.log(r1 - 1)
+
+       # Initialize the offset
+       m = l0 * t / (l0 - l1)
+       # And the rate
+       k = (l0 - l1) / t
+       [k, m]
+     end
+
+     def fit(df, **kwargs)
+       raise Error, "Prophet object can only be fit once" if @history
+
+       history = df.where(!df["y"].in([nil, Float::NAN]))
+       raise Error, "Data has fewer than 2 non-nil rows" if history.shape[0] < 2
+
+       @history_dates = to_datetime(df["ds"]).sort
+       history = setup_dataframe(history, initialize_scales: true)
+       @history = history
+       set_auto_seasonalities
+       seasonal_features, prior_scales, component_cols, modes = make_all_seasonality_features(history)
+       @train_component_cols = component_cols
+       @component_modes = modes
+       @fit_kwargs = kwargs.dup # TODO deep dup?
+
+       set_changepoints
+
+       dat = {
+         "T" => history.shape[0],
+         "K" => seasonal_features.shape[1],
+         "S" => @changepoints_t.size,
+         "y" => history["y_scaled"],
+         "t" => history["t"],
+         "t_change" => @changepoints_t,
+         "X" => seasonal_features,
+         "sigmas" => prior_scales,
+         "tau" => @changepoint_prior_scale,
+         "trend_indicator" => @growth == "logistic" ? 1 : 0,
+         "s_a" => component_cols["additive_terms"],
+         "s_m" => component_cols["multiplicative_terms"]
+       }
+
+       if @growth == "linear"
+         dat["cap"] = Numo::DFloat.zeros(@history.shape[0])
+         kinit = linear_growth_init(history)
+       else
+         dat["cap"] = history["cap_scaled"]
+         kinit = logistic_growth_init(history)
+       end
+
+       stan_init = {
+         "k" => kinit[0],
+         "m" => kinit[1],
+         "delta" => Numo::DFloat.zeros(@changepoints_t.size),
+         "beta" => Numo::DFloat.zeros(seasonal_features.shape[1]),
+         "sigma_obs" => 1
+       }
+
+       if history["y"].min == history["y"].max && @growth == "linear"
+         # Nothing to fit.
+         @params = stan_init
+         @params["sigma_obs"] = 1e-9
+         @params.each_key do |par|
+           @params[par] = Numo::NArray.asarray(@params[par])
+         end
+       elsif @mcmc_samples > 0
+         @params = @stan_backend.sampling(stan_init, dat, @mcmc_samples, **kwargs)
+       else
+         @params = @stan_backend.fit(stan_init, dat, **kwargs)
+       end
+
+       # If no changepoints were requested, replace delta with 0s
+       if @changepoints.size == 0
+         # Fold delta into the base rate k
+         @params["k"] = @params["k"] + @params["delta"].reshape(-1)
+         @params["delta"] = Numo::DFloat.zeros(@params["delta"].shape).reshape(-1, 1)
+       end
+
+       self
+     end
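End to end, fit wants a Daru frame with ds and y columns, and forwards extra keyword arguments to the Stan backend. A sketch (the CSV path is a placeholder):

  df = Daru::DataFrame.from_csv("example_wp_log_peyton_manning.csv")
  m = Prophet.new
  m.fit(df)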
+
+     def predict(df = nil)
+       raise Error, "Model has not been fit." unless @history
+
+       if df.nil?
+         df = @history.dup
+       else
+         raise ArgumentError, "Dataframe has no rows." if df.shape[0] == 0
+         df = setup_dataframe(df.dup)
+       end
+
+       df["trend"] = predict_trend(df)
+       seasonal_components = predict_seasonal_components(df)
+       if @uncertainty_samples > 0
+         intervals = predict_uncertainty(df)
+       else
+         intervals = nil
+       end
+
+       # Drop columns except ds, cap, floor, and trend
+       cols = ["ds", "trend"]
+       cols << "cap" if df.vectors.include?("cap")
+       cols << "floor" if @logistic_floor
+       # Add in forecast components
+       df2 = df_concat_axis_one([df[*cols], intervals, seasonal_components].compact)
+       df2["yhat"] = df2["trend"] * (df2["multiplicative_terms"] + 1) + df2["additive_terms"]
+       df2
+     end
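Continuing the sketch above: make_future_dataframe (defined further down) extends the history, and predict fills in yhat plus interval and component columns:

  future = m.make_future_dataframe(periods: 365)
  forecast = m.predict(future)
  forecast["yhat"].to_a.last(5)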
+
+     def piecewise_linear(t, deltas, k, m, changepoint_ts)
+       # Intercept changes
+       gammas = -changepoint_ts * deltas
+       # Get cumulative slope and intercept at each t
+       k_t = t.new_ones * k
+       m_t = t.new_ones * m
+       changepoint_ts.each_with_index do |t_s, s|
+         indx = t >= t_s
+         k_t[indx] += deltas[s]
+         m_t[indx] += gammas[s]
+       end
+       k_t * t + m_t
+     end
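The gammas keep the trend continuous: at a changepoint t_s the slope gains deltas[s], so the intercept must drop by t_s * deltas[s]. A standalone check with one changepoint (numbers illustrative):

  require "numo/narray"

  t = Numo::DFloat[0.0, 0.25, 0.5, 0.75, 1.0]
  k_t = t.new_ones * 1.0        # base slope 1, intercept 0
  m_t = t.new_ones * 0.0
  indx = t >= 0.5               # one changepoint at t_s = 0.5, delta = 2
  k_t[indx] += 2.0
  m_t[indx] += -0.5 * 2.0       # gamma = -t_s * delta
  (k_t * t + m_t).to_a # => [0.0, 0.25, 0.5, 1.25, 2.0], no jump at 0.5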
+
+     def piecewise_logistic(t, cap, deltas, k, m, changepoint_ts)
+       k_1d = Numo::NArray.asarray(k)
+       k_1d = k_1d.reshape(1) if k_1d.ndim < 1
+       k_cum = k_1d.concatenate(deltas.cumsum + k)
+       gammas = Numo::DFloat.zeros(changepoint_ts.size)
+       changepoint_ts.each_with_index do |t_s, i|
+         gammas[i] = (t_s - m - gammas.sum) * (1 - k_cum[i] / k_cum[i + 1])
+       end
+       # Get cumulative rate and offset at each t
+       k_t = t.new_ones * k
+       m_t = t.new_ones * m
+       changepoint_ts.each_with_index do |t_s, s|
+         indx = t >= t_s
+         k_t[indx] += deltas[s]
+         m_t[indx] += gammas[s]
+       end
+       # need df_values to prevent memory from blowing up
+       df_values(cap) / (1 + Numo::NMath.exp(-k_t * (t - m_t)))
+     end
+
+     def predict_trend(df)
+       k = @params["k"].mean(nan: true)
+       m = @params["m"].mean(nan: true)
+       deltas = @params["delta"].mean(axis: 0, nan: true)
+
+       t = Numo::NArray.asarray(df["t"].to_a)
+       if @growth == "linear"
+         trend = piecewise_linear(t, deltas, k, m, @changepoints_t)
+       else
+         cap = df["cap_scaled"]
+         trend = piecewise_logistic(t, cap, deltas, k, m, @changepoints_t)
+       end
+
+       trend * @y_scale + Numo::NArray.asarray(df["floor"].to_a)
+     end
+
+     def predict_seasonal_components(df)
+       seasonal_features, _, component_cols, _ = make_all_seasonality_features(df)
+       if @uncertainty_samples > 0
+         lower_p = 100 * (1.0 - @interval_width) / 2
+         upper_p = 100 * (1.0 + @interval_width) / 2
+       end
+
+       x = df_values(seasonal_features)
+       data = {}
+       component_cols.vectors.each do |component|
+         beta_c = @params["beta"] * Numo::NArray.asarray(component_cols[component].to_a)
+
+         comp = x.dot(beta_c.transpose)
+         if @component_modes["additive"].include?(component)
+           comp *= @y_scale
+         end
+         data[component] = comp.mean(axis: 1, nan: true)
+         if @uncertainty_samples > 0
+           data[component + "_lower"] = percentile(comp, lower_p, axis: 1)
+           data[component + "_upper"] = percentile(comp, upper_p, axis: 1)
+         end
+       end
+       Daru::DataFrame.new(data)
+     end
+
+     def sample_posterior_predictive(df)
+       n_iterations = @params["k"].shape[0]
+       samp_per_iter = [1, (@uncertainty_samples / n_iterations.to_f).ceil].max
+
+       # Generate seasonality features once so we can re-use them.
+       seasonal_features, _, component_cols, _ = make_all_seasonality_features(df)
+
+       # convert to Numo for performance
+       seasonal_features = df_values(seasonal_features)
+       additive_terms = df_values(component_cols["additive_terms"])
+       multiplicative_terms = df_values(component_cols["multiplicative_terms"])
+
+       sim_values = {"yhat" => [], "trend" => []}
+       n_iterations.times do |i|
+         samp_per_iter.times do
+           sim = sample_model(
+             df,
+             seasonal_features,
+             i,
+             additive_terms,
+             multiplicative_terms
+           )
+           sim_values.each_key do |key|
+             sim_values[key] << sim[key]
+           end
+         end
+       end
+       sim_values.each do |k, v|
+         sim_values[k] = Numo::NArray.column_stack(v)
+       end
+       sim_values
+     end
+
+     def predictive_samples(df)
+       df = setup_dataframe(df.dup)
+       sim_values = sample_posterior_predictive(df)
+       sim_values
+     end
+
+     def predict_uncertainty(df)
+       sim_values = sample_posterior_predictive(df)
+
+       lower_p = 100 * (1.0 - @interval_width) / 2
+       upper_p = 100 * (1.0 + @interval_width) / 2
+
+       series = {}
+       ["yhat", "trend"].each do |key|
+         series["#{key}_lower"] = percentile(sim_values[key], lower_p, axis: 1)
+         series["#{key}_upper"] = percentile(sim_values[key], upper_p, axis: 1)
+       end
+
+       Daru::DataFrame.new(series)
+     end
+
+     def sample_model(df, seasonal_features, iteration, s_a, s_m)
+       trend = sample_predictive_trend(df, iteration)
+
+       beta = @params["beta"][iteration, true]
+       xb_a = seasonal_features.dot(beta * s_a) * @y_scale
+       xb_m = seasonal_features.dot(beta * s_m)
+
+       sigma = @params["sigma_obs"][iteration]
+       noise = Numo::DFloat.new(df.shape[0]).rand_norm(0, sigma) * @y_scale
+
+       # skip data frame for performance
+       {
+         "yhat" => trend * (1 + xb_m) + xb_a + noise,
+         "trend" => trend
+       }
+     end
+
+     def sample_predictive_trend(df, iteration)
+       k = @params["k"][iteration, true]
+       m = @params["m"][iteration, true]
+       deltas = @params["delta"][iteration, true]
+
+       t = Numo::NArray.asarray(df["t"].to_a)
+       upper_t = t.max
+
+       # New changepoints from a Poisson process with rate S on [1, T]
+       if upper_t > 1
+         s = @changepoints_t.size
+         n_changes = poisson(s * (upper_t - 1))
+       else
+         n_changes = 0
+       end
+       if n_changes > 0
+         changepoint_ts_new = 1 + Numo::DFloat.new(n_changes).rand * (upper_t - 1)
+         changepoint_ts_new = changepoint_ts_new.sort
+       else
+         changepoint_ts_new = []
+       end
+
+       # Get the empirical scale of the deltas, plus epsilon to avoid NaNs.
+       lambda_ = deltas.abs.mean + 1e-8
+
+       # Sample deltas
+       deltas_new = laplace(0, lambda_, n_changes)
+
+       # Prepend the times and deltas from the history
+       changepoint_ts = @changepoints_t.concatenate(changepoint_ts_new)
+       deltas = deltas.concatenate(deltas_new)
+
+       if @growth == "linear"
+         trend = piecewise_linear(t, deltas, k, m, changepoint_ts)
+       else
+         cap = df["cap_scaled"]
+         trend = piecewise_logistic(t, cap, deltas, k, m, changepoint_ts)
+       end
+
+       trend * @y_scale + Numo::NArray.asarray(df["floor"].to_a)
+     end
+
+     def percentile(a, percentile, axis:)
+       raise Error, "Axis must be 1" if axis != 1
+
+       sorted = a.sort(axis: axis)
+       x = percentile / 100.0 * (sorted.shape[axis] - 1)
+       r = x % 1
+       i = x.floor
+       # this should use axis, but we only need axis: 1
+       if i == sorted.shape[axis] - 1
+         sorted[true, -1]
+       else
+         sorted[true, i] + r * (sorted[true, i + 1] - sorted[true, i])
+       end
+     end
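This is the same linear interpolation between order statistics that numpy.percentile performs by default: for a row [1, 2, 3, 4], the 10th percentile sits at position 0.1 * 3 = 0.3, so the result is 1 + 0.3 * (2 - 1) = 1.3.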
+
+     def make_future_dataframe(periods:, freq: "D", include_history: true)
+       raise Error, "Model has not been fit" unless @history_dates
+       last_date = @history_dates.max
+       case freq
+       when "D"
+         # days have constant length with UTC (no DST or leap seconds)
+         dates = (periods + 1).times.map { |i| last_date + i * 86400 }
+       when "H"
+         dates = (periods + 1).times.map { |i| last_date + i * 3600 }
+       when "MS"
+         dates = [last_date]
+         periods.times do
+           dates << dates.last.to_datetime.next_month.to_time.utc
+         end
+       else
+         raise ArgumentError, "Unknown freq: #{freq}"
+       end
+       dates.select! { |d| d > last_date }
+       dates = dates.last(periods)
+       dates = @history_dates + dates if include_history
+       Daru::DataFrame.new("ds" => dates)
+     end
+
+     private
+
+     # Time is preferred over DateTime in Ruby;
+     # use UTC to be consistent with Python
+     # and so days have equal length (no DST)
+     def to_datetime(vec)
+       return if vec.nil?
+       vec.map do |v|
+         case v
+         when Time
+           v.utc
+         when Date
+           v.to_datetime.to_time.utc
+         else
+           DateTime.parse(v.to_s).to_time.utc
+         end
+       end
+     end
+
+     # okay to do in-place
+     def df_concat_axis_one(dfs)
+       dfs[1..-1].each do |df|
+         df.each_vector_with_index do |v, k|
+           dfs[0][k] = v
+         end
+       end
+       dfs[0]
+     end
+
+     def df_values(df)
+       if df.is_a?(Daru::Vector)
+         Numo::NArray.asarray(df.to_a)
+       else
+         # TODO make more performant
+         Numo::NArray.asarray(df.to_matrix.to_a)
+       end
+     end
+
+     # https://en.wikipedia.org/wiki/Poisson_distribution#Generating_Poisson-distributed_random_variables
+     def poisson(lam)
+       l = Math.exp(-lam)
+       k = 0
+       p = 1
+       while p > l
+         k += 1
+         p *= rand
+       end
+       k - 1
+     end
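This is Knuth's method: multiply uniform draws until the running product falls below e^-lam; the number of draws minus one is Poisson(lam) distributed. A quick empirical check (illustrative):

  lam = 3.0
  samples = 10_000.times.map do
    l = Math.exp(-lam)
    k = 0
    p = 1.0
    while p > l
      k += 1
      p *= rand
    end
    k - 1
  end
  samples.sum / samples.size.to_f # ≈ 3.0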
+
+     # https://en.wikipedia.org/wiki/Laplace_distribution#Generating_values_from_the_Laplace_distribution
+     def laplace(loc, scale, size)
+       u = Numo::DFloat.new(size).rand - 0.5
+       loc - scale * u.sign * Numo::NMath.log(1 - 2 * u.abs)
+     end
+   end
+ end