pynnlf-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. pynnlf/__about__.py +1 -0
  2. pynnlf/__init__.py +5 -0
  3. pynnlf/api.py +17 -0
  4. pynnlf/discovery.py +63 -0
  5. pynnlf/engine.py +1238 -0
  6. pynnlf/hyperparams.py +38 -0
  7. pynnlf/model_utils.py +186 -0
  8. pynnlf/runner.py +108 -0
  9. pynnlf/scaffold/README_WORKSPACE.md +0 -0
  10. pynnlf/scaffold/data/README_data.md +40 -0
  11. pynnlf/scaffold/data/ds0_test.csv +4081 -0
  12. pynnlf/scaffold/models/README_models.md +61 -0
  13. pynnlf/scaffold/models/hyperparameters.yaml +264 -0
  14. pynnlf/scaffold/models/m10_rf.py +65 -0
  15. pynnlf/scaffold/models/m11_svr.py +53 -0
  16. pynnlf/scaffold/models/m12_rnn.py +152 -0
  17. pynnlf/scaffold/models/m13_lstm.py +208 -0
  18. pynnlf/scaffold/models/m14_gru.py +139 -0
  19. pynnlf/scaffold/models/m15_transformer.py +138 -0
  20. pynnlf/scaffold/models/m16_prophet.py +216 -0
  21. pynnlf/scaffold/models/m17_xgb.py +66 -0
  22. pynnlf/scaffold/models/m18_nbeats.py +107 -0
  23. pynnlf/scaffold/models/m1_naive.py +49 -0
  24. pynnlf/scaffold/models/m2_snaive.py +49 -0
  25. pynnlf/scaffold/models/m3_ets.py +133 -0
  26. pynnlf/scaffold/models/m4_arima.py +123 -0
  27. pynnlf/scaffold/models/m5_sarima.py +128 -0
  28. pynnlf/scaffold/models/m6_lr.py +76 -0
  29. pynnlf/scaffold/models/m7_ann.py +148 -0
  30. pynnlf/scaffold/models/m8_dnn.py +141 -0
  31. pynnlf/scaffold/models/m9_rt.py +74 -0
  32. pynnlf/scaffold/models/mXX_template.py +68 -0
  33. pynnlf/scaffold/specs/batch.yaml +4 -0
  34. pynnlf/scaffold/specs/experiment.yaml +4 -0
  35. pynnlf/scaffold/specs/pynnlf_config.yaml +69 -0
  36. pynnlf/scaffold/specs/testing_benchmark.csv +613 -0
  37. pynnlf/scaffold/specs/testing_benchmark_metadata.md +12 -0
  38. pynnlf/scaffold/specs/tests_ci.yaml +8 -0
  39. pynnlf/scaffold/specs/tests_full.yaml +23 -0
  40. pynnlf/tests_runner.py +211 -0
  41. pynnlf/tools/strip_notebook_artifacts.py +32 -0
  42. pynnlf/workspace.py +63 -0
  43. pynnlf/yamlio.py +28 -0
  44. pynnlf-0.2.2.dist-info/METADATA +168 -0
  45. pynnlf-0.2.2.dist-info/RECORD +47 -0
  46. pynnlf-0.2.2.dist-info/WHEEL +5 -0
  47. pynnlf-0.2.2.dist-info/top_level.txt +1 -0
pynnlf/engine.py ADDED
@@ -0,0 +1,1238 @@
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import datetime as dt
+ from datetime import datetime
+ import os
+ import time
+ import pickle  # for saving trained models
+ import dill  # for saving trained models
+ import importlib.util
+ from pathlib import Path
+ import re
+
+
+ # # FOLDER PREPARATION
+
+
+ def load_model_module(models_dir: Path, model_name: str):
+     """
+     Load a model module from the workspace models directory.
+
+     Args:
+         models_dir (Path): <workspace>/models
+         model_name (str): e.g., "m6_lr"
+
+     Returns:
+         module: Imported Python module object.
+     """
+     p = models_dir / f"{model_name}.py"
+     if not p.exists():
+         raise FileNotFoundError(f"Model file not found: {p}")
+     spec = importlib.util.spec_from_file_location(model_name, p)
+     if spec is None or spec.loader is None:
+         raise ImportError(f"Could not build an import spec for {p}")
+     mod = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(mod)
+     return mod
+
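+ # Usage sketch (hypothetical paths; "m6_lr" is one of the scaffold models,
+ # but the workspace location below is illustrative):
+ #
+ #     mod = load_model_module(Path("my_workspace/models"), "m6_lr")
+ #     mod.train_model_m6_lr   # functions defined in m6_lr.py are now attributes
+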
+ def compute_exp_no(path_result):
+     """Compute the experiment number for folder & file naming.
+
+     This version:
+     1) Detects existing experiment folders matching the pattern E00001_*
+     2) Uses the maximum existing number + 1
+     3) Starts numbering from 1
+     4) Safely ignores Archive and other unrelated folders/files
+
+     Args:
+         path_result (str): path to experiment_result folder
+
+     Returns:
+         int: experiment_no (starts at 1)
+         str: experiment_no_str (e.g., "E00001")
+     """
+     os.makedirs(path_result, exist_ok=True)
+
+     pat = re.compile(r"^E(\d{5})_")
+     nums = []
+
+     for name in os.listdir(path_result):
+         full = os.path.join(path_result, name)
+         if not os.path.isdir(full):
+             continue
+         m = pat.match(name)
+         if m:
+             nums.append(int(m.group(1)))
+
+     # Start from 1 if none exist
+     next_no = (max(nums) + 1) if nums else 1
+     next_no_str = f"E{next_no:05d}"
+     return next_no, next_no_str
+
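+ # Numbering sketch (illustrative): given experiment_result/ containing
+ # "E00001_..." and "E00003_..." (gaps are fine),
+ #
+ #     compute_exp_no("experiment_result/")   # -> (4, "E00004")
+ #
+ # Entries that do not match E\d{5}_ (e.g. an "Archive" folder) are skipped.
+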
+ def compute_folder_name(experiment_no_str, dataset_file, forecast_horizon, model_name, hyperparameter_no):
+     """
+     Build the folder name in the format
+     [exp number]_[exp date]_[dataset]_[forecast horizon]_[model]_[hyperparameter]
+
+     Args:
+         experiment_no_str (str): experiment number, e.g., "E00001"
+         dataset_file (str): dataset filename (e.g., "ds0_test.csv")
+         forecast_horizon (int): forecast horizon in minutes
+         model_name (str): for example, m6_lr
+         hyperparameter_no (str): for example, hp1
+
+     Returns:
+         str: folder name
+     """
+     folder_name = (
+         experiment_no_str + '_' +
+         datetime.today().date().strftime("%y%m%d") + '_' +
+         dataset_file.split('_')[0] + '_' +
+         'fh' + str(forecast_horizon) + '_' +
+         model_name + '_' +
+         hyperparameter_no
+     )
+     return folder_name
+
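+ # Example (the date component is whatever today is; "250101" is illustrative):
+ #
+ #     compute_folder_name("E00001", "ds0_test.csv", 30, "m6_lr", "hp1")
+ #     # -> "E00001_250101_ds0_fh30_m6_lr_hp1"
+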
+ def prepare_directory(path_result, dataset_file, forecast_horizon, model_name, hyperparameter_no, hyperparameter_dict):
+     """
+     Create experiment folders and filepaths for exports.
+
+     Args:
+         path_result (str): path to experiment_result folder
+         dataset_file (str): dataset filename
+         forecast_horizon (int): forecast horizon in minutes
+         model_name (str): model name, e.g., m6_lr
+         hyperparameter_no (str): e.g., hp1
+         hyperparameter_dict (dict): chosen hyperparameter dict
+
+     Returns:
+         dict: hyperparameter_dict
+         str: experiment_no_str
+         dict: filepath dictionary
+     """
+     hyperparameter = hyperparameter_dict
+
+     experiment_no, experiment_no_str = compute_exp_no(path_result)
+     folder_name = compute_folder_name(experiment_no_str, dataset_file, forecast_horizon, model_name, hyperparameter_no)
+
+     # CREATE FOLDER NAMES
+     cv_folder_train = experiment_no_str + '_cv_train'
+     cv_folder_test = experiment_no_str + '_cv_test'
+     cv1_plot_folder = experiment_no_str + '_cv1_plots'
+     folder_model = experiment_no_str + '_models'
+
+     path_result2 = path_result + folder_name + '/'
+     path_result_train = path_result2 + cv_folder_train + '/'
+     path_result_test = path_result2 + cv_folder_test + '/'
+     path_result_plot = path_result2 + cv1_plot_folder + '/'
+     path_model = path_result2 + folder_model + '/'
+
+     # MAKE FOLDERS
+     os.makedirs(path_result2, exist_ok=False)
+     os.mkdir(path_result_train)
+     os.mkdir(path_result_test)
+     os.mkdir(path_result_plot)
+     os.mkdir(path_model)
+
+     # MAKE FILE PATHS
+     filepath = {
+         # A. Experiment summary CSVs and CV1 plots
+         'a1': path_result2 + experiment_no_str + '_a1_experiment_result.csv',
+         'a2': path_result2 + experiment_no_str + '_a2_hyperparameter.csv',
+         'a3': path_result2 + experiment_no_str + '_a3_cross_validation_result.csv',
+         'b1': path_result_plot + experiment_no_str + '_b1_train_timeplot.png',            # Time Plot of Forecast vs Observation
+         'b2': path_result_plot + experiment_no_str + '_b2_train_scatterplot.png',         # Scatter Plot of Forecast vs Observation
+         'b3': path_result_plot + experiment_no_str + '_b3_train_residual_timeplot.png',   # Time Plot of Residual
+         'b4': path_result_plot + experiment_no_str + '_b4_train_residual_histogram.png',  # Histogram of Residual
+         'b5': path_result_plot + experiment_no_str + '_b5_train_learningcurve.png',       # Learning Curve vs Epoch
+         'c1': path_result_plot + experiment_no_str + '_c1_test_timeplot.png',             # Time Plot of Forecast vs Observation
+         'c2': path_result_plot + experiment_no_str + '_c2_test_scatterplot.png',          # Scatter Plot of Forecast vs Observation
+         'c3': path_result_plot + experiment_no_str + '_c3_test_residual_timeplot.png',    # Time Plot of Residual
+         'c4': path_result_plot + experiment_no_str + '_c4_test_residual_histogram.png',   # Histogram of Residual
+         'c5': path_result_plot + experiment_no_str + '_c5_test_learningcurve.png',        # Learning Curve vs Epoch
+
+         # B. Per-fold cross-validation time series and saved models (folds 1..10)
+         'train_cv': {cv: path_result_train + experiment_no_str + f'_cv{cv}_train_result.csv' for cv in range(1, 11)},
+         'test_cv': {cv: path_result_test + experiment_no_str + f'_cv{cv}_test_result.csv' for cv in range(1, 11)},
+         'model': {cv: path_model + experiment_no_str + f'_cv{cv}_model.pkl' for cv in range(1, 11)},
+     }
+     return hyperparameter, experiment_no_str, filepath
+
+ def export_result(filepath, df_a1_result, cross_val_result_df, hyperparameter):
+     """Export the experiment summary:
+     1. experiment result
+     2. hyperparameters
+     3. detailed cross-validation result
+
+     Args:
+         filepath (dict): dictionary of filepaths for exporting results
+         df_a1_result (df): one-row experiment summary
+         cross_val_result_df (df): per-fold metrics plus mean/stddev rows
+         hyperparameter (dict): hyperparameters used for this run
+     """
+     # Create a df of the hyperparameters being used (dict -> key/value table)
+     df_a2 = pd.DataFrame(
+         {"hyperparameter": list(hyperparameter.keys()),
+          "value": list(hyperparameter.values())}
+     )
+
+     # EXPORT IT
+     df_a1_result.to_csv(filepath['a1'], index=False)
+     df_a2.to_csv(filepath['a2'])
+     cross_val_result_df.to_csv(filepath['a3'])
+
+
+ # # DATA INPUT, CALENDAR FEATURE MAKING
+
+ # ADD NET LOAD HISTORICAL DATA
+ def add_lag_features(df, forecast_horizon, max_lag_day):
+     """
+     Add lagged columns to the dataframe based on the given horizon in minutes and max lag in days.
+
+     Args:
+         df (pd.DataFrame): input dataframe with a datetime index and a column 'y'.
+         forecast_horizon (int): forecast horizon in minutes (the shortest usable lag).
+         max_lag_day (int): number of days until the longest lag.
+
+     Returns:
+         pd.DataFrame: the dataframe with additional columns for the lags.
+     """
+
+     # Convert the horizon to a timedelta object
+     horizon_timedelta = pd.Timedelta(minutes=forecast_horizon)
+     consecutive_timedelta = df.index[1] - df.index[0]
+
+     # Calculate the number of new columns per source column
+     n_new_cols = len(df[df.index < df.index[0] + pd.DateOffset(days=max_lag_day)])
+
+     # List to hold all the new lagged columns
+     new_cols = []
+
+     # Generate lagged columns not only for net load but also for weather data if available
+     for column in df.columns:
+         for i in range(n_new_cols):
+             shift_timedelta = horizon_timedelta + i * consecutive_timedelta
+             new_col_name = f'{column}_lag_{shift_timedelta}m'
+             new_cols.append(df[column].shift(freq=shift_timedelta).rename(new_col_name))
+
+     # Concatenate the new lagged columns with the original dataframe
+     df = pd.concat([df] + new_cols, axis=1)
+
+     df.dropna(inplace=True)
+
+     return df
+
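+ # Naming sketch: shift_timedelta is a pd.Timedelta, so its repr appears in the
+ # column name. For a 30-minute dataset with forecast_horizon=30, the first lag
+ # of 'y' is named
+ #
+ #     "y_lag_0 days 00:30:00m"
+ #
+ # run_model() rebuilds exactly this string to pick out the naive-forecast column.
+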
+ def separate_holdout(df, n_block):
+     """Separate df into two parts:
+     1. df : used for training and blocked k-fold cross validation.
+        The block is a multiple of a week because net load data has weekly seasonality.
+     2. holdout_df : not used for now, but can be useful for a final test of the chosen
+        model, to show the generalized error. This is at least 1 block of data.
+
+     By default, the chosen k for k-fold cross validation is 10.
+
+     For example, if the original df has 12 weeks worth of data:
+         the new df is week 1-10,
+         holdout_df is week 11-12.
+
+     The new df will then be used for cross validation, for example
+         CV1: training: week 1-9, validation (test): week 10
+         CV2: training: week 1-8 and week 10, validation (test): week 9,
+         etc.
+
+     Args:
+         df (df): cleaned df consisting of y and all predictors
+         n_block (int): number of blocks to divide the original df. This includes the
+             block for holdout_df, so if k=10, n_block = k+1 = 11
+
+     Returns:
+         block_length (int): number of weeks per block
+         holdout_df (df): unused df, can be used later for an unbiased estimate of final model performance
+         df (df): df that will be used for the training and validation (test) sets
+     """
+
+     one_week = dt.timedelta(weeks=1)
+     dataset_length_week = (df.index[-1] - df.index[0]).total_seconds() / 86400 / 7
+     block_length = int(dataset_length_week / n_block)
+     consecutive_timedelta = df.index[1] - df.index[0]
+     n_timestep_per_week = int(one_week / consecutive_timedelta)
+     holdout_start = (n_block - 1) * block_length * n_timestep_per_week
+     holdout_df = df.iloc[holdout_start:]
+     df = df.drop(df.index[holdout_start:])
+
+     return block_length, holdout_df, df
+
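+ # Worked example (hypothetical numbers): 33 weeks of 30-minute data with
+ # n_block=11 give block_length = int(33/11) = 3 weeks and
+ # n_timestep_per_week = 10080/30 = 336. The holdout then starts at row
+ # 10 * 3 * 336 = 10080, i.e. roughly the last 3 weeks are set aside.
+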
+ def input_and_process(dataset_path, model_name, forecast_horizon, max_lag_day, n_block, hyperparameter):
+     """Read the dataset, add calendar features, and add lag features (which depend on the forecast horizon).
+
+     Args:
+         dataset_path (str): path to the chosen dataset
+         model_name (str): model name, e.g., m6_lr (used for model-specific sanity checks)
+         forecast_horizon (int): forecast horizon in minutes
+         max_lag_day (int): how much lag data will be used, in days. For example, 7 means lag data until d-7 is used.
+         n_block (int): number of blocks to divide the original df. This includes the block for holdout_df, so if k=10, n_block = k+1 = 11
+         hyperparameter (dict): hyperparameters for the model
+
+     Returns:
+         block_length (int): number of weeks per block
+         holdout_df (df): unused df, can be used later for an unbiased estimate of final model performance
+         df (df): df that will be used for the training and validation (test) sets
+     """
+     # ADD CALENDAR DATA (holiday to add)
+     # columns_to_use = ['datetime', 'netload_kW']
+     df = pd.read_csv(dataset_path, index_col=0, parse_dates=True)
+     df.rename(columns={'netload_kW': 'y'}, inplace=True)
+
+     # 1. Check that the forecast horizon is >= the dataset frequency.
+     # For example, if the dataset is daily, the forecast horizon should be at least 1 day.
+     # Compute the dataset frequency in minutes based on the datetime index.
+     dataset_freq = (df.index[1] - df.index[0]).total_seconds() / 60
+     if forecast_horizon < dataset_freq:
+         raise ValueError('Forecast horizon should be >= dataset frequency')
+     else:
+         print('Pass Test 1 - Forecast horizon is >= dataset frequency')
+
+     # 2. Check that the hyperparameter choice is possible given the forecast horizon.
+     # For example, with a forecast horizon of 2 days, we cannot use 1 day as the
+     # hyperparameter of the seasonal naive forecast.
+     if model_name == 'm2_snaive':
+         if int(hyperparameter['days'] * 24 * 60) < forecast_horizon:
+             raise ValueError('Choice of seasonal naive hyperparameter needs to be >= forecast horizon! Please change the hyperparameter.')
+     # if model_name == 'm4_sarima':
+     #     if int(hyperparameter['seasonal_period_days'] * 24 * 60) < forecast_horizon:
+     #         raise ValueError('Choice of seasonal_period_days in SARIMA hyperparameter >= forecast horizon! Please change the hyperparameter.')
+     print('Pass Test 2 - Hyperparameter choice is possible given the forecast horizon')
+
+     # ADD LAG FEATURES
+     df = add_lag_features(df, forecast_horizon, max_lag_day)
+
+     # ADD CALENDAR FEATURES
+     # 1. Numerical representation of the datetime (Excel-style)
+     numeric_datetime = pd.Series((df.index - pd.Timestamp("1970-01-01")) / pd.Timedelta(days=1), index=df.index)
+
+     # 2. Year
+     year = pd.Series(df.index.year, index=df.index)
+
+     # 3. One-hot encoding of month (is_jan, is_feb, ..., is_nov, excluding December).
+     # Reindex against all 12 months so the columns stay correctly labelled even
+     # when the dataset does not span a full year.
+     month_dummies = pd.get_dummies(df.index.month, prefix='is')
+     month_dummies = month_dummies.reindex(columns=[f'is_{m}' for m in range(1, 13)], fill_value=0)
+     month_names = ['is_jan', 'is_feb', 'is_mar', 'is_apr', 'is_may', 'is_jun',
+                    'is_jul', 'is_aug', 'is_sep', 'is_oct', 'is_nov', 'is_dec']
+     month_dummies.columns = month_names
+     month_dummies = month_dummies.iloc[:, :-1]  # Exclude the December column to avoid redundancy
+     month_dummies = month_dummies.astype(int)  # Convert to 1 and 0
+     month_dummies.index = df.index
+
+     # 4. One-hot encoding of hour (hour_0, hour_1, ..., hour_22, excluding hour_23)
+     hour_dummies = pd.get_dummies(df.index.hour, prefix='hour', drop_first=False).iloc[:, :-1]
+     hour_dummies = hour_dummies.astype(int)  # Convert to 1 and 0
+     hour_dummies.index = df.index
+
+     # 5. One-hot encoding of day of week (is_mon, is_tue, ..., is_sat, excluding Sunday)
+     # Mapping day of week (0=Mon, 1=Tue, ..., 6=Sun)
+     dayofweek_dummies = pd.get_dummies(df.index.dayofweek, prefix='is', drop_first=False).iloc[:, :-1]
+     dayofweek_names = ['is_mon', 'is_tue', 'is_wed', 'is_thu', 'is_fri', 'is_sat']
+     dayofweek_dummies.columns = dayofweek_names[:dayofweek_dummies.shape[1]]  # Apply custom column names
+     dayofweek_dummies = dayofweek_dummies.astype(int)  # Convert to 1 and 0
+     dayofweek_dummies.index = df.index
+
+     # 6. Is weekday (1 if Monday to Friday, 0 if Saturday/Sunday)
+     is_weekday = pd.Series((df.index.dayofweek < 5).astype(int), index=df.index)
+
+     # Concatenate all new features into the original dataframe at once
+     df = pd.concat([df,
+                     numeric_datetime.rename('numeric_datetime'),
+                     year.rename('year'),
+                     month_dummies,
+                     hour_dummies,
+                     dayofweek_dummies,
+                     is_weekday.rename('is_weekday')], axis=1)
+
+     block_length, holdout_df, df = separate_holdout(df, n_block)
+
+     return block_length, holdout_df, df
+
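+ # Feature sketch (illustrative): after processing, each row carries the target
+ # 'y', its lag columns ('y_lag_...'), and calendar features such as
+ # numeric_datetime, year, is_jan..is_nov, hour_0..hour_22, is_mon..is_sat,
+ # and is_weekday. One dummy level per group is dropped to avoid redundancy.
+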
+ # # CROSS VALIDATION
+ # SPLIT TRAIN - DEV - TEST SET
+ def split_time_series(df, cv_no, test_pct):
+     """Split df into train and test sets using blocked cross validation.
+
+     Args:
+         df (df): df that will be used for the training and validation (test) sets, consists of X and y
+         cv_no (int): number of the current CV fold.
+             cv_no=1 means the test set is at the end, cv_no=k means the test set is at the beginning
+         test_pct (float): fraction of rows per test block, typically 1/k
+
+     Returns:
+         train_df (df): df used for training
+         test_df (df): df used for validation; the formal name is validation set / dev set.
+     """
+
+     n = len(df)
+     test_start = int(n*(1 - cv_no*test_pct))
+     test_end = int(n*(1 - (cv_no-1)*test_pct))
+
+     test_df = df.iloc[test_start:test_end]
+     train_df = df.drop(df.index[test_start:test_end])
+
+     return train_df, test_df
+
+
+ # SPLIT X AND y
+
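+ # Fold sketch (hypothetical sizes): with n=1000 rows, k=10 and test_pct=0.1,
+ # cv_no=1 tests on rows 900:1000, cv_no=2 on rows 800:900, ..., cv_no=10 on
+ # rows 0:100; the remaining rows form the training set each time.
+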
+ def split_xy(df):
+     """Separate the forecast target y and all predictors X into two dfs.
+
+     Args:
+         df (df): df containing the forecast target y and all predictors X
+
+     Returns:
+         df_X (df): df of all predictors X
+         df_y (df): df of the forecast target y
+     """
+
+     df_y = df[['y']]
+     df_X = df.drop("y", axis=1)
+
+     return df_X, df_y
+
+
+ # # RUN MODEL
+
+ def remove_jump_df(train_df_y):
+     """
+     Remove a jump (gap) in the time series data by truncating at the first
+     timestamp whose spacing exceeds the initial frequency.
+
+     Args:
+         train_df_y (pd.Series): time series data
+
+     Returns:
+         train_df_y_updated (pd.Series): time series data with the jump removed
+     """
+
+     time_diff = train_df_y.index.to_series().diff().dt.total_seconds()
+     initial_freq = time_diff.iloc[1]
+     jump_indices = time_diff[time_diff > initial_freq].index
+     if not jump_indices.empty:
+         jump_index = jump_indices[0]
+         jump_pos = train_df_y.index.get_loc(jump_index)
+         train_df_y_updated = train_df_y.iloc[:jump_pos]
+     else:
+         train_df_y_updated = train_df_y
+     return train_df_y_updated
+
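+ # Jump sketch: for a 30-minute index [00:00, 00:30, 01:00, 03:00, 03:30] the
+ # diff at 03:00 is 7200 s > 1800 s, so the series is truncated to
+ # [00:00, 00:30, 01:00]. Presumably useful to model modules that need a
+ # contiguous history after blocked CV removes a block from mid-series.
+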
+ def call_train(train_fn, hyperparameter, train_df_X, train_df_y, forecast_horizon):
+     """
+     Call a model's train function robustly across different signatures.
+
+     Args:
+         train_fn (callable): train_model_<model_name> function
+         hyperparameter (dict): hyperparameter dict from YAML
+         train_df_X (df): predictors
+         train_df_y (df): target
+         forecast_horizon (int): minutes
+
+     Returns:
+         object: model object returned by the model's train function
+     """
+     try:
+         return train_fn(hyperparameter, train_df_X, train_df_y, forecast_horizon)
+     except TypeError:
+         return train_fn(hyperparameter, train_df_X, train_df_y)
+
+ def call_forecast(forecast_fn, model, train_df_X, test_df_X, train_df_y, forecast_horizon):
+     """
+     Call a model's forecast function robustly across different signatures.
+
+     Args:
+         forecast_fn (callable): produce_forecast_<model_name> function
+         model (object): trained model object
+         train_df_X (df): predictors for train
+         test_df_X (df): predictors for test
+         train_df_y (df): target for train (needed by Prophet)
+         forecast_horizon (int): minutes
+
+     Returns:
+         tuple: (train_df_y_hat, test_df_y_hat)
+     """
+     # Most general first (Prophet-like)
+     try:
+         return forecast_fn(model, train_df_X, test_df_X, train_df_y, forecast_horizon)
+     except TypeError:
+         pass
+     # Statsmodels-like
+     try:
+         return forecast_fn(model, train_df_X, test_df_X, forecast_horizon)
+     except TypeError:
+         pass
+     # Simple ML models
+     return forecast_fn(model, train_df_X, test_df_X)
+
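+ # Dispatch sketch: a workspace model only has to expose one of the accepted
+ # signatures. A minimal module (hypothetical) that the fallback chain reaches
+ # on its last attempt:
+ #
+ #     def train_model_m6_lr(hyperparameter, train_df_X, train_df_y):
+ #         ...
+ #     def produce_forecast_m6_lr(model, train_df_X, test_df_X):
+ #         return train_df_y_hat, test_df_y_hat
+ #
+ # Note that a TypeError raised *inside* a matching function is also caught
+ # here and triggers the next, shorter call.
+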
+ def save_model(filepath, cv_no, model):
+     """Export the trained model into a binary file using dill at the designated path.
+
+     Args:
+         filepath (dict): dictionary of file paths
+         cv_no (int): cv number
+         model (object): trained model
+     """
+
+     with open(filepath['model'][cv_no], "wb") as model_file:
+         # pickle.dump(model, model_file)
+         dill.dump(model, model_file)
+
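+ # Round-trip sketch: models saved this way can be restored with dill, e.g.
+ #
+ #     with open(filepath['model'][1], "rb") as f:
+ #         model = dill.load(f)
+ #
+ # dill is used instead of pickle, presumably because some model objects carry
+ # closures or lambdas that plain pickle cannot serialize.
+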
+ def to_series(y_hat, target_index):
+     """
+     Convert model output to a 1D pandas Series aligned to target_index.
+
+     Rules:
+     - If y_hat is a Series/DataFrame with its own index: reindex to target_index.
+     - If y_hat is array-like: require a matching length (otherwise raise).
+
+     Args:
+         y_hat (any): model output (np array / Series / DataFrame)
+         target_index (pd.Index): desired index
+
+     Returns:
+         pd.Series: forecast aligned to target_index
+     """
+     # DataFrame -> Series
+     if isinstance(y_hat, pd.DataFrame):
+         # a single column is expected; defensively keep only the first column
+         s = y_hat.iloc[:, 0]
+     elif isinstance(y_hat, pd.Series):
+         s = y_hat
+     else:
+         arr = np.asarray(y_hat).ravel()
+         if len(arr) != len(target_index):
+             raise ValueError(
+                 f"Forecast length mismatch: got {len(arr)} values, expected {len(target_index)}"
+             )
+         return pd.Series(arr, index=target_index)
+
+     # Pandas objects carry their own index, so align by timestamps
+     return s.reindex(target_index)
+
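+ # Alignment sketch (illustrative values):
+ #
+ #     idx = pd.date_range("2020-01-01", periods=3, freq="30min")
+ #     to_series(np.array([1.0, 2.0, 3.0]), idx)             # length-checked, gets idx
+ #     to_series(pd.Series([1.0, 2.0], index=idx[:2]), idx)  # reindexed; last value is NaN
+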
+ def run_model(
+     df,
+     model_mod,
+     model_name,
+     hyperparameter,
+     filepath,
+     forecast_horizon,
+     experiment_no_str,
+     block_length,
+     *,
+     dataset_file,
+     hyperparameter_no,
+     k,
+     test_pct,
+     train_pct,
+     n_block,
+     plot_enabled,
+     plot_style,
+ ):
+     """
+     Run the CV loop, train the model, produce forecasts, and export outputs.
+
+     Args:
+         df (df): processed dataframe used for CV
+         model_mod (module): loaded model module from the workspace
+         model_name (str): e.g. "m6_lr"
+         hyperparameter (dict): hp dict
+         filepath (dict): export paths from prepare_directory
+         forecast_horizon (int): minutes
+         experiment_no_str (str): e.g. "E00001"
+         block_length (int): weeks per block
+         dataset_file (str): dataset filename, e.g. "ds0_test.csv"
+         hyperparameter_no (str): e.g. "hp1"
+         k (int): number of CV folds
+         test_pct (float): 1/k
+         train_pct (float): 1 - test_pct
+         n_block (int): k + 1
+         plot_enabled (bool): plot on/off
+         plot_style (dict): colors + font
+
+     Returns:
+         None
+     """
+
+     import warnings
+     warnings.filterwarnings("ignore", category=RuntimeWarning)
+
+     cross_val_result_df = pd.DataFrame()
+
+     # Compute max_y for normalization later
+     max_y = df['y'].max()
+
+     # DO CROSS VALIDATION
+     for cv_no in range(1, k+1):
+         print(f'Processing CV {cv_no} / {k}....')
+
+         # SPLIT INTO TRAIN AND TEST X AND Y
+         train_df, test_df = split_time_series(df, cv_no, test_pct)
+         train_df_X, train_df_y = split_xy(train_df)
+         test_df_X, test_df_y = split_xy(test_df)
+
+         # INITIALISE RESULT DF
+         train_result = train_df_y.copy()
+         train_result = train_result.rename(columns={'y': 'observation'})
+
+         test_result = test_df_y.copy()
+         test_result = test_result.rename(columns={'y': 'observation'})
+
+         # PRODUCE NAIVE FORECAST
+         horizon_timedelta = pd.Timedelta(minutes=forecast_horizon)
+         last_observation = f'y_lag_{horizon_timedelta}m'
+         train_result['naive'] = train_df[last_observation]
+         test_result['naive'] = test_df[last_observation]
+
+         # CALL TRAIN AND FORECAST PRODUCTION
+         train_fn = getattr(model_mod, f"train_model_{model_name}")
+         forecast_fn = getattr(model_mod, f"produce_forecast_{model_name}")
+
+         # TRAIN MODEL (the model is saved after timing so that serialization
+         # does not inflate the reported training runtime)
+         start_time = time.time()
+         model = call_train(train_fn, hyperparameter, train_df_X, train_df_y, forecast_horizon)
+         end_time = time.time()
+         runtime_ms = (end_time - start_time) * 1000
+         save_model(filepath, cv_no, model)
+
+         # PRODUCE FORECAST
+         train_df_y_hat, test_df_y_hat = call_forecast(
+             forecast_fn, model, train_df_X, test_df_X, train_df_y, forecast_horizon
+         )
+
+         # NORMALIZE FORECAST OUTPUTS
+         train_result["forecast"] = to_series(train_df_y_hat, train_df_X.index)
+         test_result["forecast"] = to_series(test_df_y_hat, test_df_X.index)
+
+         # EVALUATE FORECAST
+         train_result['residual'] = train_result['forecast'] - train_result['observation']
+         test_result['residual'] = test_result['forecast'] - test_result['observation']
+         train_R2 = compute_R2(train_result['forecast'], train_result['observation'])
+         test_R2 = compute_R2(test_result['forecast'], test_result['observation'])
+
+         train_RMSE = compute_RMSE(train_result['forecast'], train_result['observation'])
+         test_RMSE = compute_RMSE(test_result['forecast'], test_result['observation'])
+
+         train_nRMSE = 100 * train_RMSE / max_y  # in percent
+         test_nRMSE = 100 * test_RMSE / max_y  # in percent
+
+         cross_val_result = pd.DataFrame(
+             {
+                 "runtime_ms": runtime_ms,
+                 "train_MBE": compute_MBE(train_result['forecast'], train_result['observation']),
+                 "train_MAE": compute_MAE(train_result['forecast'], train_result['observation']),
+                 "train_RMSE": train_RMSE,
+                 "train_MAPE": compute_MAPE(train_result['forecast'], train_result['observation']),
+                 "train_MASE": compute_MASE(train_result['forecast'], train_result['observation'], train_result),
+                 "train_fskill": compute_fskill(train_result['forecast'], train_result['observation'], train_result['naive']),
+                 "train_R2": train_R2,
+                 "test_MBE": compute_MBE(test_result['forecast'], test_result['observation']),
+                 "test_MAE": compute_MAE(test_result['forecast'], test_result['observation']),
+                 "test_RMSE": test_RMSE,
+                 "test_MAPE": compute_MAPE(test_result['forecast'], test_result['observation']),
+                 "test_MASE": compute_MASE(test_result['forecast'], test_result['observation'], train_result),
+                 "test_fskill": compute_fskill(test_result['forecast'], test_result['observation'], test_result['naive']),
+                 "test_R2": test_R2,
+                 "train_nRMSE": train_nRMSE,
+                 "test_nRMSE": test_nRMSE
+             },
+             index=[cv_no]
+         )
+
+         if cross_val_result_df.empty:
+             cross_val_result_df = cross_val_result
+         else:
+             cross_val_result_df = pd.concat([cross_val_result_df, cross_val_result], ignore_index=False)
+         cross_val_result_df.index.name = 'cv_no'
+
+         # EXPORT RESULTS DF TO CSV
+         train_result.to_csv(filepath['train_cv'][cv_no])
+         test_result.to_csv(filepath['test_cv'][cv_no])
+
+         # IF CV_NO = 1, ALSO EXPORT SOME PLOTS
+         if plot_enabled and cv_no == 1:
+             dark_blue = plot_style["colors"]["dark_blue"]
+             orange = plot_style["colors"]["orange"]
+             plt.rcParams["font.family"] = plot_style["font_family"]
+
+             timeplot_forecast(train_result["observation"], train_result["forecast"], filepath["b1"], dark_blue, orange)
+             timeplot_forecast(test_result["observation"], test_result["forecast"], filepath["c1"], dark_blue, orange)
+             scatterplot_forecast(train_result["observation"], train_result["forecast"], train_R2, filepath["b2"], dark_blue, orange)
+             scatterplot_forecast(test_result["observation"], test_result["forecast"], test_R2, filepath["c2"], dark_blue, orange)
+             timeplot_residual(train_result["residual"], filepath["b3"], dark_blue, orange)
+             timeplot_residual(test_result["residual"], filepath["c3"], dark_blue, orange)
+             histogram_residual(train_result["residual"], df, filepath["b4"], dark_blue, orange)
+             histogram_residual(test_result["residual"], df, filepath["c4"], dark_blue, orange)
+
+         print()
+
+     # APPEND SUMMARY ROWS (mean and stddev across folds)
+     cross_val_result = pd.DataFrame(
+         {
+             "runtime_ms": [cross_val_result_df['runtime_ms'].mean(), cross_val_result_df['runtime_ms'].std()],
+             "train_MBE": [cross_val_result_df['train_MBE'].mean(), cross_val_result_df['train_MBE'].std()],
+             "train_MAE": [cross_val_result_df['train_MAE'].mean(), cross_val_result_df['train_MAE'].std()],
+             "train_RMSE": [cross_val_result_df['train_RMSE'].mean(), cross_val_result_df['train_RMSE'].std()],
+             "train_MAPE": [cross_val_result_df['train_MAPE'].mean(), cross_val_result_df['train_MAPE'].std()],
+             "train_MASE": [cross_val_result_df['train_MASE'].mean(), cross_val_result_df['train_MASE'].std()],
+             "train_fskill": [cross_val_result_df['train_fskill'].mean(), cross_val_result_df['train_fskill'].std()],
+             "train_R2": [cross_val_result_df['train_R2'].mean(), cross_val_result_df['train_R2'].std()],
+             "test_MBE": [cross_val_result_df['test_MBE'].mean(), cross_val_result_df['test_MBE'].std()],
+             "test_MAE": [cross_val_result_df['test_MAE'].mean(), cross_val_result_df['test_MAE'].std()],
+             "test_RMSE": [cross_val_result_df['test_RMSE'].mean(), cross_val_result_df['test_RMSE'].std()],
+             "test_MAPE": [cross_val_result_df['test_MAPE'].mean(), cross_val_result_df['test_MAPE'].std()],
+             "test_MASE": [cross_val_result_df['test_MASE'].mean(), cross_val_result_df['test_MASE'].std()],
+             "test_fskill": [cross_val_result_df['test_fskill'].mean(), cross_val_result_df['test_fskill'].std()],
+             "test_R2": [cross_val_result_df['test_R2'].mean(), cross_val_result_df['test_R2'].std()],
+             "train_nRMSE": [cross_val_result_df['train_nRMSE'].mean(), cross_val_result_df['train_nRMSE'].std()],
+             "test_nRMSE": [cross_val_result_df['test_nRMSE'].mean(), cross_val_result_df['test_nRMSE'].std()]
+         },
+         index=['mean', 'stddev']
+     )
+
+     cross_val_result_df = pd.concat([cross_val_result_df, cross_val_result], ignore_index=False)
+
+     data_a1 = {
+         "experiment_no": experiment_no_str,
+         "exp_date": datetime.today().strftime('%Y-%m-%d'),  # today's date in YYYY-MM-DD format
+         "dataset_no": dataset_file.split('_')[0],
+         "dataset": dataset_file.split('_')[1].split('.')[0] if '_' in dataset_file else dataset_file.split('.')[0],
+         "dataset_freq_min": int((df.index[1] - df.index[0]).total_seconds() / 60),
+         "dataset_length_week": block_length * (n_block - 1),
+         "forecast_horizon_min": forecast_horizon,
+         "train_pct": train_pct,
+         "test_pct": test_pct,
+         "model_no": model_name.split('_')[0],
+         "hyperparameter_no": hyperparameter_no,
+         "model_name": model_name + '_' + hyperparameter_no,
+         "hyperparameter": ', '.join(f"{key}: {value}" for key, value in hyperparameter.items()),
+         "runtime_ms": cross_val_result_df.loc['mean', 'runtime_ms'],
+         "train_RMSE": cross_val_result_df.loc['mean', 'train_RMSE'],
+         "train_RMSE_stddev": cross_val_result_df.loc['stddev', 'train_RMSE'],
+         "test_RMSE": cross_val_result_df.loc['mean', 'test_RMSE'],
+         "test_RMSE_stddev": cross_val_result_df.loc['stddev', 'test_RMSE'],
+         "train_nRMSE": cross_val_result_df.loc['mean', 'train_nRMSE'],
+         "train_nRMSE_stddev": cross_val_result_df.loc['stddev', 'train_nRMSE'],
+         "test_nRMSE": cross_val_result_df.loc['mean', 'test_nRMSE'],
+         "test_nRMSE_stddev": cross_val_result_df.loc['stddev', 'test_nRMSE']
+     }
+
+     # Create a df of the experiment result
+     df_a1_result = pd.DataFrame([data_a1])
+
+     export_result(filepath, df_a1_result, cross_val_result_df, hyperparameter)
+
+     # return df_a1_result, cross_val_result_df
+
+ def validate_model_module(model_mod, model_name: str) -> None:
+     """
+     Validate that a model module provides the required functions.
+
+     Args:
+         model_mod (module): loaded model module
+         model_name (str): file stem, e.g. "m6_lr"
+
+     Returns:
+         None
+     """
+     train_name = f"train_model_{model_name}"
+     fcst_name = f"produce_forecast_{model_name}"
+     if not hasattr(model_mod, train_name):
+         raise AttributeError(f"Missing function '{train_name}' in {model_name}.py")
+     if not hasattr(model_mod, fcst_name):
+         raise AttributeError(f"Missing function '{fcst_name}' in {model_name}.py")
+
+
+ # RUN THE TOOL
+ def run_experiment_engine(
+     dataset_path,
+     forecast_horizon_min,
+     model_name,
+     hyperparameter_no,
+     hyperparameter,
+     output_dir,
+     models_dir,
+     config,
+ ):
+     """
+     Run one experiment end-to-end using explicit inputs (no notebook globals).
+
+     Args:
+         dataset_path (str | Path): full path to dataset CSV (workspace/data/...)
+         forecast_horizon_min (int): forecast horizon in minutes
+         model_name (str): model name, e.g. "m6_lr"
+         hyperparameter_no (str): hp identifier, e.g. "hp1"
+         hyperparameter (dict): hyperparameter dict for this run
+         output_dir (str | Path): workspace/experiment_result
+         models_dir (str | Path): workspace/models (workspace-only)
+         config (dict): parsed workspace/specs/pynnlf_config.yaml (cv + plot + paths + registries)
+
+     Returns:
+         None
+     """
+     dataset_path = Path(dataset_path)
+     dataset_file = dataset_path.name
+
+     # CV config
+     k = int(config["cv"]["k"])
+     test_pct = 1 / k
+     train_pct = 1 - test_pct
+     n_block = k + 1
+     max_lag_day = int(config["cv"]["max_lag_day"])
+
+     # load the workspace model module
+     model_mod = load_model_module(Path(models_dir), model_name)
+     validate_model_module(model_mod, model_name)
+
+     # folders + filepaths
+     hyperparameter_used, experiment_no_str, filepath = prepare_directory(
+         str(Path(output_dir)) + "/",
+         dataset_file,
+         forecast_horizon_min,
+         model_name,
+         hyperparameter_no,
+         hyperparameter,
+     )
+
+     # data prep
+     block_length, holdout_df, df = input_and_process(
+         dataset_path,
+         model_name,
+         forecast_horizon_min,
+         max_lag_day,
+         n_block,
+         hyperparameter,
+     )
+
+     # run CV + export a1/a2/a3
+     run_model(
+         df,
+         model_mod,
+         model_name,
+         hyperparameter,
+         filepath,
+         forecast_horizon_min,
+         experiment_no_str,
+         block_length,
+         dataset_file=dataset_file,
+         hyperparameter_no=hyperparameter_no,
+         k=k,
+         test_pct=test_pct,
+         train_pct=train_pct,
+         n_block=n_block,
+         plot_enabled=bool(config["plot"]["enabled"]),
+         plot_style=config["plot"],
+     )
+
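+ # Config sketch: the keys this engine actually reads from pynnlf_config.yaml,
+ # shown as the parsed dict (values below are illustrative, not shipped defaults):
+ #
+ #     config = {
+ #         "cv": {"k": 10, "max_lag_day": 7},
+ #         "plot": {
+ #             "enabled": True,
+ #             "font_family": "DejaVu Sans",
+ #             "colors": {"dark_blue": "#1f3b57", "orange": "#e8882d"},
+ #         },
+ #     }
+ #     run_experiment_engine("workspace/data/ds0_test.csv", 30, "m6_lr", "hp1",
+ #                           {"example_hp": 1}, "workspace/experiment_result",
+ #                           "workspace/models", config)
+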
+ # # PERFORMANCE COMPUTATION
+
+ # Mean Bias Error (MBE)
+ def compute_MBE(forecast, observation):
+     """Compute the Mean Bias Error: the mean of (forecast - observation).
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+
+     Returns:
+         float: MBE, rounded to 5 decimal places
+     """
+     return round(((forecast - observation).sum()) / len(observation), 5)
+
+ # Mean Absolute Error (MAE)
+ def compute_MAE(forecast, observation):
+     """Compute the Mean Absolute Error: the mean of |forecast - observation|.
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+
+     Returns:
+         float: MAE, rounded to 3 decimal places
+     """
+     return round((abs(forecast - observation)).mean(), 3)
+
+ # Root Mean Square Error (RMSE)
+ def compute_RMSE(forecast, observation):
+     """Compute the Root Mean Square Error: the square root of the mean squared residual.
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+
+     Returns:
+         float: RMSE, rounded to 3 decimal places
+     """
+     return round(np.sqrt(((forecast - observation) ** 2).mean()), 3)
+
+ # Mean Absolute Percentage Error (MAPE)
+ def compute_MAPE(forecast, observation):
+     """Compute the Mean Absolute Percentage Error, in percent. Be careful with
+     MAPE: its value can go to inf because the observed value can be 0.
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+
+     Returns:
+         float: MAPE in percent, rounded to 3 decimal places
+     """
+     return round((abs((forecast - observation) / observation) * 100).mean(), 3)
+
+ # Mean Absolute Scaled Error (MASE)
+ def compute_MASE(forecast, observation, train_result):
+     """Compute the Mean Absolute Scaled Error. MASE was introduced by Rob Hyndman
+     to handle MAPE becoming infinite: instead of dividing by the observed value,
+     MASE divides by the MAE of the naive forecast on the training set.
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+         train_result (df): training-set results holding 'naive' and 'observation' columns
+
+     Returns:
+         float: MASE, rounded to 3 decimal places
+     """
+     errors = abs(forecast - observation)
+     MAE_naive = compute_MAE(train_result['naive'], train_result['observation'])
+
+     MASE = errors.mean() / MAE_naive
+     return round(MASE, 3)
+
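+ # In formula form: MASE = mean(|forecast - observation|) / MAE_naive_train,
+ # so MASE < 1 means the model beats the naive forecast's train-set MAE.
+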
+ # Forecast Skill (FS)
+ def compute_fskill(forecast, observation, naive):
+     """Compute the Forecast Skill: a relative measure of the improvement of the
+     model over the naive model, FS = (1 - RMSE_model / RMSE_naive) * 100.
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+         naive (pd.Series): naive forecast values
+
+     Returns:
+         float: forecast skill in percent, rounded to 3 decimal places
+     """
+     return round((1 - compute_RMSE(forecast, observation) / compute_RMSE(naive, observation)) * 100, 3)
+
+ # R2
+ def compute_R2(forecast, observation):
+     """Compute R² as the squared Pearson correlation of forecast and observation.
+     Be careful with R²: it is not a forecast evaluation metric by itself; it is
+     only used to show linearity on the scatter plot of forecast vs observed value.
+
+     Args:
+         forecast (pd.Series): forecast values from the model
+         observation (pd.Series): observed (actual) values
+
+     Returns:
+         float: R², rounded to 3 decimal places
+     """
+     return round(forecast.corr(observation)**2, 3)
+
+ # # PLOT
+ def timeplot_forecast(observation, forecast, pathname, dark_blue, orange):
+     """Produce a time plot of observation vs forecast value (last week of data)
+     and save it in the designated folder.
+
+     Args:
+         observation (pd.Series): observed values
+         forecast (pd.Series): forecast values
+         pathname (str): filepath to save the figure
+         dark_blue (str): line color for observations
+         orange (str): line color for forecasts
+     """
+     consecutive_timedelta = observation.index[-1] - observation.index[-2]
+     # Calculate total minutes in a week
+     minutes_per_week = 7 * 24 * 60  # 7 days * 24 hours * 60 minutes
+
+     # Calculate the number of minutes per timestep
+     minutes_per_timestep = consecutive_timedelta.total_seconds() / 60  # convert seconds to minutes
+
+     # Compute the number of timesteps in a week
+     timesteps_per_week = int(minutes_per_week / minutes_per_timestep)
+
+     # Create the figure with specified size
+     plt.figure(figsize=(9, 9))
+
+     # Plot the actual and forecast data
+     plt.plot(observation[-timesteps_per_week:], color=dark_blue, label='Actual')
+     plt.plot(forecast[-timesteps_per_week:], color=orange, label='Forecast')
+
+     # Remove grid lines
+     plt.grid(False)
+
+     # Set tick marks for x and y axis
+     plt.xticks(fontsize=12, color=dark_blue, alpha=0.5, rotation=30)
+     plt.yticks(fontsize=12, color=dark_blue, alpha=0.5)
+
+     # Add borders to the plot
+     plt.gca().spines['top'].set_color(dark_blue)
+     plt.gca().spines['right'].set_color(dark_blue)
+     plt.gca().spines['bottom'].set_color(dark_blue)
+     plt.gca().spines['left'].set_color(dark_blue)
+
+     # Remove the tick markers (the small lines)
+     plt.tick_params(axis='x', which='both', length=0)
+     plt.tick_params(axis='y', which='both', length=0)
+
+     # Set axis titles
+     plt.xlabel('Time', fontsize=14, color=dark_blue)
+     plt.ylabel('Net Load (kW)', fontsize=14, color=dark_blue)
+
+     # Remove title
+     plt.title('')
+
+     plt.legend(loc='upper left', fontsize=12, frameon=False, labelspacing=1, bbox_to_anchor=(1, 1))
+
+     plt.savefig(pathname, format='png', bbox_inches='tight')
+     plt.close()
+
+ def scatterplot_forecast(observation, forecast, R2, pathname, dark_blue, orange):
+     """Produce a scatter plot of observation vs forecast value and save it in the designated folder.
+
+     Args:
+         observation (pd.Series): observed values
+         forecast (pd.Series): forecast values
+         R2 (float): R² value annotated on the plot
+         pathname (str): filepath to save the figure
+         dark_blue (str): marker color
+         orange (str): unused, kept for a uniform plotting signature
+     """
+     # Create the figure with specified size
+     plt.figure(figsize=(9, 9))
+
+     # Plot forecast vs observation
+     plt.scatter(forecast, observation, color=dark_blue, s=40, alpha=0.7)  # 's' sets the size of the points
+
+     # Remove grid lines
+     plt.grid(False)
+
+     # Set tick marks for x and y axis
+     plt.xticks(fontsize=12, color=dark_blue, alpha=0.5, rotation=0)
+     plt.yticks(fontsize=12, color=dark_blue, alpha=0.5)
+
+     # Add borders to the plot
+     plt.gca().spines['top'].set_color(dark_blue)
+     plt.gca().spines['right'].set_color(dark_blue)
+     plt.gca().spines['bottom'].set_color(dark_blue)
+     plt.gca().spines['left'].set_color(dark_blue)
+
+     # Remove the tick markers (the small lines)
+     plt.tick_params(axis='x', which='both', length=0)
+     plt.tick_params(axis='y', which='both', length=0)
+
+     # Set axis titles
+     plt.xlabel('Net Load Forecast (kW)', fontsize=14, color=dark_blue)
+     plt.ylabel('Net Load Observation (kW)', fontsize=14, color=dark_blue)
+
+     # Remove title
+     plt.title('')
+
+     # Add the R² value at the bottom-right corner
+     plt.text(0.95, 0.05, f'R² = {R2:.3f}', transform=plt.gca().transAxes,
+              fontsize=14, color=dark_blue, verticalalignment='bottom', horizontalalignment='right',
+              bbox=dict(facecolor='white', edgecolor=dark_blue, boxstyle='round,pad=0.5', linewidth=1))
+
+     plt.savefig(pathname, format='png', bbox_inches='tight')
+     plt.close()
+
+ def timeplot_residual(residual, pathname, dark_blue, orange):
+     """Produce a time plot of the residual values (last week of data) and save it
+     in the designated folder.
+
+     Args:
+         residual (pd.Series): forecast - observation
+         pathname (str): filepath to save the figure
+         dark_blue (str): line color
+         orange (str): unused, kept for a uniform plotting signature
+     """
+     consecutive_timedelta = residual.index[-1] - residual.index[-2]
+     # Calculate total minutes in a week
+     minutes_per_week = 7 * 24 * 60  # 7 days * 24 hours * 60 minutes
+
+     # Calculate the number of minutes per timestep
+     minutes_per_timestep = consecutive_timedelta.total_seconds() / 60  # convert seconds to minutes
+
+     # Compute the number of timesteps in a week
+     timesteps_per_week = int(minutes_per_week / minutes_per_timestep)
+
+     # Create the figure with specified size
+     plt.figure(figsize=(9, 9))
+
+     # Plot the residual data
+     plt.plot(residual[-timesteps_per_week:], color=dark_blue)
+
+     # Remove grid lines
+     plt.grid(False)
+
+     # Set tick marks for x and y axis
+     plt.xticks(fontsize=12, color=dark_blue, alpha=0.5, rotation=30)
+     plt.yticks(fontsize=12, color=dark_blue, alpha=0.5)
+
+     # Add borders to the plot
+     plt.gca().spines['top'].set_color(dark_blue)
+     plt.gca().spines['right'].set_color(dark_blue)
+     plt.gca().spines['bottom'].set_color(dark_blue)
+     plt.gca().spines['left'].set_color(dark_blue)
+
+     # Remove the tick markers (the small lines)
+     plt.tick_params(axis='x', which='both', length=0)
+     plt.tick_params(axis='y', which='both', length=0)
+
+     # Set axis titles
+     plt.xlabel('Time', fontsize=14, color=dark_blue)
+     plt.ylabel('Forecast Residual (kW)', fontsize=14, color=dark_blue)
+
+     # Remove title
+     plt.title('')
+
+     plt.savefig(pathname, format='png', bbox_inches='tight')
+     plt.close()
+
+ def histogram_residual(residual, df, pathname, dark_blue, orange):
+     """Produce a histogram of the residual values and save it in the designated folder.
+
+     Args:
+         residual (pd.Series): forecast - observation
+         df (df): processed dataframe; the range of df['y'] sets the histogram bin range
+         pathname (str): filepath to save the figure
+         dark_blue (str): bar color
+         orange (str): unused, kept for a uniform plotting signature
+     """
+     # Create the figure with specified size
+     plt.figure(figsize=(9, 9))
+
+     # Compute the bin range from the dataset range
+     dataset_range = df['y'].max() - df['y'].min()
+     bin_min = -dataset_range / 7
+     bin_max = dataset_range / 7
+
+     # Plot the residual histogram
+     plt.hist(residual, bins=31, range=(bin_min, bin_max), color=dark_blue, edgecolor=dark_blue, alpha=0.7)
+
+     # Remove grid lines
+     plt.grid(False)
+
+     # Set tick marks for x and y axis
+     plt.xticks(fontsize=12, color=dark_blue, alpha=0.5, rotation=0)
+     plt.yticks(fontsize=12, color=dark_blue, alpha=0.5)
+
+     # Add borders to the plot
+     plt.gca().spines['top'].set_color(dark_blue)
+     plt.gca().spines['right'].set_color(dark_blue)
+     plt.gca().spines['bottom'].set_color(dark_blue)
+     plt.gca().spines['left'].set_color(dark_blue)
+
+     # Remove the tick markers (the small lines)
+     plt.tick_params(axis='x', which='both', length=0)
+     plt.tick_params(axis='y', which='both', length=0)
+
+     # Set axis titles
+     plt.xlabel('Forecast Residual (kW)', fontsize=14, color=dark_blue)
+     plt.ylabel('Count', fontsize=14, color=dark_blue)
+
+     # Remove title
+     plt.title('')
+
+     plt.savefig(pathname, format='png', bbox_inches='tight')
+     plt.close()