oracle-ads 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. ads/aqua/__init__.py +12 -0
  2. ads/aqua/base.py +324 -0
  3. ads/aqua/cli.py +19 -0
  4. ads/aqua/config/deployment_config_defaults.json +9 -0
  5. ads/aqua/config/resource_limit_names.json +7 -0
  6. ads/aqua/constants.py +45 -0
  7. ads/aqua/data.py +40 -0
  8. ads/aqua/decorator.py +101 -0
  9. ads/aqua/deployment.py +643 -0
  10. ads/aqua/dummy_data/icon.txt +1 -0
  11. ads/aqua/dummy_data/oci_model_deployments.json +56 -0
  12. ads/aqua/dummy_data/oci_models.json +1 -0
  13. ads/aqua/dummy_data/readme.md +26 -0
  14. ads/aqua/evaluation.py +1751 -0
  15. ads/aqua/exception.py +82 -0
  16. ads/aqua/extension/__init__.py +40 -0
  17. ads/aqua/extension/base_handler.py +138 -0
  18. ads/aqua/extension/common_handler.py +21 -0
  19. ads/aqua/extension/deployment_handler.py +202 -0
  20. ads/aqua/extension/evaluation_handler.py +135 -0
  21. ads/aqua/extension/finetune_handler.py +66 -0
  22. ads/aqua/extension/model_handler.py +59 -0
  23. ads/aqua/extension/ui_handler.py +201 -0
  24. ads/aqua/extension/utils.py +23 -0
  25. ads/aqua/finetune.py +579 -0
  26. ads/aqua/job.py +29 -0
  27. ads/aqua/model.py +819 -0
  28. ads/aqua/training/__init__.py +4 -0
  29. ads/aqua/training/exceptions.py +459 -0
  30. ads/aqua/ui.py +453 -0
  31. ads/aqua/utils.py +715 -0
  32. ads/cli.py +37 -6
  33. ads/common/auth.py +7 -0
  34. ads/common/decorator/__init__.py +7 -3
  35. ads/common/decorator/require_nonempty_arg.py +65 -0
  36. ads/common/object_storage_details.py +166 -7
  37. ads/common/oci_client.py +18 -1
  38. ads/common/oci_logging.py +2 -2
  39. ads/common/oci_mixin.py +4 -5
  40. ads/common/serializer.py +34 -5
  41. ads/common/utils.py +75 -10
  42. ads/config.py +40 -1
  43. ads/dataset/correlation_plot.py +10 -12
  44. ads/jobs/ads_job.py +43 -25
  45. ads/jobs/builders/infrastructure/base.py +4 -2
  46. ads/jobs/builders/infrastructure/dsc_job.py +49 -39
  47. ads/jobs/builders/runtimes/base.py +71 -1
  48. ads/jobs/builders/runtimes/container_runtime.py +4 -4
  49. ads/jobs/builders/runtimes/pytorch_runtime.py +10 -63
  50. ads/jobs/templates/driver_pytorch.py +27 -10
  51. ads/model/artifact_downloader.py +84 -14
  52. ads/model/artifact_uploader.py +25 -23
  53. ads/model/datascience_model.py +388 -38
  54. ads/model/deployment/model_deployment.py +10 -2
  55. ads/model/generic_model.py +8 -0
  56. ads/model/model_file_description_schema.json +68 -0
  57. ads/model/model_metadata.py +1 -1
  58. ads/model/service/oci_datascience_model.py +34 -5
  59. ads/opctl/config/merger.py +2 -2
  60. ads/opctl/operator/__init__.py +3 -1
  61. ads/opctl/operator/cli.py +7 -1
  62. ads/opctl/operator/cmd.py +3 -3
  63. ads/opctl/operator/common/errors.py +2 -1
  64. ads/opctl/operator/common/operator_config.py +22 -3
  65. ads/opctl/operator/common/utils.py +16 -0
  66. ads/opctl/operator/lowcode/anomaly/MLoperator +15 -0
  67. ads/opctl/operator/lowcode/anomaly/README.md +209 -0
  68. ads/opctl/operator/lowcode/anomaly/__init__.py +5 -0
  69. ads/opctl/operator/lowcode/anomaly/__main__.py +104 -0
  70. ads/opctl/operator/lowcode/anomaly/cmd.py +35 -0
  71. ads/opctl/operator/lowcode/anomaly/const.py +88 -0
  72. ads/opctl/operator/lowcode/anomaly/environment.yaml +12 -0
  73. ads/opctl/operator/lowcode/anomaly/model/__init__.py +5 -0
  74. ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +147 -0
  75. ads/opctl/operator/lowcode/anomaly/model/automlx.py +89 -0
  76. ads/opctl/operator/lowcode/anomaly/model/autots.py +103 -0
  77. ads/opctl/operator/lowcode/anomaly/model/base_model.py +354 -0
  78. ads/opctl/operator/lowcode/anomaly/model/factory.py +67 -0
  79. ads/opctl/operator/lowcode/anomaly/model/tods.py +119 -0
  80. ads/opctl/operator/lowcode/anomaly/operator_config.py +105 -0
  81. ads/opctl/operator/lowcode/anomaly/schema.yaml +359 -0
  82. ads/opctl/operator/lowcode/anomaly/utils.py +81 -0
  83. ads/opctl/operator/lowcode/common/__init__.py +5 -0
  84. ads/opctl/operator/lowcode/common/const.py +10 -0
  85. ads/opctl/operator/lowcode/common/data.py +96 -0
  86. ads/opctl/operator/lowcode/common/errors.py +41 -0
  87. ads/opctl/operator/lowcode/common/transformations.py +191 -0
  88. ads/opctl/operator/lowcode/common/utils.py +250 -0
  89. ads/opctl/operator/lowcode/forecast/README.md +3 -2
  90. ads/opctl/operator/lowcode/forecast/__main__.py +18 -2
  91. ads/opctl/operator/lowcode/forecast/cmd.py +8 -7
  92. ads/opctl/operator/lowcode/forecast/const.py +17 -1
  93. ads/opctl/operator/lowcode/forecast/environment.yaml +3 -2
  94. ads/opctl/operator/lowcode/forecast/model/arima.py +106 -117
  95. ads/opctl/operator/lowcode/forecast/model/automlx.py +204 -180
  96. ads/opctl/operator/lowcode/forecast/model/autots.py +144 -253
  97. ads/opctl/operator/lowcode/forecast/model/base_model.py +326 -259
  98. ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +325 -176
  99. ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +293 -237
  100. ads/opctl/operator/lowcode/forecast/model/prophet.py +191 -208
  101. ads/opctl/operator/lowcode/forecast/operator_config.py +24 -33
  102. ads/opctl/operator/lowcode/forecast/schema.yaml +116 -29
  103. ads/opctl/operator/lowcode/forecast/utils.py +186 -356
  104. ads/opctl/operator/lowcode/pii/model/guardrails.py +18 -15
  105. ads/opctl/operator/lowcode/pii/model/report.py +7 -7
  106. ads/opctl/operator/lowcode/pii/operator_config.py +1 -8
  107. ads/opctl/operator/lowcode/pii/utils.py +0 -82
  108. ads/opctl/operator/runtime/runtime.py +3 -2
  109. ads/telemetry/base.py +62 -0
  110. ads/telemetry/client.py +105 -0
  111. ads/telemetry/telemetry.py +6 -3
  112. {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/METADATA +44 -7
  113. {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/RECORD +116 -59
  114. ads/opctl/operator/lowcode/forecast/model/transformations.py +0 -125
  115. {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/LICENSE.txt +0 -0
  116. {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/WHEEL +0 -0
  117. {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/entry_points.txt +0 -0
@@ -11,6 +11,7 @@ from typing import List
  import fsspec
  import numpy as np
  import pandas as pd
+ import cloudpickle
  import plotly.express as px
  from plotly import graph_objects as go
  from sklearn.metrics import (
@@ -28,9 +29,13 @@ from ads.common.object_storage_details import ObjectStorageDetails
  from ads.dataset.label_encoder import DataFrameLabelEncoder
  from ads.opctl import logger

- from .const import SupportedMetrics, SupportedModels
+ from .const import SupportedMetrics, SupportedModels, RENDER_LIMIT
  from .errors import ForecastInputDataError, ForecastSchemaYamlError
  from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig
+ from ads.opctl.operator.lowcode.common.utils import merge_category_columns
+ from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns
+
+ # from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData, ForecastOutput


  def _label_encode_dataframe(df, no_encode=set()):
@@ -55,11 +60,8 @@ def smape(actual, predicted) -> float:


  def _build_metrics_per_horizon(
- data: pd.DataFrame,
- output: pd.DataFrame,
- target_columns: List[str],
- target_col: str,
- horizon_periods: int,
+ test_data: "TestData",
+ output: "ForecastOutput",
  ) -> pd.DataFrame:
  """
  Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon
@@ -70,12 +72,6 @@ def _build_metrics_per_horizon(
  Dataframe that has the actual data
  output: Pandas Dataframe
  Dataframe that has the forecasted data
- target_columns: List
- List of target category columns
- target_col: str
- Target column name (yhat)
- horizon_periods: int
- Horizon Periods

  Returns
  --------
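The hunk that follows rewrites the body of _build_metrics_per_horizon around a (Date, Series) MultiIndex with one metric row per horizon date. For orientation only, and not part of the diff, here is a minimal standalone sketch of the metrics involved; it assumes the textbook sMAPE/MAPE/wMAPE definitions, so the operator's own smape() may handle edge cases differently.

# Illustrative sketch only -- not part of the diff.
import numpy as np

def smape_sketch(actual, predicted):
    # Symmetric MAPE: mean of |y - yhat| / ((|y| + |yhat|) / 2), as a percentage.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    return np.mean(np.abs(actual - predicted) / denominator) * 100

# One horizon date, three series: actuals, forecasts, and wMAPE weights
# proportional to each series' share of the total actuals.
y_true = np.array([10.0, 12.0, 8.0])
y_pred = np.array([11.0, 11.5, 9.0])
weights = y_true / y_true.sum()
mape_per_series = np.abs((y_true - y_pred) / y_true)
print(smape_sketch(y_true, y_pred))        # sMAPE across series for this date
print(np.mean(mape_per_series))            # mean MAPE for this date
print(np.mean(mape_per_series * weights))  # mean wMAPE for this date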
@@ -85,279 +81,118 @@ def _build_metrics_per_horizon(
  """
  Assumptions:
  data and output have all the target columns.
- yhats in output are in the same order as in target_columns.
+ yhats in output are in the same order as in series_ids.
  Test data might not have sorted dates and the order of series also might differ.
  """

- # Select the data with correct order of target_columns.
- target_columns = list(set.intersection(set(target_columns), set(data.columns)))
-
- actuals_df = data[["ds"] + target_columns]
+ test_df = (
+ test_data.get_data_long()
+ .rename({test_data.dt_column_name: ForecastOutputColumns.DATE}, axis=1)
+ .set_index([ForecastOutputColumns.DATE, ForecastOutputColumns.SERIES])
+ .sort_index()
+ )
+ forecast_df = (
+ output.get_horizon_long()
+ .set_index([ForecastOutputColumns.DATE, ForecastOutputColumns.SERIES])
+ .sort_index()
+ )

- # Concat the yhats in output and include only dates that are in test data
- forecasts_df = pd.DataFrame()
- for cat in output.list_categories():
- forecast_i = output.get_category(cat)[["Date", "forecast_value"]]
- forecast_i = forecast_i[forecast_i["Date"].isin(actuals_df["ds"])]
- forecasts_df = pd.concat([forecasts_df, forecast_i.set_index("Date")], axis=1)
+ dates = test_df.index.get_level_values(0).unique()
+ common_idx = test_df.index.intersection(forecast_df.index)

- # Remove dates that are not there in output
- actuals_df = actuals_df[actuals_df["ds"].isin(forecasts_df.index.values)]
+ if len(common_idx) != len(forecast_df.index):
+ if len(dates) > output.horizon:
+ logger.debug(
+ f"Found more unique dates ({len(dates)}) in the Test Data than expected given the horizon ({output.horizon})."
+ )
+ elif len(dates) < output.horizon:
+ logger.debug(
+ f"Found fewer unique dates ({len(dates)}) in the Test Data than expected given the horizon ({output.horizon}). This will impact the metrics."
+ )
+ elif test_df.index.get_level_values(1).unique() > output.list_series_ids():
+ logger.debug(
+ f"Found more Series Ids in test data ({len(dates)}) expected from the historical data ({output.list_series_ids()})."
+ )
+ else:
+ logger.debug(
+ f"Found fewer Series Ids in test data ({len(dates)}) expected from the historical data ({output.list_series_ids()}). This will impact the metrics."
+ )

- if actuals_df.empty or forecasts_df.empty:
- return pd.DataFrame()
+ test_df = test_df.loc[common_idx]
+ forecast_df = forecast_df.loc[common_idx]

- totals = actuals_df.sum(numeric_only=True)
+ totals = test_df.sum(numeric_only=True)
  wmape_weights = np.array((totals / totals.sum()).values)

- actuals_df = actuals_df.set_index("ds")
-
- metrics_df = pd.DataFrame(
- columns=[
- SupportedMetrics.MEAN_SMAPE,
- SupportedMetrics.MEDIAN_SMAPE,
- SupportedMetrics.MEAN_MAPE,
- SupportedMetrics.MEDIAN_MAPE,
- SupportedMetrics.MEAN_WMAPE,
- SupportedMetrics.MEDIAN_WMAPE,
+ metrics_df = pd.DataFrame()
+ for date in dates:
+ y_true = test_df.xs(date, level=ForecastOutputColumns.DATE)[
+ test_data.target_name
  ]
- )
-
- for i, (y_true, y_pred) in enumerate(
- zip(actuals_df.itertuples(index=False), forecasts_df.itertuples(index=False))
- ):
- y_true, y_pred = np.array(y_true), np.array(y_pred)
+ y_pred = forecast_df.xs(date, level=ForecastOutputColumns.DATE)[
+ ForecastOutputColumns.FORECAST_VALUE
+ ]
+ y_true = np.array(y_true.values)
+ y_pred = np.array(y_pred.values)
+
+ drop_na_mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
+ if not drop_na_mask.all(): # There is a missing value
+ if drop_na_mask.any(): # All values are missing
+ logger.debug(
+ f"No test data available for date: {date}. This will affect the test metrics."
+ )
+ continue
+ logger.debug(
+ f"Missing test data for date: {date}. This will affect the test metrics."
+ )
+ y_true = y_true[drop_na_mask]
+ y_pred = y_pred[drop_na_mask]
+ smapes = smape(actual=y_true, predicted=y_pred)
+ mapes = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
+ wmapes = mapes * wmape_weights

- smapes = np.array(
- [smape(actual=y_t, predicted=y_p) for y_t, y_p in zip(y_true, y_pred)]
- )
- mapes = np.array(
+ metrics_df = pd.concat(
  [
- mean_absolute_percentage_error(y_true=[y_t], y_pred=[y_p])
- for y_t, y_p in zip(y_true, y_pred)
+ metrics_df,
+ pd.DataFrame(
+ {
+ SupportedMetrics.MEAN_SMAPE: np.mean(smapes),
+ SupportedMetrics.MEDIAN_SMAPE: np.median(smapes),
+ SupportedMetrics.MEAN_MAPE: np.mean(mapes),
+ SupportedMetrics.MEDIAN_MAPE: np.median(mapes),
+ SupportedMetrics.MEAN_WMAPE: np.mean(wmapes),
+ SupportedMetrics.MEDIAN_WMAPE: np.median(wmapes),
+ },
+ index=[date],
+ ),
  ]
  )
- wmapes = np.array([mape * weight for mape, weight in zip(mapes, wmape_weights)])
-
- metrics_row = {
- SupportedMetrics.MEAN_SMAPE: np.mean(smapes),
- SupportedMetrics.MEDIAN_SMAPE: np.median(smapes),
- SupportedMetrics.MEAN_MAPE: np.mean(mapes),
- SupportedMetrics.MEDIAN_MAPE: np.median(mapes),
- SupportedMetrics.MEAN_WMAPE: np.mean(wmapes),
- SupportedMetrics.MEDIAN_WMAPE: np.median(wmapes),
- }
-
- metrics_df = pd.concat(
- [metrics_df, pd.DataFrame(metrics_row, index=[actuals_df.index[i]])],
- )
-
  return metrics_df


- def _call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs):
- if fsspec.utils.get_protocol(filename) == "file":
- return pd_fn(filename, **kwargs)
- elif fsspec.utils.get_protocol(filename) in ["http", "https"]:
- return pd_fn(filename, **kwargs)
-
- storage_options = storage_options or (
- default_signer() if ObjectStorageDetails.is_oci_path(filename) else {}
- )
-
- return pd_fn(filename, storage_options=storage_options, **kwargs)
-
-
- def _load_data(filename, format, storage_options=None, columns=None, **kwargs):
- if not format:
- _, format = os.path.splitext(filename)
- format = format[1:]
- if format in ["json", "clipboard", "excel", "csv", "feather", "hdf"]:
- read_fn = getattr(pd, f"read_{format}")
- data = _call_pandas_fsspec(read_fn, filename, storage_options=storage_options)
- elif format in ["tsv"]:
- data = _call_pandas_fsspec(
- pd.read_csv, filename, storage_options=storage_options, sep="\t"
- )
- else:
- raise ForecastInputDataError(f"Unrecognized format: {format}")
- if columns:
- # keep only these columns, done after load because only CSV supports stream filtering
- data = data[columns]
- return data
-
-
- def _write_data(data, filename, format, storage_options, index=False, **kwargs):
- if not format:
- _, format = os.path.splitext(filename)
- format = format[1:]
- if format in ["json", "clipboard", "excel", "csv", "feather", "hdf"]:
- write_fn = getattr(data, f"to_{format}")
- return _call_pandas_fsspec(
- write_fn, filename, index=index, storage_options=storage_options
- )
- raise ForecastInputDataError(f"Unrecognized format: {format}")
-
-
- def _merge_category_columns(data, target_category_columns):
- result = data.apply(
- lambda x: "__".join([str(x[col]) for col in target_category_columns]), axis=1
- )
- return result if not result.empty else pd.Series([], dtype=str)
-
-
- def _clean_data(data, target_column, datetime_column, target_category_columns=None):
- if target_category_columns is not None:
- data["__Series__"] = _merge_category_columns(data, target_category_columns)
- unique_categories = data["__Series__"].unique()
-
- df = pd.DataFrame()
- new_target_columns = []
-
- for cat in unique_categories:
- data_cat = data[data["__Series__"] == cat].rename(
- {target_column: f"{target_column}_{cat}"}, axis=1
- )
- data_cat_clean = data_cat.drop("__Series__", axis=1).set_index(
- datetime_column
- )
- df = pd.concat([df, data_cat_clean], axis=1)
- new_target_columns.append(f"{target_column}_{cat}")
- df = df.reset_index()
-
- return df.fillna(0), new_target_columns
-
- raise ForecastSchemaYamlError(
- f"Either target_columns, target_category_columns, or datetime_column not specified."
- )
-
-
- def _validate_and_clean_data(
- cat: str, horizon: int, primary: pd.DataFrame, additional: pd.DataFrame
- ):
- """
- Checks compatibility between primary and additional dataframe for a category.
-
- Parameters
- ----------
- cat: (str)
- Category for which data is being validated.
- horizon: (int)
- horizon value for the forecast.
- primary: (pd.DataFrame)
- primary dataframe.
- additional: (pd.DataFrame)
- additional dataframe.
-
- Returns
- -------
- (pd.DataFrame, pd.DataFrame) or (None, None)
- Updated primary and additional dataframe or None values if the validation criteria does not satisfy.
- """
- # Additional data should have future values for horizon
- data_row_count = primary.shape[0]
- data_add_row_count = additional.shape[0]
- additional_surplus = data_add_row_count - horizon - data_row_count
- if additional_surplus < 0:
- logger.warn(
- "Forecast for {} will not be generated since additional data has fewer values({}) than"
- " horizon({}) + primary data({})".format(
- cat, data_add_row_count, horizon, data_row_count
- )
- )
- return None, None
- elif additional_surplus > 0:
- # Removing surplus future data in additional
- additional.drop(additional.tail(additional_surplus).index, inplace=True)
-
- # Dates in primary data should be subset of additional data
- dates_in_data = primary.index.tolist()
- dates_in_additional = additional.index.tolist()
- if not set(dates_in_data).issubset(set(dates_in_additional)):
- logger.warn(
- "Forecast for {} will not be generated since the dates in primary and additional do not"
- " match".format(cat)
- )
- return None, None
- return primary, additional
+ def load_pkl(filepath):
+ storage_options = dict()
+ if ObjectStorageDetails.is_oci_path(filepath):
+ storage_options = default_signer()

+ with fsspec.open(filepath, "rb", **storage_options) as f:
+ return cloudpickle.load(f)
+ return None

- def _build_indexed_datasets(
- data,
- target_column,
- datetime_column,
- horizon,
- target_category_columns=None,
- additional_data=None,
- metadata_data=None,
- ):
- df_by_target = dict()
- categories = []

- if target_category_columns is None:
- if additional_data is None:
- df_by_target[target_column] = data.fillna(0)
- else:
- df_by_target[target_column] = pd.concat(
- [
- data.set_index(datetime_column).fillna(0),
- additional_data.set_index(datetime_column).fillna(0),
- ],
- axis=1,
- ).reset_index()
- return df_by_target, target_column, categories
-
- data["__Series__"] = _merge_category_columns(data, target_category_columns)
- unique_categories = data["__Series__"].unique()
- invalid_categories = []
-
- if additional_data is not None and target_column in additional_data.columns:
- logger.warn(f"Dropping column '{target_column}' from additional_data")
- additional_data.drop(target_column, axis=1, inplace=True)
- for cat in unique_categories:
- data_by_cat = data[data["__Series__"] == cat].rename(
- {target_column: f"{target_column}_{cat}"}, axis=1
- )
- data_by_cat_clean = (
- data_by_cat.drop(target_category_columns + ["__Series__"], axis=1)
- .set_index(datetime_column)
- .fillna(0)
- )
- if additional_data is not None:
- additional_data["__Series__"] = _merge_category_columns(
- additional_data, target_category_columns
- )
- data_add_by_cat = additional_data[
- additional_data["__Series__"] == cat
- ].rename({target_column: f"{target_column}_{cat}"}, axis=1)
- data_add_by_cat_clean = (
- data_add_by_cat.drop(target_category_columns + ["__Series__"], axis=1)
- .set_index(datetime_column)
- .fillna(0)
- )
- valid_primary, valid_add = _validate_and_clean_data(
- cat, horizon, data_by_cat_clean, data_add_by_cat_clean
- )
-
- if valid_primary is None:
- invalid_categories.append(cat)
- data_by_cat_clean = None
- else:
- data_by_cat_clean = pd.concat([valid_add, valid_primary], axis=1)
- if data_by_cat_clean is not None:
- df_by_target[f"{target_column}_{cat}"] = data_by_cat_clean.reset_index()
-
- new_target_columns = list(df_by_target.keys())
- remaining_categories = set(unique_categories) - set(invalid_categories)
-
- if not len(remaining_categories):
- raise ForecastInputDataError(
- "Stopping forecast operator as there is no data that meets the validation criteria."
- )
- return df_by_target, new_target_columns, remaining_categories
+ def write_pkl(obj, filename, output_dir, storage_options):
+ pkl_path = os.path.join(output_dir, filename)
+ with fsspec.open(
+ pkl_path,
+ "wb",
+ **storage_options,
+ ) as f:
+ cloudpickle.dump(obj, f)


- def _build_metrics_df(y_true, y_pred, column_name):
+ def _build_metrics_df(y_true, y_pred, series_id):
+ if len(y_true) == 0 or len(y_pred) == 0:
+ return pd.DataFrame()
  metrics = dict()
  metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred)
  metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
@@ -369,38 +204,60 @@ def _build_metrics_df(y_true, y_pred, column_name):
  metrics["Explained Variance"] = explained_variance_score(
  y_true=y_true, y_pred=y_pred
  )
- return pd.DataFrame.from_dict(metrics, orient="index", columns=[column_name])
+ return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id])


- def evaluate_train_metrics(
- target_columns, datasets, output, datetime_col, target_col="yhat"
- ):
+ def evaluate_train_metrics(output, metrics_col_name=None):
  """
  Training metrics
+
+ Parameters:
+ output: ForecastOutputs
+
+ metrics_col_name: str
+ Only passed in if the series column was created artifically.
+ When passed in, replaces s_id as the column name in the metrics table
  """
  total_metrics = pd.DataFrame()
- for idx, col in enumerate(target_columns):
+ for s_id in output.list_series_ids():
  try:
- forecast_by_col = output.get_target_category(col)[
+ forecast_by_s_id = output.get_forecast(s_id)[
  ["input_value", "Date", "fitted_value"]
- ].dropna()
- y_true = forecast_by_col["input_value"].values
- y_pred = forecast_by_col["fitted_value"].values
+ ]
+ forecast_by_s_id = forecast_by_s_id.dropna()
+ y_true = forecast_by_s_id["input_value"].values
+ y_pred = forecast_by_s_id["fitted_value"].values
+ drop_na_mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
+ if not drop_na_mask.all(): # There is a missing value
+ if drop_na_mask.any(): # All values are missing
+ logger.debug(
+ f"No fitted values available for series: {s_id}. This will affect the training metrics."
+ )
+ continue
+ logger.debug(
+ f"Missing fitted values for series: {s_id}. This will affect the training metrics."
+ )
+ y_true = y_true[drop_na_mask]
+ y_pred = y_pred[drop_na_mask]
  metrics_df = _build_metrics_df(
- y_true=y_true, y_pred=y_pred, column_name=col
+ y_true=y_true,
+ y_pred=y_pred,
+ series_id=s_id,
  )
  total_metrics = pd.concat([total_metrics, metrics_df], axis=1)
  except Exception as e:
- logger.warn(f"Failed to generate training metrics for target_series: {col}")
+ logger.debug(
+ f"Failed to generate training metrics for target_series: {s_id}"
+ )
  logger.debug(f"Recieved Error Statement: {e}")
  return total_metrics


- def _select_plot_list(fn, target_columns):
+ def _select_plot_list(fn, series_ids):
  import datapane as dp

- blocks = [dp.Plot(fn(i, col), label=col) for i, col in enumerate(target_columns)]
- return dp.Select(blocks=blocks) if len(target_columns) > 1 else blocks[0]
+ blocks = [dp.Plot(fn(s_id=s_id), label=s_id) for s_id in series_ids]
+ return dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0]


  def _add_unit(num, unit):
@@ -409,14 +266,32 @@ def _add_unit(num, unit):

  def get_forecast_plots(
  forecast_output,
- target_columns,
  horizon,
  test_data=None,
  ci_interval_width=0.95,
  ):
- def plot_forecast_plotly(idx, col):
+ def plot_forecast_plotly(s_id):
  fig = go.Figure()
- forecast_i = forecast_output.get_target_category(col)
+ forecast_i = forecast_output.get_forecast(s_id)
+ actual_length = len(forecast_i)
+ if actual_length > RENDER_LIMIT:
+ forecast_i = forecast_i.tail(RENDER_LIMIT)
+ text = (
+ f"<i>To improve rendering speed, subsampled the data from {actual_length}"
+ f" rows to {RENDER_LIMIT} rows for this plot.</i>"
+ )
+ fig.update_layout(
+ annotations=[
+ go.layout.Annotation(
+ x=0.01,
+ y=1.1,
+ xref="paper",
+ yref="paper",
+ text=text,
+ showarrow=False,
+ )
+ ]
+ )
  upper_bound = forecast_output.upper_bound_name
  lower_bound = forecast_output.lower_bound_name
  if upper_bound is not None and lower_bound is not None:
@@ -440,16 +315,20 @@ def get_forecast_plots(
  ),
  ]
  )
- if test_data is not None and col in test_data:
- fig.add_trace(
- go.Scatter(
- x=test_data["ds"],
- y=test_data[col],
- mode="markers",
- marker_color="green",
- name="Actual",
+ if test_data is not None:
+ try:
+ test_data_s_id = test_data.get_data_for_series(s_id)
+ fig.add_trace(
+ go.Scatter(
+ x=test_data_s_id[test_data.dt_column_name],
+ y=test_data_s_id[test_data.target_name],
+ mode="markers",
+ marker_color="green",
+ name="Actual",
+ )
  )
- )
+ except Exception as e:
+ logger.debug(f"Unable to plot test data due to: {e.args}")

  fig.add_trace(
  go.Scatter(
@@ -486,27 +365,7 @@
  )
  return fig

- return _select_plot_list(plot_forecast_plotly, target_columns)
-
-
- def human_time_friendly(seconds):
- TIME_DURATION_UNITS = (
- ("week", 60 * 60 * 24 * 7),
- ("day", 60 * 60 * 24),
- ("hour", 60 * 60),
- ("min", 60),
- )
- if seconds == 0:
- return "inf"
- accumulator = []
- for unit, div in TIME_DURATION_UNITS:
- amount, seconds = divmod(float(seconds), div)
- if amount > 0:
- accumulator.append(
- "{} {}{}".format(int(amount), unit, "" if amount == 1 else "s")
- )
- accumulator.append("{} secs".format(round(seconds, 2)))
- return ", ".join(accumulator)
+ return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids())


  def select_auto_model(
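The plotting changes above build one figure per series id, trim frames longer than RENDER_LIMIT rows (adding an annotation), and wrap the figures in a datapane Select. Below is a minimal standalone sketch of that per-series select pattern, illustrative only and not part of the diff; the RENDER_LIMIT value and the dummy data are assumptions, as the real limit comes from the operator's const module.

# Illustrative sketch only -- not part of the diff.
import datapane as dp
import pandas as pd
from plotly import graph_objects as go

RENDER_LIMIT = 5000  # assumption; the operator reads this from its const module

def plot_series(s_id):
    df = pd.DataFrame(
        {"Date": pd.date_range("2024-01-01", periods=10), "forecast_value": range(10)}
    )
    if len(df) > RENDER_LIMIT:
        df = df.tail(RENDER_LIMIT)  # keep the most recent rows, as the operator does
    fig = go.Figure(go.Scatter(x=df["Date"], y=df["forecast_value"], name=s_id))
    return fig

series_ids = ["Product_Category_117", "Product_Category_118"]
blocks = [dp.Plot(plot_series(s_id), label=s_id) for s_id in series_ids]
report_block = dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0]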
@@ -528,17 +387,10 @@
  str
  The type of the model.
  """
- date_column = operator_config.spec.datetime_column.name
- datetimes = pd.to_datetime(
- datasets.original_user_data[date_column].drop_duplicates()
- )
- freq_in_secs = datetimes.tail().diff().min().total_seconds()
- if datasets.original_additional_data is not None:
- num_of_additional_cols = len(datasets.original_additional_data.columns) - 2
- else:
- num_of_additional_cols = 0
- row_count = len(datasets.original_user_data.index)
- number_of_series = len(datasets.categories)
+ freq_in_secs = datasets.get_datetime_frequency_in_seconds()
+ num_of_additional_cols = len(datasets.get_additional_data_column_names())
+ row_count = datasets.get_num_rows()
+ number_of_series = len(datasets.list_series_ids())
  if (
  num_of_additional_cols < 15
  and row_count < 10000
@@ -547,10 +399,6 @@
  ):
  return SupportedModels.AutoMLX
  elif row_count < 10000 and number_of_series > 10:
- operator_config.spec.model_kwargs["model_list"] = "fast_parallel"
- return SupportedModels.AutoTS
- elif row_count < 20000 and number_of_series > 10:
- operator_config.spec.model_kwargs["model_list"] = "superfast"
  return SupportedModels.AutoTS
  elif row_count > 20000:
  return SupportedModels.NeuralProphet
@@ -558,35 +406,27 @@
  return SupportedModels.NeuralProphet


- def get_frequency_of_datetime(data: pd.DataFrame, dataset_info: ForecastOperatorSpec):
+ def convert_target(target: str, target_col: str):
  """
- Function checks if the data is compatible with the model selected
+ Removes the target_column that got appended to target.

  Parameters
  ------------
- data: pd.DataFrame
- primary dataset
- dataset_info: ForecastOperatorSpec
+ target: str
+ value in target_columns. i.e., "Sales_Product_Category_117"
+
+ target_col: str
+ target_column provided in yaml. i.e., "Sales"

  Returns
  --------
- None
-
+ Original target. i.e., "Product_Category_117"
  """
- date_column = dataset_info.datetime_column.name
- datetimes = pd.to_datetime(
- data[date_column].drop_duplicates(), format=dataset_info.datetime_column.format
- )
- freq = pd.DatetimeIndex(datetimes).inferred_freq
- if dataset_info.model == SupportedModels.AutoMLX:
- freq_in_secs = datetimes.tail().diff().min().total_seconds()
- if abs(freq_in_secs) < 3600:
- message = (
- "{} requires data with a frequency of at least one hour. Please try using a different model,"
- " or select the 'auto' option.".format(SupportedModels.AutoMLX, freq)
- )
- raise Exception(message)
- return freq
+ if target_col is not None and target_col != "":
+ temp = target_col + "_"
+ if temp in target:
+ target = target.replace(temp, "", 1)
+ return target


  def default_signer(**kwargs):
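The hunk above replaces get_frequency_of_datetime with convert_target, which strips the configured target column prefix from a composite series name. A quick, illustrative check of the behaviour described in its docstring (not part of the diff):

# Illustrative only -- mirrors the function added in the hunk above.
def convert_target(target: str, target_col: str):
    if target_col is not None and target_col != "":
        temp = target_col + "_"
        if temp in target:
            target = target.replace(temp, "", 1)
    return target

assert convert_target("Sales_Product_Category_117", "Sales") == "Product_Category_117"
assert convert_target("Sales_Product_Category_117", "") == "Sales_Product_Category_117"  # empty target_col leaves the name unchanged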
@@ -594,13 +434,3 @@ def default_signer(**kwargs):
  from ads.common.auth import default_signer

  return default_signer(**kwargs)
-
-
- # Disable
- def block_print():
- sys.stdout = open(os.devnull, "w")
-
-
- # Restore
- def enable_print():
- sys.stdout = sys.__stdout__