oracle-ads 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ads/aqua/__init__.py +12 -0
- ads/aqua/base.py +324 -0
- ads/aqua/cli.py +19 -0
- ads/aqua/config/deployment_config_defaults.json +9 -0
- ads/aqua/config/resource_limit_names.json +7 -0
- ads/aqua/constants.py +45 -0
- ads/aqua/data.py +40 -0
- ads/aqua/decorator.py +101 -0
- ads/aqua/deployment.py +643 -0
- ads/aqua/dummy_data/icon.txt +1 -0
- ads/aqua/dummy_data/oci_model_deployments.json +56 -0
- ads/aqua/dummy_data/oci_models.json +1 -0
- ads/aqua/dummy_data/readme.md +26 -0
- ads/aqua/evaluation.py +1751 -0
- ads/aqua/exception.py +82 -0
- ads/aqua/extension/__init__.py +40 -0
- ads/aqua/extension/base_handler.py +138 -0
- ads/aqua/extension/common_handler.py +21 -0
- ads/aqua/extension/deployment_handler.py +202 -0
- ads/aqua/extension/evaluation_handler.py +135 -0
- ads/aqua/extension/finetune_handler.py +66 -0
- ads/aqua/extension/model_handler.py +59 -0
- ads/aqua/extension/ui_handler.py +201 -0
- ads/aqua/extension/utils.py +23 -0
- ads/aqua/finetune.py +579 -0
- ads/aqua/job.py +29 -0
- ads/aqua/model.py +819 -0
- ads/aqua/training/__init__.py +4 -0
- ads/aqua/training/exceptions.py +459 -0
- ads/aqua/ui.py +453 -0
- ads/aqua/utils.py +715 -0
- ads/cli.py +37 -6
- ads/common/auth.py +7 -0
- ads/common/decorator/__init__.py +7 -3
- ads/common/decorator/require_nonempty_arg.py +65 -0
- ads/common/object_storage_details.py +166 -7
- ads/common/oci_client.py +18 -1
- ads/common/oci_logging.py +2 -2
- ads/common/oci_mixin.py +4 -5
- ads/common/serializer.py +34 -5
- ads/common/utils.py +75 -10
- ads/config.py +40 -1
- ads/dataset/correlation_plot.py +10 -12
- ads/jobs/ads_job.py +43 -25
- ads/jobs/builders/infrastructure/base.py +4 -2
- ads/jobs/builders/infrastructure/dsc_job.py +49 -39
- ads/jobs/builders/runtimes/base.py +71 -1
- ads/jobs/builders/runtimes/container_runtime.py +4 -4
- ads/jobs/builders/runtimes/pytorch_runtime.py +10 -63
- ads/jobs/templates/driver_pytorch.py +27 -10
- ads/model/artifact_downloader.py +84 -14
- ads/model/artifact_uploader.py +25 -23
- ads/model/datascience_model.py +388 -38
- ads/model/deployment/model_deployment.py +10 -2
- ads/model/generic_model.py +8 -0
- ads/model/model_file_description_schema.json +68 -0
- ads/model/model_metadata.py +1 -1
- ads/model/service/oci_datascience_model.py +34 -5
- ads/opctl/config/merger.py +2 -2
- ads/opctl/operator/__init__.py +3 -1
- ads/opctl/operator/cli.py +7 -1
- ads/opctl/operator/cmd.py +3 -3
- ads/opctl/operator/common/errors.py +2 -1
- ads/opctl/operator/common/operator_config.py +22 -3
- ads/opctl/operator/common/utils.py +16 -0
- ads/opctl/operator/lowcode/anomaly/MLoperator +15 -0
- ads/opctl/operator/lowcode/anomaly/README.md +209 -0
- ads/opctl/operator/lowcode/anomaly/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/__main__.py +104 -0
- ads/opctl/operator/lowcode/anomaly/cmd.py +35 -0
- ads/opctl/operator/lowcode/anomaly/const.py +88 -0
- ads/opctl/operator/lowcode/anomaly/environment.yaml +12 -0
- ads/opctl/operator/lowcode/anomaly/model/__init__.py +5 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +147 -0
- ads/opctl/operator/lowcode/anomaly/model/automlx.py +89 -0
- ads/opctl/operator/lowcode/anomaly/model/autots.py +103 -0
- ads/opctl/operator/lowcode/anomaly/model/base_model.py +354 -0
- ads/opctl/operator/lowcode/anomaly/model/factory.py +67 -0
- ads/opctl/operator/lowcode/anomaly/model/tods.py +119 -0
- ads/opctl/operator/lowcode/anomaly/operator_config.py +105 -0
- ads/opctl/operator/lowcode/anomaly/schema.yaml +359 -0
- ads/opctl/operator/lowcode/anomaly/utils.py +81 -0
- ads/opctl/operator/lowcode/common/__init__.py +5 -0
- ads/opctl/operator/lowcode/common/const.py +10 -0
- ads/opctl/operator/lowcode/common/data.py +96 -0
- ads/opctl/operator/lowcode/common/errors.py +41 -0
- ads/opctl/operator/lowcode/common/transformations.py +191 -0
- ads/opctl/operator/lowcode/common/utils.py +250 -0
- ads/opctl/operator/lowcode/forecast/README.md +3 -2
- ads/opctl/operator/lowcode/forecast/__main__.py +18 -2
- ads/opctl/operator/lowcode/forecast/cmd.py +8 -7
- ads/opctl/operator/lowcode/forecast/const.py +17 -1
- ads/opctl/operator/lowcode/forecast/environment.yaml +3 -2
- ads/opctl/operator/lowcode/forecast/model/arima.py +106 -117
- ads/opctl/operator/lowcode/forecast/model/automlx.py +204 -180
- ads/opctl/operator/lowcode/forecast/model/autots.py +144 -253
- ads/opctl/operator/lowcode/forecast/model/base_model.py +326 -259
- ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +325 -176
- ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +293 -237
- ads/opctl/operator/lowcode/forecast/model/prophet.py +191 -208
- ads/opctl/operator/lowcode/forecast/operator_config.py +24 -33
- ads/opctl/operator/lowcode/forecast/schema.yaml +116 -29
- ads/opctl/operator/lowcode/forecast/utils.py +186 -356
- ads/opctl/operator/lowcode/pii/model/guardrails.py +18 -15
- ads/opctl/operator/lowcode/pii/model/report.py +7 -7
- ads/opctl/operator/lowcode/pii/operator_config.py +1 -8
- ads/opctl/operator/lowcode/pii/utils.py +0 -82
- ads/opctl/operator/runtime/runtime.py +3 -2
- ads/telemetry/base.py +62 -0
- ads/telemetry/client.py +105 -0
- ads/telemetry/telemetry.py +6 -3
- {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/METADATA +44 -7
- {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/RECORD +116 -59
- ads/opctl/operator/lowcode/forecast/model/transformations.py +0 -125
- {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/LICENSE.txt +0 -0
- {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/WHEEL +0 -0
- {oracle_ads-2.10.0.dist-info → oracle_ads-2.11.0.dist-info}/entry_points.txt +0 -0
ads/opctl/operator/lowcode/forecast/utils.py
@@ -11,6 +11,7 @@ from typing import List
 import fsspec
 import numpy as np
 import pandas as pd
+import cloudpickle
 import plotly.express as px
 from plotly import graph_objects as go
 from sklearn.metrics import (
@@ -28,9 +29,13 @@ from ads.common.object_storage_details import ObjectStorageDetails
 from ads.dataset.label_encoder import DataFrameLabelEncoder
 from ads.opctl import logger
 
-from .const import SupportedMetrics, SupportedModels
+from .const import SupportedMetrics, SupportedModels, RENDER_LIMIT
 from .errors import ForecastInputDataError, ForecastSchemaYamlError
 from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig
+from ads.opctl.operator.lowcode.common.utils import merge_category_columns
+from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns
+
+# from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData, ForecastOutput
 
 
 def _label_encode_dataframe(df, no_encode=set()):
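
The imports above switch the operator to the shared `merge_category_columns` helper from `ads.opctl.operator.lowcode.common.utils`. Judging from the `_merge_category_columns` implementation removed later in this diff, merging the category columns into a single series id is a per-row `"__"`-join. A minimal illustrative sketch (column names and values are made up; the shared helper's exact signature is not shown in this diff):

```python
import pandas as pd

# Hypothetical historical data with two category columns; only the
# "__"-join pattern is taken from the removed _merge_category_columns.
df = pd.DataFrame(
    {"Store": ["S1", "S2"], "Product": ["P7", "P9"], "Sales": [10, 12]}
)

series_id = df.apply(
    lambda row: "__".join(str(row[col]) for col in ["Store", "Product"]), axis=1
)
print(series_id.tolist())  # ['S1__P7', 'S2__P9']
```
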
@@ -55,11 +60,8 @@ def smape(actual, predicted) -> float:
 
 
 def _build_metrics_per_horizon(
-    data: pd.DataFrame,
-    output: pd.DataFrame,
-    target_columns: List[str],
-    target_col: str,
-    horizon_periods: int,
+    test_data: "TestData",
+    output: "ForecastOutput",
 ) -> pd.DataFrame:
     """
     Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon
@@ -70,12 +72,6 @@ def _build_metrics_per_horizon(
         Dataframe that has the actual data
     output: Pandas Dataframe
         Dataframe that has the forecasted data
-    target_columns: List
-        List of target category columns
-    target_col: str
-        Target column name (yhat)
-    horizon_periods: int
-        Horizon Periods
 
     Returns
     --------
@@ -85,279 +81,118 @@ def _build_metrics_per_horizon(
     """
     Assumptions:
     data and output have all the target columns.
-    yhats in output are in the same order as in
+    yhats in output are in the same order as in series_ids.
    Test data might not have sorted dates and the order of series also might differ.
     """
 
-
-
-
-
+    test_df = (
+        test_data.get_data_long()
+        .rename({test_data.dt_column_name: ForecastOutputColumns.DATE}, axis=1)
+        .set_index([ForecastOutputColumns.DATE, ForecastOutputColumns.SERIES])
+        .sort_index()
+    )
+    forecast_df = (
+        output.get_horizon_long()
+        .set_index([ForecastOutputColumns.DATE, ForecastOutputColumns.SERIES])
+        .sort_index()
+    )
 
-
-
-    for cat in output.list_categories():
-        forecast_i = output.get_category(cat)[["Date", "forecast_value"]]
-        forecast_i = forecast_i[forecast_i["Date"].isin(actuals_df["ds"])]
-        forecasts_df = pd.concat([forecasts_df, forecast_i.set_index("Date")], axis=1)
+    dates = test_df.index.get_level_values(0).unique()
+    common_idx = test_df.index.intersection(forecast_df.index)
 
-
-
+    if len(common_idx) != len(forecast_df.index):
+        if len(dates) > output.horizon:
+            logger.debug(
+                f"Found more unique dates ({len(dates)}) in the Test Data than expected given the horizon ({output.horizon})."
+            )
+        elif len(dates) < output.horizon:
+            logger.debug(
+                f"Found fewer unique dates ({len(dates)}) in the Test Data than expected given the horizon ({output.horizon}). This will impact the metrics."
+            )
+        elif test_df.index.get_level_values(1).unique() > output.list_series_ids():
+            logger.debug(
+                f"Found more Series Ids in test data ({len(dates)}) expected from the historical data ({output.list_series_ids()})."
+            )
+        else:
+            logger.debug(
+                f"Found fewer Series Ids in test data ({len(dates)}) expected from the historical data ({output.list_series_ids()}). This will impact the metrics."
+            )
 
-
-
+    test_df = test_df.loc[common_idx]
+    forecast_df = forecast_df.loc[common_idx]
 
-    totals =
+    totals = test_df.sum(numeric_only=True)
     wmape_weights = np.array((totals / totals.sum()).values)
 
-
-
-
-
-            SupportedMetrics.MEAN_SMAPE,
-            SupportedMetrics.MEDIAN_SMAPE,
-            SupportedMetrics.MEAN_MAPE,
-            SupportedMetrics.MEDIAN_MAPE,
-            SupportedMetrics.MEAN_WMAPE,
-            SupportedMetrics.MEDIAN_WMAPE,
+    metrics_df = pd.DataFrame()
+    for date in dates:
+        y_true = test_df.xs(date, level=ForecastOutputColumns.DATE)[
+            test_data.target_name
         ]
-
-
-
-
-
-
+        y_pred = forecast_df.xs(date, level=ForecastOutputColumns.DATE)[
+            ForecastOutputColumns.FORECAST_VALUE
+        ]
+        y_true = np.array(y_true.values)
+        y_pred = np.array(y_pred.values)
+
+        drop_na_mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
+        if not drop_na_mask.all():  # There is a missing value
+            if drop_na_mask.any():  # All values are missing
+                logger.debug(
+                    f"No test data available for date: {date}. This will affect the test metrics."
+                )
+                continue
+            logger.debug(
+                f"Missing test data for date: {date}. This will affect the test metrics."
+            )
+            y_true = y_true[drop_na_mask]
+            y_pred = y_pred[drop_na_mask]
+        smapes = smape(actual=y_true, predicted=y_pred)
+        mapes = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
+        wmapes = mapes * wmape_weights
 
-
-            [smape(actual=y_t, predicted=y_p) for y_t, y_p in zip(y_true, y_pred)]
-        )
-        mapes = np.array(
+        metrics_df = pd.concat(
             [
-
-
+                metrics_df,
+                pd.DataFrame(
+                    {
+                        SupportedMetrics.MEAN_SMAPE: np.mean(smapes),
+                        SupportedMetrics.MEDIAN_SMAPE: np.median(smapes),
+                        SupportedMetrics.MEAN_MAPE: np.mean(mapes),
+                        SupportedMetrics.MEDIAN_MAPE: np.median(mapes),
+                        SupportedMetrics.MEAN_WMAPE: np.mean(wmapes),
+                        SupportedMetrics.MEDIAN_WMAPE: np.median(wmapes),
+                    },
+                    index=[date],
+                ),
             ]
         )
-        wmapes = np.array([mape * weight for mape, weight in zip(mapes, wmape_weights)])
-
-        metrics_row = {
-            SupportedMetrics.MEAN_SMAPE: np.mean(smapes),
-            SupportedMetrics.MEDIAN_SMAPE: np.median(smapes),
-            SupportedMetrics.MEAN_MAPE: np.mean(mapes),
-            SupportedMetrics.MEDIAN_MAPE: np.median(mapes),
-            SupportedMetrics.MEAN_WMAPE: np.mean(wmapes),
-            SupportedMetrics.MEDIAN_WMAPE: np.median(wmapes),
-        }
-
-        metrics_df = pd.concat(
-            [metrics_df, pd.DataFrame(metrics_row, index=[actuals_df.index[i]])],
-        )
-
     return metrics_df
 
 
-def _call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs):
-
-
-
-        return pd_fn(filename, **kwargs)
-
-    storage_options = storage_options or (
-        default_signer() if ObjectStorageDetails.is_oci_path(filename) else {}
-    )
-
-    return pd_fn(filename, storage_options=storage_options, **kwargs)
-
-
-def _load_data(filename, format, storage_options=None, columns=None, **kwargs):
-    if not format:
-        _, format = os.path.splitext(filename)
-        format = format[1:]
-    if format in ["json", "clipboard", "excel", "csv", "feather", "hdf"]:
-        read_fn = getattr(pd, f"read_{format}")
-        data = _call_pandas_fsspec(read_fn, filename, storage_options=storage_options)
-    elif format in ["tsv"]:
-        data = _call_pandas_fsspec(
-            pd.read_csv, filename, storage_options=storage_options, sep="\t"
-        )
-    else:
-        raise ForecastInputDataError(f"Unrecognized format: {format}")
-    if columns:
-        # keep only these columns, done after load because only CSV supports stream filtering
-        data = data[columns]
-    return data
-
-
-def _write_data(data, filename, format, storage_options, index=False, **kwargs):
-    if not format:
-        _, format = os.path.splitext(filename)
-        format = format[1:]
-    if format in ["json", "clipboard", "excel", "csv", "feather", "hdf"]:
-        write_fn = getattr(data, f"to_{format}")
-        return _call_pandas_fsspec(
-            write_fn, filename, index=index, storage_options=storage_options
-        )
-    raise ForecastInputDataError(f"Unrecognized format: {format}")
-
-
-def _merge_category_columns(data, target_category_columns):
-    result = data.apply(
-        lambda x: "__".join([str(x[col]) for col in target_category_columns]), axis=1
-    )
-    return result if not result.empty else pd.Series([], dtype=str)
-
-
-def _clean_data(data, target_column, datetime_column, target_category_columns=None):
-    if target_category_columns is not None:
-        data["__Series__"] = _merge_category_columns(data, target_category_columns)
-        unique_categories = data["__Series__"].unique()
-
-        df = pd.DataFrame()
-        new_target_columns = []
-
-        for cat in unique_categories:
-            data_cat = data[data["__Series__"] == cat].rename(
-                {target_column: f"{target_column}_{cat}"}, axis=1
-            )
-            data_cat_clean = data_cat.drop("__Series__", axis=1).set_index(
-                datetime_column
-            )
-            df = pd.concat([df, data_cat_clean], axis=1)
-            new_target_columns.append(f"{target_column}_{cat}")
-        df = df.reset_index()
-
-        return df.fillna(0), new_target_columns
-
-    raise ForecastSchemaYamlError(
-        f"Either target_columns, target_category_columns, or datetime_column not specified."
-    )
-
-
-def _validate_and_clean_data(
-    cat: str, horizon: int, primary: pd.DataFrame, additional: pd.DataFrame
-):
-    """
-    Checks compatibility between primary and additional dataframe for a category.
-
-    Parameters
-    ----------
-    cat: (str)
-        Category for which data is being validated.
-    horizon: (int)
-        horizon value for the forecast.
-    primary: (pd.DataFrame)
-        primary dataframe.
-    additional: (pd.DataFrame)
-        additional dataframe.
-
-    Returns
-    -------
-    (pd.DataFrame, pd.DataFrame) or (None, None)
-        Updated primary and additional dataframe or None values if the validation criteria does not satisfy.
-    """
-    # Additional data should have future values for horizon
-    data_row_count = primary.shape[0]
-    data_add_row_count = additional.shape[0]
-    additional_surplus = data_add_row_count - horizon - data_row_count
-    if additional_surplus < 0:
-        logger.warn(
-            "Forecast for {} will not be generated since additional data has fewer values({}) than"
-            " horizon({}) + primary data({})".format(
-                cat, data_add_row_count, horizon, data_row_count
-            )
-        )
-        return None, None
-    elif additional_surplus > 0:
-        # Removing surplus future data in additional
-        additional.drop(additional.tail(additional_surplus).index, inplace=True)
-
-    # Dates in primary data should be subset of additional data
-    dates_in_data = primary.index.tolist()
-    dates_in_additional = additional.index.tolist()
-    if not set(dates_in_data).issubset(set(dates_in_additional)):
-        logger.warn(
-            "Forecast for {} will not be generated since the dates in primary and additional do not"
-            " match".format(cat)
-        )
-        return None, None
-    return primary, additional
+def load_pkl(filepath):
+    storage_options = dict()
+    if ObjectStorageDetails.is_oci_path(filepath):
+        storage_options = default_signer()
 
+    with fsspec.open(filepath, "rb", **storage_options) as f:
+        return cloudpickle.load(f)
+    return None
 
-def _build_indexed_datasets(
-    data,
-    target_column,
-    datetime_column,
-    horizon,
-    target_category_columns=None,
-    additional_data=None,
-    metadata_data=None,
-):
-    df_by_target = dict()
-    categories = []
 
-
-
-
-
-
-
-
-
-                ],
-                axis=1,
-            ).reset_index()
-        return df_by_target, target_column, categories
-
-    data["__Series__"] = _merge_category_columns(data, target_category_columns)
-    unique_categories = data["__Series__"].unique()
-    invalid_categories = []
-
-    if additional_data is not None and target_column in additional_data.columns:
-        logger.warn(f"Dropping column '{target_column}' from additional_data")
-        additional_data.drop(target_column, axis=1, inplace=True)
-    for cat in unique_categories:
-        data_by_cat = data[data["__Series__"] == cat].rename(
-            {target_column: f"{target_column}_{cat}"}, axis=1
-        )
-        data_by_cat_clean = (
-            data_by_cat.drop(target_category_columns + ["__Series__"], axis=1)
-            .set_index(datetime_column)
-            .fillna(0)
-        )
-        if additional_data is not None:
-            additional_data["__Series__"] = _merge_category_columns(
-                additional_data, target_category_columns
-            )
-            data_add_by_cat = additional_data[
-                additional_data["__Series__"] == cat
-            ].rename({target_column: f"{target_column}_{cat}"}, axis=1)
-            data_add_by_cat_clean = (
-                data_add_by_cat.drop(target_category_columns + ["__Series__"], axis=1)
-                .set_index(datetime_column)
-                .fillna(0)
-            )
-            valid_primary, valid_add = _validate_and_clean_data(
-                cat, horizon, data_by_cat_clean, data_add_by_cat_clean
-            )
-
-            if valid_primary is None:
-                invalid_categories.append(cat)
-                data_by_cat_clean = None
-            else:
-                data_by_cat_clean = pd.concat([valid_add, valid_primary], axis=1)
-        if data_by_cat_clean is not None:
-            df_by_target[f"{target_column}_{cat}"] = data_by_cat_clean.reset_index()
-
-    new_target_columns = list(df_by_target.keys())
-    remaining_categories = set(unique_categories) - set(invalid_categories)
-
-    if not len(remaining_categories):
-        raise ForecastInputDataError(
-            "Stopping forecast operator as there is no data that meets the validation criteria."
-        )
-    return df_by_target, new_target_columns, remaining_categories
+def write_pkl(obj, filename, output_dir, storage_options):
+    pkl_path = os.path.join(output_dir, filename)
+    with fsspec.open(
+        pkl_path,
+        "wb",
+        **storage_options,
+    ) as f:
+        cloudpickle.dump(obj, f)
 
 
-def _build_metrics_df(y_true, y_pred, column_name):
+def _build_metrics_df(y_true, y_pred, series_id):
+    if len(y_true) == 0 or len(y_pred) == 0:
+        return pd.DataFrame()
     metrics = dict()
     metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred)
     metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
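
The data loading, cleaning, and validation helpers above are dropped here, presumably in favor of the new `ads/opctl/operator/lowcode/common/` modules listed at the top of this diff, while two new helpers, `load_pkl` and `write_pkl`, stream pickles through `fsspec` and `cloudpickle` and fall back to `default_signer()` for OCI Object Storage paths. A minimal usage sketch, assuming the helpers are importable from `ads.opctl.operator.lowcode.forecast.utils` and using a hypothetical bucket URI:

```python
from ads.common.auth import default_signer
from ads.opctl.operator.lowcode.forecast.utils import load_pkl, write_pkl

model = {"example": "any picklable object"}  # placeholder object

# write_pkl joins output_dir and filename and opens the target via fsspec,
# so the same call shape works for local paths and oci:// URIs.
write_pkl(
    obj=model,
    filename="model.pkl",
    output_dir="oci://my-bucket@my-namespace/forecast/",  # hypothetical URI
    storage_options=default_signer(),
)

# load_pkl resolves default_signer() itself when given an oci:// path.
restored = load_pkl("oci://my-bucket@my-namespace/forecast/model.pkl")
```
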
@@ -369,38 +204,60 @@ def _build_metrics_df(y_true, y_pred, column_name):
     metrics["Explained Variance"] = explained_variance_score(
         y_true=y_true, y_pred=y_pred
     )
-    return pd.DataFrame.from_dict(metrics, orient="index", columns=[column_name])
+    return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id])
 
 
-def evaluate_train_metrics(
-    target_columns, datasets, output, datetime_col, target_col="yhat"
-):
+def evaluate_train_metrics(output, metrics_col_name=None):
     """
     Training metrics
+
+    Parameters:
+    output: ForecastOutputs
+
+    metrics_col_name: str
+        Only passed in if the series column was created artifically.
+        When passed in, replaces s_id as the column name in the metrics table
     """
     total_metrics = pd.DataFrame()
-    for
+    for s_id in output.list_series_ids():
         try:
-
+            forecast_by_s_id = output.get_forecast(s_id)[
                 ["input_value", "Date", "fitted_value"]
-            ]
-
-
+            ]
+            forecast_by_s_id = forecast_by_s_id.dropna()
+            y_true = forecast_by_s_id["input_value"].values
+            y_pred = forecast_by_s_id["fitted_value"].values
+            drop_na_mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
+            if not drop_na_mask.all():  # There is a missing value
+                if drop_na_mask.any():  # All values are missing
+                    logger.debug(
+                        f"No fitted values available for series: {s_id}. This will affect the training metrics."
+                    )
+                    continue
+                logger.debug(
+                    f"Missing fitted values for series: {s_id}. This will affect the training metrics."
+                )
+                y_true = y_true[drop_na_mask]
+                y_pred = y_pred[drop_na_mask]
             metrics_df = _build_metrics_df(
-                y_true=y_true,
+                y_true=y_true,
+                y_pred=y_pred,
+                series_id=s_id,
             )
             total_metrics = pd.concat([total_metrics, metrics_df], axis=1)
         except Exception as e:
-            logger.
+            logger.debug(
+                f"Failed to generate training metrics for target_series: {s_id}"
+            )
             logger.debug(f"Recieved Error Statement: {e}")
     return total_metrics
 
 
-def _select_plot_list(fn,
+def _select_plot_list(fn, series_ids):
     import datapane as dp
 
-    blocks = [dp.Plot(fn(
-    return dp.Select(blocks=blocks) if len(
+    blocks = [dp.Plot(fn(s_id=s_id), label=s_id) for s_id in series_ids]
+    return dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0]
 
 
 def _add_unit(num, unit):
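
After this change the training metrics are keyed by series id rather than by the old synthetic target columns: `_build_metrics_df` returns a one-column frame per series and `evaluate_train_metrics` concatenates them along `axis=1`. A toy sketch of that layout; `toy_metrics` is a stand-in, not the package's `smape`/`_build_metrics_df`, and the series ids are invented:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error


def toy_metrics(y_true, y_pred, series_id):
    # Symmetric MAPE written out directly; the operator uses its own smape().
    smape = np.mean(
        np.abs(y_pred - y_true) / ((np.abs(y_true) + np.abs(y_pred)) / 2)
    )
    metrics = {
        "sMAPE": smape,
        "MAPE": mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred),
    }
    # One column per series id, one row per metric name.
    return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id])


total_metrics = pd.concat(
    [
        toy_metrics(np.array([10.0, 12.0]), np.array([11.0, 12.5]), "Sales_A"),
        toy_metrics(np.array([5.0, 7.0]), np.array([4.5, 7.5]), "Sales_B"),
    ],
    axis=1,
)
print(total_metrics)  # rows: sMAPE, MAPE; columns: Sales_A, Sales_B
```
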
@@ -409,14 +266,32 @@ def _add_unit(num, unit):
 
 def get_forecast_plots(
     forecast_output,
-    target_columns,
     horizon,
     test_data=None,
     ci_interval_width=0.95,
 ):
-    def plot_forecast_plotly(
+    def plot_forecast_plotly(s_id):
         fig = go.Figure()
-        forecast_i = forecast_output.
+        forecast_i = forecast_output.get_forecast(s_id)
+        actual_length = len(forecast_i)
+        if actual_length > RENDER_LIMIT:
+            forecast_i = forecast_i.tail(RENDER_LIMIT)
+            text = (
+                f"<i>To improve rendering speed, subsampled the data from {actual_length}"
+                f" rows to {RENDER_LIMIT} rows for this plot.</i>"
+            )
+            fig.update_layout(
+                annotations=[
+                    go.layout.Annotation(
+                        x=0.01,
+                        y=1.1,
+                        xref="paper",
+                        yref="paper",
+                        text=text,
+                        showarrow=False,
+                    )
+                ]
+            )
         upper_bound = forecast_output.upper_bound_name
         lower_bound = forecast_output.lower_bound_name
         if upper_bound is not None and lower_bound is not None:
@@ -440,16 +315,20 @@ def get_forecast_plots(
                     ),
                 ]
             )
-        if test_data is not None
-
-
-
-
-
-
-
+        if test_data is not None:
+            try:
+                test_data_s_id = test_data.get_data_for_series(s_id)
+                fig.add_trace(
+                    go.Scatter(
+                        x=test_data_s_id[test_data.dt_column_name],
+                        y=test_data_s_id[test_data.target_name],
+                        mode="markers",
+                        marker_color="green",
+                        name="Actual",
+                    )
                 )
-
+            except Exception as e:
+                logger.debug(f"Unable to plot test data due to: {e.args}")
 
         fig.add_trace(
             go.Scatter(
@@ -486,27 +365,7 @@ def get_forecast_plots(
         )
         return fig
 
-    return _select_plot_list(plot_forecast_plotly,
-
-
-def human_time_friendly(seconds):
-    TIME_DURATION_UNITS = (
-        ("week", 60 * 60 * 24 * 7),
-        ("day", 60 * 60 * 24),
-        ("hour", 60 * 60),
-        ("min", 60),
-    )
-    if seconds == 0:
-        return "inf"
-    accumulator = []
-    for unit, div in TIME_DURATION_UNITS:
-        amount, seconds = divmod(float(seconds), div)
-        if amount > 0:
-            accumulator.append(
-                "{} {}{}".format(int(amount), unit, "" if amount == 1 else "s")
-            )
-    accumulator.append("{} secs".format(round(seconds, 2)))
-    return ", ".join(accumulator)
+    return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids())
 
 
 def select_auto_model(
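
`get_forecast_plots` and `_select_plot_list` now key everything on series ids as well: one Plotly figure per series, wrapped in a `datapane.Select` when there is more than one series, and subsampled to `RENDER_LIMIT` rows for large outputs. A sketch of that report pattern; the figure builder and series ids below are placeholders, not the operator's `plot_forecast_plotly`:

```python
import datapane as dp
import plotly.express as px


def make_plot(s_id):
    # Placeholder figure; the operator plots forecast_output.get_forecast(s_id).
    return px.line(x=[1, 2, 3], y=[1, 4, 9], title=f"Forecast for {s_id}")


series_ids = ["Sales_A", "Sales_B"]  # illustrative only
blocks = [dp.Plot(make_plot(s_id=s_id), label=s_id) for s_id in series_ids]
view = dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0]
```
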
@@ -528,17 +387,10 @@ def select_auto_model(
     str
         The type of the model.
     """
-
-
-
-    )
-    freq_in_secs = datetimes.tail().diff().min().total_seconds()
-    if datasets.original_additional_data is not None:
-        num_of_additional_cols = len(datasets.original_additional_data.columns) - 2
-    else:
-        num_of_additional_cols = 0
-    row_count = len(datasets.original_user_data.index)
-    number_of_series = len(datasets.categories)
+    freq_in_secs = datasets.get_datetime_frequency_in_seconds()
+    num_of_additional_cols = len(datasets.get_additional_data_column_names())
+    row_count = datasets.get_num_rows()
+    number_of_series = len(datasets.list_series_ids())
     if (
         num_of_additional_cols < 15
         and row_count < 10000
@@ -547,10 +399,6 @@ def select_auto_model(
     ):
         return SupportedModels.AutoMLX
     elif row_count < 10000 and number_of_series > 10:
-        operator_config.spec.model_kwargs["model_list"] = "fast_parallel"
-        return SupportedModels.AutoTS
-    elif row_count < 20000 and number_of_series > 10:
-        operator_config.spec.model_kwargs["model_list"] = "superfast"
         return SupportedModels.AutoTS
     elif row_count > 20000:
         return SupportedModels.NeuralProphet
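
With the `fast_parallel`/`superfast` model_list overrides gone, the auto-selection reduces to a few row/series thresholds. A condensed, hedged restatement of only the branches visible in the surrounding hunks (the AutoMLX branch has additional conditions elided by this diff, and the return values stand in for the `SupportedModels` members):

```python
def select_auto_model_sketch(num_of_additional_cols, row_count, number_of_series):
    # Only the conditions shown in the diff context; not the full implementation.
    if num_of_additional_cols < 15 and row_count < 10000:  # plus elided checks
        return "AutoMLX"
    elif row_count < 10000 and number_of_series > 10:
        return "AutoTS"
    elif row_count > 20000:
        return "NeuralProphet"
    return "NeuralProphet"
```
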
@@ -558,35 +406,27 @@ def select_auto_model(
         return SupportedModels.NeuralProphet
 
 
-def
+def convert_target(target: str, target_col: str):
     """
-
+    Removes the target_column that got appended to target.
 
     Parameters
     ------------
-
-
-
+    target: str
+        value in target_columns. i.e., "Sales_Product_Category_117"
+
+    target_col: str
+        target_column provided in yaml. i.e., "Sales"
 
     Returns
     --------
-
-
+        Original target. i.e., "Product_Category_117"
     """
-
-
-
-
-
-    if dataset_info.model == SupportedModels.AutoMLX:
-        freq_in_secs = datetimes.tail().diff().min().total_seconds()
-        if abs(freq_in_secs) < 3600:
-            message = (
-                "{} requires data with a frequency of at least one hour. Please try using a different model,"
-                " or select the 'auto' option.".format(SupportedModels.AutoMLX, freq)
-            )
-            raise Exception(message)
-    return freq
+    if target_col is not None and target_col != "":
+        temp = target_col + "_"
+        if temp in target:
+            target = target.replace(temp, "", 1)
+    return target
 
 
 def default_signer(**kwargs):
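
A quick check of `convert_target` against the docstring's own example values, with the function body copied from the hunk above:

```python
def convert_target(target: str, target_col: str):
    if target_col is not None and target_col != "":
        temp = target_col + "_"
        if temp in target:
            target = target.replace(temp, "", 1)
    return target


assert convert_target("Sales_Product_Category_117", "Sales") == "Product_Category_117"
assert convert_target("Sales_Product_Category_117", "") == "Sales_Product_Category_117"
```
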
@@ -594,13 +434,3 @@ def default_signer(**kwargs):
     from ads.common.auth import default_signer
 
     return default_signer(**kwargs)
-
-
-# Disable
-def block_print():
-    sys.stdout = open(os.devnull, "w")
-
-
-# Restore
-def enable_print():
-    sys.stdout = sys.__stdout__