oracle-ads 2.11.14__py3-none-any.whl → 2.11.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ads/aqua/common/entities.py +17 -0
- ads/aqua/common/enums.py +5 -1
- ads/aqua/common/utils.py +109 -22
- ads/aqua/config/config.py +1 -1
- ads/aqua/config/deployment_config_defaults.json +29 -1
- ads/aqua/config/resource_limit_names.json +1 -0
- ads/aqua/constants.py +35 -18
- ads/aqua/evaluation/entities.py +0 -1
- ads/aqua/evaluation/evaluation.py +165 -121
- ads/aqua/extension/common_ws_msg_handler.py +57 -0
- ads/aqua/extension/deployment_handler.py +14 -13
- ads/aqua/extension/deployment_ws_msg_handler.py +54 -0
- ads/aqua/extension/errors.py +1 -1
- ads/aqua/extension/evaluation_handler.py +4 -7
- ads/aqua/extension/evaluation_ws_msg_handler.py +28 -10
- ads/aqua/extension/model_handler.py +31 -6
- ads/aqua/extension/models/ws_models.py +78 -3
- ads/aqua/extension/models_ws_msg_handler.py +49 -0
- ads/aqua/extension/ui_websocket_handler.py +7 -1
- ads/aqua/model/entities.py +17 -9
- ads/aqua/model/model.py +260 -90
- ads/aqua/modeldeployment/constants.py +0 -16
- ads/aqua/modeldeployment/deployment.py +97 -74
- ads/aqua/modeldeployment/entities.py +9 -20
- ads/aqua/ui.py +152 -28
- ads/common/object_storage_details.py +2 -5
- ads/common/serializer.py +2 -3
- ads/jobs/builders/infrastructure/dsc_job.py +29 -3
- ads/jobs/builders/infrastructure/dsc_job_runtime.py +74 -27
- ads/jobs/builders/runtimes/container_runtime.py +83 -4
- ads/opctl/operator/common/operator_config.py +1 -0
- ads/opctl/operator/lowcode/anomaly/README.md +3 -3
- ads/opctl/operator/lowcode/anomaly/__main__.py +5 -6
- ads/opctl/operator/lowcode/anomaly/const.py +9 -0
- ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py +6 -2
- ads/opctl/operator/lowcode/anomaly/model/base_model.py +51 -26
- ads/opctl/operator/lowcode/anomaly/model/factory.py +41 -13
- ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +79 -0
- ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +79 -0
- ads/opctl/operator/lowcode/anomaly/operator_config.py +1 -0
- ads/opctl/operator/lowcode/anomaly/schema.yaml +16 -2
- ads/opctl/operator/lowcode/anomaly/utils.py +16 -13
- ads/opctl/operator/lowcode/common/data.py +2 -1
- ads/opctl/operator/lowcode/common/errors.py +6 -0
- ads/opctl/operator/lowcode/common/transformations.py +37 -9
- ads/opctl/operator/lowcode/common/utils.py +32 -10
- ads/opctl/operator/lowcode/forecast/model/base_model.py +21 -13
- ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +14 -18
- ads/opctl/operator/lowcode/forecast/model_evaluator.py +15 -4
- ads/opctl/operator/lowcode/forecast/schema.yaml +9 -0
- ads/opctl/operator/lowcode/recommender/MLoperator +16 -0
- ads/opctl/operator/lowcode/recommender/README.md +206 -0
- ads/opctl/operator/lowcode/recommender/__init__.py +5 -0
- ads/opctl/operator/lowcode/recommender/__main__.py +82 -0
- ads/opctl/operator/lowcode/recommender/cmd.py +33 -0
- ads/opctl/operator/lowcode/recommender/constant.py +25 -0
- ads/opctl/operator/lowcode/recommender/environment.yaml +11 -0
- ads/opctl/operator/lowcode/recommender/model/base_model.py +198 -0
- ads/opctl/operator/lowcode/recommender/model/factory.py +58 -0
- ads/opctl/operator/lowcode/recommender/model/recommender_dataset.py +25 -0
- ads/opctl/operator/lowcode/recommender/model/svd.py +88 -0
- ads/opctl/operator/lowcode/recommender/operator_config.py +81 -0
- ads/opctl/operator/lowcode/recommender/schema.yaml +265 -0
- ads/opctl/operator/lowcode/recommender/utils.py +13 -0
- ads/pipeline/ads_pipeline_run.py +13 -2
- {oracle_ads-2.11.14.dist-info → oracle_ads-2.11.16.dist-info}/METADATA +6 -1
- {oracle_ads-2.11.14.dist-info → oracle_ads-2.11.16.dist-info}/RECORD +70 -50
- {oracle_ads-2.11.14.dist-info → oracle_ads-2.11.16.dist-info}/LICENSE.txt +0 -0
- {oracle_ads-2.11.14.dist-info → oracle_ads-2.11.16.dist-info}/WHEEL +0 -0
- {oracle_ads-2.11.14.dist-info → oracle_ads-2.11.16.dist-info}/entry_points.txt +0 -0
ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py (new file)

@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import numpy as np
+import pandas as pd
+
+from ads.common.decorator.runtime_dependency import runtime_dependency
+
+from .base_model import AnomalyOperatorBaseModel
+from .anomaly_dataset import AnomalyOutput
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+
+
+class OneClassSVMOperatorModel(AnomalyOperatorBaseModel):
+    """Class representing OneClassSVM Anomaly Detection operator model."""
+
+    @runtime_dependency(
+        module="sklearn",
+        err_msg=(
+            "Please run `pip3 install scikit-learn` to "
+            "install the required dependencies for OneClassSVM."
+        ),
+    )
+    def _build_model(self) -> AnomalyOutput:
+        from sklearn.svm import OneClassSVM
+
+        model_kwargs = self.spec.model_kwargs
+        # map the output as per anomaly dataset class, 1: outlier, 0: inlier
+        self.outlier_map = {1: 0, -1: 1}
+
+        anomaly_output = AnomalyOutput(date_column="index")
+
+        for target, df in self.datasets.full_data_dict.items():
+            model = OneClassSVM(**model_kwargs)
+            model.fit(df)
+            y_pred = np.vectorize(self.outlier_map.get)(
+                model.predict(df)
+            )
+
+            scores = model.score_samples(
+                df
+            )
+
+            index_col = df.columns[0]
+
+            anomaly = pd.DataFrame(
+                {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
+            ).reset_index(drop=True)
+            score = pd.DataFrame(
+                {"index": df[index_col], OutputColumns.SCORE_COL: scores}
+            ).reset_index(drop=True)
+
+            anomaly_output.add_output(target, anomaly, score)
+
+        return anomaly_output
+
+    def _generate_report(self):
+        """Generates the report."""
+        import report_creator as rc
+
+        other_sections = [
+            rc.Heading("Selected Models Overview", level=2),
+            rc.Text(
+                "The following tables provide information regarding the chosen model."
+            ),
+        ]
+
+        model_description = rc.Text(
+            "The oneclasssvm model is a full-stack automated machine learning system for outlier detection. "
+            "It is best suited for novelty detection when the training set is not contaminated by outliers"
+        )
+
+        return (
+            model_description,
+            other_sections,
+        )
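The file list above also shows a sibling `ads/opctl/operator/lowcode/anomaly/model/isolationforest.py` (+79 lines) landing in this release. Its contents are not reproduced in this view; given the shared `AnomalyOperatorBaseModel` base class and the identical size, a plausible sketch of its `_build_model` (names and details assumed here, not taken from the wheel) would mirror the OneClassSVM model above:

```python
# Hypothetical sketch, NOT the shipped isolationforest.py: same pipeline as the
# OneClassSVM model above with sklearn.ensemble.IsolationForest swapped in.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest


class IsolationForestOperatorModel(AnomalyOperatorBaseModel):  # assumed name
    """Class representing IsolationForest Anomaly Detection operator model."""

    def _build_model(self) -> AnomalyOutput:
        model_kwargs = self.spec.model_kwargs
        # sklearn convention: predict() returns -1 for outliers, 1 for inliers
        outlier_map = {1: 0, -1: 1}
        anomaly_output = AnomalyOutput(date_column="index")
        for target, df in self.datasets.full_data_dict.items():
            model = IsolationForest(**model_kwargs)
            model.fit(df)
            y_pred = np.vectorize(outlier_map.get)(model.predict(df))
            scores = model.score_samples(df)
            index_col = df.columns[0]
            anomaly = pd.DataFrame(
                {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
            ).reset_index(drop=True)
            score = pd.DataFrame(
                {"index": df[index_col], OutputColumns.SCORE_COL: scores}
            ).reset_index(drop=True)
            anomaly_output.add_output(target, anomaly, score)
        return anomaly_output
```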
ads/opctl/operator/lowcode/anomaly/operator_config.py

@@ -77,6 +77,7 @@ class AnomalyOperatorSpec(DataClassSerializable):
     model: str = None
     model_kwargs: Dict = field(default_factory=dict)
     contamination: float = None
+    subsample_report_data: bool = None

     def __post_init__(self):
         """Adjusts the specification details."""
ads/opctl/operator/lowcode/anomaly/schema.yaml

@@ -29,7 +29,7 @@ spec:
   input_data:
     required: true
     type: dict
-    default: {"url": "data.csv"}
+    default: { "url": "data.csv" }
     meta:
       description: "The payload that the detector should evaluate."
     schema:
@@ -78,6 +78,9 @@ spec:
       limit:
        required: false
        type: integer
+      vault_secret_id:
+        required: false
+        type: string

   validation_data:
     required: false
@@ -130,10 +133,15 @@ spec:
       limit:
        required: false
        type: integer
+      vault_secret_id:
+        required: false
+        type: string

   datetime_column:
     type: dict
-    required: true
+    required: false
+    meta:
+      description: "`datetime_column` is required for time series anomaly detection, only non time-based anomaly detection models can be run without `datetime_column`"
     schema:
       name:
         type: string
@@ -353,6 +361,8 @@ spec:
     allowed:
       - autots
       - auto
+      - oneclasssvm
+      - isolationforest
     meta:
       description: "The model to be used for anomaly detection"

@@ -367,4 +377,8 @@ spec:
     type: dict
     required: false

+  subsample_report_data:
+    type: boolean
+    required: false
+
   type: dict
ads/opctl/operator/lowcode/anomaly/utils.py

@@ -1,31 +1,32 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*--
 
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 import os
+
 import pandas as pd
-
-from .operator_config import AnomalyOperatorSpec
-from .const import SupportedMetrics, SupportedModels
+
 from ads.opctl import logger
 
+from .const import NonTimeADSupportedModels, SupportedMetrics, SupportedModels
+from .operator_config import AnomalyOperatorSpec
+
 
 def _build_metrics_df(y_true, y_pred, column_name):
     from sklearn.metrics import (
-        recall_score,
-        precision_score,
         accuracy_score,
-        f1_score,
-        confusion_matrix,
-        roc_auc_score,
-        precision_recall_curve,
         auc,
+        confusion_matrix,
+        f1_score,
         matthews_corrcoef,
+        precision_recall_curve,
+        precision_score,
+        recall_score,
+        roc_auc_score,
     )
 
-    metrics = dict()
+    metrics = {}
     metrics[SupportedMetrics.RECALL] = recall_score(y_true, y_pred)
     metrics[SupportedMetrics.PRECISION] = precision_score(y_true, y_pred)
     metrics[SupportedMetrics.ACCURACY] = accuracy_score(y_true, y_pred)
@@ -78,5 +79,7 @@ def default_signer(**kwargs):
     return default_signer(**kwargs)


-def select_auto_model(
-
+def select_auto_model(operator_config):
+    if operator_config.spec.datetime_column is not None:
+        return SupportedModels.AutoTS
+    return NonTimeADSupportedModels.IsolationForest
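The new `select_auto_model` dispatch keys entirely off `spec.datetime_column`. A throwaway illustration of the two outcomes (the `SimpleNamespace` stubs are ours, not part of the package):

```python
from types import SimpleNamespace

# Stub configs: only the attribute select_auto_model inspects is populated.
ts_config = SimpleNamespace(
    spec=SimpleNamespace(datetime_column=SimpleNamespace(name="ds"))
)
tabular_config = SimpleNamespace(spec=SimpleNamespace(datetime_column=None))

# select_auto_model(ts_config)      -> SupportedModels.AutoTS (time-based AD)
# select_auto_model(tabular_config) -> NonTimeADSupportedModels.IsolationForest
```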
ads/opctl/operator/lowcode/common/data.py

@@ -25,6 +25,7 @@ class AbstractData(ABC):
         self.data = None
         self._data_dict = dict()
         self.name = name
+        self.spec = spec
         self.load_transform_ingest_data(spec)

     def get_raw_data_by_cat(self, category):
@@ -36,7 +37,7 @@ class AbstractData(ABC):
         for col, val in mapping[category].items():
             condition &= (self.raw_data[col] == val)
         data_by_cat = self.raw_data[condition].reset_index(drop=True)
-        data_by_cat = self._data_transformer._format_datetime_col(data_by_cat)
+        data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat
         return data_by_cat

ads/opctl/operator/lowcode/common/transformations.py

@@ -32,8 +32,14 @@ class Transformations(ABC):
         self.dataset_info = dataset_info
         self.target_category_columns = dataset_info.target_category_columns
         self.target_column_name = dataset_info.target_column
-        self.dt_column_name = dataset_info.datetime_column.name
-        self.dt_column_format = dataset_info.datetime_column.format
+        self.dt_column_name = (
+            dataset_info.datetime_column.name if dataset_info.datetime_column else None
+        )
+        self.dt_column_format = (
+            dataset_info.datetime_column.format
+            if dataset_info.datetime_column
+            else None
+        )
         self.preprocessing = dataset_info.preprocessing

     def run(self, data):
@@ -55,8 +61,10 @@ class Transformations(ABC):
         if self.name == "historical_data":
             self._check_historical_dataset(clean_df)
         clean_df = self._set_series_id_column(clean_df)
-        clean_df = self._format_datetime_col(clean_df)
+        if self.dt_column_name:
+            clean_df = self._format_datetime_col(clean_df)
         clean_df = self._set_multi_index(clean_df)
+        clean_df = self._fill_na(clean_df) if not self.dt_column_name else clean_df

         if self.preprocessing and self.preprocessing.enabled:
             if self.name == "historical_data":
@@ -66,7 +74,9 @@ class Transformations(ABC):
                     except Exception as e:
                         logger.debug(f"Missing value imputation failed with {e.args}")
                 else:
-                    logger.info("Skipping missing value imputation because it is disabled")
+                    logger.info(
+                        "Skipping missing value imputation because it is disabled"
+                    )
                 if self.preprocessing.steps.outlier_treatment:
                     try:
                         clean_df = self._outlier_treatment(clean_df)
@@ -77,7 +87,9 @@ class Transformations(ABC):
             elif self.name == "additional_data":
                 clean_df = self._missing_value_imputation_add(clean_df)
         else:
-            logger.info("Skipping all preprocessing steps because preprocessing is disabled")
+            logger.info(
+                "Skipping all preprocessing steps because preprocessing is disabled"
+            )
         return clean_df

     def _remove_trailing_whitespace(self, df):
@@ -95,7 +107,14 @@ class Transformations(ABC):
         merged_values = df[DataColumns.Series].unique().tolist()
         if self.target_category_columns:
             for value in merged_values:
-                self._target_category_columns_map[value] = df[df[DataColumns.Series] == value][self.target_category_columns].drop_duplicates().iloc[0].to_dict()
+                self._target_category_columns_map[value] = (
+                    df[df[DataColumns.Series] == value][
+                        self.target_category_columns
+                    ]
+                    .drop_duplicates()
+                    .iloc[0]
+                    .to_dict()
+                )

         if self.target_category_columns != [DataColumns.Series]:
             df = df.drop(self.target_category_columns, axis=1)
@@ -124,8 +143,12 @@ class Transformations(ABC):
         -------
         A new Pandas DataFrame with sorted dates for each series
         """
-        df = df.set_index([self.dt_column_name, DataColumns.Series])
-        return df.sort_values([self.dt_column_name, DataColumns.Series], ascending=True)
+        if self.dt_column_name:
+            df = df.set_index([self.dt_column_name, DataColumns.Series])
+            return df.sort_values(
+                [self.dt_column_name, DataColumns.Series], ascending=True
+            )
+        return df.set_index([df.index, DataColumns.Series])

     def _missing_value_imputation_hist(self, df):
         """
@@ -222,5 +245,10 @@ class Transformations(ABC):

         }
         """
+
     def get_target_category_columns_map(self):
-        return self._target_category_columns_map
+        return self._target_category_columns_map
+
+    def _fill_na(self, df: pd.DataFrame, na_value=0) -> pd.DataFrame:
+        """Fill nans in dataframe"""
+        return df.fillna(value=na_value)
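The net effect of these transformation changes is that `datetime_column` becomes optional: without it, `_format_datetime_col` is skipped, `_set_multi_index` keys by (row index, series) instead of (datetime, series), and the new `_fill_na` zero-fills gaps. A toy pandas illustration of that non-datetime branch (the column names here are invented):

```python
import pandas as pd

df = pd.DataFrame({"Series": ["a", "a", "b"], "target": [1.0, None, 3.0]})

# What _set_multi_index does when self.dt_column_name is None:
indexed = df.set_index([df.index, "Series"])
# ...followed by the new _fill_na(clean_df) with its default na_value=0:
filled = indexed.fillna(value=0)
print(filled)  # the NaN in series "a" becomes 0.0
```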
ads/opctl/operator/lowcode/common/utils.py

@@ -7,7 +7,9 @@
 import argparse
 import logging
 import os
+import shutil
 import sys
+import tempfile
 import time
 from string import Template
 from typing import Any, Dict, List, Tuple
@@ -28,6 +30,7 @@ from ads.opctl.operator.lowcode.common.errors import (
 )
 from ads.opctl.operator.common.operator_config import OutputDirectory
 from ads.common.object_storage_details import ObjectStorageDetails
+from ads.secrets import ADBSecretKeeper


 def call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs):
@@ -53,10 +56,12 @@ def load_data(data_spec, storage_options=None, **kwargs):
     sql = data_spec.sql
     table_name = data_spec.table_name
     limit = data_spec.limit
-
+    vault_secret_id = data_spec.vault_secret_id
     storage_options = storage_options or (
         default_signer() if ObjectStorageDetails.is_oci_path(filename) else {}
     )
+    if vault_secret_id is not None and connect_args is None:
+        connect_args = dict()

     if filename is not None:
         if not format:
@@ -76,15 +81,32 @@ def load_data(data_spec, storage_options=None, **kwargs):
             f"The format {format} is not currently supported for reading data. Please reformat the data source: {filename} ."
         )
     elif connect_args is not None:
-        con = oracledb.connect(**connect_args)
-        if table_name is not None:
-            data = pd.read_sql(f"SELECT * FROM {table_name}", con)
-        elif sql is not None:
-            data = pd.read_sql(sql, con)
-        else:
-            raise InvalidParameterError(
-                f"Database `connect_args` provided without sql query or table name. Please specify either `sql` or `table_name`."
-            )
+        with tempfile.TemporaryDirectory() as temp_dir:
+            if vault_secret_id is not None:
+                try:
+                    with ADBSecretKeeper.load_secret(vault_secret_id, wallet_dir=temp_dir) as adwsecret:
+                        if 'wallet_location' in adwsecret and 'wallet_location' not in connect_args:
+                            shutil.unpack_archive(adwsecret["wallet_location"], temp_dir)
+                            connect_args['wallet_location'] = temp_dir
+                        if 'user_name' in adwsecret and 'user' not in connect_args:
+                            connect_args['user'] = adwsecret['user_name']
+                        if 'password' in adwsecret and 'password' not in connect_args:
+                            connect_args['password'] = adwsecret['password']
+                        if 'service_name' in adwsecret and 'service_name' not in connect_args:
+                            connect_args['service_name'] = adwsecret['service_name']
+
+                except Exception as e:
+                    raise Exception(f"Could not retrieve database credentials from vault {vault_secret_id}: {e}")
+
+            con = oracledb.connect(**connect_args)
+            if table_name is not None:
+                data = pd.read_sql(f"SELECT * FROM {table_name}", con)
+            elif sql is not None:
+                data = pd.read_sql(sql, con)
+            else:
+                raise InvalidParameterError(
+                    f"Database `connect_args` provided without sql query or table name. Please specify either `sql` or `table_name`."
+                )
     else:
         raise InvalidParameterError(
             f"No filename/url provided, and no connect_args provided. Please specify one of these if you want to read data from a file or a database respectively."
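For readers unfamiliar with `ADBSecretKeeper`, the new `vault_secret_id` branch above boils down to roughly the following standalone flow (the secret OCID and table name are placeholders, and this sketch assumes the same secret keys the diff checks for):

```python
# Minimal sketch of the vault-backed ADB connection flow; placeholder values only.
import shutil
import tempfile

import oracledb
import pandas as pd

from ads.secrets import ADBSecretKeeper

vault_secret_id = "ocid1.vaultsecret.oc1..<unique_id>"  # placeholder OCID

with tempfile.TemporaryDirectory() as temp_dir:
    # load_secret restores the wallet under temp_dir and yields the stored fields.
    with ADBSecretKeeper.load_secret(vault_secret_id, wallet_dir=temp_dir) as adwsecret:
        shutil.unpack_archive(adwsecret["wallet_location"], temp_dir)
        connect_args = {
            "user": adwsecret["user_name"],
            "password": adwsecret["password"],
            "service_name": adwsecret["service_name"],
            "wallet_location": temp_dir,
        }
    con = oracledb.connect(**connect_args)
    data = pd.read_sql("SELECT * FROM my_table", con)  # placeholder table name
```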
ads/opctl/operator/lowcode/forecast/model/base_model.py

@@ -249,20 +249,28 @@ class ForecastOperatorBaseModel(ABC):
         train_metrics_sections = [sec9_text, sec9]

         backtest_sections = []
+        output_dir = self.spec.output_directory.url
+        backtest_report_name = "backtest_stats.csv"
+        file_path = f"{output_dir}/{backtest_report_name}"
         if self.spec.model == AUTO_SELECT:
-
-
-
-
-
-
-
-
-
-
-
-
-
+            backtest_sections.append(rc.Heading("Auto-select statistics", level=2))
+            if not os.path.exists(file_path):
+                failure_msg = rc.Text("auto-select could not be executed. Please check the "
+                                      "logs for more details.")
+                backtest_sections.append(failure_msg)
+            else:
+                backtest_stats = pd.read_csv(file_path)
+                average_dict = backtest_stats.mean().to_dict()
+                del average_dict['backtest']
+                best_model = min(average_dict, key=average_dict.get)
+                backtest_text = rc.Heading("Back Testing Metrics", level=3)
+                summary_text = rc.Text(
+                    f"Overall, the average scores for the models are {average_dict}, with {best_model}"
+                    f" being identified as the top-performing model during backtesting.")
+                backtest_table = rc.DataTable(backtest_stats, index=True)
+                liner_plot = get_auto_select_plot(backtest_stats)
+                backtest_sections.extend([backtest_text, backtest_table, summary_text,
+                                          liner_plot])


         forecast_plots = []
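The selection arithmetic in the new `else:` branch is simple to verify by hand: average each model's column of `backtest_stats.csv`, drop the `backtest` id column, and take the argmin. With fabricated numbers (not from any real run):

```python
import pandas as pd

backtest_stats = pd.DataFrame(
    {"backtest": [0, 1, 2], "prophet": [10.0, 12.0, 11.0], "arima": [9.0, 13.0, 14.0]}
)
average_dict = backtest_stats.mean().to_dict()
del average_dict["backtest"]  # drop the backtest-id column before comparing
best_model = min(average_dict, key=average_dict.get)
# prophet averages 11.0, arima 12.0 -> best_model == "prophet"
```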
ads/opctl/operator/lowcode/forecast/model/ml_forecast.py

@@ -61,6 +61,18 @@ class MLForecastOperatorModel(ForecastOperatorBaseModel):
             "verbosity": -1,
             "num_leaves": 512,
         }
+        additional_data_params = {}
+        if len(self.datasets.get_additional_data_column_names()) > 0:
+            additional_data_params = {
+                "target_transforms": [Differences([12])],
+                "lags": model_kwargs.get("lags", [1, 6, 12]),
+                "lag_transforms": (
+                    {
+                        1: [ExpandingMean()],
+                        12: [RollingMean(window_size=24)],
+                    }
+                ),
+            }

         fcst = MLForecast(
             models={
@@ -80,24 +92,7 @@ class MLForecastOperatorModel(ForecastOperatorBaseModel):
             },
             freq=pd.infer_freq(data_train[self.date_col].drop_duplicates())
            or pd.infer_freq(data_train[self.date_col].drop_duplicates()[-5:]),
-
-            lags=model_kwargs.get(
-                "lags",
-                (
-                    [1, 6, 12]
-                    if len(self.datasets.get_additional_data_column_names()) > 0
-                    else []
-                ),
-            ),
-            lag_transforms=(
-                {
-                    1: [ExpandingMean()],
-                    12: [RollingMean(window_size=24)],
-                }
-                if len(self.datasets.get_additional_data_column_names()) > 0
-                else {}
-            ),
-            # date_features=[hour_index],
+            **additional_data_params,
         )

         num_models = model_kwargs.get("recursive_models", False)
@@ -164,6 +159,7 @@ class MLForecastOperatorModel(ForecastOperatorBaseModel):
                 "error": str(e),
             }
             logger.debug(f"Encountered Error: {e}. Skipping.")
+            raise e

     def _build_model(self) -> pd.DataFrame:
         data_train = self.datasets.get_all_data_long(include_horizon=False)
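The refactor collects the lag-feature arguments into `additional_data_params` and splats them into `MLForecast` only when additional regressor columns exist. Assuming mlforecast's public API (the `freq` value below is a placeholder; the operator infers it from the data), the resulting call looks roughly like:

```python
import lightgbm as lgb
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences

# Built only when additional regressors are present; empty dict otherwise.
additional_data_params = {
    "target_transforms": [Differences([12])],
    "lags": [1, 6, 12],
    "lag_transforms": {1: [ExpandingMean()], 12: [RollingMean(window_size=24)]},
}
fcst = MLForecast(
    models={"forecast": lgb.LGBMRegressor(verbosity=-1, num_leaves=512)},
    freq="MS",  # placeholder; the operator derives this via pd.infer_freq
    **additional_data_params,
)
```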
ads/opctl/operator/lowcode/forecast/model_evaluator.py

@@ -12,7 +12,8 @@ from ads.opctl import logger
 from ads.opctl.operator.lowcode.common.const import DataColumns
 from .model.forecast_datasets import ForecastDatasets
 from .operator_config import ForecastOperatorConfig
-
+from ads.opctl.operator.lowcode.forecast.model.factory import SupportedModels
+from ads.opctl.operator.lowcode.common.errors import InsufficientDataError

 class ModelEvaluator:
     """
@@ -39,7 +40,7 @@ class ModelEvaluator:
     def generate_cutoffs(self, unique_dates, horizon):
         sorted_dates = np.sort(unique_dates)
         train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(self.k)]
-        valid_train_window_size = [ws for ws in train_window_size if ws >= horizon *
+        valid_train_window_size = [ws for ws in train_window_size if ws >= horizon * 2]
         if len(valid_train_window_size) < self.k:
             logger.warn(f"Only {valid_train_window_size} backtests can be created")
         cut_offs = sorted_dates[-horizon - 1:-horizon * (self.k + 1):-horizon][:len(valid_train_window_size)]
@@ -61,6 +62,9 @@ class ModelEvaluator:
         unique_dates = min_series_data[date_col].unique()

         cut_offs = self.generate_cutoffs(unique_dates, horizon)
+        if not len(cut_offs):
+            raise InsufficientDataError("Insufficient data to evaluate multiple models. Please specify a model "
+                                        "instead of using auto-select.")
         training_datasets = [sampled_historical_data[sampled_historical_data[date_col] <= cut_off_date] for cut_off_date
                              in cut_offs]
         test_datasets = [sampled_historical_data[sampled_historical_data[date_col] > cut_offs[0]]]
@@ -95,7 +99,9 @@ class ModelEvaluator:
         backtest_op_config_draft = operator_config.to_dict()
         backtest_spec = backtest_op_config_draft["spec"]
         backtest_spec["historical_data"]["url"] = historical_data_url
-        backtest_spec["additional_data"]["url"] = additional_data_url
+        if backtest_spec["additional_data"]:
+            backtest_spec["additional_data"]["url"] = additional_data_url
+        backtest_spec["test_data"] = {}
         backtest_spec["test_data"]["url"] = test_data_url
         backtest_spec["model"] = model
         backtest_spec['model_kwargs'] = None
@@ -135,7 +141,12 @@ class ModelEvaluator:
         return metrics

     def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
-        metrics = self.run_all_models(datasets, operator_config)
+        try:
+            metrics = self.run_all_models(datasets, operator_config)
+        except InsufficientDataError as e:
+            model = SupportedModels.Prophet
+            logger.error(f"Running {model} model as auto-select failed with the following error: {e.message}")
+            return model
         avg_backtests_metrics = {key: sum(value.values()) / len(value.values()) for key, value in metrics.items()}
         best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
         logger.info(f"Among models {self.models}, {best_model} model shows better performance during backtesting.")
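The corrected filter in `generate_cutoffs` requires each candidate training window to be at least twice the horizon. A worked example with synthetic sizes:

```python
horizon, k, n_dates = 5, 3, 30

# Candidate train windows shrink by one horizon per backtest: [25, 20, 15].
train_window_size = [n_dates - (i + 1) * horizon for i in range(k)]
# All satisfy ws >= horizon * 2 == 10, so three backtests can be created.
valid = [ws for ws in train_window_size if ws >= horizon * 2]
assert valid == [25, 20, 15]
```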
ads/opctl/operator/lowcode/forecast/schema.yaml

@@ -78,6 +78,9 @@ spec:
       limit:
        required: false
        type: integer
+      vault_secret_id:
+        required: false
+        type: string

   additional_data:
     required: false
@@ -130,6 +133,9 @@ spec:
       limit:
        required: false
        type: integer
+      vault_secret_id:
+        required: false
+        type: string

   test_data:
     required: false
@@ -181,6 +187,9 @@ spec:
       limit:
        required: false
        type: integer
+      vault_secret_id:
+        required: false
+        type: string
     type: dict

   output_directory:
ads/opctl/operator/lowcode/recommender/MLoperator (new file)

@@ -0,0 +1,16 @@
+type: recommender
+version: v1
+conda_type: service
+name: Recommender Operator
+gpu: no
+keywords:
+  - Recommender
+backends:
+  - job
+  - operator.local
+description: |
+  Recommender Systems are designed to suggest relevant items, products, or content to users based on their
+  preferences and behaviors. These systems are widely used in various industries such as e-commerce, entertainment,
+  and social media to enhance user experience by providing personalized recommendations. They help in increasing user
+  engagement, satisfaction, and sales by predicting what users might like or need based on their past interactions
+  and the preferences of similar users.