teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# ##################################################################
|
|
2
2
|
#
|
|
3
|
-
# Copyright
|
|
3
|
+
# Copyright 2025 Teradata. All rights reserved.
|
|
4
4
|
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
5
|
#
|
|
6
6
|
# Primary Owner: Kesavaragavan B (kesavaragavan.b@Teradata.com)
|
|
@@ -87,7 +87,24 @@ class _BaseSearch:
|
|
|
87
87
|
"SVM": "newdata", "XGBoost": "newdata",
|
|
88
88
|
"NaiveBayesTextClassifierTrainer": "newdata",
|
|
89
89
|
"DecisionTree": "data", "KMeans": "data",
|
|
90
|
-
"LinReg": "data", "LogReg": "data", "PCA": "data"
|
|
90
|
+
"LinReg": "data", "LogReg": "data", "PCA": "data",
|
|
91
|
+
"LinearRegression": "data", "Lasso": "data",
|
|
92
|
+
"Ridge": "data", "ARDRegression": "data",
|
|
93
|
+
"BayesianRidge": "data", "TweedieRegressor": "data",
|
|
94
|
+
"TheilSenRegressor": "data", "SGDRegressor": "data",
|
|
95
|
+
"RidgeCV": "data", "RANSACRegressor": "data",
|
|
96
|
+
"PoissonRegressor": "data", "PassiveAggressiveRegressor": "data",
|
|
97
|
+
"OrthogonalMatchingPursuitCV": "data", "OrthogonalMatchingPursuit": "data",
|
|
98
|
+
"MultiTaskLassoCV": "data", "MultiTaskLasso": "data",
|
|
99
|
+
"MultiTaskElasticNetCV": "data", "MultiTaskElasticNet": "data",
|
|
100
|
+
"LassoLarsIC": "data", "LassoLarsCV": "data", "LassoLars": "data",
|
|
101
|
+
"LassoCV": "data", "LarsCV": "data", "Lars": "data",
|
|
102
|
+
"HuberRegressor": "data", "GammaRegressor": "data",
|
|
103
|
+
"ElasticNetCV": "data", "ElasticNet": "data",
|
|
104
|
+
"LogisticRegression": "data", "RidgeClassifier": "data",
|
|
105
|
+
"RidgeClassifierCV": "data", "SGDClassifier": "data",
|
|
106
|
+
"PassiveAggressiveClassifier": "data", "Perceptron": "data",
|
|
107
|
+
"LogisticRegressionCV": "data"}
|
|
91
108
|
|
|
92
109
|
self._UAF_TRAINABLE_FUNCS = {"ArimaEstimate", "LinearRegr", "MAMean",
|
|
93
110
|
"MultivarRegr", "SimpleExp"}
|
|
@@ -120,8 +137,34 @@ class _BaseSearch:
|
|
|
120
137
|
'MACRO-F1': True,
|
|
121
138
|
'WEIGHTED-PRECISION': True,
|
|
122
139
|
'WEIGHTED-RECALL': True,
|
|
123
|
-
'WEIGHTED-F1': True
|
|
124
|
-
|
|
140
|
+
'WEIGHTED-F1': True,
|
|
141
|
+
'SILHOUETTE': True,
|
|
142
|
+
'CALINSKI': True,
|
|
143
|
+
'DAVIES': True}
|
|
144
|
+
|
|
145
|
+
# OpenSource ML function comparator (excluding MPD, MGD, MTD, RMSE, RMSLE)
|
|
146
|
+
self.__osml_func_comparator = {k: v for k, v in self.__func_comparator.items()
|
|
147
|
+
if k not in ['MPD', 'MGD', 'MTD', 'RMSE', 'RMSLE']}
|
|
148
|
+
|
|
149
|
+
# Linear model categorization lists for sklearn models
|
|
150
|
+
self._LINEAR_REGRESSION_MODELS = {
|
|
151
|
+
"ARDRegression", "BayesianRidge", "TweedieRegressor", "TheilSenRegressor",
|
|
152
|
+
"SGDRegressor", "RidgeCV", "Ridge", "RANSACRegressor", "PoissonRegressor",
|
|
153
|
+
"PassiveAggressiveRegressor", "OrthogonalMatchingPursuitCV", "OrthogonalMatchingPursuit",
|
|
154
|
+
"MultiTaskLassoCV", "MultiTaskLasso", "MultiTaskElasticNetCV", "MultiTaskElasticNet",
|
|
155
|
+
"LinearRegression", "LassoLarsIC", "LassoLarsCV", "LassoLars", "LassoCV",
|
|
156
|
+
"Lasso", "LarsCV", "Lars", "HuberRegressor", "GammaRegressor",
|
|
157
|
+
"ElasticNetCV", "ElasticNet"
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
self._LINEAR_CLASSIFICATION_MODELS = {
|
|
161
|
+
"SGDClassifier", "RidgeClassifierCV", "RidgeClassifier", "Perceptron",
|
|
162
|
+
"PassiveAggressiveClassifier", "LogisticRegressionCV", "LogisticRegression"
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
self._CLUSTERING_MODELS = {
|
|
166
|
+
"KMeans", "GaussianMixture"
|
|
167
|
+
}
|
|
125
168
|
self.__func = func
|
|
126
169
|
self.__params = params
|
|
127
170
|
# "self.__best_model" contains best model.
|
|
@@ -178,47 +221,67 @@ class _BaseSearch:
|
|
|
178
221
|
# '__parallel_stop_event' is used to stop threads in parallel execution.
|
|
179
222
|
self.__parallel_stop_event = None
|
|
180
223
|
|
|
181
|
-
|
|
182
|
-
self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
|
|
183
|
-
else func.__name__
|
|
184
|
-
|
|
224
|
+
|
|
185
225
|
# Set the function feature type and supported functionality.
|
|
186
226
|
self.__is_sqle_function = False
|
|
187
227
|
self.__is_uaf_function = False
|
|
188
228
|
self.__is_val_function = True if "valib" in str(self.__func.__module__)\
|
|
189
229
|
else False
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
self.
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
230
|
+
self.__is_opensource_model = False
|
|
231
|
+
self.__is_clustering_model = False
|
|
232
|
+
self.__is_regression_model = False
|
|
233
|
+
self.__is_classification_model = False
|
|
234
|
+
self.model_id_counter = {}
|
|
235
|
+
|
|
236
|
+
# Import sklearn wrapper class for proper type checking
|
|
237
|
+
from teradataml.opensource._sklearn import _SkLearnObjectWrapper
|
|
238
|
+
|
|
239
|
+
if hasattr(func, "modelObj") and isinstance(func, _SkLearnObjectWrapper):
|
|
240
|
+
self.__is_opensource_model = True
|
|
241
|
+
self.__is_trainable = True
|
|
242
|
+
self.__is_evaluatable = True
|
|
243
|
+
self.__is_predictable = True
|
|
244
|
+
|
|
245
|
+
# Set the function name and class
|
|
246
|
+
self.__func_name = func.modelObj.__class__.__name__ # e.g., 'KMeans'
|
|
247
|
+
self.__func = func.__class__
|
|
248
|
+
if self.__func_name in self._CLUSTERING_MODELS:
|
|
249
|
+
self.__is_clustering_model = True
|
|
250
|
+
self.__is_evaluatable = False
|
|
251
|
+
elif self.__func_name in self._LINEAR_REGRESSION_MODELS:
|
|
252
|
+
self.__is_regression_model = True
|
|
253
|
+
elif self.__func_name in self._LINEAR_CLASSIFICATION_MODELS:
|
|
254
|
+
self.__is_classification_model = True
|
|
206
255
|
else:
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
256
|
+
self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
|
|
257
|
+
else func.__name__
|
|
258
|
+
if self.__func_name in self._VAL_TRAINABLE_FUNCS and self.__is_val_function:
|
|
259
|
+
# TODO: Enable this feature once merge model supports VAL functions.
|
|
260
|
+
# This case is for VAL model trainer functions.
|
|
261
|
+
self.__is_trainable = self.__is_evaluatable = \
|
|
262
|
+
self.__is_predictable = False
|
|
263
|
+
elif self.__func_name in self._UAF_TRAINABLE_FUNCS:
|
|
264
|
+
# TODO: Enable this feature once merge model supports UAF functions.
|
|
265
|
+
# This case is for UAF model trainer functions.
|
|
266
|
+
self.__is_uaf_function = self.__is_trainable = \
|
|
267
|
+
self.__is_evaluatable = False
|
|
268
|
+
self.__is_predictable = False
|
|
269
|
+
elif self.__func_name in self._SQLE_TRAINABLE_FUNCS:
|
|
270
|
+
# This case is for SQLE model trainer functions.
|
|
271
|
+
self.__is_sqle_function = self.__is_trainable = \
|
|
272
|
+
self.__is_evaluatable = self.__is_predictable = True
|
|
273
|
+
else:
|
|
274
|
+
# This case is for non-model trainer functions.
|
|
275
|
+
self.__is_trainable = self.__is_evaluatable = \
|
|
276
|
+
self.__is_predictable = False
|
|
277
|
+
|
|
278
|
+
self.__is_evaluatable = False if not self.__is_evaluatable or \
|
|
279
|
+
self.__func_name in self.__US_TRAINABLE_FUNCS else \
|
|
280
|
+
True
|
|
218
281
|
# Set train routine based on model type.
|
|
219
282
|
# Non-model trainer routine is used for unsupervised model function training.
|
|
220
283
|
self._execute_fit = self.__model_trainer_routine if self.__is_trainable \
|
|
221
|
-
and self.__is_evaluatable else \
|
|
284
|
+
and (self.__is_evaluatable or self.__is_clustering_model) else \
|
|
222
285
|
self.__non_model_trainer_routine
|
|
223
286
|
|
|
224
287
|
# Utility lambda functions.
|
|
@@ -266,6 +329,9 @@ class _BaseSearch:
|
|
|
266
329
|
self._get_model_trainer_train_data_arg = lambda : "train_data" if \
|
|
267
330
|
self.__func_name == "KNN" else "data"
|
|
268
331
|
|
|
332
|
+
# '_get_predict_column' function is used to generate prediction column name.
|
|
333
|
+
self._get_predict_column = lambda: f"{self.__func_name.lower()}_predict_1"
|
|
334
|
+
|
|
269
335
|
if self.__is_trainable and "data" in self.__params:
|
|
270
336
|
data = self.__params.pop("data")
|
|
271
337
|
self.__validate_model_trainer_input_data_argument(data, False)
|
|
@@ -545,7 +611,6 @@ class _BaseSearch:
|
|
|
545
611
|
"""
|
|
546
612
|
return self.__sampled_df_mapper[self.__best_data_id]
|
|
547
613
|
|
|
548
|
-
|
|
549
614
|
@property
|
|
550
615
|
def best_data_id(self):
|
|
551
616
|
"""
|
|
@@ -592,7 +657,7 @@ class _BaseSearch:
|
|
|
592
657
|
|
|
593
658
|
"""
|
|
594
659
|
|
|
595
|
-
if not self.__is_evaluatable:
|
|
660
|
+
if not (self.__is_evaluatable or self.__is_clustering_model):
|
|
596
661
|
# Raise error when "model_stats" attribute accessed for non-executable
|
|
597
662
|
# functions.
|
|
598
663
|
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
@@ -635,7 +700,6 @@ class _BaseSearch:
|
|
|
635
700
|
|
|
636
701
|
return self.__model_stats
|
|
637
702
|
|
|
638
|
-
|
|
639
703
|
def is_running(self):
|
|
640
704
|
"""
|
|
641
705
|
DESCRIPTION:
|
|
@@ -665,7 +729,6 @@ class _BaseSearch:
|
|
|
665
729
|
# both parallel and sequential execution.
|
|
666
730
|
return self.__is_model_training_completed()
|
|
667
731
|
|
|
668
|
-
|
|
669
732
|
def _add_data_label(self, arg_name=None):
|
|
670
733
|
"""
|
|
671
734
|
DESCRIPTION:
|
|
@@ -765,7 +828,6 @@ class _BaseSearch:
|
|
|
765
828
|
|
|
766
829
|
return _labeled_data
|
|
767
830
|
|
|
768
|
-
|
|
769
831
|
def __perform_train_test_sampling(self, data, frac, stratify_column=None,
|
|
770
832
|
sample_id_column=None, sample_seed=None):
|
|
771
833
|
"""
|
|
@@ -995,8 +1057,71 @@ class _BaseSearch:
|
|
|
995
1057
|
# Validate DataFrames.
|
|
996
1058
|
arg_info_matrix.append(["data", data, is_optional_arg, (DataFrame)])
|
|
997
1059
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1060
|
+
|
|
1061
|
+
def _regression_metrics(self, y_true, y_pred):
    """
    DESCRIPTION:
        Internal function to compute regression evaluation metrics for an
        OpenSourceML model using the "td_sklearn" metric functions.

    PARAMETERS:
        y_true:
            Required Argument.
            Specifies the actual response values. The model trainer
            routine passes a single-column selection of the prediction
            output.

        y_pred:
            Required Argument.
            Specifies the predicted response values. The model trainer
            routine passes a single-column selection of the prediction
            output.

    RETURNS:
        dict, with keys "MAE", "MSE", "MSLE", "MAPE", "R2", "EV", "ME"
        and "MAD" (in this order). "MSLE" holds the string "NA" when the
        metric cannot be computed.

    EXAMPLES:
        >>> self._regression_metrics(y_true, y_pred)
    """
    from teradataml import td_sklearn as skl

    max_err = skl.max_error(y_true=y_true, y_pred=y_pred)

    mae = skl.mean_absolute_error(y_true=y_true, y_pred=y_pred)

    # Report the plain mean squared error under "MSE". Passing
    # 'squared=False' returns the root of the MSE (i.e. RMSE), which would
    # be mislabelled here; RMSE is not offered for OpenSourceML models.
    mse = skl.mean_squared_error(y_true=y_true, y_pred=y_pred)

    try:
        msle = skl.mean_squared_log_error(y_true=y_true, y_pred=y_pred)
    except Exception:
        # MSLE is best-effort (presumably fails for negative targets -
        # TODO confirm); keep the remaining metrics usable.
        msle = "NA"

    mape = skl.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)

    r2 = skl.r2_score(y_true=y_true, y_pred=y_pred)

    ev = skl.explained_variance_score(y_true=y_true, y_pred=y_pred)

    mad = skl.median_absolute_error(y_true=y_true, y_pred=y_pred)

    # TODO: Support for MPD, MGD, MTD will be added in next phase.
    # Support for RMSE, RMSLE will be added after OpenSourceML scikit-learn
    # version update as it requires higher version(>1.1.3).

    return {
        "MAE": mae,
        "MSE": mse,
        "MSLE": msle,
        "MAPE": mape,
        "R2": r2,
        "EV": ev,
        "ME": max_err,
        "MAD": mad,
    }
|
|
1093
|
+
|
|
1094
|
+
def _classification_metrics(self, y_true, y_pred):
    """
    DESCRIPTION:
        Internal function to compute classification evaluation metrics for
        an OpenSourceML model using the "td_sklearn" metric functions.

    PARAMETERS:
        y_true:
            Required Argument.
            Specifies the actual class labels.

        y_pred:
            Required Argument.
            Specifies the predicted class labels.

    RETURNS:
        dict, with keys "ACCURACY" followed by "<AVERAGE>-PRECISION",
        "<AVERAGE>-RECALL" and "<AVERAGE>-F1" for the MICRO, MACRO and
        WEIGHTED averages (in this order).

    EXAMPLES:
        >>> self._classification_metrics(y_true, y_pred)
    """
    from teradataml import td_sklearn as skl

    # Accuracy needs no averaging strategy, so it seeds the report.
    report = {"ACCURACY": skl.accuracy_score(y_true=y_true, y_pred=y_pred)}

    # Metric label -> scorer name; scorers are resolved lazily so they are
    # looked up and invoked in the same order as the metrics are reported.
    scorer_names = (("PRECISION", "precision_score"),
                    ("RECALL", "recall_score"),
                    ("F1", "f1_score"))

    for averaging in ("micro", "macro", "weighted"):
        for label, scorer_name in scorer_names:
            scorer = getattr(skl, scorer_name)
            report["{}-{}".format(averaging.upper(), label)] = scorer(
                y_true=y_true, y_pred=y_pred, average=averaging)

    return report
|
|
1124
|
+
|
|
1000
1125
|
def fit(self,
|
|
1001
1126
|
data=None,
|
|
1002
1127
|
evaluation_metric=None,
|
|
@@ -1051,6 +1176,7 @@ class _BaseSearch:
|
|
|
1051
1176
|
* evaluation_metric applicable for model trainer functions.
|
|
1052
1177
|
* Best model is not selected when evaluation returns
|
|
1053
1178
|
non-finite values.
|
|
1179
|
+
* MPD, MGD, MTD, RMSE, RMSLE are not supported for OpenSourceML models.
|
|
1054
1180
|
Permitted Values:
|
|
1055
1181
|
* Classification: Accuracy, Micro-Precision, Micro-Recall,
|
|
1056
1182
|
Micro-F1, Macro-Precision, Macro-Recall,
|
|
@@ -1059,10 +1185,11 @@ class _BaseSearch:
|
|
|
1059
1185
|
Weighted-F1.
|
|
1060
1186
|
* Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
|
|
1061
1187
|
R2, EV, MPD, MGD
|
|
1062
|
-
|
|
1188
|
+
* Clustering: SILHOUETTE, CALINSKI, DAVIES
|
|
1063
1189
|
Default Value:
|
|
1064
1190
|
* Classification: Accuracy
|
|
1065
1191
|
* Regression: MAE
|
|
1192
|
+
* Clustering: SILHOUETTE
|
|
1066
1193
|
Types: str
|
|
1067
1194
|
|
|
1068
1195
|
early_stop:
|
|
@@ -1241,7 +1368,9 @@ class _BaseSearch:
|
|
|
1241
1368
|
arg_info_matrix.append(["run_parallel", run_parallel, True, (bool)])
|
|
1242
1369
|
arg_info_matrix.append(["wait", wait, True, (bool)])
|
|
1243
1370
|
arg_info_matrix.append(["evaluation_metric", evaluation_metric, True,
|
|
1244
|
-
(str), True, list(self.
|
|
1371
|
+
(str), True, list(self.__osml_func_comparator)
|
|
1372
|
+
if self.__is_opensource_model
|
|
1373
|
+
else list(self.__func_comparator)])
|
|
1245
1374
|
arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
|
|
1246
1375
|
arg_info_matrix.append(["max_time", max_time, True, (int, float)])
|
|
1247
1376
|
|
|
@@ -1260,8 +1389,8 @@ class _BaseSearch:
|
|
|
1260
1389
|
|
|
1261
1390
|
# When "evaluation_metric" is 'MPE' then use the spl comparators.
|
|
1262
1391
|
if self.__evaluation_metric == "MPE":
|
|
1263
|
-
self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator
|
|
1264
|
-
|
|
1392
|
+
self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator
|
|
1393
|
+
|
|
1265
1394
|
if not isinstance(self.__model_trainer_input_data, dict):
|
|
1266
1395
|
# Sample all the labeled data for model training and testing.
|
|
1267
1396
|
self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
|
|
@@ -1277,6 +1406,27 @@ class _BaseSearch:
|
|
|
1277
1406
|
|
|
1278
1407
|
self.__eval_params = kwargs if self.__is_evaluatable else None
|
|
1279
1408
|
|
|
1409
|
+
elif self.__is_trainable and self.__is_opensource_model:
|
|
1410
|
+
|
|
1411
|
+
if self.__is_clustering_model:
|
|
1412
|
+
self.__sampled_df_mapper = self._add_data_label("data")
|
|
1413
|
+
# Update model trainer function parameter grid.
|
|
1414
|
+
self.__update_model_parameters()
|
|
1415
|
+
elif self.__is_regression_model or self.__is_classification_model:
|
|
1416
|
+
# Open-source regression/classification model: perform train-test split
|
|
1417
|
+
|
|
1418
|
+
if not isinstance(self.__model_trainer_input_data, dict):
|
|
1419
|
+
self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
|
|
1420
|
+
sample_id_column, sample_seed)
|
|
1421
|
+
elif isinstance(self.__model_trainer_input_data, dict):
|
|
1422
|
+
self.__perform_train_test_sampling(self.__model_trainer_input_data, frac,
|
|
1423
|
+
stratify_column, sample_id_column,
|
|
1424
|
+
sample_seed)
|
|
1425
|
+
# Set evaluation parameters for supervised models
|
|
1426
|
+
self.__eval_params = kwargs if self.__is_evaluatable else None
|
|
1427
|
+
|
|
1428
|
+
self.__update_model_parameters()
|
|
1429
|
+
|
|
1280
1430
|
elif self.__is_trainable and not self.__is_evaluatable:
|
|
1281
1431
|
# This condition identifies unsupervised model trainer function.
|
|
1282
1432
|
# Let's process training data.
|
|
@@ -1285,13 +1435,14 @@ class _BaseSearch:
|
|
|
1285
1435
|
self.__sampled_df_mapper = self._add_data_label("data")
|
|
1286
1436
|
# Update model trainer function parameter grid.
|
|
1287
1437
|
self.__update_model_parameters()
|
|
1288
|
-
|
|
1289
1438
|
# Initialize logging.
|
|
1290
1439
|
if verbose > 0:
|
|
1291
1440
|
self.__progress_bar = _ProgressBar(jobs=len(self._parameter_grid), verbose=verbose)
|
|
1441
|
+
|
|
1292
1442
|
# With VT option Parallel execution won't be possible, as it opens multiple connections.
|
|
1293
1443
|
if not run_parallel or configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
|
|
1294
1444
|
# Setting start time of Sequential execution.
|
|
1445
|
+
|
|
1295
1446
|
self.__start_time = time.time() if self.__timeout is not None else None
|
|
1296
1447
|
# TODO: Factorize the code once parallel execution part is completed in ELE-6154 JIRA.
|
|
1297
1448
|
# Execute all parameters from populated parameter grid for both trainable
|
|
@@ -1301,7 +1452,7 @@ class _BaseSearch:
|
|
|
1301
1452
|
|
|
1302
1453
|
# Condition to check early stop feature applicable for model
|
|
1303
1454
|
# trainer function.
|
|
1304
|
-
if self.__early_stop is not None and self.__is_evaluatable:
|
|
1455
|
+
if self.__early_stop is not None and (self.__is_evaluatable or self.__is_clustering_model):
|
|
1305
1456
|
if self.__is_finite and self._is_early_stoppable():
|
|
1306
1457
|
# Terminate HPT execution when the trained model attains the
|
|
1307
1458
|
# given "early_stop" value.
|
|
@@ -1390,28 +1541,44 @@ class _BaseSearch:
|
|
|
1390
1541
|
EXAMPLES:
|
|
1391
1542
|
>>> self.__model_trainer_routine(param=param, iter=iter, **kwargs)
|
|
1392
1543
|
"""
|
|
1393
|
-
|
|
1394
1544
|
# Define model name used for model metadata.
|
|
1545
|
+
|
|
1395
1546
|
model_name = self._generate_model_name(iter)
|
|
1396
1547
|
# Get the unique data identifier present in "model_param".
|
|
1397
1548
|
_data_id = model_param[self.__DATA_ID]
|
|
1398
1549
|
# 'param' variable holds model training parameters and train dataframe.
|
|
1399
1550
|
# Get the model training parameters.
|
|
1400
|
-
|
|
1401
|
-
|
|
1551
|
+
|
|
1552
|
+
if self.__is_opensource_model:
|
|
1553
|
+
param_outer = model_param.get("param", {})
|
|
1554
|
+
param = param_outer.get("param", param_outer)
|
|
1555
|
+
data_input = param.pop("data", None)
|
|
1556
|
+
param = {k: v for k, v in param.items() if k != "data"}
|
|
1557
|
+
else:
|
|
1558
|
+
param = model_param["param"]
|
|
1559
|
+
data_input = None
|
|
1560
|
+
|
|
1402
1561
|
# Check the stop_event set or not
|
|
1403
1562
|
if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
|
|
1404
1563
|
# Update the model metadata for Skip execution.
|
|
1405
|
-
self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
|
|
1564
|
+
self.__update_model_metadata(model_name, param, "SKIP", 0, 0, 0, _data_id)
|
|
1406
1565
|
return
|
|
1407
1566
|
|
|
1408
1567
|
# Retrieve the train and test data using data identifier.
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1568
|
+
if self.__is_opensource_model:
|
|
1569
|
+
|
|
1570
|
+
if self.__is_clustering_model:
|
|
1571
|
+
_train_data = self.__sampled_df_mapper[_data_id]
|
|
1572
|
+
_test_data = {} # No label needed
|
|
1573
|
+
elif self.__is_regression_model or self.__is_classification_model:
|
|
1574
|
+
_train_data, _test_data = self.__sampled_df_mapper[_data_id]
|
|
1575
|
+
kwargs.update(_test_data)
|
|
1576
|
+
else:
|
|
1577
|
+
_train_data, _test_data = self.__sampled_df_mapper[_data_id]
|
|
1578
|
+
# Update model training argument with train DataFrame.
|
|
1579
|
+
param.update(_train_data)
|
|
1580
|
+
# Update the test DataFrame for model evaluation.
|
|
1581
|
+
kwargs.update(_test_data)
|
|
1415
1582
|
|
|
1416
1583
|
try:
|
|
1417
1584
|
# Record starting time of model training.
|
|
@@ -1421,44 +1588,122 @@ class _BaseSearch:
|
|
|
1421
1588
|
# using getattr method.
|
|
1422
1589
|
self.__func = valib.__getattr__(self.__func_name)
|
|
1423
1590
|
# Train the model.
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1591
|
+
if self.__is_opensource_model:
|
|
1592
|
+
from teradataml import td_sklearn as skl
|
|
1593
|
+
func_class = getattr(skl, self.__func_name) # e.g., skl.KMeans
|
|
1594
|
+
if self.__is_regression_model or self.__is_classification_model:
|
|
1595
|
+
# Extract and remove only for regression and classification models
|
|
1596
|
+
self.__input_columns = param.pop("input_columns", None)
|
|
1597
|
+
self.__response_column = param.pop("response_column", None)
|
|
1598
|
+
|
|
1599
|
+
func_obj = func_class(**param) # Safely create model instance
|
|
1600
|
+
else:
|
|
1601
|
+
func_obj = self.__func(**param)
|
|
1602
|
+
end_time = time.perf_counter()
|
|
1603
|
+
training_time = round((end_time - start_time), 3)
|
|
1427
1604
|
# Store the trained object.
|
|
1428
1605
|
self.__trained_models[model_name] = func_obj
|
|
1429
|
-
|
|
1430
|
-
|
|
1606
|
+
|
|
1607
|
+
if self.__is_opensource_model and self.__is_clustering_model:
|
|
1608
|
+
start_time_cluster = time.perf_counter()
|
|
1609
|
+
from teradataml import td_sklearn as skl
|
|
1610
|
+
feature_cols = [col for col in _train_data["data"].columns]
|
|
1611
|
+
func_obj.fit(data=_train_data["data"], feature_columns=feature_cols)
|
|
1612
|
+
pred_col = self._get_predict_column()
|
|
1613
|
+
result = func_obj.predict(data=_train_data["data"], feature_columns=feature_cols)
|
|
1614
|
+
result.materialize()
|
|
1431
1615
|
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1616
|
+
silhouette = skl.silhouette_score(
|
|
1617
|
+
X=result.select(feature_cols),
|
|
1618
|
+
labels=result.select([pred_col])
|
|
1619
|
+
)
|
|
1620
|
+
|
|
1621
|
+
calinski = skl.calinski_harabasz_score(
|
|
1622
|
+
X=result.select(feature_cols),
|
|
1623
|
+
labels=result.select([pred_col])
|
|
1624
|
+
)
|
|
1625
|
+
|
|
1626
|
+
davies = skl.davies_bouldin_score(
|
|
1627
|
+
X=result.select(feature_cols),
|
|
1628
|
+
labels=result.select([pred_col])
|
|
1629
|
+
)
|
|
1630
|
+
|
|
1631
|
+
columns = ["SILHOUETTE", "CALINSKI", "DAVIES"]
|
|
1632
|
+
eval_values = [silhouette, calinski, davies]
|
|
1633
|
+
eval_key_values = dict(zip(columns, eval_values))
|
|
1634
|
+
|
|
1635
|
+
end_time_cluster = time.perf_counter()
|
|
1636
|
+
training_time_cluster = round((end_time_cluster - start_time_cluster), 3)
|
|
1439
1637
|
|
|
1440
|
-
# Default evaluation metric is set to "MAE" for Regression models.
|
|
1441
1638
|
if self.__evaluation_metric is None:
|
|
1442
|
-
self.__evaluation_metric = "
|
|
1443
|
-
|
|
1639
|
+
self.__evaluation_metric = "SILHOUETTE"
|
|
1640
|
+
|
|
1641
|
+
self.__update_model_metadata(model_name, param, "PASS", training_time_cluster,
|
|
1642
|
+
end_time_cluster, start_time_cluster, _data_id, eval_key_values)
|
|
1643
|
+
elif self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
|
|
1644
|
+
start_time_lin = time.perf_counter()
|
|
1645
|
+
train_df = _train_data["data"]
|
|
1646
|
+
y = train_df.select([self.__response_column])
|
|
1647
|
+
X = train_df.drop(columns=[self.__response_column], axis=1)
|
|
1648
|
+
|
|
1649
|
+
func_obj.fit(X,y)
|
|
1650
|
+
pred_col = self._get_predict_column()
|
|
1651
|
+
|
|
1652
|
+
output = func_obj.predict(X,y)
|
|
1653
|
+
|
|
1654
|
+
y_true = output.select([self.__response_column])
|
|
1655
|
+
y_pred = output.select([pred_col])
|
|
1656
|
+
|
|
1657
|
+
if self.__is_regression_model:
|
|
1658
|
+
eval_key_values = self._regression_metrics(y_true, y_pred)
|
|
1659
|
+
if self.__evaluation_metric is None:
|
|
1660
|
+
self.__evaluation_metric = "MAE"
|
|
1661
|
+
elif self.__is_classification_model:
|
|
1662
|
+
eval_key_values = self._classification_metrics(y_true, y_pred)
|
|
1663
|
+
if self.__evaluation_metric is None:
|
|
1664
|
+
self.__evaluation_metric = "ACCURACY"
|
|
1665
|
+
|
|
1666
|
+
end_time_lin = time.perf_counter()
|
|
1667
|
+
training_time_lin = round((end_time_lin - start_time_lin), 3)
|
|
1668
|
+
|
|
1669
|
+
self.__update_model_metadata(model_name, param, "PASS", training_time_lin,
|
|
1670
|
+
end_time_lin, start_time_lin, _data_id, eval_key_values)
|
|
1444
1671
|
else:
|
|
1445
|
-
#
|
|
1446
|
-
|
|
1447
|
-
#
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
self.__evaluation_metric
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1672
|
+
# Evaluate the trained model.
|
|
1673
|
+
evaluations = func_obj.evaluate(**kwargs)
|
|
1674
|
+
# Extract evaluations report in dictionary format.
|
|
1675
|
+
if "RegressionEvaluator" in type(evaluations).__name__:
|
|
1676
|
+
# RegressionEvaluator results are stored under "result" attribute.
|
|
1677
|
+
# "result" dataframe column names are metrics and corresponding
|
|
1678
|
+
# rows are evaluation values.
|
|
1679
|
+
columns = evaluations.result.keys()
|
|
1680
|
+
eval_values = evaluations.result.get_values()[0]
|
|
1681
|
+
|
|
1682
|
+
# Default evaluation metric is set to "MAE" for Regression models.
|
|
1683
|
+
if self.__evaluation_metric is None:
|
|
1684
|
+
self.__evaluation_metric = "MAE"
|
|
1685
|
+
|
|
1686
|
+
else:
|
|
1687
|
+
# ClassificationEvaluator results are stored under "output_data"
|
|
1688
|
+
# attribute. "output_data" dataframe 'column 1' contains metrics
|
|
1689
|
+
# and 'column 2' holds corresponding evaluation values.
|
|
1690
|
+
eval_report = evaluations.output_data.get_values().transpose()
|
|
1691
|
+
columns = eval_report[1].astype('str')
|
|
1692
|
+
columns = [column_name.upper() for column_name in columns]
|
|
1693
|
+
eval_values = eval_report[2]
|
|
1694
|
+
|
|
1695
|
+
# Default evaluation metric is set to "ACCURACY" for
|
|
1696
|
+
# classification models.
|
|
1697
|
+
if self.__evaluation_metric is None:
|
|
1698
|
+
self.__evaluation_metric = "ACCURACY"
|
|
1699
|
+
|
|
1700
|
+
# Combine columns and eval_values into a dictionary
|
|
1701
|
+
eval_key_values = dict(zip(columns, eval_values))
|
|
1702
|
+
# Update the model metadata for successful model training.
|
|
1703
|
+
self.__update_model_metadata(model_name, param, "PASS",
|
|
1704
|
+
training_time, end_time, start_time,
|
|
1705
|
+
_data_id, eval_key_values)
|
|
1706
|
+
|
|
1462
1707
|
|
|
1463
1708
|
# Check whether self.__parallel_stop_event is None or not
|
|
1464
1709
|
if self.__parallel_stop_event is not None:
|
|
@@ -1468,18 +1713,18 @@ class _BaseSearch:
|
|
|
1468
1713
|
if (self.__early_stop is not None and self._is_early_stoppable())\
|
|
1469
1714
|
or (self.__timeout is not None and self._is_time_stoppable()):
|
|
1470
1715
|
self.__parallel_stop_event.set()
|
|
1471
|
-
|
|
1716
|
+
|
|
1472
1717
|
except Exception as _err_msg:
|
|
1473
1718
|
# Record error message with corresponding "model_name".
|
|
1474
1719
|
self.__model_err_records[model_name] = str(_err_msg)
|
|
1475
1720
|
# Compute the failed execution time for failed training.
|
|
1476
|
-
|
|
1721
|
+
end_time = time.perf_counter()
|
|
1722
|
+
training_time = round((end_time - start_time), 3)
|
|
1477
1723
|
# Update the model metadata for failed execution.
|
|
1478
|
-
self.__update_model_metadata(model_name, param, "FAIL", training_time,
|
|
1479
|
-
_data_id)
|
|
1724
|
+
self.__update_model_metadata(model_name, param, "FAIL", training_time,
|
|
1725
|
+
end_time, start_time, _data_id)
|
|
1480
1726
|
pass
|
|
1481
1727
|
|
|
1482
|
-
|
|
1483
1728
|
def __non_model_trainer_routine(self, model_param, iter, **kwargs):
|
|
1484
1729
|
"""
|
|
1485
1730
|
DESCRIPTION:
|
|
@@ -1549,7 +1794,7 @@ class _BaseSearch:
|
|
|
1549
1794
|
# Check whether the stop_event is set or not
|
|
1550
1795
|
if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
|
|
1551
1796
|
# Update the model metadata for Skip execution.
|
|
1552
|
-
self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
|
|
1797
|
+
self.__update_model_metadata(model_name, param, "SKIP", 0, 0, 0, _data_id)
|
|
1553
1798
|
return
|
|
1554
1799
|
try:
|
|
1555
1800
|
# Record starting time of model training.
|
|
@@ -1566,17 +1811,19 @@ class _BaseSearch:
|
|
|
1566
1811
|
self.__trained_models[model_name] = func_obj
|
|
1567
1812
|
|
|
1568
1813
|
# Process training time.
|
|
1569
|
-
|
|
1814
|
+
end_time = time.perf_counter()
|
|
1815
|
+
training_time = round((end_time - start_time), 3)
|
|
1570
1816
|
# Update the model metadata for successful model training.
|
|
1571
1817
|
|
|
1572
|
-
self.__update_model_metadata(model_name, param, "PASS", training_time, _data_id)
|
|
1818
|
+
self.__update_model_metadata(model_name, param, "PASS", training_time, end_time, start_time, _data_id)
|
|
1573
1819
|
except Exception as _err_msg:
|
|
1574
1820
|
# Record error message with corresponding "model_name".
|
|
1575
1821
|
self.__model_err_records[model_name] = str(_err_msg)
|
|
1576
1822
|
# Compute the failed execution time for failed training.
|
|
1577
|
-
|
|
1823
|
+
end_time = time.perf_counter()
|
|
1824
|
+
training_time = round((end_time - start_time), 3)
|
|
1578
1825
|
# Update the model metadata for failed execution.
|
|
1579
|
-
self.__update_model_metadata(model_name, param, "FAIL", training_time, _data_id)
|
|
1826
|
+
self.__update_model_metadata(model_name, param, "FAIL", training_time, end_time, start_time, _data_id)
|
|
1580
1827
|
pass
|
|
1581
1828
|
|
|
1582
1829
|
if self.__parallel_stop_event is not None:
|
|
@@ -1586,14 +1833,14 @@ class _BaseSearch:
|
|
|
1586
1833
|
self.__parallel_stop_event.set()
|
|
1587
1834
|
|
|
1588
1835
|
|
|
1589
|
-
|
|
1590
1836
|
def __update_model_metadata(self, model_name,
|
|
1591
1837
|
param,
|
|
1592
1838
|
status,
|
|
1593
1839
|
training_time,
|
|
1840
|
+
end_time,
|
|
1841
|
+
start_time,
|
|
1594
1842
|
data_id=None,
|
|
1595
|
-
|
|
1596
|
-
eval_values=None):
|
|
1843
|
+
eval_key_values=None):
|
|
1597
1844
|
"""
|
|
1598
1845
|
DESCRIPTION:
|
|
1599
1846
|
Internal function to update the model evaluation details, that are
|
|
@@ -1620,33 +1867,35 @@ class _BaseSearch:
|
|
|
1620
1867
|
* SKIP: Function execution skipped for the chosen parameters.
|
|
1621
1868
|
Types: str
|
|
1622
1869
|
|
|
1623
|
-
data_id:
|
|
1624
|
-
Optional Argument.
|
|
1625
|
-
Specifies the unique data identifier used for model training.
|
|
1626
|
-
Note:
|
|
1627
|
-
* "data_id" is supported for model trainer functions.
|
|
1628
|
-
Types: str
|
|
1629
|
-
|
|
1630
1870
|
training_time:
|
|
1631
1871
|
Required Argument.
|
|
1632
1872
|
Specifies the model training time in seconds for both model trainer
|
|
1633
1873
|
function and non-model trainer function.
|
|
1634
1874
|
Types: float
|
|
1635
1875
|
|
|
1636
|
-
|
|
1876
|
+
end_time:
|
|
1637
1877
|
Required Argument.
|
|
1638
|
-
Specifies the
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
eval_values:
|
|
1878
|
+
Specifies the end time of the model training.
|
|
1879
|
+
Types: float
|
|
1880
|
+
|
|
1881
|
+
start_time:
|
|
1644
1882
|
Required Argument.
|
|
1645
|
-
Specifies the
|
|
1646
|
-
|
|
1647
|
-
function.
|
|
1648
|
-
Types: list of float
|
|
1883
|
+
Specifies the start time of the model training.
|
|
1884
|
+
Types: float
|
|
1649
1885
|
|
|
1886
|
+
data_id:
|
|
1887
|
+
Optional Argument.
|
|
1888
|
+
Specifies the unique data identifier used for model training.
|
|
1889
|
+
Note:
|
|
1890
|
+
* "data_id" is supported for model trainer functions.
|
|
1891
|
+
Types: str
|
|
1892
|
+
|
|
1893
|
+
eval_key_values:
|
|
1894
|
+
Optional Argument.
|
|
1895
|
+
Specifies the evaluation key values retrieved from model evaluation
|
|
1896
|
+
phase. This argument is a required argument for model trainer
|
|
1897
|
+
function.
|
|
1898
|
+
Types: dict
|
|
1650
1899
|
|
|
1651
1900
|
RETURNS:
|
|
1652
1901
|
None
|
|
@@ -1672,17 +1921,21 @@ class _BaseSearch:
|
|
|
1672
1921
|
model_metadata[self.__DATA_ID.upper()] = data_id
|
|
1673
1922
|
|
|
1674
1923
|
# Format the log message that needs to be displayed.
|
|
1675
|
-
_msg = "Model_id:{},Run time:{}s,Status:{}".format(model_name,
|
|
1676
|
-
training_time,
|
|
1924
|
+
_msg = "Model_id:{}, Run time:{}s, Start time:{}, End time:{}, Status:{}".format(model_name,
|
|
1925
|
+
training_time,
|
|
1926
|
+
start_time,
|
|
1927
|
+
end_time,
|
|
1677
1928
|
status)
|
|
1678
1929
|
|
|
1679
|
-
|
|
1680
|
-
if status == "PASS" and self.__is_evaluatable :
|
|
1930
|
+
if status == "PASS" and (self.__is_evaluatable or self.__is_clustering_model):
|
|
1681
1931
|
# While execution status is 'Fail' then update the evaluation result
|
|
1682
1932
|
# with 'None' values.
|
|
1683
|
-
model_scores =
|
|
1933
|
+
model_scores = eval_key_values
|
|
1684
1934
|
model_metadata.update(model_scores)
|
|
1685
|
-
# Add additional model score to the log message.
|
|
1935
|
+
# Add additional model score to the log message.
|
|
1936
|
+
if self.__is_opensource_model and (self.__evaluation_metric is None or self.__evaluation_metric not in model_scores):
|
|
1937
|
+
if "SILHOUETTE" in model_scores:
|
|
1938
|
+
self.__evaluation_metric = "SILHOUETTE"
|
|
1686
1939
|
_msg += ",{}:{}".format(self.__evaluation_metric,round(
|
|
1687
1940
|
model_scores[self.__evaluation_metric], 3))
|
|
1688
1941
|
# Update the best model.
|
|
@@ -1757,18 +2010,46 @@ class _BaseSearch:
|
|
|
1757
2010
|
# identifier is passed.
|
|
1758
2011
|
if not self.__is_trainable or not self.__is_predictable:
|
|
1759
2012
|
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
1760
|
-
|
|
1761
|
-
|
|
2013
|
+
"execute 'predict()'","Not applicable for" \
|
|
2014
|
+
" non-model trainer analytic functions.")
|
|
1762
2015
|
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1763
2016
|
|
|
1764
2017
|
if self.__default_model is None:
|
|
1765
2018
|
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
1766
2019
|
"execute 'predict()'",
|
|
1767
|
-
|
|
1768
|
-
|
|
2020
|
+
"No model is set as default to set a "\
|
|
2021
|
+
"prediction model use the 'set_model()' function.")
|
|
1769
2022
|
|
|
1770
2023
|
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1771
|
-
|
|
2024
|
+
|
|
2025
|
+
test_data = kwargs.get("newdata", None)
|
|
2026
|
+
|
|
2027
|
+
if self.__is_opensource_model and self.__is_clustering_model:
|
|
2028
|
+
if test_data is None:
|
|
2029
|
+
test_data = self.__sampled_df_mapper[self.__best_data_id]["data"]
|
|
2030
|
+
feature_columns = kwargs.get("feature_columns", None)
|
|
2031
|
+
|
|
2032
|
+
# If feature columns are not passed, fetch them from the training data
|
|
2033
|
+
if feature_columns is None:
|
|
2034
|
+
if self.__best_data_id is None:
|
|
2035
|
+
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
2036
|
+
"fetch 'feature_columns'",
|
|
2037
|
+
"No training metadata found")
|
|
2038
|
+
|
|
2039
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
2040
|
+
training_df = self.__sampled_df_mapper[self.__best_data_id]["data"]
|
|
2041
|
+
training_columns = training_df.columns
|
|
2042
|
+
|
|
2043
|
+
feature_columns = [col for col in training_columns]
|
|
2044
|
+
|
|
2045
|
+
return self.__default_model.predict(data=test_data, feature_columns=feature_columns)
|
|
2046
|
+
elif self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
|
|
2047
|
+
if test_data is None:
|
|
2048
|
+
test_data = self.__sampled_df_mapper[self.__best_data_id][1]["data"]
|
|
2049
|
+
y_test = test_data.select([self.__response_column])
|
|
2050
|
+
X_test = test_data.drop(columns=[self.__response_column], axis=1)
|
|
2051
|
+
|
|
2052
|
+
return self.__default_model.predict(X_test, y_test)
|
|
1772
2053
|
# TODO Enable this method, once Merge model supports VAL, and UAF.
|
|
1773
2054
|
return self.__default_model.predict(**kwargs)
|
|
1774
2055
|
|
|
@@ -1963,7 +2244,6 @@ class _BaseSearch:
|
|
|
1963
2244
|
return self.__model_err_records.get(model_id)
|
|
1964
2245
|
|
|
1965
2246
|
|
|
1966
|
-
|
|
1967
2247
|
def set_model(self, model_id):
|
|
1968
2248
|
"""
|
|
1969
2249
|
DESCRIPTION:
|
|
@@ -2046,10 +2326,16 @@ class _BaseSearch:
|
|
|
2046
2326
|
# Raise TeradataMLException error when non-model trainer function
|
|
2047
2327
|
# identifier is passed.
|
|
2048
2328
|
if not self.__is_trainable or not self.__is_evaluatable:
|
|
2049
|
-
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2329
|
+
if not self.__is_clustering_model:
|
|
2330
|
+
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
2331
|
+
"execute 'evaluate()'","Not applicable for" \
|
|
2332
|
+
" non-model trainer analytic functions.")
|
|
2333
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
2334
|
+
else:
|
|
2335
|
+
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
2336
|
+
"execute 'evaluate()'","Not applicable for" \
|
|
2337
|
+
" clustering model functions.")
|
|
2338
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
2053
2339
|
|
|
2054
2340
|
if self.__default_model is None:
|
|
2055
2341
|
err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
|
|
@@ -2058,11 +2344,35 @@ class _BaseSearch:
|
|
|
2058
2344
|
"trained model for evaluation use "\
|
|
2059
2345
|
"the 'set_model()' function.")
|
|
2060
2346
|
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
|
|
2347
|
+
if self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
|
|
2348
|
+
test_data = kwargs.get("newdata", None)
|
|
2349
|
+
|
|
2350
|
+
if test_data is None:
|
|
2351
|
+
test_data = self.__sampled_df_mapper[self.__best_data_id][1]["data"]
|
|
2352
|
+
|
|
2353
|
+
y_test = test_data.select([self.__response_column])
|
|
2354
|
+
X_test = test_data.drop(columns=[self.__response_column], axis=1)
|
|
2355
|
+
|
|
2356
|
+
pred_col = self._get_predict_column()
|
|
2357
|
+
|
|
2358
|
+
output = self.__default_model.predict(X_test,y_test)
|
|
2359
|
+
|
|
2360
|
+
y_true = output.select([self.__response_column])
|
|
2361
|
+
y_pred = output.select([pred_col])
|
|
2362
|
+
|
|
2363
|
+
if self.__is_regression_model:
|
|
2364
|
+
eval_key_values = self._regression_metrics(y_true, y_pred)
|
|
2365
|
+
elif self.__is_classification_model:
|
|
2366
|
+
eval_key_values = self._classification_metrics(y_true, y_pred)
|
|
2367
|
+
|
|
2368
|
+
import pandas as pd
|
|
2369
|
+
result_df = pd.DataFrame([eval_key_values])
|
|
2370
|
+
return result_df
|
|
2371
|
+
else:
|
|
2372
|
+
_params = self.__eval_params if len(kwargs) == 0 else kwargs
|
|
2373
|
+
if self._TRAINABLE_FUNCS_DATA_MAPPER[self.__func_name] not in _params:
|
|
2374
|
+
_params.update(self.__sampled_df_mapper[self.__best_data_id][1])
|
|
2375
|
+
return self.__default_model.evaluate(**_params)
|
|
2066
2376
|
|
|
2067
2377
|
|
|
2068
2378
|
def __populate_parameter_grid(self):
|
|
@@ -2255,6 +2565,8 @@ class _BaseSearch:
|
|
|
2255
2565
|
|
|
2256
2566
|
if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:
|
|
2257
2567
|
self._labeled_data = self._add_data_label()
|
|
2568
|
+
elif self.__is_trainable and self.__is_evaluatable and not self.__is_clustering_model:
|
|
2569
|
+
self._labeled_data = self._add_data_label()
|
|
2258
2570
|
|
|
2259
2571
|
|
|
2260
2572
|
class GridSearch(_BaseSearch):
|
|
@@ -2940,6 +3252,7 @@ class GridSearch(_BaseSearch):
|
|
|
2940
3252
|
* evaluation_metric applicable for model trainer functions.
|
|
2941
3253
|
* Best model is not selected when evaluation returns
|
|
2942
3254
|
non-finite values.
|
|
3255
|
+
* MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML models.
|
|
2943
3256
|
Permitted Values:
|
|
2944
3257
|
* Classification: Accuracy, Micro-Precision, Micro-Recall,
|
|
2945
3258
|
Micro-F1, Macro-Precision, Macro-Recall,
|
|
@@ -3555,6 +3868,7 @@ class RandomSearch(_BaseSearch):
|
|
|
3555
3868
|
* evaluation_metric applicable for model trainer functions.
|
|
3556
3869
|
* Best model is not selected when evaluation returns
|
|
3557
3870
|
non-finite values.
|
|
3871
|
+
* MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML models.
|
|
3558
3872
|
Permitted Values:
|
|
3559
3873
|
* Classification: Accuracy, Micro-Precision, Micro-Recall,
|
|
3560
3874
|
Micro-F1, Macro-Precision, Macro-Recall,
|