PyPI - teradataml - Versions diffs - 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl - Mend

teradataml-20.0.0.6-py3-none-any.whl → teradataml-20.0.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic. Click here for more details.

Files changed (96)

teradataml/README.md +210 -0
teradataml/__init__.py +1 -1
teradataml/_version.py +1 -1
teradataml/analytics/analytic_function_executor.py +162 -76
teradataml/analytics/byom/__init__.py +1 -1
teradataml/analytics/json_parser/__init__.py +2 -0
teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
teradataml/analytics/json_parser/metadata.py +22 -4
teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
teradataml/analytics/sqle/__init__.py +3 -0
teradataml/analytics/utils.py +4 -1
teradataml/automl/__init__.py +2369 -464
teradataml/automl/autodataprep/__init__.py +15 -0
teradataml/automl/custom_json_utils.py +184 -112
teradataml/automl/data_preparation.py +113 -58
teradataml/automl/data_transformation.py +154 -53
teradataml/automl/feature_engineering.py +113 -53
teradataml/automl/feature_exploration.py +548 -25
teradataml/automl/model_evaluation.py +260 -32
teradataml/automl/model_training.py +399 -206
teradataml/clients/auth_client.py +2 -2
teradataml/common/aed_utils.py +11 -2
teradataml/common/bulk_exposed_utils.py +4 -2
teradataml/common/constants.py +62 -2
teradataml/common/garbagecollector.py +50 -21
teradataml/common/messagecodes.py +47 -2
teradataml/common/messages.py +19 -1
teradataml/common/sqlbundle.py +23 -6
teradataml/common/utils.py +116 -10
teradataml/context/aed_context.py +16 -10
teradataml/data/Employee.csv +5 -0
teradataml/data/Employee_Address.csv +4 -0
teradataml/data/Employee_roles.csv +5 -0
teradataml/data/JulesBelvezeDummyData.csv +100 -0
teradataml/data/byom_example.json +5 -0
teradataml/data/creditcard_data.csv +284618 -0
teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
teradataml/data/load_example_data.py +29 -11
teradataml/data/payment_fraud_dataset.csv +10001 -0
teradataml/data/teradataml_example.json +67 -0
teradataml/dataframe/copy_to.py +714 -54
teradataml/dataframe/dataframe.py +1153 -33
teradataml/dataframe/dataframe_utils.py +8 -3
teradataml/dataframe/functions.py +168 -1
teradataml/dataframe/setop.py +4 -1
teradataml/dataframe/sql.py +141 -9
teradataml/dbutils/dbutils.py +470 -35
teradataml/dbutils/filemgr.py +1 -1
teradataml/hyperparameter_tuner/optimizer.py +456 -142
teradataml/lib/aed_0_1.dll +0 -0
teradataml/lib/libaed_0_1.dylib +0 -0
teradataml/lib/libaed_0_1.so +0 -0
teradataml/lib/libaed_0_1_aarch64.so +0 -0
teradataml/scriptmgmt/UserEnv.py +234 -34
teradataml/scriptmgmt/lls_utils.py +43 -17
teradataml/sdk/_json_parser.py +1 -1
teradataml/sdk/api_client.py +9 -6
teradataml/sdk/modelops/_client.py +3 -0
teradataml/series/series.py +12 -7
teradataml/store/feature_store/constants.py +601 -234
teradataml/store/feature_store/feature_store.py +2886 -616
teradataml/store/feature_store/mind_map.py +639 -0
teradataml/store/feature_store/models.py +5831 -214
teradataml/store/feature_store/utils.py +390 -0
teradataml/table_operators/table_operator_util.py +1 -1
teradataml/table_operators/templates/dataframe_register.template +6 -2
teradataml/table_operators/templates/dataframe_udf.template +6 -2
teradataml/utils/docstring.py +527 -0
teradataml/utils/dtypes.py +93 -0
teradataml/utils/internal_buffer.py +2 -2
teradataml/utils/utils.py +41 -2
teradataml/utils/validators.py +694 -17
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0

teradataml/hyperparameter_tuner/optimizer.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright 2023 Teradata. All rights reserved.
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Kesavaragavan B (kesavaragavan.b@Teradata.com)
@@ -87,7 +87,24 @@ class _BaseSearch:
                                              "SVM": "newdata", "XGBoost": "newdata",
                                              "NaiveBayesTextClassifierTrainer": "newdata",
                                              "DecisionTree": "data", "KMeans": "data",
-                                             "LinReg": "data", "LogReg": "data", "PCA": "data"}
+                                             "LinReg": "data", "LogReg": "data", "PCA": "data",
+                                             "LinearRegression": "data", "Lasso": "data",
+                                             "Ridge": "data", "ARDRegression": "data",
+                                             "BayesianRidge": "data", "TweedieRegressor": "data",
+                                             "TheilSenRegressor": "data", "SGDRegressor": "data",
+                                             "RidgeCV": "data", "RANSACRegressor": "data",
+                                             "PoissonRegressor": "data", "PassiveAggressiveRegressor": "data",
+                                             "OrthogonalMatchingPursuitCV": "data", "OrthogonalMatchingPursuit": "data",
+                                             "MultiTaskLassoCV": "data", "MultiTaskLasso": "data",
+                                             "MultiTaskElasticNetCV": "data", "MultiTaskElasticNet": "data",
+                                             "LassoLarsIC": "data", "LassoLarsCV": "data", "LassoLars": "data",
+                                             "LassoCV": "data", "LarsCV": "data", "Lars": "data",
+                                             "HuberRegressor": "data", "GammaRegressor": "data",
+                                             "ElasticNetCV": "data", "ElasticNet": "data",
+                                             "LogisticRegression": "data", "RidgeClassifier": "data",
+                                             "RidgeClassifierCV": "data", "SGDClassifier": "data",
+                                             "PassiveAggressiveClassifier": "data", "Perceptron": "data",
+                                             "LogisticRegressionCV": "data"}
         self._UAF_TRAINABLE_FUNCS = {"ArimaEstimate", "LinearRegr", "MAMean",
                                      "MultivarRegr", "SimpleExp"}
@@ -120,8 +137,34 @@ class _BaseSearch:
                                    'MACRO-F1': True,
                                    'WEIGHTED-PRECISION': True,
                                    'WEIGHTED-RECALL': True,
-                                   'WEIGHTED-F1': True}
+                                   'WEIGHTED-F1': True,
+                                   'SILHOUETTE': True,
+                                   'CALINSKI': True,
+                                   'DAVIES': True}
+        # OpenSource ML function comparator (excluding MPD, MGD, MTD, RMSE, RMSLE)
+        self.__osml_func_comparator = {k: v for k, v in self.__func_comparator.items()
+                                       if k not in ['MPD', 'MGD', 'MTD', 'RMSE', 'RMSLE']}
+        # Linear model categorization lists for sklearn models
+        self._LINEAR_REGRESSION_MODELS = {
+            "ARDRegression", "BayesianRidge", "TweedieRegressor", "TheilSenRegressor",
+            "SGDRegressor", "RidgeCV", "Ridge", "RANSACRegressor", "PoissonRegressor",
+            "PassiveAggressiveRegressor", "OrthogonalMatchingPursuitCV", "OrthogonalMatchingPursuit",
+            "MultiTaskLassoCV", "MultiTaskLasso", "MultiTaskElasticNetCV", "MultiTaskElasticNet",
+            "LinearRegression", "LassoLarsIC", "LassoLarsCV", "LassoLars", "LassoCV",
+            "Lasso", "LarsCV", "Lars", "HuberRegressor", "GammaRegressor",
+            "ElasticNetCV", "ElasticNet"
+        }
+        self._LINEAR_CLASSIFICATION_MODELS = {
+            "SGDClassifier", "RidgeClassifierCV", "RidgeClassifier", "Perceptron",
+            "PassiveAggressiveClassifier", "LogisticRegressionCV", "LogisticRegression"
+        }
+        self._CLUSTERING_MODELS = {
+            "KMeans", "GaussianMixture"
+        }
         self.__func = func
         self.__params = params
         # "self.__best_model" contains best model.
@@ -178,47 +221,67 @@ class _BaseSearch:
         # '__parallel_stop_event' is used to stop threads in parallel execution.
         self.__parallel_stop_event = None
-        # Get the function name.
-        self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
-                                                 else func.__name__
         # Set the function feature type and supported functionality.
         self.__is_sqle_function = False
         self.__is_uaf_function = False
         self.__is_val_function = True if "valib" in str(self.__func.__module__)\
                                       else False
-        if self.__func_name in self._VAL_TRAINABLE_FUNCS and self.__is_val_function:
-            # TODO: Enable these feature once merge model supports VAL functions.
-            # This case is for VAL model trainer functions.
-            self.__is_trainable = self.__is_evaluatable = \
-                                  self.__is_predictable = False
-        elif self.__func_name in self._UAF_TRAINABLE_FUNCS:
-            # TODO: Enable these feature once merge model supports UAF functions.
-            # This case is for UAF model trainer functions.
-            self.__is_uaf_function = self.__is_trainable = \
-                                     self.__is_evaluatable = False
-            self.__is_predictable = False
-        elif self.__func_name in self._SQLE_TRAINABLE_FUNCS:
-            # This case is for SQLE model trainer functions.
-            self.__is_sqle_function = self.__is_trainable = \
-            self.__is_evaluatable = self.__is_predictable = True
+        self.__is_opensource_model = False
+        self.__is_clustering_model = False
+        self.__is_regression_model = False
+        self.__is_classification_model = False
+        self.model_id_counter = {}
+        # Import sklearn wrapper class for proper type checking
+        from teradataml.opensource._sklearn import _SkLearnObjectWrapper
+        if hasattr(func, "modelObj") and isinstance(func, _SkLearnObjectWrapper):
+            self.__is_opensource_model = True
+            self.__is_trainable = True
+            self.__is_evaluatable = True
+            self.__is_predictable = True
+            # Set the function name and class
+            self.__func_name = func.modelObj.__class__.__name__   # e.g., 'KMeans'
+            self.__func = func.__class__
+            if self.__func_name in self._CLUSTERING_MODELS:
+                self.__is_clustering_model = True
+                self.__is_evaluatable = False
+            elif self.__func_name in self._LINEAR_REGRESSION_MODELS:
+                self.__is_regression_model = True
+            elif self.__func_name in self._LINEAR_CLASSIFICATION_MODELS:
+                self.__is_classification_model = True
         else:
-            # This case is for non-model trainer functions.
-            self.__is_trainable = self.__is_evaluatable = \
-                                  self.__is_predictable = False
-        # Unsupervised model cannot perform evaluation. So, disable evaluation
-        # functionality.
-        self.__is_evaluatable = False if not self.__is_evaluatable or \
-                                self.__func_name in self.__US_TRAINABLE_FUNCS else \
-                                True
+            self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
+                                                     else func.__name__
+            if self.__func_name in self._VAL_TRAINABLE_FUNCS and self.__is_val_function:
+                # TODO: Enable these feature once merge model supports VAL functions.
+                # This case is for VAL model trainer functions.
+                self.__is_trainable = self.__is_evaluatable = \
+                                    self.__is_predictable = False
+            elif self.__func_name in self._UAF_TRAINABLE_FUNCS:
+                # TODO: Enable these feature once merge model supports UAF functions.
+                # This case is for UAF model trainer functions.
+                self.__is_uaf_function = self.__is_trainable = \
+                                        self.__is_evaluatable = False
+                self.__is_predictable = False
+            elif self.__func_name in self._SQLE_TRAINABLE_FUNCS:
+                # This case is for SQLE model trainer functions.
+                self.__is_sqle_function = self.__is_trainable = \
+                self.__is_evaluatable = self.__is_predictable = True
+            else:
+                # This case is for non-model trainer functions.
+                self.__is_trainable = self.__is_evaluatable = \
+                                    self.__is_predictable = False
+            self.__is_evaluatable = False if not self.__is_evaluatable or \
+                                    self.__func_name in self.__US_TRAINABLE_FUNCS else \
+                                    True
         # Set train routine based on model type.
         # Non-model trainer routine is used for unsupervised model function training.
         self._execute_fit = self.__model_trainer_routine if self.__is_trainable \
-                            and self.__is_evaluatable else \
+                            and (self.__is_evaluatable or self.__is_clustering_model) else \
                             self.__non_model_trainer_routine
         # Utility lambda functions.
@@ -266,6 +329,9 @@ class _BaseSearch:
         self._get_model_trainer_train_data_arg = lambda : "train_data" if \
                                                  self.__func_name == "KNN" else "data"
+        # '_get_predict_column' function is used to generate prediction column name.
+        self._get_predict_column = lambda: f"{self.__func_name.lower()}_predict_1"
         if self.__is_trainable and "data" in self.__params:
             data = self.__params.pop("data")
             self.__validate_model_trainer_input_data_argument(data, False)
@@ -545,7 +611,6 @@ class _BaseSearch:
         """
         return self.__sampled_df_mapper[self.__best_data_id]
     @property
     def best_data_id(self):
         """
@@ -592,7 +657,7 @@ class _BaseSearch:
         """
-        if not self.__is_evaluatable:
+        if not (self.__is_evaluatable or self.__is_clustering_model):
             # Raise error when "model_stats" attribute accessed for non-executable
             # functions.
             err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
@@ -635,7 +700,6 @@ class _BaseSearch:
         return self.__model_stats
     def is_running(self):
         """
         DESCRIPTION:
@@ -665,7 +729,6 @@ class _BaseSearch:
         # both parallel and sequential execution.
         return self.__is_model_training_completed()
     def _add_data_label(self, arg_name=None):
         """
         DESCRIPTION:
@@ -765,7 +828,6 @@ class _BaseSearch:
         return _labeled_data
     def __perform_train_test_sampling(self, data, frac, stratify_column=None,
                                       sample_id_column=None, sample_seed=None):
         """
@@ -995,8 +1057,71 @@ class _BaseSearch:
             # Validate DataFrames.
             arg_info_matrix.append(["data", data, is_optional_arg, (DataFrame)])
         _Validators._validate_function_arguments(arg_info_matrix)
+    def _regression_metrics(self, y_true, y_pred):
+        """
+        DESCRIPTION:
+            Internal function to compute the regression evaluation metrics
+            of an OpenSourceML model using the metric functions exposed by
+            "td_sklearn".
+
+        PARAMETERS:
+            y_true:
+                Required Argument.
+                Specifies the actual response values.
+                Note:
+                    Passed through unchanged to the "td_sklearn" metric
+                    functions; no validation is done here.
+
+            y_pred:
+                Required Argument.
+                Specifies the predicted response values.
+                Note:
+                    Passed through unchanged to the "td_sklearn" metric
+                    functions; no validation is done here.
+
+        RETURNS:
+            dict, with metric names "MAE", "MSE", "MSLE", "MAPE", "R2", "EV",
+            "ME" and "MAD" as keys (in that order) and the computed scores as
+            values. "MSLE" is set to the string "NA" when it cannot be
+            computed.
+
+        RAISES:
+            Any exception raised by the "td_sklearn" metric functions, except
+            for failures of "mean_squared_log_error", which are reported as
+            "NA".
+
+        EXAMPLES:
+            >>> self._regression_metrics(y_true=y_true, y_pred=y_pred)
+        """
+        from teradataml import td_sklearn as skl
+        ME = skl.max_error(y_true=y_true, y_pred=y_pred)
+        MAE = skl.mean_absolute_error(y_true=y_true, y_pred=y_pred)
+        # "squared=False" makes mean_squared_error return RMSE, which would be
+        # reported under the "MSE" key. Use the default so the value matches
+        # its label; RMSE is not a supported metric for OpenSourceML models.
+        MSE = skl.mean_squared_error(y_true=y_true, y_pred=y_pred)
+        try:
+            MSLE = skl.mean_squared_log_error(y_true=y_true, y_pred=y_pred)
+        except Exception:
+            # Best effort: MSLE cannot always be computed, so report it as
+            # not available instead of failing the whole evaluation.
+            # "Exception" is caught rather than everything, so that
+            # KeyboardInterrupt/SystemExit still propagate.
+            MSLE = "NA"
+        MAPE = skl.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
+        R2 = skl.r2_score(y_true=y_true, y_pred=y_pred)
+        EV = skl.explained_variance_score(y_true=y_true, y_pred=y_pred)
+        MAD = skl.median_absolute_error(y_true=y_true, y_pred=y_pred)
+        # TODO: Support for MPD, MGD, MTD (mean_poisson_deviance,
+        #       mean_gamma_deviance, mean_tweedie_deviance) will be added in
+        #       next phase.
+        #       Support for RMSE, RMSLE will be added after OpenSourceML
+        #       scikit-learn version update as it requires higher
+        #       version(>1.1.3).
+        keys = ["MAE", "MSE", "MSLE", "MAPE", "R2", "EV", "ME", "MAD"]
+        values = [MAE, MSE, MSLE, MAPE, R2, EV, ME, MAD]
+        return dict(zip(keys, values))
+    def _classification_metrics(self, y_true, y_pred):
+        """
+        DESCRIPTION:
+            Internal function to compute the classification evaluation
+            metrics of an OpenSourceML model using the metric functions
+            exposed by "td_sklearn".
+
+        PARAMETERS:
+            y_true:
+                Required Argument.
+                Specifies the actual response values.
+                Note:
+                    Passed through unchanged to the "td_sklearn" metric
+                    functions; no validation is done here.
+
+            y_pred:
+                Required Argument.
+                Specifies the predicted response values.
+                Note:
+                    Passed through unchanged to the "td_sklearn" metric
+                    functions; no validation is done here.
+
+        RETURNS:
+            dict, with "ACCURACY" followed by "<AVERAGE>-PRECISION",
+            "<AVERAGE>-RECALL" and "<AVERAGE>-F1" for each of the MICRO,
+            MACRO and WEIGHTED averages as keys, and the computed scores as
+            values.
+
+        EXAMPLES:
+            >>> self._classification_metrics(y_true=y_true, y_pred=y_pred)
+        """
+        from teradataml import td_sklearn as skl
+        # Accuracy comes first, then precision/recall/F1 per averaging mode.
+        report = {"ACCURACY": skl.accuracy_score(y_true=y_true, y_pred=y_pred)}
+        for averaging in ("micro", "macro", "weighted"):
+            for metric in ("precision", "recall", "f1"):
+                # Resolve e.g. "precision_score" right before calling it.
+                scorer = getattr(skl, f"{metric}_score")
+                report[f"{averaging.upper()}-{metric.upper()}"] = scorer(
+                    y_true=y_true, y_pred=y_pred, average=averaging)
+        return report
     def fit(self,
             data=None,
             evaluation_metric=None,
@@ -1051,6 +1176,7 @@ class _BaseSearch:
                     * evaluation_metric applicable for model trainer functions.
                     * Best model is not selected when evaluation returns
                       non-finite values.
+                    * MPD, MGD, MTD, RMSE, RMSLE are not supported for OpenSourceML models.
                 Permitted Values:
                     * Classification: Accuracy, Micro-Precision, Micro-Recall,
                                       Micro-F1, Macro-Precision, Macro-Recall,
@@ -1059,10 +1185,11 @@ class _BaseSearch:
                                       Weighted-F1.
                     * Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
                                   R2, EV, MPD, MGD
+                    * Clustering: SILHOUETTE, CALINSKI, DAVIES
                 Default Value:
                     * Classification: Accuracy
                     * Regression: MAE
+                    * Clustering: SILHOUETTE
                 Types: str
             early_stop:
@@ -1241,7 +1368,9 @@ class _BaseSearch:
         arg_info_matrix.append(["run_parallel", run_parallel, True, (bool)])
         arg_info_matrix.append(["wait", wait, True, (bool)])
         arg_info_matrix.append(["evaluation_metric", evaluation_metric, True,
-                                (str), True, list(self.__func_comparator)])
+                                (str), True, list(self.__osml_func_comparator)
+                                if self.__is_opensource_model
+                                else list(self.__func_comparator)])
         arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
         arg_info_matrix.append(["max_time", max_time, True, (int, float)])
@@ -1260,8 +1389,8 @@ class _BaseSearch:
             # When "evaluation_metric" is 'MPE' then use the spl comparators.
             if self.__evaluation_metric == "MPE":
-                self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator
+                self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator
             if not isinstance(self.__model_trainer_input_data, dict):
                 # Sample all the labeled data for model training and testing.
                 self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
@@ -1277,6 +1406,27 @@ class _BaseSearch:
             self.__eval_params = kwargs if self.__is_evaluatable else None
+        elif self.__is_trainable and self.__is_opensource_model:
+            if self.__is_clustering_model:
+                self.__sampled_df_mapper = self._add_data_label("data")
+                # Update model trainer function parameter grid.
+                self.__update_model_parameters()
+            elif self.__is_regression_model or self.__is_classification_model:
+                # Open-source regression/classification model: perform train-test split
+                if not isinstance(self.__model_trainer_input_data, dict):
+                    self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
+                                                    sample_id_column, sample_seed)
+                elif isinstance(self.__model_trainer_input_data, dict):
+                    self.__perform_train_test_sampling(self.__model_trainer_input_data, frac,
+                                                    stratify_column, sample_id_column,
+                                                    sample_seed)
+                #  Set evaluation parameters for supervised models
+                self.__eval_params = kwargs if self.__is_evaluatable else None
+            self.__update_model_parameters()
         elif self.__is_trainable and not self.__is_evaluatable:
             # This condition identifies unsupervised model trainer function.
             # Let's process training data.
@@ -1285,13 +1435,14 @@ class _BaseSearch:
             self.__sampled_df_mapper = self._add_data_label("data")
             # Update model trainer function parameter grid.
             self.__update_model_parameters()
         # Initialize logging.
         if verbose > 0:
             self.__progress_bar = _ProgressBar(jobs=len(self._parameter_grid), verbose=verbose)
         # With VT option Parallel execution won't be possible, as it opens multiple connections.
         if not run_parallel or configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
             # Setting start time of Sequential execution.
             self.__start_time = time.time() if self.__timeout is not None else None
             # TODO: Factorize the code once parallel execution part is completed in ELE-6154 JIRA.
             # Execute all parameters from populated parameter grid for both trainable
@@ -1301,7 +1452,7 @@ class _BaseSearch:
                 # Condition to check early stop feature applicable for model
                 # trainer function.
-                if self.__early_stop is not None and self.__is_evaluatable:
+                if self.__early_stop is not None and (self.__is_evaluatable or self.__is_clustering_model):
                     if self.__is_finite and self._is_early_stoppable():
                         # Terminate HPT execution when the trained model attains the
                         # given "early_stop" value.
@@ -1390,28 +1541,44 @@ class _BaseSearch:
         EXAMPLES:
             >>> self.__model_trainer_routine(param=param, iter=iter, **kwargs)
         """
         # Define model name used for model metadata.
         model_name = self._generate_model_name(iter)
         # Get the unique data identifier present in "model_param".
         _data_id = model_param[self.__DATA_ID]
         # 'param' variable holds model training parameters and train dataframe.
         # Get the model training parameters.
-        param = model_param["param"]
+        if self.__is_opensource_model:
+            param_outer = model_param.get("param", {})
+            param = param_outer.get("param", param_outer)
+            data_input = param.pop("data", None)
+            param = {k: v for k, v in param.items() if k != "data"}
+        else:
+            param = model_param["param"]
+            data_input = None
         # Check the stop_event set or not
         if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
             # Update the model metadata for Skip execution.
-            self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
+            self.__update_model_metadata(model_name, param, "SKIP", 0, 0, 0, _data_id)
             return
         # Retrieve the train and test data using data identifier.
-        _train_data, _test_data =  self.__sampled_df_mapper[_data_id]
-        # Update model training argument with train DataFrame.
-        param.update(_train_data)
-        # Update the test DataFrame for model evaluation.
-        kwargs.update(_test_data)
+        if self.__is_opensource_model:
+            if self.__is_clustering_model:
+                _train_data = self.__sampled_df_mapper[_data_id]
+                _test_data = {}  # No label needed
+            elif self.__is_regression_model or self.__is_classification_model:
+                _train_data, _test_data = self.__sampled_df_mapper[_data_id]
+                kwargs.update(_test_data)
+        else:
+            _train_data, _test_data =  self.__sampled_df_mapper[_data_id]
+            # Update model training argument with train DataFrame.
+            param.update(_train_data)
+            # Update the test DataFrame for model evaluation.
+            kwargs.update(_test_data)
         try:
             # Record starting time of model training.
@@ -1421,44 +1588,122 @@ class _BaseSearch:
                 # using getattr method.
                 self.__func = valib.__getattr__(self.__func_name)
             # Train the model.
-            func_obj = self.__func(**param)
-            # Evaluate the trained model.
-            evaluations = func_obj.evaluate(**kwargs)
+            if self.__is_opensource_model:
+                from teradataml import td_sklearn as skl
+                func_class = getattr(skl, self.__func_name)  # e.g., skl.KMeans
+                if self.__is_regression_model or self.__is_classification_model:
+                    # Extract and remove these only for regression and classification models
+                    self.__input_columns = param.pop("input_columns", None)
+                    self.__response_column = param.pop("response_column", None)
+                func_obj = func_class(**param)  # Safely create model instance
+            else:
+                func_obj = self.__func(**param)
+            end_time = time.perf_counter()
+            training_time = round((end_time - start_time), 3)
             # Store the trained object.
             self.__trained_models[model_name] = func_obj
-            # Process training time.
-            training_time = round((time.perf_counter() - start_time), 3)
+            if self.__is_opensource_model and self.__is_clustering_model:
+                start_time_cluster = time.perf_counter()
+                from teradataml import td_sklearn as skl
+                feature_cols = [col for col in _train_data["data"].columns]
+                func_obj.fit(data=_train_data["data"], feature_columns=feature_cols)
+                pred_col = self._get_predict_column()
+                result = func_obj.predict(data=_train_data["data"], feature_columns=feature_cols)
+                result.materialize()
-            # Extract evaluations report in dictionary format.
-            if "RegressionEvaluator" in type(evaluations).__name__:
-                # RegressionEvaluator results are stored under "result" attribute.
-                # "result" dataframe column names are metrics and corresponding
-                # rows are evaluation values.
-                columns = evaluations.result.keys()
-                eval_values = evaluations.result.get_values()[0]
+                silhouette = skl.silhouette_score(
+                    X=result.select(feature_cols),
+                    labels=result.select([pred_col])
+                )
+                calinski = skl.calinski_harabasz_score(
+                    X=result.select(feature_cols),
+                    labels=result.select([pred_col])
+                )
+                davies = skl.davies_bouldin_score(
+                    X=result.select(feature_cols),
+                    labels=result.select([pred_col])
+                )
+                columns = ["SILHOUETTE", "CALINSKI", "DAVIES"]
+                eval_values = [silhouette, calinski, davies]
+                eval_key_values = dict(zip(columns, eval_values))
+                end_time_cluster = time.perf_counter()
+                training_time_cluster = round((end_time_cluster - start_time_cluster), 3)
-                # Default evaluation metric is set to "MAE" for Regression models.
                 if self.__evaluation_metric is None:
-                    self.__evaluation_metric = "MAE"
+                    self.__evaluation_metric = "SILHOUETTE"
+                self.__update_model_metadata(model_name, param, "PASS", training_time_cluster,
+                                             end_time_cluster, start_time_cluster, _data_id, eval_key_values)
+            elif self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
+                start_time_lin = time.perf_counter()
+                train_df = _train_data["data"]
+                y = train_df.select([self.__response_column])
+                X = train_df.drop(columns=[self.__response_column], axis=1)
+                func_obj.fit(X,y)
+                pred_col = self._get_predict_column()
+                output = func_obj.predict(X,y)
+                y_true = output.select([self.__response_column])
+                y_pred = output.select([pred_col])
+                if self.__is_regression_model:
+                    eval_key_values = self._regression_metrics(y_true, y_pred)
+                    if self.__evaluation_metric is None:
+                            self.__evaluation_metric = "MAE"
+                elif self.__is_classification_model:
+                    eval_key_values = self._classification_metrics(y_true, y_pred)
+                    if self.__evaluation_metric is None:
+                        self.__evaluation_metric = "ACCURACY"
+                end_time_lin = time.perf_counter()
+                training_time_lin = round((end_time_lin - start_time_lin), 3)
+                self.__update_model_metadata(model_name, param, "PASS", training_time_lin,
+                                                end_time_lin, start_time_lin, _data_id, eval_key_values)
             else:
-                # ClassificationEvaluator results are stored under "output_data"
-                # attribute. "output_data" dataframe 'column 1' contains metrics
-                # and 'column 2' holds corresponding evaluation values.
-                eval_report = evaluations.output_data.get_values().transpose()
-                columns = eval_report[1].astype('str')
-                columns = [column_name.upper() for column_name in columns]
-                eval_values = eval_report[2]
-                # Default evaluation metric is set to "ACCURACY" for
-                # classification models.
-                if self.__evaluation_metric is None:
-                    self.__evaluation_metric = "ACCURACY"
-            # Update the model metadata for successful model training.
-            self.__update_model_metadata(model_name, param, "PASS",
-                                         training_time, _data_id,
-                                         columns, eval_values)
+                # Evaluate the trained model.
+                evaluations = func_obj.evaluate(**kwargs)
+                # Extract evaluations report in dictionary format.
+                if "RegressionEvaluator" in type(evaluations).__name__:
+                    # RegressionEvaluator results are stored under "result" attribute.
+                    # "result" dataframe column names are metrics and corresponding
+                    # rows are evaluation values.
+                    columns = evaluations.result.keys()
+                    eval_values = evaluations.result.get_values()[0]
+                    # Default evaluation metric is set to "MAE" for Regression models.
+                    if self.__evaluation_metric is None:
+                        self.__evaluation_metric = "MAE"
+                else:
+                    # ClassificationEvaluator results are stored under "output_data"
+                    # attribute. "output_data" dataframe 'column 1' contains metrics
+                    # and 'column 2' holds corresponding evaluation values.
+                    eval_report = evaluations.output_data.get_values().transpose()
+                    columns = eval_report[1].astype('str')
+                    columns = [column_name.upper() for column_name in columns]
+                    eval_values = eval_report[2]
+                    # Default evaluation metric is set to "ACCURACY" for
+                    # classification models.
+                    if self.__evaluation_metric is None:
+                        self.__evaluation_metric = "ACCURACY"
+                # Combine columns and eval_values into a dictionary
+                eval_key_values = dict(zip(columns, eval_values))
+                # Update the model metadata for successful model training.
+                self.__update_model_metadata(model_name, param, "PASS",
+                                             training_time, end_time, start_time,
+                                             _data_id, eval_key_values)
             # Check whether self.__parallel_stop_event is None or not
             if self.__parallel_stop_event is not None:
@@ -1468,18 +1713,18 @@ class _BaseSearch:
                 if (self.__early_stop is not None and self._is_early_stoppable())\
                     or (self.__timeout is not None and self._is_time_stoppable()):
                     self.__parallel_stop_event.set()
         except Exception as _err_msg:
             # Record error message with corresponding "model_name".
             self.__model_err_records[model_name] = str(_err_msg)
             # Compute the failed execution time for failed training.
-            training_time = round((time.perf_counter() - start_time), 3)
+            end_time = time.perf_counter()
+            training_time = round((end_time - start_time), 3)
             # Update the model metadata for failed execution.
-            self.__update_model_metadata(model_name, param, "FAIL", training_time,
-                                         _data_id)
+            self.__update_model_metadata(model_name, param, "FAIL", training_time,
+                                         end_time, start_time, _data_id)
             pass
     def __non_model_trainer_routine(self, model_param, iter, **kwargs):
         """
         DESCRIPTION:
@@ -1549,7 +1794,7 @@ class _BaseSearch:
         # Check the stop_event set or not
         if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
             # Update the model metadata for Skip execution.
-            self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
+            self.__update_model_metadata(model_name, param, "SKIP", 0, 0, 0, _data_id)
             return
         try:
             # Record starting time of model training.
@@ -1566,17 +1811,19 @@ class _BaseSearch:
             self.__trained_models[model_name] = func_obj
             # Process training time.
-            training_time = round((time.perf_counter() - start_time), 3)
+            end_time = time.perf_counter()
+            training_time = round((end_time - start_time), 3)
             # Update the model metadata for successful model training.
-            self.__update_model_metadata(model_name, param, "PASS", training_time, _data_id)
+            self.__update_model_metadata(model_name, param, "PASS", training_time, end_time, start_time, _data_id)
         except Exception as _err_msg:
             # Record error message with corresponding "model_name".
             self.__model_err_records[model_name] = str(_err_msg)
             # Compute the failed execution time for failed training.
-            training_time = round((time.perf_counter() - start_time), 3)
+            end_time = time.perf_counter()
+            training_time = round((end_time - start_time), 3)
             # Update the model metadata for failed execution.
-            self.__update_model_metadata(model_name, param, "FAIL", training_time, _data_id)
+            self.__update_model_metadata(model_name, param, "FAIL", training_time, end_time, start_time, _data_id)
             pass
         if self.__parallel_stop_event is not None:
@@ -1586,14 +1833,14 @@ class _BaseSearch:
                 self.__parallel_stop_event.set()
     def __update_model_metadata(self, model_name,
                                 param,
                                 status,
                                 training_time,
+                                end_time,
+                                start_time,
                                 data_id=None,
-                                columns=None,
-                                eval_values=None):
+                                eval_key_values=None):
         """
         DESCRIPTION:
             Internal function to update the model evaluation details, that are
@@ -1620,33 +1867,35 @@ class _BaseSearch:
                     * SKIP: Function execution skipped for the chosen parameters.
                 Types: str
-            data_id:
-                Optional Argument.
-                Specifies the unique data identifier used for model training.
-                Note:
-                    * "data_id" is supported for model trainer functions.
-                Types: str
             training_time:
                 Required Argument.
                 Specifies the model training time in seconds for both model trainer
                 function and non-model trainer function.
                 Types: float
-            columns:
+            end_time:
                 Optional Argument.
-                Specifies the column names retrieved from model evaluation
-                phase. This argument is a required argument for model trainer
-                function.
-                Types: list of string
-            eval_values:
+                Specifies the end time of the model training.
+                Types: float
+            start_time:
                 Optional Argument.
-                Specifies the evaluation results retrieved from model evaluation
-                phase. This argument is a required argument for model trainer
-                function.
-                Types: list of float
+                Specifies the start time of the model training.
+                Types: float
+            data_id:
+                Optional Argument.
+                Specifies the unique data identifier used for model training.
+                Note:
+                    * "data_id" is supported for model trainer functions.
+                Types: str
+            eval_key_values:
+                Optional Argument.
+                Specifies the evaluation key values retrieved from model evaluation
+                phase. This argument is a required argument for model trainer
+                function.
+                Types: dict.
         RETURNS:
             None
@@ -1672,17 +1921,21 @@ class _BaseSearch:
             model_metadata[self.__DATA_ID.upper()] = data_id
         # Format log message needs to displayed.
-        _msg = "Model_id:{},Run time:{}s,Status:{}".format(model_name,
-                                                       training_time,
+        _msg = "Model_id:{}, Run time:{}s, Start time:{}, End time:{}, Status:{}".format(model_name,
+                                                       training_time,
+                                                       start_time,
+                                                       end_time,
                                                        status)
-        if status == "PASS" and self.__is_evaluatable :
+        if status == "PASS" and (self.__is_evaluatable or self.__is_clustering_model):
             # While execution status is 'Fail' then update the evaluation result
             # with 'None' values.
-            model_scores = dict(zip(columns, eval_values))
+            model_scores = eval_key_values
             model_metadata.update(model_scores)
-            # Add additional model score to the log message.
+            # Add additional model score to the log message.
+            if self.__is_opensource_model and (self.__evaluation_metric is None or self.__evaluation_metric not in model_scores):
+                if "SILHOUETTE" in model_scores:
+                    self.__evaluation_metric = "SILHOUETTE"
             _msg += ",{}:{}".format(self.__evaluation_metric,round(
                                     model_scores[self.__evaluation_metric], 3))
             # Best model updation.
@@ -1757,18 +2010,46 @@ class _BaseSearch:
         # identifier is passed.
         if not self.__is_trainable or not self.__is_predictable:
             err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
-                                      "execute 'predict()'","Not applicable for" \
-                                      " non-model trainer analytic functions.")
+                                       "execute 'predict()'","Not applicable for" \
+                                       " non-model trainer analytic functions.")
             raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
         if self.__default_model is None:
             err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                        "execute 'predict()'",
-                                      "No model is set as default to set a "\
-                                      "prediction model use the 'set_model()' function.")
+                                       "No model is set as default to set a "\
+                                       "prediction model use the 'set_model()' function.")
             raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+        test_data = kwargs.get("newdata", None)
+        if self.__is_opensource_model and self.__is_clustering_model:
+            if test_data is None:
+                test_data = self.__sampled_df_mapper[self.__best_data_id]["data"]
+            feature_columns = kwargs.get("feature_columns", None)
+            # If feature columns not passed, fetch from training data
+            if feature_columns is None:
+                if self.__best_data_id is None:
+                    err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
+                                               "fetch 'feature_columns'",
+                                               "No training metadata found")
+                    raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+                training_df = self.__sampled_df_mapper[self.__best_data_id]["data"]
+                training_columns = training_df.columns
+                feature_columns = [col for col in training_columns]
+            return self.__default_model.predict(data=test_data, feature_columns=feature_columns)
+        elif self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
+            if test_data is None:
+                test_data = self.__sampled_df_mapper[self.__best_data_id][1]["data"]
+            y_test = test_data.select([self.__response_column])
+            X_test = test_data.drop(columns=[self.__response_column], axis=1)
+            return self.__default_model.predict(X_test, y_test)
         # TODO Enable this method, once Merge model supports VAL, and UAF.
         return self.__default_model.predict(**kwargs)
@@ -1963,7 +2244,6 @@ class _BaseSearch:
         return  self.__model_err_records.get(model_id)
     def set_model(self, model_id):
         """
         DESCRIPTION:
@@ -2046,10 +2326,16 @@ class _BaseSearch:
         # Raise TeradataMLException error when non-model trainer function
         # identifier is passed.
         if not self.__is_trainable or not self.__is_evaluatable:
-            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
-                                      "execute 'evaluate()'","Not applicable for" \
-                                      " non-model trainer analytic functions.")
-            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+            if not self.__is_clustering_model:
+                err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
+                                          "execute 'evaluate()'","Not applicable for" \
+                                          " non-model trainer analytic functions.")
+                raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+            else:
+                err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
+                                          "execute 'evaluate()'","Not applicable for" \
+                                          " clustering model functions.")
+                raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
         if self.__default_model is None:
             err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
@@ -2058,11 +2344,35 @@ class _BaseSearch:
                                       "trained model for evaluation use "\
                                       "the 'set_model()' function.")
             raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
-        _params = self.__eval_params if len(kwargs) == 0 else kwargs
-        if self._TRAINABLE_FUNCS_DATA_MAPPER[self.__func_name] not in _params:
-            _params.update(self.__sampled_df_mapper[self.__best_data_id][1])
-        return self.__default_model.evaluate(**_params)
+        if self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
+            test_data = kwargs.get("newdata", None)
+            if test_data is None:
+                test_data = self.__sampled_df_mapper[self.__best_data_id][1]["data"]
+            y_test = test_data.select([self.__response_column])
+            X_test = test_data.drop(columns=[self.__response_column], axis=1)
+            pred_col = self._get_predict_column()
+            output = self.__default_model.predict(X_test,y_test)
+            y_true = output.select([self.__response_column])
+            y_pred = output.select([pred_col])
+            if self.__is_regression_model:
+                eval_key_values = self._regression_metrics(y_true, y_pred)
+            elif self.__is_classification_model:
+                eval_key_values = self._classification_metrics(y_true, y_pred)
+            import pandas as pd
+            result_df = pd.DataFrame([eval_key_values])
+            return result_df
+        else:
+            _params = self.__eval_params if len(kwargs) == 0 else kwargs
+            if self._TRAINABLE_FUNCS_DATA_MAPPER[self.__func_name] not in _params:
+                _params.update(self.__sampled_df_mapper[self.__best_data_id][1])
+            return self.__default_model.evaluate(**_params)
     def __populate_parameter_grid(self):
@@ -2255,6 +2565,8 @@ class _BaseSearch:
         if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:
             self._labeled_data = self._add_data_label()
+        elif self.__is_trainable and self.__is_evaluatable and not self.__is_clustering_model:
+            self._labeled_data = self._add_data_label()
 class GridSearch(_BaseSearch):
@@ -2940,6 +3252,7 @@ class GridSearch(_BaseSearch):
                     * evaluation_metric applicable for model trainer functions.
                     * Best model is not selected when evaluation returns
                       non-finite values.
+                    * MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML models.
                 Permitted Values:
                     * Classification: Accuracy, Micro-Precision, Micro-Recall,
                                       Micro-F1, Macro-Precision, Macro-Recall,
@@ -3555,6 +3868,7 @@ class RandomSearch(_BaseSearch):
                     * evaluation_metric applicable for model trainer functions.
                     * Best model is not selected when evaluation returns
                       non-finite values.
+                    * MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML models.
                 Permitted Values:
                     * Classification: Accuracy, Micro-Precision, Micro-Recall,
                                       Micro-F1, Macro-Precision, Macro-Recall,

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

Potentially problematic release.

teradataml 20.0.0.6py3-none-any.whl → 20.0.0.7py3-none-any.whl