teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/automl/model_training.py

@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Sweta Shaw
@@ -29,7 +29,7 @@ from teradataml import execute_sql, get_connection
 from teradataml import configure, SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
 from teradataml.utils.validators import _Validators
 from teradataml.common.utils import UtilFuncs
-from teradataml.common.constants import TeradataConstants
+from teradataml.common.constants import TeradataConstants, AutoMLConstants

 class _ModelTraining:

@@ -54,7 +54,7 @@ class _ModelTraining:
                 Types: teradataml Dataframe

             target_column:
-                Required Argument.
+                Required Argument. (Not required for Clustering task_type)
                 Specifies the target column present inside the dataset.
                 Types: str

@@ -83,9 +83,9 @@ class _ModelTraining:
             task_type:
                 Required Argument.
                 Specifies the task type for AutoML, whether to apply regresion
-                or classification on the provived dataset.
+                or classification or clustering on the provived dataset.
                 Default Value: "Regression"
-                Permitted Values: "Regression", "Classification"
+                Permitted Values: "Regression", "Classification", "Clustering"
                 Types: str

             custom_data:
@@ -120,12 +120,17 @@ class _ModelTraining:
                 Specifies the random seed for reproducibility.
                 Default Value: 42
                 Types: int
+
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
         """
         self.data = data
         self.target_column = target_column
         self.model_list = model_list
         self.verbose = verbose
-        self.features = (features[1], features[0], features[2])
         self.task_type = task_type
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
@@ -133,14 +138,19 @@ class _ModelTraining:
         self.persist = kwargs.get("persist", False)
         self.volatile = kwargs.get("volatile", False)
         self.seed = kwargs.get("seed", 42)
-
+        self.cluster = kwargs.get("cluster", False)
+
+        if not self.cluster:
+            self.features = (features[1], features[0], features[2])
+        else:
+            self.features = (features[1], features[0])
+
     def model_training(self,
                        auto=True,
                        max_runtime_secs=None,
                        stopping_metric=None,
                        stopping_tolerance=0,
-                       max_models=None
-                       ):
+                       max_models=None):
         """
         DESCRIPTION:
             Function to perform following tasks:-
@@ -231,7 +241,12 @@ class _ModelTraining:
             int containing, total number of models available for training.
         """
         # Creating all possible combinations of hyperparameters
-
+        if 'param_grid' in hyperparameters:
+            grid = hyperparameters['param_grid']
+        else:
+            # AutoML style: full dict is hyperparameter space
+            grid = hyperparameters
+        all_combinations = list(product(*[v if isinstance(v, (list, tuple)) else [v] for v in grid.values()]))
         # Getting total number of models for each model model training function
         total_models = len(all_combinations)
         return total_models
@@ -279,21 +294,34 @@ class _ModelTraining:
             None
         """
         self._display_msg(msg="\nHyperparameters used for model training: ",
-                          progress_bar
+                          progress_bar=self.progress_bar,
                           show_data=True)
         print(" " *150, end='\r', flush=True)

         # Iterating over hyperparameters_list
         for hyperparameter_dct in hyperparameters_list:
- [2 deleted lines not captured]
-            # Displaying hyperparameters
-            print(f"{key} : {str(val)}")
+            name = hyperparameter_dct.get("name", "Unnamed Model")
+            print(f"Model: {name}")

-
+            if self.cluster and "param_grid" in hyperparameter_dct:
+                # Also show metadata outside param_grid
+                for meta_key, meta_val in hyperparameter_dct.items():
+                    if meta_key != "param_grid":
+                        print(f"{meta_key}: {meta_val}")
+
+                print("Hyperparameter Grid:")
+                for key, val in hyperparameter_dct["param_grid"].items():
+                    print(f" {key}: {val}")
+
+            else:
+                print("Hyperparameters:")
+                for key, val in hyperparameter_dct.items():
+                    print(f" {key}: {val}")
+
             total_models = self._get_model_param_space(hyperparameter_dct)
-
-            print(f"
+
+            print(f"Total number of models for {name}: {total_models}")
+            print(f"--" * 100 + "\n")

     def _display_leaderboard(self,
                              trained_models_info):
@@ -311,14 +339,20 @@ class _ModelTraining:
             pandas Dataframe.
         """
         # Creating a copy to avoid use of same reference of memory
- [3 deleted lines not captured]
+
+
+        if not self.cluster:
+            if self.task_type != "Regression":
+                sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
+                                                                  ascending=[False, False]).reset_index(drop=True)
+            else:
+                sorted_model_df = trained_models_info.sort_values(by='R2',
+                                                                  ascending=False).reset_index(drop=True)
         else:
-            sorted_model_df = trained_models_info.sort_values(by='
-                                                              ascending=False).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by=['SILHOUETTE', 'CALINSKI', 'DAVIES'],
+                                                              ascending=[False, False, True]).reset_index(drop=True)
+

-
         # Adding rank to leaderboard
         sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)

@@ -326,7 +360,7 @@ class _ModelTraining:
         dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]

         # Excluding the model object and model name from leaderboard
-        leaderboard = sorted_model_df.drop(dp_lst
+        leaderboard = sorted_model_df.drop(columns=[col for col in dp_lst if col in sorted_model_df.columns])

         # filtering the rows based on the max_models
         if self.max_models is not None:
@@ -363,24 +397,42 @@ class _ModelTraining:
         """
         # Iterating over new hyperparameters and performing required operation
         # based on passed method ADD or REPLACE
- [11 deleted lines not captured]
+        if self.cluster:
+            # Clustering: use param_grid
+            param_grid = existing_params.get("param_grid", {})
+            for feature, param_list in new_params.items():
+                if feature in param_grid:
+                    if param_list["Method"] == "ADD":
+                        param_grid[feature] = list(param_grid[feature])
+                        param_grid[feature].extend(param_list["Value"])
+                        param_grid[feature] = tuple(set(param_grid[feature]))
+                    elif param_list["Method"] == "REPLACE":
+                        param_grid[feature] = tuple(param_list["Value"])
+                    else:
+                        self._display_msg(inline_msg="Passed method is not valid.")
                 else:
- [6 deleted lines not captured]
+                    param_grid[feature] = tuple(param_list["Value"])
+            existing_params["param_grid"] = param_grid
+
+        else:
+            for feature, param_list in new_params.items():
+                if feature in existing_params.keys():
+                    if param_list["Method"] == "ADD":
+                        # Extending existing list
+                        existing_params[feature] = list(existing_params[feature])
+                        existing_params[feature].extend(param_list["Value"])
+                        # Updating list with unique values.
+                        existing_params[feature]=tuple(set(existing_params[feature]))
+                    elif param_list["Method"] == "REPLACE":
+                        # Replacing with entirely new value
+                        existing_params[feature] = tuple(param_list["Value"])
+                    else:
+                        self._display_msg(inline_msg="Passed method is not valid.")
+                else:
+                    self._display_msg(inline_msg="\nPassed model argument {} is not"
+                                                 " available for model {}. Skipping it."
+                                                 .format(feature,existing_params['name']))
+                    continue
         # Returning updated hyperparamter
         return existing_params

@@ -422,13 +474,13 @@ class _ModelTraining:
                 hyperparameters[model_index]=self._update_hyperparameters(hyperparameters[model_index],hyp_list)
                 # Displaying it after update
                 self._display_msg(inline_msg="\nCompleted customized hyperparameter update.",
-
+                                  progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No information provided for custom hyperparameters. AutoML will proceed with default values.",
-
+                                  progress_bar=self.progress_bar)
         else:
             self._display_msg(inline_msg="\nSkipping customized hyperparameter tuning",
-
+                              progress_bar=self.progress_bar)
         # Retunring updated hyperparameters for all models
         return hyperparameters

@@ -506,7 +558,7 @@ class _ModelTraining:
             'max_depth': tuple(max_depth),
             'min_node_size': tuple(min_node_size),
             'iter_num': tuple(iter_num),
-            'seed':self.seed
+            'seed': self.seed
         }
         # Hyperparameters for Decision Forest model
         df_params = {
@@ -517,7 +569,7 @@ class _ModelTraining:
             'max_depth': tuple(max_depth),
             'min_node_size': tuple(min_node_size),
             'num_trees': tuple(num_trees),
-            'seed':self.seed
+            'seed': self.seed
         }

         # Updating model type in case of classification
@@ -663,6 +715,47 @@ class _ModelTraining:
         else:
             return None

+    def _get_kmeans_hyperparameters(self):
+        """
+        DESCRIPTION:
+            Generates hyperparameters for KMeans clustering.
+
+        RETURNS:
+            dict containing hyperparameters for KMeans.
+        """
+        params = {
+            "name": "KMeans",
+            "param_grid": {
+                'n_clusters': (2,3,4,5,6,7,8,9,10),
+                'init': ('k-means++', 'random'),
+                'n_init': (5, 10),
+                'max_iter': (100, 200),
+                'tol': (0.001, 0.01),
+                'algorithm': ('auto', 'full')
+            }
+        }
+
+        return params
+
+    def _get_gmm_hyperparameters(self):
+        """
+        DESCRIPTION:
+            Generates hyperparameters for Gaussian Mixture Model (GMM).
+
+        RETURNS:
+            dict containing hyperparameters for GMM.
+        """
+        params = {
+            "name": "GaussianMixture",
+            "param_grid": {
+                "n_components": (2,3,4,5,6,7,8,9,10),
+                "covariance_type": ("full", "tied", "diag", "spherical"),
+                "max_iter": (100, 300)
+            }
+        }
+
+        return params
+
     def _generate_parameter(self):
         """
         DESCRIPTION:
@@ -672,46 +765,54 @@ class _ModelTraining:
             list containing, dict of hyperparameters for different ML models.
         """
         # list for storing hyperparameters
-        parameters=[]
+        parameters = []
         # Index for model mapping
-        model_index=0
+        model_index = 0
         # Dictionary for mapping model with index
         self.model_mapping={}
- [29 deleted lines not captured]
-                parameters.append(model_functions[model](num_rows, num_cols))
-            else:
-                parameters.append(model_functions[model](num_rows, num_cols, model))
-            model_index += 1
+        if not self.cluster:
+            # Getting number of rows and columns
+            num_rows = self.data.shape[0]
+            num_cols = self.data.shape[1]
+
+            # Model functions mapping for hyperparameter generation
+            model_functions = {
+                'decision_forest': self._get_tree_model_hyperparameters,
+                'xgboost': self._get_tree_model_hyperparameters,
+                'knn': self._get_knn_hyperparameters,
+                'glm': self._get_linear_model_hyperparameters,
+                'svm': self._get_linear_model_hyperparameters,
+            }
+
+            if not self.cluster:
+                supported_models = AutoMLConstants.SUPERVISED_MODELS.value
+                self.model_list = [model for model in self.model_list if model in supported_models]
+
+            # Generating hyperparameters for each model
+            if self.model_list:
+                for model in self.model_list:
+                    self.model_mapping[model] = model_index
+                    if model == 'knn':
+                        parameters.append(model_functions[model](num_rows, num_cols))
+                    else:
+                        parameters.append(model_functions[model](num_rows, num_cols, model))
+                    model_index += 1
+            else:
+                raise ValueError("No model is selected for training.")
         else:
-
+            model_functions = {
+                'KMeans': self._get_kmeans_hyperparameters,
+                'GaussianMixture': self._get_gmm_hyperparameters,
+            }
+            supported_models = AutoMLConstants.CLUSTERING_MODELS.value
+            self.model_list = [model for model in self.model_list if model in supported_models]
+            if self.model_list:
+                for model in self.model_list:
+                    self.model_mapping[model] = model_index
+                    parameters.append(model_functions[model]())
+                    model_index += 1
+            else:
+                raise ValueError("No model is selected for training.")

         return parameters

@@ -723,8 +824,12 @@ class _ModelTraining:
         RETURNS:
             dictionary containing max_models distribution and list of models to remove.
         """
+        if self.cluster:
+            models = [model for model in self.model_list if model in AutoMLConstants.CLUSTERING_MODELS.value]
+        else:
+            models = [model for model in self.model_list if model in AutoMLConstants.SUPERVISED_MODELS.value]
         # Getting total number of models
-        model_count=len(
+        model_count = len(models)
         # Evenly distributing max_models across models
         base_assign = self.max_models // model_count
         # Creating list of max_models for each model
@@ -739,17 +844,20 @@ class _ModelTraining:
             distribution[i] += 1

         # Creating dictionary for model distribution
-        model_distribution = dict(zip(
+        model_distribution = dict(zip(models, distribution))
         # Getting list of models with 0 distribution and removing them from model list
         # While for model having distribution greater than 0, updating distribution with
         # 1/3rd of original value as we are training with 3 different feature selection methods.
         models_to_remove = []
- [6 deleted lines not captured]
+        if not self.cluster:
+            for model in models:
+                initial_count = model_distribution[model]
+                if initial_count == 0:
+                    models_to_remove.append(model)
+                else:
+                    model_distribution[model] = math.ceil(initial_count / 3)
+        else:
+            models_to_remove = [model for model, count in model_distribution.items() if count == 0]

         return model_distribution, models_to_remove

@@ -768,22 +876,31 @@ class _ModelTraining:
         RETURNS:
             Pandas DataFrame containing, trained models information.
         """
-
+        self.model_id_counters = {}
         # Hyperparameters for each model
         model_params = parameters[:min(len(parameters), 5)]
         self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)

         # Defining training data
- [2 deleted lines not captured]
+        if not self.cluster:
+            data_types = ['lasso', 'rfe', 'pca']
+            training_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)
+        else:
+            data_types = ['pca', 'non_pca']
+            training_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)

- [2 deleted lines not captured]
+
+
+        if self.task_type == "Classification" and not self.cluster:
+            response_values = training_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
             self.output_response = [str(i) for i in response_values]

         if self.stopping_metric is None:
- [2 deleted lines not captured]
+            if not self.cluster:
+                self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
+                    if self.is_classification_type() else 'R2'
+            else:
+                self.stopping_tolerance, self.stopping_metric = 1.0, 'SILHOUETTE'

         self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
             if self.max_runtime_secs is not None else None
@@ -798,16 +915,17 @@ class _ModelTraining:
             # Updating progress bar as we are removing model
             self.progress_bar.update()

-        if self.is_classification_type():
+        if self.is_classification_type() and not self.cluster:
             self.startify_col = self.target_column

         trained_models = []
+
         for param in model_params:
-            result = self._hyperparameter_tunning(param,
+            result = self._hyperparameter_tunning(param, training_datas)
             if result is not None:
                 trained_models.append(result)
-
         models_df = pd.concat(trained_models, ignore_index=True)
+
         return models_df

     def _hyperparameter_tunning(self,
@@ -816,7 +934,7 @@ class _ModelTraining:
         """
         DESCRIPTION:
            Internal function performs hyperparameter tuning on
-           ML models for regression/classification problems.
+           ML models for regression/classification/clustering problems.

         PARAMETERS:
             model_param
@@ -832,121 +950,196 @@ class _ModelTraining:
         RETURNS:
             pandas DataFrame containing, trained models information.
         """
-        #
- [2 deleted lines not captured]
+        # Passing verbose value based on user input
+        if self.verbose > 0:
+            print(" " *200, end='\r', flush=True)
+            verbose = 1
+        else:
+            verbose = 0
+
+        if not self.cluster:
+            # Mapping model names to functions
+            model_to_func = {"glm": GLM, "svm": SVM,
+                             "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}

- [4 deleted lines not captured]
+            # Setting eval_params for hpt.
+            eval_params = _ModelTraining._eval_params_generation(model_param['name'],
+                                                                 self.target_column,
+                                                                 self.task_type)

- [2 deleted lines not captured]
+            # Input columns for model
+            model_param['input_columns'] = self.features

- [2 deleted lines not captured]
+            # Setting persist for model
+            model_param['persist'] = self.persist

- [8 deleted lines not captured]
+            self._display_msg(msg=model_param['name'],
+                              progress_bar=self.progress_bar,
+                              show_data=True)
+
+            # As we are using entire data for HPT training. So,
+            # passing prepared training data as test_data for KNN.
+            if model_param['name'] == 'knn':
+                model_param['test_data'] = train_data

- [3 deleted lines not captured]
+            if self.task_type == "Classification":
+                model_param['output_prob'] = True
+                model_param['output_responses'] = self.output_response

- [19 deleted lines not captured]
+            # Using RandomSearch for hyperparameter tunning when max_models is given.
+            # Otherwise, using GridSearch for hyperparameter tunning.
+            if self.max_models is not None:
+                # Setting max_models for RandomSearch based on model name
+                model_param['max_models'] = self.max_models_distribution[model_param['name']]
+                # Defining RandomSearch with ML model based on Name, and max_models
+                _obj = RandomSearch(func=model_to_func[model_param['name']],
+                                    params=model_param,
+                                    n_iter=model_param['max_models'])
+            else:
+                # Defining Gridsearch with ML model based on Name
+                _obj = GridSearch(func=model_to_func[model_param['name']],
+                                  params=model_param)
+
+            # Hyperparameter tunning
+            # Parallel run opens multiple connections for parallel execution,
+            # but volatile tables are not accessible across different sessions.
+            # Therefore, execution is performed sequentially by setting run_parallel=False.
+
+            run_parallel = configure.temp_object_type != TeradataConstants.TERADATA_VOLATILE_TABLE
+
+            common_params = {
+                "data": train_data,
+                "evaluation_metric": self.stopping_metric,
+                "early_stop": self.stopping_tolerance,
+                "run_parallel": run_parallel,
+                "sample_seed": self.seed,
+                "sample_id_column": "id",
+                "discard_invalid_column_params": True,
+                "stratify_column": self.startify_col,
+                "verbose": verbose,
+                "max_time": self.max_runtime_secs,
+                "suppress_refer_msg": True
+            }

- [23 deleted lines not captured]
+            if model_param['name'] == 'knn':
+                _obj.fit(**common_params)
+            else:
+                _obj.fit(**common_params, **eval_params)
+
+            # Getting all passed models
+            model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                                on='MODEL_ID', how='inner')
+            if not model_info.empty:
+                # Creating mapping data ID to feature selection method
+                data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
+                                        "DF_1": ('rfe', train_data[1]._table_name),
+                                        "DF_2": ('pca', train_data[2]._table_name)}
+
+                # Updating model stats with feature selection method and result table
+                for index, row in model_info.iterrows():
+                    model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+                    model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+                    model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
+                    model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+
+                # Dropping column 'DATA_ID'
+                model_info.drop(['DATA_ID'], axis=1, inplace=True)
+
+                model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+                if not self.is_classification_type():
+                    # Calculating Adjusted-R2 for regression
+                    # Getting size and feature count for each feature selection method
+                    methods = ["lasso", "rfe", "pca"]
+                    size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
+                    feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
+                    model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
+                        1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
+                        (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
+
+                self._display_msg(msg="-"*100,
+                                  progress_bar=self.progress_bar,
+                                  show_data=True)
+                self.progress_bar.update()
+
+                return model_info
+            # Returning None, if no model is passed
+            return None
         else:
- [3 deleted lines not captured]
-        model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
-                                            on='MODEL_ID', how='inner')
-        if not model_info.empty:
-            # Creating mapping data ID to feature selection method
-            data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
-                                    "DF_1": ('rfe', train_data[1]._table_name),
-                                    "DF_2": ('pca', train_data[2]._table_name)}
+            import time
+            from teradataml import td_sklearn as skl

-
-            for index, row in model_info.iterrows():
-                model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
-                model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
-                model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
-                model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+            model_name = model_param['name']

-            # Dropping column 'DATA_ID'
-            model_info.drop(['DATA_ID'], axis=1, inplace=True)

-
+            self._display_msg(msg=model_name,
+                              progress_bar=self.progress_bar, show_data=True)
+
+            if model_name == "KMeans":
+                model_func = skl.KMeans()
+                param_key = "n_clusters"
+                pred_col = "kmeans_predict_1"
+            elif model_name == "GaussianMixture":
+                model_func = skl.GaussianMixture()
+                param_key = "n_components"
+                pred_col = "gaussianmixture_predict_1"
+            else:
+                raise ValueError(f"Unsupported model: {model_name}")
+
+            model_param["input_columns"] = self.features
+            model_param["persist"] = self.persist

-            if not
- [18 deleted lines not captured]
+            if self.max_models is not None:
+                model_param['max_models'] = self.max_models_distribution[model_name]
+
+                search_obj = RandomSearch(func=model_func,
+                                          params=model_param['param_grid'],
+                                          n_iter=model_param['max_models'])
+            else:
+                search_obj = GridSearch(func=model_func, params=model_param["param_grid"])
+
+            search_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
+                           early_stop=self.stopping_tolerance, run_parallel=True,
+                           sample_seed=self.seed, verbose=verbose, max_time=self.max_runtime_secs)
+
+            model_df = search_obj.models[search_obj.models["STATUS"] == "PASS"]
+            if model_df.empty:
+                print("No models passed. Exiting.")
+                self.progress_bar.update()
+                return None
+
+            model_stats = search_obj.model_stats
+            model_info = model_stats.merge(model_df[['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                           on="MODEL_ID", how="inner")
+
+            if not model_info.empty:
+                # Creating mapping data ID to feature selection method
+                data_id_to_table_map = {"DF_0": ('pca', train_data[1]._table_name),
+                                        "DF_1": ('non_pca', train_data[0]._table_name)}
+
+                # Updating model stats with feature selection method and result table
+                for index, row in model_info.iterrows():
+                    model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+                    model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+                    model_info.loc[index, 'model-obj'] = search_obj.get_model(row['MODEL_ID'])
+
+                # Dropping column 'DATA_ID'
+                model_info.drop(['DATA_ID'], axis=1, inplace=True)

+                model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+
+                self._display_msg(msg="-"*100,
+                                  progress_bar=self.progress_bar,
+                                  show_data=True)
+                self.progress_bar.update()
+
+                return model_info
+
+            return None
+
+
     @staticmethod
     def _eval_params_generation(ml_name,
                                 target_column,
@@ -980,7 +1173,7 @@ class _ModelTraining:
         """
         # Setting the eval_params
         eval_params = {"id_column": "id",
-
+                       "accumulate": target_column}

         model_type = {
             'xgboost': 'model_type',
@@ -1013,4 +1206,4 @@ class _ModelTraining:
         elif ml_name == 'glm':
             eval_params['family'] = 'GAUSSIAN'

-        return eval_params
+        return eval_params
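For reference, the model-count arithmetic introduced in _get_model_param_space() above treats every value in a hyperparameter grid as a list/tuple of candidates and takes their Cartesian product. Below is a minimal standalone sketch of that counting in plain Python; it needs no Teradata connection, the two grids are copied from the new _get_kmeans_hyperparameters and _get_gmm_hyperparameters helpers shown in the diff, and count_models is an illustrative stand-in rather than a teradataml API.

# Standalone sketch of the combination counting performed by the new
# _get_model_param_space(): each grid value is normalised to a list/tuple of
# candidates and the model count is the product of their lengths.
# (count_models is illustrative only; it is not part of the teradataml API.)
from itertools import product

# Grids copied from _get_kmeans_hyperparameters / _get_gmm_hyperparameters above.
kmeans_grid = {
    'n_clusters': (2, 3, 4, 5, 6, 7, 8, 9, 10),
    'init': ('k-means++', 'random'),
    'n_init': (5, 10),
    'max_iter': (100, 200),
    'tol': (0.001, 0.01),
    'algorithm': ('auto', 'full'),
}
gmm_grid = {
    "n_components": (2, 3, 4, 5, 6, 7, 8, 9, 10),
    "covariance_type": ("full", "tied", "diag", "spherical"),
    "max_iter": (100, 300),
}

def count_models(grid):
    # Scalars (for example a fixed 'seed') count as a single candidate, matching
    # the `v if isinstance(v, (list, tuple)) else [v]` normalisation in the diff.
    combos = product(*[v if isinstance(v, (list, tuple)) else [v] for v in grid.values()])
    return len(list(combos))

print(count_models(kmeans_grid))  # 9 * 2 * 2 * 2 * 2 * 2 = 288 KMeans candidates
print(count_models(gmm_grid))     # 9 * 4 * 2 = 72 GaussianMixture candidates

With the default grids, the clustering path therefore enumerates 288 KMeans and 72 GaussianMixture candidates before the max_models distribution or RandomSearch sampling shown in the diff trims them.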