PyPI - teradataml - Versions diffs - 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl - Mend

teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic. Click here for more details.

Files changed (108)

teradataml/LICENSE-3RD-PARTY.pdf +0 -0
teradataml/LICENSE.pdf +0 -0
teradataml/README.md +71 -0
teradataml/_version.py +2 -2
teradataml/analytics/analytic_function_executor.py +51 -24
teradataml/analytics/json_parser/utils.py +11 -17
teradataml/automl/__init__.py +103 -48
teradataml/automl/data_preparation.py +55 -37
teradataml/automl/data_transformation.py +131 -69
teradataml/automl/feature_engineering.py +117 -185
teradataml/automl/feature_exploration.py +9 -2
teradataml/automl/model_evaluation.py +13 -25
teradataml/automl/model_training.py +214 -75
teradataml/catalog/model_cataloging_utils.py +1 -1
teradataml/clients/auth_client.py +133 -0
teradataml/common/aed_utils.py +3 -2
teradataml/common/constants.py +11 -6
teradataml/common/garbagecollector.py +5 -0
teradataml/common/messagecodes.py +3 -1
teradataml/common/messages.py +2 -1
teradataml/common/utils.py +6 -0
teradataml/context/context.py +49 -29
teradataml/data/advertising.csv +201 -0
teradataml/data/bank_marketing.csv +11163 -0
teradataml/data/bike_sharing.csv +732 -0
teradataml/data/boston2cols.csv +721 -0
teradataml/data/breast_cancer.csv +570 -0
teradataml/data/customer_segmentation_test.csv +2628 -0
teradataml/data/customer_segmentation_train.csv +8069 -0
teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
teradataml/data/glm_example.json +28 -1
teradataml/data/housing_train_segment.csv +201 -0
teradataml/data/insect2Cols.csv +61 -0
teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
teradataml/data/kmeans_example.json +5 -0
teradataml/data/kmeans_table.csv +10 -0
teradataml/data/onehot_encoder_train.csv +4 -0
teradataml/data/openml_example.json +29 -0
teradataml/data/scale_attributes.csv +3 -0
teradataml/data/scale_example.json +52 -1
teradataml/data/scale_input_part_sparse.csv +31 -0
teradataml/data/scale_input_partitioned.csv +16 -0
teradataml/data/scale_input_sparse.csv +11 -0
teradataml/data/scale_parameters.csv +3 -0
teradataml/data/scripts/deploy_script.py +20 -1
teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
teradataml/data/teradataml_example.json +77 -0
teradataml/data/ztest_example.json +16 -0
teradataml/dataframe/copy_to.py +8 -3
teradataml/dataframe/data_transfer.py +120 -61
teradataml/dataframe/dataframe.py +102 -17
teradataml/dataframe/dataframe_utils.py +47 -9
teradataml/dataframe/fastload.py +272 -89
teradataml/dataframe/sql.py +84 -0
teradataml/dbutils/dbutils.py +2 -2
teradataml/lib/aed_0_1.dll +0 -0
teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
teradataml/options/__init__.py +13 -4
teradataml/options/configure.py +27 -6
teradataml/scriptmgmt/UserEnv.py +19 -16
teradataml/scriptmgmt/lls_utils.py +117 -14
teradataml/table_operators/Script.py +2 -3
teradataml/table_operators/TableOperator.py +58 -10
teradataml/utils/validators.py +40 -2
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0

teradataml/automl/model_training.py CHANGED Viewed

@@ -16,6 +16,7 @@
 # Python libraries
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
+import math
 import pandas as pd
 from itertools import product
@@ -24,7 +25,8 @@ from teradataml.context import context as tdmlctx
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml import execute_sql, get_connection
-from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN
+from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
+from teradataml.utils.validators import _Validators
 class _ModelTraining:
@@ -49,12 +51,12 @@ class _ModelTraining:
                 Types: teradataml Dataframe
             target_column:
-                Required Arugment.
+                Required Argument.
                 Specifies the target column present inside the dataset.
                 Types: str
             model_list:
-                Required Arugment.
+                Required Argument.
                 Specifies the list of models to be used for model training.
                 Types: list
@@ -70,13 +72,13 @@ class _ModelTraining:
                 Types: int
             features:
-                Required Arugment.
+                Required Argument.
                 Specifies the list of selected feature by rfe, lasso and pca
                 respectively in this order.
                 Types: list of list of strings (str)
             task_type:
-                Required Arugment.
+                Required Argument.
                 Specifies the task type for AutoML, whether to apply regresion
                 or classification on the provived dataset.
                 Default Value: "Regression"
@@ -84,7 +86,7 @@ class _ModelTraining:
                 Types: str
             custom_data:
-                Optional Arugment.
+                Optional Argument.
                 Specifies json object containing user customized input.
                 Types: json object
         """
@@ -96,12 +98,14 @@ class _ModelTraining:
         self.task_type = task_type
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
+        self.startify_col = None
     def model_training(self,
                        auto=True,
                        max_runtime_secs=None,
                        stopping_metric=None,
-                       stopping_tolerance=0
+                       stopping_tolerance=0,
+                       max_models=None
                        ):
         """
         DESCRIPTION:
@@ -112,14 +116,14 @@ class _ModelTraining:
         PARAMETERS:
             auto:
-                Optional Arugment.
+                Optional Argument.
                 Specifies whether to run data preparation in auto mode or custom mode.
                 When set to True, runs automtically otherwise, it take user inputs.
                 Default Value: True
                 Types: boolean
             max_runtime_secs:
-                Optional Arugment.
+                Optional Argument.
                 Specifies the time limit in seconds for model training.
                 Types: int
@@ -132,6 +136,11 @@ class _ModelTraining:
                 Required, when "stopping_metric" is set, otherwise optional.
                 Specifies the stopping tolerance for stopping metrics in model training.
                 Types: float
+            max_models:
+                Optional Argument.
+                Specifies the maximum number of models to be trained.
+                Types: int
         RETURNS:
             pandas dataframes containing model information, leaderboard and target
@@ -140,6 +149,7 @@ class _ModelTraining:
         self.stopping_metric = stopping_metric
         self.stopping_tolerance = stopping_tolerance
         self.max_runtime_secs = max_runtime_secs
+        self.max_models = max_models
         self._display_heading(phase=3, progress_bar=self.progress_bar)
         self._display_msg(msg='Model Training started ...',
@@ -152,6 +162,10 @@ class _ModelTraining:
         if not auto:
             parameters = self._custom_hyperparameters(parameters)
+        # Validates the upper limit of max_models based on total model combinations
+        if self.max_models is not None:
+            self._validate_upper_limit_for_max_models(parameters)
         if self.verbose == 2:
             self._display_hyperparameters(parameters)
@@ -167,6 +181,54 @@ class _ModelTraining:
         return models, leader_board, self.labels
+    def _get_model_param_space(self,
+                               hyperparameters):
+        """
+        DESCRIPTION:
+            Internal function to calculate the total number of models to be trained for specific model.
+        PARAMETERS:
+            hyperparameters:
+                Required Argument.
+                Specifies the hyperparameters availables for ML model.
+                Types: list of dict
+        RETURNS:
+            int containing, total number of models available for training.
+        """
+        # Creating all possible combinations of hyperparameters
+        all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
+        # Getting total number of models for each model model training function
+        total_models = len(all_combinations)
+        return total_models
+    def _validate_upper_limit_for_max_models(self,
+                                             hyperparameters_list):
+        """
+        DESCRIPTION:
+            Internal function to validate the upper limit of max_models.
+        PARAMETERS:
+            hyperparameters_list:
+                Required Argument.
+                Specifies the hyperparameters for different ML models.
+                Types: list of dict
+        RETURNS:
+            None
+        RAISES:
+            TeradataMlException, ValueError
+        """
+        model_param_space = 0
+        for hyperparameter_dct in hyperparameters_list:
+            # getting total number of models for each model
+            total_models = self._get_model_param_space(hyperparameter_dct)
+            model_param_space += total_models
+        # Validating upper range for max_models
+        _Validators._validate_argument_range(self.max_models, "max_models", ubound=model_param_space, ubound_inclusive=True)
     def _display_hyperparameters(self,
                                  hyperparameters_list):
         """
@@ -175,7 +237,7 @@ class _ModelTraining:
         PARAMETERS:
             hyperparameters_list:
-                Required Arugment.
+                Required Argument.
                 Specifies the hyperparameters for different ML models.
                 Types: list of dict
@@ -189,16 +251,13 @@ class _ModelTraining:
         # Iterating over hyperparameters_list
         for hyperparameter_dct in hyperparameters_list:
-            # Extracting hyperparameter and thier value from hyperparameters dictionary
+            # Extracting hyperparameter and their value from hyperparameters dictionary
             for key, val in hyperparameter_dct.items():
                 # Displaying hyperparameters
                 print(f"{key} : {str(val)}")
-            # Creating all possible combinations of hyperparameters
-            all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameter_dct.values()]))
             # Displaying total number of models for each model
-            total_models = len(all_combinations)
+            total_models = self._get_model_param_space(hyperparameter_dct)
             print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
             print(f"--"*100+'\n')
@@ -210,7 +269,7 @@ class _ModelTraining:
         PARAMETERS:
             trained_models_info:
-                Required Arugment.
+                Required Argument.
                 Specifies the trained models inforamtion to display.
                 Types: pandas Dataframe
@@ -227,10 +286,12 @@ class _ModelTraining:
         # Adding rank to leaderboard
         sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
-        # Assuming 'sorted_df' is your DataFrame
-        # Excluding the "last_col"
-        leaderboard = sorted_model_df.drop("model-obj", axis=1)
+        # Excluding the model object and model name from leaderboard
+        leaderboard = sorted_model_df.drop(["model-obj","Name"], axis=1)
+        # filtering the rows based on the max_models
+        if self.max_models is not None:
+            leaderboard = leaderboard[leaderboard["Rank"] <= self.max_models]
         self._display_msg(msg="Leaderboard",
                           progress_bar=self.progress_bar,
@@ -343,12 +404,12 @@ class _ModelTraining:
         PARAMETERS:
             num_rows:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int
             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int
@@ -409,7 +470,8 @@ class _ModelTraining:
                 'shrinkage_factor': tuple(shrinkage_factor),
                 'max_depth': tuple(max_depth),
                 'min_node_size': tuple(min_node_size),
-                'iter_num': tuple(iter_num)
+                'iter_num': tuple(iter_num),
+                'seed':42
                 }
         # Hyperparameters for Decision Forest model
         df_params = {
@@ -419,7 +481,8 @@ class _ModelTraining:
                 'min_impurity': tuple(min_impurity),
                 'max_depth': tuple(max_depth),
                 'min_node_size': tuple(min_node_size),
-                'num_trees': tuple(num_trees)
+                'num_trees': tuple(num_trees),
+                'seed':42
         }
         # Updating model type in case of classification
@@ -445,12 +508,12 @@ class _ModelTraining:
         PARAMETERS:
             num_rows
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int
             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int
@@ -482,12 +545,12 @@ class _ModelTraining:
         PARAMETERS:
             num_rows:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int
             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int
@@ -616,6 +679,44 @@ class _ModelTraining:
             raise ValueError("No model is selected for training.")
         return parameters
+    def distribute_max_models(self):
+        """
+        DESCRIPTION:
+            Internal function to distribute max_models across available model functions.
+        RETURNS:
+            dictionary containing max_models distribution and list of models to remove.
+        """
+        # Getting total number of models
+        model_count=len(self.model_list)
+        # Evenly distributing max_models across models
+        base_assign = self.max_models // model_count
+        # Creating list of max_models for each model
+        distribution = [base_assign] * model_count
+        # Calculating remaining models
+        remaining_model_count = self.max_models % model_count
+        if remaining_model_count:
+            # distributing remaining model across models.
+            # Starting from first model in list and distributing remaining models by 1 each.
+            for i in range(remaining_model_count):
+                distribution[i] += 1
+        # Creating dictionary for model distribution
+        model_distribution = dict(zip(self.model_list, distribution))
+        # Getting list of models with 0 distribution and removing them from model list
+        # While for model having distribution greater than 0, updating distribution with
+        # 1/3rd of original value as we are training with 3 different feature selection methods.
+        models_to_remove = []
+        for model in self.model_list:
+            initial_count = model_distribution[model]
+            if initial_count == 0:
+                models_to_remove.append(model)
+            else:
+                model_distribution[model] = math.ceil(initial_count / 3)
+        return model_distribution, models_to_remove
     def _parallel_training(self, parameters):
         """
@@ -648,6 +749,19 @@ class _ModelTraining:
         self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
                                 if self.max_runtime_secs is not None else None
+        if self.max_models is not None:
+            # Getting model distribution and models to remove
+            self.max_models_distribution, models_to_remove = self.distribute_max_models()
+            # Removing model parameters with 0 distribution
+            if len(models_to_remove):
+                for model in models_to_remove:
+                    model_params = [param for param in model_params if param['name'] != model]
+                    # Updating progress bar as we are removing model
+                    self.progress_bar.update()
+        if self.is_classification_type():
+            self.startify_col = self.target_column
         trained_models = []
         for param in model_params:
@@ -677,12 +791,12 @@ class _ModelTraining:
                 Types: tuple of Teradataml DataFrame
             model_info
-                Required Arugment.
+                Required Argument.
                 Specifies the trained models information.
                 Types: Pandas DataFrame
         RETURNS:
-            Pandas DataFrame containing, trained models with thier performance metrics.
+            Pandas DataFrame containing, trained models with their performance metrics.
         """
         self._display_msg(msg="Evaluating models performance ...",
                           progress_bar = self.progress_bar,
@@ -697,9 +811,9 @@ class _ModelTraining:
         # Iterating over models
         for index, model_row in model_info.iterrows():
-            # Extracting model name, feature selection method, and model object
-            model_name, feature_selection, model_object = model_row['Name'], \
-                                                        model_row['Feature selection'], model_row['obj']
+            # Extracting model name, model id, feature selection method, and model object
+            model_name, model_id, feature_selection, model_object = model_row['Name'], \
+                model_row['Model-ID'], model_row['Feature-Selection'], model_row['obj']
             # Selecting test data based on feature selection method
             test_set = feature_selection_to_test_data[feature_selection]
@@ -708,7 +822,9 @@ class _ModelTraining:
             if model_name == 'knn':
                 performance_metrics = model_object.evaluate(test_data=test_set)
             else:
-                eval_params = self._eval_params_generation(model_name)
+                eval_params = _ModelTraining._eval_params_generation(model_name,
+                                                                     self.target_column,
+                                                                     self.task_type)
                 performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
             # Extracting performance metrics
@@ -718,7 +834,7 @@ class _ModelTraining:
                 performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
                 # Combine all the elements to form a new row
-                new_row = [model_name, feature_selection] + performance_metrics_list + [model_object]
+                new_row = [model_name, model_id, feature_selection] + performance_metrics_list + [model_object]
             else:
                 # Regression
                 regression_metrics = next(performance_metrics.result.itertuples())
@@ -726,22 +842,23 @@ class _ModelTraining:
                 feature_count = len(test_set.columns) - 2
                 r2_score = regression_metrics[8]
                 adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
-                new_row = [model_name, feature_selection, regression_metrics[0], regression_metrics[1], regression_metrics[2],
-                        regression_metrics[5], regression_metrics[6], r2_score, adjusted_r2_score, model_object]
+                new_row = [model_name, model_id, feature_selection, regression_metrics[0],
+                           regression_metrics[1], regression_metrics[2], regression_metrics[5],
+                           regression_metrics[6], r2_score, adjusted_r2_score, model_object]
             model_performance_data.append(new_row)
         if self.is_classification_type():
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Feature selection',
-                                                        'Accuracy','Micro-Precision',
+            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Model-ID',
+                                                        'Feature-Selection','Accuracy','Micro-Precision',
                                                         'Micro-Recall','Micro-F1',
                                                         'Macro-Precision','Macro-Recall',
                                                         'Macro-F1','Weighted-Precision',
                                                         'Weighted-Recall','Weighted-F1',
                                                         'model-obj'])
         else:
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name',
-                                                            'Feature selection',
+            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name', 'Model-ID',
+                                                            'Feature-Selection',
                                                             'MAE', 'MSE', 'MSLE',
                                                             'RMSE', 'RMSLE',
                                                             'R2-score',
@@ -764,12 +881,12 @@ class _ModelTraining:
         PARAMETERS:
             model_param
-                Required Arugment.
+                Required Argument.
                 Specifies the eval_params argument for GridSearch.
                 Types: dict
             train_data:
-                Required Arugment.
+                Required Argument.
                 Specifies the training datasets.
                 Types: tuple of Teradataml DataFrame
@@ -786,7 +903,9 @@ class _ModelTraining:
                          "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}
         # Setting eval_params for hpt.
-        eval_params = self._eval_params_generation(model_param['name'])
+        eval_params = _ModelTraining._eval_params_generation(model_param['name'],
+                                                             self.target_column,
+                                                             self.task_type)
         # Input columns for model
         model_param['input_columns'] = self.features
@@ -799,8 +918,19 @@ class _ModelTraining:
         if model_param['name'] == 'knn':
             model_param['test_data'] = test_data
-        # Defining Gridsearch with ML model based on Name
-        _obj = GridSearch(func=model_to_func[model_param['name']], params=model_param)
+        # Using RandomSearch for hyperparameter tunning when max_models is given.
+        # Otherwise, using GridSearch for hyperparameter tunning.
+        if self.max_models is not None:
+            # Setting max_models for RandomSearch based on model name
+            model_param['max_models'] = self.max_models_distribution[model_param['name']]
+            # Defining RandomSearch with ML model based on Name, and max_models
+            _obj = RandomSearch(func=model_to_func[model_param['name']],
+                                params=model_param,
+                                n_iter=model_param['max_models'])
+        else:
+            # Defining Gridsearch with ML model based on Name
+            _obj = GridSearch(func=model_to_func[model_param['name']],
+                              params=model_param)
         if self.verbose > 0:
             print(" " *200, end='\r', flush=True)
@@ -813,46 +943,39 @@ class _ModelTraining:
             _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
                     early_stop=self.stopping_tolerance, run_parallel=True,
                     sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
-                    verbose=verbose, max_time=self.max_runtime_secs)
+                    stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
         else:
             _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
                     early_stop=self.stopping_tolerance, **eval_params,
                     run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
-                    sample_id_column='id', verbose=verbose, max_time=self.max_runtime_secs)
+                    sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
         # Getting all passed models
         _df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
+        # Creating mapping data ID to feature selection method
+        data_id_to_method_map = {"DF_0": "lasso", "DF_1": "rfe", "DF_2": "pca"}
+        # Mapping data ID to feature selection method
+        _df['Feature-Selection'] = _df['DATA_ID'].map(data_id_to_method_map)
+        # Getting model details
+        _df['Name'] = model_param['name']
+        _df['Model-ID'] = _df['MODEL_ID']
+        _df['obj'] = _df['MODEL_ID'].apply(lambda x: _obj.get_model(x))
+        # Extracting needed columns
+        model_info = _df[["Name", "Model-ID", "Feature-Selection", "obj"]]
-        # Mapping data ID to DataFrame
-        data_id_to_df = {"DF_0": _df[_df['DATA_ID']=='DF_0'],
-                         "DF_1": _df[_df['DATA_ID']=='DF_1'],
-                         "DF_2": _df[_df['DATA_ID']=='DF_2']}
-        # Returns best model within a Data_ID group
-        # get_best_model = lambda df: df.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'], ascending=[False, False]).iloc[0]['MODEL_ID']\
-        # if self.task_type != 'Regression' else df.sort_values(by=['R2', 'MAE'], ascending=[False, False]).iloc[0]['MODEL_ID']
-        get_best_model = lambda df, stats: df.sort_values(by=stats, ascending=[False, False]).iloc[0]['MODEL_ID']
-        # best_model = get_best_model(data_id_to_df[data_id], stats)
-        stats = ['MICRO-F1', 'WEIGHTED-F1'] if self.task_type != 'Regression' else ['R2', 'MAE']
-        model_info_data = []
-        # Extracting best model
-        for data_id, df_name in zip(["DF_0", "DF_1", "DF_2"], ["lasso", "rfe", "pca"]):
-            if not data_id_to_df[data_id].empty:
-                best_model = get_best_model(data_id_to_df[data_id], stats)
-                model_info_data.append([model_param['name'], df_name, _obj.get_model(best_model)])
-                self._display_msg(inline_msg=best_model, progress_bar=self.progress_bar)
-        model_info = pd.DataFrame(data=model_info_data, columns=["Name",'Feature selection', "obj"])
         self._display_msg(msg="-"*100,
                           progress_bar=self.progress_bar,
                           show_data=True)
         self.progress_bar.update()
         return model_info
-    def _eval_params_generation(self,
-                                ml_name):
+    @staticmethod
+    def _eval_params_generation(ml_name,
+                                target_column,
+                                task_type):
         """
         DESCRIPTION:
             Internal function generates the eval_params for
@@ -860,23 +983,39 @@ class _ModelTraining:
         PARAMETERS:
             ml_name
-                Required Arugment.
+                Required Argument.
                 Specifies the ML name for eval_params generation.
                 Types: str
+            target_column
+                Required Argument.
+                Specifies the target column.
+                Types: str
+            task_type:
+                Required Argument.
+                Specifies the task type for AutoML, whether to apply regresion
+                or classification on the provived dataset.
+                Default Value: "Regression"
+                Permitted Values: "Regression", "Classification"
+                Types: str
         RETURNS:
             dict containing, eval_params for ML model.
         """
         # Setting the eval_params
         eval_params = {"id_column": "id",
-                        "accumulate": self.target_column}
+                        "accumulate": target_column}
         # For Classification
-        if self.task_type != "Regression":
+        if task_type.lower() != "regression":
             if ml_name == 'xgboost':
                 eval_params['model_type'] = 'Classification'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
             else:
+                if ml_name == 'glm':
+                    eval_params['family'] = 'BINOMIAL'
                 eval_params['output_prob'] = True
         else:
         # For Regression

teradataml/catalog/model_cataloging_utils.py CHANGED Viewed

@@ -179,7 +179,7 @@ def __get_model_inputs_outputs(model, function_arg_map):
                     tdp = preparer(td_dialect)
                     nrows, ncols = member.shape
                     db_schema = UtilFuncs._extract_db_name(member._table_name)
-                    # Add quotes around the DB name in case we are getting it using _get_current_databasename()
+                    # Add quotes around the DB name in case we are getting it using _get_current_databasename().
                     db_schema = tdp.quote(_get_current_databasename()) if db_schema is None else db_schema
                     db_table_name = UtilFuncs._extract_table_name(member._table_name)

teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl

Potentially problematic release.

teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl