PyPI - teradataml - Versions diffs - 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl - Mend

teradataml 20.0.0.0-py3-none-any.whl → 20.0.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic. Click here for more details.

Files changed (263)

teradataml/LICENSE-3RD-PARTY.pdf +0 -0
teradataml/LICENSE.pdf +0 -0
teradataml/README.md +183 -0
teradataml/__init__.py +6 -3
teradataml/_version.py +2 -2
teradataml/analytics/__init__.py +3 -2
teradataml/analytics/analytic_function_executor.py +275 -40
teradataml/analytics/analytic_query_generator.py +92 -0
teradataml/analytics/byom/__init__.py +3 -2
teradataml/analytics/json_parser/metadata.py +1 -0
teradataml/analytics/json_parser/utils.py +17 -21
teradataml/analytics/meta_class.py +40 -1
teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
teradataml/analytics/sqle/__init__.py +10 -2
teradataml/analytics/table_operator/__init__.py +3 -2
teradataml/analytics/uaf/__init__.py +21 -2
teradataml/analytics/utils.py +62 -1
teradataml/analytics/valib.py +1 -1
teradataml/automl/__init__.py +1553 -319
teradataml/automl/custom_json_utils.py +139 -61
teradataml/automl/data_preparation.py +276 -319
teradataml/automl/data_transformation.py +163 -81
teradataml/automl/feature_engineering.py +402 -239
teradataml/automl/feature_exploration.py +9 -2
teradataml/automl/model_evaluation.py +48 -51
teradataml/automl/model_training.py +291 -189
teradataml/catalog/byom.py +8 -8
teradataml/catalog/model_cataloging_utils.py +1 -1
teradataml/clients/auth_client.py +133 -0
teradataml/clients/pkce_client.py +1 -1
teradataml/common/aed_utils.py +3 -2
teradataml/common/constants.py +48 -6
teradataml/common/deprecations.py +13 -7
teradataml/common/garbagecollector.py +156 -120
teradataml/common/messagecodes.py +6 -1
teradataml/common/messages.py +3 -1
teradataml/common/sqlbundle.py +1 -1
teradataml/common/utils.py +103 -11
teradataml/common/wrapper_utils.py +1 -1
teradataml/context/context.py +121 -31
teradataml/data/advertising.csv +201 -0
teradataml/data/bank_marketing.csv +11163 -0
teradataml/data/bike_sharing.csv +732 -0
teradataml/data/boston2cols.csv +721 -0
teradataml/data/breast_cancer.csv +570 -0
teradataml/data/complaints_test_tokenized.csv +353 -0
teradataml/data/complaints_tokens_model.csv +348 -0
teradataml/data/covid_confirm_sd.csv +83 -0
teradataml/data/customer_segmentation_test.csv +2628 -0
teradataml/data/customer_segmentation_train.csv +8069 -0
teradataml/data/dataframe_example.json +10 -0
teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
teradataml/data/dwt2d_dataTable.csv +65 -0
teradataml/data/dwt_dataTable.csv +8 -0
teradataml/data/dwt_filterTable.csv +3 -0
teradataml/data/finance_data4.csv +13 -0
teradataml/data/glm_example.json +28 -1
teradataml/data/grocery_transaction.csv +19 -0
teradataml/data/housing_train_segment.csv +201 -0
teradataml/data/idwt2d_dataTable.csv +5 -0
teradataml/data/idwt_dataTable.csv +8 -0
teradataml/data/idwt_filterTable.csv +3 -0
teradataml/data/insect2Cols.csv +61 -0
teradataml/data/interval_data.csv +5 -0
teradataml/data/jsons/paired_functions.json +14 -0
teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
teradataml/data/kmeans_example.json +5 -0
teradataml/data/kmeans_table.csv +10 -0
teradataml/data/load_example_data.py +8 -2
teradataml/data/naivebayestextclassifier_example.json +1 -1
teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
teradataml/data/onehot_encoder_train.csv +4 -0
teradataml/data/openml_example.json +29 -0
teradataml/data/peppers.png +0 -0
teradataml/data/real_values.csv +14 -0
teradataml/data/sax_example.json +8 -0
teradataml/data/scale_attributes.csv +3 -0
teradataml/data/scale_example.json +52 -1
teradataml/data/scale_input_part_sparse.csv +31 -0
teradataml/data/scale_input_partitioned.csv +16 -0
teradataml/data/scale_input_sparse.csv +11 -0
teradataml/data/scale_parameters.csv +3 -0
teradataml/data/scripts/deploy_script.py +21 -2
teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
teradataml/data/star_pivot.csv +8 -0
teradataml/data/templates/open_source_ml.json +2 -1
teradataml/data/teradataml_example.json +97 -1
teradataml/data/timestamp_data.csv +4 -0
teradataml/data/titanic_dataset_unpivoted.csv +19 -0
teradataml/data/uaf_example.json +55 -1
teradataml/data/unpivot_example.json +15 -0
teradataml/data/url_data.csv +9 -0
teradataml/data/windowdfft.csv +16 -0
teradataml/data/ztest_example.json +16 -0
teradataml/dataframe/copy_to.py +9 -4
teradataml/dataframe/data_transfer.py +125 -64
teradataml/dataframe/dataframe.py +575 -57
teradataml/dataframe/dataframe_utils.py +47 -9
teradataml/dataframe/fastload.py +273 -90
teradataml/dataframe/functions.py +339 -0
teradataml/dataframe/row.py +160 -0
teradataml/dataframe/setop.py +2 -2
teradataml/dataframe/sql.py +740 -18
teradataml/dataframe/window.py +1 -1
teradataml/dbutils/dbutils.py +324 -18
teradataml/geospatial/geodataframe.py +1 -1
teradataml/geospatial/geodataframecolumn.py +1 -1
teradataml/hyperparameter_tuner/optimizer.py +13 -13
teradataml/lib/aed_0_1.dll +0 -0
teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
teradataml/options/__init__.py +16 -5
teradataml/options/configure.py +39 -6
teradataml/options/display.py +2 -2
teradataml/plot/axis.py +4 -4
teradataml/scriptmgmt/UserEnv.py +26 -19
teradataml/scriptmgmt/lls_utils.py +120 -16
teradataml/table_operators/Script.py +4 -5
teradataml/table_operators/TableOperator.py +160 -26
teradataml/table_operators/table_operator_util.py +88 -41
teradataml/table_operators/templates/dataframe_udf.template +63 -0
teradataml/telemetry_utils/__init__.py +0 -0
teradataml/telemetry_utils/queryband.py +52 -0
teradataml/utils/validators.py +41 -3
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0

teradataml/automl/model_training.py CHANGED Viewed

@@ -16,15 +16,18 @@
 # Python libraries
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
+import math
 import pandas as pd
 from itertools import product
+import numpy as np
 # Teradata libraries
 from teradataml.context import context as tdmlctx
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml import execute_sql, get_connection
-from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN
+from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
+from teradataml.utils.validators import _Validators
 class _ModelTraining:
@@ -36,7 +39,8 @@ class _ModelTraining:
                  verbose=0,
                  features=None,
                  task_type="Regression",
-                 custom_data = None):
+                 custom_data = None,
+                 **kwargs):
         """
         DESCRIPTION:
             Function initializes the data, target column, features and models
@@ -49,12 +53,12 @@ class _ModelTraining:
                 Types: teradataml Dataframe
             target_column:
-                Required Arugment.
+                Required Argument.
                 Specifies the target column present inside the dataset.
                 Types: str
             model_list:
-                Required Arugment.
+                Required Argument.
                 Specifies the list of models to be used for model training.
                 Types: list
@@ -70,13 +74,13 @@ class _ModelTraining:
                 Types: int
             features:
-                Required Arugment.
+                Required Argument.
                 Specifies the list of selected feature by rfe, lasso and pca
                 respectively in this order.
                 Types: list of list of strings (str)
             task_type:
-                Required Arugment.
+                Required Argument.
                 Specifies the task type for AutoML, whether to apply regresion
                 or classification on the provived dataset.
                 Default Value: "Regression"
@@ -84,9 +88,31 @@ class _ModelTraining:
                 Types: str
             custom_data:
-                Optional Arugment.
+                Optional Argument.
                 Specifies json object containing user customized input.
                 Types: json object
+            **kwargs:
+                Specifies the additional arguments for model training. Below
+                are the additional arguments:
+                    volatile:
+                        Optional Argument.
+                        Specifies whether to put the interim results of the
+                        functions in a volatile table or not. When set to
+                        True, results are stored in a volatile table,
+                        otherwise not.
+                        Default Value: False
+                        Types: bool
+                    persist:
+                        Optional Argument.
+                        Specifies whether to persist the interim results of the
+                        functions in a table or not. When set to True,
+                        results are persisted in a table; otherwise,
+                        results are garbage collected at the end of the
+                        session.
+                        Default Value: False
+                        Types: bool
         """
         self.data = data
         self.target_column = target_column
@@ -96,12 +122,16 @@ class _ModelTraining:
         self.task_type = task_type
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
+        self.startify_col = None
+        self.persist = kwargs.get("persist", False)
+        self.volatile = kwargs.get("volatile", False)
     def model_training(self,
                        auto=True,
                        max_runtime_secs=None,
                        stopping_metric=None,
-                       stopping_tolerance=0
+                       stopping_tolerance=0,
+                       max_models=None
                        ):
         """
         DESCRIPTION:
@@ -112,14 +142,14 @@ class _ModelTraining:
         PARAMETERS:
             auto:
-                Optional Arugment.
+                Optional Argument.
                 Specifies whether to run data preparation in auto mode or custom mode.
                 When set to True, runs automtically otherwise, it take user inputs.
                 Default Value: True
                 Types: boolean
             max_runtime_secs:
-                Optional Arugment.
+                Optional Argument.
                 Specifies the time limit in seconds for model training.
                 Types: int
@@ -132,6 +162,11 @@ class _ModelTraining:
                 Required, when "stopping_metric" is set, otherwise optional.
                 Specifies the stopping tolerance for stopping metrics in model training.
                 Types: float
+            max_models:
+                Optional Argument.
+                Specifies the maximum number of models to be trained.
+                Types: int
         RETURNS:
             pandas dataframes containing model information, leaderboard and target
@@ -140,6 +175,7 @@ class _ModelTraining:
         self.stopping_metric = stopping_metric
         self.stopping_tolerance = stopping_tolerance
         self.max_runtime_secs = max_runtime_secs
+        self.max_models = max_models
         self._display_heading(phase=3, progress_bar=self.progress_bar)
         self._display_msg(msg='Model Training started ...',
@@ -152,6 +188,10 @@ class _ModelTraining:
         if not auto:
             parameters = self._custom_hyperparameters(parameters)
+        # Validates the upper limit of max_models based on total model combinations
+        if self.max_models is not None:
+            self._validate_upper_limit_for_max_models(parameters)
         if self.verbose == 2:
             self._display_hyperparameters(parameters)
@@ -167,6 +207,54 @@ class _ModelTraining:
         return models, leader_board, self.labels
+    def _get_model_param_space(self,
+                               hyperparameters):
+        """
+        DESCRIPTION:
+            Internal function to calculate the total number of models to be trained for specific model.
+        PARAMETERS:
+            hyperparameters:
+                Required Argument.
+                Specifies the hyperparameters availables for ML model.
+                Types: list of dict
+        RETURNS:
+            int containing, total number of models available for training.
+        """
+        # Creating all possible combinations of hyperparameters
+        all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
+        # Getting total number of models for each model model training function
+        total_models = len(all_combinations)
+        return total_models
+    def _validate_upper_limit_for_max_models(self,
+                                             hyperparameters_list):
+        """
+        DESCRIPTION:
+            Internal function to validate the upper limit of max_models.
+        PARAMETERS:
+            hyperparameters_list:
+                Required Argument.
+                Specifies the hyperparameters for different ML models.
+                Types: list of dict
+        RETURNS:
+            None
+        RAISES:
+            TeradataMlException, ValueError
+        """
+        model_param_space = 0
+        for hyperparameter_dct in hyperparameters_list:
+            # getting total number of models for each model
+            total_models = self._get_model_param_space(hyperparameter_dct)
+            model_param_space += total_models
+        # Validating upper range for max_models
+        _Validators._validate_argument_range(self.max_models, "max_models", ubound=model_param_space, ubound_inclusive=True)
     def _display_hyperparameters(self,
                                  hyperparameters_list):
         """
@@ -175,7 +263,7 @@ class _ModelTraining:
         PARAMETERS:
             hyperparameters_list:
-                Required Arugment.
+                Required Argument.
                 Specifies the hyperparameters for different ML models.
                 Types: list of dict
@@ -189,16 +277,13 @@ class _ModelTraining:
         # Iterating over hyperparameters_list
         for hyperparameter_dct in hyperparameters_list:
-            # Extracting hyperparameter and thier value from hyperparameters dictionary
+            # Extracting hyperparameter and their value from hyperparameters dictionary
             for key, val in hyperparameter_dct.items():
                 # Displaying hyperparameters
                 print(f"{key} : {str(val)}")
-            # Creating all possible combinations of hyperparameters
-            all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameter_dct.values()]))
             # Displaying total number of models for each model
-            total_models = len(all_combinations)
+            total_models = self._get_model_param_space(hyperparameter_dct)
             print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
             print(f"--"*100+'\n')
@@ -210,7 +295,7 @@ class _ModelTraining:
         PARAMETERS:
             trained_models_info:
-                Required Arugment.
+                Required Argument.
                 Specifies the trained models inforamtion to display.
                 Types: pandas Dataframe
@@ -219,18 +304,25 @@ class _ModelTraining:
         """
         # Creating a copy to avoid use of same reference of memory
         if self.task_type != "Regression":
-            sorted_model_df = trained_models_info.sort_values(by=['Micro-F1', 'Weighted-F1'],
-                                                ascending=[False, False]).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
+                                                              ascending=[False, False]).reset_index(drop=True)
         else:
-            sorted_model_df = trained_models_info.sort_values(by='R2-score',
-                                                ascending=False).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by='R2',
+                                                              ascending=False).reset_index(drop=True)
         # Adding rank to leaderboard
-        sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
-        # Assuming 'sorted_df' is your DataFrame
-        # Excluding the "last_col"
-        leaderboard = sorted_model_df.drop("model-obj", axis=1)
+        sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)
+        # Internal Data list for leaderboard
+        dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]
+        # Excluding the model object and model name from leaderboard
+        leaderboard = sorted_model_df.drop(dp_lst, axis=1)
+        # filtering the rows based on the max_models
+        if self.max_models is not None:
+            leaderboard = leaderboard[leaderboard["RANK"] <= self.max_models]
         self._display_msg(msg="Leaderboard",
                           progress_bar=self.progress_bar,
@@ -343,12 +435,12 @@ class _ModelTraining:
         PARAMETERS:
             num_rows:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int
             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int
@@ -375,28 +467,24 @@ class _ModelTraining:
             max_depth.extend([6, 7, 8])
             min_node_size.extend([2])
             iter_num.extend([20])
-            num_trees.extend([10, 20])
         elif num_rows < 10000 and num_cols < 15:
             min_impurity.extend([0.1, 0.2])
             shrinkage_factor.extend([0.1, 0.3])
             max_depth.extend([6, 8, 10])
             min_node_size.extend([2, 3])
             iter_num.extend([20, 30])
-            num_trees.extend([20, 30])
         elif num_rows < 100000 and num_cols < 20:
             min_impurity.extend([0.2, 0.3])
             shrinkage_factor.extend([0.01, 0.1, 0.2])
             max_depth.extend([4, 6, 7])
             min_node_size.extend([3, 4])
             iter_num.extend([30, 40])
-            num_trees.extend([30, 40])
         else:
             min_impurity.extend([0.1, 0.2, 0.3])
             shrinkage_factor.extend([0.01, 0.05, 0.1])
             max_depth.extend([3, 4, 7, 8])
             min_node_size.extend([2, 3, 4])
             iter_num.extend([20, 30, 40])
-            num_trees.extend([20, 30, 40])
         # Hyperparameters for XGBoost model
         xgb_params = {
@@ -409,7 +497,8 @@ class _ModelTraining:
                 'shrinkage_factor': tuple(shrinkage_factor),
                 'max_depth': tuple(max_depth),
                 'min_node_size': tuple(min_node_size),
-                'iter_num': tuple(iter_num)
+                'iter_num': tuple(iter_num),
+                'seed':42
                 }
         # Hyperparameters for Decision Forest model
         df_params = {
@@ -419,7 +508,8 @@ class _ModelTraining:
                 'min_impurity': tuple(min_impurity),
                 'max_depth': tuple(max_depth),
                 'min_node_size': tuple(min_node_size),
-                'num_trees': tuple(num_trees)
+                'num_trees': tuple(num_trees),
+                'seed':42
         }
         # Updating model type in case of classification
@@ -445,12 +535,12 @@ class _ModelTraining:
         PARAMETERS:
             num_rows
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int
             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int
@@ -482,12 +572,12 @@ class _ModelTraining:
         PARAMETERS:
             num_rows:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int
             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int
@@ -616,6 +706,44 @@ class _ModelTraining:
             raise ValueError("No model is selected for training.")
         return parameters
+    def distribute_max_models(self):
+        """
+        DESCRIPTION:
+            Internal function to distribute max_models across available model functions.
+        RETURNS:
+            dictionary containing max_models distribution and list of models to remove.
+        """
+        # Getting total number of models
+        model_count=len(self.model_list)
+        # Evenly distributing max_models across models
+        base_assign = self.max_models // model_count
+        # Creating list of max_models for each model
+        distribution = [base_assign] * model_count
+        # Calculating remaining models
+        remaining_model_count = self.max_models % model_count
+        if remaining_model_count:
+            # distributing remaining model across models.
+            # Starting from first model in list and distributing remaining models by 1 each.
+            for i in range(remaining_model_count):
+                distribution[i] += 1
+        # Creating dictionary for model distribution
+        model_distribution = dict(zip(self.model_list, distribution))
+        # Getting list of models with 0 distribution and removing them from model list
+        # While for model having distribution greater than 0, updating distribution with
+        # 1/3rd of original value as we are training with 3 different feature selection methods.
+        models_to_remove = []
+        for model in self.model_list:
+            initial_count = model_distribution[model]
+            if initial_count == 0:
+                models_to_remove.append(model)
+            else:
+                model_distribution[model] = math.ceil(initial_count / 3)
+        return model_distribution, models_to_remove
     def _parallel_training(self, parameters):
         """
@@ -635,12 +763,15 @@ class _ModelTraining:
         # Hyperparameters for each model
         model_params = parameters[:min(len(parameters), 5)]
-        self._display_msg(msg="\nPerforming hyperParameter tuning ...", progress_bar=self.progress_bar)
+        self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)
-        # Defining training and testing data
+        # Defining training data
         data_types = ['lasso', 'rfe', 'pca']
         trainng_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_train']) for data_type in data_types)
-        testing_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_test']) for data_type in data_types)
+        if self.task_type == "Classification":
+            response_values = trainng_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
+            self.output_response = [str(i) for i in response_values]
         if self.stopping_metric is None:
             self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
@@ -648,115 +779,31 @@ class _ModelTraining:
         self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
                                 if self.max_runtime_secs is not None else None
+        if self.max_models is not None:
+            # Getting model distribution and models to remove
+            self.max_models_distribution, models_to_remove = self.distribute_max_models()
+            # Removing model parameters with 0 distribution
+            if len(models_to_remove):
+                for model in models_to_remove:
+                    model_params = [param for param in model_params if param['name'] != model]
+                    # Updating progress bar as we are removing model
+                    self.progress_bar.update()
+        if self.is_classification_type():
+            self.startify_col = self.target_column
         trained_models = []
         for param in model_params:
-            result = self._hyperparameter_tunning(param, trainng_datas, testing_datas)
+            result = self._hyperparameter_tunning(param, trainng_datas)
             trained_models.append(result)
         models_df = pd.concat(trained_models, ignore_index=True)
-        # Score the model and combine the results into a single DataFrame
-        trained_models_info = self._model_scoring(testing_datas, models_df)
-        trained_models_info = trained_models_info.reset_index(drop=True)
-        return trained_models_info
-    def _model_scoring(self,
-                       test_data,
-                       model_info):
-        """
-        DESCRIPTION:
-            Internal function generates the performance metrics for
-            trained ML models using testing dataset.
-        PARAMETERS:
-            test_data
-                Required Argument.
-                Specifies the testing datasets
-                Types: tuple of Teradataml DataFrame
-            model_info
-                Required Arugment.
-                Specifies the trained models information.
-                Types: Pandas DataFrame
-        RETURNS:
-            Pandas DataFrame containing, trained models with thier performance metrics.
-        """
-        self._display_msg(msg="Evaluating models performance ...",
-                          progress_bar = self.progress_bar,
-                          show_data=True)
-        # Empty list for storing model performance metrics
-        model_performance_data = []
-        # Mapping feature selection methods to corresponding test data
-        feature_selection_to_test_data = {"lasso": test_data[0],
-                                          "rfe": test_data[1],
-                                          "pca": test_data[2]}
-        # Iterating over models
-        for index, model_row in model_info.iterrows():
-            # Extracting model name, feature selection method, and model object
-            model_name, feature_selection, model_object = model_row['Name'], \
-                                                        model_row['Feature selection'], model_row['obj']
-            # Selecting test data based on feature selection method
-            test_set = feature_selection_to_test_data[feature_selection]
-            # Model evaluation
-            if model_name == 'knn':
-                performance_metrics = model_object.evaluate(test_data=test_set)
-            else:
-                eval_params = self._eval_params_generation(model_name)
-                performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
-            # Extracting performance metrics
-            if self.is_classification_type():
-                # Classification
-                # Extract performance metrics from the output data
-                performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
-                # Combine all the elements to form a new row
-                new_row = [model_name, feature_selection] + performance_metrics_list + [model_object]
-            else:
-                # Regression
-                regression_metrics = next(performance_metrics.result.itertuples())
-                sample_size = test_set.select('id').size
-                feature_count = len(test_set.columns) - 2
-                r2_score = regression_metrics[8]
-                adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
-                new_row = [model_name, feature_selection, regression_metrics[0], regression_metrics[1], regression_metrics[2],
-                        regression_metrics[5], regression_metrics[6], r2_score, adjusted_r2_score, model_object]
-            model_performance_data.append(new_row)
-        if self.is_classification_type():
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Feature selection',
-                                                        'Accuracy','Micro-Precision',
-                                                        'Micro-Recall','Micro-F1',
-                                                        'Macro-Precision','Macro-Recall',
-                                                        'Macro-F1','Weighted-Precision',
-                                                        'Weighted-Recall','Weighted-F1',
-                                                        'model-obj'])
-        else:
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name',
-                                                            'Feature selection',
-                                                            'MAE', 'MSE', 'MSLE',
-                                                            'RMSE', 'RMSLE',
-                                                            'R2-score',
-                                                            'Adjusted R2-score',
-                                                            'model-obj'])
-        self._display_msg(msg="Evaluation completed.",
-                          progress_bar = self.progress_bar,
-                          show_data=True)
-        return model_metrics_df
+        return models_df
     def _hyperparameter_tunning(self,
                                 model_param,
-                                train_data,
-                                test_data):
+                                train_data):
         """
         DESCRIPTION:
             Internal function performs hyperparameter tuning on
@@ -764,18 +811,13 @@ class _ModelTraining:
         PARAMETERS:
             model_param
-                Required Arugment.
+                Required Argument.
                 Specifies the eval_params argument for GridSearch.
                 Types: dict
             train_data:
-                Required Arugment.
-                Specifies the training datasets.
-                Types: tuple of Teradataml DataFrame
-            test_data
                 Required Argument.
-                Specifies the testing datasets
+                Specifies the training datasets.
                 Types: tuple of Teradataml DataFrame
         RETURNS:
@@ -786,21 +828,42 @@ class _ModelTraining:
                          "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}
         # Setting eval_params for hpt.
-        eval_params = self._eval_params_generation(model_param['name'])
+        eval_params = _ModelTraining._eval_params_generation(model_param['name'],
+                                                             self.target_column,
+                                                             self.task_type)
         # Input columns for model
         model_param['input_columns'] = self.features
+        # Setting persist for model
+        model_param['persist'] = self.persist
         self._display_msg(msg=model_param['name'],
                           progress_bar=self.progress_bar,
                           show_data=True)
-        # Defining test data for KNN
+        # As we are using the entire data for HPT training,
+        # passing the prepared training data as test_data for KNN.
         if model_param['name'] == 'knn':
-            model_param['test_data'] = test_data
+            model_param['test_data'] = train_data
-        # Defining Gridsearch with ML model based on Name
-        _obj = GridSearch(func=model_to_func[model_param['name']], params=model_param)
+        if self.task_type == "Classification":
+            model_param['output_prob'] = True
+            model_param['output_responses'] = self.output_response
+        # Using RandomSearch for hyperparameter tuning when max_models is given.
+        # Otherwise, using GridSearch for hyperparameter tuning.
+        if self.max_models is not None:
+            # Setting max_models for RandomSearch based on model name
+            model_param['max_models'] = self.max_models_distribution[model_param['name']]
+            # Defining RandomSearch with ML model based on Name, and max_models
+            _obj = RandomSearch(func=model_to_func[model_param['name']],
+                                params=model_param,
+                                n_iter=model_param['max_models'])
+        else:
+            # Defining Gridsearch with ML model based on Name
+            _obj = GridSearch(func=model_to_func[model_param['name']],
+                              params=model_param)
         if self.verbose > 0:
             print(" " *200, end='\r', flush=True)
@@ -813,46 +876,54 @@ class _ModelTraining:
             _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
                     early_stop=self.stopping_tolerance, run_parallel=True,
                     sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
-                    verbose=verbose, max_time=self.max_runtime_secs)
+                    stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
         else:
             _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
                     early_stop=self.stopping_tolerance, **eval_params,
                     run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
-                    sample_id_column='id', verbose=verbose, max_time=self.max_runtime_secs)
+                    sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
         # Getting all passed models
-        _df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
-        # Mapping data ID to DataFrame
-        data_id_to_df = {"DF_0": _df[_df['DATA_ID']=='DF_0'],
-                         "DF_1": _df[_df['DATA_ID']=='DF_1'],
-                         "DF_2": _df[_df['DATA_ID']=='DF_2']}
-        # Returns best model within a Data_ID group
-        # get_best_model = lambda df: df.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'], ascending=[False, False]).iloc[0]['MODEL_ID']\
-        # if self.task_type != 'Regression' else df.sort_values(by=['R2', 'MAE'], ascending=[False, False]).iloc[0]['MODEL_ID']
-        get_best_model = lambda df, stats: df.sort_values(by=stats, ascending=[False, False]).iloc[0]['MODEL_ID']
-        # best_model = get_best_model(data_id_to_df[data_id], stats)
-        stats = ['MICRO-F1', 'WEIGHTED-F1'] if self.task_type != 'Regression' else ['R2', 'MAE']
-        model_info_data = []
-        # Extracting best model
-        for data_id, df_name in zip(["DF_0", "DF_1", "DF_2"], ["lasso", "rfe", "pca"]):
-            if not data_id_to_df[data_id].empty:
-                best_model = get_best_model(data_id_to_df[data_id], stats)
-                model_info_data.append([model_param['name'], df_name, _obj.get_model(best_model)])
-                self._display_msg(inline_msg=best_model, progress_bar=self.progress_bar)
-        model_info = pd.DataFrame(data=model_info_data, columns=["Name",'Feature selection', "obj"])
+        model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                            on='MODEL_ID', how='inner')
+        # Creating mapping data ID to feature selection method
+        data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
+                                "DF_1": ('rfe', train_data[1]._table_name),
+                                "DF_2": ('pca', train_data[2]._table_name)}
+        # Updating model stats with feature selection method and result table
+        for index, row in model_info.iterrows():
+            model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+            model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+            model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
+            model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+        # Dropping column 'DATA_ID'
+        model_info.drop(['DATA_ID'], axis=1, inplace=True)
+        model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+        if not self.is_classification_type():
+            # Calculating Adjusted-R2 for regression
+            # Getting size and feature count for each feature selection method
+            methods = ["lasso", "rfe", "pca"]
+            size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
+            feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
+            model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
+                1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
+                (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
         self._display_msg(msg="-"*100,
                           progress_bar=self.progress_bar,
                           show_data=True)
         self.progress_bar.update()
         return model_info
-    def _eval_params_generation(self,
-                                ml_name):
+    @staticmethod
+    def _eval_params_generation(ml_name,
+                                target_column,
+                                task_type):
         """
         DESCRIPTION:
             Internal function generates the eval_params for
@@ -860,28 +931,59 @@ class _ModelTraining:
         PARAMETERS:
             ml_name
-                Required Arugment.
+                Required Argument.
                 Specifies the ML name for eval_params generation.
                 Types: str
+            target_column
+                Required Argument.
+                Specifies the target column.
+                Types: str
+            task_type:
+                Required Argument.
+                Specifies the task type for AutoML, whether to apply regression
+                or classification on the provided dataset.
+                Default Value: "Regression"
+                Permitted Values: "Regression", "Classification"
+                Types: str
         RETURNS:
             dict containing, eval_params for ML model.
         """
         # Setting the eval_params
         eval_params = {"id_column": "id",
-                        "accumulate": self.target_column}
+                        "accumulate": target_column}
+        model_type = {
+            'xgboost': 'model_type',
+            'glm': 'model_type',
+            'decisionforest': 'tree_type',
+            'svm': 'model_type',
+            'knn': 'model_type'
+        }
+        ml_name = ml_name.replace('_', '').lower()
         # For Classification
-        if self.task_type != "Regression":
+        if task_type.lower() != "regression":
+            eval_params[model_type[ml_name]] = 'Classification'
+            eval_params['output_prob'] = True
             if ml_name == 'xgboost':
-                eval_params['model_type'] = 'Classification'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
-            else:
-                eval_params['output_prob'] = True
+            elif ml_name == 'glm':
+                eval_params['family'] = 'BINOMIAL'
         else:
         # For Regression
+            eval_params[model_type[ml_name]] = 'Regression'
             if ml_name == 'xgboost':
-                eval_params['model_type'] = 'Regression'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
+            elif ml_name == 'glm':
+                eval_params['family'] = 'GAUSSIAN'
         return eval_params

teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

Potentially problematic release.

teradataml 20.0.0.0py3-none-any.whl → 20.0.0.2py3-none-any.whl