teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (240)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +306 -0
  4. teradataml/__init__.py +10 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +299 -16
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +13 -3
  11. teradataml/analytics/json_parser/utils.py +13 -6
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +11 -2
  15. teradataml/analytics/table_operator/__init__.py +4 -3
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +66 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1502 -323
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +247 -307
  22. teradataml/automl/data_transformation.py +32 -12
  23. teradataml/automl/feature_engineering.py +325 -86
  24. teradataml/automl/model_evaluation.py +44 -35
  25. teradataml/automl/model_training.py +122 -153
  26. teradataml/catalog/byom.py +8 -8
  27. teradataml/clients/pkce_client.py +1 -1
  28. teradataml/common/__init__.py +2 -1
  29. teradataml/common/constants.py +72 -0
  30. teradataml/common/deprecations.py +13 -7
  31. teradataml/common/garbagecollector.py +152 -120
  32. teradataml/common/messagecodes.py +11 -2
  33. teradataml/common/messages.py +4 -1
  34. teradataml/common/sqlbundle.py +26 -4
  35. teradataml/common/utils.py +225 -14
  36. teradataml/common/wrapper_utils.py +1 -1
  37. teradataml/context/context.py +82 -2
  38. teradataml/data/SQL_Fundamentals.pdf +0 -0
  39. teradataml/data/complaints_test_tokenized.csv +353 -0
  40. teradataml/data/complaints_tokens_model.csv +348 -0
  41. teradataml/data/covid_confirm_sd.csv +83 -0
  42. teradataml/data/dataframe_example.json +27 -1
  43. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  44. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  45. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  46. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  47. teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
  48. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  49. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  50. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  51. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  52. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  53. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  54. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  55. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  56. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  57. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  58. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  59. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  60. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  61. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  62. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  63. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  64. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  65. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  66. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  67. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  68. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  69. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  70. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  71. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  72. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  74. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  75. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  76. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  77. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  78. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  79. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  80. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  81. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  82. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  83. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  84. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  85. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  86. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  87. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  88. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  89. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  90. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  91. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  92. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  93. teradataml/data/dwt2d_dataTable.csv +65 -0
  94. teradataml/data/dwt_dataTable.csv +8 -0
  95. teradataml/data/dwt_filterTable.csv +3 -0
  96. teradataml/data/finance_data4.csv +13 -0
  97. teradataml/data/grocery_transaction.csv +19 -0
  98. teradataml/data/idwt2d_dataTable.csv +5 -0
  99. teradataml/data/idwt_dataTable.csv +8 -0
  100. teradataml/data/idwt_filterTable.csv +3 -0
  101. teradataml/data/interval_data.csv +5 -0
  102. teradataml/data/jsons/paired_functions.json +14 -0
  103. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  104. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  105. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  106. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  107. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  108. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  109. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  110. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  111. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  112. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  113. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  114. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  115. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  116. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  117. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  118. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  119. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  120. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  121. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  122. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  123. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  124. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  125. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  126. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  127. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  128. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  129. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  130. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  131. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  132. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  133. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  134. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  135. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  136. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  137. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  138. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  139. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  140. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  141. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  142. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  143. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  144. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  145. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  146. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  147. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  148. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  149. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  150. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  151. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  152. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  153. teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
  154. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  155. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  156. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  157. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  158. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  159. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
  160. teradataml/data/load_example_data.py +8 -2
  161. teradataml/data/medical_readings.csv +101 -0
  162. teradataml/data/naivebayestextclassifier_example.json +1 -1
  163. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  164. teradataml/data/patient_profile.csv +101 -0
  165. teradataml/data/peppers.png +0 -0
  166. teradataml/data/real_values.csv +14 -0
  167. teradataml/data/sax_example.json +8 -0
  168. teradataml/data/scripts/deploy_script.py +1 -1
  169. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  170. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  171. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  172. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  173. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
  174. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  175. teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
  176. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  177. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  178. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  179. teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
  180. teradataml/data/star_pivot.csv +8 -0
  181. teradataml/data/target_udt_data.csv +8 -0
  182. teradataml/data/templates/open_source_ml.json +3 -1
  183. teradataml/data/teradataml_example.json +20 -1
  184. teradataml/data/timestamp_data.csv +4 -0
  185. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  186. teradataml/data/uaf_example.json +55 -1
  187. teradataml/data/unpivot_example.json +15 -0
  188. teradataml/data/url_data.csv +9 -0
  189. teradataml/data/vectordistance_example.json +4 -0
  190. teradataml/data/windowdfft.csv +16 -0
  191. teradataml/dataframe/copy_to.py +1 -1
  192. teradataml/dataframe/data_transfer.py +5 -3
  193. teradataml/dataframe/dataframe.py +1002 -201
  194. teradataml/dataframe/fastload.py +3 -3
  195. teradataml/dataframe/functions.py +867 -0
  196. teradataml/dataframe/row.py +160 -0
  197. teradataml/dataframe/setop.py +2 -2
  198. teradataml/dataframe/sql.py +840 -33
  199. teradataml/dataframe/window.py +1 -1
  200. teradataml/dbutils/dbutils.py +878 -34
  201. teradataml/dbutils/filemgr.py +48 -1
  202. teradataml/geospatial/geodataframe.py +1 -1
  203. teradataml/geospatial/geodataframecolumn.py +1 -1
  204. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  205. teradataml/lib/aed_0_1.dll +0 -0
  206. teradataml/opensource/__init__.py +1 -1
  207. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  208. teradataml/opensource/_lightgbm.py +950 -0
  209. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  210. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  211. teradataml/opensource/sklearn/__init__.py +0 -1
  212. teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
  213. teradataml/options/__init__.py +9 -23
  214. teradataml/options/configure.py +42 -4
  215. teradataml/options/display.py +2 -2
  216. teradataml/plot/axis.py +4 -4
  217. teradataml/scriptmgmt/UserEnv.py +13 -9
  218. teradataml/scriptmgmt/lls_utils.py +77 -23
  219. teradataml/store/__init__.py +13 -0
  220. teradataml/store/feature_store/__init__.py +0 -0
  221. teradataml/store/feature_store/constants.py +291 -0
  222. teradataml/store/feature_store/feature_store.py +2223 -0
  223. teradataml/store/feature_store/models.py +1505 -0
  224. teradataml/store/vector_store/__init__.py +1586 -0
  225. teradataml/table_operators/Script.py +2 -2
  226. teradataml/table_operators/TableOperator.py +106 -20
  227. teradataml/table_operators/query_generator.py +3 -0
  228. teradataml/table_operators/table_operator_query_generator.py +3 -1
  229. teradataml/table_operators/table_operator_util.py +102 -56
  230. teradataml/table_operators/templates/dataframe_register.template +69 -0
  231. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  232. teradataml/telemetry_utils/__init__.py +0 -0
  233. teradataml/telemetry_utils/queryband.py +52 -0
  234. teradataml/utils/dtypes.py +4 -2
  235. teradataml/utils/validators.py +34 -2
  236. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
  237. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
  238. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  239. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  240. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/automl/model_evaluation.py

@@ -15,6 +15,7 @@
 
 # Python libraries
 import time
+import ast
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -56,7 +57,12 @@ class _ModelEvaluator:
         self.target_column = target_column
         self.task_type = task_type
 
-    def model_evaluation(self, rank, table_name_mapping, test_data_ind = False, target_column_ind = False):
+    def model_evaluation(self,
+                         rank,
+                         table_name_mapping,
+                         data_node_id,
+                         target_column_ind = True,
+                         get_metrics = False):
         """
         DESCRIPTION:
             Function performs the model evaluation on the specified rank in leaderborad.
@@ -72,25 +78,32 @@ class _ModelEvaluator:
                 Specifies the mapping of train,test table names.
                 Types: dict
 
-            test_data_ind:
-                Optional Argument.
-                Specifies whether test data is present or not.
-                Default Value: False
-                Types: bool
+            data_node_id:
+                Required Argument.
+                Specifies the test data node id.
+                Types: str
 
             target_column_ind:
                 Optional Argument.
                 Specifies whether target column is present in the dataset or not.
+                Default Value: True
+                Types: bool
+
+            get_metrics:
+                Optional Argument.
+                Specifies whether to return metrics or not.
                 Default Value: False
+                Types: bool
 
         RETURNS:
             tuple containing, performance metrics and predicitions of specified rank ML model.
 
         """
-        # Setting test data indicator and target column indicator
-        self.test_data_ind = test_data_ind
+        # Setting target column indicator
         self.target_column_ind = target_column_ind
         self.table_name_mapping = table_name_mapping
+        self.data_node_id = data_node_id
+        self.get_metrics = get_metrics
 
         # Return predictions only if test data is present and target column is not present
         return self._evaluator(rank)
@@ -114,38 +127,34 @@ class _ModelEvaluator:
         """
         # Extracting model using rank
         model = self.model_info.loc[rank]
+
+        ml_name = self.model_info.loc[rank]['MODEL_ID'].split('_')[0]
 
         # Defining eval_params
-        eval_params = _ModelTraining._eval_params_generation(model['Name'],
+        eval_params = _ModelTraining._eval_params_generation(ml_name,
                                                              self.target_column,
                                                              self.task_type)
 
-        # Test Data
-        test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
-
-        # Getting test data from table
-        if not self.test_data_ind:
-            # Test Data
-            test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
-        else:
-            test = DataFrame(self.table_name_mapping['{}_new_test'.format(model['Feature-Selection'])])
-
-        print("\nFollowing model is being used for generating prediction :")
-        print("Model ID :", model['Model-ID'],
-              "\nFeature Selection Method :",model['Feature-Selection'])
+        # Extracting test data for evaluation based on data node id
+        test = DataFrame(self.table_name_mapping[self.data_node_id]['{}_new_test'.format(model['FEATURE_SELECTION'])])
 
-        # Evaluation and predictions
-        if model['Name'] == 'knn':
-            metrics = model['model-obj'].evaluate(test_data=test)
-            pred = model['model-obj'].predict(test_data=test)
+        print("\nFollowing model is being picked for evaluation:")
+        print("Model ID :", model['MODEL_ID'],
+              "\nFeature Selection Method :",model['FEATURE_SELECTION'])
+
+        if self.task_type.lower() == 'classification':
+            params = ast.literal_eval(model['PARAMETERS'])
+            eval_params['output_responses'] = params['output_responses']
+
+        # Mapping data according to model type
+        data_map = 'test_data' if ml_name == 'KNN' else 'newdata'
+        # Performing evaluation if get_metrics is True else returning predictions
+        if self.get_metrics:
+            metrics = model['model-obj'].evaluate(**{data_map: test}, **eval_params)
+            return metrics
         else:
-            # Return predictions only if test data is present and target column is not present
-            if self.test_data_ind and not self.target_column_ind:
+            # Removing accumulate parameter if target column is not present
+            if not self.target_column_ind:
                 eval_params.pop("accumulate")
-                pred = model['model-obj'].predict(newdata=test, **eval_params)
-                return pred
-            # Return both metrics and predictions for all other cases
-            metrics = model['model-obj'].evaluate(newdata=test, **eval_params)
-            pred = model['model-obj'].predict(newdata=test, **eval_params)
-
-            return (metrics, pred)
+            pred = model['model-obj'].predict(**{data_map: test}, **eval_params)
+            return pred
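The net effect of the _ModelEvaluator hunks: the boolean test_data_ind is gone, callers identify the test data through a data_node_id that keys into a now-nested table_name_mapping, and get_metrics chooses between returning metrics and returning predictions. A minimal sketch of the new call shape, assuming an active Vantage session; _ModelEvaluator is internal to AutoML, and the constructor arguments plus all node and table names below are illustrative assumptions inferred from the hunks, not part of the diff:

# Hypothetical sketch; not a public API.
from teradataml.automl.model_evaluation import _ModelEvaluator

# table_name_mapping is now keyed by data node id first, then by
# '<feature_selection>_new_test' (placeholder names throughout).
table_name_mapping = {
    "node_0": {"lasso_new_test": "t_lasso",
               "rfe_new_test": "t_rfe",
               "pca_new_test": "t_pca"},
}

# model_info: the leaderboard pandas DataFrame produced by model training.
evaluator = _ModelEvaluator(model_info,
                            target_column="price",
                            task_type="Regression")

# get_metrics=True runs evaluate() and returns metrics;
# the default (False) runs predict() and returns predictions.
metrics = evaluator.model_evaluation(rank=1,
                                     table_name_mapping=table_name_mapping,
                                     data_node_id="node_0",
                                     get_metrics=True)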
teradataml/automl/model_training.py

@@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor
 import math
 import pandas as pd
 from itertools import product
+import numpy as np
 
 # Teradata libraries
 from teradataml.context import context as tdmlctx
@@ -27,6 +28,7 @@ from teradataml.dataframe.dataframe import DataFrame
 from teradataml import execute_sql, get_connection
 from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
 from teradataml.utils.validators import _Validators
+from teradataml.common.utils import UtilFuncs
 
 
 class _ModelTraining:
@@ -38,7 +40,8 @@ class _ModelTraining:
                  verbose=0,
                  features=None,
                  task_type="Regression",
-                 custom_data = None):
+                 custom_data = None,
+                 **kwargs):
         """
         DESCRIPTION:
             Function initializes the data, target column, features and models
@@ -89,6 +92,28 @@
                 Optional Argument.
                 Specifies json object containing user customized input.
                 Types: json object
+
+            **kwargs:
+                Specifies the additional arguments for model training. Below
+                are the additional arguments:
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the interim results of the
+                    functions in a volatile table or not. When set to
+                    True, results are stored in a volatile table,
+                    otherwise not.
+                    Default Value: False
+                    Types: bool
+
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the interim results of the
+                    functions in a table or not. When set to True,
+                    results are persisted in a table; otherwise,
+                    results are garbage collected at the end of the
+                    session.
+                    Default Value: False
+                    Types: bool
         """
         self.data = data
         self.target_column = target_column
@@ -99,6 +124,8 @@
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
         self.startify_col = None
+        self.persist = kwargs.get("persist", False)
+        self.volatile = kwargs.get("volatile", False)
 
     def model_training(self,
                        auto=True,
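The new volatile/persist keywords documented above control where the interim result tables live. A hedged sketch of how they would be passed from the public interface, assuming AutoML forwards fit-time kwargs down to _ModelTraining as this diff suggests; table and column names are placeholders, so check the 20.0.0.3 user guide for the exact entry point:

# Sketch under assumptions; verify against the released documentation.
from teradataml import AutoML, DataFrame

train = DataFrame("housing_train")      # placeholder table
aml = AutoML(task_type="Regression")
# persist=True keeps interim result tables instead of letting the
# garbage collector drop them at session end; volatile=True would
# route them to volatile tables instead.
aml.fit(train, "price", persist=True)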
@@ -278,20 +305,25 @@
         """
         # Creating a copy to avoid use of same reference of memory
         if self.task_type != "Regression":
-            sorted_model_df = trained_models_info.sort_values(by=['Micro-F1', 'Weighted-F1'],
-                                                              ascending=[False, False]).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
+                                                              ascending=[False, False]).reset_index(drop=True)
         else:
-            sorted_model_df = trained_models_info.sort_values(by='R2-score',
-                                                              ascending=False).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by='R2',
+                                                              ascending=False).reset_index(drop=True)
+
 
         # Adding rank to leaderboard
-        sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
+        sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)
+
+        # Internal Data list for leaderboard
+        dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]
 
         # Excluding the model object and model name from leaderboard
-        leaderboard = sorted_model_df.drop(["model-obj","Name"], axis=1)
+        leaderboard = sorted_model_df.drop(dp_lst, axis=1)
+
         # filtering the rows based on the max_models
         if self.max_models is not None:
-            leaderboard = leaderboard[leaderboard["Rank"] <= self.max_models]
+            leaderboard = leaderboard[leaderboard["RANK"] <= self.max_models]
 
         self._display_msg(msg="Leaderboard",
                           progress_bar=self.progress_bar,
@@ -436,28 +468,24 @@
             max_depth.extend([6, 7, 8])
             min_node_size.extend([2])
             iter_num.extend([20])
-            num_trees.extend([10, 20])
         elif num_rows < 10000 and num_cols < 15:
             min_impurity.extend([0.1, 0.2])
             shrinkage_factor.extend([0.1, 0.3])
             max_depth.extend([6, 8, 10])
             min_node_size.extend([2, 3])
             iter_num.extend([20, 30])
-            num_trees.extend([20, 30])
         elif num_rows < 100000 and num_cols < 20:
             min_impurity.extend([0.2, 0.3])
             shrinkage_factor.extend([0.01, 0.1, 0.2])
             max_depth.extend([4, 6, 7])
             min_node_size.extend([3, 4])
             iter_num.extend([30, 40])
-            num_trees.extend([30, 40])
         else:
             min_impurity.extend([0.1, 0.2, 0.3])
             shrinkage_factor.extend([0.01, 0.05, 0.1])
             max_depth.extend([3, 4, 7, 8])
             min_node_size.extend([2, 3, 4])
             iter_num.extend([20, 30, 40])
-            num_trees.extend([20, 30, 40])
 
         # Hyperparameters for XGBoost model
         xgb_params = {
@@ -736,12 +764,15 @@
 
         # Hyperparameters for each model
         model_params = parameters[:min(len(parameters), 5)]
-        self._display_msg(msg="\nPerforming hyperParameter tuning ...", progress_bar=self.progress_bar)
+        self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)
 
-        # Defining training and testing data
+        # Defining training data
         data_types = ['lasso', 'rfe', 'pca']
         trainng_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_train']) for data_type in data_types)
-        testing_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_test']) for data_type in data_types)
+
+        if self.task_type == "Classification":
+            response_values = trainng_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
+            self.output_response = [str(i) for i in response_values]
 
         if self.stopping_metric is None:
             self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
@@ -765,115 +796,16 @@
 
         trained_models = []
         for param in model_params:
-            result = self._hyperparameter_tunning(param, trainng_datas, testing_datas)
-            trained_models.append(result)
+            result = self._hyperparameter_tunning(param, trainng_datas)
+            if result is not None:
+                trained_models.append(result)
 
         models_df = pd.concat(trained_models, ignore_index=True)
-
-        # Score the model and combine the results into a single DataFrame
-        trained_models_info = self._model_scoring(testing_datas, models_df)
-        trained_models_info = trained_models_info.reset_index(drop=True)
-
-        return trained_models_info
-
-    def _model_scoring(self,
-                       test_data,
-                       model_info):
-        """
-        DESCRIPTION:
-            Internal function generates the performance metrics for
-            trained ML models using testing dataset.
-
-        PARAMETERS:
-            test_data
-                Required Argument.
-                Specifies the testing datasets
-                Types: tuple of Teradataml DataFrame
-
-            model_info
-                Required Argument.
-                Specifies the trained models information.
-                Types: Pandas DataFrame
-
-        RETURNS:
-            Pandas DataFrame containing, trained models with their performance metrics.
-        """
-        self._display_msg(msg="Evaluating models performance ...",
-                          progress_bar = self.progress_bar,
-                          show_data=True)
-        # Empty list for storing model performance metrics
-        model_performance_data = []
-
-        # Mapping feature selection methods to corresponding test data
-        feature_selection_to_test_data = {"lasso": test_data[0],
-                                          "rfe": test_data[1],
-                                          "pca": test_data[2]}
-
-        # Iterating over models
-        for index, model_row in model_info.iterrows():
-            # Extracting model name, model id, feature selection method, and model object
-            model_name, model_id, feature_selection, model_object = model_row['Name'], \
-                model_row['Model-ID'], model_row['Feature-Selection'], model_row['obj']
-
-            # Selecting test data based on feature selection method
-            test_set = feature_selection_to_test_data[feature_selection]
-
-            # Model evaluation
-            if model_name == 'knn':
-                performance_metrics = model_object.evaluate(test_data=test_set)
-            else:
-                eval_params = _ModelTraining._eval_params_generation(model_name,
-                                                                     self.target_column,
-                                                                     self.task_type)
-                performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
-
-            # Extracting performance metrics
-            if self.is_classification_type():
-                # Classification
-                # Extract performance metrics from the output data
-                performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
-
-                # Combine all the elements to form a new row
-                new_row = [model_name, model_id, feature_selection] + performance_metrics_list + [model_object]
-            else:
-                # Regression
-                regression_metrics = next(performance_metrics.result.itertuples())
-                sample_size = test_set.select('id').size
-                feature_count = len(test_set.columns) - 2
-                r2_score = regression_metrics[8]
-                adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
-                new_row = [model_name, model_id, feature_selection, regression_metrics[0],
-                           regression_metrics[1], regression_metrics[2], regression_metrics[5],
-                           regression_metrics[6], r2_score, adjusted_r2_score, model_object]
-
-            model_performance_data.append(new_row)
-
-        if self.is_classification_type():
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Model-ID',
-                                            'Feature-Selection','Accuracy','Micro-Precision',
-                                            'Micro-Recall','Micro-F1',
-                                            'Macro-Precision','Macro-Recall',
-                                            'Macro-F1','Weighted-Precision',
-                                            'Weighted-Recall','Weighted-F1',
-                                            'model-obj'])
-        else:
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name', 'Model-ID',
-                                            'Feature-Selection',
-                                            'MAE', 'MSE', 'MSLE',
-                                            'RMSE', 'RMSLE',
-                                            'R2-score',
-                                            'Adjusted R2-score',
-                                            'model-obj'])
-        self._display_msg(msg="Evaluation completed.",
-                          progress_bar = self.progress_bar,
-                          show_data=True)
-
-        return model_metrics_df
-
+        return models_df
+
     def _hyperparameter_tunning(self,
                                 model_param,
-                                train_data,
-                                test_data):
+                                train_data):
         """
         DESCRIPTION:
             Internal function performs hyperparameter tuning on
@@ -890,11 +822,6 @@
                 Specifies the training datasets.
                 Types: tuple of Teradataml DataFrame
 
-            test_data
-                Required Argument.
-                Specifies the testing datasets
-                Types: tuple of Teradataml DataFrame
-
         RETURNS:
             pandas DataFrame containing, trained models information.
         """
@@ -910,13 +837,21 @@
         # Input columns for model
         model_param['input_columns'] = self.features
 
+        # Setting persist for model
+        model_param['persist'] = self.persist
+
         self._display_msg(msg=model_param['name'],
                           progress_bar=self.progress_bar,
                           show_data=True)
 
-        # Defining test data for KNN
+        # As we are using entire data for HPT training. So,
+        # passing prepared training data as test_data for KNN.
         if model_param['name'] == 'knn':
-            model_param['test_data'] = test_data
+            model_param['test_data'] = train_data
+
+        if self.task_type == "Classification":
+            model_param['output_prob'] = True
+            model_param['output_responses'] = self.output_response
 
         # Using RandomSearch for hyperparameter tunning when max_models is given.
         # Otherwise, using GridSearch for hyperparameter tunning.
@@ -951,26 +886,45 @@
             sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
 
         # Getting all passed models
-        _df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
-        # Creating mapping data ID to feature selection method
-        data_id_to_method_map = {"DF_0": "lasso", "DF_1": "rfe", "DF_2": "pca"}
-
-        # Mapping data ID to feature selection method
-        _df['Feature-Selection'] = _df['DATA_ID'].map(data_id_to_method_map)
-        # Getting model details
-        _df['Name'] = model_param['name']
-        _df['Model-ID'] = _df['MODEL_ID']
-        _df['obj'] = _df['MODEL_ID'].apply(lambda x: _obj.get_model(x))
-
-        # Extracting needed columns
-        model_info = _df[["Name", "Model-ID", "Feature-Selection", "obj"]]
-
-        self._display_msg(msg="-"*100,
-                          progress_bar=self.progress_bar,
-                          show_data=True)
-        self.progress_bar.update()
+        model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                            on='MODEL_ID', how='inner')
+        if not model_info.empty:
+            # Creating mapping data ID to feature selection method
+            data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
+                                    "DF_1": ('rfe', train_data[1]._table_name),
+                                    "DF_2": ('pca', train_data[2]._table_name)}
+
+            # Updating model stats with feature selection method and result table
+            for index, row in model_info.iterrows():
+                model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+                model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+                model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
+                model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+
+            # Dropping column 'DATA_ID'
+            model_info.drop(['DATA_ID'], axis=1, inplace=True)
 
-        return model_info
+            model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+            if not self.is_classification_type():
+                # Calculating Adjusted-R2 for regression
+                # Getting size and feature count for each feature selection method
+                methods = ["lasso", "rfe", "pca"]
+                size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
+                feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
+                model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
+                    1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
+                    (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
+
+            self._display_msg(msg="-"*100,
+                              progress_bar=self.progress_bar,
+                              show_data=True)
+            self.progress_bar.update()
+
+            return model_info
+
+        # Returning None, if no model is passed
+        return None
 
     @staticmethod
     def _eval_params_generation(ml_name,
@@ -1006,21 +960,36 @@
         # Setting the eval_params
         eval_params = {"id_column": "id",
                        "accumulate": target_column}
+
+        model_type = {
+            'xgboost': 'model_type',
+            'glm': 'model_type',
+            'decisionforest': 'tree_type',
+            'svm': 'model_type',
+            'knn': 'model_type'
+        }
+
+        ml_name = ml_name.replace('_', '').lower()
 
         # For Classification
         if task_type.lower() != "regression":
+            eval_params[model_type[ml_name]] = 'Classification'
+            eval_params['output_prob'] = True
+
             if ml_name == 'xgboost':
-                eval_params['model_type'] = 'Classification'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
-            else:
-                if ml_name == 'glm':
-                    eval_params['family'] = 'BINOMIAL'
-
-                eval_params['output_prob'] = True
+
+            elif ml_name == 'glm':
+                eval_params['family'] = 'BINOMIAL'
+
         else:
             # For Regression
+            eval_params[model_type[ml_name]] = 'Regression'
+
             if ml_name == 'xgboost':
-                eval_params['model_type'] = 'Regression'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
+
+            elif ml_name == 'glm':
+                eval_params['family'] = 'GAUSSIAN'
 
         return eval_params
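Because _eval_params_generation is a @staticmethod with no session dependency, the rewritten branching can be checked in isolation. A short illustration read directly off the hunk above (internal helper, so the import path may change between releases):

from teradataml.automl.model_training import _ModelTraining

params = _ModelTraining._eval_params_generation("XGBoost", "price", "Classification")
# -> {'id_column': 'id', 'accumulate': 'price',
#     'model_type': 'Classification', 'output_prob': True,
#     'object_order_column': ['task_index', 'tree_num', 'iter',
#                             'class_num', 'tree_order']}

params = _ModelTraining._eval_params_generation("DecisionForest", "price", "Regression")
# -> {'id_column': 'id', 'accumulate': 'price', 'tree_type': 'Regression'}
# 'decisionforest' maps to 'tree_type' rather than 'model_type'; a GLM
# regression would additionally get family='GAUSSIAN'.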
teradataml/catalog/byom.py

@@ -26,7 +26,7 @@ from teradataml.options.display import display
 from teradataml.common.constants import ModelCatalogingConstants as mac
 from teradataml.options.configure import configure
 from teradataml.utils.utils import execute_sql
-from teradatasqlalchemy.telemetry.queryband import collect_queryband
+from teradataml.telemetry_utils.queryband import collect_queryband
 
 validator = _Validators()
 
@@ -541,13 +541,12 @@ def save_byom(model_id,
     # If exists, extract required information about table columns types
     # else extract from additional_columns_types.
     # Also validate model_id against allowed length.
-    table_exists = connection.dialect.has_table(connection, table_name=table_name, schema=schema_name)
+    table_exists = connection.dialect.has_table(connection, table_name=table_name,
+                                                schema=schema_name, table_only=True)
     if table_exists:
         # Check if model exists or not. If exists, raise error.
         __check_if_model_exists(
             model_id, table_name, schema_name, raise_error_if_model_found=True)
-        if len(additional_columns_types) != 0:
-            warnings.warn("Argument additional_columns_types is ignored since table already exists.", stacklevel=2)
 
         # Gather column name and type information from existing table
         existing_table_df = DataFrame(in_schema(schema_name, table_name))
@@ -807,7 +806,7 @@ def delete_byom(model_id, table_name=None, schema_name=None):
 
     # Before proceed further, check whether table exists or not.
     conn = get_connection()
-    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name):
+    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name, table_only=True):
         error_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
         error_msg = Messages.get_message(
             error_code, "delete", 'Table "{}.{}" does not exist.'.format(schema_name, table_name))
@@ -1472,7 +1471,7 @@ def retrieve_byom(model_id,
 
     # Before proceeding further, check whether table exists or not.
     conn = get_connection()
-    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name):
+    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name, table_only=True):
         error_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
         error_msg = Messages.get_message(
             error_code, "retrieve", 'Table "{}.{}" does not exist.'.format(schema_name, table_name))
@@ -1535,7 +1534,8 @@ def retrieve_byom(model_id,
         license_table = in_schema(license_schema_name, license_table_name)
 
         # Check whether license table exists or not before proceed further.
-        if not conn.dialect.has_table(conn, table_name=license_table_name, schema=license_schema_name):
+        if not conn.dialect.has_table(conn, table_name=license_table_name, schema=license_schema_name,
+                                      table_only=True):
             error_code = MessageCodes.EXECUTION_FAILED
             error_msg = Messages.get_message(
                 error_code, "retrieve the model", 'Table "{}" does not exist.'.format(license_table))
@@ -1723,7 +1723,7 @@ def list_byom(table_name=None, schema_name=None, model_id=None):
 
     # Before proceeding further, check whether table exists or not.
     conn = get_connection()
-    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name):
+    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name, table_only=True):
         error_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
         error_msg = Messages.get_message(
             error_code, "list", 'Table "{}.{}" does not exist.'.format(schema_name, table_name))
teradataml/clients/pkce_client.py

@@ -425,7 +425,7 @@ class _DAWorkflow:
         """
         device_cfg = requests.post(
             url=self.device_auth_end_point,
-            data={'client_id': self.__client_id})
+            data={'client_id': self.__client_id, 'scope': 'openid'})
 
         # Check the status. If response is not 200, raise error.
         _Validators._validate_http_response(device_cfg, 200, "get the device metadata")
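The one-line pkce_client.py change adds scope=openid to the device-authorization request, which OpenID Connect providers generally require before they will issue an ID token. A standalone sketch of the equivalent request; the endpoint URL and client id are placeholders:

import requests

device_cfg = requests.post(
    url="https://idp.example.com/oauth2/device_authorization",
    data={"client_id": "example-client", "scope": "openid"})
# The client validates for HTTP 200 before using the payload
# (device_code, user_code, verification_uri, ...).
device_cfg.raise_for_status()
print(device_cfg.json())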
teradataml/common/__init__.py

@@ -1 +1,2 @@
-from teradataml.common.formula import as_categorical
+from teradataml.common.formula import as_categorical
+from teradataml.common.constants import Action, Permission