teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/analytics/uaf/__init__.py
CHANGED
@@ -73,7 +73,7 @@ for func in _uaf_functions:
                            "__doc__": _AnalyticFunction.__doc__,
                            "__dir__": _common_dir})
 
-_stored_procedure = ['FilterFactory1d']
+_stored_procedure = ['CopyArt', 'FilterFactory1d']
 
 for func in _stored_procedure:
    globals()[func] = type("{}".format(func), (_AnalyticFunction,),
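With CopyArt added to the dynamically registered stored procedures, it becomes importable from the teradataml namespace like the other UAF functions. A hedged usage sketch; the argument names are taken from the FuncSpecialCaseHandler entry in the next hunk, and the exact generated signature is an assumption, not confirmed by this diff:

    from teradataml import create_context, CopyArt

    # Hypothetical connection details.
    create_context(host="tdhost", username="alice", password="***")

    # Copy table "sales" in database "src_db" onto map "TD_MAP1" as a permanent table.
    CopyArt(database_name="src_db",
            table_name="sales",
            map_name="TD_MAP1",
            permanent_table=True)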
teradataml/analytics/utils.py
CHANGED
@@ -441,6 +441,10 @@ class FuncSpecialCaseHandler():
                         "filter_type": self._single_quote_arg,
                         "window_type": self._single_quote_arg,
                         "filter_description": self._single_quote_arg},
+            "CopyArt": {"database_name": self._single_quote_arg,
+                        "table_name": self._single_quote_arg,
+                        "map_name": self._single_quote_arg,
+                        "permanent_table": self._single_quote_arg},
             "DWT": {"wavelet": self._single_quote_arg},
             "IDWT": {"part": self._single_quote_arg,
                      "wavelet": self._single_quote_arg,
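Every argument listed here is routed through a handler that emits its value as a single-quoted SQL literal in the generated call. A minimal sketch of what a handler like _single_quote_arg plausibly does; the real implementation is internal and not shown in this diff:

    def single_quote_arg(value):
        # Render the value as a SQL string literal, escaping embedded quotes.
        return "'{}'".format(str(value).replace("'", "''"))

    assert single_quote_arg("TD_MAP1") == "'TD_MAP1'"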
teradataml/analytics/valib.py
CHANGED
@@ -26,6 +26,8 @@ from teradataml.dataframe.dataframe import DataFrame, in_schema
 from teradataml.utils.validators import _Validators
 from teradataml.analytics.Transformations import Binning, Derive, OneHotEncoder, FillNa, \
     LabelEncoder, MinMaxScalar, Retain, Sigmoid, ZScore
+from teradataml.common.constants import TeradataReservedKeywords, TeradataConstants
+
 
 class _VALIB():
     """ An internal class for executing VALIB analytic functions. """
@@ -370,9 +372,16 @@ class _VALIB():
             self.__get_temp_table_name()
         """
         prefix = "valib_{}".format(self.__tdml_valib_name.lower())
-
-
-
+        tbl_name = UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
+                                                       gc_on_quit=True, quote=False,
+                                                       table_type=TeradataConstants.TERADATA_TABLE)
+        # With the VT option, the table name is generated with a 'vt_' prefix,
+        # but the object is not created as a volatile table. Hence it is
+        # explicitly garbage collected.
+        if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+            GarbageCollector._add_to_garbagecollector(tbl_name,
+                                                      TeradataConstants.TERADATA_TABLE)
+        return tbl_name
 
     def __process_dyn_cls_output_member(self, arg_name, out_tablename, out_var=None):
         """
@@ -447,6 +456,7 @@ class _VALIB():
         # Add extension to the table name.
         generated_table_name = "{}{}".format(table_name, extension)
 
+
         # Register new output table to the GC.
         gc_tabname = "\"{}\".\"{}\"".format(self.__db_name, generated_table_name)
         GarbageCollector._add_to_garbagecollector(gc_tabname, TeradataConstants.TERADATA_TABLE)
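The volatile-table branch above keys off configure.temp_object_type. A sketch of how a session opts in; the "VT" value comes from the AutoML notes elsewhere in this diff, and the comparison against TeradataConstants.TERADATA_VOLATILE_TABLE happens inside teradataml:

    from teradataml import configure

    # Ask teradataml to create its internal temporary objects as volatile tables.
    # Per the notes added in this release, "VT" mode runs AutoML work sequentially.
    configure.temp_object_type = "VT"

In VT mode the generated name gets a 'vt_' prefix but, as the comment in the hunk explains, the object may still land as a permanent table, which is why it is registered with the garbage collector explicitly.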
@@ -1463,7 +1473,7 @@ class _VALIB():
         if gen_sql_only:
             valib_inst.__generate_valib_sql_argument_syntax(arg=str(gen_sql_only),
                                                             arg_name="gensqlonly")
-
+        charset = kwargs.pop("charset", None)
         # Raise error if there are additional arguments.
         if len(kwargs) != 0:
             err_ = "The keyword arguments for Overlap() should have data1, data2, ..., dataN " \
@@ -1478,6 +1488,10 @@ class _VALIB():
                                                         arg_name="tablename")
         valib_inst.__generate_valib_sql_argument_syntax(arg=",".join(column_names_df),
                                                         arg_name="columns")
+        # Generate clause of charset.
+        if charset:
+            valib_inst.__generate_valib_sql_argument_syntax(arg=charset,
+                                                            arg_name="charset")
 
         return valib_inst._execute_valib_function(skip_data_arg_processing=True,
                                                   skip_other_arg_processing=True)
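Overlap() therefore now forwards an optional charset keyword into the generated VALIB call. A usage sketch, assuming the public valib entry point and hypothetical table names; only the kwargs plumbing is confirmed by this hunk:

    from teradataml import DataFrame, valib

    df1 = DataFrame("customer_2023")   # hypothetical table
    df2 = DataFrame("customer_2024")   # hypothetical table

    # charset is popped from kwargs and, when set, emitted as a charset clause.
    overlap = valib.Overlap(data1=df1, data2=df2, columns="custid",
                            charset="UNICODE")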
teradataml/automl/__init__.py
CHANGED
@@ -30,7 +30,7 @@ from teradataml import ColumnExpression
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.utils.utils import execute_sql
 from teradataml.utils.validators import _Validators
-from teradataml import ROC, BLOB
+from teradataml import ROC, BLOB, VARCHAR
 from teradataml.utils.dtypes import _Dtypes
 from teradataml.common.utils import UtilFuncs
 from teradataml import TeradataMlException
@@ -94,6 +94,9 @@ class AutoML:
         the processes by passing the JSON file path in case of custom run. It also
         supports early stopping of model training based on stopping metrics,
         maximum running time and maximum models to be trained.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         task_type:
@@ -187,6 +190,12 @@ class AutoML:
             session.
             Default Value: False
            Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoML.
@@ -417,9 +426,11 @@ class AutoML:
 
         volatile = kwargs.get('volatile', False)
         persist = kwargs.get('persist', False)
+        seed = kwargs.get('seed', 42)
 
         arg_info_matrix.append(["volatile", volatile, True, (bool)])
         arg_info_matrix.append(["persist", persist, True, (bool)])
+        arg_info_matrix.append(["seed", seed, True, (int)])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
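The new seed keyword rides through AutoML's kwargs into the data-preparation stage, where it drives the numpy and scikit-learn randomness used for sampling and feature selection. A hedged usage sketch (table and column names are hypothetical):

    from teradataml import AutoML, DataFrame

    train = DataFrame("titanic_train")          # hypothetical table

    # seed defaults to 42; pinning it makes two runs comparable.
    aml = AutoML(task_type="Classification", verbose=1, seed=7)
    aml.fit(train, "survived")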
@@ -517,7 +528,7 @@ class AutoML:
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_fit_matrix)
-
+
         # Initializing class variables
         self.data = data
         self.target_column = target_column
@@ -758,11 +769,12 @@ class AutoML:
         if self.target_column_ind:
             prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
             probability_column = 'prob_1'
+            pred_target_count = pred.result.drop_duplicate(self.target_column).size
             # Displaying confusion matrix and ROC-AUC for classification problem
             if self.is_classification_type():
                 print_data = lambda data: print(data) if _is_terminal() else display(data)
                 # Displaying ROC-AUC for binary classification
-                if self.target_count == 2:
+                if self.target_count == 2 and pred_target_count == 2:
                     fit_params = {
                         "probability_column" : probability_column,
                         "observation_column" : self.target_column,
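The added pred_target_count guard skips the ROC display when the scored data happens to contain only one observed class, for which a binary ROC curve is undefined. A sketch of the call this protects, built from the fit_params visible above; positive_class and the output attribute are assumptions:

    from teradataml import ROC

    # Only meaningful when both classes occur in the predictions.
    roc_out = ROC(data=pred.result,
                  probability_column="prob_1",
                  observation_column=target_column,
                  positive_class="1")
    print(roc_out.result)   # AUC output (attribute name assumed)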
@@ -886,8 +898,8 @@ class AutoML:
         # as it is required for evaluation.
         if self.target_column not in data.columns:
             raise TeradataMlException(
-
-
+                Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
+                MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
 
         # Checking if data is already transformed before or not
         data_node_id = data._nodeid
@@ -1234,6 +1246,8 @@ class AutoML:
             pca.n_components_ = load_pca_info['n_components']
             pca.noise_variance_ = load_pca_info['noise_variance']
             pca.singular_values_ = np.array(load_pca_info['singular_values'])
+            pca.feature_names_in_ = data_params['pca_fit_columns']
+            pca.n_features_in_ = len(data_params['pca_fit_columns'])
 
             data_params['pca_fit_instance'] = pca
 
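These two attributes complete the round trip of a fitted scikit-learn PCA that AutoML serializes with a deployed model: recent scikit-learn versions consult n_features_in_/feature_names_in_ at transform time, so they must be restored along with the components. A minimal sketch of the same idea; the key names mirror the ones above, but the full persisted set is larger:

    import numpy as np
    from sklearn.decomposition import PCA

    def rebuild_pca(saved):
        # 'saved' holds JSON-serializable attributes captured at fit time.
        pca = PCA()
        pca.components_ = np.array(saved['components'])
        pca.mean_ = np.array(saved['mean'])
        pca.explained_variance_ = np.array(saved['explained_variance'])
        pca.n_components_ = saved['n_components']
        pca.noise_variance_ = saved['noise_variance']
        pca.singular_values_ = np.array(saved['singular_values'])
        pca.feature_names_in_ = np.array(saved['pca_fit_columns'])
        pca.n_features_in_ = len(saved['pca_fit_columns'])
        return pca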
@@ -1442,7 +1456,8 @@ class AutoML:
         # Saving data transformation parameters to the specified table
         sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
 
-        copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})
+        copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
+                                                                                       'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})
 
         print('Model Deployment Completed Successfully.')
 
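Pinning PARAMETERS to a wide UNICODE VARCHAR keeps long hyperparameter strings from being truncated by default type inference during deployment. The same override in isolation, with hypothetical data:

    import pandas as pd
    from teradataml import copy_to_sql, BLOB, VARCHAR

    models = pd.DataFrame({"MODEL_ID": [1],
                           "PARAMETERS": ['{"max_depth": 8, "eta": 0.3}'],
                           "DATA_PARAMS": [b"\x00\x01"]})

    # types= forces the Teradata type per column instead of relying on inference.
    copy_to_sql(df=models, table_name="automl_models_demo", if_exists="replace",
                types={"DATA_PARAMS": BLOB,
                       "PARAMETERS": VARCHAR(length=32000, charset="UNICODE")})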
@@ -1945,6 +1960,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
             Default Value: False
             Types: bool
 
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
+
     RETURNS:
         a tuple containing, model information and leaderboard.
     """
@@ -2103,6 +2124,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         a tuple containing, model information and leaderboard.
@@ -2324,6 +2351,9 @@ class AutoRegressor(AutoML):
     """
     DESCRIPTION:
         AutoRegressor is a special purpose AutoML feature to run regression specific tasks.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         include:
@@ -2407,6 +2437,12 @@ class AutoRegressor(AutoML):
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoRegressor.
@@ -2555,6 +2591,9 @@ class AutoClassifier(AutoML):
     """
     DESCRIPTION:
         AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         include:
@@ -2638,6 +2677,12 @@ class AutoClassifier(AutoML):
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoClassifier.
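Both convenience wrappers inherit the new argument, so a classification-only run can be pinned the same way (table and column names hypothetical):

    from teradataml import AutoClassifier, DataFrame

    train = DataFrame("churn_train")            # hypothetical table
    clf = AutoClassifier(include=["xgboost", "glm"], verbose=1, seed=42)
    clf.fit(train, "churn_flag")
    clf.leaderboard()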
teradataml/automl/data_preparation.py
CHANGED
@@ -16,7 +16,6 @@
 # Python libraries
 import numpy as np
 import pandas as pd
-import random
 import time
 import warnings
 
@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
-from teradataml import INTEGER
+from teradataml import configure, INTEGER
+from teradataml.common.constants import TeradataConstants
 
-# Control Randomnes
-random.seed(42)
-np.random.seed(42)
 
 class _DataPreparation:
 
@@ -117,6 +114,12 @@ class _DataPreparation:
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
         """
         self.data = data
         self.target_column = target_column
@@ -135,7 +138,13 @@ class _DataPreparation:
         self.table_name_mapping = {}
 
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
-
+        self.seed = kwargs.get("seed", 42)
+        # np.random.seed() affects the random number generation in numpy and sklearn;
+        # setting it changes the global state of the random number generator,
+        # hence the seed is set only if it is not None.
+        if kwargs.get("seed") is not None:
+            np.random.seed(self.seed)
+
 
     def data_preparation(self,
                          auto = True):
@@ -262,25 +271,24 @@ class _DataPreparation:
             outlier_method = "Tukey"
 
         # List of columns for outlier processing.
-
+        # Excluding target column and excluded columns from outlier processing
+        outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]
 
-
-
-
-
-
-
-
-
-
-
-            if value
+        if len(outlier_columns) != 0:
+            # Detecting outlier percentage in each columns
+            outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+            # Outlier Handling techniques
+            for i in outlier_percentage_df.itertuples():
+                # Column Name
+                col = i[0]
+                # Outlier value
+                value = i[1]
+                # Dropping rows
+                if value > 0.0 and value <= 8.0 :
                     columns_to_drop_rows.append(col)
-
-
-            elif value> 8.0 and value <= 25.0:
-                columns_to_impute.append(col)
+                elif value> 8.0 and value <= 25.0:
+                    columns_to_impute.append(col)
 
         return columns_to_drop_rows, columns_to_impute
 
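The thresholds encode a simple policy: a column with up to 8% outliers has the offending rows dropped, one between 8% and 25% is imputed instead, and anything above 25% is left alone. A toy illustration of the bucketing with hypothetical percentages:

    # Per-column outlier percentages, as _outlier_detection might report them.
    outlier_pct = {"age": 2.5, "fare": 14.0, "deck": 40.0}

    drop_rows, impute = [], []
    for col, pct in outlier_pct.items():
        if 0.0 < pct <= 8.0:
            drop_rows.append(col)   # few outliers: drop the rows
        elif 8.0 < pct <= 25.0:
            impute.append(col)      # moderate: impute values instead
        # above 25%: leave the column untouched

    print(drop_rows, impute)        # ['age'] ['fare']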
@@ -465,7 +473,7 @@ class _DataPreparation:
         RETURNS:
             int, number of folds to be used for cross-validation.
         """
-        num_of_folds = lambda rows:
+        num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)
         return num_of_folds(rows)
 
     def _feature_selection_PCA(self):
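The restored heuristic trades validation folds for speed as data grows: up to 1000 rows get 10-fold cross-validation, mid-size tables 4-fold, and anything past 20000 rows only 2-fold:

    num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)

    assert num_of_folds(500) == 10       # small data: more folds, steadier estimate
    assert num_of_folds(5_000) == 4      # mid-size: balanced
    assert num_of_folds(100_000) == 2    # large: keep tuning cheap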
@@ -489,7 +497,7 @@ class _DataPreparation:
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
         # Initialize and fit PCA
-        pca = PCA()
+        pca = PCA(random_state=self.seed)
         pca.fit(train_data)
 
         # Find the number of components for PCA
@@ -497,7 +505,7 @@ class _DataPreparation:
         n = np.argmax(np.cumsum(variance) >= 0.95) + 1
 
         # Create a new instance of PCA with the optimal number of components
-        pca = PCA(n_components=n, random_state=
+        pca = PCA(n_components=n, random_state=self.seed)
 
         # Apply PCA on dataset
         X_train_pca = pca.fit_transform(train_data)
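Here n is the smallest number of principal components whose cumulative explained-variance ratio reaches 95%. A self-contained illustration of that selection rule on synthetic data:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 10))
    X[:, 5:] *= 0.05                 # last five columns carry almost no variance

    variance = PCA(random_state=42).fit(X).explained_variance_ratio_
    # First index where the running total clears 0.95, plus one.
    n = np.argmax(np.cumsum(variance) >= 0.95) + 1
    print(n)                         # 5 for this synthetic data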
@@ -571,7 +579,7 @@ class _DataPreparation:
 
         # Random forest for RFE model
         RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-        rf = RFModel(n_estimators=100, random_state=
+        rf = RFModel(n_estimators=100, random_state=self.seed)
 
         # Determine the scoring metric based on the number of unique classes
         score = 'r2' if not self.is_classification_type() \
@@ -665,10 +673,10 @@ class _DataPreparation:
                 scoring_metric = 'roc_auc'
             else:
                 scoring_metric = 'f1_macro'
-            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=
+            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
             parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
         else:
-            estimator = Lasso(random_state=
+            estimator = Lasso(random_state=self.seed)
             parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
             scoring_metric = "r2"
 
@@ -679,7 +687,7 @@ class _DataPreparation:
 
         # Applying hyperparameter tuning and optimizing score
         hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-
+                                             scoring=scoring_metric, verbose=0)
 
         # Fitting the best result from hyperparameter
         hyperparameter_search.fit(train_features, train_target)
@@ -746,14 +754,20 @@ class _DataPreparation:
         train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                                table_type = TeradataConstants.TERADATA_TABLE,
                                                                gc_on_quit=not persist)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
         # Storing the table names in the table name mapping dictionary
         self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
 
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
         # Pushing data into database
         if self.is_classification_type():
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+            copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
         else:
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)
 
     def _scaling_features_helper(self,
                                  data=None,
@@ -783,7 +797,8 @@ class _DataPreparation:
         for col in data.columns:
             # Selecting columns that will be scaled
             # Excluding target_col and columns with single value
-            if col not in ['id', self.target_column] and
+            if col not in ['id', self.target_column] and \
+                data.drop_duplicate(col).size > 1:
                 columns_to_scale.append(col)
 
         if feature_selection_mtd == "lasso":
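data.drop_duplicate(col).size counts a column's distinct values in-database, mirroring how the diff itself uses it; a result of 1 flags a constant column that scaling would only reduce to zeros. In isolation (table name hypothetical):

    from teradataml import DataFrame

    df = DataFrame("sensor_readings")           # hypothetical table

    # Distinct values in 'status', computed without pulling rows to the client.
    if df.drop_duplicate("status").size > 1:
        print("'status' varies, safe to scale")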
@@ -855,6 +870,7 @@ class _DataPreparation:
 
         # List of columns to copy to the output generated by scale transform
         accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+
 
         # Scaling dataset
         transform_obj = ScaleTransform(data=data_to_scale,
@@ -866,6 +882,8 @@ class _DataPreparation:
                               data=scaled_df,
                               progress_bar=self.progress_bar)
         else:
+            # No columns to scale, Original data will be used
+            scaled_df = data_to_scale
             self._display_msg(msg="No columns to scale.",
                               progress_bar=self.progress_bar)
 
@@ -914,10 +932,16 @@ class _DataPreparation:
         # Assigning data to target dataframe
         target_df = self.data
         # Detecting list of float columns on target dataset
-        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]
 
         if len(float_columns) == 0:
-
+            cols = target_df.columns
+            # Doing reset index to get index column
+            df = target_df.to_pandas().reset_index()
+
+            # Returning the dataframe with cols
+            # to avoid extra columns generated by reset_index()
+            return df[cols]
 
         # storing the column details for round up in data transformation dictionary
         self.data_transform_dict["round_columns"] = float_columns
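to_pandas() moves the Teradata index column (typically the generated 'id') into the pandas index, so reset_index() turns it back into a regular column before the frame is returned; reselecting the original column list drops anything extra reset_index may add. In isolation (table name hypothetical):

    from teradataml import DataFrame

    tdf = DataFrame("prepared_train")     # hypothetical table
    cols = tdf.columns
    pdf = tdf.to_pandas().reset_index()   # index column becomes a real column
    pdf = pdf[cols]                       # keep exactly the original columns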
teradataml/automl/data_transformation.py
CHANGED
@@ -31,8 +31,11 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
+from teradataml import execute_sql
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
+from teradataml.options.configure import configure
+from teradataml.common.constants import TeradataConstants
 
 # AutoML Internal libraries
 from teradataml.automl.feature_exploration import _FeatureExplore
@@ -219,11 +222,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
     DESCRIPTION:
         Function drops irrelevant columns and adds id column.
     """
-    # Extracting
+    # Extracting irrelevant column list
     columns_to_be_removed = self.data_transformation_params.get("drop_irrelevent_columns", None)
     if columns_to_be_removed:
         self.data = self.data.drop(columns_to_be_removed, axis=1)
-        self._display_msg(msg="\nUpdated dataset after dropping
+        self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
                           data=self.data,
                           progress_bar=self.progress_bar)
 
@@ -693,22 +696,28 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         lasso_scale_fit_obj = self.data_transformation_params.get("lasso_scale_fit_obj", None)
         lasso_scale_col = self.data_transformation_params.get("lasso_scale_col", None)
         # Extracting accumulate columns
-
-
-
-
-
-
-
-
-
+        if lasso_scale_fit_obj is not None:
+            accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
+            # Scaling dataset
+            lasso_df = ScaleTransform(data=lasso_df,
+                                      object=lasso_scale_fit_obj,
+                                      accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
+                              data=lasso_df,
+                              progress_bar=self.progress_bar)
 
         # Uploading lasso dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for lasso dataset
         self.table_name_mapping[self.data_node_id]["lasso_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
 
     def _feature_selection_rfe_transformation(self):
         """
@@ -730,23 +739,30 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Extracting fit object and columns for scaling
         rfe_scale_fit_obj = self.data_transformation_params.get("rfe_scale_fit_obj", None)
         rfe_scale_col = self.data_transformation_params.get("rfe_scale_col", None)
-
-
-
-
-
-
-
-
-
-
+
+        if rfe_scale_fit_obj is not None:
+            # Extracting accumulate columns
+            accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
+            # Scaling on rfe dataset
+            rfe_df = ScaleTransform(data=rfe_df,
+                                    object=rfe_scale_fit_obj,
+                                    accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
+                              data=rfe_df,
+                              progress_bar=self.progress_bar)
 
         # Uploading rfe dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for rfe dataset
         self.table_name_mapping[self.data_node_id]["rfe_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
 
     def _feature_selection_pca_transformation(self):
         """
@@ -758,17 +774,20 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
         # Extracting accumulate columns
         accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
-
-        pca_scaled_df =
-
-
-
-
-
+
+        pca_scaled_df = self.data
+        if pca_scale_fit_obj is not None:
+            # Scaling on pca dataset
+            pca_scaled_df = ScaleTransform(data=self.data,
+                                           object=pca_scale_fit_obj,
+                                           accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
+                              data=pca_scaled_df,
+                              progress_bar=self.progress_bar)
 
         # Convert to pandas dataframe for applying pca
-        pca_scaled_pd = pca_scaled_df.to_pandas()
+        pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
         # Extracting pca fit instance for applying pca
         pca_fit_instance = self.data_transformation_params.get("pca_fit_instance", None)
         # Extracting columns for applying pca
@@ -804,6 +823,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Uploading pca dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for pca dataset
         self.table_name_mapping[self.data_node_id]["pca_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+
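The same three-step pattern recurs across the lasso, RFE, and PCA paths in this release: generate a garbage-collected temp name, strip the database qualifier that VT mode prepends, then let copy_to_sql create a volatile table only when the session opted into VT. A condensed sketch with hypothetical names; the real code compares against TeradataConstants.TERADATA_VOLATILE_TABLE rather than the raw string:

    from teradataml import configure, copy_to_sql

    def persist_stage_output(df, qualified_name):
        # VT mode hands back a name like '"mydb"."vt_stage_1"'; keep the table part.
        table_name = qualified_name.split(".")[-1].strip('"')
        # Create a volatile table only when the session asked for VT objects.
        is_temporary = configure.temp_object_type == "VT"
        copy_to_sql(df=df, table_name=table_name, if_exists="replace",
                    temporary=is_temporary)
        return table_name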