teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
--- a/teradataml/automl/data_transformation.py
+++ b/teradataml/automl/data_transformation.py
@@ -23,6 +23,7 @@ from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import Antiselect
 from teradataml import BincodeTransform
 from teradataml import ConvertTo
+from teradataml import execute_sql
 from teradataml import FillRowId
 from teradataml import NonLinearCombineTransform
 from teradataml import OneHotEncodingTransform
@@ -32,7 +33,6 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
-from teradataml import execute_sql
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.options.configure import configure
@@ -48,10 +48,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
     def __init__(self,
                  data,
                  data_transformation_params,
-                 auto,
-                 verbose,
-                 target_column_ind,
-                 table_name_mapping):
+                 auto=True,
+                 verbose=0,
+                 target_column_ind=False,
+                 table_name_mapping={},
+                 cluster=False,
+                 feature_selection_method=None):
         """
         DESCRIPTION:
             Function initializes the data, data transformation object and running mode
@@ -89,7 +91,25 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 Optional Argument.
                 Specifies whether target column is present in given dataset.
                 Default Value: False
-                Types: bool
+                Types: bool
+
+            table_name_mapping:
+                Optional Argument.
+                Specifies the mapping of table names for the transformed data.
+                Default Value: {}
+                Types: dict
+
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
+
+            feature_selection_method:
+                Optional Argument.
+                Specifies the feature selection method to be used.
+                Default Value: None
+                Types: str
         """
         self.data = data
         self.data_transformation_params = data_transformation_params
@@ -97,9 +117,13 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.verbose = verbose
         self.target_column_ind = target_column_ind
         self.table_name_mapping = table_name_mapping
+        self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.data_node_id = data._nodeid
         self.table_name_mapping[self.data_node_id] = {}
 
+        self.cluster = cluster
+        self.feature_selection_method = feature_selection_method
+
     def data_transformation(self):
         """
         DESCRIPTION:
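
The constructor changes above replace required positional arguments with keyword defaults. A minimal sketch of how the new signature might be exercised — `_DataTransformation` is an internal class rather than public API, and `df` and `params` below are hypothetical stand-ins for a teradataml DataFrame and a transformation-parameter dict produced by an earlier AutoML fit run:

    # Internal module under diff; not a public import path.
    from teradataml.automl.data_transformation import _DataTransformation

    # Hypothetical illustration of the 20.0.0.7 constructor defaults.
    dt = _DataTransformation(data=df,
                             data_transformation_params=params,
                             cluster=True,                   # new in 20.0.0.7
                             feature_selection_method=None)  # new in 20.0.0.7
    # Returns the table name mapping built during transformation (see below).
    table_map = dt.data_transformation()
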
@@ -112,15 +136,17 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         """
         # Initializing Feature Exploration
         _FeatureExplore.__init__(self,
-                                 data
-                                 target_column
-                                 verbose
+                                 data=self.data,
+                                 target_column=None,
+                                 verbose=self.verbose,
+                                 cluster=self.cluster)
         # Initializing Feature Engineering
-        _FeatureEngineering.__init__(self,
-                                     data
-                                     target_column
-                                     model_list
-                                     verbose
+        _FeatureEngineering.__init__(self,
+                                     data=self.data,
+                                     target_column=None,
+                                     model_list=None,
+                                     verbose=self.verbose,
+                                     cluster=self.cluster)
 
         self._display_msg(msg="Data Transformation started ...", show_data=True)
         # Extracting target column details and type whether it is classification or not
@@ -128,13 +154,14 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.classification_type = self.data_transformation_params.get("classification_type", False)
 
         # Setting number of jobs for progress bar based on mode of execution
-        jobs = 10 if self.auto else 15
+        jobs = 9 if self.cluster else (10 if self.auto else 15)
         self.progress_bar = _ProgressBar(jobs=jobs, verbose=2, prefix='Transformation Running:')
 
         # Performing transformation carried out in feature engineering phase
         self.feature_engineering_transformation()
+
         # Performing transformation carried out in data preparation phase
-        self.data_preparation_transformation()
+        self.data_preparation_transformation(feature_selection_method=self.feature_selection_method)
         self._display_msg(msg="Data Transformation completed.", show_data=True)
 
         return self.table_name_mapping
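
The reworked job count above compacts to a nested conditional. A standalone sketch of the same rule (the function name is illustrative, not part of the package):

    def transformation_job_count(cluster: bool, auto: bool) -> int:
        # Clustering pipelines track 9 progress-bar steps; otherwise an
        # AutoML-managed run tracks 10 and a custom run tracks 15.
        if cluster:
            return 9
        return 10 if auto else 15
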
@@ -157,8 +184,9 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.progress_bar.update()
 
         # Handling target column transformation
-        if self.target_column_ind and self.classification_type:
-            self._handle_target_column_transformation()
+        if not self.cluster:
+            if self.target_column_ind and self.classification_type:
+                self._handle_target_column_transformation()
         self.progress_bar.update()
 
         self._date_column_handling_transformation()
@@ -193,7 +221,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             self._custom_anti_select_column_transformation()
             self.progress_bar.update()
 
-    def data_preparation_transformation(self):
+    def data_preparation_transformation(self, feature_selection_method=None):
         """
         DESCRIPTION:
             Function performs transformation carried out in data preparation phase
@@ -209,14 +237,21 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
 
         # Performing transformation including feature selection using lasso, rfe and pca
         # followed by scaling
-        self._feature_selection_lasso_transformation()
-        self.progress_bar.update()
-
-        self._feature_selection_rfe_transformation()
-        self.progress_bar.update()
+        if not self.cluster:
+            self._feature_selection_lasso_transformation()
+            self.progress_bar.update()
 
-        self._feature_selection_pca_transformation()
-        self.progress_bar.update()
+            self._feature_selection_rfe_transformation()
+            self.progress_bar.update()
+
+            self._feature_selection_pca_transformation()
+            self.progress_bar.update()
+        else:
+            self._feature_selection_pca_transformation()
+            self.progress_bar.update()
+
+            self._feature_selection_non_pca_transformation()
+            self.progress_bar.update()
 
     def _preprocess_transformation(self):
         """
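
Read together with the helper methods further down, the two branches above persist different sets of test tables. A sketch of the assumed per-node mapping (the key names come from the hunks below; the exact structure is an inference from this diff, not documented behavior):

    # Assumed contents of self.table_name_mapping[self.data_node_id]
    # after data_preparation_transformation() completes.
    non_cluster_run = {"lasso_test": "<table>", "rfe_test": "<table>", "pca_test": "<table>"}
    cluster_run = {"pca_test": "<table>", "non_pca_test": "<table>"}
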
@@ -224,7 +259,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             Function drops irrelevent columns and adds id column.
         """
         # Extracting irrelevant column list
-        columns_to_be_removed = self.data_transformation_params.get("
+        columns_to_be_removed = self.data_transformation_params.get("drop_irrelevant_columns", None)
         if columns_to_be_removed:
             self.data = self.data.drop(columns_to_be_removed, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
@@ -297,9 +332,20 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         imputation_cols = self.data_transformation_params.get("imputation_columns", None)
         if imputation_cols:
             sm_fit_obj = self.data_transformation_params.get("imputation_fit_object")
+            ## Workaround done for bug https://teradata-pe.atlassian.net/browse/TDAF-15617.
+            #partition_column = self.data_transformation_params.get("imputation_partition_column", None)
+
+            params = {"data" : self.data,
+                      "object" : sm_fit_obj
+                      }
+
+            # if partition_column is not None:
+            #     params["data_partition_column"] = partition_column
+            #     params["object_partition_column"] = partition_column
+
             # imputing column using fit object
-            self.data = SimpleImputeTransform(
-
+            self.data = SimpleImputeTransform(**params).result
+
             self._display_msg(msg="\nUpdated dataset after imputing missing value containing columns :",
                               data=self.data,
                               progress_bar=self.progress_bar)
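
Routing the arguments through a dict keeps the disabled partition-column path easy to restore once TDAF-15617 is resolved. A hedged sketch of what re-enabling it would look like — same parameter names as the commented-out code above, behavior unverified until the bug is fixed, and `df`, `sm_fit_obj`, and `transformation_params` are hypothetical stand-ins for the instance state:

    from teradataml import SimpleImputeTransform

    params = {"data": df, "object": sm_fit_obj}
    partition_column = transformation_params.get("imputation_partition_column", None)
    if partition_column is not None:
        # Partition both the input data and the fit object on the same column.
        params["data_partition_column"] = partition_column
        params["object_partition_column"] = partition_column
    df = SimpleImputeTransform(**params).result
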
@@ -438,7 +484,34 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             self._display_msg(msg="\nUpdated dataset after performing categorical encoding :",
                               data=self.data,
                               progress_bar=self.progress_bar)
+            return
 
+        # AutoFraud Routine
+        auto_target_encoding_ind = self.data_transformation_params.get("auto_target_encoding_ind", False)
+        auto_target_encoding_fit_obj = self.data_transformation_params.get("auto_target_encoding_fit_obj", None)
+        target_encoding_accumulate_columns = self.data_transformation_params.get("target_encoding_accumulate_columns")
+
+        if auto_target_encoding_ind:
+            # Adding transform parameters for performing encoding
+            transform_params = {
+                "data" : self.data,
+                "object" : auto_target_encoding_fit_obj,
+                "accumulate" : target_encoding_accumulate_columns,
+                "is_input_dense" : True,
+                "persist" : True,
+                "display_table_name" : False
+            }
+
+            # Performing one hot encoding transformation
+            self.data = TargetEncodingTransform(**transform_params).result
+
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
+
+            self._display_msg(msg="\nUpdated dataset after performing categorical encoding :",
+                              data=self.data,
+                              progress_bar=self.progress_bar)
+
     def _custom_categorical_encoding_transformation(self):
         """
         DESCRIPTION:
@@ -493,7 +566,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. \
                           This may cause inaccurate predictions. Consider retraining the model with updated data.",
                           stacklevel=0)
-
+
         self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
                           data=self.data,
                           progress_bar=self.progress_bar)
@@ -628,7 +701,9 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             for classification problem.
         """
         # Fetching target column encoding indicator and fit object
+
         target_col_encode_ind = self.data_transformation_params.get("target_col_encode_ind", False)
+
         if target_col_encode_ind:
             # Extracting ordinal encoding fit object for target column
             target_col_ord_encoding_fit_obj = self.data_transformation_params.get("target_col_ord_encoding_fit_obj", None)
@@ -647,14 +722,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             self.data = OrdinalEncodingTransform(**transform_params).result
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(self.data._table_name)
-
-            params = {
-                "data" : self.data,
-                "target_columns" : [self.data_target_column],
-                "target_datatype" : ["integer"],
-                "accumulate" : self._extract_list(self.data.columns, [self.data_target_column])
-            }
-            self.data = ConvertTo(**params).result
+
         self._display_msg(msg="\nUpdated dataset after performing target column transformation :",
                           data=self.data,
                           progress_bar=self.progress_bar)
@@ -715,17 +783,17 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                        accumulate=accumulate_cols).result
             # Displaying scaled dataset
             self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
-
-
+                              data=lasso_df,
+                              progress_bar=self.progress_bar)
 
             # Uploading lasso dataset to table for further use
-            table_name = UtilFuncs._generate_temp_table_name(prefix="
+            table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_test",
                                                              table_type = TeradataConstants.TERADATA_TABLE)
             # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
             # table name in fully qualified format.
             table_name = UtilFuncs._extract_table_name(table_name)
             # Storing table name mapping for lasso dataset
-            self.table_name_mapping[self.data_node_id]["
+            self.table_name_mapping[self.data_node_id]["lasso_test"] = table_name
             # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
             is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
             copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
@@ -760,17 +828,17 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                      accumulate=accumulate_cols).result
             # Displaying scaled dataset
             self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
-
-
+                              data=rfe_df,
+                              progress_bar=self.progress_bar)
 
             # Uploading rfe dataset to table for further use
-            table_name = UtilFuncs._generate_temp_table_name(prefix="
+            table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_test",
                                                              table_type = TeradataConstants.TERADATA_TABLE)
             # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
             # table name in fully qualified format.
             table_name = UtilFuncs._extract_table_name(table_name)
             # Storing table name mapping for rfe dataset
-            self.table_name_mapping[self.data_node_id]["
+            self.table_name_mapping[self.data_node_id]["rfe_test"] = table_name
             # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
             is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
             copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
@@ -783,19 +851,19 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Extracting fit object and column details for perfroming feature scaling
         pca_scale_fit_obj = self.data_transformation_params.get("pca_scale_fit_obj", None)
         pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
-
-        accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
-
+
         pca_scaled_df = self.data
         if pca_scale_fit_obj is not None:
+            # Extracting accumulate columns
+            accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
             # Scaling on pca dataset
             pca_scaled_df = ScaleTransform(data=self.data,
                                            object=pca_scale_fit_obj,
                                            accumulate=accumulate_cols).result
             # Displaying scaled dataset
             self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
-
-
+                              data=pca_scaled_df,
+                              progress_bar=self.progress_bar)
 
         # Convert to pandas dataframe for applying pca
         pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
@@ -832,14 +900,47 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                           progress_bar=self.progress_bar)
 
         # Uploading pca dataset to table for further use
-        table_name = UtilFuncs._generate_temp_table_name(prefix="
+        table_name = UtilFuncs._generate_temp_table_name(prefix="pca_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
         # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
         # table name in fully qualified format.
         table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for pca dataset
-        self.table_name_mapping[self.data_node_id]["
+        self.table_name_mapping[self.data_node_id]["pca_test"] = table_name
         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
         copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+
+    def _feature_selection_non_pca_transformation(self):
+        """
+        DESCRIPTION:
+            Function performs feature scaling on raw data for non-PCA clustering models.
+        """
+        self._display_msg(msg="\nRunning Non-PCA feature selection transformation for clustering...",
+                          show_data=True,
+                          progress_bar=self.progress_bar)
 
+        # Extracting fit object and columns for scaling
+        non_pca_scale_fit_obj = self.data_transformation_params.get("non_pca_scale_fit_obj", None)
+        non_pca_scale_col = self.data_transformation_params.get("non_pca_scale_col", None)
+
+        if non_pca_scale_fit_obj is not None and non_pca_scale_col is not None:
+            accumulate_cols = self._extract_list(self.data.columns, non_pca_scale_col)
+
+            # Scaling dataset
+            scaled_df = ScaleTransform(data=self.data,
+                                       object=non_pca_scale_fit_obj,
+                                       accumulate=accumulate_cols).result
+
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing Non-PCA scaling for clustering:",
+                              data=scaled_df,
+                              progress_bar=self.progress_bar)
+
+            # Uploading non_pca dataset to SQL
+            table_name = UtilFuncs._generate_temp_table_name(prefix="non_pca_test",
+                                                             table_type=TeradataConstants.TERADATA_TABLE)
+            self.table_name_mapping[self.data_node_id]["non_pca_test"] = table_name
+            copy_to_sql(df=scaled_df, table_name=table_name, if_exists="replace")
+        else:
+            print(" Missing non_pca_scale_fit_obj or non_pca_scale_col in data transformation params.")