teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff compares the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of teradataml might be problematic.
Files changed (107)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +86 -13
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +7 -12
  6. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  7. teradataml/analytics/sqle/__init__.py +16 -1
  8. teradataml/analytics/utils.py +15 -1
  9. teradataml/automl/__init__.py +290 -106
  10. teradataml/automl/autodataprep/__init__.py +471 -0
  11. teradataml/automl/data_preparation.py +29 -10
  12. teradataml/automl/data_transformation.py +11 -0
  13. teradataml/automl/feature_engineering.py +64 -4
  14. teradataml/automl/feature_exploration.py +639 -25
  15. teradataml/automl/model_training.py +1 -1
  16. teradataml/clients/auth_client.py +2 -2
  17. teradataml/common/constants.py +61 -26
  18. teradataml/common/messagecodes.py +2 -1
  19. teradataml/common/messages.py +5 -4
  20. teradataml/common/utils.py +255 -37
  21. teradataml/context/context.py +225 -87
  22. teradataml/data/apriori_example.json +22 -0
  23. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  24. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  25. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  26. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  27. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  29. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  30. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  31. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  32. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  33. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  34. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  35. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  36. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  37. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  38. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  39. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  40. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  41. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  42. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  43. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  45. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  47. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  48. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  49. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  51. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  52. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  53. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  54. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  55. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  56. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  57. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  58. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  59. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  60. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  61. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  62. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  63. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  64. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  65. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  66. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  67. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  68. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  69. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  70. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
  71. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
  72. teradataml/data/ner_dict.csv +8 -0
  73. teradataml/data/ner_input_eng.csv +7 -0
  74. teradataml/data/ner_rule.csv +5 -0
  75. teradataml/data/pos_input.csv +40 -0
  76. teradataml/data/tdnerextractor_example.json +14 -0
  77. teradataml/data/teradataml_example.json +13 -0
  78. teradataml/data/textmorph_example.json +5 -0
  79. teradataml/data/to_num_data.csv +4 -0
  80. teradataml/data/tochar_data.csv +5 -0
  81. teradataml/data/trans_dense.csv +16 -0
  82. teradataml/data/trans_sparse.csv +55 -0
  83. teradataml/dataframe/copy_to.py +37 -26
  84. teradataml/dataframe/data_transfer.py +61 -45
  85. teradataml/dataframe/dataframe.py +130 -50
  86. teradataml/dataframe/dataframe_utils.py +15 -2
  87. teradataml/dataframe/functions.py +109 -9
  88. teradataml/dataframe/sql.py +328 -76
  89. teradataml/dbutils/dbutils.py +33 -13
  90. teradataml/dbutils/filemgr.py +14 -10
  91. teradataml/lib/aed_0_1.dll +0 -0
  92. teradataml/opensource/_base.py +6 -157
  93. teradataml/options/configure.py +4 -5
  94. teradataml/scriptmgmt/UserEnv.py +305 -38
  95. teradataml/scriptmgmt/lls_utils.py +376 -130
  96. teradataml/store/__init__.py +1 -1
  97. teradataml/table_operators/Apply.py +16 -1
  98. teradataml/table_operators/Script.py +20 -1
  99. teradataml/table_operators/table_operator_util.py +58 -9
  100. teradataml/utils/dtypes.py +2 -1
  101. teradataml/utils/internal_buffer.py +22 -2
  102. teradataml/utils/validators.py +313 -57
  103. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +89 -14
  104. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +107 -77
  105. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  106. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  107. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
teradataml/automl/autodataprep/__init__.py (new file)
@@ -0,0 +1,471 @@
+# External libraries
+import pandas as pd
+
+# Teradata libraries
+from teradataml import db_drop_table
+from teradataml.common.constants import AutoMLConstants as aml_const
+from teradataml.common.messages import Messages, MessageCodes
+from teradataml.dataframe.dataframe import DataFrame
+from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.utils.validators import _Validators
+
+# AutoML internal libraries
+from teradataml import AutoML, TeradataMlException
+
+
+class AutoDataPrep(AutoML):
+
+    def __init__(self,
+                 task_type="Default",
+                 verbose=0,
+                 **kwargs):
+        """
+        DESCRIPTION:
+            AutoDataPrep simplifies the data preparation process by automating
+            the different aspects of data cleaning and transformation, enabling
+            seamless exploration, transformation, and optimization of datasets.
+
+        PARAMETERS:
+            task_type:
+                Optional Argument.
+                Specifies the task type for AutoDataPrep, i.e., whether to apply
+                regression or classification on the provided dataset. If the
+                user wants AutoDataPrep() to decide the task type automatically,
+                then it should be set to "Default".
+                Default Value: "Default"
+                Permitted Values: "Regression", "Classification", "Default"
+                Types: str
+
+            verbose:
+                Optional Argument.
+                Specifies the level of detail printed during execution.
+                Default Value: 0
+                Permitted Values:
+                    * 0: prints the progress bar.
+                    * 1: prints the execution steps.
+                    * 2: prints the intermediate data between the execution of
+                         each step.
+                Types: int
+
+            **kwargs:
+                Specifies the additional arguments for AutoDataPrep. Below are
+                the additional arguments:
+
+                custom_config_file:
+                    Optional Argument.
+                    Specifies the path of the JSON file in case of a custom run.
+                    Types: str
+
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the interim results of the
+                    functions in a volatile table or not. When set to True,
+                    results are stored in a volatile table, otherwise not.
+                    Default Value: False
+                    Types: bool
+
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the interim results of the
+                    functions in a table or not. When set to True, results are
+                    persisted in a table; otherwise, results are garbage
+                    collected at the end of the session.
+                    Default Value: False
+                    Types: bool
+
+        RETURNS:
+            Instance of AutoDataPrep.
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLES:
+            # Notes:
+            #     1. Get the connection to Vantage to execute the function.
+            #     2. One must import the required functions mentioned in
+            #        the example from teradataml.
+            #     3. The function raises an error if it is not supported on the
+            #        Vantage system the user is connected to.
+
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+
+            # Create a teradataml DataFrame.
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Example 1: Run AutoDataPrep for a classification problem.
+            # Scenario: The titanic dataset is used to predict the survival
+            #           of passengers.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Retrieve the data after Auto Data Preparation.
+            >>> datas = aprep_obj.get_data()
+        """
+        # Initialize the AutoML object.
+        super().__init__(task_type=task_type,
+                         verbose=verbose,
+                         **kwargs)
+
+        # Setting the attributes for AutoDataPrep.
+        super().__setattr__("_auto_dataprep", True)
+        super().__setattr__("model_list", [])
+        super().__setattr__("_phases", ["1. Feature Exploration ->",
+                                        "2. Feature Engineering ->",
+                                        "3. Data Preparation"])
+        super().__setattr__("_progressbar_prefix", 'Auto Data Prep:')
+
+    def fit(self,
+            data,
+            target_column):
+        """
+        DESCRIPTION:
+            Function to fit the data for Auto Data Preparation.
+
+        PARAMETERS:
+            data:
+                Required Argument.
+                Specifies the input data to be used for Auto Data Preparation.
+                Types: DataFrame
+
+            target_column:
+                Required Argument.
+                Specifies the target column to be used for Auto Data Preparation.
+                Types: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, ValueError
+
+        EXAMPLES:
+            # Notes:
+            #     1. Get the connection to Vantage to execute the function.
+            #     2. One must import the required functions mentioned in
+            #        the example from teradataml.
+            #     3. The function raises an error if it is not supported on the
+            #        Vantage system the user is connected to.
+
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+
+            # Create a teradataml DataFrame.
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Example 1: Run AutoDataPrep for a classification problem.
+            # Scenario: The titanic dataset is used to predict the survival
+            #           of passengers.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+        """
+        # Fit the data using the AutoML object.
+        super().fit(data, target_column)
+
+    def get_data(self):
+        """
+        DESCRIPTION:
+            Function to retrieve the data after Auto Data Preparation.
+
+        RETURNS:
+            Dictionary of DataFrames containing the data after Auto Data
+            Preparation.
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            # Notes:
+            #     1. Get the connection to Vantage to execute the function.
+            #     2. One must import the required functions mentioned in
+            #        the example from teradataml.
+            #     3. The function raises an error if it is not supported on the
+            #        Vantage system the user is connected to.
+
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+
+            # Create a teradataml DataFrame.
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Example 1: Run AutoDataPrep for a classification problem.
+            # Scenario: The titanic dataset is used to predict the survival
+            #           of passengers.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Retrieve the data after Auto Data Preparation.
+            >>> datas = aprep_obj.get_data()
+        """
+        # Raise an error if fit() is not called before get_data().
+        _Validators._validate_dependent_method("get_data", "fit", self._is_fit_called)
+
+        datas = {}
+        for key, val in self.table_name_mapping.items():
+            datas[key] = DataFrame(val)
+
+        return datas
+
+    def deploy(self, table_name):
+        """
+        DESCRIPTION:
+            Deploys the AutoDataPrep generated data to the database,
+            i.e., saves the data in the database.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the name of the table to store the information
+                of deployed datasets in the database.
+                Types: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, ValueError
+
+        EXAMPLES:
+            # Create an instance of the AutoDataPrep.
+            # Perform the fit() operation on the AutoDataPrep object.
+            # Deploy the data to the table.
+
+            >>> from teradataml import AutoDataPrep
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Deploy the data to the table.
+            >>> aprep_obj.deploy("table_name")
+        """
+        # Appending arguments to the list for validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["table_name", table_name, False, (str), True])
+
+        # Validating the arguments.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # Raise an error if fit() is not called before deploy().
+        _Validators._validate_dependent_method("deploy", "fit", self._is_fit_called)
+
+        if self.table_name_mapping is not None and \
+                isinstance(self.table_name_mapping, dict):
+
+            tab_map = {}
+            # If persist is False, then generate a permanent table.
+            if not self.kwargs.get("persist", False):
+                for key, val in self.table_name_mapping.items():
+                    # Persist the data.
+                    per_name = self._create_per_result_table(prefix='{}_'.format(self.target_column),
+                                                             persist_result_table=val)
+                    # Store the table name mapping.
+                    tab_map[key] = per_name
+            else:
+                # Tables are already persisted.
+                tab_map = self.table_name_mapping
+            data = pd.DataFrame(list(tab_map.items()), columns=['Feature_Selection_Method', 'Table_Name'])
+
+            # Save the data to the database.
+            copy_to_sql(df=data, table_name=table_name, if_exists="replace")
+            print("Data deployed successfully to the table: ", table_name)
+            return
+
+        # Raise an error if data is not found or
+        # table_name_mapping is not a dictionary/None.
+        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
+                                   "'deploy' method",
+                                   "Data not found to deploy.")
+        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+
+    def load(self, table_name):
+        """
+        DESCRIPTION:
+            Loads the AutoDataPrep generated data from the database
+            into the session to use it for model training or scoring.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the name of the table containing the information
+                of deployed datasets in the database.
+                Types: str
+
+        RETURNS:
+            Dictionary of DataFrames containing the datasets generated by
+            AutoDataPrep.
+
+        RAISES:
+            TeradataMlException, ValueError
+
+        EXAMPLES:
+            # Create an instance of the AutoDataPrep.
+            # Load the data from the table.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep()
+
+            # Load the data from the table.
+            >>> data = aprep_obj.load("table_name")
+
+            # Retrieve the data.
+            >>> print(data)
+        """
+        # Appending arguments to the list for validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["table_name", table_name, False, (str), True])
+
+        # Validating the arguments.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # Load the data from the table.
+        load_df = DataFrame(table_name)
+
+        data = {}
+        # Load the data into a dictionary.
+        for mtd, tab_name in load_df.get_values():
+            try:
+                data[mtd] = DataFrame(tab_name)
+            except Exception as e:
+                print(f"Error while loading {mtd} table: ", e)
+                data[mtd] = None
+                continue
+
+        return data
+
+    def delete_data(self,
+                    table_name,
+                    fs_method=None):
+        """
+        DESCRIPTION:
+            Deletes the deployed datasets from the database.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the name of the table containing the deployed datasets.
+                Types: str
+
+            fs_method:
+                Optional Argument.
+                Specifies the feature selection method(s) whose datasets are
+                deleted from the deployed datasets.
+                Default Value: None
+                Permitted Values: "lasso", "rfe", "pca"
+                Note:
+                    * If "fs_method" is None, then the method deletes all the
+                      deployed datasets.
+                Types: str or list of str
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            # Create an instance of the AutoDataPrep.
+            # Fit the data.
+            # Deploy the data to the table.
+            # Remove the deployed data from the table.
+
+            # Example 1: Remove the deployed data from the table within the
+            #            AutoDataPrep object.
+
+            >>> from teradataml import AutoDataPrep
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Deploy the datasets to the database.
+            >>> aprep_obj.deploy("table_name")
+
+            # Remove the lasso deployed data from the table.
+            >>> aprep_obj.delete_data("table_name", fs_method="lasso")
+
+            # Example 2: Remove the deployed data from the table using a
+            #            different instance of the AutoDataPrep object.
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj2 = AutoDataPrep()
+
+            # Remove the lasso and pca deployed data from the table.
+            >>> aprep_obj2.delete_data("table_name", fs_method=["lasso", "pca"])
+        """
+        # Appending arguments to the list for validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["table_name", table_name, False, (str), True])
+        arg_info_matrix.append(["fs_method", fs_method, True, (str, list), True, aml_const.FEATURE_SELECTION_MTDS.value])
+
+        # Validating the arguments.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # Load the data from the table.
+        df = DataFrame(table_name)
+        # Get the values from the loaded DataFrame.
+        values = df.get_values()
+
+        if fs_method is None:
+            # If fs_method is None, then delete all the tables.
+            methods = aml_const.FEATURE_SELECTION_MTDS.value
+        elif isinstance(fs_method, str):
+            # If fs_method is a str, then convert it to a list.
+            methods = [fs_method]
+        else:
+            # If fs_method is a list, then use it as is.
+            methods = fs_method
+        # Convert the methods to lower case.
+        methods = [method.lower() for method in methods]
+
+        filtered_data = []
+        remaining_data = []
+        # Filter the values based on the fs_method.
+        for row in values:
+            if any(cond in row[0] for cond in methods):
+                filtered_data.append(row)
+            else:
+                remaining_data.append(row)
+
+        # Drop the tables.
+        err_flag = False
+        for row in filtered_data:
+            tab_name = row[1]
+            mtd = row[0]
+            try:
+                db_drop_table(tab_name)
+                print(f"Removed {mtd} table successfully.")
+            except Exception as e:
+                print(f"Error while removing {mtd} table: ", e)
+                remaining_data.append(row)
+                err_flag = True
+                continue
+
+        if err_flag:
+            # Print a message if an error occurred while removing deployed data.
+            print("Error occurred while removing deployed data.")
+
+        if len(remaining_data) > 0:
+            rem_data = pd.DataFrame(remaining_data, columns=['Feature_Selection_Method', 'Table_Name'])
+            # Save the data to the database.
+            copy_to_sql(df=rem_data, table_name=table_name, if_exists="replace")
+        elif not err_flag:
+            # Drop the whole table if no data remains.
+            db_drop_table(table_name)
+        print("Deployed data removed successfully.")
teradataml/automl/data_preparation.py
@@ -130,12 +130,12 @@ class _DataPreparation:
         self.task_type = task_type
         self.volatile = kwargs.get("volatile", False)
         self.persist = kwargs.get("persist", False)
+        self.aml_phases = kwargs.get("automl_phases", None)
 
         # Setting default value for auto run mode
         self._data_sampling_method = "SMOTE"
         self._scale_method_reg = "STD"
         self._scale_method_cls = "RANGE"
-        self.table_name_mapping = {}
 
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.seed = kwargs.get("seed", 42)
@@ -145,6 +145,8 @@ class _DataPreparation:
         if kwargs.get("seed") is not None:
             np.random.seed(self.seed)
 
+        self.data_mapping = kwargs.get("data_mapping", {})
+
 
     def data_preparation(self,
                          auto=True):
@@ -167,7 +169,8 @@ class _DataPreparation:
             list of lists containing features selected by rfe, pca and lasso.
         """
         self._display_heading(phase=2,
-                              progress_bar=self.progress_bar)
+                              progress_bar=self.progress_bar,
+                              automl_phases=self.aml_phases)
         self._display_msg(msg='Data preparation started ...',
                           progress_bar=self.progress_bar)
         # Setting user value in case of custom running mode
@@ -210,7 +213,7 @@ class _DataPreparation:
             self._feature_selection_PCA()
             self.progress_bar.update()
 
-        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
+        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict, self.data_mapping
 
     def _handle_outliers(self,
                          auto):
@@ -355,6 +358,9 @@ class _DataPreparation:
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
+        # Returning the outlier fit object to store in the data mapping dictionary
+        return outlier_fit_out
+
     def _outlier_processing(self):
         """
         DESCRIPTION:
@@ -378,7 +384,10 @@ class _DataPreparation:
                               progress_bar=self.progress_bar)
             target_columns = columns_to_drop_rows
             replacement_strategy = "DELETE"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_delete_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_delete_result'] = self.data._table_name
+            self.data_mapping['outlier_filtered_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after removing outlier rows:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -390,7 +399,10 @@ class _DataPreparation:
                               progress_bar=self.progress_bar)
             target_columns = columns_to_impute
             replacement_strategy = "MEDIAN"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_impute_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_impute_result'] = fit_obj.result._table_name
+            self.data_mapping['outlier_imputed_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -446,7 +458,10 @@ class _DataPreparation:
                 # Fetching replacement value
                 replacement_value = transform_val["replacement_value"]
                 # Performing outlier handling
-                self._outlier_handling(target_col, outlier_method, replacement_value)
+                fit_obj = self._outlier_handling(target_col, outlier_method, replacement_value)
+                self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
+                self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
+                self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
             else:
                 self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
                                   progress_bar=self.progress_bar)
@@ -491,7 +506,7 @@ class _DataPreparation:
         start_time = time.time()
 
         # Temporarily pulling data for feature selection
-        pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
+        pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()
 
         # Drop unnecessary columns and store the result
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
@@ -759,7 +774,7 @@ class _DataPreparation:
         train_table_name = UtilFuncs._extract_table_name(train_table_name)
 
         # Storing the table names in the table name mapping dictionary
-        self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
+        self.data_mapping['{}_train'.format(prefix)] = train_table_name
 
         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
@@ -839,9 +854,9 @@ class _DataPreparation:
 
         # Loading data for feature scaling based on feature selection method
         if feature_selection_mtd == 'rfe':
-            data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
+            data_to_scale = DataFrame(self.data_mapping['rfe_train'])
         elif feature_selection_mtd == 'lasso':
-            data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
+            data_to_scale = DataFrame(self.data_mapping['lasso_train'])
         else:
             data_to_scale = self.data
 
@@ -864,6 +879,9 @@ class _DataPreparation:
                             volatile=volatile,
                             persist=persist)
 
+        self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
+        self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name
+
         # Storing the scale fit object and columns in the data transformation dictionary
         self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
         self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
@@ -965,6 +983,7 @@ class _DataPreparation:
         fit_params["persist"] = False
 
         transform_output = RoundColumns(**fit_params).result
+        self.data_mapping['round_columns_data'] = transform_output._table_name
         if not self.volatile and not self.persist:
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(transform_output._table_name)
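
The thrust of these hunks: _DataPreparation drops its bare table_name_mapping and instead threads a data_mapping dictionary through every stage, recording the fit output/result tables and the transformed data table each step produces, then returns it alongside the selected features and the transform dictionary. An illustrative sketch of its shape after a run (every table name below is an invented placeholder, not output from the diff):

    data_mapping = {
        # Outlier handling: fit tables plus the treated dataset.
        'fit_outlier_impute_output': 'ml__td_sqlmr_out_1',
        'fit_outlier_impute_result': 'ml__td_sqlmr_out_2',
        'outlier_imputed_data': 'ml__survived_data_3',
        # Per-method train tables, written as '{}_train'.format(prefix).
        'rfe_train': 'ml__survived_rfe_4',
        'lasso_train': 'ml__survived_lasso_5',
        'pca_train': 'ml__survived_pca_6',
        # Scale fit artifacts per feature selection method.
        'fit_scale_rfe_output': 'ml__td_sqlmr_out_7',
        'fit_scale_rfe_result': 'ml__td_sqlmr_out_8',
        # Final rounding step.
        'round_columns_data': 'ml__td_sqlmr_out_9',
    }

    # Any recorded stage can then be rehydrated as a teradataml DataFrame:
    # DataFrame(data_mapping['pca_train'])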
teradataml/automl/data_transformation.py
@@ -15,6 +15,7 @@
 
 # Python libraries
 import pandas as pd
+import warnings
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -468,6 +469,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
         custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
         if custom_target_encoding_ind:
+            warn_cols = []
             for col, tar_fit_obj in custom_target_encoding_fit_obj.items():
                 # Extracting accumulate columns
                 accumulate_columns = self._extract_list(self.data.columns, [col])
@@ -483,6 +485,15 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 self.data = TargetEncodingTransform(**transform_params).result
                 # Adding transformed data containing table to garbage collector
                 GarbageCollector._add_to_garbagecollector(self.data._table_name)
+                if self.data[self.data[col] == -1].shape[0] > 0:
+                    warn_cols.append(col)
+
+            # Checking for unseen values in target encoding columns
+            if len(warn_cols) > 0:
+                warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. "
+                                      "This may cause inaccurate predictions. "
+                                      "Consider retraining the model with updated data.",
+                              stacklevel=0)
+
             self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
                               data=self.data,
                               progress_bar=self.progress_bar)
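
The second hunk is the substantive change here: after each TargetEncodingTransform call, the encoded column is scanned for the -1 sentinel that marks category values absent from the training data, and one consolidated warning is emitted for all affected columns. A standalone pandas sketch of the same detect-and-warn pattern, under the diff's assumption that -1 flags unseen values (the data is invented):

    import warnings
    import pandas as pd

    # Hypothetical encoded test data: -1 marks a category unseen at training time.
    encoded = pd.DataFrame({'cabin': [0.31, -1.0, 0.55],
                            'embarked': [0.42, 0.40, 0.38]})

    # Collect every column that contains the unseen-value sentinel.
    warn_cols = [col for col in encoded.columns if (encoded[col] == -1).any()]

    # Warn once for all affected columns rather than once per column.
    if warn_cols:
        warnings.warn(f"Unseen categorical values found in test data column(s): {warn_cols}. "
                      "This may cause inaccurate predictions. "
                      "Consider retraining the model with updated data.")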