PyPI - teradataml - Versions diffs - 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl - Mend

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic. Click here for more details.

Files changed (96) hide show

teradataml/README.md +210 -0
teradataml/__init__.py +1 -1
teradataml/_version.py +1 -1
teradataml/analytics/analytic_function_executor.py +162 -76
teradataml/analytics/byom/__init__.py +1 -1
teradataml/analytics/json_parser/__init__.py +2 -0
teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
teradataml/analytics/json_parser/metadata.py +22 -4
teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
teradataml/analytics/sqle/__init__.py +3 -0
teradataml/analytics/utils.py +4 -1
teradataml/automl/__init__.py +2369 -464
teradataml/automl/autodataprep/__init__.py +15 -0
teradataml/automl/custom_json_utils.py +184 -112
teradataml/automl/data_preparation.py +113 -58
teradataml/automl/data_transformation.py +154 -53
teradataml/automl/feature_engineering.py +113 -53
teradataml/automl/feature_exploration.py +548 -25
teradataml/automl/model_evaluation.py +260 -32
teradataml/automl/model_training.py +399 -206
teradataml/clients/auth_client.py +2 -2
teradataml/common/aed_utils.py +11 -2
teradataml/common/bulk_exposed_utils.py +4 -2
teradataml/common/constants.py +62 -2
teradataml/common/garbagecollector.py +50 -21
teradataml/common/messagecodes.py +47 -2
teradataml/common/messages.py +19 -1
teradataml/common/sqlbundle.py +23 -6
teradataml/common/utils.py +116 -10
teradataml/context/aed_context.py +16 -10
teradataml/data/Employee.csv +5 -0
teradataml/data/Employee_Address.csv +4 -0
teradataml/data/Employee_roles.csv +5 -0
teradataml/data/JulesBelvezeDummyData.csv +100 -0
teradataml/data/byom_example.json +5 -0
teradataml/data/creditcard_data.csv +284618 -0
teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
teradataml/data/load_example_data.py +29 -11
teradataml/data/payment_fraud_dataset.csv +10001 -0
teradataml/data/teradataml_example.json +67 -0
teradataml/dataframe/copy_to.py +714 -54
teradataml/dataframe/dataframe.py +1153 -33
teradataml/dataframe/dataframe_utils.py +8 -3
teradataml/dataframe/functions.py +168 -1
teradataml/dataframe/setop.py +4 -1
teradataml/dataframe/sql.py +141 -9
teradataml/dbutils/dbutils.py +470 -35
teradataml/dbutils/filemgr.py +1 -1
teradataml/hyperparameter_tuner/optimizer.py +456 -142
teradataml/lib/aed_0_1.dll +0 -0
teradataml/lib/libaed_0_1.dylib +0 -0
teradataml/lib/libaed_0_1.so +0 -0
teradataml/lib/libaed_0_1_aarch64.so +0 -0
teradataml/scriptmgmt/UserEnv.py +234 -34
teradataml/scriptmgmt/lls_utils.py +43 -17
teradataml/sdk/_json_parser.py +1 -1
teradataml/sdk/api_client.py +9 -6
teradataml/sdk/modelops/_client.py +3 -0
teradataml/series/series.py +12 -7
teradataml/store/feature_store/constants.py +601 -234
teradataml/store/feature_store/feature_store.py +2886 -616
teradataml/store/feature_store/mind_map.py +639 -0
teradataml/store/feature_store/models.py +5831 -214
teradataml/store/feature_store/utils.py +390 -0
teradataml/table_operators/table_operator_util.py +1 -1
teradataml/table_operators/templates/dataframe_register.template +6 -2
teradataml/table_operators/templates/dataframe_udf.template +6 -2
teradataml/utils/docstring.py +527 -0
teradataml/utils/dtypes.py +93 -0
teradataml/utils/internal_buffer.py +2 -2
teradataml/utils/utils.py +41 -2
teradataml/utils/validators.py +694 -17
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0

teradataml/automl/feature_exploration.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright 2024 Teradata. All rights reserved.
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Sweta Shaw
@@ -24,11 +24,13 @@ from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns
 from teradataml import OutlierFilterFit, OutlierFilterTransform
+from teradataml import OrdinalEncodingFit, OrdinalEncodingTransform
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml import display as dp
 from teradataml.utils.validators import _Validators
 from teradataml.common.utils import UtilFuncs
+from teradataml.common.garbagecollector import GarbageCollector
 def _is_terminal():
     """
@@ -59,10 +61,15 @@ if not _is_terminal():
 class _FeatureExplore:
     def __init__(self,
-                data=None,
-                target_column=None,
-                verbose=0,
-                task_type='regression'):
+                 data=None,
+                 target_column=None,
+                 custom_data=None,
+                 verbose=0,
+                 task_type='regression',
+                 fraud=False,
+                 churn=False,
+                 cluster=False,
+                 **kwargs):
         """
         DESCRIPTION:
             Internal function initializes the data, target column for feature exploration.
@@ -75,9 +82,15 @@ class _FeatureExplore:
             target_column:
                 Required Arugment.
+                Set to None for Clustering
                 Specifies the name of the target column in "data".
                 Types: str
+            custom_data:
+                Optional Argument.
+                Specifies json object containing user customized input.
+                Types: json object
             verbose:
                 Optional Argument.
                 Specifies the detailed execution steps based on verbose level.
@@ -96,13 +109,38 @@ class _FeatureExplore:
                     * 'regression'
                     * 'classification'
                 Types: str
+            fraud:
+                Optional Argument.
+                Specifies whether to apply fraud detection techniques.
+                Default Value: False
+                Types: bool
+            churn:
+                Optional Argument.
+                Specifies whether to apply churn prediction techniques.
+                Default Value: False
+                Types: bool
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
         """
         self.data = data
         self.target_column = target_column
         self.verbose = verbose
+        self.custom_data = custom_data
+        self.data_transform_dict = {}
+        self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.terminal_print = _is_terminal()
         self.style = self._common_style()
         self.task_type = task_type
+        self.fraud = fraud
+        self.churn = churn
+        self.cluster = cluster
     def _exploration(self,
                      **kwargs):
@@ -113,8 +151,12 @@ class _FeatureExplore:
                 2. Statistics of numeric columns of the dataset
                 3. Categorical column summary
                 4. Futile columns in the dataset
-                5. Target column distribution
+                5. Target column distribution, not applicable for Clustering task_type
                 6. Outlier Percentage in numeric columns of the dataset
+                7. Heatmap of Numerical Features
+                8. Boxplots of Feature Distribution
+                9. Countplot of Categorical features
+                10.Scatterplot for selected features for Clustering task_type
         """
         numerical_columns = []
         categorical_columns= []
@@ -142,7 +184,7 @@ class _FeatureExplore:
         # Displaying date columns
         if len(date_column_list)!=0:
             self._display_msg(msg='Identified Date Columns:',
-                             data=date_column_list)
+                              data=date_column_list)
         # Column Summary of each feature of data
         # such as null count, datatype, non null count
@@ -155,14 +197,30 @@ class _FeatureExplore:
         if len(categorical_columns) != 0:
             categorical_obj = self._categorical_summary(categorical_columns)
             self._futile_column(categorical_obj)
+        if not self.cluster:
+            # Plot a graph of target column
+            self._target_column_details()
-        # Plot a graph of target column
-        self._target_column_details()
         # Displays outlier percentage
-        outlier_method = "Tukey"
-        df = self._outlier_detection(outlier_method,numerical_columns)
+        if self.fraud or self.churn:
+            outlier_method = "percentile"
+            df = self._outlier_detection(numerical_columns, outlier_method)
+        else:
+            outlier_method = "Tukey"
+            df = self._outlier_detection(outlier_method, numerical_columns)
+        if self.fraud or self.churn or self.cluster:
+            # Boxplots and Heatmap for feature distribution by target column
+            self._boxplot_heatmap()
+            # Countplots for feature distribution by target column
+            self._countplot_categorical_distribution()
+        if self.cluster:
+            self._scatter_plot()
     def _statistics(self):
         """
         DESCRIPTION:
@@ -172,8 +230,7 @@ class _FeatureExplore:
         self._display_msg(msg='\nStatistics of Data:',
                           data=self.data.describe(),
                           show_data=True)
     def _column_summary(self):
         """
         DESCRIPTION:
@@ -228,7 +285,7 @@ class _FeatureExplore:
         PARAMETERS:
             categorical_obj:
                 Required Argument.
-                Specifies the instance of CategoricalSummary for futile column detection..
+                Specifies the instance of CategoricalSummary for futile column detection.
                 Types: Instance of CategoricalSummary
         """
         # Futile columns detection using categorical column object
@@ -248,23 +305,489 @@ class _FeatureExplore:
                               data=gfc_out.result,
                               show_data=True)
-    def _target_column_details(self):
+    def _target_column_details(self,
+                               plot_data=None):
         """
         DESCRIPTION:
             Internal function displays the target column distribution of Target column/ Response column.
         PARAMETERS:
-            None
+            plot_data:
+                Optional Argument.
+                Specifies the input teradataml DataFrame for plotting distribution.
+                Types: teradataml Dataframe
         """
         if self._check_visualization_libraries() and not _is_terminal():
-            # Plotting target column distribution
+            import matplotlib.pyplot as plt
+            import seaborn as sns
+            if plot_data is None:
+                target_data = self.data.select([self.target_column]).to_pandas()
+            else:
+                target_data = plot_data[[self.target_column]]
             self._display_msg(msg='\nTarget Column Distribution:',
                               show_data=True)
-            _FeatureExplore._visualize(data=self.data,
-                                       target_column=self.target_column,
-                                       plot_type=["target"],
-                                       problem_type=self.task_type)
+            plt.figure(figsize=(8, 6))
+            # Ploting a histogram for target column
+            plt.hist(target_data, bins=10, density=True, edgecolor='black')
+            plt.xlabel(self.target_column)
+            plt.ylabel('Density')
+            plt.show()
+    def _countplot_categorical_distribution(self, plot_data=None, top_n=20, max_unique_threshold=50):
+        """
+        DESCRIPTION:
+            Function to plot count plots for categorical features based on the target column.
+            Limits the number of unique categories to avoid messy visuals.
+        PARAMETERS:
+            plot_data:
+                Optional Argument.
+                Specifies the input teradataml DataFrame for plotting distribution.
+                Default Value: None. It will use entire dataset passed for training.
+                Types: teradataml Dataframe
+            top_n:
+                Optional Argument.
+                Maximum number of categories to display per feature.
+                Default Value: 20
+                Types: int
+            max_unique_threshold:
+                Optional Argument.
+                Only plot features with unique values below this threshold.
+                Default Value: 50
+                Types: int
+        """
+        if self._check_visualization_libraries() and not _is_terminal():
+            import matplotlib.pyplot as plt
+            import seaborn as sns
+            # Prepare data
+            if plot_data is None:
+                data = self.data.to_pandas().reset_index()
+            else:
+                data = plot_data
+            target_column = self.target_column
+            # Select categorical features
+            categorical_features = data.select_dtypes(include=['object', 'category']).columns
+            if not self.cluster:
+                categorical_features = [col for col in categorical_features if col != target_column]
+            # Filter categorical features based on unique value threshold
+            categorical_features = [col for col in categorical_features if data[col].nunique() <= max_unique_threshold]
+            if len(categorical_features) == 0:
+                print("No categorical columns found with unique values within the threshold.")
+                return
+            self._display_msg(msg='\nCategorical Feature Distributions by Target Column (Count Plots):',
+                              show_data=False)
+            for feature in categorical_features:
+                plt.figure(figsize=(10, 6))
+                # Get value counts and filter top N categories
+                value_counts = data[feature].value_counts()
+                top_categories = value_counts.nlargest(top_n).index.tolist()
+                # Remove duplicates while preserving order
+                top_categories = list(dict.fromkeys(top_categories))
+                # Replace less frequent categories with "Other"
+                data[feature] = data[feature].apply(lambda x: x if x in top_categories else "Other")
+                # Generate count plot
+                if not self.cluster:
+                    cntplot = sns.countplot(data=data, x=feature, hue=target_column, order=top_categories)
+                else:
+                    cntplot = sns.countplot(data=data, x=feature, order=top_categories)
+                for p in cntplot.patches:
+                    height = p.get_height()
+                    if height > 0:  # Only display if height is greater than 0
+                        cntplot.annotate(f'{int(height)}',
+                                         (p.get_x() + p.get_width() / 2, height),
+                                         ha='center', va='bottom', fontsize=10, fontweight='bold')
+                if not self.cluster:
+                    plt.title(f"Distribution of {feature} by {target_column}")
+                else:
+                    plt.title(f"Distribution of {feature}")
+                plt.xlabel(feature)
+                plt.ylabel("Count")
+                plt.xticks(rotation=45, ha='right')  # Improve label visibility
+                if not self.cluster:
+                    plt.legend(title=target_column)
+                plt.tight_layout()
+                plt.show()
+    def _correlation(self, data, threshold=0.1, max_features=10, min_features=2):
+        """
+        DESCRIPTION:
+            Function to calculate the correlation values between features.
+        PARAMETERS:
+            data:
+                Required Argument.
+                Specifies the input pandas DataFrame for correlation analysis.
+                Types: pandas DataFrame
+            threshold:
+                Optional Argument.
+                Specifies the minimum correlation threshold for feature selection.
+                Default Value: 0.1
+                Types: float
+            max_features:
+                Optional Argument.
+                Specifies the maximum number of features to select.
+                Default Value: 10
+                Types: int
+            min_features:
+                Optional Argument.
+                Specifies the minimum number of features to select as fallback.
+                Default Value: 2
+                Types: int
+        """
+        import numpy as np
+        numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
+        # For AutoML, exclude target_column from numerical features
+        if not self.cluster and self.target_column in numerical_features:
+            numerical_features = [col for col in numerical_features if col != self.target_column]
+        total_numerical_features = len(numerical_features)
+        if self.cluster:
+            # Clustering: feature vs feature correlation
+            corr_matrix = data[numerical_features].corr()
+            # Extract upper triangle without diagonal
+            mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+            corr_vals = corr_matrix.where(mask).stack().reset_index()
+            corr_vals.columns = ['Feature1', 'Feature2', 'Correlation']
+            corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
+            corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)
+            filtered = corr_vals[corr_vals['Abs_Correlation'] > threshold].head(max_features)
+            selection_criteria = "Top Correlated Feature Pairs"
+            if len(filtered) < 2:
+                filtered = corr_vals.head(min(2, len(corr_vals)))
+                selection_criteria = f"Top {min(2, len(corr_vals))} Correlated Feature Pairs (Fallback)"
+            # Merge unique features from pairs
+            selected_features = list(set(filtered['Feature1'].tolist() + filtered['Feature2'].tolist()))
+            selected_features = selected_features[:max_features]  # restrict total features
+            corr_matrix = data[selected_features].corr()
+            return filtered, selected_features, corr_matrix, selection_criteria
+        else:
+            # AutoML: correlation with target column
+            correlation_values = data[numerical_features].corrwith(data[self.target_column])
+            correlation_df = correlation_values.reset_index()
+            correlation_df.columns = ['Feature', 'Correlation']
+            correlation_df['Abs_Correlation'] = correlation_df['Correlation'].abs()
+            correlation_df = correlation_df.sort_values(by='Abs_Correlation', ascending=False)
+            filtered = correlation_df[correlation_df['Abs_Correlation'] > threshold].head(max_features)
+            selection_criteria = "Features above threshold correlation with target"
+            if len(filtered) < 2:
+                filtered = correlation_df.head(min(min_features, total_numerical_features))
+                selection_criteria = f"Top {min(min_features, total_numerical_features)} Correlated Features (Fallback)"
+            selected_features = filtered['Feature'].tolist() + [self.target_column]
+            selected_features = list(dict.fromkeys(selected_features))  # preserve order, remove dup
+            corr_matrix = data[selected_features].corr()
+            return selected_features, corr_matrix, selection_criteria
+    def _boxplot_heatmap(self, plot_data=None):
+        """
+        DESCRIPTION:
+            Internal function to display heatmap and boxplots of selected numerical features.
+            Handles both AutoML (feature vs target) and Clustering (feature vs feature).
+        Parameters:
+            plot_data:
+                Optional Argument.
+                Specifies the data to be plotted.
+                Default Value: None. It will use entire dataset passed for training.
+                Types: teradataml DataFrame.
+        """
+        if self._check_visualization_libraries() and not _is_terminal():
+            import matplotlib.pyplot as plt
+            import seaborn as sns
+            import numpy as np
+            import pandas as pd
+            # Get DataFrame
+            if plot_data is not None:
+                data = plot_data.to_pandas().reset_index()
+            else:
+                # Perform ordinal encoding if needed for classification
+                if not self.cluster and self.data_types.get(self.target_column) in ['str']:
+                    self._ordinal_encoding([self.target_column])
+                data = self.data.to_pandas().reset_index()
+            if not self.cluster:
+                # Get selected features and correlation matrix
+                selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
+            else:
+                filtered, selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
+            # Display heatmap
+            mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=0)
+            plt.figure(figsize=(8, 6))
+            sns.heatmap(corr_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+            plt.title("Heatmap of Selected Features")
+            plt.show()
+            num_features = len(selected_features)
+            self._display_msg(msg=f'\nNumber of features selected for Boxplots: {num_features}', show_data=False)
+            self._display_msg(msg=f'\nSelection Criteria: {selection_criteria}', show_data=False)
+            self._display_msg(msg=f'\nSelected Features: {", ".join(selected_features)}', show_data=False)
+            self._display_msg(msg='\nBoxplots:', show_data=False)
+            if self.cluster:
+                num_plots = len(filtered)
+                cols = 2 if num_plots > 1 else 1
+                rows = (num_plots + cols - 1) // cols
+                fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4))
+                axes = axes.flatten() if len(filtered) > 1 else [axes]
+                for i, (idx, row) in enumerate(filtered.iterrows()):
+                    if i >= len(axes):
+                        break  # prevent IndexError if more data than axes
+                    feature_x, feature_y = row["Feature1"], row["Feature2"]
+                    x_unique = data[feature_x].nunique()
+                    x = data[feature_x]
+                    if x_unique > 20:
+                        x = pd.qcut(x, q=10, duplicates='drop')
+                    sns.boxplot(x=x, y=data[feature_y], ax=axes[i])
+                    axes[i].set_title(f"{feature_y} vs {feature_x}")
+                    axes[i].set_xlabel(feature_x)
+                    axes[i].set_ylabel(feature_y)
+                    axes[i].tick_params(axis='x', rotation=45)
+            else:
+                # Prepare boxplot layout
+                num_features = len(selected_features)
+                cols = 2 if num_features > 1 else 1
+                rows = max((num_features // 2) + (num_features % 2),1)
+                rows = max(rows, 1)
+                fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4))
+                axes = axes.flatten() if num_features > 1 else [axes]
+                # AutoML: Plot boxplot of feature vs target column
+                for i, feature in enumerate(selected_features):
+                    if feature != self.target_column:
+                        sns.boxplot(x=data[self.target_column], y=data[feature], ax=axes[i])
+                        axes[i].set_title(f"{feature}")
+                        axes[i].set_xlabel(self.target_column)
+                        axes[i].set_ylabel(feature)
+            plt.tight_layout()
+            plt.show()
+    def _scatter_plot(self, plot_data=None, max_selected_pairs=10, threshold=0.1):
+        """
+        DESCRIPTION:
+            Internal function to display scatterplots of selected numerical features.
+            Handles Clustering (feature vs feature).
+        PARAMETERS:
+            plot_data:
+                Optional Argument.
+                Specifies the input teradataml dataFrame for plotting scatter plots.
+                Default Value: None. It will use entire dataset passed for training.
+                Types: teradataml DataFrame
+            max_selected_pairs:
+                Optional Argument.
+                Specifies the maximum number of feature pairs to select for scatter plots.
+                Default Value: 10
+                Types: int
+            threshold:
+                Optional Argument.
+                Specifies the minimum correlation threshold for feature pair selection.
+                Default Value: 0.1
+                Types: float
+        """
+        if self._check_visualization_libraries() and not _is_terminal():
+            import matplotlib.pyplot as plt
+            import seaborn as sns
+            import numpy as np
+            # Load data
+            data = plot_data.to_pandas().reset_index() if plot_data is not None else self.data.to_pandas().reset_index()
+            # Select numerical features
+            numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
+            if len(numerical_features) < 2:
+                print("Not enough numerical features for scatter plots.")
+                return
+            # Compute correlation matrix
+            corr_matrix = data[numerical_features].corr()
+            # Extract upper triangle (excluding diagonal)
+            mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+            corr_vals = corr_matrix.where(mask).stack().reset_index()
+            corr_vals.columns = ['Feature1', 'Feature2', 'Correlation']
+            corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
+            # Sort and filter top pairs
+            corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)
+            filtered = corr_vals[corr_vals['Abs_Correlation'] > threshold].head(max_selected_pairs)
+            if len(filtered) < 2:
+                filtered = corr_vals.head(min(2, len(corr_vals)))
+            if len(filtered) == 0:
+                print("No correlated pairs found above threshold.")
+                return
+            self._display_msg(msg=f"\nScatter Plots for Top Correlated Feature Pairs:", show_data=False)
+            # Plot scatter plots
+            for _, row in filtered.iterrows():
+                feature_x, feature_y = row["Feature1"], row["Feature2"]
+                plt.figure(figsize=(6, 4))
+                sns.scatterplot(x=data[feature_x], y=data[feature_y], alpha=0.3)
+                plt.xlabel(feature_x)
+                plt.ylabel(feature_y)
+                plt.title(f"Scatter Plot: {feature_x} vs {feature_y} (Corr: {row['Correlation']:.2f})")
+                plt.tight_layout()
+                plt.show()
+    def _ordinal_encoding(self,
+                          ordinal_columns):
+        """
+        DESCRIPTION:
+            Function performs the ordinal encoding to categorical columns or features in the dataset.
+        PARAMETERS:
+            ordinal_columns:
+                Required Argument.
+                Specifies the categorical columns for which ordinal encoding will be performed.
+                Types: str or list of strings (str)
+        """
+        # Setting volatile and persist parameters for performing encoding
+        volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+                                                         param_name="CategoricalEncodingParam")
+        # Adding fit parameters for performing encoding
+        fit_params = {
+            "data" : self.data,
+            "target_column" : ordinal_columns,
+            "volatile" : volatile,
+            "persist" : persist
+        }
+        # Performing ordinal encoding fit on target columns
+        ord_fit_obj = OrdinalEncodingFit(**fit_params)
+        # Storing fit object and column list for ordinal encoding in data transform dictionary
+        if ordinal_columns[0] != self.target_column:
+            self.data_transform_dict["custom_ord_encoding_fit_obj"] = ord_fit_obj.result
+            self.data_transform_dict['custom_ord_encoding_col'] = ordinal_columns
+        else:
+            self.data_transform_dict['target_col_encode_ind'] = True
+            self.data_transform_dict['target_col_ord_encoding_fit_obj'] = ord_fit_obj.result
+        # Extracting accumulate columns
+        accumulate_columns = self._extract_list(self.data.columns, ordinal_columns)
+        # Adding transform parameters for performing encoding
+        transform_params = {
+            "data" : self.data,
+            "object" : ord_fit_obj.result,
+            "accumulate" : accumulate_columns,
+            "persist" : True
+        }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+        # Setting persist to False if volatile is True
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
+        # Performing ordinal encoding transformation
+        self.data = OrdinalEncodingTransform(**transform_params).result
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
+        if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
+            self.target_label = ord_fit_obj
+    def _extract_list(self,
+                      list1,
+                      list2):
+        """
+        DESCRIPTION:
+            Function to extract elements from list1 which are not present in list2.
+        PARAMETERS:
+            list1:
+                Required Argument.
+                Specifies the first list for extracting elements from.
+                Types: list
+            list2:
+                Required Argument.
+                Specifies the second list to get elements for avoiding in first list while extracting.
+                Types: list
+        RETURN:
+            Returns extracted elements in form of list.
+        """
+        new_lst = list(set(list1) - set(list2))
+        return new_lst
+    def _get_generic_parameters(self,
+                                func_indicator=None,
+                                param_name=None):
+        """
+        DESCRIPTION:
+            Function to get generic parameters.
+        PARAMETERS:
+            func_indicator:
+                Optional Argument.
+                Specifies the name of function indicator.
+                Types: str
+            param_name:
+                Optional Argument.
+                Specifies the name of the param which contains generic parameters.
+                Types: str
+        RETURNS:
+            Tuple containing volatile and persist parameters.
+        """
+        volatile = self.volatile
+        persist = self.persist
+        if self.custom_data is not None and self.custom_data.get(func_indicator, False):
+            volatile = self.custom_data[param_name].get("volatile", False)
+            persist = self.custom_data[param_name].get("persist", False)
+        return (volatile, persist)
     def _check_visualization_libraries(self):
         """
         DESCRIPTION:
@@ -287,8 +810,8 @@ class _FeatureExplore:
     def _outlier_detection(self,
                            outlier_method,
                            column_list,
-                           lower_percentile = None,
-                           upper_percentile = None):
+                           lower_percentile=None,
+                           upper_percentile=None):
         """
         DESCRIPTION:
             Function detects the outlier in numerical column and display thier percentage.

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

Potentially problematic release.

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl