PyPI - teradataml - Versions diffs - 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl - Mend

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic. Click here for more details.

Files changed (96)

teradataml/README.md +210 -0
teradataml/__init__.py +1 -1
teradataml/_version.py +1 -1
teradataml/analytics/analytic_function_executor.py +162 -76
teradataml/analytics/byom/__init__.py +1 -1
teradataml/analytics/json_parser/__init__.py +2 -0
teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
teradataml/analytics/json_parser/metadata.py +22 -4
teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
teradataml/analytics/sqle/__init__.py +3 -0
teradataml/analytics/utils.py +4 -1
teradataml/automl/__init__.py +2369 -464
teradataml/automl/autodataprep/__init__.py +15 -0
teradataml/automl/custom_json_utils.py +184 -112
teradataml/automl/data_preparation.py +113 -58
teradataml/automl/data_transformation.py +154 -53
teradataml/automl/feature_engineering.py +113 -53
teradataml/automl/feature_exploration.py +548 -25
teradataml/automl/model_evaluation.py +260 -32
teradataml/automl/model_training.py +399 -206
teradataml/clients/auth_client.py +2 -2
teradataml/common/aed_utils.py +11 -2
teradataml/common/bulk_exposed_utils.py +4 -2
teradataml/common/constants.py +62 -2
teradataml/common/garbagecollector.py +50 -21
teradataml/common/messagecodes.py +47 -2
teradataml/common/messages.py +19 -1
teradataml/common/sqlbundle.py +23 -6
teradataml/common/utils.py +116 -10
teradataml/context/aed_context.py +16 -10
teradataml/data/Employee.csv +5 -0
teradataml/data/Employee_Address.csv +4 -0
teradataml/data/Employee_roles.csv +5 -0
teradataml/data/JulesBelvezeDummyData.csv +100 -0
teradataml/data/byom_example.json +5 -0
teradataml/data/creditcard_data.csv +284618 -0
teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
teradataml/data/load_example_data.py +29 -11
teradataml/data/payment_fraud_dataset.csv +10001 -0
teradataml/data/teradataml_example.json +67 -0
teradataml/dataframe/copy_to.py +714 -54
teradataml/dataframe/dataframe.py +1153 -33
teradataml/dataframe/dataframe_utils.py +8 -3
teradataml/dataframe/functions.py +168 -1
teradataml/dataframe/setop.py +4 -1
teradataml/dataframe/sql.py +141 -9
teradataml/dbutils/dbutils.py +470 -35
teradataml/dbutils/filemgr.py +1 -1
teradataml/hyperparameter_tuner/optimizer.py +456 -142
teradataml/lib/aed_0_1.dll +0 -0
teradataml/lib/libaed_0_1.dylib +0 -0
teradataml/lib/libaed_0_1.so +0 -0
teradataml/lib/libaed_0_1_aarch64.so +0 -0
teradataml/scriptmgmt/UserEnv.py +234 -34
teradataml/scriptmgmt/lls_utils.py +43 -17
teradataml/sdk/_json_parser.py +1 -1
teradataml/sdk/api_client.py +9 -6
teradataml/sdk/modelops/_client.py +3 -0
teradataml/series/series.py +12 -7
teradataml/store/feature_store/constants.py +601 -234
teradataml/store/feature_store/feature_store.py +2886 -616
teradataml/store/feature_store/mind_map.py +639 -0
teradataml/store/feature_store/models.py +5831 -214
teradataml/store/feature_store/utils.py +390 -0
teradataml/table_operators/table_operator_util.py +1 -1
teradataml/table_operators/templates/dataframe_register.template +6 -2
teradataml/table_operators/templates/dataframe_udf.template +6 -2
teradataml/utils/docstring.py +527 -0
teradataml/utils/dtypes.py +93 -0
teradataml/utils/internal_buffer.py +2 -2
teradataml/utils/utils.py +41 -2
teradataml/utils/validators.py +694 -17
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
{teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0

teradataml/automl/model_evaluation.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright 2024 Teradata. All rights reserved.
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Sweta Shaw
@@ -20,6 +20,8 @@ import ast
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.automl.model_training import _ModelTraining
+from teradataml.automl.feature_exploration import _FeatureExplore
+from teradataml import Shap
 class _ModelEvaluator:
@@ -27,7 +29,8 @@ class _ModelEvaluator:
     def __init__(self,
                  df=None,
                  target_column=None,
-                 task_type=None):
+                 task_type=None,
+                 cluster=False):
         """
         DESCRIPTION:
             Function initializes the data, target column, features and models
@@ -52,17 +55,26 @@ class _ModelEvaluator:
                 Permitted Values: "Regression", "Classification"
                 Types: str
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
         """
         self.model_info = df
         self.target_column = target_column
         self.task_type = task_type
+        self.cluster = cluster
+        self.shap_results = None
     def model_evaluation(self,
                          rank,
                          table_name_mapping,
                          data_node_id,
-                         target_column_ind = True,
-                         get_metrics = False):
+                         target_column_ind=True,
+                         get_metrics=False,
+                         is_predict=False):
         """
         DESCRIPTION:
             Function performs the model evaluation on the specified rank in leaderboard.
@@ -94,7 +106,12 @@ class _ModelEvaluator:
                 Specifies whether to return metrics or not.
                 Default Value: False
                 Types: bool
+            is_predict:
+                Optional Argument.
+                Specifies whether predict is called or evaluate is called.
+                Default Value: False
+                Types: bool
         RETURNS:
             tuple containing performance metrics and predictions of specified rank ML model.
@@ -105,8 +122,25 @@ class _ModelEvaluator:
         self.data_node_id = data_node_id
         self.get_metrics = get_metrics
-        # Return predictions only if test data is present and target column is not present
-        return self._evaluator(rank)
+        # Perform evaluation
+        if self.cluster:
+            evaluation_results, test_data = self._evaluator(rank)
+        else:
+            evaluation_results = self._evaluator(rank)
+        # Apply SHAP if applicable
+        if is_predict:
+            if not self.cluster:
+                model_id = self.model_info.loc[rank]['MODEL_ID'].split('_')[0]
+                permitted_models = ["XGBOOST", "DECISIONFOREST"]
+                if model_id.upper() in permitted_models:
+                    print("\nApplying SHAP for Model Interpretation...")
+                    self._apply_shap(rank, isload=False)
+                else:
+                    print(f"\nSHAP is not applied for {model_id}. Only permitted models: {permitted_models}")
+            else:
+                self._visualize_cluster(test_data)
+        return evaluation_results
     def _evaluator(self,
                    rank):
@@ -130,31 +164,225 @@ class _ModelEvaluator:
         ml_name = self.model_info.loc[rank]['MODEL_ID'].split('_')[0]
-        # Defining eval_params
-        eval_params = _ModelTraining._eval_params_generation(ml_name,
-                                                             self.target_column,
-                                                             self.task_type)
+        if not self.cluster:
+            # Defining eval_params
+            eval_params = _ModelTraining._eval_params_generation(ml_name,
+                                                                 self.target_column,
+                                                                 self.task_type)
+            # Extracting test data for evaluation based on data node id
+            test = DataFrame(self.table_name_mapping[self.data_node_id]['{}_test'.format(model['FEATURE_SELECTION'])])
+            print("\nFollowing model is being picked for evaluation:")
+            print("Model ID :", model['MODEL_ID'],
+                "\nFeature Selection Method :",model['FEATURE_SELECTION'])
+            if self.task_type.lower() == 'classification':
+                params = ast.literal_eval(model['PARAMETERS'])
+                eval_params['output_responses'] = params['output_responses']
+            # Mapping data according to model type
+            data_map = 'test_data' if ml_name == 'KNN' else 'newdata'
+            # Performing evaluation if get_metrics is True else returning predictions
+            if self.get_metrics:
+                metrics = model['model-obj'].evaluate(**{data_map: test}, **eval_params)
+                return metrics
+            else:
+                # Removing accumulate parameter if target column is not present
+                if not self.target_column_ind:
+                    eval_params.pop("accumulate")
+                pred = model['model-obj'].predict(**{data_map: test}, **eval_params)
+                return pred
+        else:
+            print("\nFollowing model is being picked for evaluation of clustering:")
+            print("Model ID :", model['MODEL_ID'],
+                "\nFeature Selection Method :",model['FEATURE_SELECTION'])
+            feature_type = model["FEATURE_SELECTION"]
+            test_table_key = f"{feature_type}_test"
+            if test_table_key not in self.table_name_mapping[self.data_node_id]:
+                raise KeyError(f"Table key '{test_table_key}' not found in table_name_mapping. Available keys: {self.table_name_mapping[self.data_node_id].keys()}")
+            test_data = DataFrame(self.table_name_mapping[self.data_node_id][test_table_key])
+            if self.get_metrics:
+                from teradataml import td_sklearn as skl
+                X = test_data
+                result = model["model-obj"].predict(X)
+                silhouette = skl.silhouette_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
+                calinski = skl.calinski_harabasz_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
+                davies = skl.davies_bouldin_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
+                return {
+                    "SILHOUETTE": silhouette,
+                    "CALINSKI": calinski,
+                    "DAVIES": davies
+                }, test_data
+            else:
+                return model["model-obj"].predict(test_data),test_data
+    def _apply_shap(self, rank, isload):
+        """
+        DESCRIPTION:
+            Applies SHAP analysis to explain model predictions after evaluation.
+        PARAMETERS:
+            rank:
+                Required Argument.
+                Specifies the position(rank) of ML model for evaluation.
+                Types: int
+            isload:
+                Required Argument.
+                Specifies whether load is calling the function or not.
+                Types: bool
+        """
-        # Extracting test data for evaluation based on data node id
-        test = DataFrame(self.table_name_mapping[self.data_node_id]['{}_new_test'.format(model['FEATURE_SELECTION'])])
+        test_data = DataFrame(self.table_name_mapping[self.data_node_id]['{}_test'.format(self.model_info.loc[rank]['FEATURE_SELECTION'])])
+        id_column = "id"
+        input_columns = [col for col in test_data.columns if col != self.target_column and col != id_column]
-        print("\nFollowing model is being picked for evaluation:")
-        print("Model ID :", model['MODEL_ID'],
-              "\nFeature Selection Method :",model['FEATURE_SELECTION'])
+        if isload:
+            result_table_name = self.model_info.loc[rank, 'RESULT_TABLE']
+            model_object = DataFrame(result_table_name)
+        else:
+            model_obj = self.model_info.loc[rank]['model-obj']
+            model_object = model_obj.result
-        if self.task_type.lower() == 'classification':
-            params = ast.literal_eval(model['PARAMETERS'])
-            eval_params['output_responses'] = params['output_responses']
+        # Extract model training function from MODEL_ID and format it correctly
+        raw_model_id = self.model_info.loc[rank]['MODEL_ID'].split('_')[0]  # Extract base model name
+        formatted_training_function = "TD_" + raw_model_id  # Add TD_ prefix
+        #Currently issue with default value of model_type, it is not case insensitive
+        #Hence, converting task_type to lower case
+        shap_output = Shap(
+            data=test_data,
+            object=model_object,
+            id_column='id',
+            training_function=formatted_training_function,
+            model_type=self.task_type.lower(),
+            input_columns=input_columns,
+            detailed=True
+        )
-        # Mapping data according to model type
-        data_map = 'test_data' if ml_name == 'KNN' else 'newdata'
-        # Performing evaluation if get_metrics is True else returning predictions
-        if self.get_metrics:
-            metrics = model['model-obj'].evaluate(**{data_map: test}, **eval_params)
-            return metrics
-        else:
-            # Removing accumulate parameter if target column is not present
-            if not self.target_column_ind:
-                eval_params.pop("accumulate")
-            pred = model['model-obj'].predict(**{data_map: test}, **eval_params)
-            return pred
+        self.shap_results = shap_output.output_data
+        print("\nSHAP Analysis Completed. Feature Importance Available.")
+        # Extract SHAP values for visualization
+        df = self.shap_results
+        data = next(df.itertuples())._asdict()
+        import matplotlib.pyplot as plt
+        # Extract keys and values
+        keys = list(data.keys())
+        values = list(data.values())
+        # Plot SHAP values as a bar graph
+        plt.figure(figsize=(10, 6))
+        bars = plt.bar(keys, values, color='skyblue', edgecolor='black')
+        for bar in bars:
+            yval = bar.get_height()
+            plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')
+        plt.xticks(rotation=45, ha='right')
+        plt.title('Feature Importance (SHAP Values)', fontsize=14)
+        plt.xlabel('Features', fontsize=12)
+        plt.ylabel('SHAP Value', fontsize=12)
+        plt.grid(axis='y', linestyle='--', alpha=0.7)
+        plt.tight_layout()
+        plt.show()
+    def _visualize_cluster(self, test_data):
+        print("\nVisualizing Clusters for interpretability...")
+        df = test_data.to_pandas()
+        print(df.head())
+        from sklearn.cluster import KMeans
+        import numpy as np
+        import matplotlib.pyplot as plt
+        # Automatically pick top 2 high variance numeric features
+        numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
+        if 'id' in numerical_features:
+            numerical_features.remove('id')
+        if len(numerical_features) < 2:
+            print("Not enough numeric features available for scatter plot.")
+            return
+        # Compute correlation matrix
+        corr_matrix = df[numerical_features].corr()
+        # Extract upper triangle without diagonal
+        mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+        corr_vals = corr_matrix.where(mask).stack().reset_index()
+        corr_vals.columns = ['Feature1', 'Feature2', 'Correlation']
+        corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
+        # Sort and select top pair
+        corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)
+        filtered = corr_vals[corr_vals['Abs_Correlation'] > 0.1].head(1)
+        variances = df[numerical_features].var().sort_values(ascending=False)
+        top_features = variances.index[:2].tolist()
+        print("Selection Criteria: Top 2 High Variance Features")
+        print(f"Selected Features: {top_features[0]}, {top_features[1]}")
+        X = df[top_features].values
+        kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300,
+                        tol=0.0001, random_state=111, algorithm='elkan')
+        kmeans.fit(X)
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as mpatches
+        import numpy as np
+        from matplotlib.colors import ListedColormap
+        # Define a fixed color map
+        cmap = ListedColormap(plt.cm.Pastel2.colors)
+        n_clusters = len(np.unique(kmeans.labels_))
+        colors = cmap.colors[:n_clusters]
+        # Plot decision regions
+        h = 0.02
+        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
+        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
+        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                             np.arange(y_min, y_max, h))
+        Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
+        Z = Z.reshape(xx.shape)
+        plt.figure(figsize=(14, 7))
+        plt.imshow(Z, interpolation='nearest',
+                   extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+                   cmap=ListedColormap(colors), aspect='auto', origin='lower', zorder=1)
+        # Plot actual clustered data points (zorder > 1)
+        cluster_colors = [colors[label] for label in kmeans.labels_]
+        plt.scatter(X[:, 0], X[:, 1], c=cluster_colors, s=100, edgecolor='k', alpha=0.85, zorder=2)
+        # Plot red centroids
+        centroids = kmeans.cluster_centers_
+        plt.scatter(centroids[:, 0], centroids[:, 1],
+                    s=300, c='red', alpha=0.7, zorder=3)
+        # Annotate centroids
+        for i, (x, y) in enumerate(centroids):
+            """plt.text(x, y + 0.05, f'Cluster {i}', fontsize=11, weight='bold',
+                    ha='center', va='bottom', zorder=4)"""
+            plt.text(x, y - 0.05, f'({x:.2f}, {y:.2f})', fontsize=9,
+                    ha='center', va='top', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'), zorder=4)
+        # Legend (manually matched)
+        legend_handles = [mpatches.Patch(color=colors[i], label=f'Cluster {i}') for i in range(n_clusters)]
+        plt.legend(handles=legend_handles, title="Cluster ID", loc='upper right')
+        # Axis labels and title
+        plt.xlabel(top_features[0])
+        plt.ylabel(top_features[1])
+        plt.title("Cluster Visualization on Test Data")
+        plt.show()

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

Potentially problematic release.

teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl