teradataml-20.0.0.6-py3-none-any.whl → teradataml-20.0.0.7-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.
Files changed (96)
  1. teradataml/README.md +210 -0
  2. teradataml/__init__.py +1 -1
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +162 -76
  5. teradataml/analytics/byom/__init__.py +1 -1
  6. teradataml/analytics/json_parser/__init__.py +2 -0
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
  8. teradataml/analytics/json_parser/metadata.py +22 -4
  9. teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
  10. teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
  11. teradataml/analytics/sqle/__init__.py +3 -0
  12. teradataml/analytics/utils.py +4 -1
  13. teradataml/automl/__init__.py +2369 -464
  14. teradataml/automl/autodataprep/__init__.py +15 -0
  15. teradataml/automl/custom_json_utils.py +184 -112
  16. teradataml/automl/data_preparation.py +113 -58
  17. teradataml/automl/data_transformation.py +154 -53
  18. teradataml/automl/feature_engineering.py +113 -53
  19. teradataml/automl/feature_exploration.py +548 -25
  20. teradataml/automl/model_evaluation.py +260 -32
  21. teradataml/automl/model_training.py +399 -206
  22. teradataml/clients/auth_client.py +2 -2
  23. teradataml/common/aed_utils.py +11 -2
  24. teradataml/common/bulk_exposed_utils.py +4 -2
  25. teradataml/common/constants.py +62 -2
  26. teradataml/common/garbagecollector.py +50 -21
  27. teradataml/common/messagecodes.py +47 -2
  28. teradataml/common/messages.py +19 -1
  29. teradataml/common/sqlbundle.py +23 -6
  30. teradataml/common/utils.py +116 -10
  31. teradataml/context/aed_context.py +16 -10
  32. teradataml/data/Employee.csv +5 -0
  33. teradataml/data/Employee_Address.csv +4 -0
  34. teradataml/data/Employee_roles.csv +5 -0
  35. teradataml/data/JulesBelvezeDummyData.csv +100 -0
  36. teradataml/data/byom_example.json +5 -0
  37. teradataml/data/creditcard_data.csv +284618 -0
  38. teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
  39. teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
  40. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
  41. teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
  42. teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
  43. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
  44. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
  45. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
  46. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
  47. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
  48. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
  49. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
  50. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
  51. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
  52. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
  53. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
  54. teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
  55. teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
  56. teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
  57. teradataml/data/load_example_data.py +29 -11
  58. teradataml/data/payment_fraud_dataset.csv +10001 -0
  59. teradataml/data/teradataml_example.json +67 -0
  60. teradataml/dataframe/copy_to.py +714 -54
  61. teradataml/dataframe/dataframe.py +1153 -33
  62. teradataml/dataframe/dataframe_utils.py +8 -3
  63. teradataml/dataframe/functions.py +168 -1
  64. teradataml/dataframe/setop.py +4 -1
  65. teradataml/dataframe/sql.py +141 -9
  66. teradataml/dbutils/dbutils.py +470 -35
  67. teradataml/dbutils/filemgr.py +1 -1
  68. teradataml/hyperparameter_tuner/optimizer.py +456 -142
  69. teradataml/lib/aed_0_1.dll +0 -0
  70. teradataml/lib/libaed_0_1.dylib +0 -0
  71. teradataml/lib/libaed_0_1.so +0 -0
  72. teradataml/lib/libaed_0_1_aarch64.so +0 -0
  73. teradataml/scriptmgmt/UserEnv.py +234 -34
  74. teradataml/scriptmgmt/lls_utils.py +43 -17
  75. teradataml/sdk/_json_parser.py +1 -1
  76. teradataml/sdk/api_client.py +9 -6
  77. teradataml/sdk/modelops/_client.py +3 -0
  78. teradataml/series/series.py +12 -7
  79. teradataml/store/feature_store/constants.py +601 -234
  80. teradataml/store/feature_store/feature_store.py +2886 -616
  81. teradataml/store/feature_store/mind_map.py +639 -0
  82. teradataml/store/feature_store/models.py +5831 -214
  83. teradataml/store/feature_store/utils.py +390 -0
  84. teradataml/table_operators/table_operator_util.py +1 -1
  85. teradataml/table_operators/templates/dataframe_register.template +6 -2
  86. teradataml/table_operators/templates/dataframe_udf.template +6 -2
  87. teradataml/utils/docstring.py +527 -0
  88. teradataml/utils/dtypes.py +93 -0
  89. teradataml/utils/internal_buffer.py +2 -2
  90. teradataml/utils/utils.py +41 -2
  91. teradataml/utils/validators.py +694 -17
  92. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
  93. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
  94. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
  95. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
  96. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
@@ -1,6 +1,6 @@
  # ##################################################################
  #
- # Copyright 2024 Teradata. All rights reserved.
+ # Copyright 2025 Teradata. All rights reserved.
  # TERADATA CONFIDENTIAL AND TRADE SECRET
  #
  # Primary Owner: Sweta Shaw
@@ -24,11 +24,13 @@ from teradataml.dataframe.dataframe import DataFrame
  from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns
  from teradataml import OutlierFilterFit, OutlierFilterTransform
+ from teradataml import OrdinalEncodingFit, OrdinalEncodingTransform
  from teradataml.hyperparameter_tuner.utils import _ProgressBar
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml import display as dp
  from teradataml.utils.validators import _Validators
  from teradataml.common.utils import UtilFuncs
+ from teradataml.common.garbagecollector import GarbageCollector

  def _is_terminal():
      """
@@ -59,10 +61,15 @@ if not _is_terminal():
  class _FeatureExplore:

      def __init__(self,
-                  data=None,
-                  target_column=None,
-                  verbose=0,
-                  task_type='regression'):
+                  data=None,
+                  target_column=None,
+                  custom_data=None,
+                  verbose=0,
+                  task_type='regression',
+                  fraud=False,
+                  churn=False,
+                  cluster=False,
+                  **kwargs):
          """
          DESCRIPTION:
              Internal function initializes the data, target column for feature exploration.
@@ -75,9 +82,15 @@ class _FeatureExplore:

              target_column:
                  Required Argument.
+                 Set to None for Clustering.
                  Specifies the name of the target column in "data".
                  Types: str
-
+
+             custom_data:
+                 Optional Argument.
+                 Specifies the JSON object containing user customized input.
+                 Types: json object
+
              verbose:
                  Optional Argument.
                  Specifies the detailed execution steps based on verbose level.
@@ -96,13 +109,38 @@ class _FeatureExplore:
                      * 'regression'
                      * 'classification'
                  Types: str
+
+             fraud:
+                 Optional Argument.
+                 Specifies whether to apply fraud detection techniques.
+                 Default Value: False
+                 Types: bool
+
+             churn:
+                 Optional Argument.
+                 Specifies whether to apply churn prediction techniques.
+                 Default Value: False
+                 Types: bool
+
+             cluster:
+                 Optional Argument.
+                 Specifies whether to apply clustering techniques.
+                 Default Value: False
+                 Types: bool
          """
          self.data = data
          self.target_column = target_column
          self.verbose = verbose
+         self.custom_data = custom_data
+         self.data_transform_dict = {}
+         self.data_types = {key: value for key, value in self.data._column_names_and_types}
          self.terminal_print = _is_terminal()
          self.style = self._common_style()
          self.task_type = task_type
+
+         self.fraud = fraud
+         self.churn = churn
+         self.cluster = cluster

      def _exploration(self,
                       **kwargs):
@@ -113,8 +151,12 @@
              2. Statistics of numeric columns of the dataset
              3. Categorical column summary
              4. Futile columns in the dataset
-             5. Target column distribution
+             5. Target column distribution, not applicable for Clustering task_type
              6. Outlier Percentage in numeric columns of the dataset
+             7. Heatmap of Numerical Features
+             8. Boxplots of Feature Distribution
+             9. Countplot of Categorical features
+             10. Scatterplot for selected features for Clustering task_type
          """
          numerical_columns = []
          categorical_columns= []
@@ -142,7 +184,7 @@
          # Displaying date columns
          if len(date_column_list)!=0:
              self._display_msg(msg='Identified Date Columns:',
-                              data=date_column_list)
+                               data=date_column_list)

          # Column Summary of each feature of data
          # such as null count, datatype, non null count
@@ -155,14 +197,30 @@
          if len(categorical_columns) != 0:
              categorical_obj = self._categorical_summary(categorical_columns)
              self._futile_column(categorical_obj)
+
+         if not self.cluster:
+             # Plot a graph of target column
+             self._target_column_details()

-         # Plot a graph of target column
-         self._target_column_details()

          # Displays outlier percentage
-         outlier_method = "Tukey"
-         df = self._outlier_detection(outlier_method,numerical_columns)
+         if self.fraud or self.churn:
+             outlier_method = "percentile"
+             df = self._outlier_detection(outlier_method, numerical_columns)
+         else:
+             outlier_method = "Tukey"
+             df = self._outlier_detection(outlier_method, numerical_columns)

+
+         if self.fraud or self.churn or self.cluster:
+             # Boxplots and Heatmap for feature distribution by target column
+             self._boxplot_heatmap()
+
+             # Countplots for feature distribution by target column
+             self._countplot_categorical_distribution()
+         if self.cluster:
+             self._scatter_plot()
+
      def _statistics(self):
          """
          DESCRIPTION:
@@ -172,8 +230,7 @@
          self._display_msg(msg='\nStatistics of Data:',
                            data=self.data.describe(),
                            show_data=True)
-
-
+
      def _column_summary(self):
          """
          DESCRIPTION:
@@ -228,7 +285,7 @@
          PARAMETERS:
              categorical_obj:
                  Required Argument.
-                 Specifies the instance of CategoricalSummary for futile column detection..
+                 Specifies the instance of CategoricalSummary for futile column detection.
                  Types: Instance of CategoricalSummary
          """
          # Futile columns detection using categorical column object
@@ -248,23 +305,489 @@ class _FeatureExplore:
                            data=gfc_out.result,
                            show_data=True)

-     def _target_column_details(self):
+     def _target_column_details(self,
+                                plot_data=None):
          """
          DESCRIPTION:
              Internal function displays the target column distribution of Target column/ Response column.

          PARAMETERS:
-             None
+             plot_data:
+                 Optional Argument.
+                 Specifies the input teradataml DataFrame for plotting distribution.
+                 Types: teradataml DataFrame
          """
          if self._check_visualization_libraries() and not _is_terminal():
-             # Plotting target column distribution
+             import matplotlib.pyplot as plt
+             import seaborn as sns
+             if plot_data is None:
+                 target_data = self.data.select([self.target_column]).to_pandas()
+             else:
+                 target_data = plot_data[[self.target_column]]
              self._display_msg(msg='\nTarget Column Distribution:',
                                show_data=True)
-             _FeatureExplore._visualize(data=self.data,
-                                        target_column=self.target_column,
-                                        plot_type=["target"],
-                                        problem_type=self.task_type)
-
+             plt.figure(figsize=(8, 6))
+             # Plotting a histogram for target column
+             plt.hist(target_data, bins=10, density=True, edgecolor='black')
+             plt.xlabel(self.target_column)
+             plt.ylabel('Density')
+             plt.show()
+
+     def _countplot_categorical_distribution(self, plot_data=None, top_n=20, max_unique_threshold=50):
+         """
+         DESCRIPTION:
+             Function to plot count plots for categorical features based on the target column.
+             Limits the number of unique categories to avoid messy visuals.
+
+         PARAMETERS:
+             plot_data:
+                 Optional Argument.
+                 Specifies the input teradataml DataFrame for plotting distribution.
+                 Default Value: None. It will use the entire dataset passed for training.
+                 Types: teradataml DataFrame
+
+             top_n:
+                 Optional Argument.
+                 Maximum number of categories to display per feature.
+                 Default Value: 20
+                 Types: int
+
+             max_unique_threshold:
+                 Optional Argument.
+                 Only plot features with unique values below this threshold.
+                 Default Value: 50
+                 Types: int
+         """
+         if self._check_visualization_libraries() and not _is_terminal():
+             import matplotlib.pyplot as plt
+             import seaborn as sns
+
+             # Prepare data
+             if plot_data is None:
+                 data = self.data.to_pandas().reset_index()
+             else:
+                 data = plot_data
+
+             target_column = self.target_column
+
+             # Select categorical features
+             categorical_features = data.select_dtypes(include=['object', 'category']).columns
+
+             if not self.cluster:
+                 categorical_features = [col for col in categorical_features if col != target_column]
+
+             # Filter categorical features based on unique value threshold
+             categorical_features = [col for col in categorical_features if data[col].nunique() <= max_unique_threshold]
+
+             if len(categorical_features) == 0:
+                 print("No categorical columns found with unique values within the threshold.")
+                 return
+
+             self._display_msg(msg='\nCategorical Feature Distributions by Target Column (Count Plots):',
+                               show_data=False)
+
+             for feature in categorical_features:
+                 plt.figure(figsize=(10, 6))
+
+                 # Get value counts and filter top N categories
+                 value_counts = data[feature].value_counts()
+
+                 top_categories = value_counts.nlargest(top_n).index.tolist()
+
+                 # Remove duplicates while preserving order
+                 top_categories = list(dict.fromkeys(top_categories))
+
+                 # Replace less frequent categories with "Other"
+                 data[feature] = data[feature].apply(lambda x: x if x in top_categories else "Other")
+
+                 # Generate count plot
+                 if not self.cluster:
+                     cntplot = sns.countplot(data=data, x=feature, hue=target_column, order=top_categories)
+                 else:
+                     cntplot = sns.countplot(data=data, x=feature, order=top_categories)
+                 for p in cntplot.patches:
+                     height = p.get_height()
+                     if height > 0:  # Only display if height is greater than 0
+                         cntplot.annotate(f'{int(height)}',
+                                          (p.get_x() + p.get_width() / 2, height),
+                                          ha='center', va='bottom', fontsize=10, fontweight='bold')
+
+                 if not self.cluster:
+                     plt.title(f"Distribution of {feature} by {target_column}")
+                 else:
+                     plt.title(f"Distribution of {feature}")
+                 plt.xlabel(feature)
+                 plt.ylabel("Count")
+                 plt.xticks(rotation=45, ha='right')  # Improve label visibility
+                 if not self.cluster:
+                     plt.legend(title=target_column)
+                 plt.tight_layout()
+                 plt.show()
+
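
For reviewers skimming this hunk: the top-N bucketing that _countplot_categorical_distribution applies before plotting can be reproduced standalone in pandas. A minimal sketch (plain pandas, outside teradataml; the "city" column is made up for illustration):

    import pandas as pd

    df = pd.DataFrame({"city": ["NY", "LA", "NY", "SF", "NY", "LA", "CHI", "BOS"]})

    top_n = 2
    # Keep the top_n most frequent categories and fold the rest into "Other",
    # mirroring the value_counts().nlargest(top_n) logic above.
    top_categories = df["city"].value_counts().nlargest(top_n).index.tolist()
    df["city"] = df["city"].apply(lambda x: x if x in top_categories else "Other")

    print(df["city"].value_counts())  # NY and LA kept; SF/CHI/BOS become "Other"
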
+     def _correlation(self, data, threshold=0.1, max_features=10, min_features=2):
+         """
+         DESCRIPTION:
+             Function to calculate the correlation values between features.
+
+         PARAMETERS:
+             data:
+                 Required Argument.
+                 Specifies the input pandas DataFrame for correlation analysis.
+                 Types: pandas DataFrame
+
+             threshold:
+                 Optional Argument.
+                 Specifies the minimum correlation threshold for feature selection.
+                 Default Value: 0.1
+                 Types: float
+
+             max_features:
+                 Optional Argument.
+                 Specifies the maximum number of features to select.
+                 Default Value: 10
+                 Types: int
+
+             min_features:
+                 Optional Argument.
+                 Specifies the minimum number of features to select as fallback.
+                 Default Value: 2
+                 Types: int
+         """
+         import numpy as np
+
+         numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
+
+         # For AutoML, exclude target_column from numerical features
+         if not self.cluster and self.target_column in numerical_features:
+             numerical_features = [col for col in numerical_features if col != self.target_column]
+
+         total_numerical_features = len(numerical_features)
+
+         if self.cluster:
+             # Clustering: feature vs feature correlation
+             corr_matrix = data[numerical_features].corr()
+             # Extract upper triangle without diagonal
+             mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+             corr_vals = corr_matrix.where(mask).stack().reset_index()
+             corr_vals.columns = ['Feature1', 'Feature2', 'Correlation']
+             corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
+             corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)
+
+             filtered = corr_vals[corr_vals['Abs_Correlation'] > threshold].head(max_features)
+             selection_criteria = "Top Correlated Feature Pairs"
+
+             if len(filtered) < 2:
+                 filtered = corr_vals.head(min(2, len(corr_vals)))
+                 selection_criteria = f"Top {min(2, len(corr_vals))} Correlated Feature Pairs (Fallback)"
+
+             # Merge unique features from pairs
+             selected_features = list(set(filtered['Feature1'].tolist() + filtered['Feature2'].tolist()))
+             selected_features = selected_features[:max_features]  # restrict total features
+             corr_matrix = data[selected_features].corr()
+
+             return filtered, selected_features, corr_matrix, selection_criteria
+         else:
+             # AutoML: correlation with target column
+             correlation_values = data[numerical_features].corrwith(data[self.target_column])
+             correlation_df = correlation_values.reset_index()
+             correlation_df.columns = ['Feature', 'Correlation']
+             correlation_df['Abs_Correlation'] = correlation_df['Correlation'].abs()
+             correlation_df = correlation_df.sort_values(by='Abs_Correlation', ascending=False)
+
+             filtered = correlation_df[correlation_df['Abs_Correlation'] > threshold].head(max_features)
+             selection_criteria = "Features above threshold correlation with target"
+
+             if len(filtered) < 2:
+                 filtered = correlation_df.head(min(min_features, total_numerical_features))
+                 selection_criteria = f"Top {min(min_features, total_numerical_features)} Correlated Features (Fallback)"
+
+             selected_features = filtered['Feature'].tolist() + [self.target_column]
+             selected_features = list(dict.fromkeys(selected_features))  # preserve order, remove dup
+             corr_matrix = data[selected_features].corr()
+
+             return selected_features, corr_matrix, selection_criteria
+
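
The clustering branch above selects feature pairs by masking the upper triangle of the correlation matrix and stacking it into (Feature1, Feature2, Correlation) rows. A self-contained sketch of that idiom on synthetic data (illustrative only, not teradataml code):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    data = pd.DataFrame(rng.normal(size=(100, 4)), columns=["a", "b", "c", "d"])
    data["b"] = data["a"] * 0.9 + rng.normal(scale=0.1, size=100)  # force one strong pair

    corr_matrix = data.corr()
    # k=1 keeps strictly-above-diagonal entries, so each pair appears exactly once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
    corr_vals = corr_matrix.where(mask).stack().reset_index()
    corr_vals.columns = ["Feature1", "Feature2", "Correlation"]
    corr_vals["Abs_Correlation"] = corr_vals["Correlation"].abs()

    # Strongest pairs first; ("a", "b") should top the list.
    print(corr_vals.sort_values("Abs_Correlation", ascending=False).head())
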
+     def _boxplot_heatmap(self, plot_data=None):
+         """
+         DESCRIPTION:
+             Internal function to display heatmap and boxplots of selected numerical features.
+             Handles both AutoML (feature vs target) and Clustering (feature vs feature).
+
+         PARAMETERS:
+             plot_data:
+                 Optional Argument.
+                 Specifies the data to be plotted.
+                 Default Value: None. It will use the entire dataset passed for training.
+                 Types: teradataml DataFrame
+         """
+         if self._check_visualization_libraries() and not _is_terminal():
+             import matplotlib.pyplot as plt
+             import seaborn as sns
+             import numpy as np
+             import pandas as pd
+
+             # Get DataFrame
+             if plot_data is not None:
+                 data = plot_data.to_pandas().reset_index()
+             else:
+                 # Perform ordinal encoding if needed for classification
+                 if not self.cluster and self.data_types.get(self.target_column) in ['str']:
+                     self._ordinal_encoding([self.target_column])
+                 data = self.data.to_pandas().reset_index()
+
+             if not self.cluster:
+                 # Get selected features and correlation matrix
+                 selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
+             else:
+                 filtered, selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
+
+             # Display heatmap
+             mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=0)
+             plt.figure(figsize=(8, 6))
+             sns.heatmap(corr_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+             plt.title("Heatmap of Selected Features")
+             plt.show()
+
+             num_features = len(selected_features)
+             self._display_msg(msg=f'\nNumber of features selected for Boxplots: {num_features}', show_data=False)
+             self._display_msg(msg=f'\nSelection Criteria: {selection_criteria}', show_data=False)
+             self._display_msg(msg=f'\nSelected Features: {", ".join(selected_features)}', show_data=False)
+             self._display_msg(msg='\nBoxplots:', show_data=False)
+
+             if self.cluster:
+                 num_plots = len(filtered)
+                 cols = 2 if num_plots > 1 else 1
+                 rows = (num_plots + cols - 1) // cols
+
+                 fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4))
+                 axes = axes.flatten() if len(filtered) > 1 else [axes]
+
+                 for i, (idx, row) in enumerate(filtered.iterrows()):
+                     if i >= len(axes):
+                         break  # prevent IndexError if more data than axes
+
+                     feature_x, feature_y = row["Feature1"], row["Feature2"]
+
+                     x_unique = data[feature_x].nunique()
+                     x = data[feature_x]
+                     if x_unique > 20:
+                         x = pd.qcut(x, q=10, duplicates='drop')
+
+                     sns.boxplot(x=x, y=data[feature_y], ax=axes[i])
+                     axes[i].set_title(f"{feature_y} vs {feature_x}")
+                     axes[i].set_xlabel(feature_x)
+                     axes[i].set_ylabel(feature_y)
+                     axes[i].tick_params(axis='x', rotation=45)
+             else:
+                 # Prepare boxplot layout
+                 num_features = len(selected_features)
+                 cols = 2 if num_features > 1 else 1
+                 rows = max((num_features // 2) + (num_features % 2), 1)
+
+                 fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4))
+                 axes = axes.flatten() if num_features > 1 else [axes]
+                 # AutoML: Plot boxplot of feature vs target column
+                 for i, feature in enumerate(selected_features):
+                     if feature != self.target_column:
+                         sns.boxplot(x=data[self.target_column], y=data[feature], ax=axes[i])
+                         axes[i].set_title(f"{feature}")
+                         axes[i].set_xlabel(self.target_column)
+                         axes[i].set_ylabel(feature)
+
+             plt.tight_layout()
+             plt.show()
+
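
Note that the heatmap uses the same triangular mask with k=0, which hides the diagonal and everything above it so each correlation is drawn only once. A minimal standalone version (illustrative only):

    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    data = pd.DataFrame(np.random.default_rng(0).normal(size=(50, 3)),
                        columns=["x", "y", "z"])
    corr = data.corr()
    # Cells where the mask is True are hidden; k=0 masks the diagonal as well.
    mask = np.triu(np.ones_like(corr, dtype=bool), k=0)
    sns.heatmap(corr, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.show()
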
+     def _scatter_plot(self, plot_data=None, max_selected_pairs=10, threshold=0.1):
+         """
+         DESCRIPTION:
+             Internal function to display scatterplots of selected numerical features.
+             Handles Clustering (feature vs feature).
+
+         PARAMETERS:
+             plot_data:
+                 Optional Argument.
+                 Specifies the input teradataml DataFrame for plotting scatter plots.
+                 Default Value: None. It will use the entire dataset passed for training.
+                 Types: teradataml DataFrame
+
+             max_selected_pairs:
+                 Optional Argument.
+                 Specifies the maximum number of feature pairs to select for scatter plots.
+                 Default Value: 10
+                 Types: int
+
+             threshold:
+                 Optional Argument.
+                 Specifies the minimum correlation threshold for feature pair selection.
+                 Default Value: 0.1
+                 Types: float
+         """
+         if self._check_visualization_libraries() and not _is_terminal():
+             import matplotlib.pyplot as plt
+             import seaborn as sns
+             import numpy as np
+
+             # Load data
+             data = plot_data.to_pandas().reset_index() if plot_data is not None else self.data.to_pandas().reset_index()
+
+             # Select numerical features
+             numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
+             if len(numerical_features) < 2:
+                 print("Not enough numerical features for scatter plots.")
+                 return
+
+             # Compute correlation matrix
+             corr_matrix = data[numerical_features].corr()
+
+             # Extract upper triangle (excluding diagonal)
+             mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
+             corr_vals = corr_matrix.where(mask).stack().reset_index()
+             corr_vals.columns = ['Feature1', 'Feature2', 'Correlation']
+             corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
+
+             # Sort and filter top pairs
+             corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)
+             filtered = corr_vals[corr_vals['Abs_Correlation'] > threshold].head(max_selected_pairs)
+
+             if len(filtered) < 2:
+                 filtered = corr_vals.head(min(2, len(corr_vals)))
+
+             if len(filtered) == 0:
+                 print("No correlated pairs found above threshold.")
+                 return
+
+             self._display_msg(msg="\nScatter Plots for Top Correlated Feature Pairs:", show_data=False)
+
+             # Plot scatter plots
+             for _, row in filtered.iterrows():
+                 feature_x, feature_y = row["Feature1"], row["Feature2"]
+
+                 plt.figure(figsize=(6, 4))
+                 sns.scatterplot(x=data[feature_x], y=data[feature_y], alpha=0.3)
+                 plt.xlabel(feature_x)
+                 plt.ylabel(feature_y)
+                 plt.title(f"Scatter Plot: {feature_x} vs {feature_y} (Corr: {row['Correlation']:.2f})")
+                 plt.tight_layout()
+                 plt.show()
+
+     def _ordinal_encoding(self,
+                           ordinal_columns):
+         """
+         DESCRIPTION:
+             Function performs ordinal encoding of categorical columns or features in the dataset.
+
+         PARAMETERS:
+             ordinal_columns:
+                 Required Argument.
+                 Specifies the categorical columns for which ordinal encoding will be performed.
+                 Types: str or list of strings (str)
+         """
+         # Setting volatile and persist parameters for performing encoding
+         volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+                                                          param_name="CategoricalEncodingParam")
+
+         # Adding fit parameters for performing encoding
+         fit_params = {
+             "data": self.data,
+             "target_column": ordinal_columns,
+             "volatile": volatile,
+             "persist": persist
+         }
+         # Performing ordinal encoding fit on target columns
+         ord_fit_obj = OrdinalEncodingFit(**fit_params)
+         # Storing fit object and column list for ordinal encoding in data transform dictionary
+         if ordinal_columns[0] != self.target_column:
+             self.data_transform_dict["custom_ord_encoding_fit_obj"] = ord_fit_obj.result
+             self.data_transform_dict['custom_ord_encoding_col'] = ordinal_columns
+         else:
+             self.data_transform_dict['target_col_encode_ind'] = True
+             self.data_transform_dict['target_col_ord_encoding_fit_obj'] = ord_fit_obj.result
+
+         # Extracting accumulate columns
+         accumulate_columns = self._extract_list(self.data.columns, ordinal_columns)
+         # Adding transform parameters for performing encoding
+         transform_params = {
+             "data": self.data,
+             "object": ord_fit_obj.result,
+             "accumulate": accumulate_columns,
+             "persist": True
+         }
+         # Disabling display table name if persist is True by default
+         if not volatile and not persist:
+             transform_params["display_table_name"] = False
+
+         # Setting persist to False if volatile is True
+         if volatile:
+             transform_params["volatile"] = True
+             transform_params["persist"] = False
+         # Performing ordinal encoding transformation
+         self.data = OrdinalEncodingTransform(**transform_params).result
+
+         if not volatile and not persist:
+             # Adding transformed data containing table to garbage collector
+             GarbageCollector._add_to_garbagecollector(self.data._table_name)
+
+         if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
+             self.target_label = ord_fit_obj
+
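
Outside of AutoML, the same fit/transform pairing that _ordinal_encoding wires up can be used directly. A rough sketch based only on the calls visible in this diff (the "customers" table and "city" column are hypothetical, and an active create_context() connection is assumed):

    from teradataml import DataFrame, OrdinalEncodingFit, OrdinalEncodingTransform

    df = DataFrame("customers")  # hypothetical table with a categorical "city" column

    fit = OrdinalEncodingFit(data=df, target_column="city")
    # Accumulate every other column so it passes through unchanged,
    # as _ordinal_encoding does via _extract_list.
    other_cols = [c for c in df.columns if c != "city"]
    encoded = OrdinalEncodingTransform(data=df,
                                       object=fit.result,
                                       accumulate=other_cols).result
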
+     def _extract_list(self,
+                       list1,
+                       list2):
+         """
+         DESCRIPTION:
+             Function to extract elements from list1 which are not present in list2.
+
+         PARAMETERS:
+             list1:
+                 Required Argument.
+                 Specifies the first list to extract elements from.
+                 Types: list
+
+             list2:
+                 Required Argument.
+                 Specifies the second list whose elements are excluded from the first list while extracting.
+                 Types: list
+
+         RETURNS:
+             Returns the extracted elements in the form of a list.
+         """
+         new_lst = list(set(list1) - set(list2))
+         return new_lst
+
+     def _get_generic_parameters(self,
+                                 func_indicator=None,
+                                 param_name=None):
+         """
+         DESCRIPTION:
+             Function to get generic parameters.
+
+         PARAMETERS:
+             func_indicator:
+                 Optional Argument.
+                 Specifies the name of the function indicator.
+                 Types: str
+
+             param_name:
+                 Optional Argument.
+                 Specifies the name of the param which contains generic parameters.
+                 Types: str
+
+         RETURNS:
+             Tuple containing volatile and persist parameters.
+         """
+         volatile = self.volatile
+         persist = self.persist
+         if self.custom_data is not None and self.custom_data.get(func_indicator, False):
+             volatile = self.custom_data[param_name].get("volatile", False)
+             persist = self.custom_data[param_name].get("persist", False)
+
+         return (volatile, persist)
+
      def _check_visualization_libraries(self):
          """
          DESCRIPTION:
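
For reference, _get_generic_parameters expects custom_data to carry a boolean indicator plus a parameter object per function. Based solely on the keys read above, the relevant fragment would look roughly like this (shape inferred from this diff, not from the documented AutoML custom-config schema):

    custom_data = {
        "CategoricalEncodingIndicator": True,
        "CategoricalEncodingParam": {
            "volatile": False,  # set True to create transformed tables as volatile tables
            "persist": True     # set True to keep transformed tables after the session
        }
    }
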
@@ -287,8 +810,8 @@
      def _outlier_detection(self,
                             outlier_method,
                             column_list,
-                            lower_percentile = None,
-                            upper_percentile = None):
+                            lower_percentile=None,
+                            upper_percentile=None):
          """
          DESCRIPTION:
              Function detects the outliers in numerical columns and displays their percentage.
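
As context for this hunk, the "Tukey" method that _outlier_detection defaults to flags values outside the classic 1.5 * IQR fences. A standalone pandas illustration of those bounds (not the teradataml OutlierFilterFit implementation itself):

    import pandas as pd

    s = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 5, 100])
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    outliers = s[(s < lower) | (s > upper)]
    print(f"{len(outliers) / len(s):.0%} outliers")  # only the value 100 is flagged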