teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# ##################################################################
|
|
2
2
|
#
|
|
3
|
-
# Copyright
|
|
3
|
+
# Copyright 2025 Teradata. All rights reserved.
|
|
4
4
|
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
5
|
#
|
|
6
6
|
# Primary Owner: Sweta Shaw
|
|
@@ -24,11 +24,13 @@ from teradataml.dataframe.dataframe import DataFrame
|
|
|
24
24
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
25
25
|
from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns
|
|
26
26
|
from teradataml import OutlierFilterFit, OutlierFilterTransform
|
|
27
|
+
from teradataml import OrdinalEncodingFit, OrdinalEncodingTransform
|
|
27
28
|
from teradataml.hyperparameter_tuner.utils import _ProgressBar
|
|
28
29
|
from teradataml.common.messages import Messages, MessageCodes
|
|
29
30
|
from teradataml import display as dp
|
|
30
31
|
from teradataml.utils.validators import _Validators
|
|
31
32
|
from teradataml.common.utils import UtilFuncs
|
|
33
|
+
from teradataml.common.garbagecollector import GarbageCollector
|
|
32
34
|
|
|
33
35
|
def _is_terminal():
|
|
34
36
|
"""
|
|
@@ -59,10 +61,15 @@ if not _is_terminal():
|
|
|
59
61
|
class _FeatureExplore:
|
|
60
62
|
|
|
61
63
|
def __init__(self,
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
data=None,
|
|
65
|
+
target_column=None,
|
|
66
|
+
custom_data=None,
|
|
67
|
+
verbose=0,
|
|
68
|
+
task_type='regression',
|
|
69
|
+
fraud=False,
|
|
70
|
+
churn=False,
|
|
71
|
+
cluster=False,
|
|
72
|
+
**kwargs):
|
|
66
73
|
"""
|
|
67
74
|
DESCRIPTION:
|
|
68
75
|
Internal function initializes the data, target column for feature exploration.
|
|
@@ -75,9 +82,15 @@ class _FeatureExplore:
|
|
|
75
82
|
|
|
76
83
|
target_column:
|
|
77
84
|
Required Arugment.
|
|
85
|
+
Set to None for Clustering
|
|
78
86
|
Specifies the name of the target column in "data".
|
|
79
87
|
Types: str
|
|
80
|
-
|
|
88
|
+
|
|
89
|
+
custom_data:
|
|
90
|
+
Optional Argument.
|
|
91
|
+
Specifies json object containing user customized input.
|
|
92
|
+
Types: json object
|
|
93
|
+
|
|
81
94
|
verbose:
|
|
82
95
|
Optional Argument.
|
|
83
96
|
Specifies the detailed execution steps based on verbose level.
|
|
@@ -96,13 +109,38 @@ class _FeatureExplore:
|
|
|
96
109
|
* 'regression'
|
|
97
110
|
* 'classification'
|
|
98
111
|
Types: str
|
|
112
|
+
|
|
113
|
+
fraud:
|
|
114
|
+
Optional Argument.
|
|
115
|
+
Specifies whether to apply fraud detection techniques.
|
|
116
|
+
Default Value: False
|
|
117
|
+
Types: bool
|
|
118
|
+
|
|
119
|
+
churn:
|
|
120
|
+
Optional Argument.
|
|
121
|
+
Specifies whether to apply churn prediction techniques.
|
|
122
|
+
Default Value: False
|
|
123
|
+
Types: bool
|
|
124
|
+
|
|
125
|
+
cluster:
|
|
126
|
+
Optional Argument.
|
|
127
|
+
Specifies whether to apply clustering techniques.
|
|
128
|
+
Default Value: False
|
|
129
|
+
Types: bool
|
|
99
130
|
"""
|
|
100
131
|
self.data = data
|
|
101
132
|
self.target_column = target_column
|
|
102
133
|
self.verbose = verbose
|
|
134
|
+
self.custom_data = custom_data
|
|
135
|
+
self.data_transform_dict = {}
|
|
136
|
+
self.data_types = {key: value for key, value in self.data._column_names_and_types}
|
|
103
137
|
self.terminal_print = _is_terminal()
|
|
104
138
|
self.style = self._common_style()
|
|
105
139
|
self.task_type = task_type
|
|
140
|
+
|
|
141
|
+
self.fraud = fraud
|
|
142
|
+
self.churn = churn
|
|
143
|
+
self.cluster = cluster
|
|
106
144
|
|
|
107
145
|
def _exploration(self,
|
|
108
146
|
**kwargs):
|
|
@@ -113,8 +151,12 @@ class _FeatureExplore:
|
|
|
113
151
|
2. Statistics of numeric columns of the dataset
|
|
114
152
|
3. Categorical column summary
|
|
115
153
|
4. Futile columns in the dataset
|
|
116
|
-
5. Target column distribution
|
|
154
|
+
5. Target column distribution, not applicable for Clustering task_type
|
|
117
155
|
6. Outlier Percentage in numeric columns of the dataset
|
|
156
|
+
7. Heatmap of Numerical Features
|
|
157
|
+
8. Boxplots of Feature Distribution
|
|
158
|
+
9. Countplot of Categorical features
|
|
159
|
+
10.Scatterplot for selected features for Clustering task_type
|
|
118
160
|
"""
|
|
119
161
|
numerical_columns = []
|
|
120
162
|
categorical_columns= []
|
|
@@ -142,7 +184,7 @@ class _FeatureExplore:
|
|
|
142
184
|
# Displaying date columns
|
|
143
185
|
if len(date_column_list)!=0:
|
|
144
186
|
self._display_msg(msg='Identified Date Columns:',
|
|
145
|
-
|
|
187
|
+
data=date_column_list)
|
|
146
188
|
|
|
147
189
|
# Column Summary of each feature of data
|
|
148
190
|
# such as null count, datatype, non null count
|
|
@@ -155,14 +197,30 @@ class _FeatureExplore:
|
|
|
155
197
|
if len(categorical_columns) != 0:
|
|
156
198
|
categorical_obj = self._categorical_summary(categorical_columns)
|
|
157
199
|
self._futile_column(categorical_obj)
|
|
200
|
+
|
|
201
|
+
if not self.cluster:
|
|
202
|
+
# Plot a graph of target column
|
|
203
|
+
self._target_column_details()
|
|
158
204
|
|
|
159
|
-
# Plot a graph of target column
|
|
160
|
-
self._target_column_details()
|
|
161
205
|
|
|
162
206
|
# Displays outlier percentage
|
|
163
|
-
|
|
164
|
-
|
|
207
|
+
if self.fraud or self.churn:
|
|
208
|
+
outlier_method = "percentile"
|
|
209
|
+
df = self._outlier_detection(numerical_columns, outlier_method)
|
|
210
|
+
else:
|
|
211
|
+
outlier_method = "Tukey"
|
|
212
|
+
df = self._outlier_detection(outlier_method, numerical_columns)
|
|
165
213
|
|
|
214
|
+
|
|
215
|
+
if self.fraud or self.churn or self.cluster:
|
|
216
|
+
# Boxplots and Heatmap for feature distribution by target column
|
|
217
|
+
self._boxplot_heatmap()
|
|
218
|
+
|
|
219
|
+
# Countplots for feature distribution by target column
|
|
220
|
+
self._countplot_categorical_distribution()
|
|
221
|
+
if self.cluster:
|
|
222
|
+
self._scatter_plot()
|
|
223
|
+
|
|
166
224
|
def _statistics(self):
|
|
167
225
|
"""
|
|
168
226
|
DESCRIPTION:
|
|
@@ -172,8 +230,7 @@ class _FeatureExplore:
|
|
|
172
230
|
self._display_msg(msg='\nStatistics of Data:',
|
|
173
231
|
data=self.data.describe(),
|
|
174
232
|
show_data=True)
|
|
175
|
-
|
|
176
|
-
|
|
233
|
+
|
|
177
234
|
def _column_summary(self):
|
|
178
235
|
"""
|
|
179
236
|
DESCRIPTION:
|
|
@@ -228,7 +285,7 @@ class _FeatureExplore:
|
|
|
228
285
|
PARAMETERS:
|
|
229
286
|
categorical_obj:
|
|
230
287
|
Required Argument.
|
|
231
|
-
Specifies the instance of CategoricalSummary for futile column detection
|
|
288
|
+
Specifies the instance of CategoricalSummary for futile column detection.
|
|
232
289
|
Types: Instance of CategoricalSummary
|
|
233
290
|
"""
|
|
234
291
|
# Futile columns detection using categorical column object
|
|
@@ -248,23 +305,489 @@ class _FeatureExplore:
|
|
|
248
305
|
data=gfc_out.result,
|
|
249
306
|
show_data=True)
|
|
250
307
|
|
|
251
|
-
def _target_column_details(self
|
|
308
|
+
def _target_column_details(self,
                           plot_data=None):
    """
    DESCRIPTION:
        Internal function displays the target column distribution of Target column/ Response column
        as a density histogram. Nothing is plotted unless the visualization
        libraries check passes and _is_terminal() returns False.

    PARAMETERS:
        plot_data:
            Optional Argument.
            Specifies the data whose target column is plotted. When omitted,
            the target column of the data held by this instance is used.
            Any object exposing "to_pandas()" is converted first; anything
            else is indexed directly as a pandas DataFrame.
            Default Value: None
            Types: teradataml DataFrame or pandas DataFrame

    RETURNS:
        None
    """
    if not self._check_visualization_libraries() or _is_terminal():
        return

    import matplotlib.pyplot as plt

    source = self.data if plot_data is None else plot_data
    if hasattr(source, "to_pandas"):
        # Pull only the target column to the client before plotting.
        target_data = source.select([self.target_column]).to_pandas()
    else:
        # Already a client-side frame: just project the target column.
        target_data = source[[self.target_column]]

    self._display_msg(msg='\nTarget Column Distribution:',
                      show_data=True)
    plt.figure(figsize=(8, 6))
    # Plotting a histogram for target column
    plt.hist(target_data, bins=10, density=True, edgecolor='black')
    plt.xlabel(self.target_column)
    plt.ylabel('Density')
    plt.show()
|
|
335
|
+
|
|
336
|
+
def _countplot_categorical_distribution(self, plot_data=None, top_n=20, max_unique_threshold=50):
    """
    DESCRIPTION:
        Function to plot count plots for categorical features based on the target column.
        Limits the number of unique categories to avoid messy visuals: categories
        outside the "top_n" most frequent ones are grouped into a single "Other" bar.
        For clustering (self.cluster is True) the plots are not split by target column.

    PARAMETERS:
        plot_data:
            Optional Argument.
            Specifies the input data for plotting distribution. Objects exposing
            "to_pandas()" are converted; a pandas DataFrame is copied so the
            caller's frame is never modified.
            Default Value: None. It will use entire dataset passed for training.
            Types: teradataml DataFrame or pandas DataFrame

        top_n:
            Optional Argument.
            Maximum number of individual categories to display per feature.
            Default Value: 20
            Types: int

        max_unique_threshold:
            Optional Argument.
            Only plot features with unique values below or equal to this threshold.
            Default Value: 50
            Types: int

    RETURNS:
        None
    """
    if not self._check_visualization_libraries() or _is_terminal():
        return

    import matplotlib.pyplot as plt
    import seaborn as sns

    # Prepare a client-side frame that is safe to modify.
    if plot_data is None:
        data = self.data.to_pandas().reset_index()
    elif hasattr(plot_data, "to_pandas"):
        data = plot_data.to_pandas().reset_index()
    else:
        # Rare categories are relabelled below; do that on a copy.
        data = plot_data.copy()

    target_column = self.target_column
    split_by_target = not self.cluster

    # Categorical features, minus the target (when one is used) and minus
    # features with too many distinct values.
    categorical_features = [
        col for col in data.select_dtypes(include=['object', 'category']).columns
        if not (split_by_target and col == target_column)
        and data[col].nunique() <= max_unique_threshold
    ]

    if not categorical_features:
        print("No categorical columns found with unique values within the threshold.")
        return

    self._display_msg(msg='\nCategorical Feature Distributions by Target Column (Count Plots):',
                      show_data=False)

    for feature in categorical_features:
        plt.figure(figsize=(10, 6))

        # value_counts() already yields unique categories, most frequent first.
        top_categories = data[feature].value_counts().nlargest(top_n).index.tolist()

        # Replace less frequent (and missing) categories with "Other".
        is_top = data[feature].isin(top_categories)
        data[feature] = data[feature].astype(object).where(is_top, "Other")

        # Categories missing from "order" are not drawn, so list "Other" explicitly.
        order = list(top_categories)
        if not is_top.all() and "Other" not in order:
            order.append("Other")

        # Generate count plot
        if split_by_target:
            cntplot = sns.countplot(data=data, x=feature, hue=target_column, order=order)
        else:
            cntplot = sns.countplot(data=data, x=feature, order=order)

        for patch in cntplot.patches:
            height = patch.get_height()
            if height > 0:  # Only display if height is greater than 0
                cntplot.annotate(f'{int(height)}',
                                 (patch.get_x() + patch.get_width() / 2, height),
                                 ha='center', va='bottom', fontsize=10, fontweight='bold')

        if split_by_target:
            plt.title(f"Distribution of {feature} by {target_column}")
        else:
            plt.title(f"Distribution of {feature}")
        plt.xlabel(feature)
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha='right')  # Improve label visibility
        if split_by_target:
            plt.legend(title=target_column)
        plt.tight_layout()
        plt.show()
|
|
428
|
+
|
|
429
|
+
def _correlation(self, data, threshold=0.1, max_features=10, min_features=2):
    """
    DESCRIPTION:
        Function to calculate the correlation values between features.
        For clustering (self.cluster is True) every pair of numerical features
        is ranked by absolute correlation. Otherwise each numerical feature is
        ranked by its absolute correlation with the target column.
        Only 'float64' and 'int64' columns are treated as numerical.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for correlation analysis.
            Types: pandas DataFrame

        threshold:
            Optional Argument.
            Specifies the minimum correlation threshold for feature selection.
            Default Value: 0.1
            Types: float

        max_features:
            Optional Argument.
            Specifies the maximum number of features to select.
            Default Value: 10
            Types: int

        min_features:
            Optional Argument.
            Specifies the minimum number of features (feature pairs for
            clustering) to select as fallback when fewer than this many
            pass the threshold.
            Default Value: 2
            Types: int

    RETURNS:
        Clustering:
            Tuple (filtered pairs DataFrame, selected features list,
            correlation matrix of the selected features, selection criteria str).
        Otherwise:
            Tuple (selected features list including the target column,
            correlation matrix of the selected features, selection criteria str).
    """
    from itertools import combinations

    import pandas as pd

    numerical_features = list(data.select_dtypes(include=['float64', 'int64']).columns)

    if self.cluster:
        # Clustering: feature vs feature correlation
        corr_matrix = data[numerical_features].corr()

        # One row per unordered pair (upper triangle without the diagonal).
        pairs = [(first, second, corr_matrix.loc[first, second])
                 for first, second in combinations(numerical_features, 2)]
        corr_vals = pd.DataFrame(pairs, columns=['Feature1', 'Feature2', 'Correlation'])
        corr_vals['Correlation'] = corr_vals['Correlation'].astype(float)
        corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
        # Undefined correlations (e.g. constant columns) cannot be ranked.
        corr_vals = corr_vals.dropna(subset=['Correlation'])
        corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)

        filtered = corr_vals[corr_vals['Abs_Correlation'] > threshold].head(max_features)
        selection_criteria = "Top Correlated Feature Pairs"

        if len(filtered) < min_features:
            fallback_count = min(min_features, len(corr_vals))
            filtered = corr_vals.head(fallback_count)
            selection_criteria = f"Top {fallback_count} Correlated Feature Pairs (Fallback)"

        # Merge unique features from pairs, strongest pair first, so that the
        # truncation below is deterministic and keeps the best-ranked features.
        ranked = []
        for first, second in zip(filtered['Feature1'], filtered['Feature2']):
            ranked.extend((first, second))
        selected_features = list(dict.fromkeys(ranked))[:max_features]
        corr_matrix = data[selected_features].corr()

        return filtered, selected_features, corr_matrix, selection_criteria

    # AutoML: correlation with target column; the target itself is not a feature.
    numerical_features = [col for col in numerical_features if col != self.target_column]

    correlation_df = data[numerical_features].corrwith(data[self.target_column]).reset_index()
    correlation_df.columns = ['Feature', 'Correlation']
    correlation_df['Abs_Correlation'] = correlation_df['Correlation'].abs()
    correlation_df = correlation_df.sort_values(by='Abs_Correlation', ascending=False)

    filtered = correlation_df[correlation_df['Abs_Correlation'] > threshold].head(max_features)
    selection_criteria = "Features above threshold correlation with target"

    if len(filtered) < min_features:
        fallback_count = min(min_features, len(numerical_features))
        filtered = correlation_df.head(fallback_count)
        selection_criteria = f"Top {fallback_count} Correlated Features (Fallback)"

    # Preserve ranking order, append the target once.
    selected_features = list(dict.fromkeys(filtered['Feature'].tolist() + [self.target_column]))
    corr_matrix = data[selected_features].corr()

    return selected_features, corr_matrix, selection_criteria
|
|
511
|
+
|
|
512
|
+
def _boxplot_heatmap(self, plot_data=None):
    """
    DESCRIPTION:
        Internal function to display heatmap and boxplots of selected numerical features.
        Handles both AutoML (feature vs target) and Clustering (feature vs feature).
        When no "plot_data" is given and the target column type is recorded as
        'str', the target column is ordinal encoded first, which replaces the
        data held by this instance with the encoded data.

    PARAMETERS:
        plot_data:
            Optional Argument.
            Specifies the data to be plotted.
            Default Value: None. It will use entire dataset passed for training.
            Types: teradataml DataFrame.

    RETURNS:
        None
    """
    if not self._check_visualization_libraries() or _is_terminal():
        return

    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np
    import pandas as pd

    # Get DataFrame
    if plot_data is not None:
        data = plot_data.to_pandas().reset_index()
    else:
        # Perform ordinal encoding if needed for classification
        if not self.cluster and self.data_types.get(self.target_column) in ['str']:
            self._ordinal_encoding([self.target_column])
        data = self.data.to_pandas().reset_index()

    # Get selected features and correlation matrix
    if self.cluster:
        pairs, selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
    else:
        selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
        pairs = None

    # Display heatmap; the upper triangle and the diagonal are masked out.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=0)
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Heatmap of Selected Features")
    plt.show()

    num_features = len(selected_features)
    self._display_msg(msg=f'\nNumber of features selected for Boxplots: {num_features}', show_data=False)
    self._display_msg(msg=f'\nSelection Criteria: {selection_criteria}', show_data=False)
    self._display_msg(msg=f'\nSelected Features: {", ".join(selected_features)}', show_data=False)
    self._display_msg(msg='\nBoxplots:', show_data=False)

    # One panel per (x feature, y feature) combination.
    if self.cluster:
        panels = [(row["Feature1"], row["Feature2"]) for _, row in pairs.iterrows()]
    else:
        # AutoML: boxplot of every selected feature against the target column.
        panels = [(self.target_column, feature)
                  for feature in selected_features if feature != self.target_column]

    if not panels:
        # Nothing to draw; an empty grid cannot be created.
        return

    # Prepare boxplot layout sized to the panels actually drawn.
    cols = 2 if len(panels) > 1 else 1
    rows = (len(panels) + cols - 1) // cols
    _fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, (feature_x, feature_y) in zip(axes, panels):
        x_values = data[feature_x]
        if self.cluster:
            # Bin high-cardinality x features into deciles to keep the plot readable.
            if x_values.nunique() > 20:
                x_values = pd.qcut(x_values, q=10, duplicates='drop')
            title = f"{feature_y} vs {feature_x}"
        else:
            title = f"{feature_y}"

        sns.boxplot(x=x_values, y=data[feature_y], ax=ax)
        ax.set_title(title)
        ax.set_xlabel(feature_x)
        ax.set_ylabel(feature_y)
        if self.cluster:
            ax.tick_params(axis='x', rotation=45)

    # Hide the spare axis left over when the panel count is odd.
    for ax in axes[len(panels):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
|
|
603
|
+
|
|
604
|
+
def _scatter_plot(self, plot_data=None, max_selected_pairs=10, threshold=0.1):
    """
    DESCRIPTION:
        Internal function to display scatterplots of selected numerical features.
        Handles Clustering (feature vs feature): feature pairs are ranked by
        absolute correlation and one scatter plot is drawn per selected pair.

    PARAMETERS:
        plot_data:
            Optional Argument.
            Specifies the input teradataml DataFrame for plotting scatter plots.
            Default Value: None. It will use entire dataset passed for training.
            Types: teradataml DataFrame

        max_selected_pairs:
            Optional Argument.
            Specifies the maximum number of feature pairs to select for scatter plots.
            Default Value: 10
            Types: int

        threshold:
            Optional Argument.
            Specifies the minimum correlation threshold for feature pair selection.
            Default Value: 0.1
            Types: float

    RETURNS:
        None
    """
    if not self._check_visualization_libraries() or _is_terminal():
        return

    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    # Load data
    source = self.data if plot_data is None else plot_data
    data = source.to_pandas().reset_index()

    # Select numerical features
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols) < 2:
        print("Not enough numerical features for scatter plots.")
        return

    # Compute correlation matrix and keep the upper triangle (excluding diagonal)
    corr_matrix = data[numeric_cols].corr()
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
    pair_table = corr_matrix.where(upper_triangle).stack().reset_index()
    pair_table.columns = ['Feature1', 'Feature2', 'Correlation']
    pair_table['Abs_Correlation'] = pair_table['Correlation'].abs()

    # Sort and filter top pairs
    pair_table = pair_table.sort_values(by='Abs_Correlation', ascending=False)
    above_threshold = pair_table['Abs_Correlation'] > threshold
    chosen = pair_table[above_threshold].head(max_selected_pairs)

    # Fall back to the two strongest pairs when too few pass the threshold.
    if len(chosen) < 2:
        chosen = pair_table.head(min(2, len(pair_table)))

    if len(chosen) == 0:
        print("No correlated pairs found above threshold.")
        return

    self._display_msg(msg="\nScatter Plots for Top Correlated Feature Pairs:", show_data=False)

    # Plot scatter plots
    for _, pair in chosen.iterrows():
        feature_x = pair["Feature1"]
        feature_y = pair["Feature2"]

        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=data[feature_x], y=data[feature_y], alpha=0.3)
        plt.xlabel(feature_x)
        plt.ylabel(feature_y)
        plt.title(f"Scatter Plot: {feature_x} vs {feature_y} (Corr: {pair['Correlation']:.2f})")
        plt.tight_layout()
        plt.show()
|
|
676
|
+
|
|
677
|
+
def _ordinal_encoding(self,
                      ordinal_columns):
    """
    DESCRIPTION:
        Function performs the ordinal encoding to categorical columns or features in the dataset.
        The data held by this instance is replaced with the encoded data, and the
        fit result is recorded in the data transform dictionary.

    PARAMETERS:
        ordinal_columns:
            Required Argument.
            Specifies the categorical columns for which ordinal encoding will be performed.
            A single column name is treated as a one-element list.
            Types: str or list of strings (str)

    RETURNS:
        None
    """
    # A bare string would otherwise be indexed and iterated character by character.
    if isinstance(ordinal_columns, str):
        ordinal_columns = [ordinal_columns]

    # Setting volatile and persist parameters for performing encoding
    volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
                                                     param_name="CategoricalEncodingParam")

    # Adding fit parameters for performing encoding
    fit_params = {
        "data": self.data,
        "target_column": ordinal_columns,
        "volatile": volatile,
        "persist": persist
    }
    # Performing ordinal encoding fit on target columns
    ord_fit_obj = OrdinalEncodingFit(**fit_params)

    # Storing fit object and column list for ordinal encoding in data transform dictionary
    if ordinal_columns[0] != self.target_column:
        self.data_transform_dict["custom_ord_encoding_fit_obj"] = ord_fit_obj.result
        self.data_transform_dict['custom_ord_encoding_col'] = ordinal_columns
    else:
        self.data_transform_dict['target_col_encode_ind'] = True
        self.data_transform_dict['target_col_ord_encoding_fit_obj'] = ord_fit_obj.result

    # Extracting accumulate columns (every column that is not being encoded)
    accumulate_columns = self._extract_list(self.data.columns, ordinal_columns)

    # Adding transform parameters for performing encoding.
    # The transform output is persisted unless a volatile table was requested.
    transform_params = {
        "data": self.data,
        "object": ord_fit_obj.result,
        "accumulate": accumulate_columns,
        "persist": True
    }
    # Table was persisted only by default (not requested by the user)
    internal_table = not volatile and not persist

    # Disabling display table name if persist is True by default
    if internal_table:
        transform_params["display_table_name"] = False

    # Setting persist to False if volatile is True
    if volatile:
        transform_params["volatile"] = True
        transform_params["persist"] = False

    # Performing ordinal encoding transformation
    self.data = OrdinalEncodingTransform(**transform_params).result

    if internal_table:
        # Adding transformed data containing table to garbage collector
        GarbageCollector._add_to_garbagecollector(self.data._table_name)

    # Keep the fit object when only the target column was encoded.
    if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
        self.target_label = ord_fit_obj
|
|
736
|
+
|
|
737
|
+
def _extract_list(self,
                  list1,
                  list2):
    """
    DESCRIPTION:
        Function to extract elements from list1 which are not present in list2.
        Duplicates are removed and the order of first appearance in list1 is
        preserved, so the result is reproducible across runs.

    PARAMETERS:
        list1:
            Required Argument.
            Specifies the first list for extracting elements from.
            Types: list

        list2:
            Required Argument.
            Specifies the second list to get elements for avoiding in first list while extracting.
            Types: list

    RETURNS:
        Returns extracted elements in form of list.

    """
    # Set membership keeps the filtering linear in the size of both lists.
    excluded = set(list2)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(item for item in list1 if item not in excluded))
|
|
761
|
+
|
|
762
|
+
def _get_generic_parameters(self,
                            func_indicator=None,
                            param_name=None):
    """
    DESCRIPTION:
        Function to get generic parameters.
        Instance-level "volatile" and "persist" values are used by default
        (False when the instance does not define them). When the custom data
        enables "func_indicator" and contains a "param_name" section, the values
        from that section are used instead, each defaulting to False.

    PARAMETERS:
        func_indicator:
            Optional Argument.
            Specifies the name of function indicator.
            Types: str

        param_name:
            Optional Argument.
            Specifies the name of the param which contains generic parameters.
            Types: str

    RETURNS:
        Tuple containing volatile and persist parameters.
    """
    # The attributes may never have been assigned on this instance.
    volatile = getattr(self, "volatile", False)
    persist = getattr(self, "persist", False)

    custom_data = self.custom_data
    if custom_data is not None and custom_data.get(func_indicator, False):
        # A missing parameter section leaves the instance-level values in place.
        section = custom_data.get(param_name)
        if section is not None:
            volatile = section.get("volatile", False)
            persist = section.get("persist", False)

    return (volatile, persist)
|
|
790
|
+
|
|
268
791
|
def _check_visualization_libraries(self):
|
|
269
792
|
"""
|
|
270
793
|
DESCRIPTION:
|
|
@@ -287,8 +810,8 @@ class _FeatureExplore:
|
|
|
287
810
|
def _outlier_detection(self,
|
|
288
811
|
outlier_method,
|
|
289
812
|
column_list,
|
|
290
|
-
lower_percentile
|
|
291
|
-
upper_percentile
|
|
813
|
+
lower_percentile=None,
|
|
814
|
+
upper_percentile=None):
|
|
292
815
|
"""
|
|
293
816
|
DESCRIPTION:
|
|
294
817
|
Function detects the outlier in numerical column and display thier percentage.
|