teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/automl/feature_engineering.py +113 -53

```diff
@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Sweta Shaw
```
```diff
@@ -51,9 +51,9 @@ class _FeatureEngineering:
                  data,
                  target_column,
                  model_list,
-                 verbose
-                 task_type
-                 custom_data
+                 verbose=0,
+                 task_type="Regression",
+                 custom_data=None,
                  **kwargs):
         """
         DESCRIPTION:
```
```diff
@@ -88,10 +88,10 @@ class _FeatureEngineering:

             task_type:
                 Required Argument.
-                Specifies the task type for AutoML, whether to apply regresion OR classification
+                Specifies the task type for AutoML, whether to apply regresion OR classification OR clustering
                 on the provived dataset.
                 Default Value: "Regression"
-                Permitted Values: "Regression", "Classification"
+                Permitted Values: "Regression", "Classification", "Clustering"
                 Types: str

             custom_data:
```
```diff
@@ -120,6 +120,30 @@ class _FeatureEngineering:
                 session.
                 Default Value: False
                 Types: bool
+
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
+
+            progress_prefix:
+                Optional Argument.
+                Specifies the prefix for the progress bar messages.
+                Default Value: None
+                Types: str.
+
+            automl_phases:
+                Optional Argument.
+                Specifies the phase of AutoML to be executed.
+                Default Value: None
+                Types: str or list of str.
+
+            auto_dataprep:
+                Optional Argument.
+                Specifies whether to run AutoDataPrep workflow.
+                Default Value: False
+                Types: bool
         """
         # Instance variables
         self.data = data
```
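The four keyword arguments documented above are read from `**kwargs` in the constructor hunk that follows. Below is a minimal sketch of a clustering-style call; `_FeatureEngineering` is an internal class, and the table name, model name, phase label, and the use of `target_column=None` are illustrative assumptions rather than documented usage:

```python
from teradataml import DataFrame
from teradataml.automl.feature_engineering import _FeatureEngineering

# Hypothetical clustering run; argument values are illustrative only.
fe = _FeatureEngineering(data=DataFrame("sales"),
                         target_column=None,          # assumed: no target when clustering
                         model_list=["kmeans"],       # illustrative model name
                         task_type="Clustering",      # newly permitted value
                         cluster=True,                # enables the clustering code paths
                         progress_prefix="AutoML: ",  # prefix for progress bar messages
                         automl_phases=["feature_engineering"],  # illustrative phase label
                         auto_dataprep=False)         # do not run the AutoDataPrep workflow
```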
```diff
@@ -131,16 +155,18 @@ class _FeatureEngineering:
         self.excluded_cols=[]
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.target_label = None
-
+
         self.one_hot_obj_count = 0
         self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
         self.persist = kwargs.get('persist', False)
         self.volatile = kwargs.get('volatile', False) or (configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE and self.persist is False)
+        self.cluster = kwargs.get('cluster', False)

         self.data_mapping = {}
         self.progress_prefix = kwargs.get('progress_prefix', None)
         self.aml_phases = kwargs.get('automl_phases', None)
-
+        self.auto_dataprep = kwargs.get('auto_dataprep', False)
+
     # Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
     def feature_engineering(self,
                             auto=True):
```
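The `volatile` assignment above combines three inputs: an explicit `volatile` kwarg, the session-wide `configure.temp_object_type` default, and the `persist` flag. A standalone restatement of that rule, with the `TeradataConstants` enum member reduced to a plain string for illustration:

```python
def resolve_volatile(kwargs, temp_object_type, persist):
    # Mirrors the __init__ expression: an explicit volatile=True wins;
    # otherwise volatile tables are used when the session default selects
    # them and the caller did not ask to persist results.
    return kwargs.get('volatile', False) or (
        temp_object_type == "TERADATA_VOLATILE_TABLE" and persist is False)

assert resolve_volatile({}, "TERADATA_VOLATILE_TABLE", persist=False) is True
assert resolve_volatile({}, "TERADATA_VOLATILE_TABLE", persist=True) is False
assert resolve_volatile({'volatile': True}, "PERMANENT", persist=False) is True
```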
```diff
@@ -165,13 +191,25 @@ class _FeatureEngineering:
             second element represents list of columns which are not participating in outlier tranformation.
         """
         # Assigning number of base jobs for progress bar.
-
+        if self.cluster:
+            base_jobs = 11 if auto else 15
+        else:
+            base_jobs = 12 if auto else 17

         # Updating model list based on distinct value of target column for classification type
         if self.is_classification_type():
             if self.data.drop_duplicate(self.target_column).size > 2:
-                unsupported_models = ['svm', 'glm']
+                unsupported_models = ['svm', 'glm'] # Models that don't support multiclass
+                for model in unsupported_models:
+                    if model in self.model_list:
+                        self._display_msg(inline_msg="\nMulti-class classification is "
+                                                     "not supported by {} model. Skipping {} model."
+                                                     .format(model, model))
                 self.model_list = [model for model in self.model_list if model not in unsupported_models]
+
+                # After filtering models like glm/svm due to multiclass
+                if not self.auto_dataprep:
+                    _Validators._validate_non_empty_list_or_valid_selection(self.model_list, "List of models")

         # Updating number of jobs for progress bar based on number of models.
         jobs = base_jobs + len(self.model_list)
```
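The new loop emits one skip message per affected model before the pre-existing list comprehension removes them. The same behavior as a standalone function, with `print` standing in for `_display_msg`:

```python
def filter_multiclass_models(model_list, n_classes):
    """Drop models that cannot handle more than two target classes."""
    unsupported_models = ['svm', 'glm']  # as in the hunk above
    if n_classes > 2:
        for model in unsupported_models:
            if model in model_list:
                print("Multi-class classification is not supported by "
                      "{} model. Skipping {} model.".format(model, model))
        model_list = [m for m in model_list if m not in unsupported_models]
    return model_list

print(filter_multiclass_models(['svm', 'glm', 'xgboost'], n_classes=3))
# -> ['xgboost']
```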
```diff
@@ -187,13 +225,24 @@ class _FeatureEngineering:
                           progress_bar=self.progress_bar)

         # Storing target column to data transform dictionary
-
+        # Setting target column for supervised learning, for clustering it will be None.
+        if not self.cluster:
+            self.data_transform_dict['data_target_column'] = self.target_column
+        else:
+            self.data_transform_dict['data_target_column'] = None
+
         # Storing target column encoding indicator to data transform dictionary
-        self.data_transform_dict
+        if "target_col_encode_ind" not in self.data_transform_dict:
+            self.data_transform_dict["target_col_encode_ind"] = False
+
+
         # Storing task type to data transform dictionary
-        self.
+        if not self.cluster:
+            self.data_transform_dict['classification_type'] = self.is_classification_type()
+        else:
+            self.data_transform_dict['classification_type'] = False
         # Storing params for performing one hot encoding
-        self.data_transform_dict['one_hot_encoding_fit_obj'] ={}
+        self.data_transform_dict['one_hot_encoding_fit_obj'] = {}
         self.data_transform_dict['one_hot_encoding_drop_list'] = []

         if auto:
```
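After this hunk the transform dictionary always carries a target column, an encoding indicator, and a task-type flag, with clustering runs forcing the target to `None` and the classification flag to `False`. The resulting entries, sketched with an illustrative column name:

```python
# Supervised run (cluster=False, task_type="Classification"):
data_transform_dict = {
    'data_target_column': 'churn_flag',   # illustrative column name
    'target_col_encode_ind': False,       # set only when not already present
    'classification_type': True,
    'one_hot_encoding_fit_obj': {},
    'one_hot_encoding_drop_list': [],
}

# Clustering run (cluster=True): no target column, never classification.
data_transform_dict = {
    'data_target_column': None,
    'target_col_encode_ind': False,
    'classification_type': False,
    'one_hot_encoding_fit_obj': {},
    'one_hot_encoding_drop_list': [],
}
```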
```diff
@@ -333,8 +382,8 @@ class _FeatureEngineering:
         # Removing id column, if exists
         if len(columns_to_be_removed) != 0:
             self.data = self.data.drop(columns_to_be_removed, axis=1)
-            # Storing
-            self.data_transform_dict['
+            # Storing irrelevant column list in data transform dictionary
+            self.data_transform_dict['drop_irrelevant_columns'] = columns_to_be_removed

         # Adding id columns
         obj = FillRowId(data=self.data, row_id_column='id')
```
```diff
@@ -355,18 +404,21 @@ class _FeatureEngineering:

         # Handling string type target column in classification
         # Performing Ordinal Encoding
-        if self.
-            self.
+        if not self.cluster:
+            if self.data_types[self.target_column] in ['str']:
+                self._ordinal_encoding([self.target_column])

         # Detecting categorical columns
         categorical_columns = [col for col, d_type in self.data._column_names_and_types if d_type == 'str']

         # Detecting and removing futile columns, if categorical_column exists
         if len(categorical_columns) != 0:
+
             obj = CategoricalSummary(data=self.data,
                                      target_columns=categorical_columns,
                                      volatile=self.volatile,
                                      persist=self.persist)
+
             gfc_out = GetFutileColumns(data=self.data,
                                        object=obj,
                                        category_summary_column="ColumnName",
```
|
|
|
378
430
|
f_cols = [row[0] for row in gfc_out.result.itertuples()]
|
|
379
431
|
|
|
380
432
|
self.data_mapping['categorical_summary'] = obj.result._table_name
|
|
381
|
-
self.data_mapping['futile_columns'] = gfc_out.result._table_name
|
|
382
|
-
|
|
433
|
+
self.data_mapping['futile_columns'] = gfc_out.result._table_name
|
|
434
|
+
|
|
383
435
|
if len(f_cols) == 0:
|
|
384
436
|
self._display_msg(inline_msg="Analysis indicates all categorical columns are significant. No action Needed.",
|
|
385
437
|
progress_bar=self.progress_bar)
|
|
```diff
@@ -597,7 +649,8 @@ class _FeatureEngineering:
         """

         # Removing rows with missing target column value
-
+        if not self.cluster:
+            self.data = self.data.dropna(subset=[self.target_column])

         params = {
             "data": self.data,
```
```diff
@@ -664,6 +717,11 @@ class _FeatureEngineering:
                 drop_cols.append(col)
                 continue

+            # For clustering tasks, all columns with missing values are sent directly to imputation
+            if self.cluster:
+                self.imputation_cols[col] = val
+                continue
+
             if self.data_types[col] in ['float', 'int']:
                 corr_df = self.data[col].corr(self.data[self.target_column])
                 corr_val = self.data.assign(True, corr_=corr_df)
```
```diff
@@ -674,7 +732,7 @@ class _FeatureEngineering:
                 if val < .02*d_size and related <= .25:
                     delete_rows.append(col)
                     continue
-
+
             elif self.data_types[col] in ['str']:
                 # Delete row, if count of missing value < 4%
                 if val < .04*d_size:
```
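Taken together, the two hunks above route every column that has missing values: clustering runs impute everything, numeric columns lose their rows only when the missing count is under 2% of the data size and the target relationship is weak (`related <= .25`; how `related` is derived from the correlation is not shown in this hunk), and string columns use a plain 4% cutoff. A standalone sketch of that routing decision:

```python
def route_missing(col_type, missing_count, data_size, related=None,
                  cluster=False):
    """Return 'impute' or 'delete_rows' for a column with missing values."""
    if cluster:
        return 'impute'  # clustering: everything goes straight to imputation
    if col_type in ('float', 'int'):
        # Few missing rows and a weak relation to the target: drop the rows.
        if missing_count < 0.02 * data_size and related <= 0.25:
            return 'delete_rows'
    elif col_type == 'str':
        if missing_count < 0.04 * data_size:
            return 'delete_rows'
    return 'impute'

print(route_missing('float', 50, 10000, related=0.1))   # delete_rows
print(route_missing('float', 50, 10000, related=0.6))   # impute
print(route_missing('str', 900, 10000, cluster=True))   # impute
```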
```diff
@@ -806,8 +864,7 @@ class _FeatureEngineering:
         self._display_msg(msg="Time taken to perform imputation: {:.2f} sec ".format(end_time - start_time),
                           progress_bar=self.progress_bar,
                           show_data=True)
-
-
+
     def _custom_handling_missing_value(self):
         """
         DESCRIPTION:
```
```diff
@@ -1001,7 +1058,7 @@ class _FeatureEngineering:
         # Extracting accumulate columns
         accumulate_columns = self._extract_list(self.data.columns, equal_width_bin_columns)
         # Adding transform parameters for performing binning with Equal-Width.
-        eql_transform_params={
+        eql_transform_params = {
             "data" : self.data,
             "object" : eql_bin_code_fit.output,
             "accumulate" : accumulate_columns,
```
```diff
@@ -1021,7 +1078,7 @@ class _FeatureEngineering:

         self.data_mapping['fit_eql_width'] = eql_bin_code_fit.output._table_name
         self.data_mapping['eql_width_bincoded_data'] = self.data._table_name
-
+
         self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
                           data=self.data,
                           progress_bar=self.progress_bar)
```
```diff
@@ -1150,7 +1207,7 @@ class _FeatureEngineering:
             string_operation = transform_val["StringOperation"]

             # Setting volatile and persist parameters for performing string manipulation
-            volatile, persist = self.
+            volatile, persist = self._get_generic_parameters(func_indicator="StringManipulationIndicator",
                                                              param_name="StringManipulationParam")

             # Storing general parameters for performing string transformation
```
```diff
@@ -1219,7 +1276,7 @@ class _FeatureEngineering:
         drop_lst = [ele + "_other" for ele in one_hot_columns]

         # Setting volatile and persist parameters for performing encoding
-        volatile, persist = self.
+        volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
                                                          param_name="CategoricalEncodingParam")

         # Adding fit parameters for performing encoding
```
```diff
@@ -1280,7 +1337,7 @@ class _FeatureEngineering:
                 Types: str or list of strings (str)
         """
         # Setting volatile and persist parameters for performing encoding
-        volatile, persist = self.
+        volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
                                                          param_name="CategoricalEncodingParam")

         # Adding fit parameters for performing encoding
```
```diff
@@ -1326,11 +1383,10 @@ class _FeatureEngineering:
         self.data_mapping['fit_ordinal_output'] = ord_fit_obj.output_data._table_name
         self.data_mapping['fit_ordinal_result'] = ord_fit_obj.result._table_name
         self.data_mapping['ordinal_encoded_data'] = self.data._table_name
-
+
         if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
             self.target_label = ord_fit_obj

-
     def _target_encoding(self,
                          target_encoding_list):
         """
```
```diff
@@ -1344,22 +1400,22 @@ class _FeatureEngineering:
                 Types: str or list of strings (str)
         """
         # Fetching all columns on which target encoding will be performed.
-        target_columns= list(target_encoding_list.keys())
+        target_columns = list(target_encoding_list.keys())
         # Checking for column present in dataset or not
         _Validators._validate_dataframe_has_argument_columns(target_columns, "TargetEncodingList", self.data, "df")
         # Finding distinct values and counts for columns.
-        cat_sum = CategoricalSummary(data
-                                     target_columns
-        category_data=cat_sum.result.groupby("ColumnName").count()
-        category_data = category_data.assign(drop_columns
-                                             ColumnName
-                                             CategoryCount
+        cat_sum = CategoricalSummary(data=self.data,
+                                     target_columns=target_columns)
+        category_data = cat_sum.result.groupby("ColumnName").count()
+        category_data = category_data.assign(drop_columns=True,
+                                             ColumnName=category_data.ColumnName,
+                                             CategoryCount=category_data.count_DistinctValue)
         # Storing indicator and fit object for target encoding in data transform dictionary
         self.data_transform_dict["custom_target_encoding_ind"] = True
         self.data_transform_dict["custom_target_encoding_fit_obj"] = {}

         # Setting volatile and persist parameters for performing encoding
-        volatile, persist = self.
+        volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
                                                          param_name="CategoricalEncodingParam")

         # Fetching required argument for performing target encoding
```
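The repaired block pairs `CategoricalSummary` with `groupby("ColumnName").count()` and an `assign` to produce one row per column together with its distinct-value count (`count_DistinctValue`). The same pattern in isolation; it needs a connected Vantage session, and the table and column names are illustrative:

```python
from teradataml import DataFrame, CategoricalSummary

df = DataFrame("customer_data")  # illustrative table name

# One output row per (column, distinct value) pair.
cat_sum = CategoricalSummary(data=df, target_columns=["city", "segment"])

# Collapse to one row per column; count_DistinctValue carries the cardinality.
category_data = cat_sum.result.groupby("ColumnName").count()
category_data = category_data.assign(drop_columns=True,
                                     ColumnName=category_data.ColumnName,
                                     CategoryCount=category_data.count_DistinctValue)
```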
```diff
@@ -1392,7 +1448,7 @@ class _FeatureEngineering:
             "data" : self.data,
             "object" : tar_fit_obj,
             "accumulate" : accumulate_columns,
-            "persist" : True
+            "persist" : True
         }

         # Disabling display table name if persist is True by default
```
```diff
@@ -1422,7 +1478,7 @@ class _FeatureEngineering:
         start_time = time.time()

         ohe_col = []
-        unique_count
+        unique_count = []

         # List of columns before one hot
         col_bf_ohe = self.data.columns
```
```diff
@@ -1487,7 +1543,7 @@ class _FeatureEngineering:
                               progress_bar=self.progress_bar)
         else:
             if onehot_encode_ind:
-                unique_count
+                unique_count = []
                 ohe_list = encoding_list.get("OneHotEncodingList", None)
                 # Checking for empty list
                 if not ohe_list:
```
```diff
@@ -1530,16 +1586,20 @@ class _FeatureEngineering:
                                       progress_bar=self.progress_bar)

             if target_encode_ind:
-
-
-                self._display_msg(inline_msg="No information provided for customized target encoding technique.",
-                                  progress_bar=self.progress_bar)
-                else:
-                    # Performing target encoding
-                    self._target_encoding(tar_list)
-                    self._display_msg(msg="Updated dataset sample after performing target encoding:",
-                                      data=self.data,
+                if self.cluster:
+                    self._display_msg(inline_msg="Target Encoding is not applicable for clustering. Skipping it.",
                                       progress_bar=self.progress_bar)
+                else:
+                    tar_list = encoding_list.get("TargetEncodingList", None)
+                    if not tar_list:
+                        self._display_msg(inline_msg="No information provided for customized target encoding technique.",
+                                          progress_bar=self.progress_bar)
+                    else:
+                        # Performing target encoding
+                        self._target_encoding(tar_list)
+                        self._display_msg(msg="Updated dataset sample after performing target encoding:",
+                                          data=self.data,
+                                          progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No input provided for performing customized categorical encoding. AutoML will proceed with default encoding technique.",
                                   progress_bar=self.progress_bar)
```
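Both encoding branches read their work items from `encoding_list`: `"OneHotEncodingList"` in the earlier branch and `"TargetEncodingList"` here, and `_target_encoding` treats the mapping's keys as column names. A sketch of an input that would reach the new `else` branch; only the two top-level keys are confirmed by this diff, so the per-column payload is an assumed shape:

```python
# Hypothetical custom-encoding specification; the payload under
# "TargetEncodingList" is an assumption, not documented by this diff.
encoding_list = {
    "OneHotEncodingList": ["city", "segment"],
    "TargetEncodingList": {
        "city": {"encoder_method": "CBM_BETA"},  # illustrative payload
    },
}

tar_list = encoding_list.get("TargetEncodingList", None)  # as in the hunk
print(list(tar_list.keys()))  # _target_encoding reads these as column names
```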
```diff
@@ -1571,7 +1631,7 @@ class _FeatureEngineering:
             apply_method = transform_val["apply_method"]

             # Setting volatile and persist parameters for performing transformation
-            volatile, persist = self.
+            volatile, persist = self._get_generic_parameters(func_indicator="MathameticalTransformationIndicator",
                                                              param_name="MathameticalTransformationParam")
             # Adding fit parameters for performing transformation
             fit_params={
```
```diff
@@ -1855,7 +1915,7 @@ class _FeatureEngineering:
             self._display_msg(inline_msg="Skipping customized anti-select columns.",
                               progress_bar=self.progress_bar)

-    def
+    def _get_generic_parameters(self,
                                 func_indicator=None,
                                 param_name=None):
         """
```