teradataml-20.0.0.6-py3-none-any.whl → teradataml-20.0.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/automl/data_preparation.py

@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Sweta Shaw
@@ -120,6 +120,26 @@ class _DataPreparation:
             Specifies the random seed for reproducibility.
             Default Value: 42
             Types: int
+
+        automl_phases:
+            Optional Argument.
+            Specifies the phase of AutoML to be executed.
+            Default Value: None
+            Types: str or list of str.
+
+        cluster:
+            Optional Argument.
+            Specifies whether to run data preparation for handling clustering.
+            Default Value: False
+            Types: bool
+
+        imbalance_handling_method:
+            Optional Argument.
+            Specifies which imbalance handling method to use.
+            Default Value: "SMOTE"
+            Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
+            Types: str
+
         """
         self.data = data
         self.target_column = target_column
@@ -131,11 +151,13 @@ class _DataPreparation:
         self.volatile = kwargs.get("volatile", False)
         self.persist = kwargs.get("persist", False)
         self.aml_phases = kwargs.get("automl_phases", None)
+        self.cluster = kwargs.get('cluster', False)
+        self._data_sampling_method = kwargs.get("imbalance_handling_method", "SMOTE")

         # Setting default value for auto run mode
-        self._data_sampling_method = "SMOTE"
         self._scale_method_reg = "STD"
         self._scale_method_cls = "RANGE"
+        self._scale_method_clust = "STD"

         self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.seed = kwargs.get("seed", 42)
@@ -147,9 +169,8 @@ class _DataPreparation:

         self.data_mapping = kwargs.get("data_mapping", {})

-
     def data_preparation(self,
-                         auto
+                         auto=True):
         """
         DESCRIPTION:
             Function to perform following tasks:-
@@ -178,42 +199,50 @@ class _DataPreparation:
         self._set_custom_scaling_method()
         self._set_custom_sampling()

-        # Handling ouliers in dataset
-        self._handle_outliers(auto)
-        self.progress_bar.update()
-
         # Handling float type features before processing with feature selection and scaling
         training_data = self._handle_generated_features()
         self.progress_bar.update()
-
+
+        # Handling ouliers in dataset
+        self._handle_outliers(auto)
+        self.progress_bar.update()
+
         # Temporary Pulling data for feature selection
         # Will change after sto

         # Checking for data imbalance
-        if self.
-
+        if not self.cluster:
+            if self._check_data_imbalance(training_data):
+                training_data = self._data_sampling(training_data)
         self.progress_bar.update()

         # Sorting the data based on id to
         # remove any shuffling done by sampling
         training_data = training_data.sort_values(by='id')

-
-
-
-
-
-
-
-
-
-
-
+        if not self.cluster:
+            # Performing feature selection using lasso followed by scaling
+            self._feature_selection_Lasso(training_data)
+            self._scaling_features(feature_selection_mtd="lasso")
+            self.progress_bar.update()
+
+            # Performing feature selection using rfe followed by scaling
+            self._feature_selection_RFE(training_data)
+            self._scaling_features(feature_selection_mtd="rfe")
+            self.progress_bar.update()
+        else:
+            self._scaling_features(feature_selection_mtd="Non_pca")
+            self.progress_bar.update()
+
+        # Performing scaling followed by feature selection using pca
         self._scaling_features(feature_selection_mtd="pca")
         self._feature_selection_PCA()
         self.progress_bar.update()
-
-
+
+        if not self.cluster:
+            return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict, self.data_mapping
+        else:
+            return [self.non_pca_feature, self.pca_feature], self.data_transform_dict, self.data_mapping

     def _handle_outliers(self,
                          auto):
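The reworked data_preparation() now branches on self.cluster: supervised runs keep Lasso and RFE selection (each followed by scaling) and return three feature lists, while clustering runs only scale the raw features ("Non_pca") and return two; both paths finish with scaling plus PCA. A stub-only Python sketch of that control flow (step names are labels for illustration, not package APIs):

def data_preparation_flow(cluster: bool) -> list:
    # Mirrors the if/else ordering in the diff; bodies are stubs.
    steps = []
    if not cluster:
        steps += ["lasso selection", "scale lasso", "rfe selection", "scale rfe"]
    else:
        steps += ["scale Non_pca"]
    steps += ["scale pca", "PCA selection"]
    return steps

print(data_preparation_flow(cluster=False))
print(data_preparation_flow(cluster=True))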
@@ -280,19 +309,23 @@ class _DataPreparation:
         if len(outlier_columns) != 0:
             # Detecting outlier percentage in each columns
             outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
-
+
             # Outlier Handling techniques
             for i in outlier_percentage_df.itertuples():
                 # Column Name
                 col = i[0]
                 # Outlier value
                 value = i[1]
-
-
-
-
-
-
+                if self.cluster:
+                    if value > 0.0:
+                        columns_to_impute.append(col)
+                else:
+                    # Dropping rows
+                    if value > 0.0 and value <= 8.0 :
+                        columns_to_drop_rows.append(col)
+                    elif value> 8.0 and value <= 25.0:
+                        columns_to_impute.append(col)
+
         return columns_to_drop_rows, columns_to_impute

     def _outlier_handling(self,
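The routing rule above can be restated as a pure function: for clustering, any column containing outliers is imputed; otherwise columns with up to 8% outliers get rows dropped, columns with 8-25% are imputed, and (per the diff) nothing happens above 25%. A self-contained Python restatement using the thresholds from the diff:

from typing import Optional

def route_outlier_column(value: float, cluster: bool) -> Optional[str]:
    # value is the outlier percentage detected for one column.
    if cluster:
        return "impute" if value > 0.0 else None
    if 0.0 < value <= 8.0:
        return "drop_rows"
    if 8.0 < value <= 25.0:
        return "impute"
    return None

assert route_outlier_column(5.0, cluster=False) == "drop_rows"
assert route_outlier_column(12.0, cluster=False) == "impute"
assert route_outlier_column(12.0, cluster=True) == "impute"
assert route_outlier_column(40.0, cluster=False) is None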
@@ -325,7 +358,7 @@ class _DataPreparation:
         """

         # Setting volatile and persist parameters for Outlier handling function
-        volatile, persist = self.
+        volatile, persist = self._get_generic_parameters(func_indicator='OutlierFilterIndicator',
                                                          param_name='OutlierFilterParam')

         # Performing fit on dataset for outlier handling
@@ -431,17 +464,17 @@ class _DataPreparation:
         # List of columns for outlier processing.
         target_columns = [col for col in self.data.columns if col not in self.excluded_columns]
         # Checking user input for outlier detection method
-        outlier_method = self.custom_data.get("
+        outlier_method = self.custom_data.get("OutlierFilterMethod", None)
         if outlier_method == 'PERCENTILE':
             lower_percentile = self.custom_data.get("OutlierLowerPercentile", None)
             upper_percentile = self.custom_data.get("OutlierUpperPercentile", None)
             if lower_percentile and upper_percentile:
                 # Detecting outlier percentage for each columns
-                outlier_df = self._outlier_detection(outlier_method, target_columns, \
-                                                     lower_percentile, upper_percentile)
+                outlier_df = self._outlier_detection(outlier_method=outlier_method, column_list=target_columns, \
+                                                     lower_percentile=lower_percentile, upper_percentile=upper_percentile)
             else:
                 # Detecting outlier percentage for each column in case of other than percentile method
-                outlier_df = self._outlier_detection(outlier_method, target_columns)
+                outlier_df = self._outlier_detection(outlier_method=outlier_method, column_list=target_columns)

         # Checking for rows if outlier containing columns exist
         if outlier_df.shape[0]:
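This hunk completes a truncated custom_data lookup and switches _outlier_detection to keyword arguments. The key names below are taken verbatim from the diff; the values are invented for illustration:

custom_data = {
    "OutlierFilterMethod": "PERCENTILE",
    "OutlierLowerPercentile": 0.05,   # hypothetical value
    "OutlierUpperPercentile": 0.95,   # hypothetical value
}
outlier_method = custom_data.get("OutlierFilterMethod", None)
if outlier_method == 'PERCENTILE':
    lower_percentile = custom_data.get("OutlierLowerPercentile", None)
    upper_percentile = custom_data.get("OutlierUpperPercentile", None)
    print(outlier_method, lower_percentile, upper_percentile)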
@@ -462,6 +495,8 @@ class _DataPreparation:
                 self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
                 self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
                 self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
+                self._display_msg(msg="Sample of dataset after performing custom outlier filtering",
+                                  data=self.data,progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
                                   progress_bar=self.progress_bar)
@@ -472,7 +507,8 @@ class _DataPreparation:
             self._display_msg(inline_msg="No information provided for customized outlier processing. AutoML will proceed with default settings.",
                               progress_bar=self.progress_bar)
             # Performing default handling for outliers
-            self.
+            if not self.cluster:
+                self._outlier_processing()

     # function for getting value of "K" in k folds cross validation
     def _num_of_folds(self, rows=None):
@@ -509,7 +545,10 @@ class _DataPreparation:
         pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()

         # Drop unnecessary columns and store the result
-
+        if not self.cluster:
+            train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
+        else:
+            train_data = pca_train.drop(columns=['id'], axis=1)

         # Initialize and fit PCA
         pca = PCA(random_state=self.seed)
@@ -545,9 +584,11 @@ class _DataPreparation:
         train_df = pd.concat([pca_train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)

         # merging target column with new data
-
-
-
+        if not self.cluster:
+            train_df[self.target_column] = pca_train[self.target_column].reset_index(drop=True)
+            self.pca_feature = train_df.drop(columns=['id', self.target_column], axis=1).columns.tolist()
+        else:
+            self.pca_feature = train_df.drop(columns=['id'], axis=1).columns.tolist()

         self._display_msg(msg="PCA columns:",
                           col_lst=self.pca_feature,
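The PCA step drops 'id' (and, for supervised runs, the target column) before fitting, then re-attaches both afterwards. A runnable pandas/scikit-learn sketch of that shape on toy data; the component column names and the toy frame are assumptions, only the drop/re-attach logic follows the diff:

import pandas as pd
from sklearn.decomposition import PCA

pca_train = pd.DataFrame({"id": [1, 2, 3, 4],
                          "f1": [0.1, 0.4, 0.2, 0.9],
                          "f2": [1.0, 0.8, 0.3, 0.5],
                          "target": [0, 1, 0, 1]})  # toy data, illustrative only
cluster, target_column, seed = False, "target", 42

drop_cols = ["id"] if cluster else ["id", target_column]
train_data = pca_train.drop(columns=drop_cols, axis=1)

pca = PCA(random_state=seed)
components = pca.fit_transform(train_data)
train_df = pd.DataFrame(components, columns=[f"pc_{i}" for i in range(components.shape[1])])
train_df = pd.concat([pca_train["id"], train_df], axis=1)
if not cluster:
    train_df[target_column] = pca_train[target_column]
    pca_feature = train_df.drop(columns=["id", target_column], axis=1).columns.tolist()
else:
    pca_feature = train_df.drop(columns=["id"], axis=1).columns.tolist()
print(pca_feature)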
@@ -820,8 +861,12 @@ class _DataPreparation:
             self.lasso_feature = columns_to_scale
         elif feature_selection_mtd == "rfe":
             self.rfe_feature = columns_to_scale
-
+        elif feature_selection_mtd == "pca":
             self.pca_feature = columns_to_scale
+        elif feature_selection_mtd == "raw_scaled":
+            self.raw_scaled_feature = columns_to_scale
+        else:
+            self.non_pca_feature = columns_to_scale

         columns_to_scale = [col for col in columns_to_scale if col not in self.excluded_columns]
         return columns_to_scale
@@ -839,7 +884,8 @@ class _DataPreparation:
             Specifies the feature selection algorithm used.
             Types: str
         """
-
+
+        feature_selection_mtd = feature_selection_mtd.lower()
         self._display_msg(msg="\nscaling Features of {} data ...".format(feature_selection_mtd),
                           progress_bar=self.progress_bar,
                           show_data=True)
@@ -847,21 +893,26 @@ class _DataPreparation:
         start_time = time.time()
         data_to_scale = None

-        if self.
-
+        if not self.cluster:
+            if self.is_classification_type():
+                scale_method = self._scale_method_cls
+            else:
+                scale_method = self._scale_method_reg
         else:
-            scale_method = self.
-
+            scale_method = self._scale_method_clust
+
         # Loading data for feature scaling based of feature selection method
         if feature_selection_mtd == 'rfe':
             data_to_scale = DataFrame(self.data_mapping['rfe_train'])
         elif feature_selection_mtd == 'lasso':
             data_to_scale = DataFrame(self.data_mapping['lasso_train'])
+        elif feature_selection_mtd == 'raw_scaled':
+            data_to_scale = DataFrame(self.data_mapping['raw_scaled_train'])
         else:
             data_to_scale = self.data

         # Setting volatile and persist parameters for ScaleFit and ScaleTransform functions
-        volatile, persist = self.
+        volatile, persist = self._get_generic_parameters(func_indicator='FeatureScalingIndicator',
                                                          param_name='FeatureScalingParam')

         # List of columns that will be scaled
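Scale-method selection now has three outcomes: RANGE for classification, STD for regression, and the new STD default (_scale_method_clust) for clustering. Restated as a pure function, with an is_classification flag standing in for the class's is_classification_type() helper:

def pick_scale_method(cluster: bool, is_classification: bool) -> str:
    if not cluster:
        # _scale_method_cls / _scale_method_reg defaults from the diff.
        return "RANGE" if is_classification else "STD"
    return "STD"  # _scale_method_clust default added in this release

print(pick_scale_method(cluster=True, is_classification=False))  # STD
print(pick_scale_method(cluster=False, is_classification=True))  # RANGE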
@@ -881,14 +932,13 @@ class _DataPreparation:

         self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
         self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name
-
+
         # storing the scale fit object and columns in data transformation dictionary
         self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
         self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col

         # List of columns to copy to the output generated by scale transform
         accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
-

         # Scaling dataset
         transform_obj = ScaleTransform(data=data_to_scale,
@@ -907,6 +957,11 @@ class _DataPreparation:

         self.copy_dataframe_to_sql(scaled_df, feature_selection_mtd, persist)

+        if self.cluster and feature_selection_mtd == "non_pca":
+            self.data_mapping["non_pca_train"] = scaled_df._table_name
+        elif self.cluster and feature_selection_mtd == "raw_scaled":
+            self.data_mapping["raw_scaled_train"] = scaled_df._table_name
+
         end_time = time.time()
         self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
                           progress_bar=self.progress_bar,
@@ -930,7 +985,9 @@ class _DataPreparation:
             self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
                               progress_bar=self.progress_bar)
         else:
-            if self.
+            if self.cluster:
+                self._scale_method_cluster = custom_scaling_method
+            elif self.is_classification_type():
                 self._scale_method_cls = custom_scaling_method
             else:
                 self._scale_method_reg = custom_scaling_method
@@ -943,7 +1000,7 @@ class _DataPreparation:
         """
         DESCRIPTION:
             Function to handle newly generated float features. It will round them upto 4 digit after decimal point.
-
+
         RETURNS:
             Pandas DataFrame containing, rounded up float columns.
         """
@@ -951,7 +1008,7 @@ class _DataPreparation:
         target_df = self.data
         # Detecting list of float columns on target dataset
         float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]
-
+
         if len(float_columns) == 0:
             cols = target_df.columns
             # Doing reset index to get index column
@@ -960,10 +1017,8 @@ class _DataPreparation:
             # Returning the dataframe with cols
             # to avoid extra columns generated by reset_index()
             return df[cols]
-
         # storing the column details for round up in data transformation dictionary
         self.data_transform_dict["round_columns"] = float_columns
-
         # Extracting accumulate columns
         accumulate_columns = self._extract_list(target_df.columns,float_columns)
         # Performing rounding up on target column upto 4 precision digit
@@ -973,11 +1028,11 @@ class _DataPreparation:
                       "precision_digit" : 4,
                       "accumulate" : accumulate_columns,
                       "persist" : True}
-
+
         # Disabling print if persist is True by default
         if not self.volatile and not self.persist:
             fit_params["display_table_name"] = False
-
+
         if self.volatile:
             fit_params["volatile"] = True
             fit_params["persist"] = False
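The rounding itself happens in-database through the fit/transform pair configured by fit_params above; a pandas equivalent of the 4-digit rounding, for illustration only (not the code path teradataml executes):

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "amt": [3.141592653, 2.718281828]})
float_columns = df.select_dtypes(include="float").columns.tolist()
df[float_columns] = df[float_columns].round(4)  # precision_digit = 4, as in fit_params
print(df)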
@@ -990,4 +1045,4 @@ class _DataPreparation:
         cols = transform_output.columns
         df = transform_output.to_pandas().reset_index()
         df = df[cols]
-        return df
+        return df