teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +71 -0
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +51 -24
- teradataml/analytics/json_parser/utils.py +11 -17
- teradataml/automl/__init__.py +103 -48
- teradataml/automl/data_preparation.py +55 -37
- teradataml/automl/data_transformation.py +131 -69
- teradataml/automl/feature_engineering.py +117 -185
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +13 -25
- teradataml/automl/model_training.py +214 -75
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +11 -6
- teradataml/common/garbagecollector.py +5 -0
- teradataml/common/messagecodes.py +3 -1
- teradataml/common/messages.py +2 -1
- teradataml/common/utils.py +6 -0
- teradataml/context/context.py +49 -29
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/glm_example.json +28 -1
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +20 -1
- teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
- teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
- teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
- teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
- teradataml/data/teradataml_example.json +77 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +120 -61
- teradataml/dataframe/dataframe.py +102 -17
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +272 -89
- teradataml/dataframe/sql.py +84 -0
- teradataml/dbutils/dbutils.py +2 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
- teradataml/options/__init__.py +13 -4
- teradataml/options/configure.py +27 -6
- teradataml/scriptmgmt/UserEnv.py +19 -16
- teradataml/scriptmgmt/lls_utils.py +117 -14
- teradataml/table_operators/Script.py +2 -3
- teradataml/table_operators/TableOperator.py +58 -10
- teradataml/utils/validators.py +40 -2
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
@@ -31,6 +31,8 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
+from teradataml.common.garbagecollector import GarbageCollector
+from teradataml.hyperparameter_tuner.utils import _ProgressBar
 
 # AutoML Internal libraries
 from teradataml.automl.feature_exploration import _FeatureExplore
@@ -58,12 +60,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             Types: teradataml Dataframe
 
         data_transformation_params:
-            Required
+            Required Argument.
             Specifies the parameters for performing data transformation.
             Types: dict
 
         auto:
-            Optional
+            Optional Argument.
             Specifies whether to run AutoML in custom mode or auto mode.
             When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
             Default Value: True
@@ -80,7 +82,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             Types: int
 
         target_column_ind:
-            Optional
+            Optional Argument.
             Specifies whether target column is present in given dataset.
             Default Value: False
             Types: bool
@@ -118,6 +120,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Extracting target column details and type whether it is classification or not
         self.data_target_column = self.data_transformation_params.get("data_target_column")
         self.classification_type = self.data_transformation_params.get("classification_type", False)
+
+        # Setting number of jobs for progress bar based on mode of execution
+        jobs = 10 if self.auto else 15
+        self.progress_bar = _ProgressBar(jobs=jobs, verbose=2, prefix='Transformation Running:')
+
         # Performing transformation carried out in feature engineering phase
         self.feature_engineering_transformation()
         # Performing transformation carried out in data preparation phase
@@ -133,27 +140,52 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         on test data using parameters from data_transformation_params.
         """
         self._display_msg(msg="Performing transformation carried out in feature engineering phase ...",
-                          show_data=True)
+                          show_data=True,
+                          progress_bar=self.progress_bar)
+
         # Performing default transformation for both auto and custom mode
         self._preprocess_transformation()
+        self.progress_bar.update()
+
         self._futile_column_handling_transformation()
+        self.progress_bar.update()
+
         # Handling target column transformation
         if self.target_column_ind and self.classification_type:
             self._handle_target_column_transformation()
+            self.progress_bar.update()
+
         self._date_column_handling_transformation()
+        self.progress_bar.update()
 
         # Performing transformation according to run mode
         if self.auto:
             self._missing_value_handling_transformation()
+            self.progress_bar.update()
+
             self._categorical_encoding_transformation()
+            self.progress_bar.update()
         else:
             self._custom_missing_value_handling_transformation()
+            self.progress_bar.update()
+
             self._custom_bincode_column_transformation()
+            self.progress_bar.update()
+
             self._custom_string_column_transformation()
+            self.progress_bar.update()
+
             self._custom_categorical_encoding_transformation()
+            self.progress_bar.update()
+
             self._custom_mathematical_transformation()
+            self.progress_bar.update()
+
             self._custom_non_linear_transformation()
+            self.progress_bar.update()
+
             self._custom_anti_select_column_transformation()
+            self.progress_bar.update()
 
     def data_preparation_transformation(self):
         """
@@ -162,15 +194,23 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         on test data using parameters from data_transformation_params.
         """
         self._display_msg(msg="Performing transformation carried out in data preparation phase ...",
-                          show_data=True)
+                          show_data=True,
+                          progress_bar=self.progress_bar)
+
         # Handling features transformed from feature engineering phase
         self._handle_generated_features_transformation()
+        self.progress_bar.update()
 
         # Performing transformation including feature selection using lasso, rfe and pca
         # followed by scaling
         self._feature_selection_lasso_transformation()
+        self.progress_bar.update()
+
         self._feature_selection_rfe_transformation()
+        self.progress_bar.update()
+
         self._feature_selection_pca_transformation()
+        self.progress_bar.update()
 
     def _preprocess_transformation(self):
         """
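
Across the two methods above, the new `jobs = 10 if self.auto else 15` appears to match the maximum number of `self.progress_bar.update()` calls each path can make: the auto path ticks 6 times during feature engineering (the target-column tick is conditional) plus 4 during data preparation, while the custom path ticks 11 plus 4. Below is a minimal, self-contained sketch of the pattern; `_ProgressBar` is teradataml-internal, so a stand-in exposing only the surface the diff relies on (`jobs=`, `prefix=`, `update()`) is used here.

class StubProgressBar:
    """Stand-in for teradataml's internal _ProgressBar; not the real class."""
    def __init__(self, jobs, prefix=""):
        self.jobs = jobs
        self.done = 0
        self.prefix = prefix

    def update(self):
        # One tick per completed transformation step, as the diff does.
        self.done += 1
        print(f"{self.prefix} {self.done}/{self.jobs}")

def run_transformations(auto=True):
    # Mirrors jobs = 10 if self.auto else 15 from the hunk above.
    steps = [lambda: None] * (10 if auto else 15)
    bar = StubProgressBar(jobs=len(steps), prefix="Transformation Running:")
    for step in steps:
        step()
        bar.update()

run_transformations(auto=True)  # prints "Transformation Running: 1/10" ... "10/10"
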
@@ -182,7 +222,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if columns_to_be_removed:
             self.data = self.data.drop(columns_to_be_removed, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping irrelevent columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Adding id column
         self.data = FillRowId(data=self.data, row_id_column='id').result
@@ -197,7 +238,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if futile_cols:
             self.data = self.data.drop(futile_cols, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping futile columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _date_column_handling_transformation(self):
         """
@@ -205,47 +247,32 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         Function performs transformation on date columns and generates new columns.
         """
         # Extracting date columns
-
-        if
+        self.date_column_list = self.data_transformation_params.get("date_columns",None)
+        if self.date_column_list:
             # Dropping rows with null values in date columns
-            self.data = self.data.dropna(subset=
+            self.data = self.data.dropna(subset=self.date_column_list)
             # Extracting unique date columns for dropping
             drop_unique_date_columns = self.data_transformation_params.get("drop_unique_date_columns",None)
             if drop_unique_date_columns:
                 self.data = self.data.drop(drop_unique_date_columns, axis=1)
+                # Updated date column list after dropping irrelevant date columns
+                self.date_column_list = [item for item in self.date_column_list if item not in drop_unique_date_columns]
 
-
-
-
-
-
-
-
-
-
-
-            self.
-
-
-
-
-            year_diff_component_fit_object = self.data_transformation_params.get("year_diff_component_fit_object", None)
-
-            # Performing bincode transformation on day, month and year components
-            for fit_object in [day_component_fit_object, month_component_fit_object, year_diff_component_fit_object]:
-                if fit_object:
-                    for col, bin_code_fit in fit_object.items():
-                        accumulate_columns = self._extract_list(self.data.columns, [col])
-                        transform_params = {
-                            "data": self.data,
-                            "object": bin_code_fit,
-                            "accumulate": accumulate_columns,
-                            "persist": True
-                        }
-                        self.data = BincodeTransform(**transform_params).result
-
-            self._display_msg(msg="\nUpdated dataset after transforming date columns :",
-                              data=self.data)
+            if len(self.date_column_list) != 0:
+                # Extracting date components parameters for new columns generation
+                new_columns=self._fetch_date_component()
+
+                # Extracting irrelevant date component columns for dropping
+                drop_extract_date_columns = self.data_transformation_params.get("drop_extract_date_columns", None)
+                if drop_extract_date_columns:
+                    self.data = self.data.drop(drop_extract_date_columns, axis=1)
+                    new_columns = [item for item in new_columns if item not in drop_extract_date_columns]
+
+                self._display_msg(msg='Updated list of newly generated features from existing date features :',
+                                  col_lst=new_columns)
+                self._display_msg(msg="\nUpdated dataset after transforming date columns :",
+                                  data=self.data,
+                                  progress_bar=self.progress_bar)
 
     def _missing_value_handling_transformation(self):
         """
@@ -257,7 +284,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if drop_cols:
             self.data = self.data.drop(drop_cols, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping missing value containing columns : ",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Extracting imputation columns and fit object for missing value imputation
         imputation_cols = self.data_transformation_params.get("imputation_columns", None)
@@ -265,20 +293,22 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             sm_fit_obj = self.data_transformation_params.get("imputation_fit_object")
             # imputing column using fit object
             self.data = SimpleImputeTransform(data=self.data,
-
-                                              volatile=True).result
+                                              object=sm_fit_obj).result
             self._display_msg(msg="\nUpdated dataset after imputing missing value containing columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Handling rest null, its temporary solution. It subjects to change based on input.
         dropped_data = self.data.dropna()
         dropped_count = self.data.shape[0] - dropped_data.shape[0]
         if dropped_count > 0:
-            self.data = dropped_data
             self._display_msg(msg="\nFound additional {} rows that contain missing values :".format(dropped_count),
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
+            self.data = dropped_data
             self._display_msg(msg="\nUpdated dataset after dropping additional missing value containing rows :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_missing_value_handling_transformation(self):
         """
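
The two `SimpleImputeTransform` hunks (here and in the custom variant below) drop the `volatile=True` output option and pass the stored fit object explicitly, so test data is imputed with exactly the statistics computed at training time. A sketch of that fit-once, transform-later flow, assuming an active Vantage context and teradataml DataFrames `train_df`/`test_df`; the `SimpleImputeFit` argument names are an assumption, and only `SimpleImputeTransform(data=..., object=...)` comes verbatim from the diff.

from teradataml import SimpleImputeFit, SimpleImputeTransform

# Fit once on training data and stash the fit object, as AutoML does in
# data_transformation_params (the dict key is taken from the diff).
fit_obj = SimpleImputeFit(data=train_df, stats_columns="age", stats="median")
data_transformation_params = {"imputation_fit_object": fit_obj}

# At scoring time, replay the same imputation on the test data.
sm_fit_obj = data_transformation_params.get("imputation_fit_object")
test_df = SimpleImputeTransform(data=test_df, object=sm_fit_obj).result
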
@@ -291,7 +321,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if drop_col_list:
             self.data = self.data.drop(drop_col_list, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping customized missing value containing columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Extracting custom imputation columns and fit object for missing value imputation
         custom_imp_ind = self.data_transformation_params.get("custom_imputation_ind", False)
@@ -299,10 +330,10 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             sm_fit_obj = self.data_transformation_params.get("custom_imputation_fit_object")
             # imputing column using fit object
             self.data = SimpleImputeTransform(data=self.data,
-
-                                              volatile=True).result
+                                              object=sm_fit_obj).result
             self._display_msg(msg="\nUpdated dataset after imputing customized missing value containing columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
         # Handling rest with default missing value handling
         self._missing_value_handling_transformation()
 
@@ -328,8 +359,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "persist" : True,
             }
             self.data = BincodeTransform(**eql_transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized equal width bin-code transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Hnadling bincode transformation for Variable-Width
         custom_var_bincode_col = self.data_transformation_params.get("custom_var_bincode_col", None)
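
This is the first of several hunks that register the persisted result table with teradataml's garbage collector; the same two lines follow the variable-width `BincodeTransform`, the one-hot, ordinal, and target encodings, `Transform`, `NonLinearCombineTransform`, and `RoundColumns` calls below. Because `persist=True` materializes the result as a permanent table rather than a volatile one, the table would otherwise outlive the session; registering its name lets teradataml drop it during cleanup. A sketch of the pattern, assuming an active Vantage context and a `transform_params` dict built as in the hunk above (`GarbageCollector._add_to_garbagecollector` and `_table_name` are private teradataml internals, used here only because the diff itself uses them).

from teradataml import BincodeTransform
from teradataml.common.garbagecollector import GarbageCollector

# persist=True writes the transform output to a permanent table ...
result = BincodeTransform(**transform_params).result
# ... so its backing table is handed to the garbage collector for later cleanup.
GarbageCollector._add_to_garbagecollector(result._table_name)
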
@@ -346,8 +380,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "persist" : True
             }
             self.data = BincodeTransform(**var_transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized variable width bin-code transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_string_column_transformation(self):
         """
@@ -362,7 +399,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         for target_col,transform_val in custom_string_manipulation_param.items():
             self.data = self._str_method_mapping(target_col, transform_val)
         self._display_msg(msg="\nUpdated dataset after performing customized string manipulation :",
-                          data=self.data)
+                          data=self.data,
+                          progress_bar=self.progress_bar)
 
     def _categorical_encoding_transformation(self):
         """
@@ -384,10 +422,13 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             }
             # Performing one hot encoding transformation
             self.data = OneHotEncodingTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             # Dropping old columns after encoding
             self.data = self.data.drop(one_hot_encoding_drop_list, axis=1)
             self._display_msg(msg="\nUpdated dataset after performing categorical encoding :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_categorical_encoding_transformation(self):
         """
@@ -412,6 +453,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             }
             # Performing ordinal encoding transformation
             self.data = OrdinalEncodingTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             # Extracting parameters for target encoding
             custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
             custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
@@ -426,10 +469,13 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "accumulate" : accumulate_columns,
                 "persist" : True
             }
-            # Performing
+            # Performing target encoding transformation
             self.data = TargetEncodingTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Handling rest with default categorical encoding transformation
         self._categorical_encoding_transformation()
@@ -472,8 +518,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             }
             # Peforming transformation on target columns
             self.data = Transform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized mathematical transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_non_linear_transformation(self):
         """
@@ -495,8 +544,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             }
             # Performing transformation
             self.data = NonLinearCombineTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized non-linear transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_anti_select_column_transformation(self):
         """
@@ -516,7 +568,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             # Performing transformation for given user input
             self.data = Antiselect(**fit_params).result
             self._display_msg(msg="\nUpdated dataset after performing customized anti-selection :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _handle_generated_features_transformation(self):
         """
@@ -541,6 +594,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "accumulate" : accumulate_columns,
                 "persist" : True}
             self.data = RoundColumns(**fit_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
     def _handle_target_column_transformation(self):
         """
@@ -565,7 +620,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             }
             # Performing ordinal encoding transformation
             self.data = OrdinalEncodingTransform(**transform_params).result
-
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             # Converting target column to integer datatype
             params = {
                 "data" : self.data,
@@ -575,7 +631,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             }
             self.data = ConvertTo(**params).result
             self._display_msg(msg="\nUpdated dataset after performing target column transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _extract_and_display_features(self, feature_type, feature_list):
         """
@@ -605,7 +662,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
 
         # Displaying feature dataframe
         self._display_msg(msg=f"\nUpdated dataset after performing {feature_type} feature selection:",
-                          data=feature_df)
+                          data=feature_df,
+                          progress_bar=self.progress_bar)
 
         # Returning feature dataframe
         return feature_df
@@ -631,7 +689,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                    accumulate=accumulate_cols).result
         # Displaying scaled dataset
         self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
-                          data=lasso_df)
+                          data=lasso_df,
+                          progress_bar=self.progress_bar)
 
         # Uploading lasso dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
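
Each feature-selection hunk ends by uploading the reduced dataset to a fresh table (the `UtilFuncs._generate_temp_table_name(prefix=...)` calls above and below). A hedged sketch of that round trip using the public API, assuming an active Vantage context and a pandas frame `selected_pd` holding the selected features; the internal name generator is replaced by a fixed name here.

from teradataml import DataFrame, copy_to_sql

# A fixed name stands in for UtilFuncs._generate_temp_table_name(prefix="lasso_new_test", ...),
# which is internal and produces a unique name per run.
table_name = "lasso_new_test_demo"
copy_to_sql(df=selected_pd, table_name=table_name, if_exists="replace")
selected_df = DataFrame(table_name)  # back to a teradataml DataFrame for the next phase
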
@@ -667,7 +726,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                    accumulate=accumulate_cols).result
         # Displaying scaled dataset
         self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
-                          data=rfe_df)
+                          data=rfe_df,
+                          progress_bar=self.progress_bar)
 
         # Uploading rfe dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
@@ -691,7 +751,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                    accumulate=accumulate_cols).result
         # Displaying scaled dataset
         self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
-                          data=pca_scaled_df)
+                          data=pca_scaled_df,
+                          progress_bar=self.progress_bar)
 
         # Convert to pandas dataframe for applying pca
         pca_scaled_pd = pca_scaled_df.to_pandas()
@@ -718,7 +779,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         pca_df[self.data_target_column] = pca_scaled_pd[self.data_target_column].reset_index(drop=True)
         # Displaying pca dataframe
         self._display_msg(msg="\nUpdated dataset after performing PCA feature selection :",
-                          data=pca_df)
+                          data=pca_df.head(10),
+                          progress_bar=self.progress_bar)
 
         # Uploading pca dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",