teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +112 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +224 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +6 -4
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +245 -306
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +313 -82
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +109 -146
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/constants.py +37 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +151 -120
- teradataml/common/messagecodes.py +4 -1
- teradataml/common/messages.py +2 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +97 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +72 -2
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_fit.py +17 -10
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +2 -2
- teradataml/data/scripts/sklearn/sklearn_function.template +30 -7
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +55 -4
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +474 -41
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +658 -20
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +322 -16
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +154 -69
- teradataml/options/__init__.py +3 -1
- teradataml/options/configure.py +14 -2
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +10 -6
- teradataml/scriptmgmt/lls_utils.py +3 -2
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +1 -1
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +115 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +200 -140
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/automl/feature_engineering.py

@@ -50,7 +50,8 @@ class _FeatureEngineering:
                  model_list,
                  verbose = 0,
                  task_type = "Regression",
-                 custom_data = None
+                 custom_data = None,
+                 **kwargs):
         """
         DESCRIPTION:
             Function initializes the data, target column and columns datatypes

@@ -94,6 +95,28 @@
                 Optional Argument.
                 Specifies json object containing user customized input.
                 Types: json object
+
+            **kwargs:
+                Specifies the additional arguments for feature engineering. Below
+                are the additional arguments:
+                    volatile:
+                        Optional Argument.
+                        Specifies whether to put the interim results of the
+                        functions in a volatile table or not. When set to
+                        True, results are stored in a volatile table,
+                        otherwise not.
+                        Default Value: False
+                        Types: bool
+
+                    persist:
+                        Optional Argument.
+                        Specifies whether to persist the interim results of the
+                        functions in a table or not. When set to True,
+                        results are persisted in a table; otherwise,
+                        results are garbage collected at the end of the
+                        session.
+                        Default Value: False
+                        Types: bool
         """
         # Instance variables
         self.data = data

@@ -108,6 +131,8 @@
         self.data_transform_dict = {}
         self.one_hot_obj_count = 0
         self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
+        self.volatile = kwargs.get('volatile', False)
+        self.persist = kwargs.get('persist', False)
 
     # Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
     def feature_engineering(self,
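The constructor now accepts arbitrary keyword arguments and reads two flags off them, defaulting both to False. A minimal sketch of that consumption pattern, using a stand-in class (the real _FeatureEngineering.__init__ takes many more arguments):

    class _Stage:
        def __init__(self, data, **kwargs):
            self.data = data
            # Interim results go to a volatile table when volatile=True and to a
            # persisted table when persist=True; with neither set, interim tables
            # are registered for garbage collection at the end of the session.
            self.volatile = kwargs.get('volatile', False)
            self.persist = kwargs.get('persist', False)

    stage = _Stage(data=None, volatile=True)
    assert stage.volatile is True and stage.persist is False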
@@ -133,7 +158,7 @@
             second element represents list of columns which are not participating in outlier tranformation.
         """
         # Assigning number of base jobs for progress bar.
-        base_jobs =
+        base_jobs = 13 if auto else 17
 
         # Updating model list based on distinct value of target column for classification type
         if self.is_classification_type():

@@ -183,9 +208,12 @@
         self._remove_duplicate_rows()
         self.progress_bar.update()
 
+        self._anti_select_columns()
+        self.progress_bar.update()
+
         self._remove_futile_columns()
         self.progress_bar.update()
-
+
         self._handle_date_columns()
         self.progress_bar.update()
 

@@ -206,10 +234,7 @@
 
         self._non_linear_transformation()
         self.progress_bar.update()
-
-        self._anti_select_columns()
-        self.progress_bar.update()
-
+
         return self.data, self.excluded_cols, self.target_label, self.data_transform_dict
 
     def _extract_list(self,

@@ -324,12 +349,16 @@
         if len(categorical_columns) != 0:
 
             obj = CategoricalSummary(data=self.data,
-                                     target_columns=categorical_columns
+                                     target_columns=categorical_columns,
+                                     volatile=self.volatile,
+                                     persist=self.persist)
 
             gfc_out = GetFutileColumns(data=self.data,
                                        object=obj,
                                        category_summary_column="ColumnName",
-                                       threshold_value =0.7
+                                       threshold_value =0.7,
+                                       volatile=self.volatile,
+                                       persist=self.persist)
 
             # Extracting Futile columns
             f_cols = [row[0] for row in gfc_out.result.itertuples()]

@@ -402,10 +431,22 @@
             "accumulate" : accumulate_columns,
             "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not self.volatile and not self.persist:
+            convertto_params["display_table_name"] = False
+
+        # Setting persist to False if volatile is True
+        if self.volatile:
+            convertto_params["persist"] = False
+            convertto_params["volatile"] = True
+
         # returning dataset after performing string manipulation
         self.data = ConvertTo(**convertto_params).result
-
-
+
+        # IF volatile is False and persist is False
+        if not self.volatile and not self.persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         return new_date_components
 
     def _handle_date_columns_helper(self):
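The block around ConvertTo above is the pattern this release threads through nearly every fit/transform call in the file: a volatile request overrides persist, and only when neither flag is set is the interim table's auto-generated name suppressed and the table registered with the garbage collector. A condensed, hypothetical helper expressing the same decision table (resolve_output_options is illustrative, not part of teradataml):

    def resolve_output_options(params, volatile, persist):
        # Neither flag set: the table is interim; hide its auto-generated
        # name and leave it to the garbage collector at session end.
        if not volatile and not persist:
            params["display_table_name"] = False
        # volatile wins over persist: results land in a volatile table
        # that the database discards on its own.
        if volatile:
            params["volatile"] = True
            params["persist"] = False
        return params

    opts = resolve_output_options({"persist": True}, volatile=True, persist=False)
    assert opts == {"persist": False, "volatile": True}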
@@ -526,7 +567,9 @@
         self.data = self.data.dropna(subset=[self.target_column])
 
         obj = ColumnSummary(data=self.data,
-                            target_columns=self.data.columns
+                            target_columns=self.data.columns,
+                            volatile=self.volatile,
+                            persist=self.persist)
 
         cols_miss_val={}
         # Iterating over each row in the column summary result

@@ -697,12 +740,16 @@
 
         fit_obj = SimpleImputeFit(data=self.data,
                                   stats_columns=col_stat,
-                                  stats=stat
+                                  stats=stat,
+                                  volatile=self.volatile,
+                                  persist=self.persist)
 
         # Storing fit object for imputation in data transform dictionary
         self.data_transform_dict['imputation_fit_object'] = fit_obj.output
         sm = SimpleImputeTransform(data=self.data,
-
+                                   object=fit_obj,
+                                   volatile=self.volatile,
+                                   persist=self.persist)
 
         self.data = sm.result
         self._display_msg(msg="Sample of dataset after Imputation:",

@@ -735,6 +782,8 @@
         drop_col_ind = missing_handling_param.get("DroppingColumnIndicator", False)
         drop_row_ind = missing_handling_param.get("DroppingRowIndicator", False)
         impute_ind = missing_handling_param.get("ImputeMissingIndicator", False)
+        volatile = missing_handling_param.pop("volatile", False)
+        persist = missing_handling_param.pop("persist", False)
         # Checking for user input if all methods indicator are false or not
         if not any([drop_col_ind, drop_row_ind, impute_ind]):
             self._display_msg(inline_msg="No method information provided for performing customized missing value handling. \

@@ -791,7 +840,9 @@
             "stats_columns" : stat_list,
             "stats" : stat_method,
             "literals_columns" : literal_list,
-            "literals" : literal_value
+            "literals" : literal_value,
+            "volatile" : volatile,
+            "persist" : persist
         }
         # Fitting on dataset
         fit_obj = SimpleImputeFit(**fit_param)

@@ -804,10 +855,18 @@
             "object" : fit_obj.output,
             "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_param["display_table_name"] = False
+
+        if volatile:
+            transform_param["volatile"] = True
+            transform_param["persist"] = False
         # Updating dataset with transform result
         self.data = SimpleImputeTransform(**transform_param).result
-
-
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
                           data=self.data,
                           progress_bar=self.progress_bar)

@@ -848,6 +907,8 @@
         equal_width_bin_columns = []
         var_width_bin_list = []
         var_width_bin_columns = []
+        volatile = extracted_col.pop("volatile", False)
+        persist = extracted_col.pop("persist", False)
 
         # Checking for column present in dataset or not
         _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "BincodeParam", self.data, "df")

@@ -881,7 +942,9 @@
             "data" : self.data,
             "target_columns": equal_width_bin_columns,
             "method_type" : "Equal-Width",
-            "nbins" : bins
+            "nbins" : bins,
+            "volatile" : volatile,
+            "persist" : persist
         }
         eql_bin_code_fit = BincodeFit(**fit_params)
         # Storing fit object and column list for Equal-Width binning in data transform dictionary

@@ -894,11 +957,19 @@
             "data" : self.data,
             "object" : eql_bin_code_fit.output,
             "accumulate" : accumulate_columns,
-            "persist" : True
+            "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            eql_transform_params["display_table_name"] = False
+
+        if volatile:
+            eql_transform_params["volatile"] = True
+            eql_transform_params["persist"] = False
         self.data = BincodeTransform(**eql_transform_params).result
-
-
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
                           data=self.data,
                           progress_bar=self.progress_bar)

@@ -923,7 +994,9 @@
             "maxvalue_column" : "MaxValue",
             "label_column" : "Label",
             "method_type" : "Variable-Width",
-            "label_prefix" : "label_prefix"
+            "label_prefix" : "label_prefix",
+            "volatile" : volatile,
+            "persist" : persist
         }
         var_bin_code_fit = BincodeFit(**fit_params)
         # Storing fit object and column list for Variable-Width binning in data transform dictionary

@@ -935,11 +1008,19 @@
             "object" : var_bin_code_fit.output,
             "object_order_column" : "TD_MinValue_BINFIT",
             "accumulate" : accumulate_columns,
-            "persist" : True
+            "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            var_transform_params["display_table_name"] = False
+
+        if volatile:
+            var_transform_params["volatile"] = True
+            var_transform_params["persist"] = False
         self.data = BincodeTransform(**var_transform_params).result
-
-
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
                           data=self.data,
                           progress_bar=self.progress_bar)

@@ -963,11 +1044,13 @@
         # Storing custom string manipulation indicator in data transform dictionary
         self.data_transform_dict['custom_string_manipulation_ind'] = True
         # Fetching list required for performing operation.
-        extracted_col = self.custom_data.get("StringManipulationParam", None)
+        extracted_col = self.custom_data.get("StringManipulationParam", None).copy()
         if not extracted_col:
             self._display_msg(inline_msg="No information provided for performing string manipulation.",
                               progress_bar=self.progress_bar)
         else:
+            volatile = extracted_col.pop("volatile", False)
+            persist = extracted_col.pop("persist", False)
             # Checking for column present in dataset or not
             _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "StringManipulationParam", self.data, "df")
 
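Note the .copy() added before the pop calls above (and in the categorical-encoding and mathematical-transformation paths below): custom_data is the caller's JSON object, and popping volatile/persist off it in place would silently strip the flags from the user's input. A minimal illustration in plain Python:

    user_params = {"volatile": True, "col1": "upper"}

    # Popping from the original mutates the caller's dict.
    user_params.pop("volatile", False)
    assert "volatile" not in user_params

    # Popping from a copy leaves the caller's dict intact.
    user_params = {"volatile": True, "col1": "upper"}
    working = user_params.copy()
    volatile = working.pop("volatile", False)
    assert volatile is True and "volatile" in user_params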
@@ -980,8 +1063,9 @@
                               data=self.data,
                               progress_bar=self.progress_bar)
         else:
-            self._display_msg(inline_msg="Skipping customized string manipulation."
-
+            self._display_msg(inline_msg="Skipping customized string manipulation.",
+                              progress_bar=self.progress_bar)
+
     def _str_method_mapping(self,
                             target_col,
                             transform_val):

@@ -1010,7 +1094,11 @@
 
         # Fetching required parameters from json object
         string_operation = transform_val["StringOperation"]
-
+
+        # Setting volatile and persist parameters for performing string manipulation
+        volatile, persist = self._set_generic_parameters(func_indicator="StringManipulationIndicator",
+                                                         param_name="StringManipulationParam")
+
         # Storing general parameters for performing string transformation
         fit_params = {
             "data" : self.data,

@@ -1020,6 +1108,14 @@
             "inplace" : True,
             "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            fit_params["display_table_name"] = False
+
+        if volatile:
+            fit_params["volatile"] = True
+            fit_params["persist"] = False
+
         # Adding additional parameters based on string operation type
         if string_operation in ["StringCon", "StringTrim"]:
             string_argument = transform_val["String"]

@@ -1040,13 +1136,14 @@
 
         # returning dataset after performing string manipulation
         transform_output = StrApply(**fit_params).result
-
-
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(transform_output._table_name)
         return transform_output
 
     def _one_hot_encoding(self,
-
-
+                          one_hot_columns,
+                          unique_counts):
         """
         DESCRIPTION:
             Function performs the one hot encoding to categorcial columns/features in the dataset.

@@ -1060,12 +1157,16 @@
             unique_counts:
                 Required Argument.
                 Specifies the unique counts in the categorical columns.
-                Types: int or list of integer (int)
-
+                Types: int or list of integer (int)
         """
         # TD function will add extra column_other in onehotEncoding, so
         # initailizing this list to remove those extra columns
         drop_lst = [ele + "_other" for ele in one_hot_columns]
+
+        # Setting volatile and persist parameters for performing encoding
+        volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+                                                         param_name="CategoricalEncodingParam")
+
         # Adding fit parameters for performing encoding
         fit_params = {
             "data" : self.data,

@@ -1073,7 +1174,9 @@
             "is_input_dense" : True,
             "target_column" : one_hot_columns,
             "category_counts" : unique_counts,
-            "other_column" : "other"
+            "other_column" : "other",
+            "volatile" : volatile,
+            "persist" : persist
         }
         # Performing one hot encoding fit on target columns
         fit_obj = OneHotEncodingFit(**fit_params)

@@ -1089,10 +1192,21 @@
             "is_input_dense" : True,
             "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+
+        # Setting persist to False if volatile is True
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
+
         # Performing one hot encoding transformation
         transform_output = OneHotEncodingTransform(**transform_params).result
-
-
+
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(transform_output._table_name)
         self.data = transform_output.drop(drop_lst, axis=1)
 
     def _ordinal_encoding(self,

@@ -1107,10 +1221,16 @@
                 Specifies the categorical columns for which ordinal encoding will be performed.
                 Types: str or list of strings (str)
         """
+        # Setting volatile and persist parameters for performing encoding
+        volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+                                                         param_name="CategoricalEncodingParam")
+
         # Adding fit parameters for performing encoding
         fit_params = {
             "data" : self.data,
-            "target_column" : ordinal_columns
+            "target_column" : ordinal_columns,
+            "volatile" : volatile,
+            "persist" : persist
         }
         # Performing ordinal encoding fit on target columns
         ord_fit_obj = OrdinalEncodingFit(**fit_params)

@@ -1130,17 +1250,27 @@
             "accumulate" : accumulate_columns,
             "persist" : True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+
+        # Setting persist to False if volatile is True
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
         # Performing ordinal encoding transformation
         self.data = OrdinalEncodingTransform(**transform_params).result
-
-
+
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
         if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
             self.target_label = ord_fit_obj
 
 
     def _target_encoding(self,
-
+                         target_encoding_list):
         """
         DESCRIPTION:
             Function performs the target encoding to categorcial columns/features in the dataset.

@@ -1165,6 +1295,11 @@
         # Storing indicator and fit object for target encoding in data transform dictionary
         self.data_transform_dict["custom_target_encoding_ind"] = True
         self.data_transform_dict["custom_target_encoding_fit_obj"] = {}
+
+        # Setting volatile and persist parameters for performing encoding
+        volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+                                                         param_name="CategoricalEncodingParam")
+
         # Fetching required argument for performing target encoding
         for col,transform_val in target_encoding_list.items():
             encoder_method = transform_val["encoder_method"]

@@ -1175,7 +1310,9 @@
                 "category_data" : category_data,
                 "encoder_method" : encoder_method,
                 "target_columns" : col,
-                "response_column" : response_column
+                "response_column" : response_column,
+                "volatile" : volatile,
+                "persist" : persist
             }
             if encoder_method == "CBM_DIRICHLET":
                 num_distinct_responses=transform_val["num_distinct_responses"]

@@ -1184,7 +1321,7 @@
             # Performing target encoding fit on target columns
             tar_fit_obj = TargetEncodingFit(**fit_params)
             # Storing each column fit object for target encoding in data transform dictionary
-            self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj})
+            self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj.result})
             # Extracting accumulate columns
             accumulate_columns = self._extract_list(self.data.columns, [col])
             # Adding transform parameters for performing encoding

@@ -1192,12 +1329,21 @@
                 "data" : self.data,
                 "object" : tar_fit_obj,
                 "accumulate" : accumulate_columns,
-                "persist" : True
+                "persist" : True
             }
+
+            # Disabling display table name if persist is True by default
+            if not volatile and not persist:
+                transform_params["display_table_name"] = False
+
+            if volatile:
+                transform_params["volatile"] = True
+                transform_params["persist"] = False
             # Performing ordinal encoding transformation
             self.data = TargetEncodingTransform(**transform_params).result
-
-
+            if not volatile and not persist:
+                # Adding transformed data containing table to garbage collector
+                GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
     def _encoding_categorical_columns(self):
         """
@@ -1262,8 +1408,10 @@
         # Storing custom categorical encoding indicator in data transform dictionary
         self.data_transform_dict["custom_categorical_encoding_ind"] = True
         # Fetching user input list for performing
-        encoding_list = self.custom_data.get("CategoricalEncodingParam", None)
+        encoding_list = self.custom_data.get("CategoricalEncodingParam", None).copy()
         if encoding_list:
+            volatile = encoding_list.pop("volatile", False)
+            persist = encoding_list.pop("persist", False)
             onehot_encode_ind = encoding_list.get("OneHotEncodingIndicator", False)
             ordinal_encode_ind = encoding_list.get("OrdinalEncodingIndicator", False)
             target_encode_ind = encoding_list.get("TargetEncodingIndicator", False)

@@ -1340,11 +1488,25 @@
         """
         DESCRIPTION:
             Function to perform different numerical transformations using NumApply on numerical features based on user input.
-
+
+        PARAMETERS:
+            target_col:
+                Required Argument.
+                Specifies the numerical column for which transformation will be performed.
+                Types: str
+
+            transform_val:
+                Required Argument.
+                Specifies different parameter require for applying numerical transformation.
+                Types: dict
         """
         # Fetching columns for accumulation
         accumulate_columns = self._extract_list(self.data.columns, [target_col])
         apply_method = transform_val["apply_method"]
+
+        # Setting volatile and persist parameters for performing transformation
+        volatile, persist = self._set_generic_parameters(func_indicator="MathameticalTransformationIndicator",
+                                                         param_name="MathameticalTransformationParam")
         # Adding fit parameters for performing transformation
         fit_params={
             "data": self.data,

@@ -1354,17 +1516,25 @@
             "persist" :True,
             "accumulate" : accumulate_columns
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            fit_params["display_table_name"] = False
+
+        if volatile:
+            fit_params["volatile"] = True
+            fit_params["persist"] = False
         # Adding addition details for fit parameters in case of SIGMOID transformation
         if apply_method == "sigmoid":
             sigmoid_style=transform_val["sigmoid_style"]
             fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
         # Performing transformation on target columns
         transform_output = NumApply(**fit_params).result
-
-
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(transform_output._table_name)
         return transform_output
 
-    def _numerical_transformation(self, target_columns, num_transform_data):
+    def _numerical_transformation(self, target_columns, num_transform_data, volatile, persist):
         """
         DESCRIPTION:
             Function to perform different numerical transformations using Fit and Transform on numerical features based on user input.

@@ -1374,7 +1544,9 @@
         fit_params={
             "data" : self.data,
             "object" : num_transform_data,
-            "object_order_column" : "TargetColumn"
+            "object_order_column" : "TargetColumn",
+            "volatile" : volatile,
+            "persist" : persist
         }
         # Peforming fit with all arguments.
         num_fit_obj = Fit(**fit_params)

@@ -1392,10 +1564,18 @@
             "id_columns" : id_columns,
             "persist" :True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
         # Peforming transformation on target columns
         self.data = Transform(**transform_params).result
-
-
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
                           data=self.data,
                           progress_bar=self.progress_bar)

@@ -1414,8 +1594,11 @@
         # Checking user input for mathematical transformations
         if mat_transform_input:
             # Extracting list required for mathematical transformations
-            mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None)
+            mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None).copy()
+
             if mat_transform_list:
+                volatile = mat_transform_list.pop("volatile", False)
+                persist = mat_transform_list.pop("persist", False)
                 # Checking for column present in dataset or not
                 _Validators._validate_dataframe_has_argument_columns(list(mat_transform_list.keys()),
                                                                      "MathameticalTransformationParam", self.data, "df")

@@ -1459,7 +1642,7 @@
                 copy_to_sql(df=transform_data, table_name="automl_num_transform_data", temporary=True)
                 num_transform_data = DataFrame.from_table("automl_num_transform_data")
                 # Applying transformation using Fit/Transform functions
-                self._numerical_transformation(target_columns, num_transform_data)
+                self._numerical_transformation(target_columns, num_transform_data, volatile, persist)
                 # Storing custom numerical transformation parameters and column list in data transform dictionary
                 self.data_transform_dict['custom_numerical_transformation_col'] = target_columns
                 self.data_transform_dict['custom_numerical_transformation_params'] = num_transform_data

@@ -1485,6 +1668,8 @@
         nl_transform_list = self.custom_data.get("NonLinearTransformationParam", None)
         # Extracting list required for non-linear transformation
         if nl_transform_list:
+            volatile = nl_transform_list.pop("volatile", False)
+            persist = nl_transform_list.pop("persist", False)
             total_combination = len(nl_transform_list)
             # Generating all possible combination names
             possible_combination = ["Combination_"+str(counter) for counter in range(1,total_combination+1)]

@@ -1511,12 +1696,14 @@
                 "data" : self.data,
                 "target_columns" : target_columns,
                 "formula" : formula,
-                "result_column" : result_column
+                "result_column" : result_column,
+                "volatile" : volatile,
+                "persist" : persist
             }
             # Performing fit on dataset
             fit_obj = NonLinearCombineFit(**fit_param)
             # Updating it for each non-linear combination
-            self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj})
+            self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj.result})
             # Adding transform params for transformation
             transform_params = {
                 "data" : self.data,

@@ -1524,9 +1711,18 @@
                 "accumulate" : self.data.columns,
                 "persist" : True
             }
+            # Disabling display table name if persist is True by default
+            if not volatile and not persist:
+                transform_params["display_table_name"] = False
+
+            if volatile:
+                transform_params["volatile"] = True
+                transform_params["persist"] = False
             self.data = NonLinearCombineTransform(**transform_params).result
-
-
+
+            if not volatile and not persist:
+                # Adding transformed data containing table to garbage collector
+                GarbageCollector._add_to_garbagecollector(self.data._table_name)
         else:
             self._display_msg(inline_msg="Combinations are not as per expectation.",
                               progress_bar=self.progress_bar)
@@ -1552,29 +1748,64 @@
         anti_select_input = self.custom_data.get("AntiselectIndicator", False)
         # Checking user input for anti-select columns
         if anti_select_input:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            anti_select_params = self.custom_data.get("AntiselectParam", None)
+            if anti_select_params:
+                # Extracting list required for anti-select columns
+                anti_select_list = anti_select_params.get("excluded_columns", None)
+                volatile = anti_select_params.get("volatile", False)
+                persist = anti_select_params.get("persist", False)
+                if(anti_select_list):
+                    if all(item in self.data.columns for item in anti_select_list):
+                        # Storing custom anti-select columns indicator and column list in data transform dictionary
+                        self.data_transform_dict['custom_anti_select_columns_ind'] = True
+                        self.data_transform_dict['custom_anti_select_columns'] = anti_select_list
+                        fit_params = {
+                            "data" : self.data,
+                            "exclude" : anti_select_list,
+                            "volatile" : volatile,
+                            "persist" : persist
+                        }
+                        # Performing transformation for given user input
+                        self.data = Antiselect(**fit_params).result
+                        self._display_msg(msg="Updated dataset sample after performing anti-select columns:",
+                                          data=self.data,
+                                          progress_bar=self.progress_bar)
+                    else:
+                        self._display_msg(msg="Columns provided in list are not present in dataset:",
+                                          col_lst=anti_select_list,
+                                          progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No information provided for performing anti-select columns operation.",
                                   progress_bar=self.progress_bar)
         else:
             self._display_msg(inline_msg="Skipping customized anti-select columns.",
-                              progress_bar=self.progress_bar)
+                              progress_bar=self.progress_bar)
+
+    def _set_generic_parameters(self,
+                                func_indicator=None,
+                                param_name=None):
+        """
+        DESCRIPTION:
+            Function to set generic parameters.
+
+        PARAMETERS:
+            func_indicator:
+                Optional Argument.
+                Specifies the name of function indicator.
+                Types: str
+
+            param_name:
+                Optional Argument.
+                Specifies the name of the param which contains generic parameters.
+                Types: str
+
+        RETURNS:
+            Tuple containing volatile and persist parameters.
+        """
+        volatile = self.volatile
+        persist = self.persist
+        if self.custom_data is not None and self.custom_data.get(func_indicator, False):
+            volatile = self.custom_data[param_name].get("volatile", False)
+            persist = self.custom_data[param_name].get("persist", False)
+
+        return (volatile, persist)