teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
This release of teradataml has been flagged as potentially problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
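
The rest of this page appears to be the diff of teradataml/automl/feature_engineering.py (the _FeatureEngineering class, listed above with +325 -86). Its recurring change threads two new interim-table controls, volatile and persist, through every feature-engineering step and registers non-persisted interim tables with the garbage collector; a hedged sketch of that pattern follows the diff. Note that the original indentation of the Python source is not preserved in this rendering.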
@@ -40,6 +40,7 @@ from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.dataframe.sql_functions import case
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.utils.validators import _Validators
+from teradataml.common.utils import UtilFuncs
 
 
 class _FeatureEngineering:
@@ -50,7 +51,8 @@ class _FeatureEngineering:
 model_list,
 verbose = 0,
 task_type = "Regression",
-custom_data = None
+custom_data = None,
+**kwargs):
 """
 DESCRIPTION:
 Function initializes the data, target column and columns datatypes
@@ -94,6 +96,28 @@ class _FeatureEngineering:
 Optional Argument.
 Specifies json object containing user customized input.
 Types: json object
+
+**kwargs:
+Specifies the additional arguments for feature engineering. Below
+are the additional arguments:
+volatile:
+Optional Argument.
+Specifies whether to put the interim results of the
+functions in a volatile table or not. When set to
+True, results are stored in a volatile table,
+otherwise not.
+Default Value: False
+Types: bool
+
+persist:
+Optional Argument.
+Specifies whether to persist the interim results of the
+functions in a table or not. When set to True,
+results are persisted in a table; otherwise,
+results are garbage collected at the end of the
+session.
+Default Value: False
+Types: bool
 """
 # Instance variables
 self.data = data
@@ -108,6 +132,8 @@ class _FeatureEngineering:
 self.data_transform_dict = {}
 self.one_hot_obj_count = 0
 self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
+self.volatile = kwargs.get('volatile', False)
+self.persist = kwargs.get('persist', False)
 
 # Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
 def feature_engineering(self,
@@ -133,7 +159,7 @@ class _FeatureEngineering:
 second element represents list of columns which are not participating in outlier tranformation.
 """
 # Assigning number of base jobs for progress bar.
-base_jobs =
+base_jobs = 13 if auto else 17
 
 # Updating model list based on distinct value of target column for classification type
 if self.is_classification_type():
@@ -183,9 +209,12 @@ class _FeatureEngineering:
 self._remove_duplicate_rows()
 self.progress_bar.update()
 
+self._anti_select_columns()
+self.progress_bar.update()
+
 self._remove_futile_columns()
 self.progress_bar.update()
-
+
 self._handle_date_columns()
 self.progress_bar.update()
 
@@ -206,10 +235,7 @@ class _FeatureEngineering:
 
 self._non_linear_transformation()
 self.progress_bar.update()
-
-self._anti_select_columns()
-self.progress_bar.update()
-
+
 return self.data, self.excluded_cols, self.target_label, self.data_transform_dict
 
 def _extract_list(self,
@@ -248,7 +274,7 @@ class _FeatureEngineering:
 show_data=True)
 start_time = time.time()
 rows = self.data.shape[0]
-self.data=self.data.drop_duplicate()
+self.data=self.data.drop_duplicate(self.data.columns)
 if rows != self.data.shape[0]:
 self._display_msg(msg=f'Updated dataset sample after removing {rows-self.data.shape[0]} duplicate records:',
 data=self.data,
@@ -324,12 +350,16 @@ class _FeatureEngineering:
 if len(categorical_columns) != 0:
 
 obj = CategoricalSummary(data=self.data,
-target_columns=categorical_columns
+target_columns=categorical_columns,
+volatile=self.volatile,
+persist=self.persist)
 
 gfc_out = GetFutileColumns(data=self.data,
 object=obj,
 category_summary_column="ColumnName",
-threshold_value =0.7
+threshold_value =0.7,
+volatile=self.volatile,
+persist=self.persist)
 
 # Extracting Futile columns
 f_cols = [row[0] for row in gfc_out.result.itertuples()]
@@ -402,10 +432,22 @@ class _FeatureEngineering:
 "accumulate" : accumulate_columns,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not self.volatile and not self.persist:
+convertto_params["display_table_name"] = False
+
+# Setting persist to False if volatile is True
+if self.volatile:
+convertto_params["persist"] = False
+convertto_params["volatile"] = True
+
 # returning dataset after performing string manipulation
 self.data = ConvertTo(**convertto_params).result
-
-
+
+# IF volatile is False and persist is False
+if not self.volatile and not self.persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 return new_date_components
 
 def _handle_date_columns_helper(self):
@@ -524,9 +566,18 @@ class _FeatureEngineering:
 
 # Removing rows with missing target column value
 self.data = self.data.dropna(subset=[self.target_column])
+
+params = {
+"data": self.data,
+"target_columns": self.data.columns,
+"persist": True,
+"display_table_name": False
+}
 
-obj = ColumnSummary(
-
+obj = ColumnSummary(**params)
+
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(obj.result._table_name)
 
 cols_miss_val={}
 # Iterating over each row in the column summary result
@@ -661,7 +712,7 @@ class _FeatureEngineering:
 for key, val in self.imputation_cols.items():
 
 col_stat.append(key)
-if self.data_types[key] in ['float', 'int']:
+if self.data_types[key] in ['float', 'int', 'decimal.Decimal']:
 val = skew_data[f'skew_{key}']
 # Median imputation method, if abs(skewness value) > 1
 if abs(val) > 1:
@@ -670,7 +721,7 @@ class _FeatureEngineering:
 else:
 stat.append('mean')
 # Mode imputation method, if categorical column
-
+elif self.data_types[key] in ['str']:
 stat.append('mode')
 
 self._display_msg(msg="Columns with their imputation method:",
@@ -697,12 +748,16 @@ class _FeatureEngineering:
 
 fit_obj = SimpleImputeFit(data=self.data,
 stats_columns=col_stat,
-stats=stat
+stats=stat,
+volatile=self.volatile,
+persist=self.persist)
 
 # Storing fit object for imputation in data transform dictionary
 self.data_transform_dict['imputation_fit_object'] = fit_obj.output
 sm = SimpleImputeTransform(data=self.data,
-
+object=fit_obj,
+volatile=self.volatile,
+persist=self.persist)
 
 self.data = sm.result
 self._display_msg(msg="Sample of dataset after Imputation:",
@@ -735,6 +790,8 @@ class _FeatureEngineering:
 drop_col_ind = missing_handling_param.get("DroppingColumnIndicator", False)
 drop_row_ind = missing_handling_param.get("DroppingRowIndicator", False)
 impute_ind = missing_handling_param.get("ImputeMissingIndicator", False)
+volatile = missing_handling_param.pop("volatile", False)
+persist = missing_handling_param.pop("persist", False)
 # Checking for user input if all methods indicator are false or not
 if not any([drop_col_ind, drop_row_ind, impute_ind]):
 self._display_msg(inline_msg="No method information provided for performing customized missing value handling. \
@@ -791,7 +848,9 @@ class _FeatureEngineering:
 "stats_columns" : stat_list,
 "stats" : stat_method,
 "literals_columns" : literal_list,
-"literals" : literal_value
+"literals" : literal_value,
+"volatile" : volatile,
+"persist" : persist
 }
 # Fitting on dataset
 fit_obj = SimpleImputeFit(**fit_param)
@@ -804,10 +863,18 @@ class _FeatureEngineering:
 "object" : fit_obj.output,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_param["display_table_name"] = False
+
+if volatile:
+transform_param["volatile"] = True
+transform_param["persist"] = False
 # Updating dataset with transform result
 self.data = SimpleImputeTransform(**transform_param).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -848,6 +915,8 @@ class _FeatureEngineering:
 equal_width_bin_columns = []
 var_width_bin_list = []
 var_width_bin_columns = []
+volatile = extracted_col.pop("volatile", False)
+persist = extracted_col.pop("persist", False)
 
 # Checking for column present in dataset or not
 _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "BincodeParam", self.data, "df")
@@ -881,7 +950,9 @@ class _FeatureEngineering:
 "data" : self.data,
 "target_columns": equal_width_bin_columns,
 "method_type" : "Equal-Width",
-"nbins" : bins
+"nbins" : bins,
+"volatile" : volatile,
+"persist" : persist
 }
 eql_bin_code_fit = BincodeFit(**fit_params)
 # Storing fit object and column list for Equal-Width binning in data transform dictionary
@@ -894,11 +965,19 @@ class _FeatureEngineering:
 "data" : self.data,
 "object" : eql_bin_code_fit.output,
 "accumulate" : accumulate_columns,
-"persist" : True
+"persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+eql_transform_params["display_table_name"] = False
+
+if volatile:
+eql_transform_params["volatile"] = True
+eql_transform_params["persist"] = False
 self.data = BincodeTransform(**eql_transform_params).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -923,7 +1002,9 @@ class _FeatureEngineering:
 "maxvalue_column" : "MaxValue",
 "label_column" : "Label",
 "method_type" : "Variable-Width",
-"label_prefix" : "label_prefix"
+"label_prefix" : "label_prefix",
+"volatile" : volatile,
+"persist" : persist
 }
 var_bin_code_fit = BincodeFit(**fit_params)
 # Storing fit object and column list for Variable-Width binning in data transform dictionary
@@ -935,11 +1016,19 @@ class _FeatureEngineering:
 "object" : var_bin_code_fit.output,
 "object_order_column" : "TD_MinValue_BINFIT",
 "accumulate" : accumulate_columns,
-"persist" : True
+"persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+var_transform_params["display_table_name"] = False
+
+if volatile:
+var_transform_params["volatile"] = True
+var_transform_params["persist"] = False
 self.data = BincodeTransform(**var_transform_params).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -963,11 +1052,13 @@ class _FeatureEngineering:
 # Storing custom string manipulation indicator in data transform dictionary
 self.data_transform_dict['custom_string_manipulation_ind'] = True
 # Fetching list required for performing operation.
-extracted_col = self.custom_data.get("StringManipulationParam", None)
+extracted_col = self.custom_data.get("StringManipulationParam", None).copy()
 if not extracted_col:
 self._display_msg(inline_msg="No information provided for performing string manipulation.",
 progress_bar=self.progress_bar)
 else:
+volatile = extracted_col.pop("volatile", False)
+persist = extracted_col.pop("persist", False)
 # Checking for column present in dataset or not
 _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "StringManipulationParam", self.data, "df")
 
@@ -980,8 +1071,9 @@ class _FeatureEngineering:
 data=self.data,
 progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="Skipping customized string manipulation."
-
+self._display_msg(inline_msg="Skipping customized string manipulation.",
+progress_bar=self.progress_bar)
+
 def _str_method_mapping(self,
 target_col,
 transform_val):
@@ -1010,7 +1102,11 @@ class _FeatureEngineering:
 
 # Fetching required parameters from json object
 string_operation = transform_val["StringOperation"]
-
+
+# Setting volatile and persist parameters for performing string manipulation
+volatile, persist = self._set_generic_parameters(func_indicator="StringManipulationIndicator",
+param_name="StringManipulationParam")
+
 # Storing general parameters for performing string transformation
 fit_params = {
 "data" : self.data,
@@ -1020,6 +1116,14 @@ class _FeatureEngineering:
 "inplace" : True,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+fit_params["display_table_name"] = False
+
+if volatile:
+fit_params["volatile"] = True
+fit_params["persist"] = False
+
 # Adding additional parameters based on string operation type
 if string_operation in ["StringCon", "StringTrim"]:
 string_argument = transform_val["String"]
@@ -1040,13 +1144,14 @@ class _FeatureEngineering:
 
 # returning dataset after performing string manipulation
 transform_output = StrApply(**fit_params).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
 return transform_output
 
 def _one_hot_encoding(self,
-
-
+one_hot_columns,
+unique_counts):
 """
 DESCRIPTION:
 Function performs the one hot encoding to categorcial columns/features in the dataset.
@@ -1060,12 +1165,16 @@ class _FeatureEngineering:
 unique_counts:
 Required Argument.
 Specifies the unique counts in the categorical columns.
-Types: int or list of integer (int)
-
+Types: int or list of integer (int)
 """
 # TD function will add extra column_other in onehotEncoding, so
 # initailizing this list to remove those extra columns
 drop_lst = [ele + "_other" for ele in one_hot_columns]
+
+# Setting volatile and persist parameters for performing encoding
+volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+param_name="CategoricalEncodingParam")
+
 # Adding fit parameters for performing encoding
 fit_params = {
 "data" : self.data,
@@ -1073,7 +1182,9 @@ class _FeatureEngineering:
 "is_input_dense" : True,
 "target_column" : one_hot_columns,
 "category_counts" : unique_counts,
-"other_column" : "other"
+"other_column" : "other",
+"volatile" : volatile,
+"persist" : persist
 }
 # Performing one hot encoding fit on target columns
 fit_obj = OneHotEncodingFit(**fit_params)
@@ -1089,10 +1200,21 @@ class _FeatureEngineering:
 "is_input_dense" : True,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+# Setting persist to False if volatile is True
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
+
 # Performing one hot encoding transformation
 transform_output = OneHotEncodingTransform(**transform_params).result
-
-
+
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
 self.data = transform_output.drop(drop_lst, axis=1)
 
 def _ordinal_encoding(self,
@@ -1107,10 +1229,16 @@ class _FeatureEngineering:
 Specifies the categorical columns for which ordinal encoding will be performed.
 Types: str or list of strings (str)
 """
+# Setting volatile and persist parameters for performing encoding
+volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+param_name="CategoricalEncodingParam")
+
 # Adding fit parameters for performing encoding
 fit_params = {
 "data" : self.data,
-"target_column" : ordinal_columns
+"target_column" : ordinal_columns,
+"volatile" : volatile,
+"persist" : persist
 }
 # Performing ordinal encoding fit on target columns
 ord_fit_obj = OrdinalEncodingFit(**fit_params)
@@ -1130,17 +1258,27 @@ class _FeatureEngineering:
 "accumulate" : accumulate_columns,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+# Setting persist to False if volatile is True
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
 # Performing ordinal encoding transformation
 self.data = OrdinalEncodingTransform(**transform_params).result
-
-
+
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
 if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
 self.target_label = ord_fit_obj
 
 
 def _target_encoding(self,
-
+target_encoding_list):
 """
 DESCRIPTION:
 Function performs the target encoding to categorcial columns/features in the dataset.
@@ -1165,6 +1303,11 @@ class _FeatureEngineering:
 # Storing indicator and fit object for target encoding in data transform dictionary
 self.data_transform_dict["custom_target_encoding_ind"] = True
 self.data_transform_dict["custom_target_encoding_fit_obj"] = {}
+
+# Setting volatile and persist parameters for performing encoding
+volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+param_name="CategoricalEncodingParam")
+
 # Fetching required argument for performing target encoding
 for col,transform_val in target_encoding_list.items():
 encoder_method = transform_val["encoder_method"]
@@ -1175,7 +1318,9 @@ class _FeatureEngineering:
 "category_data" : category_data,
 "encoder_method" : encoder_method,
 "target_columns" : col,
-"response_column" : response_column
+"response_column" : response_column,
+"volatile" : volatile,
+"persist" : persist
 }
 if encoder_method == "CBM_DIRICHLET":
 num_distinct_responses=transform_val["num_distinct_responses"]
@@ -1184,7 +1329,7 @@ class _FeatureEngineering:
 # Performing target encoding fit on target columns
 tar_fit_obj = TargetEncodingFit(**fit_params)
 # Storing each column fit object for target encoding in data transform dictionary
-self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj})
+self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj.result})
 # Extracting accumulate columns
 accumulate_columns = self._extract_list(self.data.columns, [col])
 # Adding transform parameters for performing encoding
@@ -1192,12 +1337,21 @@ class _FeatureEngineering:
 "data" : self.data,
 "object" : tar_fit_obj,
 "accumulate" : accumulate_columns,
-"persist" : True
+"persist" : True
 }
+
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
 # Performing ordinal encoding transformation
 self.data = TargetEncodingTransform(**transform_params).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
 def _encoding_categorical_columns(self):
 """
@@ -1262,8 +1416,10 @@ class _FeatureEngineering:
 # Storing custom categorical encoding indicator in data transform dictionary
 self.data_transform_dict["custom_categorical_encoding_ind"] = True
 # Fetching user input list for performing
-encoding_list = self.custom_data.get("CategoricalEncodingParam", None)
+encoding_list = self.custom_data.get("CategoricalEncodingParam", None).copy()
 if encoding_list:
+volatile = encoding_list.pop("volatile", False)
+persist = encoding_list.pop("persist", False)
 onehot_encode_ind = encoding_list.get("OneHotEncodingIndicator", False)
 ordinal_encode_ind = encoding_list.get("OrdinalEncodingIndicator", False)
 target_encode_ind = encoding_list.get("TargetEncodingIndicator", False)
@@ -1340,11 +1496,25 @@ class _FeatureEngineering:
 """
 DESCRIPTION:
 Function to perform different numerical transformations using NumApply on numerical features based on user input.
-
+
+PARAMETERS:
+target_col:
+Required Argument.
+Specifies the numerical column for which transformation will be performed.
+Types: str
+
+transform_val:
+Required Argument.
+Specifies different parameter require for applying numerical transformation.
+Types: dict
 """
 # Fetching columns for accumulation
 accumulate_columns = self._extract_list(self.data.columns, [target_col])
 apply_method = transform_val["apply_method"]
+
+# Setting volatile and persist parameters for performing transformation
+volatile, persist = self._set_generic_parameters(func_indicator="MathameticalTransformationIndicator",
+param_name="MathameticalTransformationParam")
 # Adding fit parameters for performing transformation
 fit_params={
 "data": self.data,
@@ -1354,17 +1524,25 @@ class _FeatureEngineering:
 "persist" :True,
 "accumulate" : accumulate_columns
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+fit_params["display_table_name"] = False
+
+if volatile:
+fit_params["volatile"] = True
+fit_params["persist"] = False
 # Adding addition details for fit parameters in case of SIGMOID transformation
 if apply_method == "sigmoid":
 sigmoid_style=transform_val["sigmoid_style"]
 fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
 # Performing transformation on target columns
 transform_output = NumApply(**fit_params).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
 return transform_output
 
-def _numerical_transformation(self, target_columns, num_transform_data):
+def _numerical_transformation(self, target_columns, num_transform_data, volatile, persist):
 """
 DESCRIPTION:
 Function to perform different numerical transformations using Fit and Transform on numerical features based on user input.
@@ -1374,7 +1552,9 @@ class _FeatureEngineering:
 fit_params={
 "data" : self.data,
 "object" : num_transform_data,
-"object_order_column" : "TargetColumn"
+"object_order_column" : "TargetColumn",
+"volatile" : volatile,
+"persist" : persist
 }
 # Peforming fit with all arguments.
 num_fit_obj = Fit(**fit_params)
@@ -1392,10 +1572,18 @@ class _FeatureEngineering:
 "id_columns" : id_columns,
 "persist" :True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
 # Peforming transformation on target columns
 self.data = Transform(**transform_params).result
-
-
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -1414,8 +1602,11 @@ class _FeatureEngineering:
 # Checking user input for mathematical transformations
 if mat_transform_input:
 # Extracting list required for mathematical transformations
-mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None)
+mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None).copy()
+
 if mat_transform_list:
+volatile = mat_transform_list.pop("volatile", False)
+persist = mat_transform_list.pop("persist", False)
 # Checking for column present in dataset or not
 _Validators._validate_dataframe_has_argument_columns(list(mat_transform_list.keys()),
 "MathameticalTransformationParam", self.data, "df")
@@ -1459,7 +1650,7 @@ class _FeatureEngineering:
 copy_to_sql(df=transform_data, table_name="automl_num_transform_data", temporary=True)
 num_transform_data = DataFrame.from_table("automl_num_transform_data")
 # Applying transformation using Fit/Transform functions
-self._numerical_transformation(target_columns, num_transform_data)
+self._numerical_transformation(target_columns, num_transform_data, volatile, persist)
 # Storing custom numerical transformation parameters and column list in data transform dictionary
 self.data_transform_dict['custom_numerical_transformation_col'] = target_columns
 self.data_transform_dict['custom_numerical_transformation_params'] = num_transform_data
@@ -1485,6 +1676,8 @@ class _FeatureEngineering:
 nl_transform_list = self.custom_data.get("NonLinearTransformationParam", None)
 # Extracting list required for non-linear transformation
 if nl_transform_list:
+volatile = nl_transform_list.pop("volatile", False)
+persist = nl_transform_list.pop("persist", False)
 total_combination = len(nl_transform_list)
 # Generating all possible combination names
 possible_combination = ["Combination_"+str(counter) for counter in range(1,total_combination+1)]
@@ -1511,12 +1704,14 @@ class _FeatureEngineering:
 "data" : self.data,
 "target_columns" : target_columns,
 "formula" : formula,
-"result_column" : result_column
+"result_column" : result_column,
+"volatile" : volatile,
+"persist" : persist
 }
 # Performing fit on dataset
 fit_obj = NonLinearCombineFit(**fit_param)
 # Updating it for each non-linear combination
-self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj})
+self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj.result})
 # Adding transform params for transformation
 transform_params = {
 "data" : self.data,
@@ -1524,9 +1719,18 @@ class _FeatureEngineering:
 "accumulate" : self.data.columns,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
 self.data = NonLinearCombineTransform(**transform_params).result
-
-
+
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 else:
 self._display_msg(inline_msg="Combinations are not as per expectation.",
 progress_bar=self.progress_bar)
@@ -1552,29 +1756,64 @@ class _FeatureEngineering:
 anti_select_input = self.custom_data.get("AntiselectIndicator", False)
 # Checking user input for anti-select columns
 if anti_select_input:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+anti_select_params = self.custom_data.get("AntiselectParam", None)
+if anti_select_params:
+# Extracting list required for anti-select columns
+anti_select_list = anti_select_params.get("excluded_columns", None)
+volatile = anti_select_params.get("volatile", False)
+persist = anti_select_params.get("persist", False)
+if(anti_select_list):
+if all(item in self.data.columns for item in anti_select_list):
+# Storing custom anti-select columns indicator and column list in data transform dictionary
+self.data_transform_dict['custom_anti_select_columns_ind'] = True
+self.data_transform_dict['custom_anti_select_columns'] = anti_select_list
+fit_params = {
+"data" : self.data,
+"exclude" : anti_select_list,
+"volatile" : volatile,
+"persist" : persist
+}
+# Performing transformation for given user input
+self.data = Antiselect(**fit_params).result
+self._display_msg(msg="Updated dataset sample after performing anti-select columns:",
+data=self.data,
+progress_bar=self.progress_bar)
+else:
+self._display_msg(msg="Columns provided in list are not present in dataset:",
+col_lst=anti_select_list,
+progress_bar=self.progress_bar)
 else:
 self._display_msg(inline_msg="No information provided for performing anti-select columns operation.",
 progress_bar=self.progress_bar)
 else:
 self._display_msg(inline_msg="Skipping customized anti-select columns.",
-progress_bar=self.progress_bar)
+progress_bar=self.progress_bar)
+
+def _set_generic_parameters(self,
+func_indicator=None,
+param_name=None):
+"""
+DESCRIPTION:
+Function to set generic parameters.
+
+PARAMETERS:
+func_indicator:
+Optional Argument.
+Specifies the name of function indicator.
+Types: str
+
+param_name:
+Optional Argument.
+Specifies the name of the param which contains generic parameters.
+Types: str
+
+RETURNS:
+Tuple containing volatile and persist parameters.
+"""
+volatile = self.volatile
+persist = self.persist
+if self.custom_data is not None and self.custom_data.get(func_indicator, False):
+volatile = self.custom_data[param_name].get("volatile", False)
+persist = self.custom_data[param_name].get("persist", False)
+
+return (volatile, persist)
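
The hunks above repeat one pattern around each Fit/Transform call. The sketch below is a minimal plain-Python illustration of that pattern only: run_transform_step, its arguments, and the cleanup callback are invented names for the example, not the teradataml API, and the custom_data layout shown is an assumption apart from the "volatile" and "persist" keys themselves, which the diff pops from blocks such as StringManipulationParam, BincodeParam and CategoricalEncodingParam.

# Illustrative sketch only -- the names are invented; only the decision logic
# mirrors the diff above.
def run_transform_step(transform_func, base_params, volatile=False, persist=False,
                       register_for_cleanup=lambda obj: None):
    """Run one feature-engineering step with the new interim-table controls."""
    # The transform call starts from persist=True so the interim result is materialized.
    params = dict(base_params, persist=True)
    if not volatile and not persist:
        # Interim-only result: do not display the generated table name.
        params["display_table_name"] = False
    if volatile:
        # A volatile interim table overrides persistence.
        params["volatile"] = True
        params["persist"] = False
    result = transform_func(**params)
    if not volatile and not persist:
        # Interim-only results are handed to the garbage collector for cleanup at session end.
        register_for_cleanup(result)
    return result

# Assumed placement of the new keys inside a custom_data block; the two key
# names come from the diff, the surrounding structure does not.
custom_data = {
    "StringManipulationIndicator": True,
    "StringManipulationParam": {
        "volatile": False,   # keep interim results in permanent tables
        "persist": True,     # and do not garbage-collect them at session end
        # ... per-column string-operation entries as before ...
    },
}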