teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml has been flagged as potentially problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/automl/feature_engineering.py

@@ -24,7 +24,7 @@ from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import Antiselect
 from teradataml import BincodeFit, BincodeTransform
-from teradataml import ColumnSummary,
+from teradataml import CategoricalSummary, ColumnSummary, ConvertTo, GetFutileColumns, FillRowId
 from teradataml import Fit, Transform
 from teradataml import NonLinearCombineFit, NonLinearCombineTransform
 from teradataml import NumApply
@@ -36,6 +36,8 @@ from teradataml import TargetEncodingFit, TargetEncodingTransform
 from sqlalchemy import literal_column
 from teradatasqlalchemy import INTEGER
 from teradataml import display
+from teradataml.common.garbagecollector import GarbageCollector
+from teradataml.dataframe.sql_functions import case
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.utils.validators import _Validators

@@ -48,7 +50,8 @@ class _FeatureEngineering:
 model_list,
 verbose = 0,
 task_type = "Regression",
-custom_data = None
+custom_data = None,
+**kwargs):
 """
 DESCRIPTION:
 Function initializes the data, target column and columns datatypes
@@ -61,12 +64,12 @@ class _FeatureEngineering:
 Types: teradataml Dataframe

 target_column:
-Required
+Required Argument.
 Specifies the name of the target column in "data"..
 Types: str

 model_list:
-Required
+Required Argument.
 Specifies the list of models to be used for model training.
 Types: list

@@ -81,7 +84,7 @@ class _FeatureEngineering:
 Types: int

 task_type:
-Required
+Required Argument.
 Specifies the task type for AutoML, whether to apply regresion OR classification
 on the provived dataset.
 Default Value: "Regression"
@@ -89,9 +92,31 @@ class _FeatureEngineering:
 Types: str

 custom_data:
-Optional
+Optional Argument.
 Specifies json object containing user customized input.
 Types: json object
+
+**kwargs:
+Specifies the additional arguments for feature engineering. Below
+are the additional arguments:
+volatile:
+Optional Argument.
+Specifies whether to put the interim results of the
+functions in a volatile table or not. When set to
+True, results are stored in a volatile table,
+otherwise not.
+Default Value: False
+Types: bool
+
+persist:
+Optional Argument.
+Specifies whether to persist the interim results of the
+functions in a table or not. When set to True,
+results are persisted in a table; otherwise,
+results are garbage collected at the end of the
+session.
+Default Value: False
+Types: bool
 """
 # Instance variables
 self.data = data
@@ -106,6 +131,8 @@ class _FeatureEngineering:
 self.data_transform_dict = {}
 self.one_hot_obj_count = 0
 self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
+self.volatile = kwargs.get('volatile', False)
+self.persist = kwargs.get('persist', False)

 # Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
 def feature_engineering(self,
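The two keyword arguments introduced above are read straight off **kwargs and drive how every interim table created during feature engineering is stored. A minimal sketch of constructing the class with them, using only argument names that appear in the docstring of this diff (the table, target column, and model names are invented; _FeatureEngineering is the internal class defined in this file, not a public API):

    from teradataml import DataFrame

    train_df = DataFrame('housing_train')              # hypothetical source table
    fe = _FeatureEngineering(data=train_df,
                             target_column='price',    # hypothetical target column
                             model_list=['glm', 'xgboost'],
                             task_type='Regression',
                             volatile=True,    # interim results go to volatile tables
                             persist=False)    # and are dropped with the session
    # __init__ stores these via kwargs.get('volatile', False) and kwargs.get('persist', False).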
@@ -120,7 +147,7 @@ class _FeatureEngineering:

 PARAMETERS:
 auto:
-Optional
+Optional Argument.
 Specifies whether to run AutoML in custom mode or auto mode.
 When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
 Default Value: True
@@ -131,7 +158,7 @@ class _FeatureEngineering:
 second element represents list of columns which are not participating in outlier tranformation.
 """
 # Assigning number of base jobs for progress bar.
-base_jobs =
+base_jobs = 13 if auto else 17

 # Updating model list based on distinct value of target column for classification type
 if self.is_classification_type():
@@ -181,9 +208,12 @@ class _FeatureEngineering:
 self._remove_duplicate_rows()
 self.progress_bar.update()

+self._anti_select_columns()
+self.progress_bar.update()
+
 self._remove_futile_columns()
 self.progress_bar.update()
-
+
 self._handle_date_columns()
 self.progress_bar.update()

@@ -204,10 +234,7 @@ class _FeatureEngineering:

 self._non_linear_transformation()
 self.progress_bar.update()
-
-self._anti_select_columns()
-self.progress_bar.update()
-
+
 return self.data, self.excluded_cols, self.target_label, self.data_transform_dict

 def _extract_list(self,
@@ -255,7 +282,7 @@ class _FeatureEngineering:
 f"Remaining Columns in the data: {self.data.shape[1]}",
 progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="Analysis
+self._display_msg(inline_msg="Analysis completed. No action taken.",
 progress_bar=self.progress_bar)

 end_time = time.time()
@@ -322,18 +349,22 @@ class _FeatureEngineering:
 if len(categorical_columns) != 0:

 obj = CategoricalSummary(data=self.data,
-target_columns=categorical_columns
+target_columns=categorical_columns,
+volatile=self.volatile,
+persist=self.persist)

 gfc_out = GetFutileColumns(data=self.data,
 object=obj,
 category_summary_column="ColumnName",
-threshold_value =0.7
+threshold_value =0.7,
+volatile=self.volatile,
+persist=self.persist)

 # Extracting Futile columns
 f_cols = [row[0] for row in gfc_out.result.itertuples()]

 if len(f_cols) == 0:
-self._display_msg(inline_msg="
+self._display_msg(inline_msg="Analysis indicates all categorical columns are significant. No action Needed.",
 progress_bar=self.progress_bar)
 else:

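Both SQLE calls in this hunk now receive the volatile/persist flags. The same CategoricalSummary to GetFutileColumns chain can be run on its own; a sketch restricted to the argument names visible above (the table and column names are invented for illustration):

    from teradataml import DataFrame, CategoricalSummary, GetFutileColumns

    df = DataFrame('bank_marketing')                      # hypothetical table
    summary = CategoricalSummary(data=df,
                                 target_columns=['job', 'marital', 'education'],
                                 volatile=True)
    futile = GetFutileColumns(data=df,
                              object=summary,
                              category_summary_column='ColumnName',
                              threshold_value=0.7,
                              volatile=True)
    # Columns judged futile (dominated by a single category) by the 0.7 threshold:
    futile_cols = [row[0] for row in futile.result.itertuples()]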
@@ -350,128 +381,80 @@ class _FeatureEngineering:
 self._display_msg(msg="Total time to handle less significant features: {:.2f} sec ".format( end_time - start_time),
 progress_bar=self.progress_bar,
 show_data=True)
-
-def _handle_date_component(self,
-date_component_columns,
-date_component):

+def _fetch_date_component(self):
 """
 DESCRIPTION:
-Function to
-
-
-
-PARAMETERS:
-date_component_columns:
-Required Argument.
-Specifies the list of newly generated differnt component of date features.
-Types: list
-
-date_component:
-Required Argument.
-Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
-Types: str
-
-"""
-# Check for day
-if date_component == "D":
-prefix_value = "Day_"
-# Check for month
-elif date_component == "M":
-prefix_value = "Month_"
-# Check for year diff
-elif date_component == "Y":
-prefix_value = "Year_diff_"
-
-# Deciding bins based on distinct value of date component features.
-for col in date_component_columns:
-data_size = self.data.drop_duplicate(col).size
-if data_size < 4:
-num_bins = data_size
-else:
-num_bins = 4
-# Performing bincode for converting date component to specific labels
-fit_params = {
-"data": self.data,
-"target_columns": col,
-"method_type":"Equal-Width",
-"nbins": num_bins,
-"label_prefix" : prefix_value
-}
-bin_code_fit = BincodeFit(**fit_params)
-
-fit_params_map = {"D": "day_component_fit_object",
-"M": "month_component_fit_object",
-"Y": "year_diff_component_fit_object"}
-
-# Storing fit object for each date component in data transform dictionary
-self.data_transform_dict[fit_params_map[date_component]][col] = bin_code_fit.output
-
-accumulate_columns = self._extract_list(self.data.columns, [col])
-transform_params = {
-"data": self.data,
-"object": bin_code_fit.output,
-"accumulate": accumulate_columns,
-"persist": True
-}
-self.data = BincodeTransform(**transform_params).result
-
-def _fetch_date_component(self,
-process,
-regex_str,
-columns,
-date_component):
+Function to fetch day of week, week of month, month of quarter, quarter of year
+component from date column. Generate weekend and month half details from day of week and
+week of month columns respectively. Convert quarter of year and month of quarter
+component columns to VARCHAR.

+RETURNS:
+List of newly generated date component features.
 """
-
-
-
-
-
-
-
-
-
-
-regex_str:
-Required Argument.
-Specifies regular expression for identifying newly generated date component features.
-Types: str
-
-columns:
-Required Argument.
-Specifies list of newly generated date component features.
-Types: list
-
-date_component:
-Required Argument.
-Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
-Types: str
+# List for storing newly generated date component features
+new_date_components=[]
+# Extracting weekend, month, quarter details information from date columns
+date_component_param={}
+for col in self.date_column_list:
+# Generating new column names for extracted date components
+weekend_col = f'{col}_weekend'
+month_half_col = f'{col}_month_half'
+month_of_quarter_col=f'{col}_month_of_quarter'
+quarter_of_year_col=f'{col}_quarter_of_year'

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+date_component_param = {
+**date_component_param,
+weekend_col: case([(self.data[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
+month_half_col: case([(self.data[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
+month_of_quarter_col: self.data[col].month_of_quarter(),
+quarter_of_year_col: self.data[col].quarter_of_year()
+}
+# Storing newly generated date component month and quarter columns.
+# Skipping day of week and week of month columns as they will be used
+# later for extracting weekend and month part details.
+new_date_components.extend([weekend_col, month_half_col, month_of_quarter_col, quarter_of_year_col])
+# Adding new date component columns to dataset
+self.data=self.data.assign(**date_component_param)
+# Dropping date columns as different component columns are extracted.
+self.data = self.data.drop(self.date_column_list, axis=1)
+
+# Converting remaining component columns to VARCHAR
+# So that it will be treated as categorical columns
+remaining_component_columns = [col for col in self.data.columns if re.search('month_of_quarter|quarter_of_year'+"$", col)]
+accumulate_columns = self._extract_list(self.data.columns, remaining_component_columns)
+convertto_params = {
+"data" : self.data,
+"target_columns" : remaining_component_columns,
+"target_datatype" : ["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
+"accumulate" : accumulate_columns,
+"persist" : True
+}
+# Disabling display table name if persist is True by default
+if not self.volatile and not self.persist:
+convertto_params["display_table_name"] = False

-
+# Setting persist to False if volatile is True
+if self.volatile:
+convertto_params["persist"] = False
+convertto_params["volatile"] = True
+
+# returning dataset after performing string manipulation
+self.data = ConvertTo(**convertto_params).result
+
+# IF volatile is False and persist is False
+if not self.volatile and not self.persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
+return new_date_components

 def _handle_date_columns_helper(self):

 """
 DESCRIPTION:
-Function for dropping irrelevent date features.
-
-Passing extracted component for performing binning.
+Function for dropping irrelevent date features. Perform Extraction of different
+component from revelent date features and transform them.
 """

 # Dropping missing value for all date columns
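The rewritten _fetch_date_component replaces the old EXTRACT/literal_column logic with DataFrame column accessors and the case expression imported at the top of the file. The same pattern works outside AutoML; a sketch with an invented table that has a DATE column named order_date:

    from teradataml import DataFrame
    from teradataml.dataframe.sql_functions import case

    df = DataFrame('sales_orders')    # hypothetical table with a DATE column 'order_date'
    df = df.assign(
        # 1 and 7 are Sunday and Saturday in the day_of_week numbering used above.
        order_date_weekend=case([(df['order_date'].day_of_week().isin([1, 7]), 'yes')], else_='no'),
        order_date_month_half=case([(df['order_date'].week_of_month().isin([1, 2]), 'first_half')],
                                   else_='second_half'),
        order_date_quarter_of_year=df['order_date'].quarter_of_year())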
@@ -484,7 +467,7 @@ class _FeatureEngineering:
 # Date columns list eligible for dropping from dataset
 drop_date_cols = []

-# Checking for
+# Checking for unique valued date columns
 for col in self.date_column_list:
 if self.data.drop_duplicate(col).size == self.data.shape[0]:
 drop_date_cols.append(col)
@@ -496,46 +479,18 @@ class _FeatureEngineering:
 self._display_msg(msg='Dropping date features with all unique value:',
 col_lst = drop_date_cols,
 progress_bar=self.progress_bar)
-
-
-self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]
-
-# List for storing newly generated date component features
-new_columns=[]
+# Updated date column list after dropping irrelevant date columns
+self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]

-# Extracting day, month and year difference from date columns
 if len(self.date_column_list) != 0:

-
-
-
-day_column=str(col)+"_day_comp"
-month_column=str(col)+"_month_comp"
-year_diff_column=str(col)+"_year_diff_comp"
-new_columns.extend([day_column,month_column,year_diff_column])
-day_query=("EXTRACT(DAY FROM {0})".format(col))
-month_query=("EXTRACT(MONTH FROM {0})".format(col))
-year_query=("EXTRACT(YEAR FROM CURRENT_DATE) - EXTRACT(YEAR FROM {0})".format(col))
-component_param[day_column]=literal_column(day_query,INTEGER())
-component_param[month_column]=literal_column(month_query,INTEGER())
-component_param[year_diff_column]=literal_column(year_query,INTEGER())
-
-self.data=self.data.assign(**component_param)
-# Storing newly generated date component list along with parameters in data transform dictionary
-self.data_transform_dict['extract_date_comp_col'] = self.date_column_list
-self.data_transform_dict['extract_date_comp_param'] = component_param
-
-# Dropping date columns as we have already extracted day, month and year in new columns
-self.data = self.data.drop(self.date_column_list, axis=1)
+# List for storing newly generated date component features
+new_columns=self._fetch_date_component()
 self._display_msg(msg='List of newly generated features from existing date features:',
 col_lst=new_columns,
 progress_bar=self.progress_bar)
-
-data=self.data,
-progress_bar=self.progress_bar)
-
+# Dropping columns with all unique values or single value
 drop_cols=[]
-
 for col in new_columns:
 distinct_rows = self.data.drop_duplicate(col).size
 if distinct_rows == self.data.shape[0]:
|
|
|
555
510
|
self.data = self.data.drop(drop_cols, axis=1)
|
|
556
511
|
# Storing extract date component list for drop in data transform dictionary
|
|
557
512
|
self.data_transform_dict['drop_extract_date_columns'] = drop_cols
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
new_columns = [item for item in new_columns if item not in drop_cols]
|
|
513
|
+
# Extracting all newly generated columns
|
|
514
|
+
new_columns = [item for item in new_columns if item not in drop_cols]
|
|
561
515
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
'day_component_fit_object': {},
|
|
565
|
-
'month_component_fit_object': {},
|
|
566
|
-
'year_diff_component_fit_object': {}}
|
|
567
|
-
# Grouping date components based on types i.e., day, month, and year_diff for performing binning
|
|
568
|
-
if len(new_columns) != 0:
|
|
569
|
-
self.day_columns = self._fetch_date_component("day", "_day_comp", new_columns, "D")
|
|
570
|
-
self.month_columns = self._fetch_date_component("month", "_month_comp", new_columns, "M")
|
|
571
|
-
self.year_diff_columns = self._fetch_date_component("year_diff", "_year_diff_comp", new_columns, "Y")
|
|
572
|
-
self._display_msg(inline_msg="No useful date component found",
|
|
516
|
+
self._display_msg(msg='Updated list of newly generated features from existing date features :',
|
|
517
|
+
col_lst=new_columns,
|
|
573
518
|
progress_bar=self.progress_bar)
|
|
574
519
|
|
|
575
520
|
self._display_msg(msg='Updated dataset sample after handling date features:',
|
|
@@ -595,7 +540,7 @@ class _FeatureEngineering:
|
|
|
595
540
|
if d_type in ["datetime.date","datetime.datetime"]]
|
|
596
541
|
|
|
597
542
|
if len(self.date_column_list) == 0:
|
|
598
|
-
self._display_msg(inline_msg="Dataset does not contain any feature related to dates.",
|
|
543
|
+
self._display_msg(inline_msg="Analysis Completed. Dataset does not contain any feature related to dates. No action needed.",
|
|
599
544
|
progress_bar=self.progress_bar)
|
|
600
545
|
else:
|
|
601
546
|
# Storing date column list in data transform dictionary
|
|
@@ -622,8 +567,9 @@ class _FeatureEngineering:
|
|
|
622
567
|
self.data = self.data.dropna(subset=[self.target_column])
|
|
623
568
|
|
|
624
569
|
obj = ColumnSummary(data=self.data,
|
|
625
|
-
target_columns=self.data.columns,
|
|
626
|
-
volatile=
|
|
570
|
+
target_columns=self.data.columns,
|
|
571
|
+
volatile=self.volatile,
|
|
572
|
+
persist=self.persist)
|
|
627
573
|
|
|
628
574
|
cols_miss_val={}
|
|
629
575
|
# Iterating over each row in the column summary result
|
|
@@ -705,11 +651,15 @@ class _FeatureEngineering:
|
|
|
705
651
|
self.data_transform_dict['imputation_columns'] = self.imputation_cols
|
|
706
652
|
|
|
707
653
|
if len(delete_rows) != 0:
|
|
654
|
+
rows = self.data.shape[0]
|
|
708
655
|
self.data = self.data.dropna(subset=delete_rows)
|
|
709
656
|
msg_val_found=1
|
|
710
657
|
self._display_msg(msg='Deleting rows of these columns for handling missing values:',
|
|
711
658
|
col_lst=delete_rows,
|
|
712
659
|
progress_bar=self.progress_bar)
|
|
660
|
+
self._display_msg(msg=f'Sample of dataset after removing {rows-self.data.shape[0]} rows:',
|
|
661
|
+
data=self.data,
|
|
662
|
+
progress_bar=self.progress_bar)
|
|
713
663
|
|
|
714
664
|
if len(drop_cols) != 0:
|
|
715
665
|
self.data = self.data.drop(drop_cols, axis=1)
|
|
@@ -719,9 +669,12 @@ class _FeatureEngineering:
 self._display_msg(msg='Dropping these columns for handling missing values:',
 col_lst=drop_cols,
 progress_bar=self.progress_bar)
+self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
+data=self.data,
+progress_bar=self.progress_bar)

 if len(self.imputation_cols) == 0 and msg_val_found ==0:
-self._display_msg(inline_msg="No Missing Values Detected.",
+self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
 progress_bar=self.progress_bar)

 end_time = time.time()
@@ -787,21 +740,23 @@ class _FeatureEngineering:

 fit_obj = SimpleImputeFit(data=self.data,
 stats_columns=col_stat,
-stats=stat,
-volatile=
+stats=stat,
+volatile=self.volatile,
+persist=self.persist)

 # Storing fit object for imputation in data transform dictionary
 self.data_transform_dict['imputation_fit_object'] = fit_obj.output
 sm = SimpleImputeTransform(data=self.data,
-
-
+object=fit_obj,
+volatile=self.volatile,
+persist=self.persist)

 self.data = sm.result
-self._display_msg(msg="Sample of
+self._display_msg(msg="Sample of dataset after Imputation:",
 data=self.data,
 progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="No imputation
+self._display_msg(inline_msg="Analysis completed. No imputation required.",
 progress_bar=self.progress_bar)

 end_time = time.time()
@@ -827,6 +782,8 @@ class _FeatureEngineering:
 drop_col_ind = missing_handling_param.get("DroppingColumnIndicator", False)
 drop_row_ind = missing_handling_param.get("DroppingRowIndicator", False)
 impute_ind = missing_handling_param.get("ImputeMissingIndicator", False)
+volatile = missing_handling_param.pop("volatile", False)
+persist = missing_handling_param.pop("persist", False)
 # Checking for user input if all methods indicator are false or not
 if not any([drop_col_ind, drop_row_ind, impute_ind]):
 self._display_msg(inline_msg="No method information provided for performing customized missing value handling. \
@@ -883,7 +840,9 @@ class _FeatureEngineering:
 "stats_columns" : stat_list,
 "stats" : stat_method,
 "literals_columns" : literal_list,
-"literals" : literal_value
+"literals" : literal_value,
+"volatile" : volatile,
+"persist" : persist
 }
 # Fitting on dataset
 fit_obj = SimpleImputeFit(**fit_param)
@@ -896,8 +855,18 @@ class _FeatureEngineering:
 "object" : fit_obj.output,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_param["display_table_name"] = False
+
+if volatile:
+transform_param["volatile"] = True
+transform_param["persist"] = False
 # Updating dataset with transform result
 self.data = SimpleImputeTransform(**transform_param).result
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
 data=self.data,
 progress_bar=self.progress_bar)
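The display_table_name / volatile / persist / GarbageCollector block above reappears after nearly every Fit and Transform call in this file. A condensed sketch of the shared bookkeeping, with an invented helper name (the package inlines this logic instead of factoring it out):

    def _apply_table_options(params, volatile, persist):
        # Interim result (neither persisted nor volatile): hide the auto-generated table name.
        if not volatile and not persist:
            params['display_table_name'] = False
        # Volatile tables vanish with the session, so never also persist them.
        if volatile:
            params['volatile'] = True
            params['persist'] = False
        return params

    # After the transform runs, interim tables are registered for cleanup at session end:
    #     GarbageCollector._add_to_garbagecollector(result._table_name)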
@@ -938,6 +907,8 @@ class _FeatureEngineering:
 equal_width_bin_columns = []
 var_width_bin_list = []
 var_width_bin_columns = []
+volatile = extracted_col.pop("volatile", False)
+persist = extracted_col.pop("persist", False)

 # Checking for column present in dataset or not
 _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "BincodeParam", self.data, "df")
@@ -971,7 +942,9 @@ class _FeatureEngineering:
 "data" : self.data,
 "target_columns": equal_width_bin_columns,
 "method_type" : "Equal-Width",
-"nbins" : bins
+"nbins" : bins,
+"volatile" : volatile,
+"persist" : persist
 }
 eql_bin_code_fit = BincodeFit(**fit_params)
 # Storing fit object and column list for Equal-Width binning in data transform dictionary
@@ -984,9 +957,19 @@ class _FeatureEngineering:
 "data" : self.data,
 "object" : eql_bin_code_fit.output,
 "accumulate" : accumulate_columns,
-"persist" : True
+"persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+eql_transform_params["display_table_name"] = False
+
+if volatile:
+eql_transform_params["volatile"] = True
+eql_transform_params["persist"] = False
 self.data = BincodeTransform(**eql_transform_params).result
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
 data=self.data,
 progress_bar=self.progress_bar)
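The equal-width branch wraps BincodeFit and BincodeTransform, which can also be called directly. A sketch restricted to the arguments visible in this diff (table and column names are illustrative only):

    from teradataml import DataFrame, BincodeFit, BincodeTransform

    df = DataFrame('housing_train')                 # hypothetical table
    fit = BincodeFit(data=df,
                     target_columns=['price'],      # hypothetical numeric column
                     method_type='Equal-Width',
                     nbins=4,
                     label_prefix='price_bin_')
    binned = BincodeTransform(data=df,
                              object=fit.output,
                              accumulate=[c for c in df.columns if c != 'price'],
                              persist=True).result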
@@ -1011,7 +994,9 @@ class _FeatureEngineering:
 "maxvalue_column" : "MaxValue",
 "label_column" : "Label",
 "method_type" : "Variable-Width",
-"label_prefix" : "label_prefix"
+"label_prefix" : "label_prefix",
+"volatile" : volatile,
+"persist" : persist
 }
 var_bin_code_fit = BincodeFit(**fit_params)
 # Storing fit object and column list for Variable-Width binning in data transform dictionary
@@ -1023,9 +1008,19 @@ class _FeatureEngineering:
 "object" : var_bin_code_fit.output,
 "object_order_column" : "TD_MinValue_BINFIT",
 "accumulate" : accumulate_columns,
-"persist" : True
+"persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+var_transform_params["display_table_name"] = False
+
+if volatile:
+var_transform_params["volatile"] = True
+var_transform_params["persist"] = False
 self.data = BincodeTransform(**var_transform_params).result
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -1049,11 +1044,13 @@ class _FeatureEngineering:
 # Storing custom string manipulation indicator in data transform dictionary
 self.data_transform_dict['custom_string_manipulation_ind'] = True
 # Fetching list required for performing operation.
-extracted_col = self.custom_data.get("StringManipulationParam", None)
+extracted_col = self.custom_data.get("StringManipulationParam", None).copy()
 if not extracted_col:
 self._display_msg(inline_msg="No information provided for performing string manipulation.",
 progress_bar=self.progress_bar)
 else:
+volatile = extracted_col.pop("volatile", False)
+persist = extracted_col.pop("persist", False)
 # Checking for column present in dataset or not
 _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "StringManipulationParam", self.data, "df")

@@ -1066,8 +1063,9 @@ class _FeatureEngineering:
 data=self.data,
 progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="Skipping customized string manipulation."
-
+self._display_msg(inline_msg="Skipping customized string manipulation.",
+progress_bar=self.progress_bar)
+
 def _str_method_mapping(self,
 target_col,
 transform_val):
@@ -1096,7 +1094,11 @@ class _FeatureEngineering:

 # Fetching required parameters from json object
 string_operation = transform_val["StringOperation"]
-
+
+# Setting volatile and persist parameters for performing string manipulation
+volatile, persist = self._set_generic_parameters(func_indicator="StringManipulationIndicator",
+param_name="StringManipulationParam")
+
 # Storing general parameters for performing string transformation
 fit_params = {
 "data" : self.data,
@@ -1106,6 +1108,14 @@ class _FeatureEngineering:
 "inplace" : True,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+fit_params["display_table_name"] = False
+
+if volatile:
+fit_params["volatile"] = True
+fit_params["persist"] = False
+
 # Adding additional parameters based on string operation type
 if string_operation in ["StringCon", "StringTrim"]:
 string_argument = transform_val["String"]
@@ -1125,11 +1135,15 @@ class _FeatureEngineering:
 "string_length" : string_length}

 # returning dataset after performing string manipulation
-
+transform_output = StrApply(**fit_params).result
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+return transform_output

 def _one_hot_encoding(self,
-
-
+one_hot_columns,
+unique_counts):
 """
 DESCRIPTION:
 Function performs the one hot encoding to categorcial columns/features in the dataset.
@@ -1143,12 +1157,16 @@ class _FeatureEngineering:
 unique_counts:
 Required Argument.
 Specifies the unique counts in the categorical columns.
-Types: int or list of integer (int)
-
+Types: int or list of integer (int)
 """
 # TD function will add extra column_other in onehotEncoding, so
 # initailizing this list to remove those extra columns
 drop_lst = [ele + "_other" for ele in one_hot_columns]
+
+# Setting volatile and persist parameters for performing encoding
+volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+param_name="CategoricalEncodingParam")
+
 # Adding fit parameters for performing encoding
 fit_params = {
 "data" : self.data,
@@ -1156,7 +1174,9 @@ class _FeatureEngineering:
 "is_input_dense" : True,
 "target_column" : one_hot_columns,
 "category_counts" : unique_counts,
-"other_column" : "other"
+"other_column" : "other",
+"volatile" : volatile,
+"persist" : persist
 }
 # Performing one hot encoding fit on target columns
 fit_obj = OneHotEncodingFit(**fit_params)
@@ -1172,9 +1192,22 @@ class _FeatureEngineering:
 "is_input_dense" : True,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+# Setting persist to False if volatile is True
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
+
 # Performing one hot encoding transformation
-
-
+transform_output = OneHotEncodingTransform(**transform_params).result
+
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+self.data = transform_output.drop(drop_lst, axis=1)

 def _ordinal_encoding(self,
 ordinal_columns):
@@ -1188,11 +1221,16 @@ class _FeatureEngineering:
 Specifies the categorical columns for which ordinal encoding will be performed.
 Types: str or list of strings (str)
 """
+# Setting volatile and persist parameters for performing encoding
+volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+param_name="CategoricalEncodingParam")
+
 # Adding fit parameters for performing encoding
 fit_params = {
 "data" : self.data,
 "target_column" : ordinal_columns,
-"volatile" :
+"volatile" : volatile,
+"persist" : persist
 }
 # Performing ordinal encoding fit on target columns
 ord_fit_obj = OrdinalEncodingFit(**fit_params)
@@ -1212,15 +1250,27 @@ class _FeatureEngineering:
 "accumulate" : accumulate_columns,
 "persist" : True
 }
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+# Setting persist to False if volatile is True
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
 # Performing ordinal encoding transformation
 self.data = OrdinalEncodingTransform(**transform_params).result
+
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)

 if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
 self.target_label = ord_fit_obj


 def _target_encoding(self,
-
+target_encoding_list):
 """
 DESCRIPTION:
 Function performs the target encoding to categorcial columns/features in the dataset.
@@ -1245,6 +1295,11 @@ class _FeatureEngineering:
 # Storing indicator and fit object for target encoding in data transform dictionary
 self.data_transform_dict["custom_target_encoding_ind"] = True
 self.data_transform_dict["custom_target_encoding_fit_obj"] = {}
+
+# Setting volatile and persist parameters for performing encoding
+volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+param_name="CategoricalEncodingParam")
+
 # Fetching required argument for performing target encoding
 for col,transform_val in target_encoding_list.items():
 encoder_method = transform_val["encoder_method"]
@@ -1255,7 +1310,9 @@ class _FeatureEngineering:
 "category_data" : category_data,
 "encoder_method" : encoder_method,
 "target_columns" : col,
-"response_column" : response_column
+"response_column" : response_column,
+"volatile" : volatile,
+"persist" : persist
 }
 if encoder_method == "CBM_DIRICHLET":
 num_distinct_responses=transform_val["num_distinct_responses"]
@@ -1264,7 +1321,7 @@ class _FeatureEngineering:
 # Performing target encoding fit on target columns
 tar_fit_obj = TargetEncodingFit(**fit_params)
 # Storing each column fit object for target encoding in data transform dictionary
-self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj})
+self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj.result})
 # Extracting accumulate columns
 accumulate_columns = self._extract_list(self.data.columns, [col])
 # Adding transform parameters for performing encoding
@@ -1272,10 +1329,21 @@ class _FeatureEngineering:
 "data" : self.data,
 "object" : tar_fit_obj,
 "accumulate" : accumulate_columns,
-"persist" : True
+"persist" : True
 }
+
+# Disabling display table name if persist is True by default
+if not volatile and not persist:
+transform_params["display_table_name"] = False
+
+if volatile:
+transform_params["volatile"] = True
+transform_params["persist"] = False
 # Performing ordinal encoding transformation
 self.data = TargetEncodingTransform(**transform_params).result
+if not volatile and not persist:
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)

 def _encoding_categorical_columns(self):
 """
@@ -1308,8 +1376,11 @@ class _FeatureEngineering:
 self._display_msg(msg="ONE HOT Encoding these Columns:",
 col_lst=ohe_col,
 progress_bar=self.progress_bar)
+self._display_msg(msg="Sample of dataset after performing one hot encoding:",
+data=self.data,
+progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="
+self._display_msg(inline_msg="Analysis completed. No categorical columns were found.",
 progress_bar=self.progress_bar)

 # List of columns after one hot
@@ -1337,8 +1408,10 @@ class _FeatureEngineering:
 # Storing custom categorical encoding indicator in data transform dictionary
 self.data_transform_dict["custom_categorical_encoding_ind"] = True
 # Fetching user input list for performing
-encoding_list = self.custom_data.get("CategoricalEncodingParam", None)
+encoding_list = self.custom_data.get("CategoricalEncodingParam", None).copy()
 if encoding_list:
+volatile = encoding_list.pop("volatile", False)
+persist = encoding_list.pop("persist", False)
 onehot_encode_ind = encoding_list.get("OneHotEncodingIndicator", False)
 ordinal_encode_ind = encoding_list.get("OrdinalEncodingIndicator", False)
 target_encode_ind = encoding_list.get("TargetEncodingIndicator", False)
|
|
|
1415
1488
|
"""
|
|
1416
1489
|
DESCRIPTION:
|
|
1417
1490
|
Function to perform different numerical transformations using NumApply on numerical features based on user input.
|
|
1418
|
-
|
|
1491
|
+
|
|
1492
|
+
PARAMETERS:
|
|
1493
|
+
target_col:
|
|
1494
|
+
Required Argument.
|
|
1495
|
+
Specifies the numerical column for which transformation will be performed.
|
|
1496
|
+
Types: str
|
|
1497
|
+
|
|
1498
|
+
transform_val:
|
|
1499
|
+
Required Argument.
|
|
1500
|
+
Specifies different parameter require for applying numerical transformation.
|
|
1501
|
+
Types: dict
|
|
1419
1502
|
"""
|
|
1420
1503
|
# Fetching columns for accumulation
|
|
1421
1504
|
accumulate_columns = self._extract_list(self.data.columns, [target_col])
|
|
1422
1505
|
apply_method = transform_val["apply_method"]
|
|
1506
|
+
|
|
1507
|
+
# Setting volatile and persist parameters for performing transformation
|
|
1508
|
+
volatile, persist = self._set_generic_parameters(func_indicator="MathameticalTransformationIndicator",
|
|
1509
|
+
param_name="MathameticalTransformationParam")
|
|
1423
1510
|
# Adding fit parameters for performing transformation
|
|
1424
1511
|
fit_params={
|
|
1425
1512
|
"data": self.data,
|
|
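The _set_generic_parameters helper called here is added at the end of this diff (in the @@ -1620,29 +1748,64 @@ hunk). It resolves the two flags with a simple precedence: the AutoML-level self.volatile/self.persist defaults apply unless the named indicator is enabled in custom_data, in which case the volatile/persist entries of the corresponding parameter block win, each falling back to False when absent. A standalone sketch of that resolution:

    # Sketch of the precedence implemented by _set_generic_parameters().
    def resolve_flags(default_volatile, default_persist, custom_data, indicator, param_name):
        volatile, persist = default_volatile, default_persist
        if custom_data is not None and custom_data.get(indicator, False):
            volatile = custom_data[param_name].get("volatile", False)
            persist = custom_data[param_name].get("persist", False)
        return volatile, persist

    # An instance-level persist=True is overridden once the per-function block is active;
    # keys missing from that block fall back to False rather than to the instance default.
    print(resolve_flags(False, True,
                        {"MathameticalTransformationIndicator": True,
                         "MathameticalTransformationParam": {"volatile": True}},
                        "MathameticalTransformationIndicator",
                        "MathameticalTransformationParam"))
    # -> (True, False)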
@@ -1429,14 +1516,25 @@ class _FeatureEngineering:
             "persist" :True,
             "accumulate" : accumulate_columns
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            fit_params["display_table_name"] = False
+
+        if volatile:
+            fit_params["volatile"] = True
+            fit_params["persist"] = False
         # Adding addition details for fit parameters in case of SIGMOID transformation
         if apply_method == "sigmoid":
             sigmoid_style=transform_val["sigmoid_style"]
             fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
         # Performing transformation on target columns
-
+        transform_output = NumApply(**fit_params).result
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+        return transform_output

-    def _numerical_transformation(self, target_columns, num_transform_data):
+    def _numerical_transformation(self, target_columns, num_transform_data, volatile, persist):
         """
         DESCRIPTION:
             Function to perform different numerical transformations using Fit and Transform on numerical features based on user input.
@@ -1446,7 +1544,9 @@ class _FeatureEngineering:
         fit_params={
             "data" : self.data,
             "object" : num_transform_data,
-            "object_order_column" : "TargetColumn"
+            "object_order_column" : "TargetColumn",
+            "volatile" : volatile,
+            "persist" : persist
         }
         # Peforming fit with all arguments.
         num_fit_obj = Fit(**fit_params)
@@ -1464,8 +1564,18 @@ class _FeatureEngineering:
             "id_columns" : id_columns,
             "persist" :True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
         # Peforming transformation on target columns
-        self.data = Transform(**transform_params).result
+        self.data = Transform(**transform_params).result
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
                           data=self.data,
                           progress_bar=self.progress_bar)
@@ -1484,8 +1594,11 @@ class _FeatureEngineering:
         # Checking user input for mathematical transformations
         if mat_transform_input:
             # Extracting list required for mathematical transformations
-            mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None)
+            mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None).copy()
+
             if mat_transform_list:
+                volatile = mat_transform_list.pop("volatile", False)
+                persist = mat_transform_list.pop("persist", False)
                 # Checking for column present in dataset or not
                 _Validators._validate_dataframe_has_argument_columns(list(mat_transform_list.keys()),
                                                                      "MathameticalTransformationParam", self.data, "df")
@@ -1529,7 +1642,7 @@ class _FeatureEngineering:
                 copy_to_sql(df=transform_data, table_name="automl_num_transform_data", temporary=True)
                 num_transform_data = DataFrame.from_table("automl_num_transform_data")
                 # Applying transformation using Fit/Transform functions
-                self._numerical_transformation(target_columns, num_transform_data)
+                self._numerical_transformation(target_columns, num_transform_data, volatile, persist)
                 # Storing custom numerical transformation parameters and column list in data transform dictionary
                 self.data_transform_dict['custom_numerical_transformation_col'] = target_columns
                 self.data_transform_dict['custom_numerical_transformation_params'] = num_transform_data
@@ -1555,6 +1668,8 @@ class _FeatureEngineering:
         nl_transform_list = self.custom_data.get("NonLinearTransformationParam", None)
         # Extracting list required for non-linear transformation
         if nl_transform_list:
+            volatile = nl_transform_list.pop("volatile", False)
+            persist = nl_transform_list.pop("persist", False)
             total_combination = len(nl_transform_list)
             # Generating all possible combination names
             possible_combination = ["Combination_"+str(counter) for counter in range(1,total_combination+1)]
@@ -1581,12 +1696,14 @@ class _FeatureEngineering:
                     "data" : self.data,
                     "target_columns" : target_columns,
                     "formula" : formula,
-                    "result_column" : result_column
+                    "result_column" : result_column,
+                    "volatile" : volatile,
+                    "persist" : persist
                 }
                 # Performing fit on dataset
                 fit_obj = NonLinearCombineFit(**fit_param)
                 # Updating it for each non-linear combination
-                self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj})
+                self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj.result})
                 # Adding transform params for transformation
                 transform_params = {
                     "data" : self.data,
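As with the TargetEncodingFit change earlier in this file, the cached fit artifact is now the function's result DataFrame rather than the wrapper object itself, presumably so the saved data_transform_dict carries plain tables. Roughly:

    fit_obj = NonLinearCombineFit(**fit_param)   # analytic-function wrapper object
    fit_table = fit_obj.result                   # teradataml DataFrame with the fit output
    # 20.0.0.0 cached fit_obj; 20.0.0.2 caches fit_table instead.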
@@ -1594,7 +1711,18 @@ class _FeatureEngineering:
                     "accumulate" : self.data.columns,
                     "persist" : True
                 }
+                # Disabling display table name if persist is True by default
+                if not volatile and not persist:
+                    transform_params["display_table_name"] = False
+
+                if volatile:
+                    transform_params["volatile"] = True
+                    transform_params["persist"] = False
                 self.data = NonLinearCombineTransform(**transform_params).result
+
+                if not volatile and not persist:
+                    # Adding transformed data containing table to garbage collector
+                    GarbageCollector._add_to_garbagecollector(self.data._table_name)
             else:
                 self._display_msg(inline_msg="Combinations are not as per expectation.",
                                   progress_bar=self.progress_bar)
@@ -1620,29 +1748,64 @@ class _FeatureEngineering:
         anti_select_input = self.custom_data.get("AntiselectIndicator", False)
         # Checking user input for anti-select columns
         if anti_select_input:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            anti_select_params = self.custom_data.get("AntiselectParam", None)
+            if anti_select_params:
+                # Extracting list required for anti-select columns
+                anti_select_list = anti_select_params.get("excluded_columns", None)
+                volatile = anti_select_params.get("volatile", False)
+                persist = anti_select_params.get("persist", False)
+                if(anti_select_list):
+                    if all(item in self.data.columns for item in anti_select_list):
+                        # Storing custom anti-select columns indicator and column list in data transform dictionary
+                        self.data_transform_dict['custom_anti_select_columns_ind'] = True
+                        self.data_transform_dict['custom_anti_select_columns'] = anti_select_list
+                        fit_params = {
+                            "data" : self.data,
+                            "exclude" : anti_select_list,
+                            "volatile" : volatile,
+                            "persist" : persist
+                        }
+                        # Performing transformation for given user input
+                        self.data = Antiselect(**fit_params).result
+                        self._display_msg(msg="Updated dataset sample after performing anti-select columns:",
+                                          data=self.data,
+                                          progress_bar=self.progress_bar)
+                    else:
+                        self._display_msg(msg="Columns provided in list are not present in dataset:",
+                                          col_lst=anti_select_list,
+                                          progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No information provided for performing anti-select columns operation.",
                                   progress_bar=self.progress_bar)
         else:
             self._display_msg(inline_msg="Skipping customized anti-select columns.",
-                              progress_bar=self.progress_bar)
+                              progress_bar=self.progress_bar)
+
+    def _set_generic_parameters(self,
+                                func_indicator=None,
+                                param_name=None):
+        """
+        DESCRIPTION:
+            Function to set generic parameters.
+
+        PARAMETERS:
+            func_indicator:
+                Optional Argument.
+                Specifies the name of function indicator.
+                Types: str
+
+            param_name:
+                Optional Argument.
+                Specifies the name of the param which contains generic parameters.
+                Types: str
+
+        RETURNS:
+            Tuple containing volatile and persist parameters.
+        """
+        volatile = self.volatile
+        persist = self.persist
+        if self.custom_data is not None and self.custom_data.get(func_indicator, False):
+            volatile = self.custom_data[param_name].get("volatile", False)
+            persist = self.custom_data[param_name].get("persist", False)
+
+        return (volatile, persist)