teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
The hunks below are the changes to teradataml/automl/data_transformation.py (+163 -81):

```diff
@@ -31,6 +31,8 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
+from teradataml.common.garbagecollector import GarbageCollector
+from teradataml.hyperparameter_tuner.utils import _ProgressBar
 
 # AutoML Internal libraries
 from teradataml.automl.feature_exploration import _FeatureExplore
```
```diff
@@ -58,12 +60,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 Types: teradataml Dataframe
 
             data_transformation_params:
-                Required
+                Required Argument.
                 Specifies the parameters for performing data transformation.
                 Types: dict
 
             auto:
-                Optional
+                Optional Argument.
                 Specifies whether to run AutoML in custom mode or auto mode.
                 When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
                 Default Value: True
```
```diff
@@ -80,7 +82,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 Types: int
 
             target_column_ind:
-                Optional
+                Optional Argument.
                 Specifies whether target column is present in given dataset.
                 Default Value: False
                 Types: bool
```
```diff
@@ -91,6 +93,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.verbose = verbose
         self.target_column_ind = target_column_ind
         self.table_name_mapping = table_name_mapping
+        self.data_node_id = data._nodeid
+        self.table_name_mapping[self.data_node_id] = {}
 
     def data_transformation(self):
         """
```
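Note: the two added lines in this hunk build a per-dataset registry. Every transformed table name is keyed by the `_nodeid` of the input DataFrame, so transformations of several datasets cannot overwrite one another's entries. A minimal sketch of the pattern (the names below are illustrative stand-ins, not teradataml internals):

```python
# Sketch of the per-dataset table-name registry introduced above.
# "node_id" stands in for teradataml's internal DataFrame._nodeid.
table_name_mapping = {}

def register_dataset(node_id):
    # One sub-dict per input DataFrame, so entries for different
    # datasets cannot collide.
    table_name_mapping[node_id] = {}

def record_table(node_id, stage, table_name):
    # stage mirrors the keys used later in this diff, e.g.
    # "lasso_new_test", "rfe_new_test", "pca_new_test".
    table_name_mapping[node_id][stage] = table_name

register_dataset("node_42")
record_table("node_42", "pca_new_test", "pca_new_test_t8x1")
print(table_name_mapping)  # {'node_42': {'pca_new_test': 'pca_new_test_t8x1'}}
```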
```diff
@@ -118,6 +122,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Extracting target column details and type whether it is classification or not
         self.data_target_column = self.data_transformation_params.get("data_target_column")
         self.classification_type = self.data_transformation_params.get("classification_type", False)
+
+        # Setting number of jobs for progress bar based on mode of execution
+        jobs = 10 if self.auto else 15
+        self.progress_bar = _ProgressBar(jobs=jobs, verbose=2, prefix='Transformation Running:')
+
         # Performing transformation carried out in feature engineering phase
         self.feature_engineering_transformation()
         # Performing transformation carried out in data preparation phase
```
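Note: `_ProgressBar` is a teradataml-internal helper imported earlier in this diff from `teradataml.hyperparameter_tuner.utils`. The job counts line up with the `self.progress_bar.update()` calls added below: auto mode runs 6 tracked steps in the feature-engineering phase plus 4 in data preparation (10 jobs), while custom mode runs 11 plus 4 (15 jobs). A minimal stand-in with the same contract, assuming only a fixed job count and one `update()` per completed step:

```python
import sys

class ProgressBar:
    """Minimal stand-in for teradataml's internal _ProgressBar:
    a fixed number of jobs, advanced once per completed step."""

    def __init__(self, jobs, prefix=""):
        self.jobs = jobs
        self.done = 0
        self.prefix = prefix

    def update(self):
        self.done = min(self.done + 1, self.jobs)
        pct = 100 * self.done // self.jobs
        sys.stdout.write(f"\r{self.prefix} {self.done}/{self.jobs} ({pct}%)")
        sys.stdout.flush()

auto = True
bar = ProgressBar(jobs=10 if auto else 15, prefix="Transformation Running:")
for _ in range(3):  # each transformation step calls update() once
    bar.update()
```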
```diff
@@ -133,27 +142,52 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         on test data using parameters from data_transformation_params.
         """
         self._display_msg(msg="Performing transformation carried out in feature engineering phase ...",
-                          show_data=True)
+                          show_data=True,
+                          progress_bar=self.progress_bar)
+
         # Performing default transformation for both auto and custom mode
         self._preprocess_transformation()
+        self.progress_bar.update()
+
         self._futile_column_handling_transformation()
+        self.progress_bar.update()
+
         # Handling target column transformation
         if self.target_column_ind and self.classification_type:
             self._handle_target_column_transformation()
+        self.progress_bar.update()
+
         self._date_column_handling_transformation()
+        self.progress_bar.update()
 
         # Performing transformation according to run mode
         if self.auto:
             self._missing_value_handling_transformation()
+            self.progress_bar.update()
+
             self._categorical_encoding_transformation()
+            self.progress_bar.update()
         else:
             self._custom_missing_value_handling_transformation()
+            self.progress_bar.update()
+
             self._custom_bincode_column_transformation()
+            self.progress_bar.update()
+
             self._custom_string_column_transformation()
+            self.progress_bar.update()
+
             self._custom_categorical_encoding_transformation()
+            self.progress_bar.update()
+
             self._custom_mathematical_transformation()
+            self.progress_bar.update()
+
             self._custom_non_linear_transformation()
+            self.progress_bar.update()
+
             self._custom_anti_select_column_transformation()
+            self.progress_bar.update()
 
     def data_preparation_transformation(self):
         """
```
```diff
@@ -162,15 +196,23 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         on test data using parameters from data_transformation_params.
         """
         self._display_msg(msg="Performing transformation carried out in data preparation phase ...",
-                          show_data=True)
+                          show_data=True,
+                          progress_bar=self.progress_bar)
+
         # Handling features transformed from feature engineering phase
         self._handle_generated_features_transformation()
+        self.progress_bar.update()
 
         # Performing transformation including feature selection using lasso, rfe and pca
         # followed by scaling
         self._feature_selection_lasso_transformation()
+        self.progress_bar.update()
+
         self._feature_selection_rfe_transformation()
+        self.progress_bar.update()
+
         self._feature_selection_pca_transformation()
+        self.progress_bar.update()
 
     def _preprocess_transformation(self):
         """
```
```diff
@@ -182,7 +224,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if columns_to_be_removed:
             self.data = self.data.drop(columns_to_be_removed, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping irrelevent columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Adding id column
         self.data = FillRowId(data=self.data, row_id_column='id').result
```
```diff
@@ -197,7 +240,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if futile_cols:
             self.data = self.data.drop(futile_cols, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping futile columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _date_column_handling_transformation(self):
         """
```
```diff
@@ -205,47 +249,32 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         Function performs transformation on date columns and generates new columns.
         """
         # Extracting date columns
-
-        if
+        self.date_column_list = self.data_transformation_params.get("date_columns",None)
+        if self.date_column_list:
             # Dropping rows with null values in date columns
-            self.data = self.data.dropna(subset=
+            self.data = self.data.dropna(subset=self.date_column_list)
             # Extracting unique date columns for dropping
             drop_unique_date_columns = self.data_transformation_params.get("drop_unique_date_columns",None)
             if drop_unique_date_columns:
                 self.data = self.data.drop(drop_unique_date_columns, axis=1)
+                # Updated date column list after dropping irrelevant date columns
+                self.date_column_list = [item for item in self.date_column_list if item not in drop_unique_date_columns]
 
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-        year_diff_component_fit_object = self.data_transformation_params.get("year_diff_component_fit_object", None)
-
-        # Performing bincode transformation on day, month and year components
-        for fit_object in [day_component_fit_object, month_component_fit_object, year_diff_component_fit_object]:
-            if fit_object:
-                for col, bin_code_fit in fit_object.items():
-                    accumulate_columns = self._extract_list(self.data.columns, [col])
-                    transform_params = {
-                        "data": self.data,
-                        "object": bin_code_fit,
-                        "accumulate": accumulate_columns,
-                        "persist": True
-                    }
-                    self.data = BincodeTransform(**transform_params).result
-
-        self._display_msg(msg="\nUpdated dataset after transforming date columns :",
-                          data=self.data)
+            if len(self.date_column_list) != 0:
+                # Extracting date components parameters for new columns generation
+                new_columns=self._fetch_date_component()
+
+                # Extracting irrelevant date component columns for dropping
+                drop_extract_date_columns = self.data_transformation_params.get("drop_extract_date_columns", None)
+                if drop_extract_date_columns:
+                    self.data = self.data.drop(drop_extract_date_columns, axis=1)
+                    new_columns = [item for item in new_columns if item not in drop_extract_date_columns]
+
+                self._display_msg(msg='Updated list of newly generated features from existing date features :',
+                                  col_lst=new_columns)
+                self._display_msg(msg="\nUpdated dataset after transforming date columns :",
+                                  data=self.data,
+                                  progress_bar=self.progress_bar)
 
     def _missing_value_handling_transformation(self):
         """
```
(Several removed lines in this hunk were truncated or lost by the diff renderer and are shown as-is.)
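Note: the rewritten `_date_column_handling_transformation` replaces the removed per-component `BincodeTransform` loop with a call to `_fetch_date_component()`, which derives new columns from the surviving date columns, then prunes both the original date columns and any derived components flagged for dropping. The pruning is plain list filtering; the pandas sketch below illustrates the same sequence of steps (illustrative only; the real code operates on teradataml DataFrames, and the exact components `_fetch_date_component()` derives are an assumption here):

```python
import pandas as pd

df = pd.DataFrame({"sale_date": pd.to_datetime(["2023-01-05", "2023-02-11"]),
                   "amount": [10.0, 12.5]})
date_columns = ["sale_date"]

# Derive date components into new columns (assumed day/month/year split).
new_columns = []
for col in date_columns:
    df[f"{col}_day"] = df[col].dt.day
    df[f"{col}_month"] = df[col].dt.month
    df[f"{col}_year"] = df[col].dt.year
    new_columns += [f"{col}_day", f"{col}_month", f"{col}_year"]

# Prune components flagged as irrelevant, mirroring the list
# comprehension added in the hunk above.
drop_extract_date_columns = ["sale_date_year"]
df = df.drop(columns=drop_extract_date_columns)
new_columns = [c for c in new_columns if c not in drop_extract_date_columns]
print(new_columns)  # ['sale_date_day', 'sale_date_month']
```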
```diff
@@ -257,7 +286,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if drop_cols:
             self.data = self.data.drop(drop_cols, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping missing value containing columns : ",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Extracting imputation columns and fit object for missing value imputation
         imputation_cols = self.data_transformation_params.get("imputation_columns", None)
```
```diff
@@ -265,20 +295,22 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         sm_fit_obj = self.data_transformation_params.get("imputation_fit_object")
         # imputing column using fit object
         self.data = SimpleImputeTransform(data=self.data,
-                                          object=sm_fit_obj,
-                                          volatile=True).result
+                                          object=sm_fit_obj).result
         self._display_msg(msg="\nUpdated dataset after imputing missing value containing columns :",
-                          data=self.data)
+                          data=self.data,
+                          progress_bar=self.progress_bar)
 
         # Handling rest null, its temporary solution. It subjects to change based on input.
         dropped_data = self.data.dropna()
         dropped_count = self.data.shape[0] - dropped_data.shape[0]
         if dropped_count > 0:
-            self.data = dropped_data
             self._display_msg(msg="\nFound additional {} rows that contain missing values :".format(dropped_count),
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
+            self.data = dropped_data
             self._display_msg(msg="\nUpdated dataset after dropping additional missing value containing rows :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_missing_value_handling_transformation(self):
         """
```
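Note: dropping `volatile=True` in favor of `object=sm_fit_obj` makes the scoring path reuse the imputation statistics learned at training time instead of materializing a fresh volatile result. This is the standard fit-once/transform-many pattern; by analogy with scikit-learn's imputer (an analogy only, not the teradataml API):

```python
import numpy as np
from sklearn.impute import SimpleImputer

X_train = np.array([[1.0, 2.0], [np.nan, 4.0], [3.0, 6.0]])
X_test = np.array([[np.nan, 5.0]])

# Fit once on training data; the learned medians play the role of the
# "imputation_fit_object" carried in data_transformation_params.
imputer = SimpleImputer(strategy="median").fit(X_train)

# Reuse the same fit object on test data, just as the hunk above passes
# sm_fit_obj into SimpleImputeTransform(object=...).
print(imputer.transform(X_test))  # [[2. 5.]]
```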
```diff
@@ -291,7 +323,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         if drop_col_list:
             self.data = self.data.drop(drop_col_list, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping customized missing value containing columns :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Extracting custom imputation columns and fit object for missing value imputation
         custom_imp_ind = self.data_transformation_params.get("custom_imputation_ind", False)
```
```diff
@@ -299,10 +332,10 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         sm_fit_obj = self.data_transformation_params.get("custom_imputation_fit_object")
         # imputing column using fit object
         self.data = SimpleImputeTransform(data=self.data,
-                                          object=sm_fit_obj,
-                                          volatile=True).result
+                                          object=sm_fit_obj).result
         self._display_msg(msg="\nUpdated dataset after imputing customized missing value containing columns :",
-                          data=self.data)
+                          data=self.data,
+                          progress_bar=self.progress_bar)
         # Handling rest with default missing value handling
         self._missing_value_handling_transformation()
 
```
```diff
@@ -325,11 +358,15 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "data" : self.data,
                 "object" : custom_eql_bincode_fit_object,
                 "accumulate" : accumulate_columns,
-                "persist" : True
+                "persist" : True,
+                "display_table_name" : False
             }
             self.data = BincodeTransform(**eql_transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized equal width bin-code transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
             # Hnadling bincode transformation for Variable-Width
             custom_var_bincode_col = self.data_transformation_params.get("custom_var_bincode_col", None)
```
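Note: from this hunk onward, every persisted intermediate table is registered with teradataml's internal `GarbageCollector` so it can be dropped during session cleanup rather than accumulating in the database. `_add_to_garbagecollector` is not part of the public API; a minimal sketch of the registry pattern using only the standard library:

```python
import atexit

_tables_to_cleanup = []

def add_to_garbage_collector(table_name):
    # Record each persisted intermediate table for later cleanup,
    # as the diff does after every *Transform(...).result call.
    _tables_to_cleanup.append(table_name)

def _cleanup():
    for table in _tables_to_cleanup:
        # The real library would issue a DROP TABLE against Vantage here.
        print(f"DROP TABLE {table};")

atexit.register(_cleanup)

add_to_garbage_collector("bincode_out_a1b2")  # hypothetical table names
add_to_garbage_collector("onehot_out_c3d4")
```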
```diff
@@ -343,11 +380,15 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "object" : custom_var_bincode_fit_object,
                 "object_order_column" : "TD_MinValue_BINFIT",
                 "accumulate" : accumulate_columns,
-                "persist" : True
+                "persist" : True,
+                "display_table_name" : False
             }
             self.data = BincodeTransform(**var_transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized variable width bin-code transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_string_column_transformation(self):
         """
```
```diff
@@ -362,7 +403,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         for target_col,transform_val in custom_string_manipulation_param.items():
             self.data = self._str_method_mapping(target_col, transform_val)
         self._display_msg(msg="\nUpdated dataset after performing customized string manipulation :",
-                          data=self.data)
+                          data=self.data,
+                          progress_bar=self.progress_bar)
 
     def _categorical_encoding_transformation(self):
         """
```
```diff
@@ -380,14 +422,18 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "data" : self.data,
                 "object" : fit_obj,
                 "is_input_dense" : True,
-                "persist" : True
+                "persist" : True,
+                "display_table_name" : False
             }
             # Performing one hot encoding transformation
             self.data = OneHotEncodingTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             # Dropping old columns after encoding
             self.data = self.data.drop(one_hot_encoding_drop_list, axis=1)
             self._display_msg(msg="\nUpdated dataset after performing categorical encoding :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_categorical_encoding_transformation(self):
         """
```
```diff
@@ -408,10 +454,13 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "data" : self.data,
                 "object" : custom_ord_encoding_fit_obj,
                 "accumulate" : accumulate_columns,
-                "persist" : True
+                "persist" : True,
+                "display_table_name" : False
             }
             # Performing ordinal encoding transformation
             self.data = OrdinalEncodingTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             # Extracting parameters for target encoding
             custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
             custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
```
```diff
@@ -424,12 +473,16 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "data" : self.data,
                 "object" : tar_fit_obj,
                 "accumulate" : accumulate_columns,
-                "persist" : True
+                "persist" : True,
+                "display_table_name" : False
             }
-            # Performing
+            # Performing target encoding transformation
             self.data = TargetEncodingTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
         # Handling rest with default categorical encoding transformation
         self._categorical_encoding_transformation()
```
```diff
@@ -468,12 +521,16 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "data" : self.data,
                 "object" : custom_numerical_transformation_fit_object,
                 "id_columns" : custom_numerical_transformation_id_columns,
-                "persist" :True
+                "persist" :True,
+                "display_table_name" : False
             }
             # Peforming transformation on target columns
             self.data = Transform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized mathematical transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_non_linear_transformation(self):
         """
```
```diff
@@ -491,12 +548,16 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "data" : self.data,
                 "object" : fit_obj,
                 "accumulate" : self.data.columns,
-                "persist" : True
+                "persist" : True,
+                "display_table_name" : False
             }
             # Performing transformation
             self.data = NonLinearCombineTransform(**transform_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
             self._display_msg(msg="\nUpdated dataset after performing customized non-linear transformation :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _custom_anti_select_column_transformation(self):
         """
```
```diff
@@ -516,7 +577,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             # Performing transformation for given user input
             self.data = Antiselect(**fit_params).result
             self._display_msg(msg="\nUpdated dataset after performing customized anti-selection :",
-                              data=self.data)
+                              data=self.data,
+                              progress_bar=self.progress_bar)
 
     def _handle_generated_features_transformation(self):
         """
```
```diff
@@ -539,8 +601,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 "target_columns" : round_columns,
                 "precision_digit" : 4,
                 "accumulate" : accumulate_columns,
-                "persist" : True}
+                "persist" : True,
+                "display_table_name" : False}
             self.data = RoundColumns(**fit_params).result
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
     def _handle_target_column_transformation(self):
         """
```
```diff
@@ -561,11 +626,13 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             "data" : self.data,
             "object" : target_col_ord_encoding_fit_obj,
             "accumulate" : accumulate_columns,
-            "persist" : True
+            "persist" : True,
+            "display_table_name" : False
         }
         # Performing ordinal encoding transformation
         self.data = OrdinalEncodingTransform(**transform_params).result
-
+        # Adding transformed data containing table to garbage collector
+        GarbageCollector._add_to_garbagecollector(self.data._table_name)
         # Converting target column to integer datatype
         params = {
             "data" : self.data,
```
```diff
@@ -575,7 +642,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         }
         self.data = ConvertTo(**params).result
         self._display_msg(msg="\nUpdated dataset after performing target column transformation :",
-                          data=self.data)
+                          data=self.data,
+                          progress_bar=self.progress_bar)
 
     def _extract_and_display_features(self, feature_type, feature_list):
         """
```
```diff
@@ -605,7 +673,8 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
 
         # Displaying feature dataframe
         self._display_msg(msg=f"\nUpdated dataset after performing {feature_type} feature selection:",
-                          data=feature_df)
+                          data=feature_df,
+                          progress_bar=self.progress_bar)
 
         # Returning feature dataframe
         return feature_df
```
```diff
@@ -631,12 +700,14 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                   accumulate=accumulate_cols).result
         # Displaying scaled dataset
         self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
-                          data=lasso_df)
+                          data=lasso_df,
+                          progress_bar=self.progress_bar)
 
         # Uploading lasso dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
-
+        # Storing table name mapping for lasso dataset
+        self.table_name_mapping[self.data_node_id]["lasso_new_test"] = table_name
         copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace")
 
     def _feature_selection_rfe_transformation(self):
```
```diff
@@ -667,12 +738,14 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                   accumulate=accumulate_cols).result
         # Displaying scaled dataset
         self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
-                          data=rfe_df)
+                          data=rfe_df,
+                          progress_bar=self.progress_bar)
 
         # Uploading rfe dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
-
+        # Storing table name mapping for rfe dataset
+        self.table_name_mapping[self.data_node_id]["rfe_new_test"] = table_name
         copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace")
 
     def _feature_selection_pca_transformation(self):
```
```diff
@@ -691,18 +764,25 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                   accumulate=accumulate_cols).result
         # Displaying scaled dataset
         self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
-                          data=pca_scaled_df)
+                          data=pca_scaled_df,
+                          progress_bar=self.progress_bar)
 
         # Convert to pandas dataframe for applying pca
         pca_scaled_pd = pca_scaled_df.to_pandas()
         # Extracting pca fit instance for applying pca
         pca_fit_instance = self.data_transformation_params.get("pca_fit_instance", None)
+        # Extracting columns for applying pca
+        pca_fit_columns = self.data_transformation_params.get("pca_fit_columns", None)
 
         # drop id column and target column if present
         drop_col = ['id']
         if self.target_column_ind:
             drop_col.append(self.data_target_column)
         pca_df = pca_scaled_pd.drop(columns=drop_col, axis=1)
+
+        # Rearranging columns to match the order used during PCA fitting to
+        # avoid issues during PCA transformation.
+        pca_df = pca_df[pca_fit_columns]
 
         # Applying pca on scaled dataset
         pca_df = pca_fit_instance.transform(pca_df)
```
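Note: PCA components are tied to feature positions, so the matrix passed to `transform()` must present columns in exactly the order used at fit time; the added `pca_df = pca_df[pca_fit_columns]` reindexes the pandas frame with the stored fit-time column list. The same point, demonstrated with scikit-learn:

```python
import pandas as pd
from sklearn.decomposition import PCA

train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [0.5, 0.1, 0.9]})
pca_fit_columns = list(train.columns)  # remember the fit-time order
pca = PCA(n_components=1).fit(train[pca_fit_columns])

# The test frame arrives with columns in a different order.
test = pd.DataFrame({"b": [0.3], "a": [1.5]})

# Reindex to the fit-time order before transforming, as the hunk does.
# With named frames scikit-learn raises on a mismatch; with bare numpy
# arrays the misalignment would be silent and wrong.
print(pca.transform(test[pca_fit_columns]))
```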
```diff
@@ -718,10 +798,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             pca_df[self.data_target_column] = pca_scaled_pd[self.data_target_column].reset_index(drop=True)
         # Displaying pca dataframe
         self._display_msg(msg="\nUpdated dataset after performing PCA feature selection :",
-                          data=pca_df)
+                          data=pca_df.head(10),
+                          progress_bar=self.progress_bar)
 
         # Uploading pca dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
-
+        # Storing table name mapping for pca dataset
+        self.table_name_mapping[self.data_node_id]["pca_new_test"] = table_name
         copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace")
```
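Note: downstream scoring code can recover each persisted dataset through the mapping populated in this file, keyed by the source DataFrame's node id and the stage prefix. A hypothetical lookup, assuming an active Vantage connection created with `create_context()` (the mapping contents below are illustrative):

```python
from teradataml import DataFrame

# Illustrative contents, mirroring the keys this diff writes into
# self.table_name_mapping[self.data_node_id].
table_name_mapping = {"node_42": {"pca_new_test": "pca_new_test_t8x1"}}

pca_table = table_name_mapping["node_42"]["pca_new_test"]
pca_test_df = DataFrame(pca_table)  # rehydrate the persisted dataset
```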