teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0

--- a/teradataml/automl/data_preparation.py
+++ b/teradataml/automl/data_preparation.py
@@ -19,7 +19,6 @@ import pandas as pd
 import random
 import time
 import warnings
-warnings.filterwarnings("ignore")

 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -27,10 +26,15 @@ from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import OutlierFilterFit, OutlierFilterTransform
 from teradataml import RoundColumns, TeradataMlException
 from teradataml import ScaleFit, ScaleTransform
-from teradataml import
+from teradataml import UtilFuncs, TeradataConstants
+from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
+from teradataml import INTEGER

+# Control Randomnes
+random.seed(42)
+np.random.seed(42)

 class _DataPreparation:

@@ -41,7 +45,8 @@ class _DataPreparation:
                  excluded_columns=None,
                  custom_data=None,
                  data_transform_dict=None,
-                 task_type="Regression"
+                 task_type="Regression",
+                 **kwargs):
         """
         DESCRIPTION:
             Function initializes the data, target column and columns datatypes
@@ -54,7 +59,7 @@ class _DataPreparation:
                 Types: teradataml Dataframe

             target_column:
-                Required
+                Required Argument.
                 Specifies the name of the target column in "data".
                 Types: str

@@ -69,27 +74,49 @@ class _DataPreparation:
                 Types: int

             excluded_columns:
-                Required
+                Required Argument.
                 Specifies the columns should be excluded from any processing.
                 Types: str or list of strings (str)

             custom_data:
-                Optional
+                Optional Argument.
                 Specifies json object containing user customized input.
                 Types: json object

             data_transform_dict:
-                Optional
+                Optional Argument.
                 Specifies the parameters for data transformation.
                 Types: dict

             task_type:
-                Required
+                Required Argument.
                 Specifies the task type for AutoML, whether to apply regresion OR classification
                 on the provived dataset.
                 Default Value: "Regression"
                 Permitted Values: "Regression", "Classification"
                 Types: str
+
+            **kwargs:
+                Specifies the additional arguments for data preparation. Below
+                are the additional arguments:
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the interim results of the
+                    functions in a volatile table or not. When set to
+                    True, results are stored in a volatile table,
+                    otherwise not.
+                    Default Value: False
+                    Types: bool
+
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the interim results of the
+                    functions in a table or not. When set to True,
+                    results are persisted in a table; otherwise,
+                    results are garbage collected at the end of the
+                    session.
+                    Default Value: False
+                    Types: bool
         """
         self.data = data
         self.target_column = target_column
@@ -98,16 +125,15 @@ class _DataPreparation:
         self.data_transform_dict = data_transform_dict
         self.custom_data = custom_data
         self.task_type = task_type
+        self.volatile = kwargs.get("volatile", False)
+        self.persist = kwargs.get("persist", False)

         # Setting default value for auto run mode
-        self._train_size = 0.80
         self._data_sampling_method = "SMOTE"
         self._scale_method_reg = "STD"
         self._scale_method_cls = "RANGE"
         self.table_name_mapping = {}

-        random.seed(42)
-        np.random.seed(42)
         self.data_types = {key: value for key, value in self.data._column_names_and_types}


@@ -116,14 +142,13 @@ class _DataPreparation:
         """
         DESCRIPTION:
             Function to perform following tasks:-
-            1.
-            2. Performs
-            3. Performs feature
-            4. Performs feature scaling.
+            1. Performs outlier processing and transformation on dataset.
+            2. Performs feature selection using RFE, PCA, and Lasso.
+            3. Performs feature scaling.

         PARAMETERS:
             auto:
-                Optional
+                Optional Argument.
                 Specifies whether to run AutoML in custom mode or auto mode.
                 When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
                 Default Value: True
@@ -138,38 +163,36 @@ class _DataPreparation:
                           progress_bar=self.progress_bar)
         # Setting user value in case of custom running mode
         if not auto:
-            self._set_custom_train_test_split()
             self._set_custom_scaling_method()
             self._set_custom_sampling()

-        # Performing train test split
-        self._train_test_split()
-        self.progress_bar.update()
-
         # Handling ouliers in dataset
         self._handle_outliers(auto)
         self.progress_bar.update()

         # Handling float type features before processing with feature selection and scaling
-
-        test = self._handle_generated_features('test')
+        training_data = self._handle_generated_features()
         self.progress_bar.update()

         # Temporary Pulling data for feature selection
         # Will change after sto

         # Checking for data imbalance
-        if self._check_data_imbalance(
-
+        if self._check_data_imbalance(training_data):
+            training_data = self._data_sampling(training_data)
         self.progress_bar.update()

+        # Sorting the data based on id to
+        # remove any shuffling done by sampling
+        training_data = training_data.sort_values(by='id')
+
         # Performing feature selection using lasso followed by scaling
-        self._feature_selection_Lasso(
+        self._feature_selection_Lasso(training_data)
         self._scaling_features(feature_selection_mtd="lasso")
         self.progress_bar.update()

         # Performing feature selection using rfe followed by scaling
-        self._feature_selection_RFE(
+        self._feature_selection_RFE(training_data)
         self._scaling_features(feature_selection_mtd="rfe")
         self.progress_bar.update()

@@ -180,85 +203,8 @@ class _DataPreparation:

         return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict

-    # Splits data into train and test
-    def _train_test_split(self):
-
-        """
-        DESCRIPTION:
-            Function splits the data into training and testing datasets.
-
-        PARAMETERS:
-            train_size:
-                Optional Argument.
-                Specifies the training size required for splitting dataset.
-                By Default, it takes 0.8 as training size.
-                Types: float
-        """
-        self._display_msg(msg="\nSpliting of dataset into training and testing ...",
-                          progress_bar=self.progress_bar,
-                          show_data=True)
-        self._display_msg(inline_msg="Training size : {}".format(self._train_size),
-                          progress_bar=self.progress_bar)
-        self._display_msg(inline_msg="Testing size : {}".format(round((1-self._train_size),2)),
-                          progress_bar=self.progress_bar)
-        start_time = time.time()
-        # Applying TrainTestSplit function on data
-        # Regression
-        train_test_func_params = {
-            "data" : self.data,
-            "id_column" : "id",
-            "train_size" : self._train_size,
-            "seed" : 42
-        }
-        if self.is_classification_type():
-            train_test_func_params["stratify_column"]=self.target_column
-        train_test_split_out = TrainTestSplit(**train_test_func_params)
-        train_test_split_out = train_test_split_out.result
-
-        # Splitting the data into training and testing data
-        self.train_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 1].drop('TD_IsTrainRow', axis=1)
-        self.test_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 0].drop('TD_IsTrainRow', axis=1)
-
-        self._display_msg(msg="Training data sample",
-                          data=self.train_df,
-                          progress_bar=self.progress_bar)
-
-        self._display_msg(msg="Testing data sample",
-                          data=self.test_df,
-                          progress_bar=self.progress_bar)
-
-        end_time = time.time()
-        self._display_msg(msg="Time taken for spliting of data: {:.2f} sec ".format(end_time - start_time),
-                          progress_bar=self.progress_bar,
-                          show_data=True)
-
-    def _set_custom_train_test_split(self):
-        """
-        DESCRIPTION:
-            Function to split dataset into training and testing based on user input.
-
-        """
-        # Fetching user input for train test split
-        train_test_split_input = self.custom_data.get("TrainTestSplitIndicator", False)
-        if train_test_split_input:
-            # Extracting training size
-            custom_train_size = self.custom_data.get("TrainingSize", None)
-            if custom_train_size is None:
-                self._display_msg(inline_msg="No information provided for training size. Proceeding with default option.",
-                                  progress_bar=self.progress_bar)
-            else:
-                if not isinstance(custom_train_size, float):
-                    err = Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE,
-                                               'custom_train', type(custom_train_size).__name__,
-                                               'float')
-                    raise TeradataMlException(err, MessageCodes.INVALID_COLUMN_TYPE)
-                self._train_size = custom_train_size
-        else:
-            self._display_msg(inline_msg="No information provided for performing customized train test split. Proceeding with default option.",
-                              progress_bar=self.progress_bar)
-
     def _handle_outliers(self,
-
+                         auto):
         """
         DESCRIPTION:
             Function to handle existing outliers in dataset based on running mode.
@@ -289,6 +235,12 @@ class _DataPreparation:
         DESCRIPTION:
             Function to handle data imbalance in dataset using sampling techniques
            in case of classification.
+
+        PARAMETERS:
+            data:
+                Required Argument.
+                Specifies the input teradataml DataFrame.
+                Types: pandas Dataframe.
         """
         pass

@@ -310,7 +262,7 @@ class _DataPreparation:
             outlier_method = "Tukey"

         # List of columns for outlier processing.
-        outlier_columns = [col for col in self.
+        outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]

         # Detecting outlier percentage in each columns
         outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
@@ -360,26 +312,45 @@ class _DataPreparation:
             Pandas DataFrame containing, column name with outlier percentage.

         """
-
+
+        # Setting volatile and persist parameters for Outlier handling function
+        volatile, persist = self._set_generic_parameters(func_indicator='OutlierFilterIndicator',
+                                                          param_name='OutlierFilterParam')
+
+        # Performing fit on dataset for outlier handling
         fit_params = {
-            "data" : self.
+            "data" : self.data,
             "target_columns" : target_columns,
             "outlier_method" : outlier_method,
-            "replacement_value" : replacement_value
+            "replacement_value" : replacement_value,
+            "volatile" : volatile,
+            "persist" : persist
         }
         outlier_fit_out = OutlierFilterFit(**fit_params)
-        # Performing transform on
+        # Performing transform on dataset for outlier handling
         transform_params = {
-            "data" : self.
+            "data" : self.data,
             "object" : outlier_fit_out.result,
             "persist" : True
         }
-
+
+        # Disabling print if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
+        self.data = OutlierFilterTransform(**transform_params).result
+
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)

     def _outlier_processing(self):
         """
         DESCRIPTION:
-            Function performs outlier processing on
+            Function performs outlier processing on dataset. It identifies and handle outliers in the dataset.

         """
         self._display_msg(msg="\nOutlier preprocessing ...",
@@ -400,6 +371,9 @@ class _DataPreparation:
            target_columns=columns_to_drop_rows
            replacement_strategy = "DELETE"
            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self._display_msg(msg="Sample of dataset after removing outlier rows:",
+                              data=self.data,
+                              progress_bar=self.progress_bar)

         # Imputing Median value in place of outliers
         if len(columns_to_impute) != 0:
@@ -409,6 +383,13 @@ class _DataPreparation:
            target_columns=columns_to_impute
            replacement_strategy = "MEDIAN"
            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
+                              data=self.data,
+                              progress_bar=self.progress_bar)
+
+        if len(columns_to_drop_rows) == 0 and len(columns_to_impute) == 0:
+            self._display_msg(msg='Analysis indicates not outlier in the dataset. No Action Taken.',
+                              progress_bar=self.progress_bar)

         end_time = time.time()
         self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),
@@ -418,7 +399,7 @@ class _DataPreparation:
     def _custom_outlier_processing(self):
         """
         DESCRIPTION:
-            Function to perform outlier processing on
+            Function to perform outlier processing on dataset based on user input.

         """
         self._display_msg(msg="\nStarting customized outlier processing ...",
@@ -428,7 +409,7 @@ class _DataPreparation:
         # Checking user input for outlier filtering
         if outlier_filter_input:
             # List of columns for outlier processing.
-            target_columns = [col for col in self.
+            target_columns = [col for col in self.data.columns if col not in self.excluded_columns]
             # Checking user input for outlier detection method
             outlier_method = self.custom_data.get("OutlierDetectionMethod", None)
             if outlier_method == 'PERCENTILE':
@@ -445,11 +426,13 @@ class _DataPreparation:
            # Checking for rows if outlier containing columns exist
            if outlier_df.shape[0]:
                # Checking user input list for outlier handling
-                outlier_transform_list = self.custom_data.get("OutlierFilterParam", None)
+                outlier_transform_list = self.custom_data.get("OutlierFilterParam", None).copy()
                if outlier_transform_list:
+                    volatile = outlier_transform_list.pop("volatile", False)
+                    persist = outlier_transform_list.pop("persist", False)
                    # Checking user input for outlier handling
                    _Validators._validate_dataframe_has_argument_columns(list(outlier_transform_list.keys()), "OutlierFilterParam",
-                                                                          self.
+                                                                          self.data, "outlier_data")

                    for target_col, transform_val in outlier_transform_list.items():
                        # Fetching replacement value
@@ -498,14 +481,12 @@ class _DataPreparation:
         from sklearn.decomposition import PCA

         start_time = time.time()
-
+
         # Temporary Pulling data for feature selection
-
-        test = DataFrame.from_table(self.table_name_mapping['pca_test']).to_pandas()
+        pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()

         # Drop unnecessary columns and store the result
-        train_data =
-        test_data = test.drop(columns=['id', self.target_column], axis=1)
+        train_data = pca_train.drop(columns=['id', self.target_column], axis=1)

         # Initialize and fit PCA
         pca = PCA()
@@ -518,16 +499,15 @@ class _DataPreparation:
         # Create a new instance of PCA with the optimal number of components
         pca = PCA(n_components=n, random_state=42)

-        # Apply PCA on
+        # Apply PCA on dataset
         X_train_pca = pca.fit_transform(train_data)
-        X_test_pca = pca.transform(test_data)

         # storing instance of PCA in data transformation dictionary
         self.data_transform_dict["pca_fit_instance"] = pca
+        self.data_transform_dict["pca_fit_columns"] = train_data.columns.tolist()

         #converting the numarray into dataframes
         train_df = pd.DataFrame(X_train_pca)
-        test_df = pd.DataFrame(X_test_pca)

         #creating names for combined columns
         column_name = {col: 'col_'+str(i) for i,col in enumerate(train_df.columns)}
@@ -537,15 +517,12 @@ class _DataPreparation:

         #renaming them
         train_df = train_df.rename(columns=column_name)
-        test_df = test_df.rename(columns=column_name)

         # adding the id column [PCA does not shuffle the dataset]
-        train_df = pd.concat([
-        test_df = pd.concat([test.reset_index(drop=True)['id'], test_df.reset_index(drop=True)], axis=1)
+        train_df = pd.concat([pca_train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)

-        # merging target column with new
-        train_df[self.target_column] =
-        test_df[self.target_column] = test[self.target_column].reset_index(drop=True)
+        # merging target column with new data
+        train_df[self.target_column] = pca_train[self.target_column].reset_index(drop=True)

         self.pca_feature = train_df.drop(columns=['id',self.target_column],axis=1).columns.tolist()

@@ -557,31 +534,21 @@ class _DataPreparation:
                           progress_bar=self.progress_bar,
                           show_data=True)

-        if self.is_classification_type():
-            train_df[self.target_column] = train_df[self.target_column].astype('int')
-            test_df[self.target_column] = test_df[self.target_column].astype('int')
-
         # Pushing the data in database
-        self.copy_dataframe_to_sql(train_df,
+        self.copy_dataframe_to_sql(train_df, 'pca', self.persist)

-    def _feature_selection_RFE(self,
-
-                               test=None):
+    def _feature_selection_RFE(self,
+                               data=None):
         """
         DESCRIPTION:
             Function performs Recursive Feature Elimination (RFE) for feature selection.
             It identifies a subset of the most relevant features in the dataset.

         PARAMETERS:
-
+            data:
                 Required Argument.
                 Specifies the input train pandas DataFrame.
-                Types: pandas Dataframe
-
-            test:
-                Required Argument.
-                Specifies the input test pandas DataFrame.
-                Types: pandas Dataframe
+                Types: pandas Dataframe
         """
         self._display_msg(msg="\nFeature selection using rfe ...",
                           progress_bar=self.progress_bar,
@@ -590,57 +557,59 @@ class _DataPreparation:
         # Required imports for RFE
         from sklearn.feature_selection import RFECV
         from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
-        from sklearn.model_selection import StratifiedKFold
+        from sklearn.model_selection import StratifiedKFold

         start_time = time.time()
         # Regression
         is_classification = self.is_classification_type()
         # Getting the value of k in k-fold cross-validation
-        folds = self._num_of_folds(
+        folds = self._num_of_folds(data.shape[0])

-        #
-
-
+        # Suppressing warnings generated by pandas and sklearn
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore')

-
-
-
+            # Random forest for RFE model
+            RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
+            rf = RFModel(n_estimators=100, random_state=42)

-
-
-
+            # Determine the scoring metric based on the number of unique classes
+            score = 'r2' if not self.is_classification_type() \
+                else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'

-
-
+            # # Instantiate StratifiedKFold with shuffling for classification
+            cv = folds if not self.is_classification_type() \
+                else StratifiedKFold(n_splits=folds, shuffle=False)

-
-
-            train_target = train[self.target_column]
+            # Define the RFE with cross-validation
+            rfecv = RFECV(rf, cv=cv, scoring=score)

-
-
+            # Prepare data
+            train_data = data.drop(columns=['id',self.target_column], axis=1)
+            train_target = data[self.target_column]

-
-
+            # Fit the RFE using cv
+            rfecv.fit(train_data, train_target)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Extract the features
+            features = train_data.columns[rfecv.support_].tolist()
+
+            self._display_msg(msg="feature selected by RFE:",
+                              col_lst=features,
+                              progress_bar=self.progress_bar)
+            features.append(self.target_column)
+            features.insert(0,'id')
+
+            selected_rfe_df = data[features]
+
+            # storing the rfe selected features in data transformation dictionary
+            self.data_transform_dict['rfe_features'] = features
+
+            columns_to_rename = [col for col in selected_rfe_df.columns if col not in ['id', self.target_column]]
+            new_column = {col: f'r_{col}' for col in columns_to_rename}
+            self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
+
+            selected_rfe_df.rename(columns=new_column, inplace=True)

         # storing the rename column list in data transformation dictionary
         self.data_transform_dict['rfe_rename_column'] = columns_to_rename
@@ -649,29 +618,24 @@ class _DataPreparation:
         self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
                           progress_bar=self.progress_bar,
                           show_data=True)
-        self.rfe_feature =
+        self.rfe_feature = selected_rfe_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()

         # Pushing data into database
-        self.copy_dataframe_to_sql(
+        self.copy_dataframe_to_sql(selected_rfe_df, 'rfe', self.persist)

     def _feature_selection_Lasso(self,
-
-                                 test=None):
+                                 data=None):
         """
         DESCRIPTION:
             Function performs Lasso Regression for feature selection.
             It helps in identifing and retaining the most important features while setting less important ones to zero.

         PARAMETERS:
-
+            data:
                 Required Argument.
                 Specifies the input train pandas DataFrame.
                 Types: pandas Dataframe

-            test:
-                Required Argument.
-                Specifies the input test pandas DataFrame.
-                Types: pandas Dataframe
         """
         start_time = time.time()
         self._display_msg(msg="\nFeature selection using lasso ...",
@@ -682,35 +646,46 @@ class _DataPreparation:
         from sklearn.model_selection import GridSearchCV
         from sklearn.linear_model import Lasso
         from sklearn.linear_model import LogisticRegression
-
+        from sklearn.model_selection import StratifiedKFold
+
         # Getting the value k in k-fold cross-validation
-        num_folds = self._num_of_folds(
+        num_folds = self._num_of_folds(data.shape[0])

-        # Prepare
-        train_features =
-        train_target =
+        # Prepare data
+        train_features = data.drop(columns=['id',self.target_column], axis=1)
+        train_target = data[self.target_column]

-        #
-
-
-
+        # Suppressing warnings generated by pandas and sklearn
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore')
+
+            # Determine the estimator and parameters based on the type of problem
+            if self.is_classification_type():
+                if self.data.drop_duplicate(self.target_column).size == 2:
+                    scoring_metric = 'roc_auc'
+                else:
+                    scoring_metric = 'f1_macro'
+                estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+                parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
            else:
-
-
-
-
-
-
-
+                estimator = Lasso(random_state=42)
+                parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
+                scoring_metric = "r2"
+
+            if self.is_classification_type():
+                cv = StratifiedKFold(n_splits=5, shuffle=False)
+            else:
+                cv = num_folds

-
-
+            # Applying hyperparameter tuning and optimizing score
+            hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
+                                                 scoring=scoring_metric, verbose=0)

-
-
+            # Fitting the best result from hyperparameter
+            hyperparameter_search.fit(train_features, train_target)

-
-
+            # Extracting the important estimators
+            feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)

         # Extracting feature using estimators whose importance > 0
         if self.is_classification_type():
@@ -725,8 +700,7 @@ class _DataPreparation:
                           progress_bar=self.progress_bar)

         important_features = ['id'] + important_features + [self.target_column]
-
-        test_df = test[important_features]
+        selected_lasso_df = data[important_features]

         # Storing the lasso selected features in data transformation dictionary
         self.data_transform_dict['lasso_features'] = important_features
@@ -736,61 +710,62 @@ class _DataPreparation:
         self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
                           progress_bar=self.progress_bar,
                           show_data=True)
-        self.lasso_feature =
+        self.lasso_feature = selected_lasso_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()

-        self.copy_dataframe_to_sql(
+        self.copy_dataframe_to_sql(selected_lasso_df, 'lasso', self.persist)

     def copy_dataframe_to_sql(self,
-
-
-
+                              data,
+                              prefix,
+                              persist):
         """
         DESCRIPTION:
             Function to copy dataframe to SQL with generated table name.

         PARAMETERS:
-
-                Required Argument.
-                Specifies the input train pandas DataFrame.
-                Types: pandas Dataframe
-
-            test:
+            data:
                 Required Argument.
-                Specifies the input
+                Specifies the input pandas DataFrame.
                 Types: pandas Dataframe

             prefix:
                 Required Argument.
                 Specifies the prefix for the table name.
                 Types: str
+
+            persist:
+                Required Argument.
+                Specifies whether to persist the results of the
+                function in a table or not. When set to True,
+                results are persisted in a table; otherwise,
+                results are garbage collected at the end of the
+                session.
+                Types: bool
         """
         # Generating table names
         train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
-                                                                table_type = TeradataConstants.TERADATA_TABLE
-
-                                                                table_type = TeradataConstants.TERADATA_TABLE)
-
+                                                                table_type = TeradataConstants.TERADATA_TABLE,
+                                                                gc_on_quit=not persist)
         # Storing the table names in the table name mapping dictionary
         self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
-        self.table_name_mapping['{}_test'.format(prefix)] = test_table_name

         # Pushing data into database
-
-
-
-
+        if self.is_classification_type():
+            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+        else:
+            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")

     def _scaling_features_helper(self,
-
-
+                                 data=None,
+                                 feature_selection_mtd=None):
         """
         DESCRIPTION:
             This function selects the features on which feature scaling should be applied.

         PARAMETERS:
-
+            data:
                 Required Argument.
-                Specifies the
+                Specifies the data on which feature scaling will be applied.
                 Types: teradataml Dataframe

             feature_selection_mtd:
@@ -805,10 +780,10 @@ class _DataPreparation:
         columns_to_scale = []

         # Iterating over the columns
-        for col in
+        for col in data.columns:
             # Selecting columns that will be scaled
             # Exculding target_col and columns with single value
-            if col not in ['id', self.target_column] and
+            if col not in ['id', self.target_column] and data.drop_duplicate(col).size > 1:
                 columns_to_scale.append(col)

         if feature_selection_mtd == "lasso":
@@ -822,7 +797,7 @@ class _DataPreparation:
         return columns_to_scale

     def _scaling_features(self,
-
+                          feature_selection_mtd=None):
         """
         DESCRIPTION:
             Function performs feature scaling on columns present inside the dataset
@@ -832,7 +807,7 @@ class _DataPreparation:
             feature_selection_mtd:
                 Required Argument.
                 Specifies the feature selection algorithm used.
-                Types: str
+                Types: str
         """

         self._display_msg(msg="\nscaling Features of {} data ...".format(feature_selection_mtd),
@@ -840,8 +815,7 @@ class _DataPreparation:
                           show_data=True)

         start_time = time.time()
-
-        test = None
+        data_to_scale = None

         if self.is_classification_type():
             scale_method = self._scale_method_cls
@@ -850,17 +824,18 @@ class _DataPreparation:

         # Loading data for feature scaling based of feature selection method
         if feature_selection_mtd == 'rfe':
-
-            test = DataFrame(self.table_name_mapping['rfe_test'])
+            data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
         elif feature_selection_mtd == 'lasso':
-
-            test = DataFrame(self.table_name_mapping['lasso_test'])
+            data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
         else:
-
-
+            data_to_scale = self.data
+
+        # Setting volatile and persist parameters for ScaleFit and ScaleTransform functions
+        volatile, persist = self._set_generic_parameters(func_indicator='FeatureScalingIndicator',
+                                                          param_name='FeatureScalingParam')

         # List of columns that will be scaled
-        scale_col= self._scaling_features_helper(
+        scale_col= self._scaling_features_helper(data_to_scale, feature_selection_mtd)

         if len(scale_col) != 0:
             self._display_msg(msg="columns that will be scaled: ",
@@ -868,58 +843,38 @@ class _DataPreparation:
                               progress_bar=self.progress_bar)

             # Scale Fit
-            fit_obj = ScaleFit(data=
+            fit_obj = ScaleFit(data=data_to_scale,
                                target_columns=scale_col,
-                               scale_method=scale_method
+                               scale_method=scale_method,
+                               volatile=volatile,
+                               persist=persist)

             # storing the scale fit object and columns in data transformation dictionary
-            self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj
+            self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
             self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col

             # List of columns to copy to the output generated by scale transform
-            accumulate_cols = list(set(
-
-            # Scaling on training dataset
-            tr_obj = ScaleTransform(data=train,
-                                    object=fit_obj,
-                                    accumulate=accumulate_cols)
-
-            # Scaling on testing dataset
-            ts_obj = ScaleTransform(data=test,
-                                    object=fit_obj,
-                                    accumulate=accumulate_cols)
+            accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))

-
-
+            # Scaling dataset
+            transform_obj = ScaleTransform(data=data_to_scale,
+                                           object=fit_obj,
+                                           accumulate=accumulate_cols)
+            scaled_df = transform_obj.result

-            self._display_msg(msg="
-                              data=
-                              progress_bar=self.progress_bar)
-            self._display_msg(msg="Testing dataset sample after scaling:",
-                              data=test,
+            self._display_msg(msg="Dataset sample after scaling:",
+                              data=scaled_df,
                               progress_bar=self.progress_bar)
         else:
             self._display_msg(msg="No columns to scale.",
                               progress_bar=self.progress_bar)
-
-        if self.is_classification_type():
-            train, test = self._bigint_to_int(train, test)

-        self.copy_dataframe_to_sql(
+        self.copy_dataframe_to_sql(scaled_df, feature_selection_mtd, persist)

         end_time = time.time()
         self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
                           progress_bar=self.progress_bar,
                           show_data=True)
-
-    def _bigint_to_int(self, train, test):
-        tr = train.to_pandas()
-        tr[self.target_column] = tr[self.target_column].astype('int')
-
-        ts = test.to_pandas()
-        ts[self.target_column] = ts[self.target_column].astype('int')
-
-        return tr, ts

     def _set_custom_scaling_method(self):
         """
@@ -932,43 +887,32 @@ class _DataPreparation:
         # Checking user input for feature scaling
         if feature_scaling_input:
             # Extracting scaling method
-
-            if
-
-
-
-
-                self._scale_method_cls = custom_scaling_method
+            custom_scaling_params = self.custom_data.get("FeatureScalingParam", None)
+            if custom_scaling_params:
+                custom_scaling_method = custom_scaling_params.get("FeatureScalingMethod", None)
+                if custom_scaling_method is None:
+                    self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
+                                      progress_bar=self.progress_bar)
                else:
-                self.
+                    if self.is_classification_type():
+                        self._scale_method_cls = custom_scaling_method
+                    else:
+                        self._scale_method_reg = custom_scaling_method
         else:
             self._display_msg(inline_msg="No information provided for performing customized feature scaling. Proceeding with default option.",
                               progress_bar=self.progress_bar)


-    def _handle_generated_features(self
-                                   label = None):
+    def _handle_generated_features(self):
         """
         DESCRIPTION:
             Function to handle newly generated float features. It will round them upto 4 digit after decimal point.
-
-
-
-                Optional Argument.
-                Specifies label for dataset on which rounding up is getting done i.e., 'train' for training
-                and 'test' for testing dataset.
-                By Default, it takes None and transformation is getting applied to whole dataset.
-                Types: str
-
+
+        RETURNS:
+            Pandas DataFrame containing, rounded up float columns.
         """
-        #
-
-            target_df = self.train_df
-        elif label == 'test':
-            target_df = self.test_df
-        else:
-            target_df=self.data
-
+        # Assigning data to target dataframe
+        target_df = self.data
         # Detecting list of float columns on target dataset
         float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]

@@ -988,6 +932,19 @@ class _DataPreparation:
                      "accumulate" : accumulate_columns,
                      "persist" : True}

-
-
-
+        # Disabling print if persist is True by default
+        if not self.volatile and not self.persist:
+            fit_params["display_table_name"] = False
+
+        if self.volatile:
+            fit_params["volatile"] = True
+            fit_params["persist"] = False
+
+        transform_output = RoundColumns(**fit_params).result
+        if not self.volatile and not self.persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+        cols = transform_output.columns
+        df = transform_output.to_pandas().reset_index()
+        df = df[cols]
+        return df