teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of teradataml was flagged as potentially problematic by the registry.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/automl/data_preparation.py

@@ -19,7 +19,6 @@ import pandas as pd
 import random
 import time
 import warnings
-warnings.filterwarnings("ignore")
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -27,7 +26,7 @@ from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import OutlierFilterFit, OutlierFilterTransform
 from teradataml import RoundColumns, TeradataMlException
 from teradataml import ScaleFit, ScaleTransform
-from teradataml import
+from teradataml import UtilFuncs, TeradataConstants
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
@@ -46,7 +45,8 @@ class _DataPreparation:
                  excluded_columns=None,
                  custom_data=None,
                  data_transform_dict=None,
-                 task_type="Regression"
+                 task_type="Regression",
+                 **kwargs):
        """
        DESCRIPTION:
            Function initializes the data, target column and columns datatypes
@@ -95,6 +95,28 @@ class _DataPreparation:
                Default Value: "Regression"
                Permitted Values: "Regression", "Classification"
                Types: str
+
+            **kwargs:
+                Specifies the additional arguments for data preparation. Below
+                are the additional arguments:
+                    volatile:
+                        Optional Argument.
+                        Specifies whether to put the interim results of the
+                        functions in a volatile table or not. When set to
+                        True, results are stored in a volatile table,
+                        otherwise not.
+                        Default Value: False
+                        Types: bool
+
+                    persist:
+                        Optional Argument.
+                        Specifies whether to persist the interim results of the
+                        functions in a table or not. When set to True,
+                        results are persisted in a table; otherwise,
+                        results are garbage collected at the end of the
+                        session.
+                        Default Value: False
+                        Types: bool
        """
        self.data = data
        self.target_column = target_column
@@ -103,9 +125,10 @@ class _DataPreparation:
        self.data_transform_dict = data_transform_dict
        self.custom_data = custom_data
        self.task_type = task_type
+       self.volatile = kwargs.get("volatile", False)
+       self.persist = kwargs.get("persist", False)
 
        # Setting default value for auto run mode
-       self._train_size = 0.80
        self._data_sampling_method = "SMOTE"
        self._scale_method_reg = "STD"
        self._scale_method_cls = "RANGE"
@@ -119,10 +142,9 @@ class _DataPreparation:
        """
        DESCRIPTION:
            Function to perform following tasks:-
-               1.
-               2. Performs
-               3. Performs feature
-               4. Performs feature scaling.
+               1. Performs outlier processing and transformation on dataset.
+               2. Performs feature selection using RFE, PCA, and Lasso.
+               3. Performs feature scaling.
 
        PARAMETERS:
            auto:
@@ -141,42 +163,36 @@ class _DataPreparation:
                          progress_bar=self.progress_bar)
        # Setting user value in case of custom running mode
        if not auto:
-           self._set_custom_train_test_split()
            self._set_custom_scaling_method()
            self._set_custom_sampling()
 
-       # Performing train test split
-       self._train_test_split()
-       self.progress_bar.update()
-
        # Handling ouliers in dataset
        self._handle_outliers(auto)
        self.progress_bar.update()
 
        # Handling float type features before processing with feature selection and scaling
-
-       test = self._handle_generated_features('test')
+       training_data = self._handle_generated_features()
        self.progress_bar.update()
 
        # Temporary Pulling data for feature selection
        # Will change after sto
 
        # Checking for data imbalance
-       if self._check_data_imbalance(
-
+       if self._check_data_imbalance(training_data):
+           training_data = self._data_sampling(training_data)
        self.progress_bar.update()
 
        # Sorting the data based on id to
        # remove any shuffling done by sampling
-
+       training_data = training_data.sort_values(by='id')
 
        # Performing feature selection using lasso followed by scaling
-       self._feature_selection_Lasso(
+       self._feature_selection_Lasso(training_data)
        self._scaling_features(feature_selection_mtd="lasso")
        self.progress_bar.update()
 
        # Performing feature selection using rfe followed by scaling
-       self._feature_selection_RFE(
+       self._feature_selection_RFE(training_data)
        self._scaling_features(feature_selection_mtd="rfe")
        self.progress_bar.update()
 
@@ -187,85 +203,8 @@ class _DataPreparation:
 
        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
 
-   # Splits data into train and test
-   def _train_test_split(self):
-
-       """
-       DESCRIPTION:
-           Function splits the data into training and testing datasets.
-
-       PARAMETERS:
-           train_size:
-               Optional Argument.
-               Specifies the training size required for splitting dataset.
-               By Default, it takes 0.8 as training size.
-               Types: float
-       """
-       self._display_msg(msg="\nSpliting of dataset into training and testing ...",
-                         progress_bar=self.progress_bar,
-                         show_data=True)
-       self._display_msg(inline_msg="Training size : {}".format(self._train_size),
-                         progress_bar=self.progress_bar)
-       self._display_msg(inline_msg="Testing size : {}".format(round((1-self._train_size),2)),
-                         progress_bar=self.progress_bar)
-       start_time = time.time()
-       # Applying TrainTestSplit function on data
-       # Regression
-       train_test_func_params = {
-           "data" : self.data,
-           "id_column" : "id",
-           "train_size" : self._train_size,
-           "seed" : 42
-       }
-       if self.is_classification_type():
-           train_test_func_params["stratify_column"]=self.target_column
-       train_test_split_out = TrainTestSplit(**train_test_func_params)
-       train_test_split_out = train_test_split_out.result
-
-       # Splitting the data into training and testing data
-       self.train_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 1].drop('TD_IsTrainRow', axis=1)
-       self.test_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 0].drop('TD_IsTrainRow', axis=1)
-
-       self._display_msg(msg="Training data sample",
-                         data=self.train_df,
-                         progress_bar=self.progress_bar)
-
-       self._display_msg(msg="Testing data sample",
-                         data=self.test_df,
-                         progress_bar=self.progress_bar)
-
-       end_time = time.time()
-       self._display_msg(msg="Time taken for spliting of data: {:.2f} sec ".format(end_time - start_time),
-                         progress_bar=self.progress_bar,
-                         show_data=True)
-
-   def _set_custom_train_test_split(self):
-       """
-       DESCRIPTION:
-           Function to split dataset into training and testing based on user input.
-
-       """
-       # Fetching user input for train test split
-       train_test_split_input = self.custom_data.get("TrainTestSplitIndicator", False)
-       if train_test_split_input:
-           # Extracting training size
-           custom_train_size = self.custom_data.get("TrainingSize", None)
-           if custom_train_size is None:
-               self._display_msg(inline_msg="No information provided for training size. Proceeding with default option.",
-                                 progress_bar=self.progress_bar)
-           else:
-               if not isinstance(custom_train_size, float):
-                   err = Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE,
-                                              'custom_train', type(custom_train_size).__name__,
-                                              'float')
-                   raise TeradataMlException(err, MessageCodes.INVALID_COLUMN_TYPE)
-               self._train_size = custom_train_size
-       else:
-           self._display_msg(inline_msg="No information provided for performing customized train test split. Proceeding with default option.",
-                             progress_bar=self.progress_bar)
-
    def _handle_outliers(self,
-
+                        auto):
        """
        DESCRIPTION:
            Function to handle existing outliers in dataset based on running mode.
@@ -296,6 +235,12 @@ class _DataPreparation:
        DESCRIPTION:
            Function to handle data imbalance in dataset using sampling techniques
            in case of classification.
+
+       PARAMETERS:
+           data:
+               Required Argument.
+               Specifies the input teradataml DataFrame.
+               Types: pandas Dataframe.
        """
        pass
 
@@ -317,7 +262,7 @@ class _DataPreparation:
            outlier_method = "Tukey"
 
        # List of columns for outlier processing.
-       outlier_columns = [col for col in self.
+       outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
 
        # Detecting outlier percentage in each columns
        outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
@@ -367,28 +312,45 @@ class _DataPreparation:
            Pandas DataFrame containing, column name with outlier percentage.
 
        """
-
+
+       # Setting volatile and persist parameters for Outlier handling function
+       volatile, persist = self._set_generic_parameters(func_indicator='OutlierFilterIndicator',
+                                                        param_name='OutlierFilterParam')
+
+       # Performing fit on dataset for outlier handling
        fit_params = {
-           "data" : self.
+           "data" : self.data,
            "target_columns" : target_columns,
            "outlier_method" : outlier_method,
-           "replacement_value" : replacement_value
+           "replacement_value" : replacement_value,
+           "volatile" : volatile,
+           "persist" : persist
        }
        outlier_fit_out = OutlierFilterFit(**fit_params)
-       # Performing transform on
+       # Performing transform on dataset for outlier handling
        transform_params = {
-           "data" : self.
+           "data" : self.data,
            "object" : outlier_fit_out.result,
            "persist" : True
        }
-
-       #
-
+
+       # Disabling print if persist is True by default
+       if not volatile and not persist:
+           transform_params["display_table_name"] = False
+
+       if volatile:
+           transform_params["volatile"] = True
+           transform_params["persist"] = False
+       self.data = OutlierFilterTransform(**transform_params).result
+
+       if not volatile and not persist:
+           # Adding transformed data containing table to garbage collector
+           GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
    def _outlier_processing(self):
        """
        DESCRIPTION:
-           Function performs outlier processing on
+           Function performs outlier processing on dataset. It identifies and handle outliers in the dataset.
 
        """
        self._display_msg(msg="\nOutlier preprocessing ...",
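The hunk above threads the new volatile and persist options through OutlierFilterFit/OutlierFilterTransform and garbage-collects the interim table when neither is set. A hedged sketch of the underlying fit/transform call pattern on a teradataml DataFrame (table and column names are placeholders, and an existing Vantage connection is assumed):

```python
from teradataml import DataFrame, OutlierFilterFit, OutlierFilterTransform

df = DataFrame("my_training_table")          # placeholder table
target_columns = ["feature_1", "feature_2"]  # placeholder numeric columns

# Fit the outlier filter in-database; volatile/persist mirror the new
# AutoML pass-through arguments shown in the hunk above.
fit_out = OutlierFilterFit(data=df,
                           target_columns=target_columns,
                           outlier_method="Tukey",
                           replacement_value="MEDIAN",
                           volatile=False,
                           persist=False)

# Apply the fitted filter; persist=True keeps the transformed table around,
# as in the transform_params built above.
cleaned = OutlierFilterTransform(data=df,
                                 object=fit_out.result,
                                 persist=True).result
```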
@@ -409,8 +371,8 @@ class _DataPreparation:
                target_columns=columns_to_drop_rows
                replacement_strategy = "DELETE"
                self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
-               self._display_msg(msg="Sample of
-                                 data=self.
+               self._display_msg(msg="Sample of dataset after removing outlier rows:",
+                                 data=self.data,
                                  progress_bar=self.progress_bar)
 
        # Imputing Median value in place of outliers
@@ -421,8 +383,8 @@ class _DataPreparation:
                target_columns=columns_to_impute
                replacement_strategy = "MEDIAN"
                self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
-               self._display_msg(msg="Sample of
-                                 data=self.
+               self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
+                                 data=self.data,
                                  progress_bar=self.progress_bar)
 
        if len(columns_to_drop_rows) == 0 and len(columns_to_impute) == 0:
@@ -437,7 +399,7 @@ class _DataPreparation:
    def _custom_outlier_processing(self):
        """
        DESCRIPTION:
-           Function to perform outlier processing on
+           Function to perform outlier processing on dataset based on user input.
 
        """
        self._display_msg(msg="\nStarting customized outlier processing ...",
@@ -447,7 +409,7 @@ class _DataPreparation:
        # Checking user input for outlier filtering
        if outlier_filter_input:
            # List of columns for outlier processing.
-           target_columns = [col for col in self.
+           target_columns = [col for col in self.data.columns if col not in self.excluded_columns]
            # Checking user input for outlier detection method
            outlier_method = self.custom_data.get("OutlierDetectionMethod", None)
            if outlier_method == 'PERCENTILE':
@@ -464,11 +426,13 @@ class _DataPreparation:
            # Checking for rows if outlier containing columns exist
            if outlier_df.shape[0]:
                # Checking user input list for outlier handling
-               outlier_transform_list = self.custom_data.get("OutlierFilterParam", None)
+               outlier_transform_list = self.custom_data.get("OutlierFilterParam", None).copy()
                if outlier_transform_list:
+                   volatile = outlier_transform_list.pop("volatile", False)
+                   persist = outlier_transform_list.pop("persist", False)
                    # Checking user input for outlier handling
                    _Validators._validate_dataframe_has_argument_columns(list(outlier_transform_list.keys()), "OutlierFilterParam",
-                                                                         self.
+                                                                         self.data, "outlier_data")
 
                    for target_col, transform_val in outlier_transform_list.items():
                        # Fetching replacement value
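The custom path above now pops optional volatile and persist flags out of the user-supplied OutlierFilterParam dictionary before validating the remaining keys as column names. A hypothetical illustration of such an entry (keys other than volatile/persist are placeholder column names, and the per-column values are illustrative only):

```python
# Hypothetical slice of the custom data passed to the custom run mode.
custom_data = {
    "OutlierFilterIndicator": True,
    "OutlierFilterParam": {
        "volatile": False,        # popped off before column validation
        "persist": False,         # popped off before column validation
        "feature_1": "MEDIAN",    # placeholder column -> replacement value
    },
}

params = custom_data["OutlierFilterParam"].copy()
volatile = params.pop("volatile", False)
persist = params.pop("persist", False)
# The remaining keys are treated as target columns for outlier handling.
print(list(params.keys()))  # ['feature_1']
```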
@@ -501,7 +465,7 @@ class _DataPreparation:
        RETURNS:
            int, number of folds to be used for cross-validation.
        """
-       num_of_folds = lambda rows:
+       num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)
        return num_of_folds(rows)
 
    def _feature_selection_PCA(self):
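The fold-count rule restored in the hunk above maps table size to the number of cross-validation folds. A minimal standalone sketch of the same mapping (the function name is illustrative, not part of the package):

```python
def num_of_folds(rows: int) -> int:
    # Fewer folds for larger tables, as in the lambda above.
    return 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)

assert num_of_folds(50000) == 2   # large table  -> 2 folds
assert num_of_folds(5000) == 4    # medium table -> 4 folds
assert num_of_folds(500) == 10    # small table  -> 10 folds
```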
@@ -517,14 +481,12 @@ class _DataPreparation:
        from sklearn.decomposition import PCA
 
        start_time = time.time()
-
+
        # Temporary Pulling data for feature selection
-
-       test = DataFrame.from_table(self.table_name_mapping['pca_test']).to_pandas()
+       pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
 
        # Drop unnecessary columns and store the result
-       train_data =
-       test_data = test.drop(columns=['id', self.target_column], axis=1)
+       train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
        # Initialize and fit PCA
        pca = PCA()
@@ -537,16 +499,15 @@ class _DataPreparation:
        # Create a new instance of PCA with the optimal number of components
        pca = PCA(n_components=n, random_state=42)
 
-       # Apply PCA on
+       # Apply PCA on dataset
        X_train_pca = pca.fit_transform(train_data)
-       X_test_pca = pca.transform(test_data)
 
        # storing instance of PCA in data transformation dictionary
        self.data_transform_dict["pca_fit_instance"] = pca
+       self.data_transform_dict["pca_fit_columns"] = train_data.columns.tolist()
 
        #converting the numarray into dataframes
        train_df = pd.DataFrame(X_train_pca)
-       test_df = pd.DataFrame(X_test_pca)
 
        #creating names for combined columns
        column_name = {col: 'col_'+str(i) for i,col in enumerate(train_df.columns)}
@@ -556,15 +517,12 @@ class _DataPreparation:
 
        #renaming them
        train_df = train_df.rename(columns=column_name)
-       test_df = test_df.rename(columns=column_name)
 
        # adding the id column [PCA does not shuffle the dataset]
-       train_df = pd.concat([
-       test_df = pd.concat([test.reset_index(drop=True)['id'], test_df.reset_index(drop=True)], axis=1)
+       train_df = pd.concat([pca_train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)
 
-       # merging target column with new
-       train_df[self.target_column] =
-       test_df[self.target_column] = test[self.target_column].reset_index(drop=True)
+       # merging target column with new data
+       train_df[self.target_column] = pca_train[self.target_column].reset_index(drop=True)
 
        self.pca_feature = train_df.drop(columns=['id',self.target_column],axis=1).columns.tolist()
 
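The PCA hunks above now fit PCA on the training frame only and keep both the fitted instance and the fitted column list so that unseen data can be reduced later with the same transform. A minimal, self-contained sketch of that pattern on synthetic data (column names and the 0.95 variance threshold are illustrative assumptions, not taken from the package):

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Synthetic frame standing in for the pulled training table.
rng = np.random.default_rng(42)
pca_train = pd.DataFrame(rng.normal(size=(100, 6)),
                         columns=[f"f{i}" for i in range(6)])
pca_train.insert(0, "id", range(100))
pca_train["target"] = rng.integers(0, 2, size=100)

train_data = pca_train.drop(columns=["id", "target"])

# Choose the number of components from cumulative explained variance
# (0.95 is an assumed threshold for illustration).
n = np.argmax(np.cumsum(PCA().fit(train_data).explained_variance_ratio_) >= 0.95) + 1
pca = PCA(n_components=n, random_state=42)
X_train_pca = pca.fit_transform(train_data)

# Keep the fitted PCA and the exact column order it was fitted on,
# so new data can be reduced with pca.transform(new_df[fit_columns]).
fit_instance, fit_columns = pca, train_data.columns.tolist()
train_df = pd.DataFrame(X_train_pca,
                        columns=[f"col_{i}" for i in range(X_train_pca.shape[1])])
train_df.insert(0, "id", pca_train["id"].to_numpy())
train_df["target"] = pca_train["target"].to_numpy()
```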
@@ -577,26 +535,20 @@ class _DataPreparation:
                          show_data=True)
 
        # Pushing the data in database
-       self.copy_dataframe_to_sql(train_df,
+       self.copy_dataframe_to_sql(train_df, 'pca', self.persist)
 
-   def _feature_selection_RFE(self,
-
-                              test=None):
+   def _feature_selection_RFE(self,
+                              data=None):
        """
        DESCRIPTION:
            Function performs Recursive Feature Elimination (RFE) for feature selection.
            It identifies a subset of the most relevant features in the dataset.
 
        PARAMETERS:
-
+           data:
               Required Argument.
               Specifies the input train pandas DataFrame.
-              Types: pandas Dataframe
-
-           test:
-              Required Argument.
-              Specifies the input test pandas DataFrame.
-              Types: pandas Dataframe
+              Types: pandas Dataframe
        """
        self._display_msg(msg="\nFeature selection using rfe ...",
                          progress_bar=self.progress_bar,
@@ -611,51 +563,53 @@ class _DataPreparation:
        # Regression
        is_classification = self.is_classification_type()
        # Getting the value of k in k-fold cross-validation
-       folds = self._num_of_folds(
+       folds = self._num_of_folds(data.shape[0])
 
-       #
-
-
+       # Suppressing warnings generated by pandas and sklearn
+       with warnings.catch_warnings():
+           warnings.filterwarnings('ignore')
 
-
-
-
+           # Random forest for RFE model
+           RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
+           rf = RFModel(n_estimators=100, random_state=42)
 
-
-
-
+           # Determine the scoring metric based on the number of unique classes
+           score = 'r2' if not self.is_classification_type() \
+               else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
 
-
-
+           # # Instantiate StratifiedKFold with shuffling for classification
+           cv = folds if not self.is_classification_type() \
+               else StratifiedKFold(n_splits=folds, shuffle=False)
 
-
-
-           train_target = train[self.target_column]
+           # Define the RFE with cross-validation
+           rfecv = RFECV(rf, cv=cv, scoring=score)
 
-
-
+           # Prepare data
+           train_data = data.drop(columns=['id',self.target_column], axis=1)
+           train_target = data[self.target_column]
 
-
-
+           # Fit the RFE using cv
+           rfecv.fit(train_data, train_target)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+           # Extract the features
+           features = train_data.columns[rfecv.support_].tolist()
+
+           self._display_msg(msg="feature selected by RFE:",
+                             col_lst=features,
+                             progress_bar=self.progress_bar)
+           features.append(self.target_column)
+           features.insert(0,'id')
+
+           selected_rfe_df = data[features]
+
+           # storing the rfe selected features in data transformation dictionary
+           self.data_transform_dict['rfe_features'] = features
+
+           columns_to_rename = [col for col in selected_rfe_df.columns if col not in ['id', self.target_column]]
+           new_column = {col: f'r_{col}' for col in columns_to_rename}
+           self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
+
+           selected_rfe_df.rename(columns=new_column, inplace=True)
 
        # storing the rename column list in data transformation dictionary
        self.data_transform_dict['rfe_rename_column'] = columns_to_rename
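The RFE hunk above replaces the old train/test pair with a single training frame and selects features with scikit-learn's RFECV. A minimal, self-contained sketch of the same selection pattern on synthetic data (column names and dataset sizes are illustrative):

```python
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Synthetic stand-in for the training frame pulled from Vantage.
X, y = make_classification(n_samples=300, n_features=8, n_informative=3, random_state=42)
data = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])
data["target"] = y

train_data = data.drop(columns=["target"])
train_target = data["target"]

# Random forest as the estimator, StratifiedKFold for classification,
# 'roc_auc' because this target is binary (mirrors the logic above).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rfecv = RFECV(rf, cv=StratifiedKFold(n_splits=4, shuffle=False), scoring="roc_auc")
rfecv.fit(train_data, train_target)

# Columns kept by RFE, in the same way the hunk above builds `features`.
features = train_data.columns[rfecv.support_].tolist()
print(features)
```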
@@ -664,29 +618,24 @@ class _DataPreparation:
        self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)
-       self.rfe_feature =
+       self.rfe_feature = selected_rfe_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
 
        # Pushing data into database
-       self.copy_dataframe_to_sql(
+       self.copy_dataframe_to_sql(selected_rfe_df, 'rfe', self.persist)
 
    def _feature_selection_Lasso(self,
-
-                                test=None):
+                                data=None):
        """
        DESCRIPTION:
            Function performs Lasso Regression for feature selection.
            It helps in identifing and retaining the most important features while setting less important ones to zero.
 
        PARAMETERS:
-
+           data:
               Required Argument.
               Specifies the input train pandas DataFrame.
               Types: pandas Dataframe
 
-           test:
-              Required Argument.
-              Specifies the input test pandas DataFrame.
-              Types: pandas Dataframe
        """
        start_time = time.time()
        self._display_msg(msg="\nFeature selection using lasso ...",
@@ -700,39 +649,43 @@ class _DataPreparation:
        from sklearn.model_selection import StratifiedKFold
 
        # Getting the value k in k-fold cross-validation
-       num_folds = self._num_of_folds(
+       num_folds = self._num_of_folds(data.shape[0])
 
-       # Prepare
-       train_features =
-       train_target =
+       # Prepare data
+       train_features = data.drop(columns=['id',self.target_column], axis=1)
+       train_target = data[self.target_column]
 
-       #
-
-
-
+       # Suppressing warnings generated by pandas and sklearn
+       with warnings.catch_warnings():
+           warnings.filterwarnings('ignore')
+
+           # Determine the estimator and parameters based on the type of problem
+           if self.is_classification_type():
+               if self.data.drop_duplicate(self.target_column).size == 2:
+                   scoring_metric = 'roc_auc'
+               else:
+                   scoring_metric = 'f1_macro'
+               estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+               parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
            else:
-
-
-
-           else:
-               estimator = Lasso(random_state=42)
-               parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
-               scoring_metric = "r2"
+               estimator = Lasso(random_state=42)
+               parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
+               scoring_metric = "r2"
 
-
-
-
-
+           if self.is_classification_type():
+               cv = StratifiedKFold(n_splits=5, shuffle=False)
+           else:
+               cv = num_folds
 
-
-
-
+           # Applying hyperparameter tuning and optimizing score
+           hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
+                                                scoring=scoring_metric, verbose=0)
 
-
-
+           # Fitting the best result from hyperparameter
+           hyperparameter_search.fit(train_features, train_target)
 
-
-
+           # Extracting the important estimators
+           feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)
 
        # Extracting feature using estimators whose importance > 0
        if self.is_classification_type():
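The Lasso hunk above tunes the estimator with GridSearchCV and then keeps only features whose absolute coefficients are non-zero. A minimal regression-flavoured sketch of that selection step on synthetic data (names and the reduced grid are illustrative):

```python
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Synthetic stand-in for the training frame.
X, y = make_regression(n_samples=200, n_features=6, n_informative=3, random_state=42)
data = pd.DataFrame(X, columns=[f"f{i}" for i in range(6)])
data["target"] = y

train_features = data.drop(columns=["target"])
train_target = data["target"]

# Grid over the regularisation strength, mirroring the parameter grid above.
estimator = Lasso(random_state=42)
parameters = {"alpha": [0.00001, 0.001, 0.1, 10, 1000], "max_iter": [100, 500]}
search = GridSearchCV(estimator, parameters, cv=4, refit=True, scoring="r2", verbose=0)
search.fit(train_features, train_target)

# Keep only the features Lasso did not shrink to zero.
feature_importance = np.abs(search.best_estimator_.coef_)
important_features = train_features.columns[feature_importance > 0].tolist()
print(important_features)
```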
@@ -747,8 +700,7 @@ class _DataPreparation:
                          progress_bar=self.progress_bar)
 
        important_features = ['id'] + important_features + [self.target_column]
-
-       test_df = test[important_features]
+       selected_lasso_df = data[important_features]
 
        # Storing the lasso selected features in data transformation dictionary
        self.data_transform_dict['lasso_features'] = important_features
@@ -758,65 +710,62 @@ class _DataPreparation:
        self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)
-       self.lasso_feature =
+       self.lasso_feature = selected_lasso_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
 
-       self.copy_dataframe_to_sql(
+       self.copy_dataframe_to_sql(selected_lasso_df, 'lasso', self.persist)
 
    def copy_dataframe_to_sql(self,
-
-
-
+                             data,
+                             prefix,
+                             persist):
        """
        DESCRIPTION:
            Function to copy dataframe to SQL with generated table name.
 
        PARAMETERS:
-
-              Required Argument.
-              Specifies the input train pandas DataFrame.
-              Types: pandas Dataframe
-
-           test:
+           data:
               Required Argument.
-              Specifies the input
+              Specifies the input pandas DataFrame.
              Types: pandas Dataframe
 
            prefix:
               Required Argument.
               Specifies the prefix for the table name.
               Types: str
+
+           persist:
+              Required Argument.
+              Specifies whether to persist the results of the
+              function in a table or not. When set to True,
+              results are persisted in a table; otherwise,
+              results are garbage collected at the end of the
+              session.
+              Types: bool
        """
        # Generating table names
        train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
-                                                              table_type = TeradataConstants.TERADATA_TABLE
-
-                                                              table_type = TeradataConstants.TERADATA_TABLE)
-
+                                                              table_type = TeradataConstants.TERADATA_TABLE,
+                                                              gc_on_quit=not persist)
        # Storing the table names in the table name mapping dictionary
        self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
-       self.table_name_mapping['{}_test'.format(prefix)] = test_table_name
 
        # Pushing data into database
        if self.is_classification_type():
-           copy_to_sql(df=
-           copy_to_sql(df=test, table_name=test_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+           copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
        else:
-           copy_to_sql(df=
-           copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")
-
-
+           copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
 
    def _scaling_features_helper(self,
-
-
+                                data=None,
+                                feature_selection_mtd=None):
        """
        DESCRIPTION:
            This function selects the features on which feature scaling should be applied.
 
        PARAMETERS:
-
+           data:
               Required Argument.
-              Specifies the
+              Specifies the data on which feature scaling will be applied.
              Types: teradataml Dataframe
 
            feature_selection_mtd:
@@ -831,10 +780,11 @@ class _DataPreparation:
        columns_to_scale = []
 
        # Iterating over the columns
-       for col in
+       for col in data.columns:
            # Selecting columns that will be scaled
            # Exculding target_col and columns with single value
-           if col not in ['id', self.target_column] and
+           if col not in ['id', self.target_column] and \
+                   data.drop_duplicate(col).size > 1:
                columns_to_scale.append(col)
 
        if feature_selection_mtd == "lasso":
@@ -848,7 +798,7 @@ class _DataPreparation:
        return columns_to_scale
 
    def _scaling_features(self,
-
+                         feature_selection_mtd=None):
        """
        DESCRIPTION:
            Function performs feature scaling on columns present inside the dataset
@@ -858,7 +808,7 @@ class _DataPreparation:
            feature_selection_mtd:
                Required Argument.
                Specifies the feature selection algorithm used.
-               Types: str
+               Types: str
        """
 
        self._display_msg(msg="\nscaling Features of {} data ...".format(feature_selection_mtd),
@@ -866,8 +816,7 @@ class _DataPreparation:
                          show_data=True)
 
        start_time = time.time()
-
-       test = None
+       data_to_scale = None
 
        if self.is_classification_type():
            scale_method = self._scale_method_cls
@@ -876,17 +825,18 @@ class _DataPreparation:
 
        # Loading data for feature scaling based of feature selection method
        if feature_selection_mtd == 'rfe':
-
-           test = DataFrame(self.table_name_mapping['rfe_test'])
+           data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
        elif feature_selection_mtd == 'lasso':
-
-           test = DataFrame(self.table_name_mapping['lasso_test'])
+           data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
        else:
-
-
+           data_to_scale = self.data
+
+       # Setting volatile and persist parameters for ScaleFit and ScaleTransform functions
+       volatile, persist = self._set_generic_parameters(func_indicator='FeatureScalingIndicator',
+                                                        param_name='FeatureScalingParam')
 
        # List of columns that will be scaled
-       scale_col= self._scaling_features_helper(
+       scale_col= self._scaling_features_helper(data_to_scale, feature_selection_mtd)
 
        if len(scale_col) != 0:
            self._display_msg(msg="columns that will be scaled: ",
@@ -894,41 +844,33 @@ class _DataPreparation:
                              progress_bar=self.progress_bar)
 
            # Scale Fit
-           fit_obj = ScaleFit(data=
+           fit_obj = ScaleFit(data=data_to_scale,
                               target_columns=scale_col,
-                              scale_method=scale_method
+                              scale_method=scale_method,
+                              volatile=volatile,
+                              persist=persist)
 
            # storing the scale fit object and columns in data transformation dictionary
-           self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj
+           self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
            self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
 
            # List of columns to copy to the output generated by scale transform
-           accumulate_cols = list(set(
-
-           # Scaling on training dataset
-           tr_obj = ScaleTransform(data=train,
-                                   object=fit_obj,
-                                   accumulate=accumulate_cols)
+           accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
 
-           # Scaling
-
-
-
+           # Scaling dataset
+           transform_obj = ScaleTransform(data=data_to_scale,
+                                          object=fit_obj,
+                                          accumulate=accumulate_cols)
+           scaled_df = transform_obj.result
 
-
-
-
-           self._display_msg(msg="Training dataset sample after scaling:",
-                             data=train,
-                             progress_bar=self.progress_bar)
-           self._display_msg(msg="Testing dataset sample after scaling:",
-                             data=test,
+           self._display_msg(msg="Dataset sample after scaling:",
+                             data=scaled_df,
                              progress_bar=self.progress_bar)
        else:
            self._display_msg(msg="No columns to scale.",
                              progress_bar=self.progress_bar)
 
-       self.copy_dataframe_to_sql(
+       self.copy_dataframe_to_sql(scaled_df, feature_selection_mtd, persist)
 
        end_time = time.time()
        self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
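The scaling hunk above fits ScaleFit once on the selected data and applies ScaleTransform, optionally routing interim results to volatile or persisted tables. A hedged sketch of how that pair is typically called on a teradataml DataFrame (table and column names are placeholders; an existing Vantage connection is assumed):

```python
from teradataml import DataFrame, ScaleFit, ScaleTransform

# Placeholder table with an 'id' column, numeric features and a 'target' column.
data_to_scale = DataFrame("my_training_table")

scale_col = [c for c in data_to_scale.columns if c not in ("id", "target")]

# Fit scaling parameters in-database; volatile/persist control where the
# interim result lands, as in the new AutoML arguments above.
fit_obj = ScaleFit(data=data_to_scale,
                   target_columns=scale_col,
                   scale_method="STD",
                   volatile=False,
                   persist=False)

# Apply the fitted scaling and carry the untouched columns through.
accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
scaled_df = ScaleTransform(data=data_to_scale,
                           object=fit_obj,
                           accumulate=accumulate_cols).result
```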
@@ -946,43 +888,32 @@ class _DataPreparation:
        # Checking user input for feature scaling
        if feature_scaling_input:
            # Extracting scaling method
-
-           if
-
-
-
-
-               self._scale_method_cls = custom_scaling_method
+           custom_scaling_params = self.custom_data.get("FeatureScalingParam", None)
+           if custom_scaling_params:
+               custom_scaling_method = custom_scaling_params.get("FeatureScalingMethod", None)
+               if custom_scaling_method is None:
+                   self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
+                                     progress_bar=self.progress_bar)
                else:
-                   self.
+                   if self.is_classification_type():
+                       self._scale_method_cls = custom_scaling_method
+                   else:
+                       self._scale_method_reg = custom_scaling_method
        else:
            self._display_msg(inline_msg="No information provided for performing customized feature scaling. Proceeding with default option.",
                              progress_bar=self.progress_bar)
 
 
-   def _handle_generated_features(self
-                                  label = None):
+   def _handle_generated_features(self):
        """
        DESCRIPTION:
            Function to handle newly generated float features. It will round them upto 4 digit after decimal point.
-
-
-
-              Optional Argument.
-              Specifies label for dataset on which rounding up is getting done i.e., 'train' for training
-              and 'test' for testing dataset.
-              By Default, it takes None and transformation is getting applied to whole dataset.
-              Types: str
-
+
+       RETURNS:
+           Pandas DataFrame containing, rounded up float columns.
        """
-       #
-
-           target_df = self.train_df
-       elif label == 'test':
-           target_df = self.test_df
-       else:
-           target_df=self.data
-
+       # Assigning data to target dataframe
+       target_df = self.data
        # Detecting list of float columns on target dataset
        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
 
@@ -1001,10 +932,19 @@ class _DataPreparation:
                      "precision_digit" : 4,
                      "accumulate" : accumulate_columns,
                      "persist" : True}
+
+       # Disabling print if persist is True by default
+       if not self.volatile and not self.persist:
+           fit_params["display_table_name"] = False
+
+       if self.volatile:
+           fit_params["volatile"] = True
+           fit_params["persist"] = False
 
        transform_output = RoundColumns(**fit_params).result
-
-
+       if not self.volatile and not self.persist:
+           # Adding transformed data containing table to garbage collector
+           GarbageCollector._add_to_garbagecollector(transform_output._table_name)
        cols = transform_output.columns
        df = transform_output.to_pandas().reset_index()
        df = df[cols]