teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
|
@@ -16,15 +16,18 @@
|
|
|
16
16
|
# Python libraries
|
|
17
17
|
import concurrent.futures
|
|
18
18
|
from concurrent.futures import ThreadPoolExecutor
|
|
19
|
+
import math
|
|
19
20
|
import pandas as pd
|
|
20
21
|
from itertools import product
|
|
22
|
+
import numpy as np
|
|
21
23
|
|
|
22
24
|
# Teradata libraries
|
|
23
25
|
from teradataml.context import context as tdmlctx
|
|
24
26
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
25
27
|
from teradataml.dataframe.dataframe import DataFrame
|
|
26
28
|
from teradataml import execute_sql, get_connection
|
|
27
|
-
from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN
|
|
29
|
+
from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
|
|
30
|
+
from teradataml.utils.validators import _Validators
|
|
28
31
|
|
|
29
32
|
|
|
30
33
|
class _ModelTraining:
|
|
@@ -36,7 +39,8 @@ class _ModelTraining:
|
|
|
36
39
|
verbose=0,
|
|
37
40
|
features=None,
|
|
38
41
|
task_type="Regression",
|
|
39
|
-
custom_data = None
|
|
42
|
+
custom_data = None,
|
|
43
|
+
**kwargs):
|
|
40
44
|
"""
|
|
41
45
|
DESCRIPTION:
|
|
42
46
|
Function initializes the data, target column, features and models
|
|
@@ -49,12 +53,12 @@ class _ModelTraining:
|
|
|
49
53
|
Types: teradataml Dataframe
|
|
50
54
|
|
|
51
55
|
target_column:
|
|
52
|
-
Required
|
|
56
|
+
Required Argument.
|
|
53
57
|
Specifies the target column present inside the dataset.
|
|
54
58
|
Types: str
|
|
55
59
|
|
|
56
60
|
model_list:
|
|
57
|
-
Required
|
|
61
|
+
Required Argument.
|
|
58
62
|
Specifies the list of models to be used for model training.
|
|
59
63
|
Types: list
|
|
60
64
|
|
|
@@ -70,13 +74,13 @@ class _ModelTraining:
|
|
|
70
74
|
Types: int
|
|
71
75
|
|
|
72
76
|
features:
|
|
73
|
-
Required
|
|
77
|
+
Required Argument.
|
|
74
78
|
Specifies the list of features selected by rfe, lasso and pca
|
|
75
79
|
respectively in this order.
|
|
76
80
|
Types: list of list of strings (str)
|
|
77
81
|
|
|
78
82
|
task_type:
|
|
79
|
-
Required
|
|
83
|
+
Required Argument.
|
|
80
84
|
Specifies the task type for AutoML, whether to apply regression
|
|
81
85
|
or classification on the provided dataset.
|
|
82
86
|
Default Value: "Regression"
|
|
@@ -84,9 +88,31 @@ class _ModelTraining:
|
|
|
84
88
|
Types: str
|
|
85
89
|
|
|
86
90
|
custom_data:
|
|
87
|
-
Optional
|
|
91
|
+
Optional Argument.
|
|
88
92
|
Specifies json object containing user customized input.
|
|
89
93
|
Types: json object
|
|
94
|
+
|
|
95
|
+
**kwargs:
|
|
96
|
+
Specifies the additional arguments for model training. Below
|
|
97
|
+
are the additional arguments:
|
|
98
|
+
volatile:
|
|
99
|
+
Optional Argument.
|
|
100
|
+
Specifies whether to put the interim results of the
|
|
101
|
+
functions in a volatile table or not. When set to
|
|
102
|
+
True, results are stored in a volatile table,
|
|
103
|
+
otherwise not.
|
|
104
|
+
Default Value: False
|
|
105
|
+
Types: bool
|
|
106
|
+
|
|
107
|
+
persist:
|
|
108
|
+
Optional Argument.
|
|
109
|
+
Specifies whether to persist the interim results of the
|
|
110
|
+
functions in a table or not. When set to True,
|
|
111
|
+
results are persisted in a table; otherwise,
|
|
112
|
+
results are garbage collected at the end of the
|
|
113
|
+
session.
|
|
114
|
+
Default Value: False
|
|
115
|
+
Types: bool
|
|
90
116
|
"""
|
|
91
117
|
self.data = data
|
|
92
118
|
self.target_column = target_column
|
|
@@ -96,12 +122,16 @@ class _ModelTraining:
|
|
|
96
122
|
self.task_type = task_type
|
|
97
123
|
self.custom_data = custom_data
|
|
98
124
|
self.labels = self.data.drop_duplicate(self.target_column).size
|
|
125
|
+
self.startify_col = None
|
|
126
|
+
self.persist = kwargs.get("persist", False)
|
|
127
|
+
self.volatile = kwargs.get("volatile", False)
|
|
99
128
|
|
|
100
129
|
def model_training(self,
|
|
101
130
|
auto=True,
|
|
102
131
|
max_runtime_secs=None,
|
|
103
132
|
stopping_metric=None,
|
|
104
|
-
stopping_tolerance=0
|
|
133
|
+
stopping_tolerance=0,
|
|
134
|
+
max_models=None
|
|
105
135
|
):
|
|
106
136
|
"""
|
|
107
137
|
DESCRIPTION:
|
|
@@ -112,14 +142,14 @@ class _ModelTraining:
|
|
|
112
142
|
|
|
113
143
|
PARAMETERS:
|
|
114
144
|
auto:
|
|
115
|
-
Optional
|
|
145
|
+
Optional Argument.
|
|
116
146
|
Specifies whether to run data preparation in auto mode or custom mode.
|
|
117
147
|
When set to True, runs automatically; otherwise, it takes user inputs.
|
|
118
148
|
Default Value: True
|
|
119
149
|
Types: boolean
|
|
120
150
|
|
|
121
151
|
max_runtime_secs:
|
|
122
|
-
Optional
|
|
152
|
+
Optional Argument.
|
|
123
153
|
Specifies the time limit in seconds for model training.
|
|
124
154
|
Types: int
|
|
125
155
|
|
|
@@ -132,6 +162,11 @@ class _ModelTraining:
|
|
|
132
162
|
Required, when "stopping_metric" is set, otherwise optional.
|
|
133
163
|
Specifies the stopping tolerance for stopping metrics in model training.
|
|
134
164
|
Types: float
|
|
165
|
+
|
|
166
|
+
max_models:
|
|
167
|
+
Optional Argument.
|
|
168
|
+
Specifies the maximum number of models to be trained.
|
|
169
|
+
Types: int
|
|
135
170
|
|
|
136
171
|
RETURNS:
|
|
137
172
|
pandas dataframes containing model information, leaderboard and target
|
|
@@ -140,6 +175,7 @@ class _ModelTraining:
|
|
|
140
175
|
self.stopping_metric = stopping_metric
|
|
141
176
|
self.stopping_tolerance = stopping_tolerance
|
|
142
177
|
self.max_runtime_secs = max_runtime_secs
|
|
178
|
+
self.max_models = max_models
|
|
143
179
|
|
|
144
180
|
self._display_heading(phase=3, progress_bar=self.progress_bar)
|
|
145
181
|
self._display_msg(msg='Model Training started ...',
|
|
@@ -152,6 +188,10 @@ class _ModelTraining:
|
|
|
152
188
|
if not auto:
|
|
153
189
|
parameters = self._custom_hyperparameters(parameters)
|
|
154
190
|
|
|
191
|
+
# Validates the upper limit of max_models based on total model combinations
|
|
192
|
+
if self.max_models is not None:
|
|
193
|
+
self._validate_upper_limit_for_max_models(parameters)
|
|
194
|
+
|
|
155
195
|
if self.verbose == 2:
|
|
156
196
|
self._display_hyperparameters(parameters)
|
|
157
197
|
|
|
@@ -167,6 +207,54 @@ class _ModelTraining:
|
|
|
167
207
|
|
|
168
208
|
return models, leader_board, self.labels
|
|
169
209
|
|
|
210
|
+
def _get_model_param_space(self,
|
|
211
|
+
hyperparameters):
|
|
212
|
+
"""
|
|
213
|
+
DESCRIPTION:
|
|
214
|
+
Internal function to calculate the total number of models to be trained for specific model.
|
|
215
|
+
|
|
216
|
+
PARAMETERS:
|
|
217
|
+
hyperparameters:
|
|
218
|
+
Required Argument.
|
|
219
|
+
Specifies the hyperparameters available for the ML model.
|
|
220
|
+
Types: list of dict
|
|
221
|
+
|
|
222
|
+
RETURNS:
|
|
223
|
+
int containing the total number of models available for training.
|
|
224
|
+
"""
|
|
225
|
+
# Creating all possible combinations of hyperparameters
|
|
226
|
+
all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
|
|
227
|
+
# Getting total number of models for each model training function
|
|
228
|
+
total_models = len(all_combinations)
|
|
229
|
+
return total_models
|
|
230
|
+
|
|
231
|
+
def _validate_upper_limit_for_max_models(self,
|
|
232
|
+
hyperparameters_list):
|
|
233
|
+
"""
|
|
234
|
+
DESCRIPTION:
|
|
235
|
+
Internal function to validate the upper limit of max_models.
|
|
236
|
+
|
|
237
|
+
PARAMETERS:
|
|
238
|
+
hyperparameters_list:
|
|
239
|
+
Required Argument.
|
|
240
|
+
Specifies the hyperparameters for different ML models.
|
|
241
|
+
Types: list of dict
|
|
242
|
+
|
|
243
|
+
RETURNS:
|
|
244
|
+
None
|
|
245
|
+
|
|
246
|
+
RAISES:
|
|
247
|
+
TeradataMlException, ValueError
|
|
248
|
+
"""
|
|
249
|
+
model_param_space = 0
|
|
250
|
+
for hyperparameter_dct in hyperparameters_list:
|
|
251
|
+
# getting total number of models for each model
|
|
252
|
+
total_models = self._get_model_param_space(hyperparameter_dct)
|
|
253
|
+
model_param_space += total_models
|
|
254
|
+
|
|
255
|
+
# Validating upper range for max_models
|
|
256
|
+
_Validators._validate_argument_range(self.max_models, "max_models", ubound=model_param_space, ubound_inclusive=True)
|
|
257
|
+
|
|
170
258
|
def _display_hyperparameters(self,
|
|
171
259
|
hyperparameters_list):
|
|
172
260
|
"""
|
|
@@ -175,7 +263,7 @@ class _ModelTraining:
|
|
|
175
263
|
|
|
176
264
|
PARAMETERS:
|
|
177
265
|
hyperparameters_list:
|
|
178
|
-
Required
|
|
266
|
+
Required Argument.
|
|
179
267
|
Specifies the hyperparameters for different ML models.
|
|
180
268
|
Types: list of dict
|
|
181
269
|
|
|
@@ -189,16 +277,13 @@ class _ModelTraining:
|
|
|
189
277
|
|
|
190
278
|
# Iterating over hyperparameters_list
|
|
191
279
|
for hyperparameter_dct in hyperparameters_list:
|
|
192
|
-
# Extracting hyperparameter and
|
|
280
|
+
# Extracting hyperparameter and their value from hyperparameters dictionary
|
|
193
281
|
for key, val in hyperparameter_dct.items():
|
|
194
282
|
# Displaying hyperparameters
|
|
195
283
|
print(f"{key} : {str(val)}")
|
|
196
284
|
|
|
197
|
-
# Creating all possible combinations of hyperparameters
|
|
198
|
-
all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameter_dct.values()]))
|
|
199
|
-
|
|
200
285
|
# Displaying total number of models for each model
|
|
201
|
-
total_models =
|
|
286
|
+
total_models = self._get_model_param_space(hyperparameter_dct)
|
|
202
287
|
print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
|
|
203
288
|
print(f"--"*100+'\n')
|
|
204
289
|
|
|
@@ -210,7 +295,7 @@ class _ModelTraining:
|
|
|
210
295
|
|
|
211
296
|
PARAMETERS:
|
|
212
297
|
trained_models_info:
|
|
213
|
-
Required
|
|
298
|
+
Required Argument.
|
|
214
299
|
Specifies the trained models information to display.
|
|
215
300
|
Types: pandas Dataframe
|
|
216
301
|
|
|
@@ -219,18 +304,25 @@ class _ModelTraining:
|
|
|
219
304
|
"""
|
|
220
305
|
# Creating a copy to avoid use of same reference of memory
|
|
221
306
|
if self.task_type != "Regression":
|
|
222
|
-
sorted_model_df = trained_models_info.sort_values(by=['
|
|
223
|
-
|
|
307
|
+
sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
|
|
308
|
+
ascending=[False, False]).reset_index(drop=True)
|
|
224
309
|
else:
|
|
225
|
-
sorted_model_df = trained_models_info.sort_values(by='R2
|
|
226
|
-
|
|
310
|
+
sorted_model_df = trained_models_info.sort_values(by='R2',
|
|
311
|
+
ascending=False).reset_index(drop=True)
|
|
312
|
+
|
|
227
313
|
|
|
228
314
|
# Adding rank to leaderboard
|
|
229
|
-
sorted_model_df.insert(0, '
|
|
230
|
-
|
|
231
|
-
#
|
|
232
|
-
|
|
233
|
-
|
|
315
|
+
sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)
|
|
316
|
+
|
|
317
|
+
# Internal Data list for leaderboard
|
|
318
|
+
dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]
|
|
319
|
+
|
|
320
|
+
# Excluding the model object and model name from leaderboard
|
|
321
|
+
leaderboard = sorted_model_df.drop(dp_lst, axis=1)
|
|
322
|
+
|
|
323
|
+
# filtering the rows based on the max_models
|
|
324
|
+
if self.max_models is not None:
|
|
325
|
+
leaderboard = leaderboard[leaderboard["RANK"] <= self.max_models]
|
|
234
326
|
|
|
235
327
|
self._display_msg(msg="Leaderboard",
|
|
236
328
|
progress_bar=self.progress_bar,
|
|
@@ -343,12 +435,12 @@ class _ModelTraining:
|
|
|
343
435
|
|
|
344
436
|
PARAMETERS:
|
|
345
437
|
num_rows:
|
|
346
|
-
Required
|
|
438
|
+
Required Argument.
|
|
347
439
|
Specifies the number of rows in dataset.
|
|
348
440
|
Types: int
|
|
349
441
|
|
|
350
442
|
num_cols:
|
|
351
|
-
Required
|
|
443
|
+
Required Argument.
|
|
352
444
|
Specifies the number of columns in dataset.
|
|
353
445
|
Types: int
|
|
354
446
|
|
|
@@ -375,28 +467,24 @@ class _ModelTraining:
|
|
|
375
467
|
max_depth.extend([6, 7, 8])
|
|
376
468
|
min_node_size.extend([2])
|
|
377
469
|
iter_num.extend([20])
|
|
378
|
-
num_trees.extend([10, 20])
|
|
379
470
|
elif num_rows < 10000 and num_cols < 15:
|
|
380
471
|
min_impurity.extend([0.1, 0.2])
|
|
381
472
|
shrinkage_factor.extend([0.1, 0.3])
|
|
382
473
|
max_depth.extend([6, 8, 10])
|
|
383
474
|
min_node_size.extend([2, 3])
|
|
384
475
|
iter_num.extend([20, 30])
|
|
385
|
-
num_trees.extend([20, 30])
|
|
386
476
|
elif num_rows < 100000 and num_cols < 20:
|
|
387
477
|
min_impurity.extend([0.2, 0.3])
|
|
388
478
|
shrinkage_factor.extend([0.01, 0.1, 0.2])
|
|
389
479
|
max_depth.extend([4, 6, 7])
|
|
390
480
|
min_node_size.extend([3, 4])
|
|
391
481
|
iter_num.extend([30, 40])
|
|
392
|
-
num_trees.extend([30, 40])
|
|
393
482
|
else:
|
|
394
483
|
min_impurity.extend([0.1, 0.2, 0.3])
|
|
395
484
|
shrinkage_factor.extend([0.01, 0.05, 0.1])
|
|
396
485
|
max_depth.extend([3, 4, 7, 8])
|
|
397
486
|
min_node_size.extend([2, 3, 4])
|
|
398
487
|
iter_num.extend([20, 30, 40])
|
|
399
|
-
num_trees.extend([20, 30, 40])
|
|
400
488
|
|
|
401
489
|
# Hyperparameters for XGBoost model
|
|
402
490
|
xgb_params = {
|
|
@@ -409,7 +497,8 @@ class _ModelTraining:
|
|
|
409
497
|
'shrinkage_factor': tuple(shrinkage_factor),
|
|
410
498
|
'max_depth': tuple(max_depth),
|
|
411
499
|
'min_node_size': tuple(min_node_size),
|
|
412
|
-
'iter_num': tuple(iter_num)
|
|
500
|
+
'iter_num': tuple(iter_num),
|
|
501
|
+
'seed':42
|
|
413
502
|
}
|
|
414
503
|
# Hyperparameters for Decision Forest model
|
|
415
504
|
df_params = {
|
|
@@ -419,7 +508,8 @@ class _ModelTraining:
|
|
|
419
508
|
'min_impurity': tuple(min_impurity),
|
|
420
509
|
'max_depth': tuple(max_depth),
|
|
421
510
|
'min_node_size': tuple(min_node_size),
|
|
422
|
-
'num_trees': tuple(num_trees)
|
|
511
|
+
'num_trees': tuple(num_trees),
|
|
512
|
+
'seed':42
|
|
423
513
|
}
|
|
424
514
|
|
|
425
515
|
# Updating model type in case of classification
|
|
@@ -445,12 +535,12 @@ class _ModelTraining:
|
|
|
445
535
|
|
|
446
536
|
PARAMETERS:
|
|
447
537
|
num_rows
|
|
448
|
-
Required
|
|
538
|
+
Required Argument.
|
|
449
539
|
Specifies the number of rows in dataset.
|
|
450
540
|
Types: int
|
|
451
541
|
|
|
452
542
|
num_cols:
|
|
453
|
-
Required
|
|
543
|
+
Required Argument.
|
|
454
544
|
Specifies the number of columns in dataset.
|
|
455
545
|
Types: int
|
|
456
546
|
|
|
@@ -482,12 +572,12 @@ class _ModelTraining:
|
|
|
482
572
|
|
|
483
573
|
PARAMETERS:
|
|
484
574
|
num_rows:
|
|
485
|
-
Required
|
|
575
|
+
Required Argument.
|
|
486
576
|
Specifies the number of rows in dataset.
|
|
487
577
|
Types: int
|
|
488
578
|
|
|
489
579
|
num_cols:
|
|
490
|
-
Required
|
|
580
|
+
Required Argument.
|
|
491
581
|
Specifies the number of columns in dataset.
|
|
492
582
|
Types: int
|
|
493
583
|
|
|
@@ -616,6 +706,44 @@ class _ModelTraining:
|
|
|
616
706
|
raise ValueError("No model is selected for training.")
|
|
617
707
|
|
|
618
708
|
return parameters
|
|
709
|
+
|
|
710
|
+
def distribute_max_models(self):
|
|
711
|
+
"""
|
|
712
|
+
DESCRIPTION:
|
|
713
|
+
Internal function to distribute max_models across available model functions.
|
|
714
|
+
|
|
715
|
+
RETURNS:
|
|
716
|
+
dictionary containing max_models distribution and list of models to remove.
|
|
717
|
+
"""
|
|
718
|
+
# Getting total number of models
|
|
719
|
+
model_count=len(self.model_list)
|
|
720
|
+
# Evenly distributing max_models across models
|
|
721
|
+
base_assign = self.max_models // model_count
|
|
722
|
+
# Creating list of max_models for each model
|
|
723
|
+
distribution = [base_assign] * model_count
|
|
724
|
+
|
|
725
|
+
# Calculating remaining models
|
|
726
|
+
remaining_model_count = self.max_models % model_count
|
|
727
|
+
if remaining_model_count:
|
|
728
|
+
# distributing remaining model across models.
|
|
729
|
+
# Starting from first model in list and distributing remaining models by 1 each.
|
|
730
|
+
for i in range(remaining_model_count):
|
|
731
|
+
distribution[i] += 1
|
|
732
|
+
|
|
733
|
+
# Creating dictionary for model distribution
|
|
734
|
+
model_distribution = dict(zip(self.model_list, distribution))
|
|
735
|
+
# Getting list of models with 0 distribution and removing them from model list
|
|
736
|
+
# While for model having distribution greater than 0, updating distribution with
|
|
737
|
+
# 1/3rd of original value as we are training with 3 different feature selection methods.
|
|
738
|
+
models_to_remove = []
|
|
739
|
+
for model in self.model_list:
|
|
740
|
+
initial_count = model_distribution[model]
|
|
741
|
+
if initial_count == 0:
|
|
742
|
+
models_to_remove.append(model)
|
|
743
|
+
else:
|
|
744
|
+
model_distribution[model] = math.ceil(initial_count / 3)
|
|
745
|
+
|
|
746
|
+
return model_distribution, models_to_remove
|
|
619
747
|
|
|
620
748
|
def _parallel_training(self, parameters):
|
|
621
749
|
"""
|
|
@@ -635,12 +763,15 @@ class _ModelTraining:
|
|
|
635
763
|
|
|
636
764
|
# Hyperparameters for each model
|
|
637
765
|
model_params = parameters[:min(len(parameters), 5)]
|
|
638
|
-
self._display_msg(msg="\nPerforming
|
|
766
|
+
self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)
|
|
639
767
|
|
|
640
|
-
# Defining training
|
|
768
|
+
# Defining training data
|
|
641
769
|
data_types = ['lasso', 'rfe', 'pca']
|
|
642
770
|
trainng_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_train']) for data_type in data_types)
|
|
643
|
-
|
|
771
|
+
|
|
772
|
+
if self.task_type == "Classification":
|
|
773
|
+
response_values = trainng_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
|
|
774
|
+
self.output_response = [str(i) for i in response_values]
|
|
644
775
|
|
|
645
776
|
if self.stopping_metric is None:
|
|
646
777
|
self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
|
|
@@ -648,115 +779,31 @@ class _ModelTraining:
|
|
|
648
779
|
|
|
649
780
|
self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
|
|
650
781
|
if self.max_runtime_secs is not None else None
|
|
782
|
+
|
|
783
|
+
if self.max_models is not None:
|
|
784
|
+
# Getting model distribution and models to remove
|
|
785
|
+
self.max_models_distribution, models_to_remove = self.distribute_max_models()
|
|
786
|
+
# Removing model parameters with 0 distribution
|
|
787
|
+
if len(models_to_remove):
|
|
788
|
+
for model in models_to_remove:
|
|
789
|
+
model_params = [param for param in model_params if param['name'] != model]
|
|
790
|
+
# Updating progress bar as we are removing model
|
|
791
|
+
self.progress_bar.update()
|
|
792
|
+
|
|
793
|
+
if self.is_classification_type():
|
|
794
|
+
self.startify_col = self.target_column
|
|
651
795
|
|
|
652
796
|
trained_models = []
|
|
653
797
|
for param in model_params:
|
|
654
|
-
result = self._hyperparameter_tunning(param, trainng_datas
|
|
798
|
+
result = self._hyperparameter_tunning(param, trainng_datas)
|
|
655
799
|
trained_models.append(result)
|
|
656
800
|
|
|
657
801
|
models_df = pd.concat(trained_models, ignore_index=True)
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
trained_models_info = self._model_scoring(testing_datas, models_df)
|
|
661
|
-
trained_models_info = trained_models_info.reset_index(drop=True)
|
|
662
|
-
|
|
663
|
-
return trained_models_info
|
|
664
|
-
|
|
665
|
-
def _model_scoring(self,
|
|
666
|
-
test_data,
|
|
667
|
-
model_info):
|
|
668
|
-
"""
|
|
669
|
-
DESCRIPTION:
|
|
670
|
-
Internal function generates the performance metrics for
|
|
671
|
-
trained ML models using testing dataset.
|
|
672
|
-
|
|
673
|
-
PARAMETERS:
|
|
674
|
-
test_data
|
|
675
|
-
Required Argument.
|
|
676
|
-
Specifies the testing datasets
|
|
677
|
-
Types: tuple of Teradataml DataFrame
|
|
678
|
-
|
|
679
|
-
model_info
|
|
680
|
-
Required Arugment.
|
|
681
|
-
Specifies the trained models information.
|
|
682
|
-
Types: Pandas DataFrame
|
|
683
|
-
|
|
684
|
-
RETURNS:
|
|
685
|
-
Pandas DataFrame containing, trained models with thier performance metrics.
|
|
686
|
-
"""
|
|
687
|
-
self._display_msg(msg="Evaluating models performance ...",
|
|
688
|
-
progress_bar = self.progress_bar,
|
|
689
|
-
show_data=True)
|
|
690
|
-
# Empty list for storing model performance metrics
|
|
691
|
-
model_performance_data = []
|
|
692
|
-
|
|
693
|
-
# Mapping feature selection methods to corresponding test data
|
|
694
|
-
feature_selection_to_test_data = {"lasso": test_data[0],
|
|
695
|
-
"rfe": test_data[1],
|
|
696
|
-
"pca": test_data[2]}
|
|
697
|
-
|
|
698
|
-
# Iterating over models
|
|
699
|
-
for index, model_row in model_info.iterrows():
|
|
700
|
-
# Extracting model name, feature selection method, and model object
|
|
701
|
-
model_name, feature_selection, model_object = model_row['Name'], \
|
|
702
|
-
model_row['Feature selection'], model_row['obj']
|
|
703
|
-
|
|
704
|
-
# Selecting test data based on feature selection method
|
|
705
|
-
test_set = feature_selection_to_test_data[feature_selection]
|
|
706
|
-
|
|
707
|
-
# Model evaluation
|
|
708
|
-
if model_name == 'knn':
|
|
709
|
-
performance_metrics = model_object.evaluate(test_data=test_set)
|
|
710
|
-
else:
|
|
711
|
-
eval_params = self._eval_params_generation(model_name)
|
|
712
|
-
performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
|
|
713
|
-
|
|
714
|
-
# Extracting performance metrics
|
|
715
|
-
if self.is_classification_type():
|
|
716
|
-
# Classification
|
|
717
|
-
# Extract performance metrics from the output data
|
|
718
|
-
performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
|
|
719
|
-
|
|
720
|
-
# Combine all the elements to form a new row
|
|
721
|
-
new_row = [model_name, feature_selection] + performance_metrics_list + [model_object]
|
|
722
|
-
else:
|
|
723
|
-
# Regression
|
|
724
|
-
regression_metrics = next(performance_metrics.result.itertuples())
|
|
725
|
-
sample_size = test_set.select('id').size
|
|
726
|
-
feature_count = len(test_set.columns) - 2
|
|
727
|
-
r2_score = regression_metrics[8]
|
|
728
|
-
adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
|
|
729
|
-
new_row = [model_name, feature_selection, regression_metrics[0], regression_metrics[1], regression_metrics[2],
|
|
730
|
-
regression_metrics[5], regression_metrics[6], r2_score, adjusted_r2_score, model_object]
|
|
731
|
-
|
|
732
|
-
model_performance_data.append(new_row)
|
|
733
|
-
|
|
734
|
-
if self.is_classification_type():
|
|
735
|
-
model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Feature selection',
|
|
736
|
-
'Accuracy','Micro-Precision',
|
|
737
|
-
'Micro-Recall','Micro-F1',
|
|
738
|
-
'Macro-Precision','Macro-Recall',
|
|
739
|
-
'Macro-F1','Weighted-Precision',
|
|
740
|
-
'Weighted-Recall','Weighted-F1',
|
|
741
|
-
'model-obj'])
|
|
742
|
-
else:
|
|
743
|
-
model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name',
|
|
744
|
-
'Feature selection',
|
|
745
|
-
'MAE', 'MSE', 'MSLE',
|
|
746
|
-
'RMSE', 'RMSLE',
|
|
747
|
-
'R2-score',
|
|
748
|
-
'Adjusted R2-score',
|
|
749
|
-
'model-obj'])
|
|
750
|
-
self._display_msg(msg="Evaluation completed.",
|
|
751
|
-
progress_bar = self.progress_bar,
|
|
752
|
-
show_data=True)
|
|
753
|
-
|
|
754
|
-
return model_metrics_df
|
|
755
|
-
|
|
802
|
+
return models_df
|
|
803
|
+
|
|
756
804
|
def _hyperparameter_tunning(self,
|
|
757
805
|
model_param,
|
|
758
|
-
train_data
|
|
759
|
-
test_data):
|
|
806
|
+
train_data):
|
|
760
807
|
"""
|
|
761
808
|
DESCRIPTION:
|
|
762
809
|
Internal function performs hyperparameter tuning on
|
|
@@ -764,18 +811,13 @@ class _ModelTraining:
|
|
|
764
811
|
|
|
765
812
|
PARAMETERS:
|
|
766
813
|
model_param
|
|
767
|
-
Required
|
|
814
|
+
Required Argument.
|
|
768
815
|
Specifies the eval_params argument for GridSearch.
|
|
769
816
|
Types: dict
|
|
770
817
|
|
|
771
818
|
train_data:
|
|
772
|
-
Required Arugment.
|
|
773
|
-
Specifies the training datasets.
|
|
774
|
-
Types: tuple of Teradataml DataFrame
|
|
775
|
-
|
|
776
|
-
test_data
|
|
777
819
|
Required Argument.
|
|
778
|
-
Specifies the
|
|
820
|
+
Specifies the training datasets.
|
|
779
821
|
Types: tuple of Teradataml DataFrame
|
|
780
822
|
|
|
781
823
|
RETURNS:
|
|
@@ -786,21 +828,42 @@ class _ModelTraining:
|
|
|
786
828
|
"xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}
|
|
787
829
|
|
|
788
830
|
# Setting eval_params for hpt.
|
|
789
|
-
eval_params =
|
|
831
|
+
eval_params = _ModelTraining._eval_params_generation(model_param['name'],
|
|
832
|
+
self.target_column,
|
|
833
|
+
self.task_type)
|
|
790
834
|
|
|
791
835
|
# Input columns for model
|
|
792
836
|
model_param['input_columns'] = self.features
|
|
793
837
|
|
|
838
|
+
# Setting persist for model
|
|
839
|
+
model_param['persist'] = self.persist
|
|
840
|
+
|
|
794
841
|
self._display_msg(msg=model_param['name'],
|
|
795
842
|
progress_bar=self.progress_bar,
|
|
796
843
|
show_data=True)
|
|
797
844
|
|
|
798
|
-
#
|
|
845
|
+
# As we are using the entire data for HPT training,
|
|
846
|
+
# we pass the prepared training data as test_data for KNN.
|
|
799
847
|
if model_param['name'] == 'knn':
|
|
800
|
-
model_param['test_data'] =
|
|
848
|
+
model_param['test_data'] = train_data
|
|
801
849
|
|
|
802
|
-
|
|
803
|
-
|
|
850
|
+
if self.task_type == "Classification":
|
|
851
|
+
model_param['output_prob'] = True
|
|
852
|
+
model_param['output_responses'] = self.output_response
|
|
853
|
+
|
|
854
|
+
# Using RandomSearch for hyperparameter tuning when max_models is given.
|
|
855
|
+
# Otherwise, using GridSearch for hyperparameter tuning.
|
|
856
|
+
if self.max_models is not None:
|
|
857
|
+
# Setting max_models for RandomSearch based on model name
|
|
858
|
+
model_param['max_models'] = self.max_models_distribution[model_param['name']]
|
|
859
|
+
# Defining RandomSearch with ML model based on Name, and max_models
|
|
860
|
+
_obj = RandomSearch(func=model_to_func[model_param['name']],
|
|
861
|
+
params=model_param,
|
|
862
|
+
n_iter=model_param['max_models'])
|
|
863
|
+
else:
|
|
864
|
+
# Defining Gridsearch with ML model based on Name
|
|
865
|
+
_obj = GridSearch(func=model_to_func[model_param['name']],
|
|
866
|
+
params=model_param)
|
|
804
867
|
|
|
805
868
|
if self.verbose > 0:
|
|
806
869
|
print(" " *200, end='\r', flush=True)
|
|
@@ -813,46 +876,54 @@ class _ModelTraining:
|
|
|
813
876
|
_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
|
|
814
877
|
early_stop=self.stopping_tolerance, run_parallel=True,
|
|
815
878
|
sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
|
|
816
|
-
verbose=verbose, max_time=self.max_runtime_secs)
|
|
879
|
+
stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
|
|
817
880
|
else:
|
|
818
881
|
_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
|
|
819
882
|
early_stop=self.stopping_tolerance, **eval_params,
|
|
820
883
|
run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
|
|
821
|
-
sample_id_column='id', verbose=verbose, max_time=self.max_runtime_secs)
|
|
884
|
+
sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
|
|
822
885
|
|
|
823
886
|
# Getting all passed models
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
#
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
#
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
887
|
+
model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
|
|
888
|
+
on='MODEL_ID', how='inner')
|
|
889
|
+
# Creating a mapping from data ID to (feature selection method, training table name)
|
|
890
|
+
data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
|
|
891
|
+
"DF_1": ('rfe', train_data[1]._table_name),
|
|
892
|
+
"DF_2": ('pca', train_data[2]._table_name)}
|
|
893
|
+
|
|
894
|
+
# Updating model stats with feature selection method and result table
|
|
895
|
+
for index, row in model_info.iterrows():
|
|
896
|
+
model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
|
|
897
|
+
model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
|
|
898
|
+
model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
|
|
899
|
+
model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
|
|
900
|
+
|
|
901
|
+
# Dropping column 'DATA_ID'
|
|
902
|
+
model_info.drop(['DATA_ID'], axis=1, inplace=True)
|
|
903
|
+
|
|
904
|
+
model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
|
|
905
|
+
|
|
906
|
+
if not self.is_classification_type():
|
|
907
|
+
# Calculating Adjusted-R2 for regression
|
|
908
|
+
# Getting size and feature count for each feature selection method
|
|
909
|
+
methods = ["lasso", "rfe", "pca"]
|
|
910
|
+
size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
|
|
911
|
+
feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
|
|
912
|
+
model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
|
|
913
|
+
1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
|
|
914
|
+
(size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
|
|
915
|
+
|
|
847
916
|
self._display_msg(msg="-"*100,
|
|
848
917
|
progress_bar=self.progress_bar,
|
|
849
918
|
show_data=True)
|
|
850
919
|
self.progress_bar.update()
|
|
851
920
|
|
|
852
921
|
return model_info
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
922
|
+
|
|
923
|
+
@staticmethod
|
|
924
|
+
def _eval_params_generation(ml_name,
|
|
925
|
+
target_column,
|
|
926
|
+
task_type):
|
|
856
927
|
"""
|
|
857
928
|
DESCRIPTION:
|
|
858
929
|
Internal function generates the eval_params for
|
|
@@ -860,28 +931,59 @@ class _ModelTraining:
|
|
|
860
931
|
|
|
861
932
|
PARAMETERS:
|
|
862
933
|
ml_name
|
|
863
|
-
Required
|
|
934
|
+
Required Argument.
|
|
864
935
|
Specifies the ML name for eval_params generation.
|
|
865
936
|
Types: str
|
|
937
|
+
|
|
938
|
+
target_column
|
|
939
|
+
Required Argument.
|
|
940
|
+
Specifies the target column.
|
|
941
|
+
Types: str
|
|
866
942
|
|
|
943
|
+
task_type:
|
|
944
|
+
Required Argument.
|
|
945
|
+
Specifies the task type for AutoML, whether to apply regression
|
|
946
|
+
or classification on the provided dataset.
|
|
947
|
+
Default Value: "Regression"
|
|
948
|
+
Permitted Values: "Regression", "Classification"
|
|
949
|
+
Types: str
|
|
950
|
+
|
|
867
951
|
RETURNS:
|
|
868
952
|
dict containing, eval_params for ML model.
|
|
869
953
|
"""
|
|
870
954
|
# Setting the eval_params
|
|
871
955
|
eval_params = {"id_column": "id",
|
|
872
|
-
"accumulate":
|
|
956
|
+
"accumulate": target_column}
|
|
957
|
+
|
|
958
|
+
model_type = {
|
|
959
|
+
'xgboost': 'model_type',
|
|
960
|
+
'glm': 'model_type',
|
|
961
|
+
'decisionforest': 'tree_type',
|
|
962
|
+
'svm': 'model_type',
|
|
963
|
+
'knn': 'model_type'
|
|
964
|
+
}
|
|
965
|
+
|
|
966
|
+
ml_name = ml_name.replace('_', '').lower()
|
|
873
967
|
|
|
874
968
|
# For Classification
|
|
875
|
-
if
|
|
969
|
+
if task_type.lower() != "regression":
|
|
970
|
+
eval_params[model_type[ml_name]] = 'Classification'
|
|
971
|
+
eval_params['output_prob'] = True
|
|
972
|
+
|
|
876
973
|
if ml_name == 'xgboost':
|
|
877
|
-
eval_params['model_type'] = 'Classification'
|
|
878
974
|
eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
|
|
879
|
-
|
|
880
|
-
|
|
975
|
+
|
|
976
|
+
elif ml_name == 'glm':
|
|
977
|
+
eval_params['family'] = 'BINOMIAL'
|
|
978
|
+
|
|
881
979
|
else:
|
|
882
980
|
# For Regression
|
|
981
|
+
eval_params[model_type[ml_name]] = 'Regression'
|
|
982
|
+
|
|
883
983
|
if ml_name == 'xgboost':
|
|
884
|
-
eval_params['model_type'] = 'Regression'
|
|
885
984
|
eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
|
|
985
|
+
|
|
986
|
+
elif ml_name == 'glm':
|
|
987
|
+
eval_params['family'] = 'GAUSSIAN'
|
|
886
988
|
|
|
887
989
|
return eval_params
|