teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/automl/__init__.py
CHANGED
|
@@ -15,21 +15,29 @@
|
|
|
15
15
|
|
|
16
16
|
# Python libraries
|
|
17
17
|
import json
|
|
18
|
+
import pandas as pd
|
|
18
19
|
import numpy as np
|
|
19
20
|
from sklearn.metrics import confusion_matrix
|
|
20
21
|
import time
|
|
22
|
+
import ast
|
|
23
|
+
import warnings
|
|
24
|
+
import joblib
|
|
25
|
+
from io import BytesIO
|
|
21
26
|
|
|
22
27
|
# Teradata libraries
|
|
23
28
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
24
29
|
from teradataml import ColumnExpression
|
|
25
30
|
from teradataml.dataframe.dataframe import DataFrame
|
|
31
|
+
from teradataml.utils.utils import execute_sql
|
|
26
32
|
from teradataml.utils.validators import _Validators
|
|
27
|
-
from teradataml import ROC
|
|
28
|
-
from teradataml.common.utils import UtilFuncs
|
|
33
|
+
from teradataml import ROC, BLOB
|
|
29
34
|
from teradataml.utils.dtypes import _Dtypes
|
|
30
35
|
from teradataml.common.utils import UtilFuncs
|
|
31
36
|
from teradataml import TeradataMlException
|
|
32
37
|
from teradataml.common.messages import Messages, MessageCodes
|
|
38
|
+
from teradataml.telemetry_utils.queryband import collect_queryband
|
|
39
|
+
from teradataml import TeradataConstants
|
|
40
|
+
from teradataml import XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table
|
|
33
41
|
|
|
34
42
|
# AutoML Internal libraries
|
|
35
43
|
from teradataml.automl.data_preparation import _DataPreparation
|
|
@@ -51,7 +59,9 @@ class AutoML:
|
|
|
51
59
|
max_runtime_secs = None,
|
|
52
60
|
stopping_metric = None,
|
|
53
61
|
stopping_tolerance = None,
|
|
54
|
-
|
|
62
|
+
max_models = None,
|
|
63
|
+
custom_config_file = None,
|
|
64
|
+
**kwargs):
|
|
55
65
|
"""
|
|
56
66
|
DESCRIPTION:
|
|
57
67
|
AutoML (Automated Machine Learning) is an approach that automates the process
|
|
@@ -82,12 +92,12 @@ class AutoML:
|
|
|
82
92
|
AutoML also provides an option to customize the processes within feature
|
|
83
93
|
engineering, data preparation and model training phases. User can customize
|
|
84
94
|
the processes by passing the JSON file path in case of custom run. It also
|
|
85
|
-
supports early stopping of model training based on stopping metrics
|
|
86
|
-
maximum running time.
|
|
95
|
+
supports early stopping of model training based on stopping metrics,
|
|
96
|
+
maximum running time and maximum models to be trained.
|
|
87
97
|
|
|
88
98
|
PARAMETERS:
|
|
89
99
|
task_type:
|
|
90
|
-
Optional
|
|
100
|
+
Optional Argument.
|
|
91
101
|
Specifies the task type for AutoML, whether to apply regression OR classification
|
|
92
102
|
on the provided dataset. If user wants AutoML to decide the task type automatically,
|
|
93
103
|
then it should be set to "Default".
|
|
@@ -122,7 +132,7 @@ class AutoML:
|
|
|
122
132
|
Types: int
|
|
123
133
|
|
|
124
134
|
max_runtime_secs:
|
|
125
|
-
Optional
|
|
135
|
+
Optional Argument.
|
|
126
136
|
Specifies the time limit in seconds for model training.
|
|
127
137
|
Types: int
|
|
128
138
|
|
|
@@ -130,8 +140,10 @@ class AutoML:
|
|
|
130
140
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
131
141
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
132
142
|
Permitted Values:
|
|
133
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
134
|
-
"RMSE", "RMSLE"
|
|
143
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
144
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
145
|
+
"ME", "EV", "MPD", "MGD"
|
|
146
|
+
|
|
135
147
|
* For task_type "Classification": 'MICRO-F1','MACRO-F1',
|
|
136
148
|
'MICRO-RECALL','MACRO-RECALL',
|
|
137
149
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
@@ -143,11 +155,38 @@ class AutoML:
|
|
|
143
155
|
Required, when "stopping_metric" is set, otherwise optional.
|
|
144
156
|
Specifies the stopping tolerance for stopping metrics in model training.
|
|
145
157
|
Types: float
|
|
158
|
+
|
|
159
|
+
max_models:
|
|
160
|
+
Optional Argument.
|
|
161
|
+
Specifies the maximum number of models to be trained.
|
|
162
|
+
Types: int
|
|
146
163
|
|
|
147
164
|
custom_config_file:
|
|
148
165
|
Optional Argument.
|
|
149
166
|
Specifies the path of JSON file in case of custom run.
|
|
150
167
|
Types: str
|
|
168
|
+
|
|
169
|
+
**kwargs:
|
|
170
|
+
Specifies the additional arguments for AutoML. Below
|
|
171
|
+
are the additional arguments:
|
|
172
|
+
volatile:
|
|
173
|
+
Optional Argument.
|
|
174
|
+
Specifies whether to put the interim results of the
|
|
175
|
+
functions in a volatile table or not. When set to
|
|
176
|
+
True, results are stored in a volatile table,
|
|
177
|
+
otherwise not.
|
|
178
|
+
Default Value: False
|
|
179
|
+
Types: bool
|
|
180
|
+
|
|
181
|
+
persist:
|
|
182
|
+
Optional Argument.
|
|
183
|
+
Specifies whether to persist the interim results of the
|
|
184
|
+
functions in a table or not. When set to True,
|
|
185
|
+
results are persisted in a table; otherwise,
|
|
186
|
+
results are garbage collected at the end of the
|
|
187
|
+
session.
|
|
188
|
+
Default Value: False
|
|
189
|
+
Types: bool
|
|
151
190
|
|
|
152
191
|
RETURNS:
|
|
153
192
|
Instance of AutoML.
|
|
@@ -185,24 +224,28 @@ class AutoML:
|
|
|
185
224
|
|
|
186
225
|
# Fit the data.
|
|
187
226
|
>>> automl_obj.fit(admissions_train, "admitted")
|
|
188
|
-
|
|
189
|
-
# Run predict with best performing model.
|
|
190
|
-
>>> prediction = automl_obj.predict()
|
|
191
|
-
>>> prediction
|
|
192
|
-
|
|
193
|
-
# Run predict for new test data with best performing model.
|
|
194
|
-
>>> prediction = automl_obj.predict(admissions_test)
|
|
195
|
-
>>> prediction
|
|
196
227
|
|
|
197
|
-
# Run predict for new test data with second best performing model.
|
|
198
|
-
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
199
|
-
>>> prediction
|
|
200
|
-
|
|
201
228
|
# Display leaderboard.
|
|
202
229
|
>>> automl_obj.leaderboard()
|
|
203
230
|
|
|
204
231
|
# Display best performing model.
|
|
205
232
|
>>> automl_obj.leader()
|
|
233
|
+
|
|
234
|
+
# Run predict on test data using best performing model.
|
|
235
|
+
>>> prediction = automl_obj.predict(admissions_test)
|
|
236
|
+
>>> prediction
|
|
237
|
+
|
|
238
|
+
# Run predict on test data using second best performing model.
|
|
239
|
+
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
240
|
+
>>> prediction
|
|
241
|
+
|
|
242
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
243
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test)
|
|
244
|
+
>>> performance_metrics
|
|
245
|
+
|
|
246
|
+
# Run evaluate to get performance metrics using model rank 3.
|
|
247
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test, rank=3)
|
|
248
|
+
>>> performance_metrics
|
|
206
249
|
|
|
207
250
|
# Example 2 : Run AutoML for regression problem.
|
|
208
251
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -221,24 +264,28 @@ class AutoML:
|
|
|
221
264
|
>>> custom_config_file="custom_housing.json")
|
|
222
265
|
# Fit the data.
|
|
223
266
|
>>> automl_obj.fit(housing_train, "price")
|
|
224
|
-
|
|
225
|
-
# Run predict with best performing model.
|
|
226
|
-
>>> prediction = automl_obj.predict()
|
|
227
|
-
>>> prediction
|
|
228
267
|
|
|
229
|
-
# Run predict for new test data with best performing model.
|
|
230
|
-
>>> prediction = automl_obj.predict(housing_test)
|
|
231
|
-
>>> prediction
|
|
232
|
-
|
|
233
|
-
# Run predict for new test data with second best performing model.
|
|
234
|
-
>>> prediction = automl_obj.predict(housing_test, rank=2)
|
|
235
|
-
>>> prediction
|
|
236
|
-
|
|
237
268
|
# Display leaderboard.
|
|
238
269
|
>>> automl_obj.leaderboard()
|
|
239
270
|
|
|
240
271
|
# Display best performing model.
|
|
241
272
|
>>> automl_obj.leader()
|
|
273
|
+
|
|
274
|
+
# Run predict on test data using best performing model.
|
|
275
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
276
|
+
>>> prediction
|
|
277
|
+
|
|
278
|
+
# Run predict on test data using second best performing model.
|
|
279
|
+
>>> prediction = automl_obj.predict(housing_test, rank=2)
|
|
280
|
+
>>> prediction
|
|
281
|
+
|
|
282
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
283
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
284
|
+
>>> performance_metrics
|
|
285
|
+
|
|
286
|
+
# Run evaluate to get performance metrics using second best performing model.
|
|
287
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test, rank=2)
|
|
288
|
+
>>> performance_metrics
|
|
242
289
|
|
|
243
290
|
# Example 3 : Run AutoML for multiclass classification problem.
|
|
244
291
|
# Scenario : Predict the species of iris flower based on different
|
|
@@ -246,6 +293,11 @@ class AutoML:
|
|
|
246
293
|
# different processes of AutoML Run to get the best
|
|
247
294
|
# performing model out of available models.
|
|
248
295
|
|
|
296
|
+
# Split the data into train and test.
|
|
297
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
298
|
+
>>> iris_train = iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
299
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
300
|
+
|
|
249
301
|
# Generate custom JSON file
|
|
250
302
|
>>> AutoML.generate_custom_config()
|
|
251
303
|
|
|
@@ -253,22 +305,23 @@ class AutoML:
|
|
|
253
305
|
>>> automl_obj = AutoML(verbose=2,
|
|
254
306
|
>>> exclude="xgboost",
|
|
255
307
|
>>> custom_config_file="custom.json")
|
|
308
|
+
|
|
256
309
|
# Fit the data.
|
|
257
|
-
>>> automl_obj.fit(
|
|
258
|
-
|
|
259
|
-
# Run predict with best performing model.
|
|
260
|
-
>>> prediction = automl_obj.predict()
|
|
261
|
-
>>> prediction
|
|
262
|
-
|
|
263
|
-
# Run predict with second best performing model.
|
|
264
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
265
|
-
>>> prediction
|
|
310
|
+
>>> automl_obj.fit(iris_train, iris_train.species)
|
|
266
311
|
|
|
267
312
|
# Display leaderboard.
|
|
268
313
|
>>> automl_obj.leaderboard()
|
|
269
314
|
|
|
270
315
|
# Display best performing model.
|
|
271
316
|
>>> automl_obj.leader()
|
|
317
|
+
|
|
318
|
+
# Run predict on test data using second best performing model.
|
|
319
|
+
>>> prediction = automl_obj.predict(iris_test, rank=2)
|
|
320
|
+
>>> prediction
|
|
321
|
+
|
|
322
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
323
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test)
|
|
324
|
+
>>> performance_metrics
|
|
272
325
|
|
|
273
326
|
# Example 4 : Run AutoML for regression problem with early stopping metric and tolerance.
|
|
274
327
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -285,41 +338,61 @@ class AutoML:
|
|
|
285
338
|
>>> exclude="xgboost",
|
|
286
339
|
>>> stopping_metric="R2",
|
|
287
340
|
>>> stopping_tolerance=0.7,
|
|
341
|
+
>>> max_models=10,
|
|
288
342
|
>>> custom_config_file="custom_housing.json")
|
|
289
343
|
# Fit the data.
|
|
290
344
|
>>> automl_obj.fit(housing_train, "price")
|
|
291
|
-
|
|
292
|
-
# Run predict with best performing model.
|
|
293
|
-
>>> prediction = automl_obj.predict()
|
|
294
|
-
>>> prediction
|
|
295
|
-
|
|
345
|
+
|
|
296
346
|
# Display leaderboard.
|
|
297
347
|
>>> automl_obj.leaderboard()
|
|
348
|
+
|
|
349
|
+
# Run predict on test data using best performing model.
|
|
350
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
351
|
+
>>> prediction
|
|
352
|
+
|
|
353
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
354
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
355
|
+
>>> performance_metrics
|
|
298
356
|
|
|
299
357
|
# Example 5 : Run AutoML for classification problem with maximum runtime.
|
|
300
358
|
# Scenario : Predict the species of iris flower based on different factors.
|
|
301
359
|
# Run AutoML to get the best performing model in specified time.
|
|
302
360
|
|
|
361
|
+
# Split the data into train and test.
|
|
362
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
363
|
+
>>> iris_train = iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
364
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
365
|
+
|
|
303
366
|
# Create instance of AutoML.
|
|
304
367
|
>>> automl_obj = AutoML(verbose=2,
|
|
305
368
|
>>> exclude="xgboost",
|
|
306
|
-
>>> max_runtime_secs=500
|
|
369
|
+
>>> max_runtime_secs=500,
|
|
370
|
+
>>> max_models=3)
|
|
371
|
+
|
|
307
372
|
# Fit the data.
|
|
308
|
-
>>> automl_obj.fit(
|
|
309
|
-
|
|
310
|
-
# Run predict with best performing model.
|
|
311
|
-
>>> prediction = automl_obj.predict()
|
|
312
|
-
>>> prediction
|
|
313
|
-
|
|
314
|
-
# Run predict with second best performing model.
|
|
315
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
316
|
-
>>> prediction
|
|
317
|
-
|
|
373
|
+
>>> automl_obj.fit(iris_train, iris_train.species)
|
|
374
|
+
|
|
318
375
|
# Display leaderboard.
|
|
319
376
|
>>> automl_obj.leaderboard()
|
|
320
377
|
|
|
321
378
|
# Display best performing model.
|
|
322
|
-
>>> automl_obj.leader()
|
|
379
|
+
>>> automl_obj.leader()
|
|
380
|
+
|
|
381
|
+
# Run predict on test data using best performing model.
|
|
382
|
+
>>> prediction = automl_obj.predict(iris_test)
|
|
383
|
+
>>> prediction
|
|
384
|
+
|
|
385
|
+
# Run predict on test data using second best performing model.
|
|
386
|
+
>>> prediction = automl_obj.predict(iris_test, rank=2)
|
|
387
|
+
>>> prediction
|
|
388
|
+
|
|
389
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
390
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test)
|
|
391
|
+
>>> performance_metrics
|
|
392
|
+
|
|
393
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
394
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test, 4)
|
|
395
|
+
>>> performance_metrics
|
|
323
396
|
"""
|
|
324
397
|
# Appending arguments to list for validation
|
|
325
398
|
arg_info_matrix = []
|
|
@@ -330,25 +403,36 @@ class AutoML:
|
|
|
330
403
|
"decision_forest", "xgboost"]])
|
|
331
404
|
arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
|
|
332
405
|
arg_info_matrix.append(["max_runtime_secs", max_runtime_secs, True, (int, float)])
|
|
333
|
-
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2",
|
|
334
|
-
|
|
335
|
-
|
|
406
|
+
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", "MAE", "MSE", "MSLE",
|
|
407
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
408
|
+
"ME", "EV", "MPD", "MGD",
|
|
336
409
|
'MICRO-F1','MACRO-F1',
|
|
337
410
|
'MICRO-RECALL','MACRO-RECALL',
|
|
338
411
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
339
412
|
'WEIGHTED-PRECISION','WEIGHTED-RECALL',
|
|
340
413
|
'WEIGHTED-F1', 'ACCURACY']])
|
|
341
414
|
arg_info_matrix.append(["stopping_tolerance", stopping_tolerance, True, (float, int)])
|
|
415
|
+
arg_info_matrix.append(["max_models", max_models, True, (int)])
|
|
342
416
|
arg_info_matrix.append(["custom_config_file", custom_config_file, True, (str), True])
|
|
343
|
-
|
|
417
|
+
|
|
418
|
+
volatile = kwargs.get('volatile', False)
|
|
419
|
+
persist = kwargs.get('persist', False)
|
|
420
|
+
|
|
421
|
+
arg_info_matrix.append(["volatile", volatile, True, (bool)])
|
|
422
|
+
arg_info_matrix.append(["persist", persist, True, (bool)])
|
|
344
423
|
|
|
345
424
|
# Validate argument types
|
|
346
425
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
347
426
|
# Either include or exclude can be used.
|
|
348
427
|
if include is not None or exclude is not None:
|
|
349
428
|
_Validators._validate_mutually_exclusive_arguments(include, "include", exclude, "exclude")
|
|
429
|
+
# Either volatile or persist can be used.
|
|
430
|
+
if volatile and persist:
|
|
431
|
+
# NOTE: the name strings must match the kwarg spelling ("volatile"); presumably they are used in the validator's error text.
_Validators._validate_mutually_exclusive_arguments(volatile, "volatile", persist, "persist")
|
|
350
432
|
# Validate mutually inclusive arguments
|
|
351
433
|
_Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric", stopping_tolerance, "stopping_tolerance")
|
|
434
|
+
# Validate lower range for max_models
|
|
435
|
+
_Validators._validate_argument_range(max_models, "max_models", lbound=1, lbound_inclusive=True)
|
|
352
436
|
|
|
353
437
|
custom_data = None
|
|
354
438
|
self.auto = True
|
|
@@ -375,10 +459,15 @@ class AutoML:
|
|
|
375
459
|
self.max_runtime_secs = max_runtime_secs
|
|
376
460
|
self.stopping_metric = stopping_metric
|
|
377
461
|
self.stopping_tolerance = stopping_tolerance
|
|
462
|
+
self.max_models = max_models
|
|
378
463
|
self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
|
|
379
464
|
self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
|
|
380
465
|
self._is_fit_called = False
|
|
466
|
+
self._is_load_model_called = False
|
|
467
|
+
self.kwargs = kwargs
|
|
468
|
+
self.table_name_mapping={}
|
|
381
469
|
|
|
470
|
+
@collect_queryband(queryband="AutoML_fit")
|
|
382
471
|
def fit(self,
|
|
383
472
|
data,
|
|
384
473
|
target_column):
|
|
@@ -394,7 +483,7 @@ class AutoML:
|
|
|
394
483
|
Types: teradataml Dataframe
|
|
395
484
|
|
|
396
485
|
target_column:
|
|
397
|
-
Required
|
|
486
|
+
Required Argument.
|
|
398
487
|
Specifies target column of dataset.
|
|
399
488
|
Types: str or ColumnExpression
|
|
400
489
|
|
|
@@ -475,7 +564,9 @@ class AutoML:
|
|
|
475
564
|
_Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
|
|
476
565
|
else:
|
|
477
566
|
if self.stopping_metric is not None:
|
|
478
|
-
permitted_values = ["R2",
|
|
567
|
+
permitted_values = ["R2", "MAE", "MSE", "MSLE",
|
|
568
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
569
|
+
"ME", "EV", "MPD", "MGD"]
|
|
479
570
|
_Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
|
|
480
571
|
|
|
481
572
|
if not self.is_classification_type():
|
|
@@ -484,7 +575,7 @@ class AutoML:
|
|
|
484
575
|
|
|
485
576
|
# Displaying received custom input
|
|
486
577
|
if self.custom_data:
|
|
487
|
-
print("\
|
|
578
|
+
print("\nReceived below input for customization : ")
|
|
488
579
|
print(json.dumps(self.custom_data, indent=4))
|
|
489
580
|
|
|
490
581
|
# Classification problem
|
|
@@ -500,38 +591,39 @@ class AutoML:
|
|
|
500
591
|
clf = task_cls(self.data, self.target_column, self.custom_data)
|
|
501
592
|
|
|
502
593
|
self.model_info, self.leader_board, self.target_count, self.target_label, \
|
|
503
|
-
self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
594
|
+
self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
|
|
595
|
+
model_list = self.model_list,
|
|
596
|
+
auto = self.auto,
|
|
597
|
+
verbose = self.verbose,
|
|
598
|
+
max_runtime_secs = self.max_runtime_secs,
|
|
599
|
+
stopping_metric = self.stopping_metric,
|
|
600
|
+
stopping_tolerance = self.stopping_tolerance,
|
|
601
|
+
max_models = self.max_models,
|
|
602
|
+
**self.kwargs)
|
|
603
|
+
|
|
511
604
|
# Model Evaluation Phase
|
|
512
605
|
self.m_evaluator = _ModelEvaluator(self.model_info,
|
|
513
606
|
self.target_column,
|
|
514
607
|
self.task_type)
|
|
515
608
|
|
|
609
|
+
@collect_queryband(queryband="AutoML_predict")
|
|
516
610
|
def predict(self,
|
|
517
|
-
data
|
|
518
|
-
rank = 1
|
|
611
|
+
data,
|
|
612
|
+
rank = 1,
|
|
613
|
+
use_loaded_models = False):
|
|
519
614
|
"""
|
|
520
615
|
DESCRIPTION:
|
|
521
|
-
Function generates prediction on
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
and performance metrics, otherwise displays only prediction.
|
|
616
|
+
Function generates prediction on data using model rank in
|
|
617
|
+
leaderboard.
|
|
618
|
+
Note:
|
|
619
|
+
* If both fit and load method are called before predict, then fit method model will be used
|
|
620
|
+
for prediction by default unless 'use_loaded_models' is set to True in predict.
|
|
527
621
|
|
|
528
622
|
PARAMETERS:
|
|
529
623
|
data:
|
|
530
|
-
|
|
531
|
-
Specifies the dataset on which prediction
|
|
532
|
-
|
|
533
|
-
When "data" is not specified default test data is used. Default
|
|
534
|
-
test data is the dataset generated at the time of training.
|
|
624
|
+
Required Argument.
|
|
625
|
+
Specifies the dataset on which prediction needs to be generated
|
|
626
|
+
using model rank in leaderboard.
|
|
535
627
|
Types: teradataml DataFrame
|
|
536
628
|
|
|
537
629
|
rank:
|
|
@@ -539,6 +631,12 @@ class AutoML:
|
|
|
539
631
|
Specifies the rank of the model in the leaderboard to be used for prediction.
|
|
540
632
|
Default Value: 1
|
|
541
633
|
Types: int
|
|
634
|
+
|
|
635
|
+
use_loaded_models:
|
|
636
|
+
Optional Argument.
|
|
637
|
+
Specifies whether to use loaded models from database for prediction or not.
|
|
638
|
+
Default Value: False
|
|
639
|
+
Types: bool
|
|
542
640
|
|
|
543
641
|
RETURNS:
|
|
544
642
|
Pandas DataFrame with predictions.
|
|
@@ -552,88 +650,84 @@ class AutoML:
|
|
|
552
650
|
# Perform fit() operation on the "automl_obj".
|
|
553
651
|
# Perform predict() operation on the "automl_obj".
|
|
554
652
|
|
|
555
|
-
# Example 1: Run predict
|
|
556
|
-
>>> prediction = automl_obj.predict()
|
|
557
|
-
>>> prediction
|
|
558
|
-
|
|
559
|
-
# Example 2: Run predict with second best performing model.
|
|
560
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
561
|
-
>>> prediction
|
|
562
|
-
|
|
563
|
-
# Example 3: Run predict for new test data with best performing model.
|
|
653
|
+
# Example 1: Run predict on test data using best performing model.
|
|
564
654
|
>>> prediction = automl_obj.predict(admissions_test)
|
|
565
655
|
>>> prediction
|
|
566
656
|
|
|
567
|
-
# Example
|
|
657
|
+
# Example 2: Run predict on test data using second best performing model.
|
|
568
658
|
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
569
659
|
>>> prediction
|
|
660
|
+
|
|
661
|
+
# Example 3: Run predict on test data using loaded model.
|
|
662
|
+
>>> automl_obj.load("model_table")
|
|
663
|
+
>>> prediction = automl_obj.predict(admissions_test, rank=3)
|
|
664
|
+
>>> prediction
|
|
665
|
+
|
|
666
|
+
# Example 4: Run predict on test data using loaded model when fit is also called.
|
|
667
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
668
|
+
>>> automl_obj.load("model_table")
|
|
669
|
+
>>> prediction = automl_obj.predict(admissions_test, rank=3, use_loaded_models=True)
|
|
670
|
+
>>> prediction
|
|
570
671
|
"""
|
|
571
|
-
if not
|
|
572
|
-
|
|
672
|
+
# Checking if fit or load model is called before predict, If not raise error
|
|
673
|
+
if not self._is_fit_called and not self._is_load_model_called:
|
|
573
674
|
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
574
675
|
"'predict' method", \
|
|
575
|
-
"'fit' method must be called before" \
|
|
676
|
+
"'fit' or 'load' method must be called before" \
|
|
576
677
|
" running predict.")
|
|
577
678
|
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
679
|
+
|
|
578
680
|
# Appending predict arguments to list for validation.
|
|
579
681
|
arg_info_pred_matrix = []
|
|
580
|
-
arg_info_pred_matrix.append(["data", data,
|
|
682
|
+
arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
|
|
581
683
|
arg_info_pred_matrix.append(["rank", rank, True, (int), True])
|
|
684
|
+
arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
|
|
582
685
|
|
|
583
686
|
# Validate argument types
|
|
584
687
|
_Validators._validate_function_arguments(arg_info_pred_matrix)
|
|
688
|
+
|
|
689
|
+
# Run predict using loaded model
|
|
690
|
+
if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
|
|
691
|
+
# Validate range for model rank
|
|
692
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
693
|
+
ubound=self.loaded_models_info.RANK.max(),
|
|
694
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
695
|
+
return self._run_loaded_model(data, rank)
|
|
696
|
+
|
|
697
|
+
# Validate range for model rank
|
|
698
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
699
|
+
ubound=self.leader_board.RANK.max(),
|
|
700
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
585
701
|
|
|
586
|
-
# Setting
|
|
587
|
-
self.
|
|
588
|
-
# Setting target column indicator to default value, i.e., False.
|
|
589
|
-
self.target_column_ind = False
|
|
702
|
+
# Setting target column indicator to default value, i.e., True.
|
|
703
|
+
self.target_column_ind = True
|
|
590
704
|
# Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
|
|
591
705
|
rank = rank-1
|
|
706
|
+
|
|
707
|
+
# Setting indicator to False if target column doesn't exist
|
|
708
|
+
if self.target_column not in data.columns:
|
|
709
|
+
self.target_column_ind = False
|
|
592
710
|
|
|
593
|
-
# Checking if
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
711
|
+
# Checking if data is already transformed before or not
|
|
712
|
+
data_node_id = data._nodeid
|
|
713
|
+
if not self.table_name_mapping.get(data_node_id):
|
|
714
|
+
# At first data transformation will be performed on raw test data
|
|
715
|
+
# then evaluation will happen.
|
|
716
|
+
self.transform_data(data)
|
|
599
717
|
else:
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
data_transformation_params = \
|
|
609
|
-
self.data_transformation_params,
|
|
610
|
-
auto = self.auto,
|
|
611
|
-
verbose = self.verbose,
|
|
612
|
-
target_column_ind = self.target_column_ind,
|
|
613
|
-
table_name_mapping=self.table_name_mapping)
|
|
614
|
-
|
|
615
|
-
self.table_name_mapping = data_transform_instance.data_transformation()
|
|
616
|
-
|
|
617
|
-
# Checking for target column presence in passed test data.
|
|
618
|
-
# If present, then both prediction and evaluation metrics will be generated.
|
|
619
|
-
# If not present, then only prediction will be generated.
|
|
620
|
-
if self.target_column_ind:
|
|
621
|
-
metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
|
|
622
|
-
test_data_ind = \
|
|
623
|
-
self.test_data_ind,
|
|
624
|
-
target_column_ind = \
|
|
625
|
-
self.target_column_ind,
|
|
626
|
-
table_name_mapping=self.table_name_mapping)
|
|
627
|
-
else:
|
|
628
|
-
pred = self.m_evaluator.model_evaluation(rank = rank,
|
|
629
|
-
test_data_ind = \
|
|
630
|
-
self.test_data_ind,
|
|
631
|
-
table_name_mapping=self.table_name_mapping)
|
|
718
|
+
print("\nSkipping data transformation as data is already transformed.")
|
|
719
|
+
|
|
720
|
+
# Generating prediction
|
|
721
|
+
pred = self.m_evaluator.model_evaluation(rank = rank,
|
|
722
|
+
table_name_mapping = self.table_name_mapping,
|
|
723
|
+
data_node_id = data_node_id,
|
|
724
|
+
target_column_ind = self.target_column_ind)
|
|
725
|
+
|
|
632
726
|
# Checking if problem type is classification and target label is present.
|
|
633
727
|
if self.is_classification_type() and self.target_label is not None:
|
|
634
728
|
# Displaying target column labels
|
|
635
729
|
tar_dct = {}
|
|
636
|
-
print('
|
|
730
|
+
print('\nTarget Column Mapping:')
|
|
637
731
|
# Iterating rows
|
|
638
732
|
for row in self.target_label.result.itertuples():
|
|
639
733
|
# Retrieving the category names of encoded target column
|
|
@@ -644,76 +738,1011 @@ class AutoML:
|
|
|
644
738
|
|
|
645
739
|
for key, value in tar_dct.items():
|
|
646
740
|
print(f"{key}: {value}")
|
|
647
|
-
|
|
648
|
-
|
|
741
|
+
|
|
742
|
+
# Renaming probability column if any
|
|
743
|
+
prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
|
|
744
|
+
if len(prob_lst) > 0:
|
|
745
|
+
rename_dict ={}
|
|
746
|
+
for col in pred.result.columns:
|
|
747
|
+
if col not in prob_lst:
|
|
748
|
+
rename_dict[col] = getattr(pred.result, col)
|
|
749
|
+
else:
|
|
750
|
+
indx = int(col.split('_')[1])
|
|
751
|
+
rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
|
|
752
|
+
rename_dict['drop_columns'] = True
|
|
753
|
+
pred.result = pred.result.assign(**rename_dict)
|
|
754
|
+
|
|
755
|
+
print("\nPrediction : ")
|
|
649
756
|
print(pred.result)
|
|
650
757
|
|
|
651
|
-
|
|
652
|
-
# Or if target column is present in test data.
|
|
653
|
-
if not self.test_data_ind or self.target_column_ind:
|
|
654
|
-
print("\n Performance Metrics : ")
|
|
655
|
-
print(metrics.result)
|
|
656
|
-
|
|
758
|
+
if self.target_column_ind:
|
|
657
759
|
prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
|
|
658
|
-
|
|
760
|
+
probability_column = 'prob_1'
|
|
659
761
|
# Displaying confusion matrix and ROC-AUC for classification problem
|
|
660
762
|
if self.is_classification_type():
|
|
661
763
|
print_data = lambda data: print(data) if _is_terminal() else display(data)
|
|
662
764
|
# Displaying ROC-AUC for binary classification
|
|
663
765
|
if self.target_count == 2:
|
|
664
766
|
fit_params = {
|
|
665
|
-
"probability_column" :
|
|
767
|
+
"probability_column" : probability_column,
|
|
666
768
|
"observation_column" : self.target_column,
|
|
667
769
|
"positive_class" : "1",
|
|
668
770
|
"data" : pred.result
|
|
669
771
|
}
|
|
670
772
|
# Fitting ROC
|
|
671
773
|
roc_out = ROC(**fit_params)
|
|
672
|
-
print("\
|
|
774
|
+
print("\nROC-AUC : ")
|
|
673
775
|
print_data(roc_out.result)
|
|
674
776
|
print_data(roc_out.output_data)
|
|
675
777
|
|
|
676
778
|
# Displaying confusion matrix for binary and multiclass classification
|
|
677
779
|
prediction_df=pred.result.to_pandas()
|
|
678
780
|
target_col = self.target_column
|
|
679
|
-
print("\
|
|
781
|
+
print("\nConfusion Matrix : ")
|
|
680
782
|
print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
|
|
681
783
|
|
|
682
784
|
# Returning prediction
|
|
683
|
-
return pred.result
|
|
785
|
+
return pred.result
|
|
786
|
+
|
|
787
|
+
@collect_queryband(queryband="AutoML_evaluate")
|
|
788
|
+
def evaluate(self,
|
|
789
|
+
data,
|
|
790
|
+
rank = 1,
|
|
791
|
+
use_loaded_models = False
|
|
792
|
+
):
|
|
793
|
+
"""
|
|
794
|
+
DESCRIPTION:
|
|
795
|
+
Function evaluates on data using model rank in leaderboard
|
|
796
|
+
and generates performance metrics.
|
|
797
|
+
Note:
|
|
798
|
+
* If both fit and load method are called before evaluate, then fit method model will be used
|
|
799
|
+
for evaluation by default unless 'use_loaded_models' is set to True in evaluate.
|
|
800
|
+
|
|
801
|
+
PARAMETERS:
|
|
802
|
+
data:
|
|
803
|
+
Required Argument.
|
|
804
|
+
Specifies the dataset on which performance metrics need to be generated.
|
|
805
|
+
Types: teradataml DataFrame
|
|
806
|
+
|
|
807
|
+
Note:
|
|
808
|
+
* Target column used for generating model is mandatory in "data" for evaluation.
|
|
809
|
+
|
|
810
|
+
rank:
|
|
811
|
+
Optional Argument.
|
|
812
|
+
Specifies the rank of the model available in the leaderboard to be used for evaluation.
|
|
813
|
+
Default Value: 1
|
|
814
|
+
Types: int
|
|
815
|
+
|
|
816
|
+
use_loaded_models:
|
|
817
|
+
Optional Argument.
|
|
818
|
+
Specifies whether to use loaded models from database for evaluation or not.
|
|
819
|
+
Default Value: False
|
|
820
|
+
Types: bool
|
|
821
|
+
|
|
822
|
+
RETURNS:
|
|
823
|
+
Pandas DataFrame with performance metrics.
|
|
824
|
+
|
|
825
|
+
RAISES:
|
|
826
|
+
TeradataMlException.
|
|
827
|
+
|
|
828
|
+
EXAMPLES:
|
|
829
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
830
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
831
|
+
# Perform fit() operation on the "automl_obj".
|
|
832
|
+
# Perform evaluate() operation on the "automl_obj".
|
|
833
|
+
|
|
834
|
+
# Example 1: Run evaluate on test data using best performing model.
|
|
835
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test)
|
|
836
|
+
>>> performance_metrics
|
|
837
|
+
|
|
838
|
+
# Example 2: Run evaluate on test data using second best performing model.
|
|
839
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test, rank=2)
|
|
840
|
+
>>> performance_metrics
|
|
841
|
+
|
|
842
|
+
# Example 3: Run evaluate on test data using loaded model.
|
|
843
|
+
>>> automl_obj.load("model_table")
|
|
844
|
+
>>> evaluation = automl_obj.evaluate(admissions_test, rank=3)
|
|
845
|
+
>>> evaluation
|
|
846
|
+
|
|
847
|
+
# Example 4: Run evaluate on test data using loaded model when fit is also called.
|
|
848
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
849
|
+
>>> automl_obj.load("model_table")
|
|
850
|
+
>>> evaluation = automl_obj.evaluate(admissions_test, rank=3, use_loaded_models=True)
|
|
851
|
+
>>> evaluation
|
|
852
|
+
"""
|
|
853
|
+
if not self._is_fit_called and not self._is_load_model_called:
|
|
854
|
+
# raise ValueError("fit() method must be called before evaluating.")
|
|
855
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
856
|
+
"'evaluate' method", \
|
|
857
|
+
"'fit' or 'load' method must be called before" \
|
|
858
|
+
" running evaluate.")
|
|
859
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
860
|
+
# Appending evaluate arguments to list for validation.
|
|
861
|
+
arg_info_pred_matrix = []
|
|
862
|
+
arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
|
|
863
|
+
arg_info_pred_matrix.append(["rank", rank, True, (int), True])
|
|
864
|
+
arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
|
|
865
|
+
|
|
866
|
+
# Validate argument types
|
|
867
|
+
_Validators._validate_function_arguments(arg_info_pred_matrix)
|
|
868
|
+
|
|
869
|
+
# Run evaluate using loaded model
|
|
870
|
+
if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
|
|
871
|
+
# Validate range for model rank
|
|
872
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
873
|
+
ubound=self.loaded_models_info.RANK.max(),
|
|
874
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
875
|
+
return self._run_loaded_model(data, rank, output_type="evaluate")
|
|
876
|
+
|
|
877
|
+
# Validate range for model rank
|
|
878
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
879
|
+
ubound=self.leader_board.RANK.max(),
|
|
880
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
881
|
+
|
|
882
|
+
# Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
|
|
883
|
+
rank = rank-1
|
|
884
|
+
|
|
885
|
+
# Raising exception if target column is not present in data
|
|
886
|
+
# as it is required for evaluation.
|
|
887
|
+
if self.target_column not in data.columns:
|
|
888
|
+
raise TeradataMlException(
|
|
889
|
+
Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
|
|
890
|
+
MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
|
|
891
|
+
|
|
892
|
+
# Checking if data is already transformed before or not
|
|
893
|
+
data_node_id = data._nodeid
|
|
894
|
+
if not self.table_name_mapping.get(data_node_id):
|
|
895
|
+
# At first data transformation will be performed on raw test data
|
|
896
|
+
# then evaluation will happen.
|
|
897
|
+
self.transform_data(data)
|
|
898
|
+
else:
|
|
899
|
+
print("\nSkipping data transformation as data is already transformed.")
|
|
900
|
+
|
|
901
|
+
metrics = self.m_evaluator.model_evaluation(rank = rank,
|
|
902
|
+
table_name_mapping=self.table_name_mapping,
|
|
903
|
+
data_node_id = data_node_id,
|
|
904
|
+
get_metrics = True)
|
|
905
|
+
|
|
906
|
+
# Checking if problem type is classification and target label is present.
|
|
907
|
+
if self.is_classification_type() and self.target_label is not None:
|
|
908
|
+
# Displaying target column labels
|
|
909
|
+
tar_dct = {}
|
|
910
|
+
print('\nTarget Column Mapping:')
|
|
911
|
+
# Iterating rows
|
|
912
|
+
for row in self.target_label.result.itertuples():
|
|
913
|
+
# Retrieving the category names of encoded target column
|
|
914
|
+
# row[1] contains the original name of category
|
|
915
|
+
# row[2] contains the encoded value
|
|
916
|
+
if row[1] != 'TD_CATEGORY_COUNT':
|
|
917
|
+
tar_dct[row[1]] = row[2]
|
|
918
|
+
|
|
919
|
+
for key, value in tar_dct.items():
|
|
920
|
+
print(f"{key}: {value}")
|
|
921
|
+
|
|
922
|
+
# Showing performance metrics
|
|
923
|
+
print("\nPerformance Metrics : ")
|
|
924
|
+
print(metrics.result)
|
|
925
|
+
if self.is_classification_type():
|
|
926
|
+
print("-"*80)
|
|
927
|
+
print(metrics.output_data)
|
|
928
|
+
|
|
929
|
+
# Returning performance metrics
|
|
930
|
+
return metrics.result
|
|
684
931
|
|
|
932
|
+
def transform_data(self,
|
|
933
|
+
data,
|
|
934
|
+
data_params = None,
|
|
935
|
+
auto = None,
|
|
936
|
+
verbose = None,
|
|
937
|
+
target_column_ind = None):
|
|
938
|
+
"""
|
|
939
|
+
DESCRIPTION:
|
|
940
|
+
Function transforms the data based on the data transformation parameters
|
|
941
|
+
generated during the fit phase.
|
|
942
|
+
|
|
943
|
+
PARAMETERS:
|
|
944
|
+
data:
|
|
945
|
+
Required Argument.
|
|
946
|
+
Specifies the dataset to be transformed.
|
|
947
|
+
Types: teradataml DataFrame
|
|
948
|
+
|
|
949
|
+
data_params:
|
|
950
|
+
Optional Argument.
|
|
951
|
+
Specifies the data transformation parameters.
|
|
952
|
+
Default Value: None
|
|
953
|
+
Types: dict
|
|
954
|
+
|
|
955
|
+
auto:
|
|
956
|
+
Optional Argument.
|
|
957
|
+
Specifies whether AutoML ran in auto or custom mode.
|
|
958
|
+
Default Value: None
|
|
959
|
+
Types: bool
|
|
960
|
+
|
|
961
|
+
verbose:
|
|
962
|
+
Optional Argument.
|
|
963
|
+
Specifies the verbosity level.
|
|
964
|
+
Default Value: None
|
|
965
|
+
Types: int
|
|
966
|
+
|
|
967
|
+
target_column_ind:
|
|
968
|
+
Optional Argument.
|
|
969
|
+
Specifies whether target column is present in data or not.
|
|
970
|
+
Default Value: None
|
|
971
|
+
Types: bool
|
|
972
|
+
|
|
973
|
+
RETURNS:
|
|
974
|
+
None
|
|
975
|
+
"""
|
|
976
|
+
# Creating instance of DataTransformation
|
|
977
|
+
data_transform_instance = _DataTransformation(data = data,
|
|
978
|
+
data_transformation_params=data_params if data_params is not None else \
|
|
979
|
+
self.data_transformation_params,
|
|
980
|
+
auto=auto if data_params is not None else self.auto,
|
|
981
|
+
verbose=verbose if verbose is not None else self.verbose,
|
|
982
|
+
target_column_ind=target_column_ind if target_column_ind is not None else \
|
|
983
|
+
self.target_column_ind,
|
|
984
|
+
table_name_mapping=self.table_name_mapping)
|
|
985
|
+
|
|
986
|
+
# Storing mapping of table names for transformed data
|
|
987
|
+
self.table_name_mapping = data_transform_instance.data_transformation()
|
|
988
|
+
|
|
989
|
+
@collect_queryband(queryband="AutoML_leaderboard")
|
|
685
990
|
def leaderboard(self):
|
|
686
991
|
"""
|
|
687
992
|
DESCRIPTION:
|
|
688
|
-
Function displays leaderboard.
|
|
993
|
+
Function displays leaderboard.
|
|
994
|
+
|
|
995
|
+
RETURNS:
|
|
996
|
+
Pandas DataFrame with Leaderboard information.
|
|
997
|
+
|
|
998
|
+
RAISES:
|
|
999
|
+
TeradataMlException.
|
|
1000
|
+
|
|
1001
|
+
EXAMPLES:
|
|
1002
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1003
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1004
|
+
# Perform fit() operation on the "automl_obj".
|
|
1005
|
+
# Generate leaderboard using leaderboard() method on "automl_obj".
|
|
1006
|
+
>>> automl_obj.leaderboard()
|
|
1007
|
+
"""
|
|
1008
|
+
if not self._is_fit_called:
|
|
1009
|
+
# raise ValueError("fit() method must be called before generating leaderboard.")
|
|
1010
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1011
|
+
"'leaderboard' method", \
|
|
1012
|
+
"'fit' method must be called before" \
|
|
1013
|
+
" generating leaderboard.")
|
|
1014
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1015
|
+
return self.leader_board
|
|
1016
|
+
|
|
1017
|
+
@collect_queryband(queryband="AutoML_leader")
|
|
1018
|
+
def leader(self):
|
|
1019
|
+
"""
|
|
1020
|
+
DESCRIPTION:
|
|
1021
|
+
Function displays best performing model.
|
|
1022
|
+
|
|
1023
|
+
RETURNS:
|
|
1024
|
+
None
|
|
1025
|
+
|
|
1026
|
+
RAISES:
|
|
1027
|
+
TeradataMlException.
|
|
1028
|
+
|
|
1029
|
+
EXAMPLES:
|
|
1030
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1031
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1032
|
+
# Perform fit() operation on the "automl_obj".
|
|
1033
|
+
# Generate leaderboard using leaderboard() method on "automl_obj".
|
|
1034
|
+
# Display best performing model using leader() method on "automl_obj".
|
|
1035
|
+
>>> automl_obj.leader()
|
|
1036
|
+
"""
|
|
1037
|
+
if not self._is_fit_called:
|
|
1038
|
+
# raise ValueError("fit() method must be called before generating leader.")
|
|
1039
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1040
|
+
"'leader' method", \
|
|
1041
|
+
"'fit' method must be called before" \
|
|
1042
|
+
" generating leader.")
|
|
1043
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1044
|
+
record = self.leader_board
|
|
1045
|
+
if not _is_terminal():
|
|
1046
|
+
display(record[record['RANK'] == 1])
|
|
1047
|
+
else:
|
|
1048
|
+
print(record[record['RANK'] == 1])
|
|
1049
|
+
|
|
1050
|
+
@collect_queryband(queryband="AutoML_hyperparameter")
|
|
1051
|
+
def model_hyperparameters(self,
|
|
1052
|
+
rank=1,
|
|
1053
|
+
use_loaded_models=False):
|
|
1054
|
+
"""
|
|
1055
|
+
DESCRIPTION:
|
|
1056
|
+
Get hyperparameters of the model based on rank in leaderboard.
|
|
1057
|
+
Note:
|
|
1058
|
+
* If both the fit() and load() methods are invoked before calling model_hyperparameters(),
|
|
1059
|
+
by default hyperparameters are retrieved from the fit leaderboard.
|
|
1060
|
+
To retrieve hyperparameters from the loaded models, set "use_loaded_models" to True in the model_hyperparameters call.
|
|
1061
|
+
|
|
1062
|
+
PARAMETERS:
|
|
1063
|
+
rank:
|
|
1064
|
+
Optional Argument.
|
|
1065
|
+
Specifies the rank of the model in the leaderboard.
|
|
1066
|
+
Default Value: 1
|
|
1067
|
+
Types: int
|
|
1068
|
+
|
|
1069
|
+
use_loaded_models:
|
|
1070
|
+
Optional Argument.
|
|
1071
|
+
Specifies whether to use loaded models from database to get hyperparameters or not.
|
|
1072
|
+
Default Value: False
|
|
1073
|
+
Types: bool
|
|
1074
|
+
|
|
1075
|
+
RETURNS:
|
|
1076
|
+
Dictionary, containing hyperparameters.
|
|
1077
|
+
|
|
1078
|
+
RAISES:
|
|
1079
|
+
TeradataMlException.
|
|
1080
|
+
|
|
1081
|
+
EXAMPLES:
|
|
1082
|
+
# Example 1: Get hyperparameters of the model using fit models.
|
|
1083
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1084
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1085
|
+
# Perform fit() operation on the "automl_obj".
|
|
1086
|
+
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1087
|
+
>>> automl_obj = AutoML(task_type="Classification")
|
|
1088
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
1089
|
+
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1090
|
+
|
|
1091
|
+
# Example 2: Get hyperparameters of the model using loaded models.
|
|
1092
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1093
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1094
|
+
# Load models from the specified table.
|
|
1095
|
+
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1096
|
+
>>> automl_obj = AutoML()
|
|
1097
|
+
>>> automl_obj.load("model_table")
|
|
1098
|
+
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1099
|
+
|
|
1100
|
+
# Example 3: Get hyperparameters of the model when both fit and load method are called.
|
|
1101
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1102
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1103
|
+
# Fit the data.
|
|
1104
|
+
# Load models from the specified table.
|
|
1105
|
+
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1106
|
+
>>> automl_obj = AutoML(task_type="Classification")
|
|
1107
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
1108
|
+
>>> automl_obj.load("model_table")
|
|
1109
|
+
|
|
1110
|
+
# Get hyperparameters of the model using loaded models.
|
|
1111
|
+
>>> automl_obj.model_hyperparameters(rank=1, use_loaded_models=True)
|
|
1112
|
+
# Get hyperparameters of the model using fit models.
|
|
1113
|
+
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1114
|
+
"""
|
|
1115
|
+
|
|
1116
|
+
if not self._is_fit_called and not self._is_load_model_called:
|
|
1117
|
+
# raise ValueError("fit() or load() method must be called before getting hyperparameters.")
|
|
1118
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1119
|
+
"'model_hyperparameters' method",
|
|
1120
|
+
"No models available to get hyperparameters. " \
|
|
1121
|
+
"Run 'fit()' or 'load()' methods to get models.")
|
|
1122
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1123
|
+
|
|
1124
|
+
arg_info_matrix = []
|
|
1125
|
+
arg_info_matrix.append(["rank", rank, True, (int), True])
|
|
1126
|
+
arg_info_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
|
|
1127
|
+
|
|
1128
|
+
# Validate argument types
|
|
1129
|
+
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1130
|
+
|
|
1131
|
+
leaderboard = None
|
|
1132
|
+
if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
|
|
1133
|
+
leaderboard = self.loaded_models_info
|
|
1134
|
+
else:
|
|
1135
|
+
leaderboard = self.model_info
|
|
1136
|
+
|
|
1137
|
+
# Validate range for model rank from loaded models
|
|
1138
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
1139
|
+
ubound=leaderboard.RANK.max(),
|
|
1140
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
1141
|
+
hyperparams = leaderboard.loc[leaderboard['RANK'] == rank, 'PARAMETERS'].values[0]
|
|
1142
|
+
|
|
1143
|
+
# Deserializing hyperparameters
|
|
1144
|
+
hyperparams = ast.literal_eval(hyperparams)
|
|
1145
|
+
|
|
1146
|
+
# Removing 'data' from hyperparameters
|
|
1147
|
+
keys_to_remove = ['input_columns', 'data', 'train_data', 'test_data']
|
|
1148
|
+
for key in keys_to_remove:
|
|
1149
|
+
hyperparams.pop(key, None)
|
|
1150
|
+
|
|
1151
|
+
return hyperparams
|
|
1152
|
+
|
|
1153
|
+
@collect_queryband(queryband="AutoML_load")
|
|
1154
|
+
def load(self,
|
|
1155
|
+
table_name):
|
|
1156
|
+
"""
|
|
1157
|
+
DESCRIPTION:
|
|
1158
|
+
Function loads models information from the specified table.
|
|
1159
|
+
|
|
1160
|
+
PARAMETERS:
|
|
1161
|
+
table_name:
|
|
1162
|
+
Required Argument.
|
|
1163
|
+
Specifies the table name from which models are to be loaded.
|
|
1164
|
+
Types: str
|
|
1165
|
+
|
|
1166
|
+
RETURNS:
|
|
1167
|
+
Pandas DataFrame with loaded models information.
|
|
1168
|
+
|
|
1169
|
+
RAISES:
|
|
1170
|
+
TeradataMlException.
|
|
1171
|
+
|
|
1172
|
+
EXAMPLES:
|
|
1173
|
+
# Create an instance of the AutoML called "obj"
|
|
1174
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1175
|
+
>>> obj = AutoML()
|
|
1176
|
+
# Load models from the specified table.
|
|
1177
|
+
>>> tab = obj.load("model_table")
|
|
1178
|
+
"""
|
|
1179
|
+
# Appending arguments to list for validation
|
|
1180
|
+
arg_info_matrix = []
|
|
1181
|
+
arg_info_matrix.append(["table_name", table_name, True, (str), True])
|
|
1182
|
+
|
|
1183
|
+
# Validate argument types
|
|
1184
|
+
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1185
|
+
|
|
1186
|
+
# Loading models
|
|
1187
|
+
self.loaded_models_info = DataFrame(table_name).to_pandas()
|
|
1188
|
+
|
|
1189
|
+
self._load_data_transform_params()
|
|
1190
|
+
|
|
1191
|
+
self._is_load_model_called = True
|
|
1192
|
+
|
|
1193
|
+
return self.loaded_models_info.drop(['RESULT_TABLE', 'PARAMETERS'], axis=1)
|
|
1194
|
+
|
|
1195
|
+
def _load_data_transform_params(self):
    """
    DESCRIPTION:
        Internal Function loads data transformation parameters from the
        table content held in "self.loaded_models_info".
        The row with RANK equal to -1 carries the serialized parameters;
        it is removed from "self.loaded_models_info" together with the
        'DATA_PARAMS' column. The remaining model rows are ordered by RANK
        and re-labelled 0..n-1, because "_run_loaded_model" addresses a
        model with the row label "rank - 1".
        The restored parameters are stored in
        "self.loaded_data_transformation_params".

    PARAMETERS:
        None

    RETURNS:
        None

    RAISES:
        IndexError, if the table has no row with RANK equal to -1.
    """
    from sklearn.decomposition import PCA

    models_info = self.loaded_models_info

    # Getting data transformation row
    data_transform_row = models_info[models_info['RANK'] == -1].iloc[0]

    # Removing data transformation row and dropping 'DATA_PARAMS' column
    # from loaded models info. A new frame is built instead of dropping
    # in place on the filtered slice, which avoids pandas'
    # SettingWithCopyWarning.
    models_info = models_info[models_info['RANK'] != -1].drop('DATA_PARAMS', axis=1)

    # Row order coming back from the database is not under our control;
    # make label i correspond to the (i + 1)-th ranked model.
    models_info = models_info.sort_values('RANK', kind='mergesort')
    self.loaded_models_info = models_info.reset_index(drop=True)

    # Loading data transformation parameters by deserializing.
    # NOTE(review): joblib.load is pickle based and can execute arbitrary
    # code; only load tables written by a trusted 'deploy' call.
    buffer = BytesIO(data_transform_row['DATA_PARAMS'])
    data_params = joblib.load(buffer)

    # Names of the entries in data_params that hold table names.
    fit_obj_lst = json.loads(data_transform_row['PARAMETERS'])

    # Generating Dataframe from table_names in data params
    for fit_obj_name in fit_obj_lst:
        fit_obj = data_params[fit_obj_name]
        if isinstance(fit_obj, dict):
            # Key: automl transformation step name, val: table name
            for key, val in fit_obj.items():
                fit_obj[key] = DataFrame(f'{val}')
        else:
            data_params[fit_obj_name] = DataFrame(f'{fit_obj}')

    # Manually deserializing and reconstructing PCA object from the
    # plain attributes saved at deployment time.
    load_pca_info = data_params['pca_fit_instance']
    pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
    pca.components_ = np.array(load_pca_info['components'])
    pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
    pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
    pca.mean_ = np.array(load_pca_info['mean'])
    pca.n_components_ = load_pca_info['n_components']
    pca.noise_variance_ = load_pca_info['noise_variance']
    pca.singular_values_ = np.array(load_pca_info['singular_values'])

    data_params['pca_fit_instance'] = pca

    self.loaded_data_transformation_params = data_params
|
|
1241
|
+
|
|
1242
|
+
def _validate_ranks(self, ranks):
|
|
1243
|
+
"""
|
|
1244
|
+
DESCRIPTION:
|
|
1245
|
+
Function validates the ranks argument.
|
|
1246
|
+
|
|
1247
|
+
PARAMETERS:
|
|
1248
|
+
ranks:
|
|
1249
|
+
Required Argument.
|
|
1250
|
+
Specifies the ranks for the models to be saved.
|
|
1251
|
+
Types: int or list of int
|
|
1252
|
+
|
|
1253
|
+
RAISES:
|
|
1254
|
+
TeradataMlException.
|
|
1255
|
+
"""
|
|
1256
|
+
start_rank, end_rank = ranks.start, ranks.stop
|
|
1257
|
+
|
|
1258
|
+
# Check if both parts are non-negative integers
|
|
1259
|
+
if not (start_rank > 0 and end_rank > 0):
|
|
1260
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1261
|
+
"'deploy' method", \
|
|
1262
|
+
"Provided start and end rank in 'ranks' "\
|
|
1263
|
+
"must be positive non-zero integers.")
|
|
1264
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1265
|
+
|
|
1266
|
+
# Check if start_rank is less than or equal to end_rank
|
|
1267
|
+
if start_rank > end_rank:
|
|
1268
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1269
|
+
"'deploy' method", \
|
|
1270
|
+
"Provided start rank in 'ranks' must be less than"\
|
|
1271
|
+
" or equal to end rank in 'ranks'.")
|
|
1272
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1273
|
+
|
|
1274
|
+
# check end rank is less than or equal to total models
|
|
1275
|
+
if end_rank > self.leader_board.RANK.max():
|
|
1276
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1277
|
+
"'deploy' method", \
|
|
1278
|
+
"Provided end rank in 'ranks' must be less than"\
|
|
1279
|
+
" or equal to total models available.")
|
|
1280
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1281
|
+
|
|
1282
|
+
return start_rank, end_rank
|
|
1283
|
+
|
|
1284
|
+
@collect_queryband(queryband="AutoML_deploy")
def deploy(self,
           table_name,
           top_n = 3,
           ranks = None
           ):
    """
    DESCRIPTION:
        Function saves models to the specified table name.
        Note:
            * If 'ranks' is provided, specified models in 'ranks' will be saved
              and ranks will be reassigned to specified models based
              on the order of the leaderboard, non-specified models will be ignored.

    PARAMETERS:
        table_name:
            Required Argument.
            Specifies the table name to which models information is to be saved.
            Types: str

        top_n:
            Optional Argument.
            Specifies the top n models to be saved.
            Note:
                * If 'ranks' is not provided, the function saves the top 'top_n' models.
                * If 'top_n' is outside the available models, a warning is
                  issued and all available models are saved.
            Default Value: 3
            Types: int

        ranks:
            Optional Argument.
            Specifies the ranks for the models to be saved.
            Note:
                * If 'ranks' is provided, then 'top_n' is ignored.
                * An empty list or empty range is treated as not provided.
            Types: int or list of int or range object

    RETURNS:
        None

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        >>> obj = AutoML(task_type="Classification")
        >>> obj.fit(data = data, target_column = target_column)

        # Save top 3 models to the specified table.
        >>> obj.deploy("model_table")

        # Save top n models to the specified table.
        >>> obj.deploy("model_table", top_n=5)

        # Save models based on specified ranks to the specified table.
        >>> obj.deploy("model_table", ranks=[1, 3, 5])

        # Save models based on specified rank range to the specified table.
        >>> obj.deploy("model_table", ranks=range(2,6))
    """
    # raise Error if fit is not called
    if not self._is_fit_called:
        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'deploy' method",
                                   "'fit' method must be called before"
                                   " 'deploy'.")
        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

    # Appending arguments to list for validation
    arg_info_matrix = []
    arg_info_matrix.append(["table_name", table_name, True, (str), True])
    arg_info_matrix.append(["top_n", top_n, True, (int)])
    if not isinstance(ranks, range):
        arg_info_matrix.append(["ranks", ranks, True, (int, list)])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    if isinstance(ranks, int):
        ranks = [ranks]
    elif isinstance(ranks, range):
        start_rank, end_rank = self._validate_ranks(ranks)

    # 'ranks' takes priority over 'top_n' whenever it selects something.
    use_top_n = ranks is None or len(ranks) == 0

    if use_top_n:
        # If total models are greater than available models or less than 1
        try:
            _Validators._validate_argument_range(top_n, "top_n", lbound=1,
                                                 ubound=self.leader_board.RANK.max(),
                                                 lbound_inclusive=True, ubound_inclusive=True)
        except ValueError:
            msg = "\n'top_n' should be equal or less than the available models or greater than 0. " \
                  "Deploying all available models to the table."
            warnings.warn(message=msg, stacklevel=2)
            top_n = self.leader_board.shape[0]
    elif isinstance(ranks, list):
        # If ranks is provided, then validating the ranks elements
        for ele in ranks:
            _Validators._validate_argument_range(ele, "element in ranks", lbound=1,
                                                 ubound=self.leader_board.RANK.max(),
                                                 lbound_inclusive=True, ubound_inclusive=True)

    # Selecting the rows to be saved. A copy is always taken so that
    # "self.model_info" keeps pointing at the tables created by 'fit'.
    if use_top_n:
        # Saving only top 'top_n' models
        sv_models = self.model_info.head(top_n).copy()
    elif isinstance(ranks, range):
        # Saving models based on start and end rank.
        # NOTE(review): the slice keeps positions start_rank-1 .. end_rank-1,
        # i.e. the stop of the range is treated as inclusive and the step
        # is ignored - confirm this is intended.
        sv_models = self.model_info[start_rank-1:end_rank].copy()
    else:
        # Saving models based on specified rank in list
        sv_models = self.model_info[self.model_info['RANK'].isin(ranks)].copy()
    sv_models.drop('model-obj', axis=1, inplace=True)
    sv_models.reset_index(drop=True, inplace=True)

    # Persist flag
    persist = self.kwargs.get('persist', False)

    # Mapping feature selection to permanent training data table, e.g.
    # {'lasso': 'ml__survived_lasso_1717475362789542',
    #  'pca': 'ml__survived_pca_1717475375119752'}
    # The permanent copy is created only for feature selections used by
    # a saved model, so no permanent table is left behind that the
    # deployed table does not reference.
    fs_to_data_dict = {}

    for index, row in sv_models.iterrows():
        if not use_top_n:
            # Ranks are reassigned following the leaderboard order.
            sv_models.loc[index, 'RANK'] = index + 1

        fs = row['FEATURE_SELECTION']
        if fs not in fs_to_data_dict:
            # Temporary training data table of the first model trained
            # with this feature selection.
            temp_data_table = self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs,
                                                  'DATA_TABLE'].iloc[0]
            fs_to_data_dict[fs] = self._create_per_result_table(
                prefix='{}_{}'.format(self.target_column, fs),
                persist_result_table=temp_data_table)
        # Replacing DATA_TABLE with permanent table name
        sv_models.loc[index, 'DATA_TABLE'] = fs_to_data_dict[fs]

        if not persist:
            # Result tables are copied only when 'fit' did not persist them.
            per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
                                                     persist_result_table=row['RESULT_TABLE'])
            sv_models.loc[index, 'RESULT_TABLE'] = per_name

    # Data Transformation Parameters
    df = self._deploy_data_transformation_params()

    # Saving data transformation parameters to the specified table
    sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)

    copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})

    print('Model Deployment Completed Successfully.')
|
|
1448
|
+
|
|
1449
|
+
def _create_per_result_table(self, prefix, persist_result_table):
    """
    DESCRIPTION:
        Internal Function creates permanent table for the specified result table
        by selecting all of its rows into a newly named, non-volatile table.

    PARAMETERS:
        prefix:
            Required Argument.
            Specifies the prefix for the permanent table name.
            Types: str

        persist_result_table:
            Required Argument.
            Specifies the result table name.
            Types: str

    RETURNS:
        Permanent table name.

    RAISES:
        TeradataMlException.
    """
    # Name is generated with gc_on_quit=False.
    # NOTE(review): presumably this keeps the table out of garbage
    # collection at session end - confirm against UtilFuncs.
    permanent_name = UtilFuncs._generate_temp_table_name(
        prefix=prefix,
        table_type=TeradataConstants.TERADATA_TABLE,
        gc_on_quit=False)

    # Copy every row of the source table into the new table.
    UtilFuncs._create_table(table_name=permanent_name,
                            query=f"SELECT * FROM {persist_result_table}",
                            volatile=False)
    return permanent_name
|
|
1480
|
+
|
|
1481
|
+
|
|
1482
|
+
def _deploy_data_transformation_params(self):
    """
    DESCRIPTION:
        Internal Function converts data transformation parameters dictionary (information of each step of automl)
        to DataFrame with rank as -1 and return the DataFrame that can be concatenated with model_info DataFrame
        and saved to the user specified table in database.
        teradataml DataFrames inside the parameters are replaced by table
        names and the fitted PCA object by a dictionary of its attributes,
        then the whole dictionary is serialized with joblib.

    PARAMETERS:
        None

    RETURNS:
        Pandas DataFrame with a single row holding:
            * 'RANK': -1
            * 'PARAMETERS': JSON list of the parameter names whose values
              were replaced by table names.
            * 'DATA_PARAMS': serialized data transformation parameters.

    RAISES:
        TeradataMlException.
    """
    # Define a recursive function to copy nested dictionaries
    def deep_copy_dict(d):
        if not isinstance(d, dict):
            return d  # Base case: non-dictionary values are shared, not copied
        return {k: deep_copy_dict(v) for k, v in d.items()}

    # Nested dictionaries are rebuilt so that replacing DataFrames with
    # table names below does not modify "self.data_transformation_params".
    data_params = {key: deep_copy_dict(value)
                   for key, value in self.data_transformation_params.items()}

    # Names of fit objects that contain the table names
    # pointing to tables in the database.
    fit_obj_names = []

    # Persist flag
    persist = self.kwargs.get('persist', False)

    def _table_name_for(tdml_df, prefix):
        # When 'fit' persisted its tables the existing table is referenced,
        # otherwise a permanent copy is created.
        if persist:
            return tdml_df._table_name
        return self._create_per_result_table(prefix='{}'.format(prefix),
                                             persist_result_table=tdml_df._table_name)

    data_params['auto_mode'] = self.custom_data is None

    # Iterating over data transformation parameters
    # aml_step_name is the name of transformation step taken and step_val is the value.
    # Only values of existing keys are reassigned, so iterating while
    # updating is safe.
    for aml_step_name, step_val in data_params.items():
        if isinstance(step_val, DataFrame):
            # Storing the table_name in data_params instead of dataframe.
            fit_obj_names.append(aml_step_name)
            data_params[aml_step_name] = _table_name_for(step_val, aml_step_name)
        elif isinstance(step_val, dict) and 'fit_obj' in aml_step_name:
            for key, item in step_val.items():
                if isinstance(item, DataFrame):
                    # Record the step once, however many DataFrames it holds;
                    # a repeated name makes 'load' convert the same entry
                    # twice and 'remove_saved_models' drop a table twice.
                    if aml_step_name not in fit_obj_names:
                        fit_obj_names.append(aml_step_name)
                    step_val[key] = _table_name_for(item, key)
        elif aml_step_name == 'pca_fit_instance':
            # Serializing PCA object as plain, JSON-like attributes
            pca = step_val
            data_params[aml_step_name] = {
                'n_components': pca.n_components_,
                'components': pca.components_.tolist(),
                'explained_variance': pca.explained_variance_.tolist(),
                'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
                'mean': pca.mean_.tolist(),
                'singular_values': pca.singular_values_.tolist(),
                'noise_variance': pca.noise_variance_
            }

    # Serializing data transformation parameters
    buffer = BytesIO()
    joblib.dump(data_params, buffer)
    serialized_data = buffer.getvalue()

    # Creating a string representation of fit object names
    param = json.dumps(fit_obj_names)

    # Creating a DataFrame of data transformation information
    row = {
        'RANK': -1,
        'PARAMETERS': param,
        'DATA_PARAMS': serialized_data,
    }
    return pd.DataFrame([row])
|
|
1587
|
+
|
|
1588
|
+
def _run_loaded_model(self,
                      test_data,
                      rank=1,
                      output_type='prediction'):
    """
    DESCRIPTION:
        Internal Function generates prediction or performance metrics using the specified model rank
        in the loaded models leaderboard.

    PARAMETERS:
        test_data:
            Required Argument.
            Specifies the test data on which prediction and performance metrics needs to be generated.
            Types: teradataml DataFrame

        rank:
            Optional Argument.
            Specifies the rank of the model in the leaderboard to be used for prediction.
            Default Value: 1
            Types: int

        output_type:
            Optional Argument.
            Specifies the type of output to be generated. Any value other
            than 'prediction' requests performance metrics.
            Default Value: 'prediction'
            Types: str

    RETURNS:
        Predictions when "output_type" is 'prediction', otherwise the
        performance metrics.

    RAISES:
        ValueError, if performance metrics are requested and the response
        column is not present in "test_data".
    """
    # Everything that is not a prediction request is a metrics request.
    want_metrics = output_type != 'prediction'

    # Indexing starts from 0
    rank = rank - 1
    # Extracting parameters
    parameters = ast.literal_eval(self.loaded_models_info.loc[rank, 'PARAMETERS'])
    response_column = parameters['response_column']
    # Model name: leading part of MODEL_ID before the first underscore
    model_name = self.loaded_models_info.loc[rank, 'MODEL_ID'].split('_')[0]
    # Feature selection
    fs = self.loaded_models_info.loc[rank, 'FEATURE_SELECTION']

    # Checking task type: an 'R2' column marks a regression leaderboard
    if 'R2' in self.loaded_models_info.columns:
        task_type = 'Regression'
    else:
        task_type = 'Classification'

    # Model names mapping to Analytic Functions
    func_map = {
        'XGBOOST': lambda params: XGBoost(**params),
        'GLM': lambda params: GLM(**params),
        'SVM': lambda params: SVM(**params),
        'DECISIONFOREST': lambda params: DecisionForest(**params),
        'KNN': lambda params: KNN(**params)
    }

    if output_type == 'prediction':
        print('Generating prediction using:')
    else:
        print('Generating performance metrics using:')
    print(f"Model Name: {model_name}")
    print(f"Feature Selection: {fs}")

    # Generating evaluation parameters
    eval_params = _ModelTraining._eval_params_generation(model_name,
                                                         response_column,
                                                         task_type)
    if task_type == 'Classification':
        eval_params['output_responses'] = parameters['output_responses']

    # Checking if response column is present in test data
    response_col_present = response_column in test_data.columns
    if not response_col_present:
        if want_metrics:
            # Response column is required for evaluation, raise error if not present.
            # The same "not 'prediction'" test is used as in the rest of this
            # function, so a metrics request can never silently fall back
            # to returning predictions.
            raise ValueError(f"Response column '{response_column}' is not present in test data for evaluation.")
        # Nothing to carry over to the output without a response column.
        eval_params.pop('accumulate', None)

    # Checking if data is already transformed before or not
    data_node_id = test_data._nodeid
    if not self.table_name_mapping.get(data_node_id):
        # Data transformation will be performed on raw test data
        self.transform_data(data=test_data,
                            data_params=self.loaded_data_transformation_params,
                            auto=self.loaded_data_transformation_params['auto_mode'],
                            verbose=0,
                            target_column_ind=response_col_present)

    # Extracting test data transformed for this model's feature selection
    for feature_selection, table_name in self.table_name_mapping[data_node_id].items():
        if fs in feature_selection:
            test_data = DataFrame(table_name)
            break

    if model_name == 'KNN':
        # KNN is rebuilt from its training data table and the test data.
        train_data = DataFrame(self.loaded_models_info.loc[rank, 'DATA_TABLE'])

        parameters['train_data'] = train_data
        parameters['test_data'] = test_data

        if response_column in test_data.columns:
            parameters['accumulate'] = response_column

        knn = func_map[model_name](parameters)

        if want_metrics:
            metrics = knn.evaluate(test_data=test_data, **eval_params)
        else:
            predictions = knn.result
    else:
        # Extracting result table name
        result_table_name = self.loaded_models_info.loc[rank, 'RESULT_TABLE']
        result_table = DataFrame(result_table_name)
        # The model object is rebuilt around the saved result table with
        # all argument/output processing skipped.
        params = {
            "skip_input_arg_processing": True,
            "skip_output_arg_processing": True,
            "skip_other_arg_processing": True,
            "skip_func_output_processing": True,
            "_result_data": result_table,
            "response_column": response_column
        }
        model = func_map[model_name](params)

        if want_metrics:
            metrics = model.evaluate(newdata=test_data, **eval_params)
        else:
            predictions = model.predict(newdata=test_data, **eval_params)

    # Return metrics, when output type is not prediction
    if want_metrics:
        return metrics

    # Return prediction, when output type is prediction
    return predictions if model_name == 'KNN' else predictions.result
|
|
1729
|
+
|
|
1730
|
+
@collect_queryband(queryband="AutoML_remove_saved_models")
def remove_saved_models(self,
                        table_name):
    """
    DESCRIPTION:
        Function removes the specified table containing saved models.
        The data tables, result tables and data transformation tables
        referenced by the saved models are dropped first.
        Note:
            * If any data table result table is not present inside the database,
              then it will be skipped.

    PARAMETERS:
        table_name:
            Required Argument.
            Specifies the table name containing saved models.
            Types: str

    RETURNS:
        None

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        >>> obj = AutoML()
        # Remove saved models from the specified table.
        >>> obj.remove_saved_models("model_table")
    """
    # Appending arguments to list for validation
    arg_info_matrix = []
    arg_info_matrix.append(["table_name", table_name, True, (str), True])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    df = DataFrame(table_name).to_pandas()

    drop_list = df['DATA_TABLE'].dropna().unique().tolist()
    drop_list.extend(df['RESULT_TABLE'].dropna().unique().tolist())

    # Removing data transformation parameters tables
    # NOTE(review): joblib.load is pickle based; only use on tables written
    # by a trusted 'deploy' call.
    data = df[df['RANK'] == -1].iloc[0]
    buffer = BytesIO(data['DATA_PARAMS'])
    data_params = joblib.load(buffer)
    fit_obj_lst = json.loads(data['PARAMETERS'])
    for i in fit_obj_lst:
        if isinstance(data_params[i], dict):
            drop_list.extend(data_params[i].values())
        else:
            drop_list.append(data_params[i])

    # The same table can be listed more than once; dropping it a second
    # time would fail and wrongly report it as missing. Keep the first
    # occurrence of every name, compared as it appears in the statement.
    seen_names = set()
    tables_to_drop = []
    for table in drop_list:
        if str(table) not in seen_names:
            seen_names.add(str(table))
            tables_to_drop.append(table)

    non_existent_tables = []
    for table in tables_to_drop:
        try:
            execute_sql(f"DROP TABLE {table};")
        except Exception:
            # Best effort: a table that cannot be dropped is reported
            # below and the remaining tables are still processed.
            non_existent_tables.append(table)
            continue

    if len(non_existent_tables) > 0:
        warnings.warn(message=f"\nThe following tables '{non_existent_tables}' do not exist in the database and have been skipped.",
                      stacklevel=2)

    db_drop_table(table_name)
|
|
743
1795
|
|
|
744
1796
|
@staticmethod
|
|
745
1797
|
def generate_custom_config(file_name = "custom"):
|
|
@@ -810,12 +1862,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
810
1862
|
Types: teradataml Dataframe
|
|
811
1863
|
|
|
812
1864
|
target_column:
|
|
813
|
-
Required
|
|
1865
|
+
Required Argument.
|
|
814
1866
|
Specifies the name of the target column in "data".
|
|
815
1867
|
Types: str
|
|
816
1868
|
|
|
817
1869
|
custom_data:
|
|
818
|
-
Optional
|
|
1870
|
+
Optional Argument.
|
|
819
1871
|
Specifies json object containing user customized input.
|
|
820
1872
|
Types: json object
|
|
821
1873
|
"""
|
|
@@ -830,14 +1882,16 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
830
1882
|
verbose = 0,
|
|
831
1883
|
max_runtime_secs = None,
|
|
832
1884
|
stopping_metric = None,
|
|
833
|
-
stopping_tolerance = None
|
|
1885
|
+
stopping_tolerance = None,
|
|
1886
|
+
max_models = None,
|
|
1887
|
+
**kwargs):
|
|
834
1888
|
"""
|
|
835
1889
|
DESCRIPTION:
|
|
836
1890
|
Internal Function runs Regression.
|
|
837
1891
|
|
|
838
1892
|
PARAMETERS:
|
|
839
1893
|
auto:
|
|
840
|
-
Optional
|
|
1894
|
+
Optional Argument.
|
|
841
1895
|
Specifies whether to run AutoML in custom mode or auto mode.
|
|
842
1896
|
When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
|
|
843
1897
|
Types: bool
|
|
@@ -853,20 +1907,44 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
853
1907
|
Types: int
|
|
854
1908
|
|
|
855
1909
|
max_runtime_secs:
|
|
856
|
-
Optional
|
|
1910
|
+
Optional Argument.
|
|
857
1911
|
Specifies the time limit in seconds for model training.
|
|
858
1912
|
Types: int
|
|
859
1913
|
|
|
860
1914
|
stopping_metric:
|
|
861
1915
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
862
|
-
|
|
1916
|
+
Specifies the stopping metrics for stopping tolerance in model training.
|
|
863
1917
|
Types: str
|
|
864
1918
|
|
|
865
1919
|
stopping_tolerance:
|
|
866
1920
|
Required, when "stopping_metric" is set, otherwise optional.
|
|
867
|
-
|
|
1921
|
+
Specifies the stopping tolerance for stopping metrics in model training.
|
|
868
1922
|
Types: float
|
|
1923
|
+
|
|
1924
|
+
max_models:
|
|
1925
|
+
Optional Argument.
|
|
1926
|
+
Specifies the maximum number of models to be trained.
|
|
1927
|
+
Types: int
|
|
869
1928
|
|
|
1929
|
+
volatile:
|
|
1930
|
+
Optional Argument.
|
|
1931
|
+
Specifies whether to put the results of the
|
|
1932
|
+
function in a volatile table or not. When set to
|
|
1933
|
+
True, results are stored in a volatile table,
|
|
1934
|
+
otherwise not.
|
|
1935
|
+
Default Value: False
|
|
1936
|
+
Types: bool
|
|
1937
|
+
|
|
1938
|
+
persist:
|
|
1939
|
+
Optional Argument.
|
|
1940
|
+
Specifies whether to persist the results of the
|
|
1941
|
+
function in a table or not. When set to True,
|
|
1942
|
+
results are persisted in a table; otherwise,
|
|
1943
|
+
results are garbage collected at the end of the
|
|
1944
|
+
session.
|
|
1945
|
+
Default Value: False
|
|
1946
|
+
Types: bool
|
|
1947
|
+
|
|
870
1948
|
RETURNS:
|
|
871
1949
|
a tuple containing, model information and leaderboard.
|
|
872
1950
|
"""
|
|
@@ -883,7 +1961,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
883
1961
|
target_column = self.target_column,
|
|
884
1962
|
model_list = model_list,
|
|
885
1963
|
verbose = verbose,
|
|
886
|
-
custom_data = self.custom_data
|
|
1964
|
+
custom_data = self.custom_data,
|
|
1965
|
+
**kwargs)
|
|
887
1966
|
# Start time
|
|
888
1967
|
start_time = time.time()
|
|
889
1968
|
data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
|
|
@@ -895,7 +1974,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
895
1974
|
verbose = verbose,
|
|
896
1975
|
excluded_columns = excluded_columns,
|
|
897
1976
|
custom_data = self.custom_data,
|
|
898
|
-
data_transform_dict = data_transformation_params
|
|
1977
|
+
data_transform_dict = data_transformation_params,
|
|
1978
|
+
**kwargs)
|
|
899
1979
|
features, data_transformation_params = self.data_preparation(auto)
|
|
900
1980
|
|
|
901
1981
|
# Calculating max_runtime_secs for model training by,
|
|
@@ -915,11 +1995,13 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
915
1995
|
verbose = verbose,
|
|
916
1996
|
features = features,
|
|
917
1997
|
task_type = "Regression",
|
|
918
|
-
custom_data = self.custom_data
|
|
1998
|
+
custom_data = self.custom_data,
|
|
1999
|
+
**kwargs)
|
|
919
2000
|
models_info, leaderboard, target_count = self.model_training(auto = auto,
|
|
920
2001
|
max_runtime_secs = max_runtime_secs,
|
|
921
2002
|
stopping_metric = stopping_metric,
|
|
922
|
-
stopping_tolerance = stopping_tolerance
|
|
2003
|
+
stopping_tolerance = stopping_tolerance,
|
|
2004
|
+
max_models = max_models)
|
|
923
2005
|
|
|
924
2006
|
return (models_info, leaderboard, target_count, target_label, data_transformation_params, self.table_name_mapping)
|
|
925
2007
|
|
|
@@ -940,12 +2022,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
940
2022
|
Types: teradataml Dataframe
|
|
941
2023
|
|
|
942
2024
|
target_column:
|
|
943
|
-
Required
|
|
2025
|
+
Required Argument.
|
|
944
2026
|
Specifies the name of the target column in "data".
|
|
945
2027
|
Types: str
|
|
946
2028
|
|
|
947
2029
|
custom_data:
|
|
948
|
-
Optional
|
|
2030
|
+
Optional Argument.
|
|
949
2031
|
Specifies json object containing user customized input.
|
|
950
2032
|
Types: json object
|
|
951
2033
|
"""
|
|
@@ -959,14 +2041,16 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
959
2041
|
verbose = 0,
|
|
960
2042
|
max_runtime_secs = None,
|
|
961
2043
|
stopping_metric = None,
|
|
962
|
-
stopping_tolerance = None
|
|
2044
|
+
stopping_tolerance = None,
|
|
2045
|
+
max_models = None,
|
|
2046
|
+
**kwargs):
|
|
963
2047
|
"""
|
|
964
2048
|
DESCRIPTION:
|
|
965
2049
|
Internal Function runs Classification.
|
|
966
2050
|
|
|
967
2051
|
PARAMETERS:
|
|
968
2052
|
auto:
|
|
969
|
-
Optional
|
|
2053
|
+
Optional Argument.
|
|
970
2054
|
Specifies whether to run AutoML in custom mode or auto mode.
|
|
971
2055
|
When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
|
|
972
2056
|
Types: bool
|
|
@@ -982,7 +2066,7 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
982
2066
|
Types: int
|
|
983
2067
|
|
|
984
2068
|
max_runtime_secs:
|
|
985
|
-
Optional
|
|
2069
|
+
Optional Argument.
|
|
986
2070
|
Specifies the time limit in seconds for model training.
|
|
987
2071
|
Types: int
|
|
988
2072
|
|
|
@@ -995,12 +2079,35 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
995
2079
|
Required, when "stopping_metric" is set, otherwise optional.
|
|
996
2080
|
Specifies the stopping tolerance for stopping metrics in model training.
|
|
997
2081
|
Types: float
|
|
998
|
-
|
|
2082
|
+
|
|
2083
|
+
max_models:
|
|
2084
|
+
Optional Argument.
|
|
2085
|
+
Specifies the maximum number of models to be trained.
|
|
2086
|
+
Types: int
|
|
2087
|
+
|
|
2088
|
+
volatile:
|
|
2089
|
+
Optional Argument.
|
|
2090
|
+
Specifies whether to put the results of the
|
|
2091
|
+
function in a volatile table or not. When set to
|
|
2092
|
+
True, results are stored in a volatile table,
|
|
2093
|
+
otherwise not.
|
|
2094
|
+
Default Value: False
|
|
2095
|
+
Types: bool
|
|
2096
|
+
|
|
2097
|
+
persist:
|
|
2098
|
+
Optional Argument.
|
|
2099
|
+
Specifies whether to persist the results of the
|
|
2100
|
+
function in a table or not. When set to True,
|
|
2101
|
+
results are persisted in a table; otherwise,
|
|
2102
|
+
results are garbage collected at the end of the
|
|
2103
|
+
session.
|
|
2104
|
+
Default Value: False
|
|
2105
|
+
Types: bool
|
|
2106
|
+
|
|
999
2107
|
RETURNS:
|
|
1000
2108
|
a tuple containing, model information and leaderboard.
|
|
1001
2109
|
"""
|
|
1002
|
-
|
|
1003
|
-
|
|
2110
|
+
|
|
1004
2111
|
# Feature Exploration Phase
|
|
1005
2112
|
_FeatureExplore.__init__(self,
|
|
1006
2113
|
data = self.data,
|
|
@@ -1015,7 +2122,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1015
2122
|
model_list = model_list,
|
|
1016
2123
|
verbose = verbose,
|
|
1017
2124
|
task_type = "Classification",
|
|
1018
|
-
custom_data = self.custom_data
|
|
2125
|
+
custom_data = self.custom_data,
|
|
2126
|
+
**kwargs)
|
|
1019
2127
|
# Start time
|
|
1020
2128
|
start_time = time.time()
|
|
1021
2129
|
data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
|
|
@@ -1027,7 +2135,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1027
2135
|
excluded_columns = excluded_columns,
|
|
1028
2136
|
custom_data = self.custom_data,
|
|
1029
2137
|
data_transform_dict = data_transformation_params,
|
|
1030
|
-
task_type = "Classification"
|
|
2138
|
+
task_type = "Classification",
|
|
2139
|
+
**kwargs)
|
|
1031
2140
|
features, data_transformation_params = self.data_preparation(auto)
|
|
1032
2141
|
|
|
1033
2142
|
# Calculating max_runtime_secs for model training by,
|
|
@@ -1047,11 +2156,13 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1047
2156
|
verbose = verbose,
|
|
1048
2157
|
features = features,
|
|
1049
2158
|
task_type = "Classification",
|
|
1050
|
-
custom_data = self.custom_data
|
|
2159
|
+
custom_data = self.custom_data,
|
|
2160
|
+
**kwargs)
|
|
1051
2161
|
models_info, leaderboard, target_count = self.model_training(auto = auto,
|
|
1052
2162
|
max_runtime_secs = max_runtime_secs,
|
|
1053
2163
|
stopping_metric = stopping_metric,
|
|
1054
|
-
stopping_tolerance = stopping_tolerance
|
|
2164
|
+
stopping_tolerance = stopping_tolerance,
|
|
2165
|
+
max_models = max_models)
|
|
1055
2166
|
|
|
1056
2167
|
return (models_info, leaderboard, target_count, target_label, data_transformation_params, self.table_name_mapping)
|
|
1057
2168
|
|
|
@@ -1166,7 +2277,7 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1166
2277
|
min_label_count = min(data[self.target_column].value_counts())
|
|
1167
2278
|
if self._data_sampling_method == 'SMOTE':
|
|
1168
2279
|
n_neighbors = min(5, min_label_count - 1)
|
|
1169
|
-
sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=
|
|
2280
|
+
sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=42)
|
|
1170
2281
|
else:
|
|
1171
2282
|
n_neighbors = min(3, min_label_count)
|
|
1172
2283
|
sampling_method = NearMiss(version=1, n_neighbors=n_neighbors)
|
|
@@ -1206,7 +2317,9 @@ class AutoRegressor(AutoML):
|
|
|
1206
2317
|
max_runtime_secs=None,
|
|
1207
2318
|
stopping_metric=None,
|
|
1208
2319
|
stopping_tolerance=None,
|
|
1209
|
-
|
|
2320
|
+
max_models=None,
|
|
2321
|
+
custom_config_file=None,
|
|
2322
|
+
**kwargs
|
|
1210
2323
|
):
|
|
1211
2324
|
"""
|
|
1212
2325
|
DESCRIPTION:
|
|
@@ -1239,7 +2352,7 @@ class AutoRegressor(AutoML):
|
|
|
1239
2352
|
Types: int
|
|
1240
2353
|
|
|
1241
2354
|
max_runtime_secs:
|
|
1242
|
-
Optional
|
|
2355
|
+
Optional Argument.
|
|
1243
2356
|
Specifies the time limit in seconds for model training.
|
|
1244
2357
|
Types: int
|
|
1245
2358
|
|
|
@@ -1247,8 +2360,10 @@ class AutoRegressor(AutoML):
|
|
|
1247
2360
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
1248
2361
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
1249
2362
|
Permitted Values:
|
|
1250
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
1251
|
-
"RMSE", "RMSLE"
|
|
2363
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
2364
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
2365
|
+
"ME", "EV", "MPD", "MGD"
|
|
2366
|
+
|
|
1252
2367
|
* For task_type "Classification": 'MICRO-F1','MACRO-F1',
|
|
1253
2368
|
'MICRO-RECALL','MACRO-RECALL',
|
|
1254
2369
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
@@ -1260,12 +2375,39 @@ class AutoRegressor(AutoML):
|
|
|
1260
2375
|
Required, when "stopping_metric" is set, otherwise optional.
|
|
1261
2376
|
Specifies the stopping tolerance for stopping metrics in model training.
|
|
1262
2377
|
Types: float
|
|
2378
|
+
|
|
2379
|
+
max_models:
|
|
2380
|
+
Optional Argument.
|
|
2381
|
+
Specifies the maximum number of models to be trained.
|
|
2382
|
+
Types: int
|
|
1263
2383
|
|
|
1264
2384
|
custom_config_file:
|
|
1265
2385
|
Optional Argument.
|
|
1266
2386
|
Specifies the path of JSON file in case of custom run.
|
|
1267
2387
|
Types: str
|
|
1268
|
-
|
|
2388
|
+
|
|
2389
|
+
**kwargs:
|
|
2390
|
+
Specifies the additional arguments for AutoRegressor. Below
|
|
2391
|
+
are the additional arguments:
|
|
2392
|
+
volatile:
|
|
2393
|
+
Optional Argument.
|
|
2394
|
+
Specifies whether to put the interim results of the
|
|
2395
|
+
functions in a volatile table or not. When set to
|
|
2396
|
+
True, results are stored in a volatile table,
|
|
2397
|
+
otherwise not.
|
|
2398
|
+
Default Value: False
|
|
2399
|
+
Types: bool
|
|
2400
|
+
|
|
2401
|
+
persist:
|
|
2402
|
+
Optional Argument.
|
|
2403
|
+
Specifies whether to persist the interim results of the
|
|
2404
|
+
functions in a table or not. When set to True,
|
|
2405
|
+
results are persisted in a table; otherwise,
|
|
2406
|
+
results are garbage collected at the end of the
|
|
2407
|
+
session.
|
|
2408
|
+
Default Value: False
|
|
2409
|
+
Types: bool
|
|
2410
|
+
|
|
1269
2411
|
RETURNS:
|
|
1270
2412
|
Instance of AutoRegressor.
|
|
1271
2413
|
|
|
@@ -1294,24 +2436,28 @@ class AutoRegressor(AutoML):
|
|
|
1294
2436
|
|
|
1295
2437
|
# Fit the data.
|
|
1296
2438
|
>>> automl_obj.fit(housing_train, "price")
|
|
2439
|
+
|
|
2440
|
+
# Display leaderboard.
|
|
2441
|
+
>>> automl_obj.leaderboard()
|
|
1297
2442
|
|
|
1298
|
-
#
|
|
1299
|
-
>>>
|
|
1300
|
-
>>> prediction
|
|
2443
|
+
# Display best performing model.
|
|
2444
|
+
>>> automl_obj.leader()
|
|
1301
2445
|
|
|
1302
|
-
# Run predict
|
|
2446
|
+
# Run predict on test data using best performing model.
|
|
1303
2447
|
>>> prediction = automl_obj.predict(housing_test)
|
|
1304
2448
|
>>> prediction
|
|
1305
2449
|
|
|
1306
|
-
# Run predict
|
|
2450
|
+
# Run predict on test data using second best performing model.
|
|
1307
2451
|
>>> prediction = automl_obj.predict(housing_test, rank=2)
|
|
1308
2452
|
>>> prediction
|
|
1309
|
-
|
|
1310
|
-
#
|
|
1311
|
-
>>> automl_obj.
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
2453
|
+
|
|
2454
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2455
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
2456
|
+
>>> performance_metrics
|
|
2457
|
+
|
|
2458
|
+
# Run evaluate to get performance metrics using second best performing model.
|
|
2459
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test, 2)
|
|
2460
|
+
>>> performance_metrics
|
|
1315
2461
|
|
|
1316
2462
|
# Example 2 : Run AutoRegressor for regression problem with early stopping metric and tolerance.
|
|
1317
2463
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -1325,19 +2471,24 @@ class AutoRegressor(AutoML):
|
|
|
1325
2471
|
|
|
1326
2472
|
# Create instance of AutoRegressor.
|
|
1327
2473
|
>>> automl_obj = AutoRegressor(verbose=2,
|
|
1328
|
-
>>>
|
|
1329
|
-
>>>
|
|
1330
|
-
>>>
|
|
1331
|
-
>>>
|
|
2474
|
+
>>> exclude="xgboost",
|
|
2475
|
+
>>> stopping_metric="R2",
|
|
2476
|
+
>>> stopping_tolerance=0.7,
|
|
2477
|
+
>>> max_models=10,
|
|
2478
|
+
>>> custom_config_file="custom_housing.json")
|
|
1332
2479
|
# Fit the data.
|
|
1333
2480
|
>>> automl_obj.fit(housing_train, "price")
|
|
1334
|
-
|
|
1335
|
-
# Run predict with best performing model.
|
|
1336
|
-
>>> prediction = automl_obj.predict()
|
|
1337
|
-
>>> prediction
|
|
1338
|
-
|
|
2481
|
+
|
|
1339
2482
|
# Display leaderboard.
|
|
1340
2483
|
>>> automl_obj.leaderboard()
|
|
2484
|
+
|
|
2485
|
+
# Run predict on test data using best performing model.
|
|
2486
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
2487
|
+
>>> prediction
|
|
2488
|
+
|
|
2489
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2490
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
2491
|
+
>>> performance_metrics
|
|
1341
2492
|
|
|
1342
2493
|
# Example 3 : Run AutoRegressor for regression problem with maximum runtime.
|
|
1343
2494
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -1345,29 +2496,34 @@ class AutoRegressor(AutoML):
|
|
|
1345
2496
|
|
|
1346
2497
|
# Create instance of AutoRegressor.
|
|
1347
2498
|
>>> automl_obj = AutoRegressor(verbose=2,
|
|
1348
|
-
>>>
|
|
1349
|
-
>>>
|
|
2499
|
+
>>> exclude="xgboost",
|
|
2500
|
+
>>> max_runtime_secs=500)
|
|
1350
2501
|
# Fit the data.
|
|
1351
2502
|
>>> automl_obj.fit(housing_train, "price")
|
|
1352
|
-
|
|
1353
|
-
# Run predict with best performing model.
|
|
1354
|
-
>>> prediction = automl_obj.predict()
|
|
1355
|
-
>>> prediction
|
|
1356
|
-
|
|
1357
|
-
# Run predict with second best performing model.
|
|
1358
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
1359
|
-
>>> prediction
|
|
1360
|
-
|
|
2503
|
+
|
|
1361
2504
|
# Display leaderboard.
|
|
1362
2505
|
>>> automl_obj.leaderboard()
|
|
1363
2506
|
|
|
1364
2507
|
# Display best performing model.
|
|
1365
2508
|
>>> automl_obj.leader()
|
|
2509
|
+
|
|
2510
|
+
# Run predict on test data using best performing model.
|
|
2511
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
2512
|
+
>>> prediction
|
|
2513
|
+
|
|
2514
|
+
# Run predict on test data using second best performing model.
|
|
2515
|
+
>>> prediction = automl_obj.predict(housing_test, 2)
|
|
2516
|
+
>>> prediction
|
|
2517
|
+
|
|
2518
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2519
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
2520
|
+
>>> performance_metrics
|
|
1366
2521
|
"""
|
|
1367
2522
|
self.verbose = verbose
|
|
1368
2523
|
self.max_runtime_secs = max_runtime_secs
|
|
1369
2524
|
self.stopping_metric = stopping_metric
|
|
1370
2525
|
self.stopping_tolerance = stopping_tolerance
|
|
2526
|
+
self.max_models = max_models
|
|
1371
2527
|
self.custom_config_file = custom_config_file
|
|
1372
2528
|
self.task_type = "Regression"
|
|
1373
2529
|
self.include = include
|
|
@@ -1380,7 +2536,9 @@ class AutoRegressor(AutoML):
|
|
|
1380
2536
|
max_runtime_secs=self.max_runtime_secs,
|
|
1381
2537
|
stopping_metric=self.stopping_metric,
|
|
1382
2538
|
stopping_tolerance=self.stopping_tolerance,
|
|
1383
|
-
|
|
2539
|
+
max_models=self.max_models,
|
|
2540
|
+
custom_config_file=self.custom_config_file,
|
|
2541
|
+
**kwargs)
|
|
1384
2542
|
class AutoClassifier(AutoML):
|
|
1385
2543
|
|
|
1386
2544
|
def __init__(self,
|
|
@@ -1390,7 +2548,9 @@ class AutoClassifier(AutoML):
|
|
|
1390
2548
|
max_runtime_secs=None,
|
|
1391
2549
|
stopping_metric=None,
|
|
1392
2550
|
stopping_tolerance=None,
|
|
1393
|
-
|
|
2551
|
+
max_models=None,
|
|
2552
|
+
custom_config_file=None,
|
|
2553
|
+
**kwargs
|
|
1394
2554
|
):
|
|
1395
2555
|
"""
|
|
1396
2556
|
DESCRIPTION:
|
|
@@ -1423,32 +2583,61 @@ class AutoClassifier(AutoML):
|
|
|
1423
2583
|
Types: int
|
|
1424
2584
|
|
|
1425
2585
|
max_runtime_secs:
|
|
1426
|
-
Optional
|
|
2586
|
+
Optional Argument.
|
|
1427
2587
|
Specifies the time limit in seconds for model training.
|
|
1428
2588
|
Types: int
|
|
1429
2589
|
|
|
1430
2590
|
stopping_metric:
|
|
1431
2591
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
1432
2592
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
1433
|
-
Types: str
|
|
1434
|
-
|
|
1435
|
-
stopping_tolerance:
|
|
1436
|
-
Required, when "stopping_metric" is set, otherwise optional.
|
|
1437
|
-
Specifies the stopping tolerance for stopping metrics in model training.
|
|
1438
2593
|
Permitted Values:
|
|
1439
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
1440
|
-
"RMSE", "RMSLE"
|
|
2594
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
2595
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
2596
|
+
"ME", "EV", "MPD", "MGD"
|
|
2597
|
+
|
|
1441
2598
|
* For task_type "Classification": 'MICRO-F1','MACRO-F1',
|
|
1442
2599
|
'MICRO-RECALL','MACRO-RECALL',
|
|
1443
2600
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
1444
2601
|
'WEIGHTED-PRECISION','WEIGHTED-RECALL',
|
|
1445
2602
|
'WEIGHTED-F1', 'ACCURACY'
|
|
2603
|
+
Types: str
|
|
2604
|
+
|
|
2605
|
+
stopping_tolerance:
|
|
2606
|
+
Required, when "stopping_metric" is set, otherwise optional.
|
|
2607
|
+
Specifies the stopping tolerance for stopping metrics in model training.
|
|
1446
2608
|
Types: float
|
|
2609
|
+
|
|
2610
|
+
max_models:
|
|
2611
|
+
Optional Argument.
|
|
2612
|
+
Specifies the maximum number of models to be trained.
|
|
2613
|
+
Types: int
|
|
1447
2614
|
|
|
1448
2615
|
custom_config_file:
|
|
1449
2616
|
Optional Argument.
|
|
1450
2617
|
Specifies the path of json file in case of custom run.
|
|
1451
2618
|
Types: str
|
|
2619
|
+
|
|
2620
|
+
**kwargs:
|
|
2621
|
+
Specifies the additional arguments for AutoClassifier. Below
|
|
2622
|
+
are the additional arguments:
|
|
2623
|
+
volatile:
|
|
2624
|
+
Optional Argument.
|
|
2625
|
+
Specifies whether to put the interim results of the
|
|
2626
|
+
functions in a volatile table or not. When set to
|
|
2627
|
+
True, results are stored in a volatile table,
|
|
2628
|
+
otherwise not.
|
|
2629
|
+
Default Value: False
|
|
2630
|
+
Types: bool
|
|
2631
|
+
|
|
2632
|
+
persist:
|
|
2633
|
+
Optional Argument.
|
|
2634
|
+
Specifies whether to persist the interim results of the
|
|
2635
|
+
functions in a table or not. When set to True,
|
|
2636
|
+
results are persisted in a table; otherwise,
|
|
2637
|
+
results are garbage collected at the end of the
|
|
2638
|
+
session.
|
|
2639
|
+
Default Value: False
|
|
2640
|
+
Types: bool
|
|
1452
2641
|
|
|
1453
2642
|
RETURNS:
|
|
1454
2643
|
Instance of AutoClassifier.
|
|
@@ -1484,24 +2673,28 @@ class AutoClassifier(AutoML):
|
|
|
1484
2673
|
|
|
1485
2674
|
# Fit the data.
|
|
1486
2675
|
>>> automl_obj.fit(admissions_train, "admitted")
|
|
2676
|
+
|
|
2677
|
+
# Display leaderboard.
|
|
2678
|
+
>>> automl_obj.leaderboard()
|
|
1487
2679
|
|
|
1488
|
-
#
|
|
1489
|
-
>>>
|
|
1490
|
-
>>> prediction
|
|
2680
|
+
# Display best performing model.
|
|
2681
|
+
>>> automl_obj.leader()
|
|
1491
2682
|
|
|
1492
|
-
# Run predict
|
|
2683
|
+
# Run predict on test data using best performing model.
|
|
1493
2684
|
>>> prediction = automl_obj.predict(admissions_test)
|
|
1494
2685
|
>>> prediction
|
|
1495
2686
|
|
|
1496
|
-
# Run predict
|
|
2687
|
+
# Run predict on test data using second best performing model.
|
|
1497
2688
|
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
1498
2689
|
>>> prediction
|
|
1499
|
-
|
|
1500
|
-
#
|
|
1501
|
-
>>> automl_obj.
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
2690
|
+
|
|
2691
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2692
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test)
|
|
2693
|
+
>>> performance_metrics
|
|
2694
|
+
|
|
2695
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
2696
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test, 4)
|
|
2697
|
+
>>> performance_metrics
|
|
1505
2698
|
|
|
1506
2699
|
# Example 2 : Run AutoClassifier for binary classification.
|
|
1507
2700
|
# Scenario : Predict whether passenger aboard the RMS Titanic survived
|
|
@@ -1510,6 +2703,11 @@ class AutoClassifier(AutoML):
|
|
|
1510
2703
|
# configuration file to customize different processes of
|
|
1511
2704
|
# AutoML Run.
|
|
1512
2705
|
|
|
2706
|
+
# Split the data into train and test.
|
|
2707
|
+
>>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
|
|
2708
|
+
>>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2709
|
+
>>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
2710
|
+
|
|
1513
2711
|
# Generate custom configuration file.
|
|
1514
2712
|
>>> AutoClassifier.generate_custom_config("custom_titanic")
|
|
1515
2713
|
|
|
@@ -1517,21 +2715,25 @@ class AutoClassifier(AutoML):
|
|
|
1517
2715
|
>>> automl_obj = AutoClassifier(verbose=2,
|
|
1518
2716
|
>>> custom_config_file="custom_titanic.json")
|
|
1519
2717
|
# Fit the data.
|
|
1520
|
-
>>> automl_obj.fit(
|
|
1521
|
-
|
|
1522
|
-
# Run predict with best performing model.
|
|
1523
|
-
>>> prediction = automl_obj.predict()
|
|
1524
|
-
>>> prediction
|
|
1525
|
-
|
|
1526
|
-
# Run predict with second best performing model.
|
|
1527
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
1528
|
-
>>> prediction
|
|
2718
|
+
>>> automl_obj.fit(titanic_train, titanic_train.survived)
|
|
1529
2719
|
|
|
1530
2720
|
# Display leaderboard.
|
|
1531
2721
|
>>> automl_obj.leaderboard()
|
|
1532
2722
|
|
|
1533
2723
|
# Display best performing model.
|
|
1534
2724
|
>>> automl_obj.leader()
|
|
2725
|
+
|
|
2726
|
+
# Run predict on test data using best performing model.
|
|
2727
|
+
>>> prediction = automl_obj.predict(titanic_test)
|
|
2728
|
+
>>> prediction
|
|
2729
|
+
|
|
2730
|
+
# Run predict on test data using second best performing model.
|
|
2731
|
+
>>> prediction = automl_obj.predict(titanic_test, rank=2)
|
|
2732
|
+
>>> prediction
|
|
2733
|
+
|
|
2734
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2735
|
+
>>> performance_metrics = automl_obj.evaluate(titanic_test)
|
|
2736
|
+
>>> performance_metrics
|
|
1535
2737
|
|
|
1536
2738
|
# Example 3 : Run AutoClassifier for multiclass classification problem.
|
|
1537
2739
|
# Scenario : Predict the species of iris flower based on different factors.
|
|
@@ -1539,6 +2741,11 @@ class AutoClassifier(AutoML):
|
|
|
1539
2741
|
# models. Use custom configuration file to customize different
|
|
1540
2742
|
# processes of AutoML Run.
|
|
1541
2743
|
|
|
2744
|
+
# Split the data into train and test.
|
|
2745
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
2746
|
+
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2747
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
2748
|
+
|
|
1542
2749
|
# Generate custom configuration file.
|
|
1543
2750
|
>>> AutoClassifier.generate_custom_config("custom_iris")
|
|
1544
2751
|
|
|
@@ -1546,18 +2753,22 @@ class AutoClassifier(AutoML):
|
|
|
1546
2753
|
>>> automl_obj = AutoClassifier(verbose=1,
|
|
1547
2754
|
>>> custom_config_file="custom_iris.json")
|
|
1548
2755
|
# Fit the data.
|
|
1549
|
-
>>> automl_obj.fit(
|
|
1550
|
-
|
|
1551
|
-
# Predict using best performing model.
|
|
1552
|
-
>>> prediction = automl_obj.predict()
|
|
1553
|
-
>>> prediction
|
|
1554
|
-
|
|
2756
|
+
>>> automl_obj.fit(iris_train, "species")
|
|
2757
|
+
|
|
1555
2758
|
# Display leaderboard.
|
|
1556
2759
|
>>> automl_obj.leaderboard()
|
|
1557
2760
|
|
|
1558
2761
|
# Display best performing model.
|
|
1559
2762
|
>>> automl_obj.leader()
|
|
1560
2763
|
|
|
2764
|
+
# Predict on test data using best performing model.
|
|
2765
|
+
>>> prediction = automl_obj.predict(iris_test)
|
|
2766
|
+
>>> prediction
|
|
2767
|
+
|
|
2768
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2769
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test)
|
|
2770
|
+
>>> performance_metrics
|
|
2771
|
+
|
|
1561
2772
|
# Example 4 : Run AutoClassifier for classification problem with stopping metric and tolerance.
|
|
1562
2773
|
# Scenario : Predict whether passenger aboard the RMS Titanic survived
|
|
1563
2774
|
# or not based on different factors. Use custom configuration
|
|
@@ -1565,64 +2776,87 @@ class AutoClassifier(AutoML):
|
|
|
1565
2776
|
# performance threshold to acquire for the available models, and
|
|
1566
2777
|
# terminate training upon meeting the stipulated performance criteria.
|
|
1567
2778
|
|
|
2779
|
+
# Split the data into train and test.
|
|
2780
|
+
>>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
|
|
2781
|
+
>>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2782
|
+
>>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
2783
|
+
|
|
1568
2784
|
# Generate custom configuration file.
|
|
1569
2785
|
>>> AutoClassifier.generate_custom_config("custom_titanic")
|
|
1570
2786
|
|
|
1571
2787
|
# Create instance of AutoClassifier.
|
|
1572
2788
|
>>> automl_obj = AutoClassifier(verbose=2,
|
|
1573
|
-
>>>
|
|
1574
|
-
>>>
|
|
1575
|
-
>>>
|
|
1576
|
-
>>>
|
|
2789
|
+
>>> exclude="xgboost",
|
|
2790
|
+
>>> stopping_metric="MICRO-F1",
|
|
2791
|
+
>>> stopping_tolerance=0.7,
|
|
2792
|
+
>>> max_models=8,
|
|
2793
|
+
>>> custom_config_file="custom_titanic.json")
|
|
1577
2794
|
# Fit the data.
|
|
1578
|
-
>>> automl_obj.fit(
|
|
1579
|
-
|
|
1580
|
-
# Run predict with best performing model.
|
|
1581
|
-
>>> prediction = automl_obj.predict()
|
|
1582
|
-
>>> prediction
|
|
1583
|
-
|
|
2795
|
+
>>> automl_obj.fit(titanic_train, titanic_train.survived)
|
|
2796
|
+
|
|
1584
2797
|
# Display leaderboard.
|
|
1585
2798
|
>>> automl_obj.leaderboard()
|
|
2799
|
+
|
|
2800
|
+
# Run predict on test data using best performing model.
|
|
2801
|
+
>>> prediction = automl_obj.predict(titanic_test)
|
|
2802
|
+
>>> prediction
|
|
2803
|
+
|
|
2804
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2805
|
+
>>> performance_metrics = automl_obj.evaluate(titanic_test)
|
|
2806
|
+
>>> performance_metrics
|
|
1586
2807
|
|
|
1587
2808
|
# Example 5 : Run AutoClassifier for classification problem with maximum runtime.
|
|
1588
2809
|
# Scenario : Predict the species of iris flower based on different factors.
|
|
1589
2810
|
# Run AutoML to get the best performing model in specified time.
|
|
2811
|
+
|
|
2812
|
+
# Split the data into train and test.
|
|
2813
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
2814
|
+
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2815
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
1590
2816
|
|
|
1591
2817
|
# Create instance of AutoClassifier.
|
|
1592
2818
|
>>> automl_obj = AutoClassifier(verbose=2,
|
|
1593
|
-
>>>
|
|
1594
|
-
>>>
|
|
2819
|
+
>>> exclude="xgboost",
|
|
2820
|
+
>>> max_runtime_secs=500,
|
|
2821
|
+
>>> max_models=3)
|
|
1595
2822
|
# Fit the data.
|
|
1596
|
-
>>> automl_obj.fit(
|
|
1597
|
-
|
|
1598
|
-
# Run predict with best performing model.
|
|
1599
|
-
>>> prediction = automl_obj.predict()
|
|
1600
|
-
>>> prediction
|
|
1601
|
-
|
|
1602
|
-
# Run predict with second best performing model.
|
|
1603
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
1604
|
-
>>> prediction
|
|
1605
|
-
|
|
2823
|
+
>>> automl_obj.fit(iris_train, iris_train.species)
|
|
2824
|
+
|
|
1606
2825
|
# Display leaderboard.
|
|
1607
2826
|
>>> automl_obj.leaderboard()
|
|
1608
2827
|
|
|
1609
2828
|
# Display best performing model.
|
|
1610
|
-
>>> automl_obj.leader()
|
|
2829
|
+
>>> automl_obj.leader()
|
|
2830
|
+
|
|
2831
|
+
# Run predict on test data using best performing model.
|
|
2832
|
+
>>> prediction = automl_obj.predict(iris_test)
|
|
2833
|
+
>>> prediction
|
|
2834
|
+
|
|
2835
|
+
# Run predict on test data using second best performing model.
|
|
2836
|
+
>>> prediction = automl_obj.predict(iris_test, rank=2)
|
|
2837
|
+
>>> prediction
|
|
2838
|
+
|
|
2839
|
+
# Run evaluate to get performance metrics using model rank 3.
|
|
2840
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test, 3)
|
|
2841
|
+
>>> performance_metrics
|
|
1611
2842
|
"""
|
|
1612
2843
|
self.verbose = verbose
|
|
1613
2844
|
self.max_runtime_secs = max_runtime_secs
|
|
1614
2845
|
self.stopping_metric = stopping_metric
|
|
1615
2846
|
self.stopping_tolerance = stopping_tolerance
|
|
2847
|
+
self.max_models = max_models
|
|
1616
2848
|
self.custom_config_file = custom_config_file
|
|
1617
2849
|
self.task_type = "Classification"
|
|
1618
2850
|
self.include = include
|
|
1619
2851
|
self.exclude = exclude
|
|
1620
2852
|
|
|
1621
2853
|
super(AutoClassifier, self).__init__(task_type=self.task_type,
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
2854
|
+
include = self.include,
|
|
2855
|
+
exclude = self.exclude,
|
|
2856
|
+
verbose=self.verbose,
|
|
2857
|
+
max_runtime_secs=self.max_runtime_secs,
|
|
2858
|
+
stopping_metric=self.stopping_metric,
|
|
2859
|
+
stopping_tolerance=self.stopping_tolerance,
|
|
2860
|
+
max_models=self.max_models,
|
|
2861
|
+
custom_config_file=self.custom_config_file,
|
|
2862
|
+
**kwargs)
|