teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/automl/__init__.py
CHANGED
|
@@ -15,22 +15,29 @@
|
|
|
15
15
|
|
|
16
16
|
# Python libraries
|
|
17
17
|
import json
|
|
18
|
+
import pandas as pd
|
|
18
19
|
import numpy as np
|
|
19
20
|
from sklearn.metrics import confusion_matrix
|
|
20
21
|
import time
|
|
22
|
+
import ast
|
|
23
|
+
import warnings
|
|
24
|
+
import joblib
|
|
25
|
+
from io import BytesIO
|
|
21
26
|
|
|
22
27
|
# Teradata libraries
|
|
23
28
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
24
29
|
from teradataml import ColumnExpression
|
|
25
30
|
from teradataml.dataframe.dataframe import DataFrame
|
|
31
|
+
from teradataml.utils.utils import execute_sql
|
|
26
32
|
from teradataml.utils.validators import _Validators
|
|
27
|
-
from teradataml import ROC
|
|
28
|
-
from teradataml.common.utils import UtilFuncs
|
|
33
|
+
from teradataml import ROC, BLOB
|
|
29
34
|
from teradataml.utils.dtypes import _Dtypes
|
|
30
35
|
from teradataml.common.utils import UtilFuncs
|
|
31
36
|
from teradataml import TeradataMlException
|
|
32
37
|
from teradataml.common.messages import Messages, MessageCodes
|
|
33
|
-
from
|
|
38
|
+
from teradataml.telemetry_utils.queryband import collect_queryband
|
|
39
|
+
from teradataml import TeradataConstants
|
|
40
|
+
from teradataml import XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table
|
|
34
41
|
|
|
35
42
|
# AutoML Internal libraries
|
|
36
43
|
from teradataml.automl.data_preparation import _DataPreparation
|
|
@@ -53,7 +60,8 @@ class AutoML:
|
|
|
53
60
|
stopping_metric = None,
|
|
54
61
|
stopping_tolerance = None,
|
|
55
62
|
max_models = None,
|
|
56
|
-
custom_config_file = None
|
|
63
|
+
custom_config_file = None,
|
|
64
|
+
**kwargs):
|
|
57
65
|
"""
|
|
58
66
|
DESCRIPTION:
|
|
59
67
|
AutoML (Automated Machine Learning) is an approach that automates the process
|
|
@@ -132,8 +140,10 @@ class AutoML:
|
|
|
132
140
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
133
141
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
134
142
|
Permitted Values:
|
|
135
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
136
|
-
"RMSE", "RMSLE"
|
|
143
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
144
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
145
|
+
"ME", "EV", "MPD", "MGD"
|
|
146
|
+
|
|
137
147
|
* For task_type "Classification": 'MICRO-F1','MACRO-F1',
|
|
138
148
|
'MICRO-RECALL','MACRO-RECALL',
|
|
139
149
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
@@ -155,6 +165,28 @@ class AutoML:
|
|
|
155
165
|
Optional Argument.
|
|
156
166
|
Specifies the path of JSON file in case of custom run.
|
|
157
167
|
Types: str
|
|
168
|
+
|
|
169
|
+
**kwargs:
|
|
170
|
+
Specifies the additional arguments for AutoML. Below
|
|
171
|
+
are the additional arguments:
|
|
172
|
+
volatile:
|
|
173
|
+
Optional Argument.
|
|
174
|
+
Specifies whether to put the interim results of the
|
|
175
|
+
functions in a volatile table or not. When set to
|
|
176
|
+
True, results are stored in a volatile table,
|
|
177
|
+
otherwise not.
|
|
178
|
+
Default Value: False
|
|
179
|
+
Types: bool
|
|
180
|
+
|
|
181
|
+
persist:
|
|
182
|
+
Optional Argument.
|
|
183
|
+
Specifies whether to persist the interim results of the
|
|
184
|
+
functions in a table or not. When set to True,
|
|
185
|
+
results are persisted in a table; otherwise,
|
|
186
|
+
results are garbage collected at the end of the
|
|
187
|
+
session.
|
|
188
|
+
Default Value: False
|
|
189
|
+
Types: bool
|
|
158
190
|
|
|
159
191
|
RETURNS:
|
|
160
192
|
Instance of AutoML.
|
|
@@ -192,24 +224,28 @@ class AutoML:
|
|
|
192
224
|
|
|
193
225
|
# Fit the data.
|
|
194
226
|
>>> automl_obj.fit(admissions_train, "admitted")
|
|
195
|
-
|
|
196
|
-
# Run predict with best performing model.
|
|
197
|
-
>>> prediction = automl_obj.predict()
|
|
198
|
-
>>> prediction
|
|
199
|
-
|
|
200
|
-
# Run predict for new test data with best performing model.
|
|
201
|
-
>>> prediction = automl_obj.predict(admissions_test)
|
|
202
|
-
>>> prediction
|
|
203
227
|
|
|
204
|
-
# Run predict for new test data with second best performing model.
|
|
205
|
-
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
206
|
-
>>> prediction
|
|
207
|
-
|
|
208
228
|
# Display leaderboard.
|
|
209
229
|
>>> automl_obj.leaderboard()
|
|
210
230
|
|
|
211
231
|
# Display best performing model.
|
|
212
232
|
>>> automl_obj.leader()
|
|
233
|
+
|
|
234
|
+
# Run predict on test data using best performing model.
|
|
235
|
+
>>> prediction = automl_obj.predict(admissions_test)
|
|
236
|
+
>>> prediction
|
|
237
|
+
|
|
238
|
+
# Run predict on test data using second best performing model.
|
|
239
|
+
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
240
|
+
>>> prediction
|
|
241
|
+
|
|
242
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
243
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test)
|
|
244
|
+
>>> performance_metrics
|
|
245
|
+
|
|
246
|
+
# Run evaluate to get performance metrics using model rank 3.
|
|
247
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test, rank=3)
|
|
248
|
+
>>> performance_metrics
|
|
213
249
|
|
|
214
250
|
# Example 2 : Run AutoML for regression problem.
|
|
215
251
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -228,24 +264,28 @@ class AutoML:
|
|
|
228
264
|
>>> custom_config_file="custom_housing.json")
|
|
229
265
|
# Fit the data.
|
|
230
266
|
>>> automl_obj.fit(housing_train, "price")
|
|
231
|
-
|
|
232
|
-
# Run predict with best performing model.
|
|
233
|
-
>>> prediction = automl_obj.predict()
|
|
234
|
-
>>> prediction
|
|
235
|
-
|
|
236
|
-
# Run predict for new test data with best performing model.
|
|
237
|
-
>>> prediction = automl_obj.predict(housing_test)
|
|
238
|
-
>>> prediction
|
|
239
267
|
|
|
240
|
-
# Run predict for new test data with second best performing model.
|
|
241
|
-
>>> prediction = automl_obj.predict(housing_test, rank=2)
|
|
242
|
-
>>> prediction
|
|
243
|
-
|
|
244
268
|
# Display leaderboard.
|
|
245
269
|
>>> automl_obj.leaderboard()
|
|
246
270
|
|
|
247
271
|
# Display best performing model.
|
|
248
272
|
>>> automl_obj.leader()
|
|
273
|
+
|
|
274
|
+
# Run predict on test data using best performing model.
|
|
275
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
276
|
+
>>> prediction
|
|
277
|
+
|
|
278
|
+
# Run predict on test data using second best performing model.
|
|
279
|
+
>>> prediction = automl_obj.predict(housing_test, rank=2)
|
|
280
|
+
>>> prediction
|
|
281
|
+
|
|
282
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
283
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
284
|
+
>>> performance_metrics
|
|
285
|
+
|
|
286
|
+
# Run evaluate to get performance metrics using second best performing model.
|
|
287
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test, rank=2)
|
|
288
|
+
>>> performance_metrics
|
|
249
289
|
|
|
250
290
|
# Example 3 : Run AutoML for multiclass classification problem.
|
|
251
291
|
# Scenario : Predict the species of iris flower based on different
|
|
@@ -253,6 +293,11 @@ class AutoML:
|
|
|
253
293
|
# different processes of AutoML Run to get the best
|
|
254
294
|
# performing model out of available models.
|
|
255
295
|
|
|
296
|
+
# Split the data into train and test.
|
|
297
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
298
|
+
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
299
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
300
|
+
|
|
256
301
|
# Generate custom JSON file
|
|
257
302
|
>>> AutoML.generate_custom_config()
|
|
258
303
|
|
|
@@ -260,22 +305,23 @@ class AutoML:
|
|
|
260
305
|
>>> automl_obj = AutoML(verbose=2,
|
|
261
306
|
>>> exclude="xgboost",
|
|
262
307
|
>>> custom_config_file="custom.json")
|
|
308
|
+
|
|
263
309
|
# Fit the data.
|
|
264
|
-
>>> automl_obj.fit(
|
|
265
|
-
|
|
266
|
-
# Run predict with best performing model.
|
|
267
|
-
>>> prediction = automl_obj.predict()
|
|
268
|
-
>>> prediction
|
|
269
|
-
|
|
270
|
-
# Run predict with second best performing model.
|
|
271
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
272
|
-
>>> prediction
|
|
310
|
+
>>> automl_obj.fit(iris_train, iris_train.species)
|
|
273
311
|
|
|
274
312
|
# Display leaderboard.
|
|
275
313
|
>>> automl_obj.leaderboard()
|
|
276
314
|
|
|
277
315
|
# Display best performing model.
|
|
278
316
|
>>> automl_obj.leader()
|
|
317
|
+
|
|
318
|
+
# Run predict on test data using second best performing model.
|
|
319
|
+
>>> prediction = automl_obj.predict(iris_test, rank=2)
|
|
320
|
+
>>> prediction
|
|
321
|
+
|
|
322
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
323
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test)
|
|
324
|
+
>>> performance_metrics
|
|
279
325
|
|
|
280
326
|
# Example 4 : Run AutoML for regression problem with early stopping metric and tolerance.
|
|
281
327
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -296,39 +342,57 @@ class AutoML:
|
|
|
296
342
|
>>> custom_config_file="custom_housing.json")
|
|
297
343
|
# Fit the data.
|
|
298
344
|
>>> automl_obj.fit(housing_train, "price")
|
|
299
|
-
|
|
300
|
-
# Run predict with best performing model.
|
|
301
|
-
>>> prediction = automl_obj.predict()
|
|
302
|
-
>>> prediction
|
|
303
|
-
|
|
345
|
+
|
|
304
346
|
# Display leaderboard.
|
|
305
347
|
>>> automl_obj.leaderboard()
|
|
348
|
+
|
|
349
|
+
# Run predict on test data using best performing model.
|
|
350
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
351
|
+
>>> prediction
|
|
352
|
+
|
|
353
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
354
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
355
|
+
>>> performance_metrics
|
|
306
356
|
|
|
307
357
|
# Example 5 : Run AutoML for multiclass classification problem with maximum runtime.
|
|
308
358
|
# Scenario : Predict the species of iris flower based on different factors.
|
|
309
359
|
# Run AutoML to get the best performing model in specified time.
|
|
310
360
|
|
|
361
|
+
# Split the data into train and test.
|
|
362
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
363
|
+
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
364
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
365
|
+
|
|
311
366
|
# Create instance of AutoML.
|
|
312
367
|
>>> automl_obj = AutoML(verbose=2,
|
|
313
368
|
>>> exclude="xgboost",
|
|
314
369
|
>>> max_runtime_secs=500,
|
|
315
370
|
>>> max_models=3)
|
|
371
|
+
|
|
316
372
|
# Fit the data.
|
|
317
|
-
>>> automl_obj.fit(
|
|
318
|
-
|
|
319
|
-
# Run predict with best performing model.
|
|
320
|
-
>>> prediction = automl_obj.predict()
|
|
321
|
-
>>> prediction
|
|
322
|
-
|
|
323
|
-
# Run predict with second best performing model.
|
|
324
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
325
|
-
>>> prediction
|
|
326
|
-
|
|
373
|
+
>>> automl_obj.fit(iris_train, iris_train.species)
|
|
374
|
+
|
|
327
375
|
# Display leaderboard.
|
|
328
376
|
>>> automl_obj.leaderboard()
|
|
329
377
|
|
|
330
378
|
# Display best performing model.
|
|
331
|
-
>>> automl_obj.leader()
|
|
379
|
+
>>> automl_obj.leader()
|
|
380
|
+
|
|
381
|
+
# Run predict on test data using best performing model.
|
|
382
|
+
>>> prediction = automl_obj.predict(iris_test)
|
|
383
|
+
>>> prediction
|
|
384
|
+
|
|
385
|
+
# Run predict on test data using second best performing model.
|
|
386
|
+
>>> prediction = automl_obj.predict(iris_test, rank=2)
|
|
387
|
+
>>> prediction
|
|
388
|
+
|
|
389
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
390
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test)
|
|
391
|
+
>>> performance_metrics
|
|
392
|
+
|
|
393
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
394
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test, 4)
|
|
395
|
+
>>> performance_metrics
|
|
332
396
|
"""
|
|
333
397
|
# Appending arguments to list for validation
|
|
334
398
|
arg_info_matrix = []
|
|
@@ -339,9 +403,9 @@ class AutoML:
|
|
|
339
403
|
"decision_forest", "xgboost"]])
|
|
340
404
|
arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
|
|
341
405
|
arg_info_matrix.append(["max_runtime_secs", max_runtime_secs, True, (int, float)])
|
|
342
|
-
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2",
|
|
343
|
-
|
|
344
|
-
|
|
406
|
+
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", "MAE", "MSE", "MSLE",
|
|
407
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
408
|
+
"ME", "EV", "MPD", "MGD",
|
|
345
409
|
'MICRO-F1','MACRO-F1',
|
|
346
410
|
'MICRO-RECALL','MACRO-RECALL',
|
|
347
411
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
@@ -350,13 +414,21 @@ class AutoML:
|
|
|
350
414
|
arg_info_matrix.append(["stopping_tolerance", stopping_tolerance, True, (float, int)])
|
|
351
415
|
arg_info_matrix.append(["max_models", max_models, True, (int)])
|
|
352
416
|
arg_info_matrix.append(["custom_config_file", custom_config_file, True, (str), True])
|
|
353
|
-
|
|
417
|
+
|
|
418
|
+
volatile = kwargs.get('volatile', False)
|
|
419
|
+
persist = kwargs.get('persist', False)
|
|
420
|
+
|
|
421
|
+
arg_info_matrix.append(["volatile", volatile, True, (bool)])
|
|
422
|
+
arg_info_matrix.append(["persist", persist, True, (bool)])
|
|
354
423
|
|
|
355
424
|
# Validate argument types
|
|
356
425
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
357
426
|
# Either include or exclude can be used.
|
|
358
427
|
if include is not None or exclude is not None:
|
|
359
428
|
_Validators._validate_mutually_exclusive_arguments(include, "include", exclude, "exclude")
|
|
429
|
+
# Either volatile or persist can be used.
|
|
430
|
+
if volatile and persist:
|
|
431
|
+
_Validators._validate_mutually_exclusive_arguments(volatile, "volatile", persist, "persist")
|
|
360
432
|
# Validate mutually inclusive arguments
|
|
361
433
|
_Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric", stopping_tolerance, "stopping_tolerance")
|
|
362
434
|
# Validate lower range for max_models
|
|
@@ -391,6 +463,9 @@ class AutoML:
|
|
|
391
463
|
self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
|
|
392
464
|
self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
|
|
393
465
|
self._is_fit_called = False
|
|
466
|
+
self._is_load_model_called = False
|
|
467
|
+
self.kwargs = kwargs
|
|
468
|
+
self.table_name_mapping={}
|
|
394
469
|
|
|
395
470
|
@collect_queryband(queryband="AutoML_fit")
|
|
396
471
|
def fit(self,
|
|
@@ -489,7 +564,9 @@ class AutoML:
|
|
|
489
564
|
_Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
|
|
490
565
|
else:
|
|
491
566
|
if self.stopping_metric is not None:
|
|
492
|
-
permitted_values = ["R2",
|
|
567
|
+
permitted_values = ["R2", "MAE", "MSE", "MSLE",
|
|
568
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
569
|
+
"ME", "EV", "MPD", "MGD"]
|
|
493
570
|
_Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
|
|
494
571
|
|
|
495
572
|
if not self.is_classification_type():
|
|
@@ -514,40 +591,39 @@ class AutoML:
|
|
|
514
591
|
clf = task_cls(self.data, self.target_column, self.custom_data)
|
|
515
592
|
|
|
516
593
|
self.model_info, self.leader_board, self.target_count, self.target_label, \
|
|
517
|
-
self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
594
|
+
self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
|
|
595
|
+
model_list = self.model_list,
|
|
596
|
+
auto = self.auto,
|
|
597
|
+
verbose = self.verbose,
|
|
598
|
+
max_runtime_secs = self.max_runtime_secs,
|
|
599
|
+
stopping_metric = self.stopping_metric,
|
|
600
|
+
stopping_tolerance = self.stopping_tolerance,
|
|
601
|
+
max_models = self.max_models,
|
|
602
|
+
**self.kwargs)
|
|
525
603
|
|
|
526
604
|
# Model Evaluation Phase
|
|
527
605
|
self.m_evaluator = _ModelEvaluator(self.model_info,
|
|
528
606
|
self.target_column,
|
|
529
607
|
self.task_type)
|
|
530
608
|
|
|
531
|
-
@collect_queryband(queryband="AutoML_predict")
|
|
609
|
+
@collect_queryband(queryband="AutoML_predict")
|
|
532
610
|
def predict(self,
|
|
533
|
-
data
|
|
534
|
-
rank = 1
|
|
611
|
+
data,
|
|
612
|
+
rank = 1,
|
|
613
|
+
use_loaded_models = False):
|
|
535
614
|
"""
|
|
536
615
|
DESCRIPTION:
|
|
537
|
-
Function generates prediction on
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
and performance metrics, otherwise displays only prediction.
|
|
616
|
+
Function generates prediction on data using model rank in
|
|
617
|
+
leaderboard.
|
|
618
|
+
Note:
|
|
619
|
+
* If both fit and load methods are called before predict, then the models from fit will be used
|
|
620
|
+
for prediction by default unless 'use_loaded_models' is set to True in predict.
|
|
543
621
|
|
|
544
622
|
PARAMETERS:
|
|
545
623
|
data:
|
|
546
|
-
|
|
547
|
-
Specifies the dataset on which prediction
|
|
548
|
-
|
|
549
|
-
When "data" is not specified default test data is used. Default
|
|
550
|
-
test data is the dataset generated at the time of training.
|
|
624
|
+
Required Argument.
|
|
625
|
+
Specifies the dataset on which prediction needs to be generated
|
|
626
|
+
using model rank in leaderboard.
|
|
551
627
|
Types: teradataml DataFrame
|
|
552
628
|
|
|
553
629
|
rank:
|
|
@@ -555,6 +631,12 @@ class AutoML:
|
|
|
555
631
|
Specifies the rank of the model in the leaderboard to be used for prediction.
|
|
556
632
|
Default Value: 1
|
|
557
633
|
Types: int
|
|
634
|
+
|
|
635
|
+
use_loaded_models:
|
|
636
|
+
Optional Argument.
|
|
637
|
+
Specifies whether to use loaded models from database for prediction or not.
|
|
638
|
+
Default Value: False
|
|
639
|
+
Types: bool
|
|
558
640
|
|
|
559
641
|
RETURNS:
|
|
560
642
|
Pandas DataFrame with predictions.
|
|
@@ -568,174 +650,1099 @@ class AutoML:
|
|
|
568
650
|
# Perform fit() operation on the "automl_obj".
|
|
569
651
|
# Perform predict() operation on the "automl_obj".
|
|
570
652
|
|
|
571
|
-
# Example 1: Run predict
|
|
572
|
-
>>> prediction = automl_obj.predict()
|
|
573
|
-
>>> prediction
|
|
574
|
-
|
|
575
|
-
# Example 2: Run predict with second best performing model.
|
|
576
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
577
|
-
>>> prediction
|
|
578
|
-
|
|
579
|
-
# Example 3: Run predict for new test data with best performing model.
|
|
653
|
+
# Example 1: Run predict on test data using best performing model.
|
|
580
654
|
>>> prediction = automl_obj.predict(admissions_test)
|
|
581
655
|
>>> prediction
|
|
582
656
|
|
|
583
|
-
# Example
|
|
657
|
+
# Example 2: Run predict on test data using second best performing model.
|
|
584
658
|
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
585
659
|
>>> prediction
|
|
660
|
+
|
|
661
|
+
# Example 3: Run predict on test data using loaded model.
|
|
662
|
+
>>> automl_obj.load("model_table")
|
|
663
|
+
>>> prediction = automl_obj.predict(admissions_test, rank=3)
|
|
664
|
+
>>> prediction
|
|
665
|
+
|
|
666
|
+
# Example 4: Run predict on test data using loaded model when fit is also called.
|
|
667
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
668
|
+
>>> automl_obj.load("model_table")
|
|
669
|
+
>>> prediction = automl_obj.predict(admissions_test, rank=3, use_loaded_models=True)
|
|
670
|
+
>>> prediction
|
|
586
671
|
"""
|
|
587
|
-
if not
|
|
588
|
-
|
|
672
|
+
# Checking if fit or load model is called before predict, If not raise error
|
|
673
|
+
if not self._is_fit_called and not self._is_load_model_called:
|
|
589
674
|
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
590
675
|
"'predict' method", \
|
|
591
|
-
"'fit' method must be called before" \
|
|
676
|
+
"'fit' or 'load' method must be called before" \
|
|
592
677
|
" running predict.")
|
|
593
678
|
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
679
|
+
|
|
594
680
|
# Appending predict arguments to list for validation.
|
|
595
681
|
arg_info_pred_matrix = []
|
|
596
|
-
arg_info_pred_matrix.append(["data", data,
|
|
682
|
+
arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
|
|
597
683
|
arg_info_pred_matrix.append(["rank", rank, True, (int), True])
|
|
684
|
+
arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
|
|
598
685
|
|
|
599
686
|
# Validate argument types
|
|
600
687
|
_Validators._validate_function_arguments(arg_info_pred_matrix)
|
|
688
|
+
|
|
689
|
+
# Run predict using loaded model
|
|
690
|
+
if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
|
|
691
|
+
# Validate range for model rank
|
|
692
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
693
|
+
ubound=self.loaded_models_info.RANK.max(),
|
|
694
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
695
|
+
return self._run_loaded_model(data, rank)
|
|
696
|
+
|
|
601
697
|
# Validate range for model rank
|
|
602
698
|
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
603
|
-
ubound=self.leader_board.
|
|
699
|
+
ubound=self.leader_board.RANK.max(),
|
|
604
700
|
lbound_inclusive=True, ubound_inclusive=True)
|
|
605
701
|
|
|
606
|
-
# Setting
|
|
607
|
-
self.
|
|
608
|
-
#
|
|
609
|
-
|
|
702
|
+
# Setting target column indicator to default value, i.e., True.
|
|
703
|
+
self.target_column_ind = True
|
|
704
|
+
# Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
|
|
705
|
+
rank = rank-1
|
|
706
|
+
|
|
707
|
+
# Setting indicator to False if target column doesn't exist
|
|
708
|
+
if self.target_column not in data.columns:
|
|
709
|
+
self.target_column_ind = False
|
|
710
|
+
|
|
711
|
+
# Checking if data is already transformed before or not
|
|
712
|
+
data_node_id = data._nodeid
|
|
713
|
+
if not self.table_name_mapping.get(data_node_id):
|
|
714
|
+
# At first data transformation will be performed on raw test data
|
|
715
|
+
# then evaluation will happen.
|
|
716
|
+
self.transform_data(data)
|
|
717
|
+
else:
|
|
718
|
+
print("\nSkipping data transformation as data is already transformed.")
|
|
719
|
+
|
|
720
|
+
# Generating prediction
|
|
721
|
+
pred = self.m_evaluator.model_evaluation(rank = rank,
|
|
722
|
+
table_name_mapping = self.table_name_mapping,
|
|
723
|
+
data_node_id = data_node_id,
|
|
724
|
+
target_column_ind = self.target_column_ind)
|
|
725
|
+
|
|
726
|
+
# Checking if problem type is classification and target label is present.
|
|
727
|
+
if self.is_classification_type() and self.target_label is not None:
|
|
728
|
+
# Displaying target column labels
|
|
729
|
+
tar_dct = {}
|
|
730
|
+
print('\nTarget Column Mapping:')
|
|
731
|
+
# Iterating rows
|
|
732
|
+
for row in self.target_label.result.itertuples():
|
|
733
|
+
# Retrieving the category names of encoded target column
|
|
734
|
+
# row[1] contains the original name of category
|
|
735
|
+
# row[2] contains the encoded value
|
|
736
|
+
if row[1] != 'TD_CATEGORY_COUNT':
|
|
737
|
+
tar_dct[row[1]] = row[2]
|
|
738
|
+
|
|
739
|
+
for key, value in tar_dct.items():
|
|
740
|
+
print(f"{key}: {value}")
|
|
741
|
+
|
|
742
|
+
# Renaming probability column if any
|
|
743
|
+
prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
|
|
744
|
+
if len(prob_lst) > 0:
|
|
745
|
+
rename_dict ={}
|
|
746
|
+
for col in pred.result.columns:
|
|
747
|
+
if col not in prob_lst:
|
|
748
|
+
rename_dict[col] = getattr(pred.result, col)
|
|
749
|
+
else:
|
|
750
|
+
indx = int(col.split('_')[1])
|
|
751
|
+
rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
|
|
752
|
+
rename_dict['drop_columns'] = True
|
|
753
|
+
pred.result = pred.result.assign(**rename_dict)
|
|
754
|
+
|
|
755
|
+
print("\nPrediction : ")
|
|
756
|
+
print(pred.result)
|
|
757
|
+
|
|
758
|
+
if self.target_column_ind:
|
|
759
|
+
prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
|
|
760
|
+
probability_column = 'prob_1'
|
|
761
|
+
# Displaying confusion matrix and ROC-AUC for classification problem
|
|
762
|
+
if self.is_classification_type():
|
|
763
|
+
print_data = lambda data: print(data) if _is_terminal() else display(data)
|
|
764
|
+
# Displaying ROC-AUC for binary classification
|
|
765
|
+
if self.target_count == 2:
|
|
766
|
+
fit_params = {
|
|
767
|
+
"probability_column" : probability_column,
|
|
768
|
+
"observation_column" : self.target_column,
|
|
769
|
+
"positive_class" : "1",
|
|
770
|
+
"data" : pred.result
|
|
771
|
+
}
|
|
772
|
+
# Fitting ROC
|
|
773
|
+
roc_out = ROC(**fit_params)
|
|
774
|
+
print("\nROC-AUC : ")
|
|
775
|
+
print_data(roc_out.result)
|
|
776
|
+
print_data(roc_out.output_data)
|
|
777
|
+
|
|
778
|
+
# Displaying confusion matrix for binary and multiclass classification
|
|
779
|
+
prediction_df=pred.result.to_pandas()
|
|
780
|
+
target_col = self.target_column
|
|
781
|
+
print("\nConfusion Matrix : ")
|
|
782
|
+
print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
|
|
783
|
+
|
|
784
|
+
# Returning prediction
|
|
785
|
+
return pred.result
|
|
786
|
+
|
|
787
|
+
@collect_queryband(queryband="AutoML_evaluate")
|
|
788
|
+
def evaluate(self,
|
|
789
|
+
data,
|
|
790
|
+
rank = 1,
|
|
791
|
+
use_loaded_models = False
|
|
792
|
+
):
|
|
793
|
+
"""
|
|
794
|
+
DESCRIPTION:
|
|
795
|
+
Function evaluates on data using model rank in leaderboard
|
|
796
|
+
and generates performance metrics.
|
|
797
|
+
Note:
|
|
798
|
+
* If both fit and load method are called before evaluate, then fit method model will be used
|
|
799
|
+
for evaluation by default unless 'use_loaded_models' is set to True in evaluate.
|
|
800
|
+
|
|
801
|
+
PARAMETERS:
|
|
802
|
+
data:
|
|
803
|
+
Required Argument.
|
|
804
|
+
Specifies the dataset on which performance metrics need to be generated.
|
|
805
|
+
Types: teradataml DataFrame
|
|
806
|
+
|
|
807
|
+
Note:
|
|
808
|
+
* Target column used for generating model is mandatory in "data" for evaluation.
|
|
809
|
+
|
|
810
|
+
rank:
|
|
811
|
+
Optional Argument.
|
|
812
|
+
Specifies the rank of the model available in the leaderboard to be used for evaluation.
|
|
813
|
+
Default Value: 1
|
|
814
|
+
Types: int
|
|
815
|
+
|
|
816
|
+
use_loaded_models:
|
|
817
|
+
Optional Argument.
|
|
818
|
+
Specifies whether to use loaded models from database for evaluation or not.
|
|
819
|
+
Default Value: False
|
|
820
|
+
Types: bool
|
|
821
|
+
|
|
822
|
+
RETURNS:
|
|
823
|
+
Pandas DataFrame with performance metrics.
|
|
824
|
+
|
|
825
|
+
RAISES:
|
|
826
|
+
TeradataMlException.
|
|
827
|
+
|
|
828
|
+
EXAMPLES:
|
|
829
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
830
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
831
|
+
# Perform fit() operation on the "automl_obj".
|
|
832
|
+
# Perform evaluate() operation on the "automl_obj".
|
|
833
|
+
|
|
834
|
+
# Example 1: Run evaluate on test data using best performing model.
|
|
835
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test)
|
|
836
|
+
>>> performance_metrics
|
|
837
|
+
|
|
838
|
+
# Example 2: Run evaluate on test data using second best performing model.
|
|
839
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test, rank=2)
|
|
840
|
+
>>> performance_metrics
|
|
841
|
+
|
|
842
|
+
# Example 3: Run evaluate on test data using loaded model.
|
|
843
|
+
>>> automl_obj.load("model_table")
|
|
844
|
+
>>> evaluation = automl_obj.evaluate(admissions_test, rank=3)
|
|
845
|
+
>>> evaluation
|
|
846
|
+
|
|
847
|
+
# Example 4: Run evaluate on test data using loaded model when fit is also called.
|
|
848
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
849
|
+
>>> automl_obj.load("model_table")
|
|
850
|
+
>>> evaluation = automl_obj.evaluate(admissions_test, rank=3, use_loaded_models=True)
|
|
851
|
+
>>> evaluation
|
|
852
|
+
"""
|
|
853
|
+
if not self._is_fit_called and not self._is_load_model_called:
|
|
854
|
+
# raise ValueError("fit() method must be called before evaluating.")
|
|
855
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
856
|
+
"'evaluate' method", \
|
|
857
|
+
"'fit' or 'load' method must be called before" \
|
|
858
|
+
" running evaluate.")
|
|
859
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
860
|
+
# Appending evaluate arguments to list for validation.
|
|
861
|
+
arg_info_pred_matrix = []
|
|
862
|
+
arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
|
|
863
|
+
arg_info_pred_matrix.append(["rank", rank, True, (int), True])
|
|
864
|
+
arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
|
|
865
|
+
|
|
866
|
+
# Validate argument types
|
|
867
|
+
_Validators._validate_function_arguments(arg_info_pred_matrix)
|
|
868
|
+
|
|
869
|
+
# Run evaluate using loaded model
|
|
870
|
+
if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
|
|
871
|
+
# Validate range for model rank
|
|
872
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
873
|
+
ubound=self.loaded_models_info.RANK.max(),
|
|
874
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
875
|
+
return self._run_loaded_model(data, rank, output_type="evaluate")
|
|
876
|
+
|
|
877
|
+
# Validate range for model rank
|
|
878
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
879
|
+
ubound=self.leader_board.RANK.max(),
|
|
880
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
881
|
+
|
|
610
882
|
# Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
|
|
611
883
|
rank = rank-1
|
|
884
|
+
|
|
885
|
+
# Raising exception if target column is not present in data
|
|
886
|
+
# as it is required for evaluation.
|
|
887
|
+
if self.target_column not in data.columns:
|
|
888
|
+
raise TeradataMlException(
|
|
889
|
+
Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
|
|
890
|
+
MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
|
|
891
|
+
|
|
892
|
+
# Checking if data is already transformed before or not
|
|
893
|
+
data_node_id = data._nodeid
|
|
894
|
+
if not self.table_name_mapping.get(data_node_id):
|
|
895
|
+
# At first data transformation will be performed on raw test data
|
|
896
|
+
# then evaluation will happen.
|
|
897
|
+
self.transform_data(data)
|
|
898
|
+
else:
|
|
899
|
+
print("\nSkipping data transformation as data is already transformed.")
|
|
900
|
+
|
|
901
|
+
metrics = self.m_evaluator.model_evaluation(rank = rank,
|
|
902
|
+
table_name_mapping=self.table_name_mapping,
|
|
903
|
+
data_node_id = data_node_id,
|
|
904
|
+
get_metrics = True)
|
|
905
|
+
|
|
906
|
+
# Checking if problem type is classification and target label is present.
|
|
907
|
+
if self.is_classification_type() and self.target_label is not None:
|
|
908
|
+
# Displaying target column labels
|
|
909
|
+
tar_dct = {}
|
|
910
|
+
print('\nTarget Column Mapping:')
|
|
911
|
+
# Iterating rows
|
|
912
|
+
for row in self.target_label.result.itertuples():
|
|
913
|
+
# Retrieving the category names of encoded target column
|
|
914
|
+
# row[1] contains the original name of category
|
|
915
|
+
# row[2] contains the encoded value
|
|
916
|
+
if row[1] != 'TD_CATEGORY_COUNT':
|
|
917
|
+
tar_dct[row[1]] = row[2]
|
|
918
|
+
|
|
919
|
+
for key, value in tar_dct.items():
|
|
920
|
+
print(f"{key}: {value}")
|
|
921
|
+
|
|
922
|
+
# Showing performance metrics
|
|
923
|
+
print("\nPerformance Metrics : ")
|
|
924
|
+
print(metrics.result)
|
|
925
|
+
if self.is_classification_type():
|
|
926
|
+
print("-"*80)
|
|
927
|
+
print(metrics.output_data)
|
|
928
|
+
|
|
929
|
+
# Returning performance metrics
|
|
930
|
+
return metrics.result
|
|
931
|
+
|
|
932
|
+
def transform_data(self,
|
|
933
|
+
data,
|
|
934
|
+
data_params = None,
|
|
935
|
+
auto = None,
|
|
936
|
+
verbose = None,
|
|
937
|
+
target_column_ind = None):
|
|
938
|
+
"""
|
|
939
|
+
DESCRIPTION:
|
|
940
|
+
Function transforms the data based on the data transformation parameters
|
|
941
|
+
generated during the fit phase.
|
|
942
|
+
|
|
943
|
+
PARAMETERS:
|
|
944
|
+
data:
|
|
945
|
+
Required Argument.
|
|
946
|
+
Specifies the dataset to be transformed.
|
|
947
|
+
Types: teradataml DataFrame
|
|
948
|
+
|
|
949
|
+
data_params:
|
|
950
|
+
Optional Argument.
|
|
951
|
+
Specifies the data transformation parameters.
|
|
952
|
+
Default Value: None
|
|
953
|
+
Types: dict
|
|
954
|
+
|
|
955
|
+
auto:
|
|
956
|
+
Optional Argument.
|
|
957
|
+
Specifies whether AutoML was run in auto or custom mode.
|
|
958
|
+
Default Value: None
|
|
959
|
+
Types: bool
|
|
960
|
+
|
|
961
|
+
verbose:
|
|
962
|
+
Optional Argument.
|
|
963
|
+
Specifies the verbosity level.
|
|
964
|
+
Default Value: None
|
|
965
|
+
Types: int
|
|
966
|
+
|
|
967
|
+
target_column_ind:
|
|
968
|
+
Optional Argument.
|
|
969
|
+
Specifies whether target column is present in data or not.
|
|
970
|
+
Default Value: None
|
|
971
|
+
Types: bool
|
|
972
|
+
|
|
973
|
+
RETURNS:
|
|
974
|
+
None
|
|
975
|
+
"""
|
|
976
|
+
# Creating instance of DataTransformation
|
|
977
|
+
data_transform_instance = _DataTransformation(data = data,
|
|
978
|
+
data_transformation_params=data_params if data_params is not None else \
|
|
979
|
+
self.data_transformation_params,
|
|
980
|
+
auto=auto if data_params is not None else self.auto,
|
|
981
|
+
verbose=verbose if verbose is not None else self.verbose,
|
|
982
|
+
target_column_ind=target_column_ind if target_column_ind is not None else \
|
|
983
|
+
self.target_column_ind,
|
|
984
|
+
table_name_mapping=self.table_name_mapping)
|
|
985
|
+
|
|
986
|
+
# Storing mapping of table names for transformed data
|
|
987
|
+
self.table_name_mapping = data_transform_instance.data_transformation()
|
|
988
|
+
|
|
989
|
+
@collect_queryband(queryband="AutoML_leaderboard")
|
|
990
|
+
def leaderboard(self):
|
|
991
|
+
"""
|
|
992
|
+
DESCRIPTION:
|
|
993
|
+
Function displays leaderboard.
|
|
994
|
+
|
|
995
|
+
RETURNS:
|
|
996
|
+
Pandas DataFrame with Leaderboard information.
|
|
997
|
+
|
|
998
|
+
RAISES:
|
|
999
|
+
TeradataMlException.
|
|
1000
|
+
|
|
1001
|
+
EXAMPLES:
|
|
1002
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1003
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1004
|
+
# Perform fit() operation on the "automl_obj".
|
|
1005
|
+
# Generate leaderboard using leaderboard() method on "automl_obj".
|
|
1006
|
+
>>> automl_obj.leaderboard()
|
|
1007
|
+
"""
|
|
1008
|
+
if not self._is_fit_called:
|
|
1009
|
+
# raise ValueError("fit() method must be called before generating leaderboard.")
|
|
1010
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1011
|
+
"'leaderboard' method", \
|
|
1012
|
+
"'fit' method must be called before" \
|
|
1013
|
+
" generating leaderboard.")
|
|
1014
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1015
|
+
return self.leader_board
|
|
1016
|
+
|
|
1017
|
+
@collect_queryband(queryband="AutoML_leader")
|
|
1018
|
+
def leader(self):
|
|
1019
|
+
"""
|
|
1020
|
+
DESCRIPTION:
|
|
1021
|
+
Function displays best performing model.
|
|
1022
|
+
|
|
1023
|
+
RETURNS:
|
|
1024
|
+
None
|
|
1025
|
+
|
|
1026
|
+
RAISES:
|
|
1027
|
+
TeradataMlException.
|
|
1028
|
+
|
|
1029
|
+
EXAMPLES:
|
|
1030
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1031
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1032
|
+
# Perform fit() operation on the "automl_obj".
|
|
1033
|
+
# Generate leaderboard using leaderboard() method on "automl_obj".
|
|
1034
|
+
# Display best performing model using leader() method on "automl_obj".
|
|
1035
|
+
>>> automl_obj.leader()
|
|
1036
|
+
"""
|
|
1037
|
+
if not self._is_fit_called:
|
|
1038
|
+
# raise ValueError("fit() method must be called before generating leader.")
|
|
1039
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1040
|
+
"'leader' method", \
|
|
1041
|
+
"'fit' method must be called before" \
|
|
1042
|
+
" generating leader.")
|
|
1043
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1044
|
+
record = self.leader_board
|
|
1045
|
+
if not _is_terminal():
|
|
1046
|
+
display(record[record['RANK'] == 1])
|
|
1047
|
+
else:
|
|
1048
|
+
print(record[record['RANK'] == 1])
|
|
1049
|
+
|
|
1050
|
+
@collect_queryband(queryband="AutoML_hyperparameter")
|
|
1051
|
+
def model_hyperparameters(self,
|
|
1052
|
+
rank=1,
|
|
1053
|
+
use_loaded_models=False):
|
|
1054
|
+
"""
|
|
1055
|
+
DESCRIPTION:
|
|
1056
|
+
Get hyperparameters of the model based on rank in leaderboard.
|
|
1057
|
+
Note:
|
|
1058
|
+
* If both the fit() and load() methods are invoked before calling model_hyperparameters(),
|
|
1059
|
+
by default hyperparameters are retrieved from the fit leaderboard.
|
|
1060
|
+
To retrieve hyperparameters from the loaded models, set "use_loaded_models" to True in the model_hyperparameters call.
|
|
1061
|
+
|
|
1062
|
+
PARAMETERS:
|
|
1063
|
+
rank:
|
|
1064
|
+
Optional Argument.
|
|
1065
|
+
Specifies the rank of the model in the leaderboard.
|
|
1066
|
+
Default Value: 1
|
|
1067
|
+
Types: int
|
|
1068
|
+
|
|
1069
|
+
use_loaded_models:
|
|
1070
|
+
Optional Argument.
|
|
1071
|
+
Specifies whether to use loaded models from database to get hyperparameters or not.
|
|
1072
|
+
Default Value: False
|
|
1073
|
+
Types: bool
|
|
1074
|
+
|
|
1075
|
+
RETURNS:
|
|
1076
|
+
Dictionary, containing hyperparameters.
|
|
1077
|
+
|
|
1078
|
+
RAISES:
|
|
1079
|
+
TeradataMlException.
|
|
1080
|
+
|
|
1081
|
+
EXAMPLES:
|
|
1082
|
+
# Example 1: Get hyperparameters of the model using fit models.
|
|
1083
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1084
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1085
|
+
# Perform fit() operation on the "automl_obj".
|
|
1086
|
+
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1087
|
+
>>> automl_obj = AutoML(task_type="Classification")
|
|
1088
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
1089
|
+
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1090
|
+
|
|
1091
|
+
# Example 2: Get hyperparameters of the model using loaded models.
|
|
1092
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1093
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1094
|
+
# Load models from the specified table.
|
|
1095
|
+
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1096
|
+
>>> automl_obj = AutoML()
|
|
1097
|
+
>>> automl_obj.load("model_table")
|
|
1098
|
+
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1099
|
+
|
|
1100
|
+
# Example 3: Get hyperparameters of the model when both fit and load method are called.
|
|
1101
|
+
# Create an instance of the AutoML called "automl_obj"
|
|
1102
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1103
|
+
# Fit the data.
|
|
1104
|
+
# Load models from the specified table.
|
|
1105
|
+
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1106
|
+
>>> automl_obj = AutoML(task_type="Classification")
|
|
1107
|
+
>>> automl_obj.fit(admissions_train, "admitted")
|
|
1108
|
+
>>> automl_obj.load("model_table")
|
|
1109
|
+
|
|
1110
|
+
# Get hyperparameters of the model using loaded models.
|
|
1111
|
+
>>> automl_obj.model_hyperparameters(rank=1, use_loaded_models=True)
|
|
1112
|
+
# Get hyperparameters of the model using fit models.
|
|
1113
|
+
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1114
|
+
"""
|
|
1115
|
+
|
|
1116
|
+
if not self._is_fit_called and not self._is_load_model_called:
|
|
1117
|
+
# raise ValueError("fit() or load() method must be called before getting hyperparameters.")
|
|
1118
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1119
|
+
"'model_hyperparameters' method",
|
|
1120
|
+
"No models available to get hyperparameters. " \
|
|
1121
|
+
"Run 'fit()' or 'load()' methods to get models.")
|
|
1122
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1123
|
+
|
|
1124
|
+
arg_info_matrix = []
|
|
1125
|
+
arg_info_matrix.append(["rank", rank, True, (int), True])
|
|
1126
|
+
arg_info_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
|
|
1127
|
+
|
|
1128
|
+
# Validate argument types
|
|
1129
|
+
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1130
|
+
|
|
1131
|
+
leaderboard = None
|
|
1132
|
+
if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
|
|
1133
|
+
leaderboard = self.loaded_models_info
|
|
1134
|
+
else:
|
|
1135
|
+
leaderboard = self.model_info
|
|
1136
|
+
|
|
1137
|
+
# Validate range for model rank from loaded models
|
|
1138
|
+
_Validators._validate_argument_range(rank, "rank", lbound=1,
|
|
1139
|
+
ubound=leaderboard.RANK.max(),
|
|
1140
|
+
lbound_inclusive=True, ubound_inclusive=True)
|
|
1141
|
+
hyperparams = leaderboard.loc[leaderboard['RANK'] == rank, 'PARAMETERS'].values[0]
|
|
1142
|
+
|
|
1143
|
+
# Deserializing hyperparameters
|
|
1144
|
+
hyperparams = ast.literal_eval(hyperparams)
|
|
1145
|
+
|
|
1146
|
+
# Removing 'data' from hyperparameters
|
|
1147
|
+
keys_to_remove = ['input_columns', 'data', 'train_data', 'test_data']
|
|
1148
|
+
for key in keys_to_remove:
|
|
1149
|
+
hyperparams.pop(key, None)
|
|
1150
|
+
|
|
1151
|
+
return hyperparams
|
|
1152
|
+
|
|
1153
|
+
@collect_queryband(queryband="AutoML_load")
|
|
1154
|
+
def load(self,
|
|
1155
|
+
table_name):
|
|
1156
|
+
"""
|
|
1157
|
+
DESCRIPTION:
|
|
1158
|
+
Function loads models information from the specified table.
|
|
1159
|
+
|
|
1160
|
+
PARAMETERS:
|
|
1161
|
+
table_name:
|
|
1162
|
+
Required Argument.
|
|
1163
|
+
Specifies the table name from which models are to be loaded.
|
|
1164
|
+
Types: str
|
|
1165
|
+
|
|
1166
|
+
RETURNS:
|
|
1167
|
+
Pandas DataFrame with loaded models information.
|
|
1168
|
+
|
|
1169
|
+
RAISES:
|
|
1170
|
+
TeradataMlException.
|
|
1171
|
+
|
|
1172
|
+
EXAMPLES:
|
|
1173
|
+
# Create an instance of the AutoML called "obj"
|
|
1174
|
+
# by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
|
|
1175
|
+
>>> obj = AutoML()
|
|
1176
|
+
# Load models from the specified table.
|
|
1177
|
+
>>> tab = obj.load("model_table")
|
|
1178
|
+
"""
|
|
1179
|
+
# Appending arguments to list for validation
|
|
1180
|
+
arg_info_matrix = []
|
|
1181
|
+
arg_info_matrix.append(["table_name", table_name, True, (str), True])
|
|
1182
|
+
|
|
1183
|
+
# Validate argument types
|
|
1184
|
+
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1185
|
+
|
|
1186
|
+
# Loading models
|
|
1187
|
+
self.loaded_models_info = DataFrame(table_name).to_pandas()
|
|
1188
|
+
|
|
1189
|
+
self._load_data_transform_params()
|
|
1190
|
+
|
|
1191
|
+
self._is_load_model_called = True
|
|
1192
|
+
|
|
1193
|
+
return self.loaded_models_info.drop(['RESULT_TABLE', 'PARAMETERS'], axis=1)
|
|
1194
|
+
|
|
1195
|
+
def _load_data_transform_params(self):
|
|
1196
|
+
"""
|
|
1197
|
+
DESCRIPTION:
|
|
1198
|
+
Internal Function loads data transformation parameters from the specified table.
|
|
1199
|
+
"""
|
|
1200
|
+
from sklearn.decomposition import PCA
|
|
1201
|
+
|
|
1202
|
+
# Getting data transformation row
|
|
1203
|
+
data_transform_row = self.loaded_models_info[self.loaded_models_info['RANK'] == -1].iloc[0]
|
|
1204
|
+
|
|
1205
|
+
# Removing data transformation row and dropping 'DATA_PARAMS' column
|
|
1206
|
+
# from loaded models info
|
|
1207
|
+
self.loaded_models_info = self.loaded_models_info[self.loaded_models_info['RANK'] != -1]
|
|
1208
|
+
self.loaded_models_info.drop('DATA_PARAMS', axis=1, inplace=True)
|
|
1209
|
+
|
|
1210
|
+
# Loading data transformation parameters by deserializing
|
|
1211
|
+
buffer = BytesIO(data_transform_row['DATA_PARAMS'])
|
|
1212
|
+
data_params = joblib.load(buffer)
|
|
1213
|
+
|
|
1214
|
+
fit_obj_lst = json.loads(data_transform_row['PARAMETERS'])
|
|
1215
|
+
|
|
1216
|
+
# Generating Dataframe from table_names in data params
|
|
1217
|
+
# fit_obj_lst contain : ['one_hot_encoding_fit_obj', 'lasso_scale_fit_obj', 'pca_scale_fit_obj', imputation_fit_object]
|
|
1218
|
+
# Iterating over fit_obj_lst and converting table names to DataFrame
|
|
1219
|
+
for fit_obj_name in fit_obj_lst:
|
|
1220
|
+
if isinstance(data_params[fit_obj_name], dict):
|
|
1221
|
+
for key, val in data_params[fit_obj_name].items():
|
|
1222
|
+
# Key: automl transformation step name, val: table name
|
|
1223
|
+
data_params[fit_obj_name][key] = DataFrame(f'{val}')
|
|
1224
|
+
else:
|
|
1225
|
+
data_params[fit_obj_name] = DataFrame(f'{data_params[fit_obj_name]}')
|
|
1226
|
+
|
|
1227
|
+
# Manually deserializing and reconstructing PCA object
|
|
1228
|
+
load_pca_info = data_params['pca_fit_instance']
|
|
1229
|
+
pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
|
|
1230
|
+
pca.components_ = np.array(load_pca_info['components'])
|
|
1231
|
+
pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
|
|
1232
|
+
pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
|
|
1233
|
+
pca.mean_ = np.array(load_pca_info['mean'])
|
|
1234
|
+
pca.n_components_ = load_pca_info['n_components']
|
|
1235
|
+
pca.noise_variance_ = load_pca_info['noise_variance']
|
|
1236
|
+
pca.singular_values_ = np.array(load_pca_info['singular_values'])
|
|
1237
|
+
|
|
1238
|
+
data_params['pca_fit_instance'] = pca
|
|
1239
|
+
|
|
1240
|
+
self.loaded_data_transformation_params = data_params
|
|
1241
|
+
|
|
1242
|
+
def _validate_ranks(self, ranks):
|
|
1243
|
+
"""
|
|
1244
|
+
DESCRIPTION:
|
|
1245
|
+
Function validates the ranks argument.
|
|
1246
|
+
|
|
1247
|
+
PARAMETERS:
|
|
1248
|
+
ranks:
|
|
1249
|
+
Required Argument.
|
|
1250
|
+
Specifies the ranks for the models to be saved.
|
|
1251
|
+
Types: range
|
|
1252
|
+
|
|
1253
|
+
RAISES:
|
|
1254
|
+
TeradataMlException.
|
|
1255
|
+
"""
|
|
1256
|
+
start_rank, end_rank = ranks.start, ranks.stop
|
|
1257
|
+
|
|
1258
|
+
# Check if both start and end ranks are positive (non-zero) integers
|
|
1259
|
+
if not (start_rank > 0 and end_rank > 0):
|
|
1260
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1261
|
+
"'deploy' method", \
|
|
1262
|
+
"Provided start and end rank in 'ranks' "\
|
|
1263
|
+
"must be positive non-zero integers.")
|
|
1264
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1265
|
+
|
|
1266
|
+
# Check if start_rank is less than or equal to end_rank
|
|
1267
|
+
if start_rank > end_rank:
|
|
1268
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1269
|
+
"'deploy' method", \
|
|
1270
|
+
"Provided start rank in 'ranks' must be less than"\
|
|
1271
|
+
" or equal to end rank in 'ranks'.")
|
|
1272
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1273
|
+
|
|
1274
|
+
# check end rank is less than or equal to total models
|
|
1275
|
+
if end_rank > self.leader_board.RANK.max():
|
|
1276
|
+
err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
1277
|
+
"'deploy' method", \
|
|
1278
|
+
"Provided end rank in 'ranks' must be less than"\
|
|
1279
|
+
" or equal to total models available.")
|
|
1280
|
+
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1281
|
+
|
|
1282
|
+
return start_rank, end_rank
|
|
1283
|
+
|
|
1284
|
+
@collect_queryband(queryband="AutoML_deploy")
def deploy(self,
           table_name,
           top_n = 3,
           ranks = None
           ):
    """
    DESCRIPTION:
        Function saves models to the specified table name.
        Note:
            * If 'ranks' is provided, specified models in 'ranks' will be saved
              and ranks will be reassigned to specified models based
              on the order of the leaderboard, non-specified models will be ignored.

    PARAMETERS:
        table_name:
            Required Argument.
            Specifies the table name to which models information is to be saved.
            Types: str

        top_n:
            Optional Argument.
            Specifies the top n models to be saved.
            Note:
                * If 'ranks' is not provided, the function saves the top 'top_n' models.
                * If 'top_n' is less than 1 or greater than the number of
                  available models, a warning is raised and all available
                  models are saved.
            Default Value: 3
            Types: int

        ranks:
            Optional Argument.
            Specifies the ranks for the models to be saved.
            Note:
                * If 'ranks' is provided, then 'top_n' is ignored.
                * If 'ranks' is a range object, only its start and stop are
                  used: the rows of the model information from position
                  'start' through position 'stop' (1-based, both included)
                  are saved.
                * If 'ranks' is empty, the function saves the top 'top_n' models.
            Types: int or list of int or range object

    RETURNS:
        None

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        >>> obj = AutoML(task_type="Classification")
        >>> obj.fit(data = data, target_column = target_column)

        # Save top 3 models to the specified table.
        >>> obj.deploy("model_table")

        # Save top n models to the specified table.
        >>> obj.deploy("model_table", top_n=5)

        # Save models based on specified ranks to the specified table.
        >>> obj.deploy("model_table", ranks=[1, 3, 5])

        # Save models based on specified rank range to the specified table.
        >>> obj.deploy("model_table", ranks=range(2,6))
    """
    # raise Error if fit is not called
    if not self._is_fit_called:
        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'deploy' method",
                                   "'fit' method must be called before"
                                   " 'deploy'.")
        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

    # Appending arguments to list for validation
    arg_info_matrix = []
    arg_info_matrix.append(["table_name", table_name, True, (str), True])
    arg_info_matrix.append(["top_n", top_n, True, (int)])
    # A range object is validated separately by _validate_ranks().
    if not isinstance(ranks, range):
        arg_info_matrix.append(["ranks", ranks, True, (int, list)])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    if isinstance(ranks, int):
        ranks = [ranks]
    elif isinstance(ranks, range):
        start_rank, end_rank = self._validate_ranks(ranks)

    # 'top_n' is honoured only when no ranks are given.
    use_top_n = ranks is None or len(ranks) == 0

    if use_top_n:
        # If total models are greater than available models or less than 1
        try:
            _Validators._validate_argument_range(top_n, "top_n", lbound=1,
                                                 ubound=self.leader_board.RANK.max(),
                                                 lbound_inclusive=True, ubound_inclusive=True)
        except ValueError:
            msg = "\n'top_n' should be equal or less than the available models or greater than 0. " \
                  "Deploying all available models to the table."
            warnings.warn(message=msg, stacklevel=2)
            top_n = self.leader_board.shape[0]
    elif isinstance(ranks, list):
        # If ranks is provided, then validating the ranks elements
        for ele in ranks:
            _Validators._validate_argument_range(ele, "element in ranks", lbound=1,
                                                 ubound=self.leader_board.RANK.max(),
                                                 lbound_inclusive=True, ubound_inclusive=True)

    feature_selections = self.model_info['FEATURE_SELECTION'].unique().tolist()

    # Mapping feature selection to training data,
    # we are creating a dictionary with key as feature selection and
    # value as temporary training data table name, so that we can copy
    # temporary training data to permanent table.
    # Here's an example of mapping:
    # Example: {'lasso': 'ml__survived_lasso_1717475362789542',
    #           'rfe': 'ml__survived_rfe_1717474570567062',
    #           'pca': 'ml__survived_pca_1717475375119752'}
    # The mapping is captured before any row of model_info is updated below.
    fs_to_temp_table = {fs: self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs,
                                                'DATA_TABLE'].iloc[0]
                        for fs in feature_selections}

    # Feature selection -> permanent training data table, filled on demand.
    fs_to_data_dict = {}

    def _permanent_data_table(fs):
        # Copy the training data of a feature selection to a permanent table
        # the first time a deployed model needs it. Copying only on demand
        # avoids creating permanent tables that no deployed row refers to
        # and that therefore could never be found again for removal.
        if fs not in fs_to_data_dict:
            fs_to_data_dict[fs] = self._create_per_result_table(
                prefix='{}_{}'.format(self.target_column, fs),
                persist_result_table=fs_to_temp_table[fs])
        return fs_to_data_dict[fs]

    # Persist flag
    persist = self.kwargs.get('persist', False)
    # If ranks is provided, then saving models based on specified rank
    # in list will be prioritized over 'top_n'.
    if use_top_n:
        # Saving only top 'top_n' models
        # NOTE(review): this path updates self.model_info in place, while the
        # 'ranks' path below works on a copy - confirm this is intended.
        for index, row in self.model_info.iterrows():
            if index >= top_n:
                break
            self.model_info.loc[index, 'DATA_TABLE'] = _permanent_data_table(row['FEATURE_SELECTION'])
            if not persist:
                per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
                                                         persist_result_table=row['RESULT_TABLE'])
                self.model_info.loc[index, 'RESULT_TABLE'] = per_name
        sv_models = self.model_info.drop('model-obj', axis=1).head(top_n)
    else:
        if isinstance(ranks, range):
            # Saving models based on start and end rank.
            sv_models = self.model_info[start_rank-1:end_rank].copy()
        else:
            # Saving models based on specified rank in list
            sv_models = self.model_info[self.model_info['RANK'].isin(ranks)].copy()
        sv_models.drop('model-obj', axis=1, inplace=True)
        sv_models.reset_index(drop=True, inplace=True)

        for index, row in sv_models.iterrows():
            # Ranks are reassigned in the order of the selected rows.
            sv_models.loc[index, 'RANK'] = index + 1
            sv_models.loc[index, 'DATA_TABLE'] = _permanent_data_table(row['FEATURE_SELECTION'])
            if not persist:
                per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
                                                         persist_result_table=row['RESULT_TABLE'])
                sv_models.loc[index, 'RESULT_TABLE'] = per_name

    # Data Transformation Parameters
    df = self._deploy_data_transformation_params()

    # Saving data transformation parameters to the specified table
    sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)

    copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})

    print('Model Deployment Completed Successfully.')
|
|
1448
|
+
|
|
1449
|
+
def _create_per_result_table(self, prefix, persist_result_table):
    """
    DESCRIPTION:
        Internal Function copies the content of the specified result table
        into a newly created non-volatile table and returns the name of
        the new table.

    PARAMETERS:
        prefix:
            Required Argument.
            Specifies the prefix for the permanent table name.
            Types: str

        persist_result_table:
            Required Argument.
            Specifies the name of the result table to be copied.
            Types: str

    RETURNS:
        str, name of the permanent table.

    RAISES:
        TeradataMlException.
    """
    # The name is generated with gc_on_quit=False
    # (presumably so the table is not garbage collected when the
    # session ends - confirm against UtilFuncs).
    permanent_name = UtilFuncs._generate_temp_table_name(
        prefix=prefix,
        table_type=TeradataConstants.TERADATA_TABLE,
        gc_on_quit=False)

    # Create the table from the full content of the source table.
    UtilFuncs._create_table(table_name=permanent_name,
                            query=f"SELECT * FROM {persist_result_table}",
                            volatile=False)
    return permanent_name
|
|
1480
|
+
|
|
1481
|
+
|
|
1482
|
+
def _deploy_data_transformation_params(self):
    """
    DESCRIPTION:
        Internal Function converts data transformation parameters dictionary (information of each step of automl)
        to DataFrame with rank as -1 and return the DataFrame that can be concatenated with model_info DataFrame
        and saved to the user specified table in database.

    PARAMETERS:
        None

    RETURNS:
        pandas DataFrame with a single row and the columns 'RANK' (-1),
        'PARAMETERS' (JSON list of the step names whose values refer to
        tables in the database) and 'DATA_PARAMS' (serialized data
        transformation parameters).

    RAISES:
        TeradataMlException.
    """
    # Define a recursive function to deep copy dictionaries
    def deep_copy_dict(d):
        if not isinstance(d, dict):
            return d  # Base case: if it's not a dictionary, return the value directly
        return {k: deep_copy_dict(v) for k, v in d.items()}  # Recursively copy each item

    # Deep copy is needed as the original dictionary contains nested dictionaries
    # and we want to avoid modifying the original dictionary when changes are made.
    # The .copy() method creates a shallow copy, which does not suffice for nested dictionaries.
    # Non-dictionary values are not copied, they are referenced as is.
    data_params = {key: deep_copy_dict(value)
                   for key, value in self.data_transformation_params.items()}

    # Names of fit objects that contain the table names
    # pointing to tables in the database.
    fit_obj_names = []

    # Persist flag
    persist = self.kwargs.get('persist', False)

    def _table_name_for(tdf, prefix):
        # Returns the table name to be stored instead of the DataFrame.
        # With persist, the existing table name is used as is, otherwise
        # the table is copied to a permanent table first.
        if persist:
            return tdf._table_name
        return self._create_per_result_table(prefix='{}'.format(prefix),
                                             persist_result_table=tdf._table_name)

    data_params['auto_mode'] = self.custom_data is None

    # Iterating over data transformation parameters
    # aml_step_name is the name of transformation step taken and val is the value
    # Only values of existing keys are replaced, so iterating while assigning is safe.
    for aml_step_name, val in data_params.items():
        # Checking if value is of type teradataml DataFrame
        # If yes, then creating permanent table for the same
        # and storing the table_name in data_params instead of dataframe.
        if isinstance(val, DataFrame):
            fit_obj_names.append(aml_step_name)
            data_params[aml_step_name] = _table_name_for(val, aml_step_name)
        elif isinstance(val, dict) and 'fit_obj' in aml_step_name:
            has_table = False
            for key, item in val.items():
                if isinstance(item, DataFrame):
                    has_table = True
                    val[key] = _table_name_for(item, key)
            # Record the step once, irrespective of how many DataFrames it
            # holds, so that readers of this list process the step one time.
            if has_table:
                fit_obj_names.append(aml_step_name)
        elif aml_step_name == 'pca_fit_instance':
            # Serializing PCA object
            pca = data_params[aml_step_name]
            # Extract pca parameters
            pca_params = {
                'n_components': pca.n_components_,
                'components': pca.components_.tolist(),
                'explained_variance': pca.explained_variance_.tolist(),
                'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
                'mean': pca.mean_.tolist(),
                'singular_values': pca.singular_values_.tolist(),
                'noise_variance': pca.noise_variance_
            }
            data_params[aml_step_name] = pca_params

    # Serializing data transformation parameters
    buffer = BytesIO()
    joblib.dump(data_params, buffer)
    serialized_data = buffer.getvalue()

    # Creating a string representation of fit object names
    param = json.dumps(fit_obj_names)

    # Creating a DataFrame of data transformation information
    row = {
        'RANK': -1,
        'PARAMETERS': param,
        'DATA_PARAMS': serialized_data,
    }
    df = pd.DataFrame([row])

    return df
|
|
676
1587
|
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
print_data = lambda data: print(data) if _is_terminal() else display(data)
|
|
682
|
-
# Displaying ROC-AUC for binary classification
|
|
683
|
-
if self.target_count == 2:
|
|
684
|
-
fit_params = {
|
|
685
|
-
"probability_column" : prediction_column,
|
|
686
|
-
"observation_column" : self.target_column,
|
|
687
|
-
"positive_class" : "1",
|
|
688
|
-
"data" : pred.result
|
|
689
|
-
}
|
|
690
|
-
# Fitting ROC
|
|
691
|
-
roc_out = ROC(**fit_params)
|
|
692
|
-
print("\nROC-AUC : ")
|
|
693
|
-
print_data(roc_out.result)
|
|
694
|
-
print_data(roc_out.output_data)
|
|
695
|
-
|
|
696
|
-
# Displaying confusion matrix for binary and multiclass classification
|
|
697
|
-
prediction_df=pred.result.to_pandas()
|
|
698
|
-
target_col = self.target_column
|
|
699
|
-
print("\nConfusion Matrix : ")
|
|
700
|
-
print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
|
|
701
|
-
|
|
702
|
-
# Returning prediction
|
|
703
|
-
return pred.result
|
|
704
|
-
|
|
705
|
-
@collect_queryband(queryband="AutoML_leaderboard")
|
|
706
|
-
def leaderboard(self):
|
|
1588
|
+
def _run_loaded_model(self,
                      test_data,
                      rank=1,
                      output_type='prediction'):
    """
    DESCRIPTION:
        Internal Function generates prediction or performance metrics using the specified model rank
        in the loaded models leaderboard.

    PARAMETERS:
        test_data:
            Required Argument.
            Specifies the test data on which prediction and performance metrics needs to be generated.
            Types: teradataml DataFrame

        rank:
            Optional Argument.
            Specifies the rank of the model in the leaderboard to be used for prediction.
            Default Value: 1
            Types: int

        output_type:
            Optional Argument.
            Specifies the type of output to be generated.
            Note:
                * Any value other than 'prediction' is treated as a
                  request for performance metrics.
            Default Value: 'prediction'
            Types: str
            Permitted Values: 'prediction', 'metrics'

    RETURNS:
        Prediction result when "output_type" is 'prediction',
        otherwise the output of the model's evaluate().

    RAISES:
        TeradataMlException, ValueError.
    """
    # Indexing starts from 0
    rank = rank - 1
    # Extracting parameters
    parameters = ast.literal_eval(self.loaded_models_info.loc[rank, 'PARAMETERS'])
    response_column = parameters['response_column']
    # Model name
    model_name = self.loaded_models_info.loc[rank, 'MODEL_ID'].split('_')[0]
    # Feature selection
    fs = self.loaded_models_info.loc[rank, 'FEATURE_SELECTION']

    # Checking task type
    if 'R2' in self.loaded_models_info.columns:
        task_type = 'Regression'
    else:
        task_type = 'Classification'

    # Model names mapping to Analytic Functions
    func_map = {
        'XGBOOST': lambda params: XGBoost(**params),
        'GLM': lambda params: GLM(**params),
        'SVM': lambda params: SVM(**params),
        'DECISIONFOREST': lambda params: DecisionForest(**params),
        'KNN': lambda params: KNN(**params)
    }

    # Everything except 'prediction' asks for performance metrics.
    want_metrics = output_type != 'prediction'

    if want_metrics:
        print('Generating performance metrics using:')
    else:
        print('Generating prediction using:')
    print(f"Model Name: {model_name}")
    print(f"Feature Selection: {fs}")

    # Generating evaluation parameters
    eval_params = _ModelTraining._eval_params_generation(model_name,
                                                         response_column,
                                                         task_type)
    if task_type == 'Classification':
        eval_params['output_responses'] = parameters['output_responses']

    # Checking if response column is present in test data
    response_col_present = response_column in test_data.columns
    if not response_col_present:
        # Response column is required for evaluation, raise error if not present.
        # The check covers every non-prediction output type, consistent with
        # the branches below, instead of silently returning predictions.
        if want_metrics:
            raise ValueError(f"Response column '{response_column}' is not present in test data for evaluation.")
        eval_params.pop('accumulate', None)

    # Checking if data is already transformed before or not
    data_node_id = test_data._nodeid
    if not self.table_name_mapping.get(data_node_id):
        # Data transformation will be performed on raw test data
        self.transform_data(data=test_data,
                            data_params=self.loaded_data_transformation_params,
                            auto=self.loaded_data_transformation_params['auto_mode'],
                            verbose=0,
                            target_column_ind=response_col_present)

    # Extracting test data
    # The first mapped table whose key contains the feature selection is used;
    # test_data stays unchanged when no key matches.
    for feature_selection, table_name in self.table_name_mapping[data_node_id].items():
        if fs in feature_selection:
            test_data = DataFrame(table_name)
            break

    if model_name == 'KNN':
        # KNN is executed with training and test data together.
        parameters['train_data'] = DataFrame(self.loaded_models_info.loc[rank, 'DATA_TABLE'])
        parameters['test_data'] = test_data

        if response_column in test_data.columns:
            parameters['accumulate'] = response_column

        knn = func_map[model_name](parameters)

        if want_metrics:
            return knn.evaluate(test_data=test_data, **eval_params)
        # For KNN, the result of the function itself is the prediction.
        return knn.result

    # Extracting result table name
    result_table_name = self.loaded_models_info.loc[rank, 'RESULT_TABLE']
    result_table = DataFrame(result_table_name)
    params = {
        "skip_input_arg_processing": True,
        "skip_output_arg_processing": True,
        "skip_other_arg_processing": True,
        "skip_func_output_processing": True,
        "_result_data": result_table,
        "response_column": response_column
    }
    model = func_map[model_name](params)

    if want_metrics:
        return model.evaluate(newdata=test_data, **eval_params)
    return model.predict(newdata=test_data, **eval_params).result
|
|
1729
|
+
|
|
1730
|
+
@collect_queryband(queryband="AutoML_remove_saved_models")
def remove_saved_models(self,
                        table_name):
    """
    DESCRIPTION:
        Function removes the specified table containing saved models,
        along with the data tables, result tables and data transformation
        tables it refers to.
        Note:
            * If any data table or result table is not present inside the database,
              then it will be skipped.

    PARAMETERS:
        table_name:
            Required Argument.
            Specifies the table name containing saved models.
            Types: str

    RETURNS:
        None

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        >>> obj = AutoML()
        # Remove saved models from the specified table.
        >>> obj.remove_saved_models("model_table")
    """
    # Appending arguments to list for validation
    arg_info_matrix = []
    arg_info_matrix.append(["table_name", table_name, True, (str), True])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    df = DataFrame(table_name).to_pandas()

    drop_list = df['DATA_TABLE'].dropna().unique().tolist()
    drop_list.extend(df['RESULT_TABLE'].dropna().unique().tolist())

    # Removing data transformation parameters tables
    # The row with RANK -1 holds the data transformation parameters.
    data = df[df['RANK'] == -1].iloc[0]
    # NOTE(review): joblib.load unpickles the stored bytes; this must only
    # be used on tables whose content is trusted.
    buffer = BytesIO(data['DATA_PARAMS'])
    data_params = joblib.load(buffer)
    fit_obj_lst = json.loads(data['PARAMETERS'])
    for i in fit_obj_lst:
        if isinstance(data_params[i], dict):
            drop_list.extend(data_params[i].values())
        else:
            drop_list.append(data_params[i])

    non_existent_tables = []
    # The same table can be collected more than once. A second DROP of an
    # already dropped table would fail and wrongly be reported as a
    # missing table, so every statement is executed only once.
    attempted = set()
    for table in drop_list:
        drop_sql = f"DROP TABLE {table};"
        if drop_sql in attempted:
            continue
        attempted.add(drop_sql)
        try:
            execute_sql(drop_sql)
        except Exception:
            # Best effort: any failure is treated as a missing table.
            non_existent_tables.append(table)

    if len(non_existent_tables) > 0:
        warnings.warn(message=f"\nThe following tables '{non_existent_tables}' do not exist in the database and have been skipped.",
                      stacklevel=2)

    db_drop_table(table_name)
|
|
765
1795
|
|
|
766
1796
|
@staticmethod
|
|
767
1797
|
def generate_custom_config(file_name = "custom"):
|
|
@@ -853,7 +1883,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
853
1883
|
max_runtime_secs = None,
|
|
854
1884
|
stopping_metric = None,
|
|
855
1885
|
stopping_tolerance = None,
|
|
856
|
-
max_models = None
|
|
1886
|
+
max_models = None,
|
|
1887
|
+
**kwargs):
|
|
857
1888
|
"""
|
|
858
1889
|
DESCRIPTION:
|
|
859
1890
|
Internal Function runs Regression.
|
|
@@ -895,6 +1926,25 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
895
1926
|
Specifies the maximum number of models to be trained.
|
|
896
1927
|
Types: int
|
|
897
1928
|
|
|
1929
|
+
volatile:
|
|
1930
|
+
Optional Argument.
|
|
1931
|
+
Specifies whether to put the results of the
|
|
1932
|
+
function in a volatile table or not. When set to
|
|
1933
|
+
True, results are stored in a volatile table,
|
|
1934
|
+
otherwise not.
|
|
1935
|
+
Default Value: False
|
|
1936
|
+
Types: bool
|
|
1937
|
+
|
|
1938
|
+
persist:
|
|
1939
|
+
Optional Argument.
|
|
1940
|
+
Specifies whether to persist the results of the
|
|
1941
|
+
function in a table or not. When set to True,
|
|
1942
|
+
results are persisted in a table; otherwise,
|
|
1943
|
+
results are garbage collected at the end of the
|
|
1944
|
+
session.
|
|
1945
|
+
Default Value: False
|
|
1946
|
+
Types: bool
|
|
1947
|
+
|
|
898
1948
|
RETURNS:
|
|
899
1949
|
a tuple containing, model information and leaderboard.
|
|
900
1950
|
"""
|
|
@@ -911,7 +1961,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
911
1961
|
target_column = self.target_column,
|
|
912
1962
|
model_list = model_list,
|
|
913
1963
|
verbose = verbose,
|
|
914
|
-
custom_data = self.custom_data
|
|
1964
|
+
custom_data = self.custom_data,
|
|
1965
|
+
**kwargs)
|
|
915
1966
|
# Start time
|
|
916
1967
|
start_time = time.time()
|
|
917
1968
|
data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
|
|
@@ -923,7 +1974,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
923
1974
|
verbose = verbose,
|
|
924
1975
|
excluded_columns = excluded_columns,
|
|
925
1976
|
custom_data = self.custom_data,
|
|
926
|
-
data_transform_dict = data_transformation_params
|
|
1977
|
+
data_transform_dict = data_transformation_params,
|
|
1978
|
+
**kwargs)
|
|
927
1979
|
features, data_transformation_params = self.data_preparation(auto)
|
|
928
1980
|
|
|
929
1981
|
# Calculating max_runtime_secs for model training by,
|
|
@@ -943,7 +1995,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
|
|
|
943
1995
|
verbose = verbose,
|
|
944
1996
|
features = features,
|
|
945
1997
|
task_type = "Regression",
|
|
946
|
-
custom_data = self.custom_data
|
|
1998
|
+
custom_data = self.custom_data,
|
|
1999
|
+
**kwargs)
|
|
947
2000
|
models_info, leaderboard, target_count = self.model_training(auto = auto,
|
|
948
2001
|
max_runtime_secs = max_runtime_secs,
|
|
949
2002
|
stopping_metric = stopping_metric,
|
|
@@ -989,7 +2042,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
989
2042
|
max_runtime_secs = None,
|
|
990
2043
|
stopping_metric = None,
|
|
991
2044
|
stopping_tolerance = None,
|
|
992
|
-
max_models = None
|
|
2045
|
+
max_models = None,
|
|
2046
|
+
**kwargs):
|
|
993
2047
|
"""
|
|
994
2048
|
DESCRIPTION:
|
|
995
2049
|
Internal Function runs Classification.
|
|
@@ -1030,12 +2084,30 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1030
2084
|
Optional Argument.
|
|
1031
2085
|
Specifies the maximum number of models to be trained.
|
|
1032
2086
|
Types: int
|
|
1033
|
-
|
|
2087
|
+
|
|
2088
|
+
volatile:
|
|
2089
|
+
Optional Argument.
|
|
2090
|
+
Specifies whether to put the results of the
|
|
2091
|
+
function in a volatile table or not. When set to
|
|
2092
|
+
True, results are stored in a volatile table,
|
|
2093
|
+
otherwise not.
|
|
2094
|
+
Default Value: False
|
|
2095
|
+
Types: bool
|
|
2096
|
+
|
|
2097
|
+
persist:
|
|
2098
|
+
Optional Argument.
|
|
2099
|
+
Specifies whether to persist the results of the
|
|
2100
|
+
function in a table or not. When set to True,
|
|
2101
|
+
results are persisted in a table; otherwise,
|
|
2102
|
+
results are garbage collected at the end of the
|
|
2103
|
+
session.
|
|
2104
|
+
Default Value: False
|
|
2105
|
+
Types: bool
|
|
2106
|
+
|
|
1034
2107
|
RETURNS:
|
|
1035
2108
|
a tuple containing, model information and leaderboard.
|
|
1036
2109
|
"""
|
|
1037
|
-
|
|
1038
|
-
|
|
2110
|
+
|
|
1039
2111
|
# Feature Exploration Phase
|
|
1040
2112
|
_FeatureExplore.__init__(self,
|
|
1041
2113
|
data = self.data,
|
|
@@ -1050,7 +2122,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1050
2122
|
model_list = model_list,
|
|
1051
2123
|
verbose = verbose,
|
|
1052
2124
|
task_type = "Classification",
|
|
1053
|
-
custom_data = self.custom_data
|
|
2125
|
+
custom_data = self.custom_data,
|
|
2126
|
+
**kwargs)
|
|
1054
2127
|
# Start time
|
|
1055
2128
|
start_time = time.time()
|
|
1056
2129
|
data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
|
|
@@ -1062,7 +2135,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1062
2135
|
excluded_columns = excluded_columns,
|
|
1063
2136
|
custom_data = self.custom_data,
|
|
1064
2137
|
data_transform_dict = data_transformation_params,
|
|
1065
|
-
task_type = "Classification"
|
|
2138
|
+
task_type = "Classification",
|
|
2139
|
+
**kwargs)
|
|
1066
2140
|
features, data_transformation_params = self.data_preparation(auto)
|
|
1067
2141
|
|
|
1068
2142
|
# Calculating max_runtime_secs for model training by,
|
|
@@ -1082,7 +2156,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
|
|
|
1082
2156
|
verbose = verbose,
|
|
1083
2157
|
features = features,
|
|
1084
2158
|
task_type = "Classification",
|
|
1085
|
-
custom_data = self.custom_data
|
|
2159
|
+
custom_data = self.custom_data,
|
|
2160
|
+
**kwargs)
|
|
1086
2161
|
models_info, leaderboard, target_count = self.model_training(auto = auto,
|
|
1087
2162
|
max_runtime_secs = max_runtime_secs,
|
|
1088
2163
|
stopping_metric = stopping_metric,
|
|
@@ -1243,7 +2318,8 @@ class AutoRegressor(AutoML):
|
|
|
1243
2318
|
stopping_metric=None,
|
|
1244
2319
|
stopping_tolerance=None,
|
|
1245
2320
|
max_models=None,
|
|
1246
|
-
custom_config_file=None
|
|
2321
|
+
custom_config_file=None,
|
|
2322
|
+
**kwargs
|
|
1247
2323
|
):
|
|
1248
2324
|
"""
|
|
1249
2325
|
DESCRIPTION:
|
|
@@ -1284,8 +2360,10 @@ class AutoRegressor(AutoML):
|
|
|
1284
2360
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
1285
2361
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
1286
2362
|
Permitted Values:
|
|
1287
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
1288
|
-
"RMSE", "RMSLE"
|
|
2363
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
2364
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
2365
|
+
"ME", "EV", "MPD", "MGD"
|
|
2366
|
+
|
|
1289
2367
|
* For task_type "Classification": 'MICRO-F1','MACRO-F1',
|
|
1290
2368
|
'MICRO-RECALL','MACRO-RECALL',
|
|
1291
2369
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
@@ -1307,7 +2385,29 @@ class AutoRegressor(AutoML):
|
|
|
1307
2385
|
Optional Argument.
|
|
1308
2386
|
Specifies the path of JSON file in case of custom run.
|
|
1309
2387
|
Types: str
|
|
1310
|
-
|
|
2388
|
+
|
|
2389
|
+
**kwargs:
|
|
2390
|
+
Specifies the additional arguments for AutoRegressor. Below
|
|
2391
|
+
are the additional arguments:
|
|
2392
|
+
volatile:
|
|
2393
|
+
Optional Argument.
|
|
2394
|
+
Specifies whether to put the interim results of the
|
|
2395
|
+
functions in a volatile table or not. When set to
|
|
2396
|
+
True, results are stored in a volatile table,
|
|
2397
|
+
otherwise not.
|
|
2398
|
+
Default Value: False
|
|
2399
|
+
Types: bool
|
|
2400
|
+
|
|
2401
|
+
persist:
|
|
2402
|
+
Optional Argument.
|
|
2403
|
+
Specifies whether to persist the interim results of the
|
|
2404
|
+
functions in a table or not. When set to True,
|
|
2405
|
+
results are persisted in a table; otherwise,
|
|
2406
|
+
results are garbage collected at the end of the
|
|
2407
|
+
session.
|
|
2408
|
+
Default Value: False
|
|
2409
|
+
Types: bool
|
|
2410
|
+
|
|
1311
2411
|
RETURNS:
|
|
1312
2412
|
Instance of AutoRegressor.
|
|
1313
2413
|
|
|
@@ -1336,24 +2436,28 @@ class AutoRegressor(AutoML):
|
|
|
1336
2436
|
|
|
1337
2437
|
# Fit the data.
|
|
1338
2438
|
>>> automl_obj.fit(housing_train, "price")
|
|
2439
|
+
|
|
2440
|
+
# Display leaderboard.
|
|
2441
|
+
>>> automl_obj.leaderboard()
|
|
1339
2442
|
|
|
1340
|
-
#
|
|
1341
|
-
>>>
|
|
1342
|
-
>>> prediction
|
|
2443
|
+
# Display best performing model.
|
|
2444
|
+
>>> automl_obj.leader()
|
|
1343
2445
|
|
|
1344
|
-
# Run predict
|
|
2446
|
+
# Run predict on test data using best performing model.
|
|
1345
2447
|
>>> prediction = automl_obj.predict(housing_test)
|
|
1346
2448
|
>>> prediction
|
|
1347
2449
|
|
|
1348
|
-
# Run predict
|
|
2450
|
+
# Run predict on test data using second best performing model.
|
|
1349
2451
|
>>> prediction = automl_obj.predict(housing_test, rank=2)
|
|
1350
2452
|
>>> prediction
|
|
1351
|
-
|
|
1352
|
-
#
|
|
1353
|
-
>>> automl_obj.
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
2453
|
+
|
|
2454
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2455
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
2456
|
+
>>> performance_metrics
|
|
2457
|
+
|
|
2458
|
+
# Run evaluate to get performance metrics using second best performing model.
|
|
2459
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test, 2)
|
|
2460
|
+
>>> performance_metrics
|
|
1357
2461
|
|
|
1358
2462
|
# Example 2 : Run AutoRegressor for regression problem with early stopping metric and tolerance.
|
|
1359
2463
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -1374,13 +2478,17 @@ class AutoRegressor(AutoML):
|
|
|
1374
2478
|
>>> custom_config_file="custom_housing.json")
|
|
1375
2479
|
# Fit the data.
|
|
1376
2480
|
>>> automl_obj.fit(housing_train, "price")
|
|
1377
|
-
|
|
1378
|
-
# Run predict with best performing model.
|
|
1379
|
-
>>> prediction = automl_obj.predict()
|
|
1380
|
-
>>> prediction
|
|
1381
|
-
|
|
2481
|
+
|
|
1382
2482
|
# Display leaderboard.
|
|
1383
2483
|
>>> automl_obj.leaderboard()
|
|
2484
|
+
|
|
2485
|
+
# Run predict on test data using best performing model.
|
|
2486
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
2487
|
+
>>> prediction
|
|
2488
|
+
|
|
2489
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2490
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
2491
|
+
>>> performance_metrics
|
|
1384
2492
|
|
|
1385
2493
|
# Example 3 : Run AutoRegressor for regression problem with maximum runtime.
|
|
1386
2494
|
# Scenario : Predict the price of house based on different factors.
|
|
@@ -1392,20 +2500,24 @@ class AutoRegressor(AutoML):
|
|
|
1392
2500
|
>>> max_runtime_secs=500)
|
|
1393
2501
|
# Fit the data.
|
|
1394
2502
|
>>> automl_obj.fit(housing_train, "price")
|
|
1395
|
-
|
|
1396
|
-
# Run predict with best performing model.
|
|
1397
|
-
>>> prediction = automl_obj.predict()
|
|
1398
|
-
>>> prediction
|
|
1399
|
-
|
|
1400
|
-
# Run predict with second best performing model.
|
|
1401
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
1402
|
-
>>> prediction
|
|
1403
|
-
|
|
2503
|
+
|
|
1404
2504
|
# Display leaderboard.
|
|
1405
2505
|
>>> automl_obj.leaderboard()
|
|
1406
2506
|
|
|
1407
2507
|
# Display best performing model.
|
|
1408
2508
|
>>> automl_obj.leader()
|
|
2509
|
+
|
|
2510
|
+
# Run predict on test data using best performing model.
|
|
2511
|
+
>>> prediction = automl_obj.predict(housing_test)
|
|
2512
|
+
>>> prediction
|
|
2513
|
+
|
|
2514
|
+
# Run predict on test data using second best performing model.
|
|
2515
|
+
>>> prediction = automl_obj.predict(housing_test, 2)
|
|
2516
|
+
>>> prediction
|
|
2517
|
+
|
|
2518
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2519
|
+
>>> performance_metrics = automl_obj.evaluate(housing_test)
|
|
2520
|
+
>>> performance_metrics
|
|
1409
2521
|
"""
|
|
1410
2522
|
self.verbose = verbose
|
|
1411
2523
|
self.max_runtime_secs = max_runtime_secs
|
|
@@ -1425,7 +2537,8 @@ class AutoRegressor(AutoML):
|
|
|
1425
2537
|
stopping_metric=self.stopping_metric,
|
|
1426
2538
|
stopping_tolerance=self.stopping_tolerance,
|
|
1427
2539
|
max_models=self.max_models,
|
|
1428
|
-
custom_config_file=self.custom_config_file
|
|
2540
|
+
custom_config_file=self.custom_config_file,
|
|
2541
|
+
**kwargs)
|
|
1429
2542
|
class AutoClassifier(AutoML):
|
|
1430
2543
|
|
|
1431
2544
|
def __init__(self,
|
|
@@ -1436,7 +2549,8 @@ class AutoClassifier(AutoML):
|
|
|
1436
2549
|
stopping_metric=None,
|
|
1437
2550
|
stopping_tolerance=None,
|
|
1438
2551
|
max_models=None,
|
|
1439
|
-
custom_config_file=None
|
|
2552
|
+
custom_config_file=None,
|
|
2553
|
+
**kwargs
|
|
1440
2554
|
):
|
|
1441
2555
|
"""
|
|
1442
2556
|
DESCRIPTION:
|
|
@@ -1477,8 +2591,10 @@ class AutoClassifier(AutoML):
|
|
|
1477
2591
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
1478
2592
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
1479
2593
|
Permitted Values:
|
|
1480
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
1481
|
-
"RMSE", "RMSLE"
|
|
2594
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
2595
|
+
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
2596
|
+
"ME", "EV", "MPD", "MGD"
|
|
2597
|
+
|
|
1482
2598
|
* For task_type "Classification": 'MICRO-F1','MACRO-F1',
|
|
1483
2599
|
'MICRO-RECALL','MACRO-RECALL',
|
|
1484
2600
|
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
@@ -1500,6 +2616,28 @@ class AutoClassifier(AutoML):
|
|
|
1500
2616
|
Optional Argument.
|
|
1501
2617
|
Specifies the path of json file in case of custom run.
|
|
1502
2618
|
Types: str
|
|
2619
|
+
|
|
2620
|
+
**kwargs:
|
|
2621
|
+
Specifies the additional arguments for AutoClassifier. Below
|
|
2622
|
+
are the additional arguments:
|
|
2623
|
+
volatile:
|
|
2624
|
+
Optional Argument.
|
|
2625
|
+
Specifies whether to put the interim results of the
|
|
2626
|
+
functions in a volatile table or not. When set to
|
|
2627
|
+
True, results are stored in a volatile table,
|
|
2628
|
+
otherwise not.
|
|
2629
|
+
Default Value: False
|
|
2630
|
+
Types: bool
|
|
2631
|
+
|
|
2632
|
+
persist:
|
|
2633
|
+
Optional Argument.
|
|
2634
|
+
Specifies whether to persist the interim results of the
|
|
2635
|
+
functions in a table or not. When set to True,
|
|
2636
|
+
results are persisted in a table; otherwise,
|
|
2637
|
+
results are garbage collected at the end of the
|
|
2638
|
+
session.
|
|
2639
|
+
Default Value: False
|
|
2640
|
+
Types: bool
|
|
1503
2641
|
|
|
1504
2642
|
RETURNS:
|
|
1505
2643
|
Instance of AutoClassifier.
|
|
@@ -1535,24 +2673,28 @@ class AutoClassifier(AutoML):
|
|
|
1535
2673
|
|
|
1536
2674
|
# Fit the data.
|
|
1537
2675
|
>>> automl_obj.fit(admissions_train, "admitted")
|
|
2676
|
+
|
|
2677
|
+
# Display leaderboard.
|
|
2678
|
+
>>> automl_obj.leaderboard()
|
|
1538
2679
|
|
|
1539
|
-
#
|
|
1540
|
-
>>>
|
|
1541
|
-
>>> prediction
|
|
2680
|
+
# Display best performing model.
|
|
2681
|
+
>>> automl_obj.leader()
|
|
1542
2682
|
|
|
1543
|
-
# Run predict
|
|
2683
|
+
# Run predict on test data using best performing model.
|
|
1544
2684
|
>>> prediction = automl_obj.predict(admissions_test)
|
|
1545
2685
|
>>> prediction
|
|
1546
2686
|
|
|
1547
|
-
# Run predict
|
|
2687
|
+
# Run predict on test data using second best performing model.
|
|
1548
2688
|
>>> prediction = automl_obj.predict(admissions_test, rank=2)
|
|
1549
2689
|
>>> prediction
|
|
1550
|
-
|
|
1551
|
-
#
|
|
1552
|
-
>>> automl_obj.
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
|
|
2690
|
+
|
|
2691
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2692
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test)
|
|
2693
|
+
>>> performance_metrics
|
|
2694
|
+
|
|
2695
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
2696
|
+
>>> performance_metrics = automl_obj.evaluate(admissions_test, 4)
|
|
2697
|
+
>>> performance_metrics
|
|
1556
2698
|
|
|
1557
2699
|
# Example 2 : Run AutoClassifier for binary classification.
|
|
1558
2700
|
# Scenario : Predict whether passenger aboard the RMS Titanic survived
|
|
@@ -1561,6 +2703,11 @@ class AutoClassifier(AutoML):
|
|
|
1561
2703
|
# configuration file to customize different processes of
|
|
1562
2704
|
# AutoML Run.
|
|
1563
2705
|
|
|
2706
|
+
# Split the data into train and test.
|
|
2707
|
+
>>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
|
|
2708
|
+
>>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2709
|
+
>>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
2710
|
+
|
|
1564
2711
|
# Generate custom configuration file.
|
|
1565
2712
|
>>> AutoClassifier.generate_custom_config("custom_titanic")
|
|
1566
2713
|
|
|
@@ -1568,21 +2715,25 @@ class AutoClassifier(AutoML):
|
|
|
1568
2715
|
>>> automl_obj = AutoClassifier(verbose=2,
|
|
1569
2716
|
>>> custom_config_file="custom_titanic.json")
|
|
1570
2717
|
# Fit the data.
|
|
1571
|
-
>>> automl_obj.fit(
|
|
1572
|
-
|
|
1573
|
-
# Run predict with best performing model.
|
|
1574
|
-
>>> prediction = automl_obj.predict()
|
|
1575
|
-
>>> prediction
|
|
1576
|
-
|
|
1577
|
-
# Run predict with second best performing model.
|
|
1578
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
1579
|
-
>>> prediction
|
|
2718
|
+
>>> automl_obj.fit(titanic_train, titanic_train.survived)
|
|
1580
2719
|
|
|
1581
2720
|
# Display leaderboard.
|
|
1582
2721
|
>>> automl_obj.leaderboard()
|
|
1583
2722
|
|
|
1584
2723
|
# Display best performing model.
|
|
1585
2724
|
>>> automl_obj.leader()
|
|
2725
|
+
|
|
2726
|
+
# Run predict on test data using best performing model.
|
|
2727
|
+
>>> prediction = automl_obj.predict(titanic_test)
|
|
2728
|
+
>>> prediction
|
|
2729
|
+
|
|
2730
|
+
# Run predict on test data using second best performing model.
|
|
2731
|
+
>>> prediction = automl_obj.predict(titanic_test, rank=2)
|
|
2732
|
+
>>> prediction
|
|
2733
|
+
|
|
2734
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2735
|
+
>>> performance_metrics = automl_obj.evaluate(titanic_test)
|
|
2736
|
+
>>> performance_metrics
|
|
1586
2737
|
|
|
1587
2738
|
# Example 3 : Run AutoClassifier for multiclass classification problem.
|
|
1588
2739
|
# Scenario : Predict the species of iris flower based on different factors.
|
|
@@ -1590,6 +2741,11 @@ class AutoClassifier(AutoML):
|
|
|
1590
2741
|
# models. Use custom configuration file to customize different
|
|
1591
2742
|
# processes of AutoML Run.
|
|
1592
2743
|
|
|
2744
|
+
# Split the data into train and test.
|
|
2745
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
2746
|
+
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2747
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
2748
|
+
|
|
1593
2749
|
# Generate custom configuration file.
|
|
1594
2750
|
>>> AutoClassifier.generate_custom_config("custom_iris")
|
|
1595
2751
|
|
|
@@ -1597,18 +2753,22 @@ class AutoClassifier(AutoML):
|
|
|
1597
2753
|
>>> automl_obj = AutoClassifier(verbose=1,
|
|
1598
2754
|
>>> custom_config_file="custom_iris.json")
|
|
1599
2755
|
# Fit the data.
|
|
1600
|
-
>>> automl_obj.fit(
|
|
1601
|
-
|
|
1602
|
-
# Predict using best performing model.
|
|
1603
|
-
>>> prediction = automl_obj.predict()
|
|
1604
|
-
>>> prediction
|
|
1605
|
-
|
|
2756
|
+
>>> automl_obj.fit(iris_train, "species")
|
|
2757
|
+
|
|
1606
2758
|
# Display leaderboard.
|
|
1607
2759
|
>>> automl_obj.leaderboard()
|
|
1608
2760
|
|
|
1609
2761
|
# Display best performing model.
|
|
1610
2762
|
>>> automl_obj.leader()
|
|
1611
2763
|
|
|
2764
|
+
# Predict on test data using best performing model.
|
|
2765
|
+
>>> prediction = automl_obj.predict(iris_test)
|
|
2766
|
+
>>> prediction
|
|
2767
|
+
|
|
2768
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2769
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test)
|
|
2770
|
+
>>> performance_metrics
|
|
2771
|
+
|
|
1612
2772
|
# Example 4 : Run AutoClassifier for classification problem with stopping metric and tolerance.
|
|
1613
2773
|
# Scenario : Predict whether passenger aboard the RMS Titanic survived
|
|
1614
2774
|
# or not based on different factors. Use custom configuration
|
|
@@ -1616,6 +2776,11 @@ class AutoClassifier(AutoML):
|
|
|
1616
2776
|
# performance threshold to acquire for the available models, and
|
|
1617
2777
|
# terminate training upon meeting the stipulated performance criteria.
|
|
1618
2778
|
|
|
2779
|
+
# Split the data into train and test.
|
|
2780
|
+
>>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
|
|
2781
|
+
>>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2782
|
+
>>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
2783
|
+
|
|
1619
2784
|
# Generate custom configuration file.
|
|
1620
2785
|
>>> AutoClassifier.generate_custom_config("custom_titanic")
|
|
1621
2786
|
|
|
@@ -1627,18 +2792,27 @@ class AutoClassifier(AutoML):
|
|
|
1627
2792
|
>>> max_models=8,
|
|
1628
2793
|
>>> custom_config_file="custom_titanic.json")
|
|
1629
2794
|
# Fit the data.
|
|
1630
|
-
>>> automl_obj.fit(
|
|
1631
|
-
|
|
1632
|
-
# Run predict with best performing model.
|
|
1633
|
-
>>> prediction = automl_obj.predict()
|
|
1634
|
-
>>> prediction
|
|
1635
|
-
|
|
2795
|
+
>>> automl_obj.fit(titanic_train, titanic_train.survived)
|
|
2796
|
+
|
|
1636
2797
|
# Display leaderboard.
|
|
1637
2798
|
>>> automl_obj.leaderboard()
|
|
2799
|
+
|
|
2800
|
+
# Run predict on test data using best performing model.
|
|
2801
|
+
>>> prediction = automl_obj.predict(titanic_test)
|
|
2802
|
+
>>> prediction
|
|
2803
|
+
|
|
2804
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
2805
|
+
>>> performance_metrics = automl_obj.evaluate(titanic_test)
|
|
2806
|
+
>>> performance_metrics
|
|
1638
2807
|
|
|
1639
2808
|
# Example 5 : Run AutoClassifier for classification problem with maximum runtime.
|
|
1640
2809
|
# Scenario : Predict the species of iris flower based on different factors.
|
|
1641
2810
|
# Run AutoML to get the best performing model in specified time.
|
|
2811
|
+
|
|
2812
|
+
# Split the data into train and test.
|
|
2813
|
+
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
2814
|
+
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
2815
|
+
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
1642
2816
|
|
|
1643
2817
|
# Create instance of AutoClassifier.
|
|
1644
2818
|
>>> automl_obj = AutoClassifier(verbose=2,
|
|
@@ -1646,21 +2820,25 @@ class AutoClassifier(AutoML):
|
|
|
1646
2820
|
>>> max_runtime_secs=500,
|
|
1647
2821
|
>>> max_models=3)
|
|
1648
2822
|
# Fit the data.
|
|
1649
|
-
>>> automl_obj.fit(
|
|
1650
|
-
|
|
1651
|
-
# Run predict with best performing model.
|
|
1652
|
-
>>> prediction = automl_obj.predict()
|
|
1653
|
-
>>> prediction
|
|
1654
|
-
|
|
1655
|
-
# Run predict with second best performing model.
|
|
1656
|
-
>>> prediction = automl_obj.predict(rank=2)
|
|
1657
|
-
>>> prediction
|
|
1658
|
-
|
|
2823
|
+
>>> automl_obj.fit(iris_train, iris_train.species)
|
|
2824
|
+
|
|
1659
2825
|
# Display leaderboard.
|
|
1660
2826
|
>>> automl_obj.leaderboard()
|
|
1661
2827
|
|
|
1662
2828
|
# Display best performing model.
|
|
1663
|
-
>>> automl_obj.leader()
|
|
2829
|
+
>>> automl_obj.leader()
|
|
2830
|
+
|
|
2831
|
+
# Run predict on test data using best performing model.
|
|
2832
|
+
>>> prediction = automl_obj.predict(iris_test)
|
|
2833
|
+
>>> prediction
|
|
2834
|
+
|
|
2835
|
+
# Run predict on test data using second best performing model.
|
|
2836
|
+
>>> prediction = automl_obj.predict(iris_test, rank=2)
|
|
2837
|
+
>>> prediction
|
|
2838
|
+
|
|
2839
|
+
# Run evaluate to get performance metrics using model rank 3.
|
|
2840
|
+
>>> performance_metrics = automl_obj.evaluate(iris_test, 3)
|
|
2841
|
+
>>> performance_metrics
|
|
1664
2842
|
"""
|
|
1665
2843
|
self.verbose = verbose
|
|
1666
2844
|
self.max_runtime_secs = max_runtime_secs
|
|
@@ -1673,11 +2851,12 @@ class AutoClassifier(AutoML):
|
|
|
1673
2851
|
self.exclude = exclude
|
|
1674
2852
|
|
|
1675
2853
|
super(AutoClassifier, self).__init__(task_type=self.task_type,
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
2854
|
+
include = self.include,
|
|
2855
|
+
exclude = self.exclude,
|
|
2856
|
+
verbose=self.verbose,
|
|
2857
|
+
max_runtime_secs=self.max_runtime_secs,
|
|
2858
|
+
stopping_metric=self.stopping_metric,
|
|
2859
|
+
stopping_tolerance=self.stopping_tolerance,
|
|
2860
|
+
max_models=self.max_models,
|
|
2861
|
+
custom_config_file=self.custom_config_file,
|
|
2862
|
+
**kwargs)
|