teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
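The largest change in this release is the expanded AutoML module (fraud detection, churn prediction, and clustering support). The following is a minimal sketch assembled from the docstring examples in the __init__.py diff below; it assumes a connected Vantage session, and the table and column names ("credit_fraud_dataset", "Credit_Class", "bank_marketing") are taken from those examples and may differ in your environment.

# Sketch based on the AutoML docstring examples added in 20.0.0.7.
# Assumes a teradataml connection (create_context) has already been established.
from teradataml import AutoML, DataFrame, load_example_data

# Fraud-detection workflow (binary classification with specialized handling).
load_example_data("teradataml", "credit_fraud_dataset")
fraud_df = DataFrame.from_table("credit_fraud_dataset")
fraud_automl = AutoML(is_fraud=True, imbalance_handling_method="SMOTE")
fraud_automl.fit(fraud_df, "Credit_Class")
fraud_automl.leaderboard()
fraud_predictions = fraud_automl.predict(fraud_df, rank=1)

# Clustering workflow: task_type must be set explicitly and no target column is passed.
load_example_data("teradataml", "bank_marketing")
bank_df = DataFrame.from_table("bank_marketing")
cluster_automl = AutoML(task_type="Clustering", include=["KMeans", "GaussianMixture"])
cluster_automl.fit(bank_df)
cluster_assignments = cluster_automl.predict(bank_df)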
teradataml/automl/__init__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# ##################################################################
|
|
2
2
|
#
|
|
3
|
-
# Copyright
|
|
3
|
+
# Copyright 2025 Teradata. All rights reserved.
|
|
4
4
|
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
5
|
#
|
|
6
6
|
# Primary Owner: Sweta Shaw
|
|
@@ -14,15 +14,18 @@
|
|
|
14
14
|
# ##################################################################
|
|
15
15
|
|
|
16
16
|
# Python libraries
|
|
17
|
+
import ast
|
|
18
|
+
from io import BytesIO
|
|
19
|
+
import joblib
|
|
17
20
|
import json
|
|
18
|
-
import
|
|
21
|
+
import matplotlib.pyplot as plt
|
|
19
22
|
import numpy as np
|
|
23
|
+
import pandas as pd
|
|
24
|
+
import seaborn as sns
|
|
20
25
|
from sklearn.metrics import confusion_matrix
|
|
26
|
+
from sklearn.decomposition import PCA
|
|
21
27
|
import time
|
|
22
|
-
import ast
|
|
23
28
|
import warnings
|
|
24
|
-
import joblib
|
|
25
|
-
from io import BytesIO
|
|
26
29
|
|
|
27
30
|
# Teradata libraries
|
|
28
31
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
@@ -37,7 +40,14 @@ from teradataml import TeradataMlException
|
|
|
37
40
|
from teradataml.common.messages import Messages, MessageCodes
|
|
38
41
|
from teradataml.telemetry_utils.queryband import collect_queryband
|
|
39
42
|
from teradataml import TeradataConstants
|
|
40
|
-
from teradataml import XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table
|
|
43
|
+
from teradataml import (XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table,
|
|
44
|
+
OutlierFilterFit, OutlierFilterTransform, SimpleImputeFit, SimpleImputeTransform,
|
|
45
|
+
ColumnSummary)
|
|
46
|
+
from teradataml import td_sklearn as skl
|
|
47
|
+
from teradataml import CategoricalSummary
|
|
48
|
+
from teradataml import TargetEncodingFit, TargetEncodingTransform
|
|
49
|
+
from teradataml import Shap
|
|
50
|
+
from teradataml import GarbageCollector
|
|
41
51
|
|
|
42
52
|
# AutoML Internal libraries
|
|
43
53
|
from teradataml.automl.data_preparation import _DataPreparation
|
|
@@ -47,20 +57,22 @@ from teradataml.automl.model_evaluation import _ModelEvaluator
|
|
|
47
57
|
from teradataml.automl.model_training import _ModelTraining
|
|
48
58
|
from teradataml.automl.data_transformation import _DataTransformation
|
|
49
59
|
from teradataml.automl.custom_json_utils import _GenerateCustomJson
|
|
50
|
-
|
|
60
|
+
from teradataml.common.constants import AutoMLConstants
|
|
51
61
|
|
|
52
62
|
class AutoML:
|
|
53
63
|
|
|
54
64
|
def __init__(self,
|
|
55
|
-
task_type
|
|
56
|
-
include
|
|
57
|
-
exclude
|
|
58
|
-
verbose
|
|
59
|
-
max_runtime_secs
|
|
60
|
-
stopping_metric
|
|
61
|
-
stopping_tolerance
|
|
62
|
-
max_models
|
|
63
|
-
custom_config_file
|
|
65
|
+
task_type="Default",
|
|
66
|
+
include=None,
|
|
67
|
+
exclude=None,
|
|
68
|
+
verbose=0,
|
|
69
|
+
max_runtime_secs=None,
|
|
70
|
+
stopping_metric=None,
|
|
71
|
+
stopping_tolerance=None,
|
|
72
|
+
max_models=None,
|
|
73
|
+
custom_config_file=None,
|
|
74
|
+
is_fraud=False,
|
|
75
|
+
is_churn=False,
|
|
64
76
|
**kwargs):
|
|
65
77
|
"""
|
|
66
78
|
DESCRIPTION:
|
|
@@ -72,23 +84,23 @@ class AutoML:
|
|
|
72
84
|
machine learning models, by automating some of the more time-consuming
|
|
73
85
|
and labor-intensive tasks involved in the process.
|
|
74
86
|
|
|
75
|
-
AutoML is designed to handle
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
AutoML by default, trains using all model algorithms applicable for the
|
|
81
|
-
task type problem. For example, "glm" and "svm" does not support multi-class
|
|
82
|
-
classification problem. Thus, only 3 models are available to train in case
|
|
83
|
-
of multi-class classification problem, by default. While for regression and
|
|
84
|
-
binary classification problem, all 5 models i.e., "glm", "svm", "knn",
|
|
85
|
-
"decision_forest", "xgboost" are available to train by default.
|
|
86
|
-
|
|
87
|
-
AutoML provides functionality to use specific model algorithms for training.
|
|
88
|
-
User can provide either include or exclude model. In case of include,
|
|
89
|
-
only specified models are trained while for exclude, all models except
|
|
90
|
-
specified model are trained.
|
|
87
|
+
AutoML is designed to handle regression, classification (binary and multiclass),
|
|
88
|
+
and clustering tasks. The user can specify the task type to apply regression,
|
|
89
|
+
classification, or clustering algorithms on the provided dataset. By default,
|
|
90
|
+
AutoML will automatically decide whether the task is regression or classification.
|
|
91
|
+
For clustering, it is mandatory for the user to specify the task type explicitly.
|
|
91
92
|
|
|
93
|
+
AutoML can also be run specifically for fraud detection and churn prediction
|
|
94
|
+
scenarios (binary classification). By setting the available parameters, users
|
|
95
|
+
can leverage specialized workflows and model selection tailored for these usecases,
|
|
96
|
+
enabling more effective handling of fraud and churn-related datasets.
|
|
97
|
+
|
|
98
|
+
By default, AutoML trains using all model algorithms that are applicable to
|
|
99
|
+
the selected task type. Beside that, AutoML also provides functionality to use
|
|
100
|
+
specific model algorithms for training. User can provide either include
|
|
101
|
+
or exclude model. In case of include, only specified models are trained
|
|
102
|
+
while for exclude, all models except specified model are trained.
|
|
103
|
+
|
|
92
104
|
AutoML also provides an option to customize the processes within feature
|
|
93
105
|
engineering, data preparation and model training phases. User can customize
|
|
94
106
|
the processes by passing the JSON file path in case of custom run. It also
|
|
@@ -100,20 +112,23 @@ class AutoML:
|
|
|
100
112
|
|
|
101
113
|
PARAMETERS:
|
|
102
114
|
task_type:
|
|
103
|
-
|
|
104
|
-
Specifies the
|
|
105
|
-
|
|
106
|
-
|
|
115
|
+
Required when clustering data is involved otherwise optional.
|
|
116
|
+
Specifies the type of machine learning task for AutoML: regression, classification, or
|
|
117
|
+
clustering. If set to "Default", AutoML will automatically determine whether to perform
|
|
118
|
+
regression or classification based on the target column. For clustering tasks, user must
|
|
119
|
+
explicitly set this parameter to "Clustering".
|
|
107
120
|
Default Value: "Default"
|
|
108
|
-
Permitted Values: "Regression", "Classification", "Default"
|
|
121
|
+
Permitted Values: "Regression", "Classification", "Default", "Clustering"
|
|
109
122
|
Types: str
|
|
110
123
|
|
|
111
124
|
include:
|
|
112
125
|
Optional Argument.
|
|
113
126
|
Specifies the model algorithms to be used for model training phase.
|
|
114
|
-
By default, all 5 models
|
|
115
|
-
classification problem, while only 3
|
|
116
|
-
|
|
127
|
+
By default, all 5 models ("glm", "svm", "knn", "decision_forest", "xgboost") are
|
|
128
|
+
used for training for regression and binary classification problem, while only 3
|
|
129
|
+
models ("knn", "decision_forest", "xgboost") are used for multi-class.
|
|
130
|
+
For clustering, only 2 models ("KMeans", "GaussianMixture") are used.
|
|
131
|
+
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost", "KMeans", "GaussianMixture"
|
|
117
132
|
Types: str OR list of str
|
|
118
133
|
|
|
119
134
|
|
|
@@ -121,7 +136,7 @@ class AutoML:
|
|
|
121
136
|
Optional Argument.
|
|
122
137
|
Specifies the model algorithms to be excluded from model training phase.
|
|
123
138
|
No model is excluded by default.
|
|
124
|
-
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
|
|
139
|
+
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost", "KMeans", "GaussianMixture"
|
|
125
140
|
Types: str OR list of str
|
|
126
141
|
|
|
127
142
|
verbose:
|
|
@@ -143,15 +158,14 @@ class AutoML:
|
|
|
143
158
|
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
144
159
|
Specifies the stopping metrics for stopping tolerance in model training.
|
|
145
160
|
Permitted Values:
|
|
146
|
-
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
|
|
147
|
-
"
|
|
148
|
-
"ME", "EV", "MPD", "MGD"
|
|
161
|
+
* For task_type "Regression": "R2", "MAE", "MSE", "MSLE", "MAPE", "MPE",
|
|
162
|
+
"RMSE", "RMSLE", "ME", "EV", "MPD", "MGD"
|
|
149
163
|
|
|
150
|
-
* For task_type "Classification":
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
164
|
+
* For task_type "Classification": "MICRO-F1", "MACRO-F1", "MICRO-RECALL", "MACRO-RECALL",
|
|
165
|
+
"MICRO-PRECISION", "MACRO-PRECISION", "WEIGHTED-PRECISION",
|
|
166
|
+
"WEIGHTED-RECALL", "WEIGHTED-F1", "ACCURACY"
|
|
167
|
+
|
|
168
|
+
* For task_type "Clustering": "SILHOUETTE", "CALINSKI", "DAVIES"
|
|
155
169
|
Types: str
|
|
156
170
|
|
|
157
171
|
stopping_tolerance:
|
|
@@ -169,6 +183,18 @@ class AutoML:
|
|
|
169
183
|
Specifies the path of JSON file in case of custom run.
|
|
170
184
|
Types: str
|
|
171
185
|
|
|
186
|
+
is_fraud:
|
|
187
|
+
Optional Argument.
|
|
188
|
+
Specifies whether the usecase is for fraud detection.
|
|
189
|
+
Default Value: False
|
|
190
|
+
Types: bool
|
|
191
|
+
|
|
192
|
+
is_churn:
|
|
193
|
+
Optional Argument.
|
|
194
|
+
Specifies whether the usecase is for churn prediction.
|
|
195
|
+
Default Value: False
|
|
196
|
+
Types: bool
|
|
197
|
+
|
|
172
198
|
**kwargs:
|
|
173
199
|
Specifies the additional arguments for AutoML. Below
|
|
174
200
|
are the additional arguments:
|
|
@@ -199,6 +225,14 @@ class AutoML:
|
|
|
199
225
|
Specifies the random seed for reproducibility.
|
|
200
226
|
Default Value: 42
|
|
201
227
|
Types: int
|
|
228
|
+
|
|
229
|
+
imbalance_handling_method:
|
|
230
|
+
Optional Argument.
|
|
231
|
+
Specifies which data imbalance method to use for classification
|
|
232
|
+
problems.
|
|
233
|
+
Default Value: SMOTE
|
|
234
|
+
Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
|
|
235
|
+
Types: str
|
|
202
236
|
|
|
203
237
|
RETURNS:
|
|
204
238
|
Instance of AutoML.
|
|
@@ -218,14 +252,20 @@ class AutoML:
|
|
|
218
252
|
>>> load_example_data("GLMPredict", ["admissions_test", "admissions_train"])
|
|
219
253
|
>>> load_example_data("decisionforestpredict", ["housing_train", "housing_test"])
|
|
220
254
|
>>> load_example_data("teradataml", "iris_input")
|
|
221
|
-
|
|
255
|
+
>>> load_example_data("teradataml", "credit_fraud_dataset")
|
|
256
|
+
>>> load_example_data("teradataml", "bank_churn")
|
|
257
|
+
>>> load_example_data("teradataml", "bank_marketing")
|
|
258
|
+
|
|
222
259
|
# Create teradataml DataFrames.
|
|
223
260
|
>>> admissions_train = DataFrame.from_table("admissions_train")
|
|
224
261
|
>>> admissions_test = DataFrame.from_table("admissions_test")
|
|
225
262
|
>>> housing_train = DataFrame.from_table("housing_train")
|
|
226
263
|
>>> housing_test = DataFrame.from_table("housing_test")
|
|
227
264
|
>>> iris_input = DataFrame.from_table("iris_input")
|
|
228
|
-
|
|
265
|
+
>>> credit_fraud_df = DataFrame.from_table("credit_fraud_dataset")
|
|
266
|
+
>>> churn_df = DataFrame.from_table("bank_churn")
|
|
267
|
+
>>> bank_df = DataFrame.from_table("bank_marketing")
|
|
268
|
+
|
|
229
269
|
# Example 1: Run AutoML for classification problem.
|
|
230
270
|
# Scenario: Predict whether a student will be admitted to a university
|
|
231
271
|
# based on different factors. Run AutoML to get the best
|
|
@@ -307,7 +347,7 @@ class AutoML:
|
|
|
307
347
|
|
|
308
348
|
# Split the data into train and test.
|
|
309
349
|
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
310
|
-
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
350
|
+
>>> iris_train = iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
311
351
|
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
312
352
|
|
|
313
353
|
# Generate custom JSON file
|
|
@@ -372,7 +412,7 @@ class AutoML:
|
|
|
372
412
|
|
|
373
413
|
# Split the data into train and test.
|
|
374
414
|
>>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
|
|
375
|
-
>>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
415
|
+
>>> iris_train = iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
376
416
|
>>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
377
417
|
|
|
378
418
|
# Create instance of AutoML.
|
|
@@ -404,25 +444,133 @@ class AutoML:
|
|
|
404
444
|
|
|
405
445
|
# Run evaluate to get performance metrics using model rank 4.
|
|
406
446
|
>>> performance_metrics = automl_obj.evaluate(iris_test, 4)
|
|
407
|
-
>>> performance_metrics
|
|
447
|
+
>>> performance_metrics
|
|
448
|
+
|
|
449
|
+
# Example 6 : Run AutoML for fraud detection problem.
|
|
450
|
+
# Scenario : Predict whether credit card transaction is Fraud or not.
|
|
451
|
+
|
|
452
|
+
# Split the data into train and test.
|
|
453
|
+
>>> credit_fraud_sample = credit_fraud_df.sample(frac = [0.8, 0.2])
|
|
454
|
+
>>> credit_fraud_train = credit_fraud_sample[credit_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
455
|
+
>>> credit_fraud_test = credit_fraud_sample[credit_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
456
|
+
|
|
457
|
+
# Create instance of AutoML with is_fraud set to True.
|
|
458
|
+
>>> automl_obj = AutoML(is_fraud=True)
|
|
459
|
+
|
|
460
|
+
# Fit the data.
|
|
461
|
+
>>> automl_obj.fit(credit_fraud_train, "Credit_Class")
|
|
462
|
+
|
|
463
|
+
# Display leaderboard.
|
|
464
|
+
>>> automl_obj.leaderboard()
|
|
465
|
+
|
|
466
|
+
# Display best performing model.
|
|
467
|
+
>>> automl_obj.leader()
|
|
468
|
+
|
|
469
|
+
# Run predict on test data using best performing model.
|
|
470
|
+
>>> prediction = automl_obj.predict(credit_fraud_test)
|
|
471
|
+
>>> prediction
|
|
472
|
+
|
|
473
|
+
# Run predict on test data using second best performing model.
|
|
474
|
+
>>> prediction = automl_obj.predict(credit_fraud_test, rank=2)
|
|
475
|
+
>>> prediction
|
|
476
|
+
|
|
477
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
478
|
+
>>> performance_metrics = automl_obj.evaluate(credit_fraud_test)
|
|
479
|
+
>>> performance_metrics
|
|
480
|
+
|
|
481
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
482
|
+
>>> performance_metrics = automl_obj.evaluate(credit_fraud_test, 4)
|
|
483
|
+
>>> performance_metrics
|
|
484
|
+
|
|
485
|
+
# Example 7 : Run AutoML for churn prediction problem.
|
|
486
|
+
# Scenario : Predict whether a customer churn for bank or not.
|
|
487
|
+
|
|
488
|
+
# Split the data into train and test.
|
|
489
|
+
>>> churn_sample = churn_df.sample(frac = [0.8, 0.2])
|
|
490
|
+
>>> churn_train = churn_sample[churn_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
491
|
+
>>> churn_test = churn_sample[chrun_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
492
|
+
|
|
493
|
+
# Create instance of AutoML with is_churn=True
|
|
494
|
+
>>> automl_obj = AutoML(is_churn=True)
|
|
495
|
+
|
|
496
|
+
# Fit the data.
|
|
497
|
+
>>> automl_obj.fit(churn_train, "churn")
|
|
498
|
+
|
|
499
|
+
# Display leaderboard.
|
|
500
|
+
>>> automl_obj.leaderboard()
|
|
501
|
+
|
|
502
|
+
# Display best performing model.
|
|
503
|
+
>>> automl_obj.leader()
|
|
504
|
+
|
|
505
|
+
# Run predict on test data using best performing model.
|
|
506
|
+
>>> prediction = automl_obj.predict(churn_test)
|
|
507
|
+
>>> prediction
|
|
508
|
+
|
|
509
|
+
# Run predict on test data using second best performing model.
|
|
510
|
+
>>> prediction = automl_obj.predict(churn_test, rank=2)
|
|
511
|
+
>>> prediction
|
|
512
|
+
|
|
513
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
514
|
+
>>> performance_metrics = automl_obj.evaluate(churn_test)
|
|
515
|
+
>>> performance_metrics
|
|
516
|
+
|
|
517
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
518
|
+
>>> performance_metrics = automl_obj.evaluate(churn_test, 4)
|
|
519
|
+
>>> performance_metrics
|
|
520
|
+
|
|
521
|
+
# Example 8: Use AutoML for unsupervised clustering task based on bank data.
|
|
522
|
+
# Scenario: Automatically group similar records in the dataset into clusters.
|
|
523
|
+
|
|
524
|
+
# Split the data into train and test.
|
|
525
|
+
>>> bank_sample = bank_df.sample(frac = [0.8, 0.2])
|
|
526
|
+
>>> bank_train = bank_sample[bank_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
527
|
+
>>> bank_test = bank_sample[bank_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
528
|
+
|
|
529
|
+
# Create instance of AutoML.
|
|
530
|
+
>>> automl_obj = AutoML(task_type="Clustering")
|
|
531
|
+
|
|
532
|
+
# Fit the data.
|
|
533
|
+
>>> automl_obj.fit(bank_train)
|
|
534
|
+
|
|
535
|
+
# Display leaderboard.
|
|
536
|
+
>>> automl_obj.leaderboard()
|
|
537
|
+
|
|
538
|
+
# Display best performing model.
|
|
539
|
+
>>> automl_obj.leader()
|
|
540
|
+
|
|
541
|
+
# Run predict on test data using best performing model.
|
|
542
|
+
>>> prediction = automl_obj.predict(bank_test)
|
|
543
|
+
>>> prediction
|
|
544
|
+
|
|
545
|
+
# Run predict on test data using second best performing model.
|
|
546
|
+
>>> prediction = automl_obj.predict(bank_test, rank=2)
|
|
547
|
+
>>> prediction
|
|
408
548
|
"""
|
|
549
|
+
# Validate task_type first before using it in conditional logic
|
|
550
|
+
task_type_arg_info = [["task_type", task_type, True, (str), True, ["Regression", "Classification", "Clustering", "Default"]]]
|
|
551
|
+
_Validators._validate_function_arguments(task_type_arg_info)
|
|
552
|
+
|
|
409
553
|
# Appending arguments to list for validation
|
|
410
554
|
arg_info_matrix = []
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
555
|
+
|
|
556
|
+
if task_type.lower() == 'clustering':
|
|
557
|
+
arg_info_matrix.append(["include", include, True, (str, list), True, AutoMLConstants.CLUSTERING_MODELS.value])
|
|
558
|
+
arg_info_matrix.append(["exclude", exclude, True, (str, list), True, AutoMLConstants.CLUSTERING_MODELS.value])
|
|
559
|
+
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.CLUSTERING_METRICS.value])
|
|
560
|
+
else:
|
|
561
|
+
arg_info_matrix.append(["include", include, True, (str, list), True, AutoMLConstants.SUPERVISED_MODELS.value])
|
|
562
|
+
arg_info_matrix.append(["exclude", exclude, True, (str, list), True, AutoMLConstants.SUPERVISED_MODELS.value])
|
|
563
|
+
if task_type.lower() == "classification" or is_fraud or is_churn:
|
|
564
|
+
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.CLASSIFICATION_METRICS.value])
|
|
565
|
+
elif task_type.lower() == "regression":
|
|
566
|
+
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.REGRESSION_METRICS.value])
|
|
567
|
+
else:
|
|
568
|
+
arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.ALL_METRICS.value])
|
|
569
|
+
|
|
570
|
+
|
|
416
571
|
arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
|
|
417
572
|
arg_info_matrix.append(["max_runtime_secs", max_runtime_secs, True, (int, float)])
|
|
418
|
-
|
|
419
|
-
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
420
|
-
"ME", "EV", "MPD", "MGD",
|
|
421
|
-
'MICRO-F1','MACRO-F1',
|
|
422
|
-
'MICRO-RECALL','MACRO-RECALL',
|
|
423
|
-
'MICRO-PRECISION', 'MACRO-PRECISION',
|
|
424
|
-
'WEIGHTED-PRECISION','WEIGHTED-RECALL',
|
|
425
|
-
'WEIGHTED-F1', 'ACCURACY']])
|
|
573
|
+
|
|
426
574
|
arg_info_matrix.append(["stopping_tolerance", stopping_tolerance, True, (float, int)])
|
|
427
575
|
arg_info_matrix.append(["max_models", max_models, True, (int)])
|
|
428
576
|
arg_info_matrix.append(["custom_config_file", custom_config_file, True, (str), True])
|
|
@@ -430,10 +578,14 @@ class AutoML:
|
|
|
430
578
|
volatile = kwargs.get('volatile', False)
|
|
431
579
|
persist = kwargs.get('persist', False)
|
|
432
580
|
seed = kwargs.get('seed', 42)
|
|
581
|
+
imbalance_handling_method = kwargs.get('imbalance_handling_method', "SMOTE")
|
|
433
582
|
|
|
434
583
|
arg_info_matrix.append(["volatile", volatile, True, (bool)])
|
|
435
584
|
arg_info_matrix.append(["persist", persist, True, (bool)])
|
|
436
585
|
arg_info_matrix.append(["seed", seed, True, (int)])
|
|
586
|
+
arg_info_matrix.append(["imbalance_handling_method", imbalance_handling_method, True, (str), True, ["SMOTE", "ADASYN", "SMOTETomek", "NearMiss"]])
|
|
587
|
+
arg_info_matrix.append(["is_fraud", is_fraud, True, (bool)])
|
|
588
|
+
arg_info_matrix.append(["is_churn", is_churn, True, (bool)])
|
|
437
589
|
|
|
438
590
|
# Validate argument types
|
|
439
591
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
@@ -447,7 +599,24 @@ class AutoML:
|
|
|
447
599
|
_Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric", stopping_tolerance, "stopping_tolerance")
|
|
448
600
|
# Validate lower range for max_models
|
|
449
601
|
_Validators._validate_argument_range(max_models, "max_models", lbound=1, lbound_inclusive=True)
|
|
450
|
-
|
|
602
|
+
# Either is_fraud or is_churn can be used.
|
|
603
|
+
if is_fraud or is_churn:
|
|
604
|
+
_Validators._validate_mutually_exclusive_arguments(is_fraud, "is_fraud", is_churn, "is_churn")
|
|
605
|
+
# Validate mutually exclusive arguments for clustering and is_fraud
|
|
606
|
+
if task_type.lower() == 'clustering' and is_fraud:
|
|
607
|
+
raise TeradataMlException(
|
|
608
|
+
Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH,
|
|
609
|
+
f"task_type={task_type}",
|
|
610
|
+
f"is_fraud={is_fraud}"),
|
|
611
|
+
MessageCodes.CANNOT_USE_TOGETHER_WITH)
|
|
612
|
+
# Validate mutually exclusive arguments for clustering and is_churn
|
|
613
|
+
if task_type.lower() == 'clustering' and is_churn:
|
|
614
|
+
raise TeradataMlException(
|
|
615
|
+
Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH,
|
|
616
|
+
f"task_type = {task_type}",
|
|
617
|
+
f"is_churn = {is_churn}"),
|
|
618
|
+
MessageCodes.CANNOT_USE_TOGETHER_WITH)
|
|
619
|
+
|
|
451
620
|
custom_data = None
|
|
452
621
|
self.auto = True
|
|
453
622
|
# Validate custom file
|
|
@@ -474,27 +643,43 @@ class AutoML:
|
|
|
474
643
|
self.stopping_metric = stopping_metric
|
|
475
644
|
self.stopping_tolerance = stopping_tolerance
|
|
476
645
|
self.max_models = max_models
|
|
477
|
-
self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
|
|
478
646
|
self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
|
|
479
647
|
self._is_fit_called = False
|
|
480
648
|
self._is_load_model_called = False
|
|
481
|
-
|
|
482
|
-
self.table_name_mapping = {}
|
|
649
|
+
|
|
650
|
+
self.table_name_mapping = {}
|
|
483
651
|
# Stores the table name of all intermediate datas
|
|
484
|
-
self._intermediate_table_names={}
|
|
652
|
+
self._intermediate_table_names = {}
|
|
485
653
|
self._auto_dataprep = False
|
|
486
654
|
self._phases = None
|
|
487
655
|
self._progressbar_prefix = "AutoML Running:"
|
|
488
656
|
|
|
657
|
+
self.cluster = self.task_type.lower() == 'clustering'
|
|
658
|
+
self.fraud = is_fraud or kwargs.get("fraud", False)
|
|
659
|
+
self.churn = is_churn or kwargs.get("churn", False)
|
|
660
|
+
|
|
661
|
+
if self.cluster:
|
|
662
|
+
self.model_list = AutoMLConstants.CLUSTERING_MODELS.value
|
|
663
|
+
else:
|
|
664
|
+
self.model_list = AutoMLConstants.SUPERVISED_MODELS.value
|
|
665
|
+
kwargs.pop("churn", None)
|
|
666
|
+
kwargs.pop("fraud", None)
|
|
667
|
+
kwargs.pop("cluster", None)
|
|
668
|
+
self.kwargs = kwargs
|
|
669
|
+
|
|
670
|
+
self.volatile = volatile
|
|
671
|
+
self.persist = persist
|
|
672
|
+
|
|
673
|
+
|
|
489
674
|
@collect_queryband(queryband="AutoML_fit")
|
|
490
675
|
def fit(self,
|
|
491
676
|
data,
|
|
492
|
-
target_column):
|
|
677
|
+
target_column=None):
|
|
493
678
|
"""
|
|
494
679
|
DESCRIPTION:
|
|
495
|
-
Function triggers the AutoML run. It is designed to handle
|
|
496
|
-
|
|
497
|
-
|
|
680
|
+
Function triggers the AutoML run. It is designed to handle regression ,
|
|
681
|
+
classification and clustering tasks depending on the specified "task_type".
|
|
682
|
+
|
|
498
683
|
PARAMETERS:
|
|
499
684
|
data:
|
|
500
685
|
Required Argument.
|
|
@@ -502,7 +687,7 @@ class AutoML:
|
|
|
502
687
|
Types: teradataml Dataframe
|
|
503
688
|
|
|
504
689
|
target_column:
|
|
505
|
-
Required Argument.
|
|
690
|
+
Required Argument. Optional only for clustering tasks.
|
|
506
691
|
Specifies target column of dataset.
|
|
507
692
|
Types: str or ColumnExpression
|
|
508
693
|
|
|
@@ -513,41 +698,49 @@ class AutoML:
|
|
|
513
698
|
TeradataMlException, TypeError, ValueError
|
|
514
699
|
|
|
515
700
|
EXAMPLES:
|
|
516
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
517
|
-
#
|
|
701
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
702
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
703
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
518
704
|
# Perform fit() operation on the "automl_obj".
|
|
519
705
|
|
|
520
|
-
# Example 1:
|
|
706
|
+
# Example 1: Fit AutoML by passing column expression for target column.
|
|
521
707
|
>>> automl_obj.fit(data = housing_train, target_col = housing_train.price)
|
|
522
|
-
|
|
523
|
-
# Example 2: Passing name of target column.
|
|
524
|
-
>>> automl_obj.fit(data = housing_train, target_col = "price")
|
|
525
|
-
"""
|
|
526
708
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
709
|
+
# Example 2: Fit AutoML by passing name of target column.
|
|
710
|
+
>>> automl_obj.fit(data = housing_train, target_col = "price")
|
|
711
|
+
|
|
712
|
+
# Example 3: Fit fraud detection model on credit_fraud_df.
|
|
713
|
+
>>> automl_obj.fit(data=credit_fraud_df, target_column="Credit_Class")
|
|
714
|
+
|
|
715
|
+
# Example 4: Fit churn prediction model on churn_df.
|
|
716
|
+
>>> automl_obj.fit(data=churn_df, target_column="churn")
|
|
717
|
+
|
|
718
|
+
# Example 5: Passing clustering data for training,
|
|
719
|
+
# without specifying target column.
|
|
720
|
+
>>> automl_obj.fit(data = bank_train)
|
|
721
|
+
"""
|
|
536
722
|
|
|
723
|
+
self._is_fit_called = True
|
|
724
|
+
# Prepare argument validation matrix
|
|
725
|
+
arg_info_fit_matrix = [["data", data, False, (DataFrame), True]]
|
|
726
|
+
if not self.cluster:
|
|
727
|
+
# Checking if target column is of type ColumnExpression
|
|
728
|
+
if isinstance(target_column, ColumnExpression):
|
|
729
|
+
target_column = target_column.name
|
|
730
|
+
arg_info_fit_matrix.append(["target_column", target_column, False, (str), True])
|
|
537
731
|
# Validate argument types
|
|
538
732
|
_Validators._validate_function_arguments(arg_info_fit_matrix)
|
|
539
733
|
|
|
540
734
|
# Initializing class variables
|
|
541
735
|
self.data = data
|
|
542
|
-
self.
|
|
543
|
-
|
|
736
|
+
if not self.cluster:
|
|
737
|
+
self.target_column = target_column
|
|
544
738
|
# Checking if include model list is present
|
|
545
739
|
if self.include_model:
|
|
546
740
|
# Converting to list if passed as string
|
|
547
741
|
self.include_model = UtilFuncs._as_list(self.include_model)
|
|
548
742
|
# Updating model list based on include list
|
|
549
743
|
self.model_list = list(set(self.include_model))
|
|
550
|
-
self.model_list = [model.lower() for model in self.model_list]
|
|
551
744
|
|
|
552
745
|
# Checking if exclude model list is present
|
|
553
746
|
if self.exclude_model:
|
|
@@ -555,40 +748,40 @@ class AutoML:
|
|
|
555
748
|
self.exclude_model = UtilFuncs._as_list(self.exclude_model)
|
|
556
749
|
# Updating model list based on exclude list
|
|
557
750
|
self.model_list = list(set(self.model_list) - set(self.exclude_model))
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
if self.
|
|
565
|
-
# if target column is
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
"
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
751
|
+
|
|
752
|
+
# Normalize model names: lowercase for non-cluster, original for cluster
|
|
753
|
+
if self.include_model or self.exclude_model:
|
|
754
|
+
self.model_list = [model if self.cluster else model.lower() for model in self.model_list]
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
if not self.cluster:
|
|
758
|
+
# Checking if target column is present in data
|
|
759
|
+
_Validators._validate_dataframe_has_argument_columns(self.target_column, "target_column", self.data, "df")
|
|
760
|
+
|
|
761
|
+
# Handling default task type
|
|
762
|
+
if self.task_type.casefold() == "default":
|
|
763
|
+
# if target column is having distinct values less than or equal to 20,
|
|
764
|
+
# then it will be mapped to classification problem else regression problem
|
|
765
|
+
if self.data.drop_duplicate(self.target_column).size <= 20:
|
|
766
|
+
print("\nTask type is set to Classification as target column "
|
|
767
|
+
"is having distinct values less than or equal to 20.")
|
|
768
|
+
self.task_type = "Classification"
|
|
769
|
+
else:
|
|
770
|
+
print("\nTask type is set to Regression as target column is "
|
|
771
|
+
"having distinct values greater than 20.")
|
|
772
|
+
self.task_type = "Regression"
|
|
773
|
+
|
|
774
|
+
if self.is_classification_type():
|
|
775
|
+
if self.stopping_metric is not None:
|
|
776
|
+
_Validators._validate_permitted_values(self.stopping_metric, AutoMLConstants.CLASSIFICATION_METRICS.value, "stopping_metric")
|
|
777
|
+
elif self.task_type.lower() == "regression":
|
|
778
|
+
if self.stopping_metric is not None:
|
|
779
|
+
_Validators._validate_permitted_values(self.stopping_metric, AutoMLConstants.REGRESSION_METRICS.value, "stopping_metric")
|
|
584
780
|
else:
|
|
585
781
|
if self.stopping_metric is not None:
|
|
586
|
-
|
|
587
|
-
"MAPE", "MPE", "RMSE", "RMSLE",
|
|
588
|
-
"ME", "EV", "MPD", "MGD"]
|
|
589
|
-
_Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
|
|
782
|
+
_Validators._validate_permitted_values(self.stopping_metric, AutoMLConstants.CLUSTERING_METRICS.value, "stopping_metric")
|
|
590
783
|
|
|
591
|
-
if not self.is_classification_type():
|
|
784
|
+
if not self.is_classification_type() and not self.cluster:
|
|
592
785
|
_Validators._validate_column_type(self.data, self.target_column, 'target_column',
|
|
593
786
|
expected_types=UtilFuncs()._get_numeric_datatypes())
|
|
594
787
|
|
|
@@ -597,33 +790,40 @@ class AutoML:
|
|
|
597
790
|
print("\nReceived below input for customization : ")
|
|
598
791
|
print(json.dumps(self.custom_data, indent=4))
|
|
599
792
|
|
|
600
|
-
# Classification probelm
|
|
601
793
|
task_cls = _Classification
|
|
602
794
|
cls_method = "_classification"
|
|
603
|
-
|
|
795
|
+
if self.fraud:
|
|
796
|
+
task_cls = _AutoSpecific
|
|
797
|
+
cls_method = "fit"
|
|
798
|
+
elif self.churn:
|
|
799
|
+
task_cls = _AutoSpecific
|
|
800
|
+
cls_method = "fit"
|
|
604
801
|
# Regression problem
|
|
605
|
-
|
|
802
|
+
elif self.task_type.casefold() == "regression":
|
|
606
803
|
task_cls = _Regression
|
|
607
804
|
cls_method = "_regression"
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
805
|
+
elif self.cluster:
|
|
806
|
+
task_cls = _Clustering
|
|
807
|
+
cls_method = "_clustering"
|
|
808
|
+
|
|
611
809
|
|
|
810
|
+
# Running AutoML
|
|
811
|
+
clf = task_cls(data=self.data, target_column=self.target_column, custom_data=self.custom_data,
|
|
812
|
+
fraud=self.fraud, churn=self.churn, cluster=self.cluster, **self.kwargs)
|
|
813
|
+
|
|
612
814
|
self.model_info, self.leader_board, self.target_count, self.target_label, \
|
|
613
815
|
self.data_transformation_params, self._intermediate_table_names = getattr(clf, cls_method)(
|
|
614
|
-
model_list
|
|
615
|
-
auto
|
|
616
|
-
verbose
|
|
617
|
-
max_runtime_secs
|
|
618
|
-
stopping_metric
|
|
619
|
-
stopping_tolerance
|
|
620
|
-
max_models
|
|
621
|
-
auto_dataprep
|
|
622
|
-
automl_phases
|
|
623
|
-
progress_prefix
|
|
816
|
+
model_list=self.model_list,
|
|
817
|
+
auto=self.auto,
|
|
818
|
+
verbose=self.verbose,
|
|
819
|
+
max_runtime_secs=self.max_runtime_secs,
|
|
820
|
+
stopping_metric=self.stopping_metric,
|
|
821
|
+
stopping_tolerance=self.stopping_tolerance,
|
|
822
|
+
max_models=self.max_models,
|
|
823
|
+
auto_dataprep=self._auto_dataprep,
|
|
824
|
+
automl_phases=self._phases,
|
|
825
|
+
progress_prefix=self._progressbar_prefix,
|
|
624
826
|
**self.kwargs)
|
|
625
|
-
|
|
626
|
-
|
|
627
827
|
# table_name_mapping stores the table name of all intermediate datas (lasso, rfe, pca)
|
|
628
828
|
# used for training models
|
|
629
829
|
keys_to_extract = ['lasso_train', 'rfe_train', 'pca_train']
|
|
@@ -633,13 +833,14 @@ class AutoML:
|
|
|
633
833
|
# Model Evaluation Phase
|
|
634
834
|
self.m_evaluator = _ModelEvaluator(self.model_info,
|
|
635
835
|
self.target_column,
|
|
636
|
-
self.task_type
|
|
836
|
+
self.task_type,
|
|
837
|
+
cluster=self.cluster)
|
|
637
838
|
|
|
638
839
|
@collect_queryband(queryband="AutoML_predict")
|
|
639
840
|
def predict(self,
|
|
640
841
|
data,
|
|
641
|
-
rank
|
|
642
|
-
use_loaded_models
|
|
842
|
+
rank=1,
|
|
843
|
+
use_loaded_models=False):
|
|
643
844
|
"""
|
|
644
845
|
DESCRIPTION:
|
|
645
846
|
Function generates prediction on data using model rank in
|
|
@@ -673,9 +874,10 @@ class AutoML:
|
|
|
673
874
|
RAISES:
|
|
674
875
|
TeradataMlException, TypeError, ValueError
|
|
675
876
|
|
|
676
|
-
EXAMPLES:
|
|
677
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
678
|
-
#
|
|
877
|
+
EXAMPLES:
|
|
878
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
879
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
880
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
679
881
|
# Perform fit() operation on the "automl_obj".
|
|
680
882
|
# Perform predict() operation on the "automl_obj".
|
|
681
883
|
|
|
@@ -730,97 +932,107 @@ class AutoML:
|
|
|
730
932
|
rank = rank-1
|
|
731
933
|
|
|
732
934
|
# Setting indicator to False if target column doesn't exist
|
|
733
|
-
if self.target_column not in data.columns:
|
|
935
|
+
if self.cluster or self.target_column not in data.columns:
|
|
734
936
|
self.target_column_ind = False
|
|
735
937
|
|
|
736
938
|
# Checking if data is already transformed before or not
|
|
737
939
|
data_node_id = data._nodeid
|
|
940
|
+
|
|
941
|
+
selected_model_info = self.leader_board.iloc[rank]
|
|
942
|
+
feature_selection_method = selected_model_info.get("FEATURE_SELECTION", "pca")
|
|
738
943
|
if not self.table_name_mapping.get(data_node_id):
|
|
739
944
|
# At first data transformation will be performed on raw test data
|
|
740
945
|
# then evaluation will happen.
|
|
741
|
-
self.
|
|
946
|
+
self._transform_data(data, feature_selection_mtd=feature_selection_method)
|
|
742
947
|
else:
|
|
743
948
|
print("\nSkipping data transformation as data is already transformed.")
|
|
744
|
-
|
|
949
|
+
|
|
745
950
|
# Generating prediction
|
|
746
|
-
pred = self.m_evaluator.model_evaluation(rank
|
|
747
|
-
table_name_mapping
|
|
748
|
-
data_node_id
|
|
749
|
-
target_column_ind
|
|
750
|
-
|
|
951
|
+
pred = self.m_evaluator.model_evaluation(rank=rank,
|
|
952
|
+
table_name_mapping=self.table_name_mapping,
|
|
953
|
+
data_node_id=data_node_id,
|
|
954
|
+
target_column_ind=self.target_column_ind,
|
|
955
|
+
is_predict=True)
|
|
956
|
+
|
|
751
957
|
# Checking if problem type is classification and target label is present.
|
|
752
|
-
if
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
958
|
+
if not self.cluster:
|
|
959
|
+
self._display_target_column_mapping()
|
|
960
|
+
|
|
961
|
+
# Renaming probability column if any
|
|
962
|
+
prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
|
|
963
|
+
if len(prob_lst) > 0:
|
|
964
|
+
rename_dict = {}
|
|
965
|
+
for col in pred.result.columns:
|
|
966
|
+
if col not in prob_lst:
|
|
967
|
+
rename_dict[col] = getattr(pred.result, col)
|
|
968
|
+
else:
|
|
969
|
+
indx = int(col.split('_')[1])
|
|
970
|
+
rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
|
|
971
|
+
rename_dict['drop_columns'] = True
|
|
972
|
+
pred.result = pred.result.assign(**rename_dict)
|
|
973
|
+
|
|
974
|
+
print("\nPrediction : ")
|
|
975
|
+
print(pred.result)
|
|
976
|
+
|
|
977
|
+
if self.target_column_ind:
|
|
978
|
+
prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
|
|
979
|
+
probability_column = 'prob_1'
|
|
980
|
+
# Displaying confusion matrix and ROC-AUC for classification problem
|
|
981
|
+
if self.is_classification_type():
|
|
982
|
+
print_data = lambda data: print(data) if _is_terminal() else display(data)
|
|
983
|
+
# Displaying ROC-AUC for binary classification
|
|
984
|
+
if self.target_count == 2:
|
|
985
|
+
fit_params = {
|
|
986
|
+
"probability_column" : probability_column,
|
|
987
|
+
"observation_column" : self.target_column,
|
|
988
|
+
"positive_class" : "1",
|
|
989
|
+
"data" : pred.result
|
|
990
|
+
}
|
|
991
|
+
# ROC can fail if the data is imbalanced. to handle it,
|
|
992
|
+
# we are skipping ROC calculation and giving warning.
|
|
993
|
+
try:
|
|
994
|
+
# Fitting ROC
|
|
995
|
+
roc_out = ROC(**fit_params)
|
|
996
|
+
print("\nROC-AUC : ")
|
|
997
|
+
print_data(roc_out.result)
|
|
998
|
+
print_data(roc_out.output_data)
|
|
999
|
+
except TeradataMlException as e:
|
|
1000
|
+
msg = f"ROC fitting skipped: {e}"
|
|
1001
|
+
warnings.warn(message=msg, stacklevel=2)
|
|
763
1002
|
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
pred
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
if self.target_column_ind:
|
|
784
|
-
prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
|
|
785
|
-
probability_column = 'prob_1'
|
|
786
|
-
pred_target_count = pred.result.drop_duplicate(self.target_column).size
|
|
787
|
-
# Displaying confusion matrix and ROC-AUC for classification problem
|
|
788
|
-
if self.is_classification_type():
|
|
789
|
-
print_data = lambda data: print(data) if _is_terminal() else display(data)
|
|
790
|
-
# Displaying ROC-AUC for binary classification
|
|
791
|
-
if self.target_count == 2 and pred_target_count == 2:
|
|
792
|
-
fit_params = {
|
|
793
|
-
"probability_column" : probability_column,
|
|
794
|
-
"observation_column" : self.target_column,
|
|
795
|
-
"positive_class" : "1",
|
|
796
|
-
"data" : pred.result
|
|
797
|
-
}
|
|
798
|
-
# Fitting ROC
|
|
799
|
-
roc_out = ROC(**fit_params)
|
|
800
|
-
print("\nROC-AUC : ")
|
|
801
|
-
print_data(roc_out.result)
|
|
802
|
-
print_data(roc_out.output_data)
|
|
803
|
-
|
|
804
|
-
# Displaying confusion matrix for binary and multiclass classification
|
|
805
|
-
prediction_df=pred.result.to_pandas()
|
|
806
|
-
target_col = self.target_column
|
|
807
|
-
print("\nConfusion Matrix : ")
|
|
808
|
-
print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
|
|
809
|
-
|
|
1003
|
+
# Displaying confusion matrix for binary and multiclass classification
|
|
1004
|
+
prediction_df = pred.result.to_pandas()
|
|
1005
|
+
target_col = self.target_column
|
|
1006
|
+
print("\nConfusion Matrix : ")
|
|
1007
|
+
print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
|
|
1008
|
+
else:
|
|
1009
|
+
print("\n Cluster Assignment:")
|
|
1010
|
+
pred_cols = pred.columns
|
|
1011
|
+
# Auto-detect cluster prediction column
|
|
1012
|
+
cluster_col = [col for col in pred_cols if "predict" in col.lower()][0]
|
|
1013
|
+
|
|
1014
|
+
# Select and rename for pretty output
|
|
1015
|
+
|
|
1016
|
+
pred = pred.assign(cluster_assignment=getattr(pred, cluster_col))
|
|
1017
|
+
pred = pred.drop(columns=[cluster_col])
|
|
1018
|
+
prediction = pred.select(["id", "cluster_assignment"])
|
|
1019
|
+
# Display result
|
|
1020
|
+
print(prediction)
|
|
810
1021
|
# Returning prediction
|
|
811
|
-
return pred.result
|
|
1022
|
+
return pred.result if not self.cluster else prediction
|
|
812
1023
|
|
|
813
1024
|
@collect_queryband(queryband="AutoML_evaluate")
|
|
814
1025
|
def evaluate(self,
|
|
815
1026
|
data,
|
|
816
|
-
rank
|
|
817
|
-
use_loaded_models
|
|
1027
|
+
rank=1,
|
|
1028
|
+
use_loaded_models=False
|
|
818
1029
|
):
|
|
819
1030
|
"""
|
|
820
1031
|
DESCRIPTION:
|
|
821
1032
|
Function evaluates on data using model rank in leaderboard
|
|
822
1033
|
and generates performance metrics.
|
|
823
1034
|
Note:
|
|
1035
|
+
* AutoCluster does not support evaluate method, so it raises an exception.
|
|
824
1036
|
* If both fit and load method are called before predict, then fit method model will be used
|
|
825
1037
|
for prediction by default unless 'use_loaded_models' is set to True in predict.
|
|
826
1038
|
|
|
@@ -852,8 +1064,9 @@ class AutoML:
|
|
|
852
1064
|
TeradataMlException.
|
|
853
1065
|
|
|
854
1066
|
EXAMPLES:
|
|
855
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
856
|
-
#
|
|
1067
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
1068
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
1069
|
+
# "AutoFraud()" or "AutoChurn()" method.
|
|
857
1070
|
# Perform fit() operation on the "automl_obj".
|
|
858
1071
|
# Perform evaluate() operation on the "automl_obj".
|
|
859
1072
|
|
|
@@ -876,6 +1089,12 @@ class AutoML:
|
|
|
876
1089
|
>>> evaluation = automl_obj.evaluate(admissions_test, rank=3, use_loaded_models=True)
|
|
877
1090
|
>>> evaluation
|
|
878
1091
|
"""
|
|
1092
|
+
# Currently AutoCluster does not support evaluate so raising the exception
|
|
1093
|
+
if self.cluster:
|
|
1094
|
+
raise TeradataMlException(
|
|
1095
|
+
Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
|
|
1096
|
+
MessageCodes.UNSUPPORTED_OPERATION)
|
|
1097
|
+
|
|
879
1098
|
# Raising exception if fit or load model is not called before evaluate
|
|
880
1099
|
_Validators._validate_dependent_method("evaluate", ["fit", "load"],
|
|
881
1100
|
[self._is_fit_called, self._is_load_model_called])
|
|
@@ -907,7 +1126,7 @@ class AutoML:
|
|
|
907
1126
|
|
|
908
1127
|
# Raising exception if target column is not present in data
|
|
909
1128
|
# as it is required for evaluation.
|
|
910
|
-
if self.target_column not in data.columns:
|
|
1129
|
+
if not self.cluster and self.target_column not in data.columns:
|
|
911
1130
|
raise TeradataMlException(
|
|
912
1131
|
Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
|
|
913
1132
|
MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
|
|
@@ -917,47 +1136,41 @@ class AutoML:
|
|
|
917
1136
|
if not self.table_name_mapping.get(data_node_id):
|
|
918
1137
|
# At first data transformation will be performed on raw test data
|
|
919
1138
|
# then evaluation will happen.
|
|
920
|
-
self.
|
|
1139
|
+
self._transform_data(data)
|
|
921
1140
|
else:
|
|
922
1141
|
print("\nSkipping data transformation as data is already transformed.")
|
|
923
1142
|
|
|
924
|
-
metrics = self.m_evaluator.model_evaluation(rank
|
|
1143
|
+
metrics = self.m_evaluator.model_evaluation(rank=rank,
|
|
925
1144
|
table_name_mapping=self.table_name_mapping,
|
|
926
|
-
data_node_id
|
|
927
|
-
get_metrics
|
|
1145
|
+
data_node_id=data_node_id,
|
|
1146
|
+
get_metrics=True,
|
|
1147
|
+
is_predict=False)
|
|
928
1148
|
|
|
929
1149
|
# Checking if problem type is classification and target label is present.
|
|
930
|
-
if
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
print("\nPerformance Metrics : ")
|
|
947
|
-
print(metrics.result)
|
|
948
|
-
if self.is_classification_type():
|
|
949
|
-
print("-"*80)
|
|
950
|
-
print(metrics.output_data)
|
|
1150
|
+
if not self.cluster:
|
|
1151
|
+
self._display_target_column_mapping()
|
|
1152
|
+
|
|
1153
|
+
# Showing performance metrics
|
|
1154
|
+
print("\nPerformance Metrics : ")
|
|
1155
|
+
print(metrics.result)
|
|
1156
|
+
if self.is_classification_type():
|
|
1157
|
+
print("-"*80)
|
|
1158
|
+
print(metrics.output_data)
|
|
1159
|
+
|
|
1160
|
+
# Returning performance metrics
|
|
1161
|
+
return metrics.result
|
|
1162
|
+
else:
|
|
1163
|
+
print("\nClustering Evaluation Metrics : ")
|
|
1164
|
+
print(metrics)
|
|
1165
|
+
return metrics
|
|
951
1166
|
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
verbose = None,
|
|
960
|
-
target_column_ind = None):
|
|
1167
|
+
def _transform_data(self,
|
|
1168
|
+
data,
|
|
1169
|
+
feature_selection_mtd=None,
|
|
1170
|
+
data_params=None,
|
|
1171
|
+
auto=None,
|
|
1172
|
+
verbose=None,
|
|
1173
|
+
target_column_ind=None):
|
|
961
1174
|
"""
|
|
962
1175
|
DESCRIPTION:
|
|
963
1176
|
Function transforms the data based on the data transformation parameters
|
|
@@ -968,7 +1181,13 @@ class AutoML:
|
|
|
968
1181
|
Required Argument.
|
|
969
1182
|
Specifies the dataset to be transformed.
|
|
970
1183
|
Types: teradataml DataFrame
|
|
971
|
-
|
|
1184
|
+
|
|
1185
|
+
feature_selection_mtd:
|
|
1186
|
+
Optional Argument.
|
|
1187
|
+
Specifies the feature selection method to be applied.
|
|
1188
|
+
Default Value: None
|
|
1189
|
+
Types: str
|
|
1190
|
+
|
|
972
1191
|
data_params:
|
|
973
1192
|
Optional Argument.
|
|
974
1193
|
Specifies the data transformation parameters.
|
|
@@ -997,14 +1216,16 @@ class AutoML:
|
|
|
997
1216
|
None
|
|
998
1217
|
"""
|
|
999
1218
|
# Creating instance of DataTransformation
|
|
1000
|
-
data_transform_instance = _DataTransformation(data
|
|
1219
|
+
data_transform_instance = _DataTransformation(data=data,
|
|
1001
1220
|
data_transformation_params=data_params if data_params is not None else \
|
|
1002
1221
|
self.data_transformation_params,
|
|
1003
1222
|
auto=auto if data_params is not None else self.auto,
|
|
1004
1223
|
verbose=verbose if verbose is not None else self.verbose,
|
|
1005
1224
|
target_column_ind=target_column_ind if target_column_ind is not None else \
|
|
1006
1225
|
self.target_column_ind,
|
|
1007
|
-
table_name_mapping=self.table_name_mapping
|
|
1226
|
+
table_name_mapping=self.table_name_mapping,
|
|
1227
|
+
cluster=self.cluster,
|
|
1228
|
+
feature_selection_method=feature_selection_mtd)
|
|
1008
1229
|
|
|
1009
1230
|
# Storing mapping of table names for transformed data
|
|
1010
1231
|
self.table_name_mapping = data_transform_instance.data_transformation()
|
|
@@ -1022,8 +1243,9 @@ class AutoML:
|
|
|
1022
1243
|
TeradataMlException.
|
|
1023
1244
|
|
|
1024
1245
|
EXAMPLES:
|
|
1025
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
1026
|
-
#
|
|
1246
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
1247
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
1248
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
1027
1249
|
# Perform fit() operation on the "automl_obj".
|
|
1028
1250
|
# Generate leaderboard using leaderboard() method on "automl_obj".
|
|
1029
1251
|
>>> automl_obj.leaderboard()
|
|
@@ -1046,8 +1268,9 @@ class AutoML:
|
|
|
1046
1268
|
TeradataMlException.
|
|
1047
1269
|
|
|
1048
1270
|
EXAMPLES:
|
|
1049
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
1050
|
-
#
|
|
1271
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
1272
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
1273
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
1051
1274
|
# Perform fit() operation on the "automl_obj".
|
|
1052
1275
|
# Generate leaderboard using leaderboard() method on "automl_obj".
|
|
1053
1276
|
# Display best performing model using leader() method on "automl_obj".
|
|
@@ -1095,8 +1318,9 @@ class AutoML:
|
|
|
1095
1318
|
|
|
1096
1319
|
EXAMPLES:
|
|
1097
1320
|
# Example 1: Get hyperparameters of the model using fit models.
|
|
1098
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
1099
|
-
#
|
|
1321
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
1322
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
1323
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
1100
1324
|
# Perform fit() operation on the "automl_obj".
|
|
1101
1325
|
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1102
1326
|
>>> automl_obj = AutoML(task_type="Classification")
|
|
@@ -1104,8 +1328,9 @@ class AutoML:
|
|
|
1104
1328
|
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1105
1329
|
|
|
1106
1330
|
# Example 2: Get hyperparameters of the model using loaded models.
|
|
1107
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
1108
|
-
#
|
|
1331
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
1332
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
1333
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
1109
1334
|
# Load models from the specified table.
|
|
1110
1335
|
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
1111
1336
|
>>> automl_obj = AutoML()
|
|
@@ -1113,8 +1338,9 @@ class AutoML:
|
|
|
1113
1338
|
>>> automl_obj.model_hyperparameters(rank=1)
|
|
1114
1339
|
|
|
1115
1340
|
# Example 3: Get hyperparameters of the model when both fit and load method are called.
|
|
1116
|
-
# Create an instance of the AutoML called "automl_obj"
|
|
1117
|
-
#
|
|
1341
|
+
# Create an instance of the AutoML called "automl_obj" by referring
|
|
1342
|
+
# "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
|
|
1343
|
+
# "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
|
|
1118
1344
|
# Fit the data.
|
|
1119
1345
|
# Load models from the specified table.
|
|
1120
1346
|
# Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
|
|
@@ -1152,7 +1378,9 @@ class AutoML:
|
|
|
1152
1378
|
hyperparams = leaderboard.loc[leaderboard['RANK'] == rank, 'PARAMETERS'].values[0]
|
|
1153
1379
|
|
|
1154
1380
|
# Deserializing hyperparameters
|
|
1155
|
-
|
|
1381
|
+
|
|
1382
|
+
if isinstance(hyperparams, str):
|
|
1383
|
+
hyperparams = ast.literal_eval(hyperparams)
|
|
1156
1384
|
|
|
1157
1385
|
# Removing 'data' from hyperparameters
|
|
1158
1386
|
keys_to_remove = ['input_columns', 'data', 'train_data', 'test_data']
|
|
@@ -1167,7 +1395,8 @@ class AutoML:
 """
 DESCRIPTION:
 Function loads models information from the specified table.
-
+ Note:
+ * AutoCluster does not support load method, so it raises an exception.
 PARAMETERS:
 table_name:
 Required Argument.

@@ -1181,12 +1410,19 @@ class AutoML:
 TeradataMlException.

 EXAMPLES:
- # Create an instance of the AutoML called "obj"
- #
+ # Create an instance of the AutoML called "obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" method.
 >>> obj = AutoML()
 # Load models from the specified table.
 >>> tab = obj.load("model_table")
 """
+ # Currently AutoCluster does not support load so raising the exception
+ if self.cluster:
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
+ MessageCodes.UNSUPPORTED_OPERATION)
+
 # Appending arguments to list for validation
 arg_info_matrix = []
 arg_info_matrix.append(["table_name", table_name, True, (str), True])

@@ -1196,6 +1432,19 @@ class AutoML:

 # Loading models
 self.loaded_models_info = DataFrame(table_name).to_pandas()
+ cols = self.loaded_models_info.columns
+
+ # Scan column names to determine task_type based on presence of "ACCURACY"
+ if any("ACCURACY" in col.upper() for col in cols):
+ self.task_type = "Classification"
+ else:
+ self.task_type = "Regression"
+
+ if not hasattr(self, "m_evaluator") or self.m_evaluator is None:
+ self.m_evaluator = _ModelEvaluator(df=self.loaded_models_info,
+ target_column=self.target_column,
+ task_type=self.task_type,
+ cluster=self.cluster)

 self._load_data_transform_params()

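load() now infers the task type by scanning the column names of the loaded leaderboard: an ACCURACY-style column implies classification, anything else is treated as regression. The same column scan in isolation on a plain pandas frame (the column names are illustrative):

    import pandas as pd

    def infer_task_type(leaderboard: pd.DataFrame) -> str:
        # Classification leaderboards carry accuracy-style metrics; regression ones do not.
        if any("ACCURACY" in col.upper() for col in leaderboard.columns):
            return "Classification"
        return "Regression"

    print(infer_task_type(pd.DataFrame(columns=["RANK", "MODEL_ID", "ACCURACY"])))   # Classification
    print(infer_task_type(pd.DataFrame(columns=["RANK", "MODEL_ID", "R2", "MAE"])))  # Regression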
@@ -1208,8 +1457,6 @@ class AutoML:
 DESCRIPTION:
 Internal Function loads data transformation parameters from the specified table.
 """
- from sklearn.decomposition import PCA
-
 # Getting data transformation row
 data_transform_row = self.loaded_models_info[self.loaded_models_info['RANK'] == -1].iloc[0]

@@ -1236,22 +1483,23 @@ class AutoML:
 data_params[fit_obj_name] = DataFrame(f'{data_params[fit_obj_name]}')

 # Manually deserializing and reconstructing PCA object
-
-
-
-
-
-
-
-
-
-
-
-
-
+ if 'pca_fit_instance' in data_params:
+ load_pca_info = data_params['pca_fit_instance']
+ pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
+ pca.components_ = np.array(load_pca_info['components'])
+ pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
+ pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
+ pca.mean_ = np.array(load_pca_info['mean'])
+ pca.n_components_ = load_pca_info['n_components']
+ pca.noise_variance_ = load_pca_info['noise_variance']
+ pca.singular_values_ = np.array(load_pca_info['singular_values'])
+ pca.feature_names_in_ = data_params['pca_fit_columns']
+ pca.n_features_in_ = len(data_params['pca_fit_columns'])
+
+ data_params['pca_fit_instance'] = pca

 self.loaded_data_transformation_params = data_params
-
+
 def _validate_ranks(self, ranks):
 """
 DESCRIPTION:
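The rewritten loader rebuilds a fitted scikit-learn PCA object attribute by attribute from the values persisted at deploy time, instead of refitting. A self-contained sketch of that round trip, using a locally fitted PCA as a stand-in for the stored payload:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(42)
    X = rng.normal(size=(50, 4))

    # Stand-in for the persisted 'pca_fit_instance' payload.
    fitted = PCA(n_components=2, random_state=42).fit(X)
    saved = {"n_components": fitted.n_components_,
             "components": fitted.components_.tolist(),
             "mean": fitted.mean_.tolist(),
             "explained_variance": fitted.explained_variance_.tolist()}

    # Reconstruct a usable transformer without refitting.
    pca = PCA(n_components=saved["n_components"], random_state=42)
    pca.components_ = np.array(saved["components"])
    pca.mean_ = np.array(saved["mean"])
    pca.explained_variance_ = np.array(saved["explained_variance"])
    pca.n_components_ = saved["n_components"]
    pca.n_features_in_ = len(saved["mean"])

    # transform() only needs mean_ and components_ (whiten=False), so the
    # reconstructed object matches the originally fitted one.
    assert np.allclose(pca.transform(X), fitted.transform(X))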
@@ -1284,16 +1532,42 @@ class AutoML:

 return start_rank, end_rank

+ def _display_target_column_mapping(self):
+ """
+ DESCRIPTION:
+ Internal method to display target column mapping for classification problems.
+ This method displays the mapping between original target column values and
+ their encoded values.
+
+ RETURNS:
+ None
+ """
+ if not self.cluster and self.is_classification_type() and self.target_label is not None:
+ # Displaying target column labels
+ tar_dct = {}
+ print('\nTarget Column Mapping:')
+ # Iterating rows
+ for row in self.target_label.result.itertuples():
+ # Retrieving the category names of encoded target column
+ # row[1] contains the orginal name of cateogry
+ # row[2] contains the encoded value
+ if row[1] != 'TD_CATEGORY_COUNT':
+ tar_dct[row[1]] = row[2]
+
+ for key, value in tar_dct.items():
+ print(f"{key}: {value}")
+
 @collect_queryband(queryband="AutoML_deploy")
 def deploy(self,
 table_name,
- top_n
- ranks
+ top_n=3,
+ ranks=None
 ):
 """
 DESCRIPTION:
 Function saves models to the specified table name.
 Note:
+ * AutoCluster does not support deploy method, so it raises an exception.
 * If 'ranks' is provided, specified models in 'ranks' will be saved
 and ranks will be reassigned to specified models based
 on the order of the leaderboard, non-specified models will be ignored.

@@ -1327,8 +1601,9 @@ class AutoML:
 TeradataMlException.

 EXAMPLES:
- # Create an instance of the AutoML called "obj"
- #
+ # Create an instance of the AutoML called "obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" method.
 >>> obj = AutoML(task_type="Classification")
 >>> obj.fit(data = data, target_column = target_column)

@@ -1344,6 +1619,11 @@ class AutoML:
 # Save models based on specified rank range to the specified table.
 >>> obj.deploy("model_table", ranks=range(2,6))
 """
+ # Currently AutoCluster does not support deploy so raising the exception
+ if self.cluster:
+ raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
+ MessageCodes.UNSUPPORTED_OPERATION)
+
 # raise Error if fit is not called
 _Validators._validate_dependent_method("deploy", "fit", self._is_fit_called)

@@ -1390,13 +1670,14 @@ class AutoML:
 # Example: {'lasso': 'ml__survived_lasso_1717475362789542',
 # 'rfe': 'ml__survived_rfe_1717474570567062',
 # 'pca': 'ml__survived_pca_1717475375119752'}
- fs_to_data_dict ={fs:self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs, \
+ fs_to_data_dict = {fs:self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs, \
 'DATA_TABLE'].iloc[0] for fs in feature_selections}

 # Saving temporary training data to permanent table
 # We are replacing DATA_TABLE with permanent table name in model_info
 for key, val in fs_to_data_dict.items():
-
+ prefix = 'cluster_{}'.format(key) if self.cluster else '{}_{}'.format(self.target_column, key)
+ per_name = self._create_per_result_table(prefix=prefix,
 persist_result_table=val)
 fs_to_data_dict[key] = per_name

@@ -1407,10 +1688,22 @@ class AutoML:
 if ranks is None or len(ranks) == 0:
 # Saving only top 'top_n' models
 for index, row in self.model_info.iterrows():
+ model_id = row['MODEL_ID']
+ result_table = row['RESULT_TABLE']
+
+ if result_table is None:
+ print(f" Skipping model {model_id} because RESULT_TABLE is None.")
+ continue
+
+ if self.cluster:
+ prefix = f"cluster_{model_id}"
+ else:
+ prefix = f"{self.target_column}_{model_id}"
+
 if index < top_n:
 self.model_info.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
 if not persist:
- per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
+ per_name = self._create_per_result_table(prefix= 'cluster_{}'.format(row['MODEL_ID']) if self.cluster else '{}_{}'.format(self.target_column, row['MODEL_ID']),
 persist_result_table=row['RESULT_TABLE'])
 self.model_info.loc[index, 'RESULT_TABLE'] = per_name
 else:

@@ -1430,7 +1723,8 @@ class AutoML:
 sv_models.loc[index, 'RANK'] = index + 1
 sv_models.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
 if not persist:
-
+ prefix = 'cluster_{}'.format(key) if self.cluster else '{}_{}'.format(self.target_column, key)
+ per_name = self._create_per_result_table(prefix=prefix,
 persist_result_table=row['RESULT_TABLE'])
 sv_models.loc[index, 'RESULT_TABLE'] = per_name

@@ -1439,6 +1733,9 @@ class AutoML:

 # Saving data transformation parameters to the specified table
 sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
+
+ if "PARAMETERS" in sv_models.columns:
+ sv_models["PARAMETERS"] = sv_models["PARAMETERS"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)

 copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
 'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})
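Before writing the model table, deploy() now converts any dict-valued PARAMETERS cells to JSON strings so they fit the VARCHAR column declared for copy_to_sql. The same normalization in isolation on a plain pandas frame (the sample rows are made up):

    import json
    import pandas as pd

    sv_models = pd.DataFrame({
        "MODEL_ID": ["xgboost_1", "glm_2"],
        "PARAMETERS": [{"max_depth": 6}, "{'family': 'BINOMIAL'}"],
    })

    # Dicts become JSON text; values that are already strings pass through untouched.
    if "PARAMETERS" in sv_models.columns:
        sv_models["PARAMETERS"] = sv_models["PARAMETERS"].apply(
            lambda x: json.dumps(x) if isinstance(x, dict) else x)

    print(sv_models["PARAMETERS"].tolist())
    # ['{"max_depth": 6}', "{'family': 'BINOMIAL'}"]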
@@ -1477,7 +1774,6 @@ class AutoML:
 volatile=False)
 return table_name

-
 def _deploy_data_transformation_params(self):
 """
 DESCRIPTION:

@@ -1538,7 +1834,7 @@ class AutoML:
 data_params[aml_step_name] = val._table_name
 else:
 per_name = self._create_per_result_table(prefix='{}'.format(aml_step_name),
- persist_result_table=
+ persist_result_table=val._table_name)
 data_params[aml_step_name] = per_name
 elif isinstance(val, dict) and 'fit_obj' in aml_step_name:
 for key, val in val.items():

@@ -1548,7 +1844,7 @@ class AutoML:
 data_params[aml_step_name][key] = val._table_name
 else:
 per_name = self._create_per_result_table(prefix='{}'.format(key),
- persist_result_table=
+ persist_result_table=val._table_name)
 data_params[aml_step_name][key] = per_name
 elif aml_step_name == 'pca_fit_instance':
 # Serializing PCA object

@@ -1629,14 +1925,22 @@ class AutoML:
 fs = self.loaded_models_info.loc[rank, 'FEATURE_SELECTION']

 # Checking task type
+ if 'SILHOUETTE' in self.loaded_models_info.columns or self.cluster:
+ task_type = 'Clustering'
 if 'R2' in self.loaded_models_info.columns:
- task_type='Regression'
+ task_type = 'Regression'
 else:
- task_type='Classification'
+ task_type = 'Classification'

 # Model names mapping to Analytic Functions
-
-
+ if self.cluster:
+ func_map = {
+ 'KMeans': lambda params: skl.KMeans(**params),
+ 'GaussianMixture': lambda params: skl.GaussianMixture(**params)
+ }
+ else:
+ func_map = {
+ 'XGBOOST': lambda params: XGBoost(**params),
 'GLM': lambda params: GLM(**params),
 'SVM': lambda params: SVM(**params),
 'DECISIONFOREST': lambda params: DecisionForest(**params),
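The predict path now keeps two constructor maps, one for clustering estimators and one for the supervised in-database functions, and picks between them via the cluster flag. A stripped-down sketch of that dispatch idea with stand-in callables (the builders here are placeholders, not the teradataml analytic functions):

    def build_model(model_name, parameters, cluster=False):
        # Each entry defers construction until the stored parameters are known.
        cluster_map = {
            "KMEANS": lambda p: ("KMeans", p),
            "GAUSSIANMIXTURE": lambda p: ("GaussianMixture", p),
        }
        supervised_map = {
            "XGBOOST": lambda p: ("XGBoost", p),
            "GLM": lambda p: ("GLM", p),
            "SVM": lambda p: ("SVM", p),
        }
        func_map = cluster_map if cluster else supervised_map
        try:
            return func_map[model_name.upper()](parameters)
        except KeyError:
            raise ValueError(f"Unsupported model: {model_name}") from None

    print(build_model("KMeans", {"n_clusters": 3}, cluster=True))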
@@ -1651,32 +1955,37 @@ class AutoML:
 print(f"Feature Selection: {fs}")

 # Generating evaluation parameters
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ if not self.cluster:
+ eval_params = _ModelTraining._eval_params_generation(model_name,
+ parameters['response_column'],
+ task_type)
+ if task_type == 'Classification':
+ eval_params['output_responses'] = parameters['output_responses']
+
+ # Checking if response column is present in test data
+ if parameters['response_column'] not in test_data.columns:
+ # Checking if output type is evaluation
+ if output_type == 'evaluation':
+ # Response column is rqeuired for evaluation, raise error if not present
+ raise ValueError(f"Response column '{parameters['response_column']}' is not present in test data for evaluation.")
+ eval_params.pop('accumulate', None)
+ reponse_col_present = False
+ else:
+ reponse_col_present = True
 else:
-
+ eval_params = {}
+ reponse_col_present = False

 # Checking if data is already transformed before or not
 data_node_id = test_data._nodeid
 if not self.table_name_mapping.get(data_node_id):
 # Data transformation will be performed on raw test data
- self.
-
-
-
-
+ self._transform_data(data=test_data,
+ data_params=self.loaded_data_transformation_params,
+ feature_selection_mtd=fs,
+ auto=self.loaded_data_transformation_params['auto_mode'],
+ verbose=0,
+ target_column_ind=reponse_col_present)

 # Extracting test data
 for feature_selection, table_name in self.table_name_mapping[data_node_id].items():

@@ -1684,6 +1993,49 @@ class AutoML:
 test_data = DataFrame(table_name)
 break

+ if self.cluster:
+ # Only PCA is used in clustering
+ X = test_data
+
+ if 'model-obj' in self.loaded_models_info.columns:
+ model = self.loaded_models_info.loc[rank, 'model-obj']
+ else:
+ # Recreate model from parameters
+ if model_name == "KMeans":
+ model = skl.KMeans(**parameters)
+ elif model_name == "GaussianMixture":
+ model = skl.GaussianMixture(**parameters)
+ else:
+ raise ValueError(f"Unsupported clustering model: {model_name}")
+ model.fit(X)
+ result = model.predict(X)
+
+ if output_type != "prediction":
+ silhouette = skl.silhouette_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
+ calinski = skl.calinski_harabasz_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
+ davies = skl.davies_bouldin_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
+ return {
+ "SILHOUETTE": silhouette,
+ "CALINSKI": calinski,
+ "DAVIES": davies
+ }
+
+ pred_cols = result.columns
+ cluster_col = [col for col in pred_cols if "predict" in col.lower()][0]
+
+ result = result.assign(cluster_assignment=getattr(result, cluster_col))
+ result = result.drop(columns=[cluster_col])
+ prediction = result.select(["id", "cluster_assignment"])
+
+ # Visualization
+
+ if hasattr(self, "m_evaluator") and self.m_evaluator:
+ self.m_evaluator.table_name_mapping = self.table_name_mapping
+ self.m_evaluator.data_node_id = list(self.table_name_mapping.keys())[0]
+
+
+ return prediction
+
 if model_name == 'KNN':
 train_data = DataFrame(self.loaded_models_info.loc[rank, 'DATA_TABLE'])

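When the loaded model is a clustering estimator and the caller asks for evaluation, the branch above reports silhouette, Calinski-Harabasz and Davies-Bouldin scores. A minimal local sketch of those three metrics with scikit-learn on in-memory arrays (the teradataml path operates on database tables instead):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import (silhouette_score,
                                 calinski_harabasz_score,
                                 davies_bouldin_score)

    rng = np.random.default_rng(0)
    X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(5, 1, (100, 2))])

    # Cluster assignments stand in for the predicted labels column.
    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

    print({"SILHOUETTE": silhouette_score(X, labels),
           "CALINSKI": calinski_harabasz_score(X, labels),
           "DAVIES": davies_bouldin_score(X, labels)})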
@@ -1723,6 +2075,16 @@ class AutoML:
 if reponse_col_present and output_type != 'prediction':
 return metrics

+ if not self.cluster and hasattr(self, "m_evaluator") and self.m_evaluator:
+ permitted_models = ["XGBOOST", "DECISIONFOREST"]
+ if model_name.upper() in permitted_models and output_type == 'prediction':
+ print("\nApplying SHAP for Model Interpretation (Load)...")
+ self.m_evaluator.table_name_mapping = self.table_name_mapping
+ self.m_evaluator.data_node_id = list(self.table_name_mapping.keys())[0]
+
+ self.m_evaluator._apply_shap(rank, isload =True)
+ else:
+ print(f"\nShap is not applicable for {model_name}")
 # Return prediction, when output type is prediction
 return predictions if model_name == 'KNN' else predictions.result

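The loaded-model path only attempts SHAP interpretation for the two tree ensembles (XGBOOST, DECISIONFOREST) and only when predictions are requested; every other combination just prints a notice. The gating itself reduces to a membership check, sketched here with a placeholder standing in for the evaluator's SHAP hook:

    PERMITTED_MODELS = {"XGBOOST", "DECISIONFOREST"}

    def maybe_explain(model_name, output_type, explain):
        # 'explain' is a stand-in for the internal _apply_shap call.
        if model_name.upper() in PERMITTED_MODELS and output_type == "prediction":
            print("Applying SHAP for Model Interpretation (Load)...")
            explain()
        else:
            print(f"Shap is not applicable for {model_name}")

    maybe_explain("XGBoost", "prediction", explain=lambda: None)
    maybe_explain("KNN", "prediction", explain=lambda: None)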
@@ -1749,8 +2111,9 @@ class AutoML:
 TeradataMlException.

 EXAMPLES:
- # Create an instance of the AutoML called "obj"
- #
+ # Create an instance of the AutoML called "obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" method.
 >>> obj = AutoML()
 # Remove saved models from the specified table.
 >>> obj.remove_saved_models("model_table")

@@ -1812,8 +2175,9 @@ class AutoML:
 TeradataMlException.

 EXAMPLES:
- # Create an instance of the AutoML called "obj"
- #
+ # Create an instance of the AutoML called "obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
 # 'persist' argument must be set to True in the AutoML object.
 >>> obj = AutoML(verbose=2, max_models=10, persist=True)

@@ -1940,8 +2304,8 @@ class AutoML:
 TeradataMlException.

 EXAMPLES:
- # Import either of AutoML or AutoClassifier or AutoRegressor or
- # from teradataml.
+ # Import either of "AutoML" or "AutoClassifier" or "AutoRegressor" or
+ # or "AutoFraud" or "AutoChurn" or "AutoDataPrep" from teradataml.
 >>> from teradataml import AutoML
 >>> from teradataml import DataFrame
 >>> load_example_data("teradataml", "titanic")

@@ -1969,10 +2333,10 @@ class AutoML:
 ... length = 20,
 ... breadth = 15)
 """
- _FeatureExplore._visualize(**kwargs)
+ _FeatureExplore._visualize(**kwargs)

 @staticmethod
- def generate_custom_config(file_name
+ def generate_custom_config(file_name="custom", cluster=False):
 """
 DESCRIPTION:
 Function generates custom JSON file containing user customized input under current

@@ -1985,12 +2349,20 @@ class AutoML:
 with extension. Extension '.json' is automatically added to specified file name.
 Default Value: "custom"
 Types: str
+
+ cluster:
+ Optional Argument.
+ Specifies whether to generate configuration for clustering tasks.
+ When set to True, generates clustering-specific configuration options.
+ Default Value: False
+ Types: bool

 RETURNS:
 None

 EXAMPLES:
- # Import either of AutoML or AutoClassifier or AutoRegressor
+ # Import either of "AutoML" or "AutoClassifier" or "AutoRegressor" or
+ # or "AutoFraud" or "AutoChurn" or "AutoCluster" from teradataml.
 # As per requirement, generate json file using generate_custom_config() method.

 # Generate a default file named "custom.json" file using either of below options.

@@ -1999,6 +2371,12 @@ class AutoML:
 >>> AutoClassifier.generate_custom_config()
 or
 >>> AutoRegressor.generate_custom_config()
+ or
+ >>> AutoFraud.generate_custom_config()
+ or
+ >>> AutoChurn.generate_custom_config()
+ or
+ >>> AutoCluster.generate_custom_config()
 # The above code will generate "custom.json" file under the current working directory.

 # Generate different file name using "file_name" argument.

@@ -2011,7 +2389,7 @@ class AutoML:

 """
 # Intializing class
- generator = _GenerateCustomJson()
+ generator = _GenerateCustomJson(cluster=cluster)
 # Generating custom JSON data
 data = generator._generate_custom_json()
 # Converting to JSON

@@ -2022,13 +2400,13 @@ class AutoML:
 file.write(custom_json)
 print(f"\n'{json_file}' file is generated successfully under the current working directory.")

-
 class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _ModelTraining):

 def __init__(self,
 data,
 target_column,
- custom_data
+ custom_data=None,
+ **kwargs):
 """
 DESCRIPTION:
 Function initializes the data, target column for Regression.

@@ -2052,16 +2430,17 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
 self.data = data
 self.target_column = target_column
 self.custom_data = custom_data
-
-
+
+ super().__init__(data=data, target_column=target_column, custom_data=custom_data, **kwargs)
+
 def _regression(self,
 model_list=None,
- auto
- verbose
- max_runtime_secs
- stopping_metric
- stopping_tolerance
- max_models
+ auto=False,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
 **kwargs):
 """
 DESCRIPTION:

@@ -2121,8 +2500,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
 results are garbage collected at the end of the
 session.
 Default Value: False
- Types: bool
-
+ Types: bool
+
 seed:
 Optional Argument.
 Specifies the random seed for reproducibility.

@@ -2132,21 +2511,22 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
 RETURNS:
 a tuple containing, model information and leaderboard.
 """
-
 # Feature Exploration Phase
 _FeatureExplore.__init__(self,
- data
- target_column
-
+ data=self.data,
+ target_column=self.target_column,
+ custom_data=self.custom_data,
+ verbose=verbose,
+ **kwargs)
 if verbose > 0:
 self._exploration(**kwargs)
 # Feature Engineering Phase
 _FeatureEngineering.__init__(self,
- data
- target_column
- model_list
- verbose
- custom_data
+ data=self.data,
+ target_column=self.target_column,
+ model_list=model_list,
+ verbose=verbose,
+ custom_data=self.custom_data,
 **kwargs)
 # Start time
 start_time = time.time()

@@ -2155,13 +2535,13 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model

 # Data preparation Phase
 _DataPreparation.__init__(self,
- data
- target_column
- verbose
- excluded_columns
- custom_data
- data_transform_dict
- data_mapping
+ data=self.data,
+ target_column=self.target_column,
+ verbose=verbose,
+ excluded_columns=excluded_columns,
+ custom_data=self.custom_data,
+ data_transform_dict=data_transformation_params,
+ data_mapping=data_mapping,
 **kwargs)
 features, data_transformation_params,\
 data_mapping = self.data_preparation(auto)

@@ -2185,19 +2565,19 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model

 # Model Training
 _ModelTraining.__init__(self,
- data
- target_column
- model_list
- verbose
- features
- task_type
- custom_data
+ data=self.data,
+ target_column=self.target_column,
+ model_list=model_list,
+ verbose=verbose,
+ features=features,
+ task_type="Regression",
+ custom_data=self.custom_data,
 **kwargs)
- models_info, leaderboard, target_count = self.model_training(auto
- max_runtime_secs
- stopping_metric
- stopping_tolerance
- max_models
+ models_info, leaderboard, target_count = self.model_training(auto=auto,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models)

 return (models_info, leaderboard,
 target_count, target_label,

@@ -2208,7 +2588,10 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 def __init__(self,
 data,
 target_column,
- custom_data
+ custom_data=None,
+ fraud=False,
+ churn=False,
+ **kwargs):
 """
 DESCRIPTION:
 Function initializes the data, target column for Classification.

@@ -2228,19 +2611,37 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 Optional Argument.
 Specifies json object containing user customized input.
 Types: json object
+
+ fraud:
+ Optional Argument.
+ Specifies whether to run fraud detection or not.
+ Default Value: False
+ Types: bool
+
+ churn:
+ Optional Argument.
+ Specifies whether to run churn prediction or not.
+ Default Value: False
+ Types: bool
 """
 self.data = data
 self.target_column = target_column
 self.custom_data = custom_data

+ self.fraud = fraud
+ self.churn = churn
+
+ super().__init__(data=data, target_column=target_column, custom_data=custom_data,
+ fraud=fraud, churn=churn, **kwargs)
+
 def _classification(self,
 model_list=None,
- auto
- verbose
- max_runtime_secs
- stopping_metric
- stopping_tolerance
- max_models
+ auto=False,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
 **kwargs):
 """
 DESCRIPTION:

@@ -2312,23 +2713,28 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 a tuple containing, model information and leaderboard.
 """

-
 # Feature Exploration Phase
 _FeatureExplore.__init__(self,
- data
- target_column
+ data=self.data,
+ target_column=self.target_column,
+ custom_data=self.custom_data,
 verbose=verbose,
- task_type
+ task_type="classification",
+ fraud=self.fraud,
+ churn=self.churn,
+ **kwargs)
 if verbose > 0:
 self._exploration(**kwargs)
- # Feature
+ # Feature Engineering Phase
 _FeatureEngineering.__init__(self,
- data
- target_column
- model_list
- verbose
- task_type
- custom_data
+ data=self.data,
+ target_column=self.target_column,
+ model_list=model_list,
+ verbose=verbose,
+ task_type="Classification",
+ custom_data=self.custom_data,
+ fraud=self.fraud,
+ churn=self.churn,
 **kwargs)
 # Start time
 start_time = time.time()

@@ -2337,16 +2743,18 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M

 # Data Preparation Phase
 _DataPreparation.__init__(self,
- data
- target_column
- verbose
- excluded_columns
- custom_data
- data_transform_dict
- task_type
- data_mapping
+ data=self.data,
+ target_column=self.target_column,
+ verbose=verbose,
+ excluded_columns=excluded_columns,
+ custom_data=self.custom_data,
+ data_transform_dict=data_transformation_params,
+ task_type="Classification",
+ data_mapping=data_mapping,
+ fraud=self.fraud,
+ churn=self.churn,
 **kwargs)
-
+
 features, data_transformation_params, \
 data_mapping = self.data_preparation(auto)

@@ -2366,26 +2774,42 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 # Setting max_runtime_secs to 60 seconds if it is less than 0
 max_runtime_secs = 60 if max_runtime_secs is not None and \
 max_runtime_secs < 0 else max_runtime_secs
-
+
 # Model training
 _ModelTraining.__init__(self,
- data
- target_column
- model_list
- verbose
- features
- task_type
- custom_data
+ data=self.data,
+ target_column=self.target_column,
+ model_list=self.model_list,
+ verbose=verbose,
+ features=features,
+ task_type="Classification",
+ custom_data=self.custom_data,
+ fraud=self.fraud,
+ churn=self.churn,
 **kwargs)
- models_info, leaderboard, target_count = self.model_training(auto
- max_runtime_secs
- stopping_metric
- stopping_tolerance
- max_models
+ models_info, leaderboard, target_count = self.model_training(auto=auto,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models)

 return (models_info, leaderboard,
 target_count, target_label,
 data_transformation_params, data_mapping)
+
+ def _target_column_details(self):
+ """
+ DESCRIPTION:
+ Internal function displays the target column distribution of Target column/ Response column.
+ """
+ # If data visualization libraries are available
+ if self._check_visualization_libraries() and not _is_terminal():
+ self._display_msg(msg='\nTarget Column Distribution:',
+ show_data=True)
+ plt.figure(figsize=(6, 6))
+ # Ploting a histogram for target column
+ sns.countplot(data=self.data.select([self.target_column]).to_pandas(), x=self.target_column)
+ plt.show()

 def _check_data_imbalance(self,
 data=None):
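The new _target_column_details() helper above draws the target distribution with seaborn's countplot after pulling only the target column to pandas. A local equivalent on an in-memory frame (the column name is illustrative):

    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    df = pd.DataFrame({"survived": [0, 1, 0, 0, 1, 1, 0]})

    plt.figure(figsize=(6, 6))
    sns.countplot(data=df, x="survived")   # one bar per class label
    plt.title("Target Column Distribution")
    plt.show()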
@@ -2468,7 +2892,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 show_data=True)

 # Importing required libraries
- from imblearn.over_sampling import SMOTE
+ from imblearn.over_sampling import SMOTE, ADASYN
+ from imblearn.combine import SMOTETomek
 from imblearn.under_sampling import NearMiss

 st = time.time()

@@ -2480,10 +2905,18 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 # Fetching the minimum target column label count and
 # accordingly setting the number of neighbors for the sampler
 min_label_count = min(data[self.target_column].value_counts())
-
+ self._display_msg(msg=f"\nApplying {self._data_sampling_method}...",
+ progress_bar=self.progress_bar,
+ show_data=True)
+ if self._data_sampling_method.lower() == 'smote':
 n_neighbors = min(5, min_label_count - 1)
 sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=42)
-
+ elif self._data_sampling_method.lower() == 'adasyn':
+ n_neighbors = min(5, min_label_count - 1)
+ sampling_method = ADASYN(n_neighbors=n_neighbors, random_state=42)
+ elif self._data_sampling_method.lower == 'smotetomek':
+ sampling_method = SMOTETomek(random_state=42)
+ elif self._data_sampling_method == 'nearmiss':
 n_neighbors = min(3, min_label_count)
 sampling_method = NearMiss(version=1, n_neighbors=n_neighbors)

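The imbalance-handling block now recognises four samplers and caps the neighbour count by the rarest class size so SMOTE/ADASYN never request more neighbours than exist. A stand-alone sketch of the same selection logic (imbalanced-learn must be installed; note the source keys off a private _data_sampling_method attribute, while this helper takes the name as a plain argument):

    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.combine import SMOTETomek
    from imblearn.under_sampling import NearMiss

    def make_sampler(method: str, min_label_count: int):
        method = method.lower()
        if method == "smote":
            return SMOTE(k_neighbors=min(5, min_label_count - 1), random_state=42)
        if method == "adasyn":
            return ADASYN(n_neighbors=min(5, min_label_count - 1), random_state=42)
        if method == "smotetomek":
            return SMOTETomek(random_state=42)
        if method == "nearmiss":
            return NearMiss(version=1, n_neighbors=min(3, min_label_count))
        raise ValueError(f"Unknown sampling method: {method}")

    sampler = make_sampler("adasyn", min_label_count=12)
    # X_res, y_res = sampler.fit_resample(X, y)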
@@ -2516,11 +2949,11 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
 class AutoRegressor(AutoML):

 def __init__(self,
- include
- exclude
+ include=None,
+ exclude=None,
 verbose=0,
 max_runtime_secs=None,
- stopping_metric=None,
+ stopping_metric=None,
 stopping_tolerance=None,
 max_models=None,
 custom_config_file=None,

@@ -2532,7 +2965,6 @@ class AutoRegressor(AutoML):
 Note:
 * configure.temp_object_type="VT" follows sequential execution.

-
 PARAMETERS:
 include:
 Optional Argument.

@@ -2736,34 +3168,35 @@ class AutoRegressor(AutoML):
 >>> performance_metrics = automl_obj.evaluate(housing_test)
 >>> performance_metrics
 """
-
-
-
-
-
-
-
-
-
-
- super(AutoRegressor, self).__init__(task_type=
- include
- exclude
- verbose=
- max_runtime_secs=
- stopping_metric=
- stopping_tolerance=
- max_models=
- custom_config_file=
- **kwargs)
+
+ # Validate unsupported 'task_type' argument
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
+
+ # Validate unsupported 'is_churn' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
+
+ # Validate unsupported 'is_fraud' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
+
+ super(AutoRegressor, self).__init__(task_type="Regression",
+ include=include,
+ exclude=exclude,
+ verbose=verbose,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models,
+ custom_config_file=custom_config_file,
+ **kwargs)
+
 class AutoClassifier(AutoML):

 def __init__(self,
- include
- exclude
+ include=None,
+ exclude=None,
 verbose=0,
 max_runtime_secs=None,
- stopping_metric=None,
+ stopping_metric=None,
 stopping_tolerance=None,
 max_models=None,
 custom_config_file=None,

@@ -2774,7 +3207,6 @@ class AutoClassifier(AutoML):
 AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
 Note:
 * configure.temp_object_type="VT" follows sequential execution.
-

 PARAMETERS:
 include:

@@ -2867,6 +3299,13 @@ class AutoClassifier(AutoML):
 Specifies the random seed for reproducibility.
 Default Value: 42
 Types: int
+
+ imbalance_handling_method:
+ Optional Argument.
+ Specifies which data imbalance method to use
+ Default Value: SMOTE
+ Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
+ Types: str

 RETURNS:
 Instance of AutoClassifier.
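The newly documented imbalance_handling_method argument selects which of those samplers AutoClassifier applies during data preparation. A hedged usage sketch, assuming the argument is passed through like the other keyword arguments shown in the surrounding examples:

    from teradataml import AutoClassifier

    # Sketch only: SMOTE is documented as the default above, and "ADASYN" is
    # one of the permitted values.
    clf = AutoClassifier(verbose=1, max_models=5,
                         imbalance_handling_method="ADASYN")
    # clf.fit(data=train_df, target_column="churn")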
@@ -3069,23 +3508,1489 @@ class AutoClassifier(AutoML):
|
|
|
3069
3508
|
>>> performance_metrics = automl_obj.evaluate(iris_test, 3)
|
|
3070
3509
|
>>> performance_metrics
|
|
3071
3510
|
"""
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
super(AutoClassifier, self).__init__(task_type=
|
|
3083
|
-
include
|
|
3084
|
-
exclude
|
|
3085
|
-
verbose=
|
|
3086
|
-
max_runtime_secs=
|
|
3087
|
-
stopping_metric=
|
|
3088
|
-
stopping_tolerance=
|
|
3089
|
-
max_models=
|
|
3090
|
-
custom_config_file=
|
|
3511
|
+
|
|
3512
|
+
# Validate unsupported 'task_type' argument
|
|
3513
|
+
_Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
|
|
3514
|
+
|
|
3515
|
+
# Validate unsupported 'is_churn' argument
|
|
3516
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
|
|
3517
|
+
|
|
3518
|
+
# Validate unsupported 'is_fraud' argument
|
|
3519
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
|
|
3520
|
+
|
|
3521
|
+
super(AutoClassifier, self).__init__(task_type="Classification",
|
|
3522
|
+
include=include,
|
|
3523
|
+
exclude=exclude,
|
|
3524
|
+
verbose=verbose,
|
|
3525
|
+
max_runtime_secs=max_runtime_secs,
|
|
3526
|
+
stopping_metric=stopping_metric,
|
|
3527
|
+
stopping_tolerance=stopping_tolerance,
|
|
3528
|
+
max_models=max_models,
|
|
3529
|
+
custom_config_file=custom_config_file,
|
|
3091
3530
|
**kwargs)
|
|
3531
|
+
|
|
3532
|
+
class _AutoSpecific(_Classification):
|
|
3533
|
+
|
|
3534
|
+
def __init__(self,
|
|
3535
|
+
data,
|
|
3536
|
+
target_column,
|
|
3537
|
+
custom_data,
|
|
3538
|
+
fraud=False,
|
|
3539
|
+
churn=False,
|
|
3540
|
+
**kwargs):
|
|
3541
|
+
"""
|
|
3542
|
+
|
|
3543
|
+
DESCRIPTION:
|
|
3544
|
+
Function initializes the data, target colum for AutoFraud.
|
|
3545
|
+
|
|
3546
|
+
PARAMETERS:
|
|
3547
|
+
data:
|
|
3548
|
+
Required Argument.
|
|
3549
|
+
Specifies the input teradataml Dataframe.
|
|
3550
|
+
Types: teradataml Dataframe
|
|
3551
|
+
|
|
3552
|
+
target_column:
|
|
3553
|
+
Required Argument.
|
|
3554
|
+
Specifies the name of the target column in "data".
|
|
3555
|
+
Types: str
|
|
3556
|
+
|
|
3557
|
+
custom_data:
|
|
3558
|
+
Optional Argument.
|
|
3559
|
+
Specifies json object containing user customized input.
|
|
3560
|
+
Types: json object
|
|
3561
|
+
|
|
3562
|
+
fraud:
|
|
3563
|
+
Optional Argument.
|
|
3564
|
+
Specifies whether to run AutoFraud or not.
|
|
3565
|
+
Default Value: False
|
|
3566
|
+
Types: bool
|
|
3567
|
+
|
|
3568
|
+
churn:
|
|
3569
|
+
Optional Argument.
|
|
3570
|
+
Specifies whether to run AutoChurn or not.
|
|
3571
|
+
Default Value: False
|
|
3572
|
+
Types: bool
|
|
3573
|
+
|
|
3574
|
+
**kwargs:
|
|
3575
|
+
Specifies the additional arguments for AutoChurn or AutoFraud. Below
|
|
3576
|
+
are the additional arguments:
|
|
3577
|
+
volatile:
|
|
3578
|
+
Optional Argument.
|
|
3579
|
+
Specifies whether to put the interim results of the
|
|
3580
|
+
functions in a volatile table or not. When set to
|
|
3581
|
+
True, results are stored in a volatile table,
|
|
3582
|
+
otherwise not.
|
|
3583
|
+
Default Value: False
|
|
3584
|
+
Types: bool
|
|
3585
|
+
|
|
3586
|
+
persist:
|
|
3587
|
+
Optional Argument.
|
|
3588
|
+
Specifies whether to persist the interim results of the
|
|
3589
|
+
functions in a table or not. When set to True,
|
|
3590
|
+
results are persisted in a table; otherwise,
|
|
3591
|
+
results are garbage collected at the end of the
|
|
3592
|
+
session.
|
|
3593
|
+
Note:
|
|
3594
|
+
* User is responsible for cleanup of the persisted tables. List of persisted tables
|
|
3595
|
+
in current session can be viewed using get_persisted_tables() method.
|
|
3596
|
+
Default Value: False
|
|
3597
|
+
Types: bool
|
|
3598
|
+
|
|
3599
|
+
seed:
|
|
3600
|
+
Optional Argument.
|
|
3601
|
+
Specifies the random seed for reproducibility.
|
|
3602
|
+
Default Value: 42
|
|
3603
|
+
Types: int
|
|
3604
|
+
"""
|
|
3605
|
+
self.fraud = fraud
|
|
3606
|
+
self.churn = churn
|
|
3607
|
+
|
|
3608
|
+
self.volatile = kwargs.get("volatile", False)
|
|
3609
|
+
self.persist = kwargs.get("persist", False)
|
|
3610
|
+
|
|
3611
|
+
super().__init__(data, target_column, custom_data, fraud=fraud, churn=churn, **kwargs)
|
|
3612
|
+
|
|
3613
|
+
def fit(self, **kwargs):
|
|
3614
|
+
"""
|
|
3615
|
+
DESCRIPTION:
|
|
3616
|
+
Function triggers the AutoFraud or AutoChurn run.
|
|
3617
|
+
PARAMETERS:
|
|
3618
|
+
**kwargs:
|
|
3619
|
+
Specifies the additional arguments for AutoChurn or AutoFraud. Below
|
|
3620
|
+
are the additional arguments:
|
|
3621
|
+
volatile:
|
|
3622
|
+
Optional Argument.
|
|
3623
|
+
Specifies whether to put the interim results of the
|
|
3624
|
+
functions in a volatile table or not. When set to
|
|
3625
|
+
True, results are stored in a volatile table,
|
|
3626
|
+
otherwise not.
|
|
3627
|
+
Default Value: False
|
|
3628
|
+
Types: bool
|
|
3629
|
+
|
|
3630
|
+
persist:
|
|
3631
|
+
Optional Argument.
|
|
3632
|
+
Specifies whether to persist the interim results of the
|
|
3633
|
+
functions in a table or not. When set to True,
|
|
3634
|
+
results are persisted in a table; otherwise,
|
|
3635
|
+
results are garbage collected at the end of the
|
|
3636
|
+
session.
|
|
3637
|
+
Note:
|
|
3638
|
+
* User is responsible for cleanup of the persisted tables. List of persisted tables
|
|
3639
|
+
in current session can be viewed using get_persisted_tables() method.
|
|
3640
|
+
Default Value: False
|
|
3641
|
+
Types: bool
|
|
3642
|
+
|
|
3643
|
+
seed:
|
|
3644
|
+
Optional Argument.
|
|
3645
|
+
Specifies the random seed for reproducibility.
|
|
3646
|
+
Default Value: 42
|
|
3647
|
+
Types: int
|
|
3648
|
+
"""
|
|
3649
|
+
self.model_info, self.leader_board, self.target_count, self.target_label, \
|
|
3650
|
+
self.data_transformation_params, self.table_name_mapping = super()._classification(**kwargs)
|
|
3651
|
+
self.m_evaluator = _ModelEvaluator(self.model_info,
|
|
3652
|
+
self.target_column,
|
|
3653
|
+
self.task_type)
|
|
3654
|
+
return (self.model_info, self.leader_board, self.target_count, self.target_label, \
|
|
3655
|
+
self.data_transformation_params, self.table_name_mapping)
|
|
3656
|
+
|
|
3657
|
+
def _handling_missing_value(self):
|
|
3658
|
+
"""
|
|
3659
|
+
DESCRIPTION:
|
|
3660
|
+
Override function for handling missing values in the dataset specifically for fraud detection.
|
|
3661
|
+
This function ensures that rows are flagged for imputation instead of being dropped while retaining
|
|
3662
|
+
the column-dropping behavior for columns with excessive missing values.
|
|
3663
|
+
"""
|
|
3664
|
+
fn_name = "AutoFraud " if self.fraud else ("AutoChurn " if self.churn else "")
|
|
3665
|
+
|
|
3666
|
+
self._display_msg(msg=f"\nChecking Missing values in dataset using {fn_name}function...",
|
|
3667
|
+
progress_bar=self.progress_bar,
|
|
3668
|
+
show_data=True)
|
|
3669
|
+
start_time = time.time()
|
|
3670
|
+
|
|
3671
|
+
# Number of rows
|
|
3672
|
+
d_size = self.data.shape[0]
|
|
3673
|
+
|
|
3674
|
+
drop_cols = []
|
|
3675
|
+
self.imputation_cols = {}
|
|
3676
|
+
|
|
3677
|
+
# Get count of missing values per column
|
|
3678
|
+
cols_miss_val = self._missing_count_per_column()
|
|
3679
|
+
|
|
3680
|
+
if len(cols_miss_val) != 0:
|
|
3681
|
+
self._display_msg(msg="Columns with their missing values:",
|
|
3682
|
+
col_lst=cols_miss_val,
|
|
3683
|
+
progress_bar=self.progress_bar)
|
|
3684
|
+
|
|
3685
|
+
# Get distinct value in each column
|
|
3686
|
+
self._get_distinct_count()
|
|
3687
|
+
|
|
3688
|
+
# Iterating over columns with missing values
|
|
3689
|
+
for col, val in cols_miss_val.items():
|
|
3690
|
+
|
|
3691
|
+
# Drop column if >60% values are missing
|
|
3692
|
+
if val > 0.6 * d_size:
|
|
3693
|
+
drop_cols.append(col)
|
|
3694
|
+
continue
|
|
3695
|
+
|
|
3696
|
+
# For numerical columns
|
|
3697
|
+
if self.data_types[col] in ['float', 'int']:
|
|
3698
|
+
corr_df = self.data[col].corr(self.data[self.target_column])
|
|
3699
|
+
corr_val = self.data.assign(True, corr_=corr_df)
|
|
3700
|
+
related = next(corr_val.itertuples())[0]
|
|
3701
|
+
|
|
3702
|
+
# Flag column for imputation instead of row deletion
|
|
3703
|
+
if val < 0.02 * d_size and related <= 0.25:
|
|
3704
|
+
self.imputation_cols[col] = val
|
|
3705
|
+
continue
|
|
3706
|
+
|
|
3707
|
+
# For categorical columns
|
|
3708
|
+
elif self.data_types[col] in ['str']:
|
|
3709
|
+
# Flag column for imputation instead of row deletion
|
|
3710
|
+
if val < 0.04 * d_size:
|
|
3711
|
+
self.imputation_cols[col] = val
|
|
3712
|
+
continue
|
|
3713
|
+
# Drop column if unique count >75%
|
|
3714
|
+
elif self.counts_dict[f'count_{col}'] > 0.75 * (d_size - val):
|
|
3715
|
+
drop_cols.append(col)
|
|
3716
|
+
continue
|
|
3717
|
+
|
|
3718
|
+
# Default: Flag column for imputation
|
|
3719
|
+
self.imputation_cols[col] = val
|
|
3720
|
+
|
|
3721
|
+
# Drop columns
|
|
3722
|
+
if len(drop_cols) != 0:
|
|
3723
|
+
self.data = self.data.drop(drop_cols, axis=1)
|
|
3724
|
+
# Store dropped columns in the data transform dictionary
|
|
3725
|
+
self.data_transform_dict['drop_missing_columns'] = drop_cols
|
|
3726
|
+
self._display_msg(msg='Dropping these columns for handling missing values:',
|
|
3727
|
+
col_lst=drop_cols,
|
|
3728
|
+
progress_bar=self.progress_bar)
|
|
3729
|
+
self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
|
|
3730
|
+
data=self.data,
|
|
3731
|
+
progress_bar=self.progress_bar)
|
|
3732
|
+
|
|
3733
|
+
# Display imputation details
|
|
3734
|
+
if len(self.imputation_cols) != 0:
|
|
3735
|
+
# Store imputation columns in the data transform dictionary
|
|
3736
|
+
self.data_transform_dict['imputation_columns'] = self.imputation_cols
|
|
3737
|
+
self._display_msg(msg="Flagging these columns for imputation:",
|
|
3738
|
+
col_lst=list(self.imputation_cols.keys()),
|
|
3739
|
+
progress_bar=self.progress_bar)
|
|
3740
|
+
|
|
3741
|
+
# If no missing values are detected
|
|
3742
|
+
if len(self.imputation_cols) == 0 and len(drop_cols) == 0:
|
|
3743
|
+
self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
|
|
3744
|
+
progress_bar=self.progress_bar)
|
|
3745
|
+
|
|
3746
|
+
end_time = time.time()
|
|
3747
|
+
self._display_msg(msg=f"Total time to find missing values in data using {fn_name}: {{:.2f}} sec ".format(end_time - start_time),
|
|
3748
|
+
progress_bar=self.progress_bar,
|
|
3749
|
+
show_data=True)
|
|
3750
|
+
|
|
3751
|
+
def _impute_missing_value(self):
|
|
3752
|
+
"""
|
|
3753
|
+
DESCRIPTION:
|
|
3754
|
+
Override Function performs the imputation on columns/features with missing values in the dataset
|
|
3755
|
+
using Partition column argument in SimpleImputeFit.
|
|
3756
|
+
"""
|
|
3757
|
+
|
|
3758
|
+
start_time = time.time()
|
|
3759
|
+
self._display_msg(msg="\nImputing Missing Values using SimpleImputeFit partition column...",
|
|
3760
|
+
progress_bar=self.progress_bar,
|
|
3761
|
+
show_data=True)
|
|
3762
|
+
|
|
3763
|
+
if len(self.imputation_cols) != 0:
|
|
3764
|
+
|
|
3765
|
+
# List of columns and imputation Method
|
|
3766
|
+
col_stat, stat = self._impute_helper()
|
|
3767
|
+
## Workaround done for bug https://teradata-pe.atlassian.net/browse/TDAF-15617.
|
|
3768
|
+
## Temporarily commenting out partition_column arguments.
|
|
3769
|
+
fit_obj = SimpleImputeFit(data=self.data,
|
|
3770
|
+
stats_columns=col_stat,
|
|
3771
|
+
#partition_column=self.target_column,
|
|
3772
|
+
stats=stat,
|
|
3773
|
+
volatile=self.volatile,
|
|
3774
|
+
persist=self.persist)
|
|
3775
|
+
|
|
3776
|
+
|
|
3777
|
+
# Storing fit object for imputation in data transform dictionary
|
|
3778
|
+
self.data_transform_dict['imputation_fit_object'] = fit_obj.output
|
|
3779
|
+
#self.data_transform_dict['imputation_partition_column'] = self.target_column
|
|
3780
|
+
sm = SimpleImputeTransform(data=self.data,
|
|
3781
|
+
object=fit_obj.output,
|
|
3782
|
+
#data_partition_column = self.target_column,
|
|
3783
|
+
#object_partition_column = self.target_column,
|
|
3784
|
+
volatile=self.volatile,
|
|
3785
|
+
persist=self.persist)
|
|
3786
|
+
|
|
3787
|
+
self.data = sm.result
|
|
3788
|
+
self._display_msg(msg="Sample of dataset after Imputation:",
|
|
3789
|
+
data=self.data,
|
|
3790
|
+
progress_bar=self.progress_bar)
|
|
3791
|
+
else:
|
|
3792
|
+
self._display_msg(inline_msg="Analysis completed. No imputation required.",
|
|
3793
|
+
progress_bar=self.progress_bar)
|
|
3794
|
+
|
|
3795
|
+
end_time = time.time()
|
|
3796
|
+
self._display_msg(msg="Time taken to perform imputation: {:.2f} sec ".format(end_time - start_time),
|
|
3797
|
+
progress_bar=self.progress_bar,
|
|
3798
|
+
show_data=True)
|
|
3799
|
+
|
|
3800
|
+
def _outlier_detection(self,
|
|
3801
|
+
column_list,
|
|
3802
|
+
outlier_method="percentile",
|
|
3803
|
+
lower_percentile=0.01,
|
|
3804
|
+
upper_percentile=0.99):
|
|
3805
|
+
"""
|
|
3806
|
+
DESCRIPTION:
|
|
3807
|
+
Function detects the outlier in numerical column and display thier percentage.
|
|
3808
|
+
|
|
3809
|
+
PARAMETERS:
|
|
3810
|
+
column_list:
|
|
3811
|
+
Required Argument.
|
|
3812
|
+
Specifies the numeric columns for outlier percentage calculation.
|
|
3813
|
+
Types: str or list of strings (str)
|
|
3814
|
+
|
|
3815
|
+
outlier_method:
|
|
3816
|
+
Required Argument.
|
|
3817
|
+
Specifies the outlier method required for outlier detection.
|
|
3818
|
+
Types: str
|
|
3819
|
+
Default Value: "percentile"
|
|
3820
|
+
Permitted Values: "percentile", "tukey", "carling"
|
|
3821
|
+
|
|
3822
|
+
lower_percentile:
|
|
3823
|
+
Optional Argument.
|
|
3824
|
+
Specifies the lower percentile value for outlier detection in case of percentile method.
|
|
3825
|
+
Types: float
|
|
3826
|
+
|
|
3827
|
+
upper_percentile:
|
|
3828
|
+
Optional Argument.
|
|
3829
|
+
Specifies the upper percentile value for outlier detection in case of percentile method.
|
|
3830
|
+
Types: float
|
|
3831
|
+
|
|
3832
|
+
RETURNS:
|
|
3833
|
+
Pandas DataFrame containing, column name with outlier percentage.
|
|
3834
|
+
|
|
3835
|
+
"""
|
|
3836
|
+
# Performing outlier fit on the data for replacing outliers with NULL value
|
|
3837
|
+
fit_params = {
|
|
3838
|
+
"data" : self.data,
|
|
3839
|
+
"target_columns" : column_list,
|
|
3840
|
+
"outlier_method" : "percentile",
|
|
3841
|
+
"lower_percentile" : lower_percentile,
|
|
3842
|
+
"upper_percentile" : upper_percentile,
|
|
3843
|
+
"replacement_value" : 'NULL'
|
|
3844
|
+
}
|
|
3845
|
+
OutlierFilterFit_out = OutlierFilterFit(**fit_params)
|
|
3846
|
+
transform_params = {
|
|
3847
|
+
"data" : self.data,
|
|
3848
|
+
"object" : OutlierFilterFit_out.result
|
|
3849
|
+
}
|
|
3850
|
+
# Performing outlier transformation on each column
|
|
3851
|
+
OutlierTransform_obj = OutlierFilterTransform(**transform_params)
|
|
3852
|
+
|
|
3853
|
+
# Column summary of each column of the data
|
|
3854
|
+
fit_params = {
|
|
3855
|
+
"data" : OutlierTransform_obj.result,
|
|
3856
|
+
"target_columns" : column_list
|
|
3857
|
+
}
|
|
3858
|
+
colSummary = ColumnSummary(**fit_params)
|
|
3859
|
+
|
|
3860
|
+
null_count_expr = colSummary.result.NullCount
|
|
3861
|
+
non_null_count_expr = colSummary.result.NonNullCount
|
|
3862
|
+
|
|
3863
|
+
# Calculating outlier percentage
|
|
3864
|
+
df = colSummary.result.assign(True,
|
|
3865
|
+
ColumnName = colSummary.result.ColumnName,
|
|
3866
|
+
OutlierPercentage = (null_count_expr/(non_null_count_expr+null_count_expr))*100)
|
|
3867
|
+
|
|
3868
|
+
# Displaying non-zero containing outlier percentage for columns
|
|
3869
|
+
df = df[df['OutlierPercentage']>0]
|
|
3870
|
+
if self.verbose > 0:
|
|
3871
|
+
print(" "*500, end='\r')
|
|
3872
|
+
if df.shape[0] > 0:
|
|
3873
|
+
self._display_msg(msg='Columns with outlier percentage :-',
|
|
3874
|
+
show_data=True)
|
|
3875
|
+
print(df)
|
|
3876
|
+
else:
|
|
3877
|
+
print("\nNo outlier found!")
|
|
3878
|
+
|
|
3879
|
+
return df
|
|
3880
|
+
|
|
3881
|
+
def _outlier_handling_techniques(self):
|
|
3882
|
+
"""
|
|
3883
|
+
DESCRIPTION:
|
|
3884
|
+
Override function to determine outlier handling techniques in AutoFraud.
|
|
3885
|
+
Ensures no rows are removed; all outlier-affected columns are flagged for imputation.
|
|
3886
|
+
"""
|
|
3887
|
+
columns_to_impute = []
|
|
3888
|
+
|
|
3889
|
+
# List of columns for outlier processing
|
|
3890
|
+
outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
|
|
3891
|
+
|
|
3892
|
+
# Detecting outlier percentage in each column using the percentile method
|
|
3893
|
+
outlier_percentage_df = self._outlier_detection(outlier_columns)
|
|
3894
|
+
|
|
3895
|
+
# Flag all columns for imputation (no row deletion in AutoFraud)
|
|
3896
|
+
for i in outlier_percentage_df.itertuples():
|
|
3897
|
+
col = i[0] # Column name
|
|
3898
|
+
columns_to_impute.append(col)
|
|
3899
|
+
|
|
3900
|
+
return [], columns_to_impute # No columns will be marked for row deletion
|
|
3901
|
+
|
|
3902
|
+
def _outlier_handling(self, target_columns, outlier_method, replacement_value="MEDIAN"):
|
|
3903
|
+
"""
|
|
3904
|
+
DESCRIPTION:
|
|
3905
|
+
Override function to handle outliers in AutoFraud.
|
|
3906
|
+
Ensures no rows are removed while handling outliers by imputing them instead.
|
|
3907
|
+
"""
|
|
3908
|
+
# Enforce imputation strategy instead of row deletion
|
|
3909
|
+
replacement_value = "MEDIAN" if replacement_value == "DELETE" else replacement_value
|
|
3910
|
+
|
|
3911
|
+
# Setting volatile and persist parameters for Outlier handling function
|
|
3912
|
+
volatile, persist = self._get_generic_parameters(func_indicator='OutlierFilterIndicator',
|
|
3913
|
+
param_name='OutlierFilterParam')
|
|
3914
|
+
|
|
3915
|
+
# Performing fit on dataset for outlier handling
|
|
3916
|
+
fit_params = {
|
|
3917
|
+
"data": self.data,
|
|
3918
|
+
"target_columns": target_columns,
|
|
3919
|
+
"outlier_method": outlier_method,
|
|
3920
|
+
"replacement_value": replacement_value,
|
|
3921
|
+
"volatile": volatile,
|
|
3922
|
+
"persist": persist
|
|
3923
|
+
}
|
|
3924
|
+
outlier_fit_out = OutlierFilterFit(**fit_params)
|
|
3925
|
+
|
|
3926
|
+
# Performing transform on dataset for outlier handling
|
|
3927
|
+
transform_params = {
|
|
3928
|
+
"data": self.data,
|
|
3929
|
+
"object": outlier_fit_out.result,
|
|
3930
|
+
"persist": True
|
|
3931
|
+
}
|
|
3932
|
+
|
|
3933
|
+
# Disabling print if persist is True by default
|
|
3934
|
+
if not volatile and not persist:
|
|
3935
|
+
transform_params["display_table_name"] = False
|
|
3936
|
+
|
|
3937
|
+
if volatile:
|
|
3938
|
+
transform_params["volatile"] = True
|
|
3939
|
+
transform_params["persist"] = False
|
|
3940
|
+
|
|
3941
|
+
self.data = OutlierFilterTransform(**transform_params).result
|
|
3942
|
+
|
|
3943
|
+
if not volatile and not persist:
|
|
3944
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
3945
|
+
|
|
3946
|
+
# Returning outlier fit object to store in data mapping dictionary
|
|
3947
|
+
return outlier_fit_out
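# For illustration only: the same fit-then-transform pattern applied directly to a
# teradataml DataFrame outside of AutoFraud. The table and column names are
# hypothetical; the argument names mirror the fit_params/transform_params used above.
#
#   from teradataml import DataFrame, OutlierFilterFit, OutlierFilterTransform
#
#   txns = DataFrame("payment_fraud_dataset")
#   fit = OutlierFilterFit(data=txns,
#                          target_columns=["amount"],
#                          outlier_method="percentile",
#                          replacement_value="MEDIAN")
#   cleaned = OutlierFilterTransform(data=txns, object=fit.result).result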
|
|
3948
|
+
|
|
3949
|
+
def _outlier_processing(self):
|
|
3950
|
+
"""
|
|
3951
|
+
DESCRIPTION:
|
|
3952
|
+
Override function to perform outlier processing in AutoFraud.
|
|
3953
|
+
Ensures no rows are removed while handling outliers.
|
|
3954
|
+
Instead, affected columns are flagged for imputation.
|
|
3955
|
+
"""
|
|
3956
|
+
|
|
3957
|
+
fn_name = "AutoFraud " if self.fraud else ("AutoChurn " if self.churn else "")
|
|
3958
|
+
|
|
3959
|
+
self._display_msg(msg=f"\n{fn_name}Outlier preprocessing using Percentile...",
|
|
3960
|
+
progress_bar=self.progress_bar,
|
|
3961
|
+
show_data=True)
|
|
3962
|
+
start_time = time.time()
|
|
3963
|
+
|
|
3964
|
+
# List of columns for imputation (No row deletion in AutoFraud)
|
|
3965
|
+
_, columns_to_impute = self._outlier_handling_techniques()
|
|
3966
|
+
|
|
3967
|
+
# Keeping default method for outlier handling as "Percentile"
|
|
3968
|
+
outlier_handling_method = "Percentile"
|
|
3969
|
+
|
|
3970
|
+
# Imputing Median value in place of outliers (No deletion)
|
|
3971
|
+
if len(columns_to_impute) != 0:
|
|
3972
|
+
self._display_msg(msg="Replacing outliers with median:",
|
|
3973
|
+
col_lst=columns_to_impute,
|
|
3974
|
+
progress_bar=self.progress_bar)
|
|
3975
|
+
target_columns = columns_to_impute
|
|
3976
|
+
replacement_strategy = "MEDIAN"
|
|
3977
|
+
fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
|
|
3978
|
+
self._display_msg(msg="Sample of dataset after replacing outliers with MEDIAN:",
|
|
3979
|
+
data=self.data,
|
|
3980
|
+
progress_bar=self.progress_bar)
|
|
3981
|
+
|
|
3982
|
+
if len(columns_to_impute) == 0:
|
|
3983
|
+
self._display_msg(msg='Analysis indicates no significant outliers in the dataset. No Action Taken.',
|
|
3984
|
+
progress_bar=self.progress_bar)
|
|
3985
|
+
|
|
3986
|
+
end_time = time.time()
|
|
3987
|
+
self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),
|
|
3988
|
+
progress_bar=self.progress_bar,
|
|
3989
|
+
show_data=True)
|
|
3990
|
+
|
|
3991
|
+
def _encoding_categorical_columns(self):
|
|
3992
|
+
"""
|
|
3993
|
+
DESCRIPTION:
|
|
3994
|
+
Override function that detects the categorical columns and performs target encoding instead of
|
|
3995
|
+
One Hot encoding on categorical columns in AutoFraud.
|
|
3996
|
+
"""
|
|
3997
|
+
self._display_msg(msg="\nPerforming target encoding for categorical columns ...",
|
|
3998
|
+
progress_bar=self.progress_bar,
|
|
3999
|
+
show_data=True)
|
|
4000
|
+
start_time = time.time()
|
|
4001
|
+
|
|
4002
|
+
target_encoding_list = {}
|
|
4003
|
+
|
|
4004
|
+
# List of columns before target encoding
|
|
4005
|
+
col_bf_encoding = self.data.columns
|
|
4006
|
+
|
|
4007
|
+
# Get distinct value in each column
|
|
4008
|
+
self._get_distinct_count()
|
|
4009
|
+
|
|
4010
|
+
# Detect categorical columns and prepare for target encoding
|
|
4011
|
+
for col, d_type in self.data._column_names_and_types:
|
|
4012
|
+
if d_type in ['str']:
|
|
4013
|
+
target_encoding_list[col] = {"encoder_method": "CBM_BETA",
|
|
4014
|
+
"response_column": self.target_column}
|
|
4015
|
+
|
|
4016
|
+
if len(target_encoding_list) == 0:
|
|
4017
|
+
self._display_msg(inline_msg="Analysis completed without target encoding. No categorical columns were found.",
|
|
4018
|
+
progress_bar=self.progress_bar)
|
|
4019
|
+
return
|
|
4020
|
+
|
|
4021
|
+
self._auto_target_encoding(target_encoding_list)
|
|
4022
|
+
|
|
4023
|
+
self._display_msg(msg="Target Encoding these Columns:",
|
|
4024
|
+
col_lst=list(target_encoding_list.keys()),
|
|
4025
|
+
progress_bar=self.progress_bar)
|
|
4026
|
+
self._display_msg(msg="Sample of dataset after performing target encoding:",
|
|
4027
|
+
data=self.data,
|
|
4028
|
+
progress_bar=self.progress_bar)
|
|
4029
|
+
|
|
4030
|
+
# List of columns after target encoding
|
|
4031
|
+
col_af_encoding = self.data.columns
|
|
4032
|
+
|
|
4033
|
+
# List of excluded columns from outlier processing and scaling
|
|
4034
|
+
self.excluded_cols = self._extract_list(col_af_encoding, col_bf_encoding)
|
|
4035
|
+
|
|
4036
|
+
end_time = time.time()
|
|
4037
|
+
self._display_msg(msg="Time taken to encode the columns: {:.2f} sec".format(end_time - start_time),
|
|
4038
|
+
progress_bar=self.progress_bar,
|
|
4039
|
+
show_data=True)
|
|
4040
|
+
|
|
4041
|
+
def _auto_target_encoding(self, target_encoding_list):
|
|
4042
|
+
"""
|
|
4043
|
+
DESCRIPTION:
|
|
4044
|
+
Function performs target encoding on categorical columns for AutoFraud.
|
|
4045
|
+
This function is separate from the custom target encoding method.
|
|
4046
|
+
|
|
4047
|
+
PARAMETERS:
|
|
4048
|
+
target_encoding_list:
|
|
4049
|
+
Required Argument.
|
|
4050
|
+
Dictionary specifying the categorical columns for which target encoding will be performed.
|
|
4051
|
+
Each key is a column name, and values contain encoding parameters.
|
|
4052
|
+
"""
|
|
4053
|
+
|
|
4054
|
+
# Fetching all columns on which target encoding will be performed
|
|
4055
|
+
target_columns = list(target_encoding_list.keys())
|
|
4056
|
+
|
|
4057
|
+
# Checking for column presence in dataset
|
|
4058
|
+
_Validators._validate_dataframe_has_argument_columns(target_columns, "TargetEncodingList", self.data, "df")
|
|
4059
|
+
|
|
4060
|
+
# Finding distinct values and counts for columns
|
|
4061
|
+
cat_sum = CategoricalSummary(data=self.data, target_columns=target_columns)
|
|
4062
|
+
category_data = cat_sum.result.groupby("ColumnName").count()
|
|
4063
|
+
category_data = category_data.assign(drop_columns=True,
|
|
4064
|
+
ColumnName=category_data.ColumnName,
|
|
4065
|
+
CategoryCount=category_data.count_DistinctValue)
|
|
4066
|
+
|
|
4067
|
+
# Storing encoding metadata
|
|
4068
|
+
self.data_transform_dict["auto_target_encoding_ind"] = True
|
|
4069
|
+
|
|
4070
|
+
# Setting volatile and persist parameters for performing encoding
|
|
4071
|
+
volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
|
|
4072
|
+
param_name="CategoricalEncodingParam")
|
|
4073
|
+
|
|
4074
|
+
# Perform target encoding for each categorical column
|
|
4075
|
+
fit_params = {
|
|
4076
|
+
"data": self.data,
|
|
4077
|
+
"category_data": category_data,
|
|
4078
|
+
"encoder_method": "CBM_BETA",
|
|
4079
|
+
"target_columns": target_columns,
|
|
4080
|
+
"response_column": self.target_column,
|
|
4081
|
+
"volatile": volatile,
|
|
4082
|
+
"persist": persist,
|
|
4083
|
+
"default_values": [-1]
|
|
4084
|
+
}
|
|
4085
|
+
|
|
4086
|
+
# Perform target encoding
|
|
4087
|
+
tar_fit_obj = TargetEncodingFit(**fit_params)
|
|
4088
|
+
self.data_transform_dict["auto_target_encoding_fit_obj"] = tar_fit_obj.result
|
|
4089
|
+
|
|
4090
|
+
# Extracting accumulate columns
|
|
4091
|
+
accumulate_columns = self._extract_list(self.data.columns, target_columns)
|
|
4092
|
+
self.data_transform_dict["target_encoding_accumulate_columns"] = accumulate_columns
|
|
4093
|
+
# Apply the transformation
|
|
4094
|
+
transform_params = {
|
|
4095
|
+
"data": self.data,
|
|
4096
|
+
"object": tar_fit_obj,
|
|
4097
|
+
"accumulate": accumulate_columns,
|
|
4098
|
+
"persist": True
|
|
4099
|
+
}
|
|
4100
|
+
|
|
4101
|
+
# Disabling display table name if persist is True by default
|
|
4102
|
+
if not volatile and not persist:
|
|
4103
|
+
transform_params["display_table_name"] = False
|
|
4104
|
+
|
|
4105
|
+
if volatile:
|
|
4106
|
+
transform_params["volatile"] = True
|
|
4107
|
+
transform_params["persist"] = False
|
|
4108
|
+
|
|
4109
|
+
self.data = TargetEncodingTransform(**transform_params).result
|
|
4110
|
+
|
|
4111
|
+
if not volatile and not persist:
|
|
4112
|
+
# Adding transformed data containing table to garbage collector
|
|
4113
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
4114
|
+
|
|
4115
|
+
self._display_msg(msg="Target Encoding completed for categorical columns using CBM_BETA.",
|
|
4116
|
+
progress_bar=self.progress_bar)
|
|
4117
|
+
|
|
4118
|
+
class AutoFraud(AutoML):
|
|
4119
|
+
|
|
4120
|
+
def __init__(self,
|
|
4121
|
+
include=None,
|
|
4122
|
+
exclude=None,
|
|
4123
|
+
verbose=0,
|
|
4124
|
+
max_runtime_secs=None,
|
|
4125
|
+
stopping_metric=None,
|
|
4126
|
+
stopping_tolerance=None,
|
|
4127
|
+
max_models=None,
|
|
4128
|
+
custom_config_file=None,
|
|
4129
|
+
**kwargs
|
|
4130
|
+
):
|
|
4131
|
+
|
|
4132
|
+
"""
|
|
4133
|
+
DESCRIPTION:
|
|
4134
|
+
AutoFraud is a dedicated AutoML pipeline designed specifically for fraud detection
|
|
4135
|
+
tasks. It automates the process of building, training, and evaluating models
|
|
4136
|
+
tailored to identify fraudulent activities, streamlining the workflow for
|
|
4137
|
+
fraud detection use cases.
|
|
4138
|
+
|
|
4139
|
+
PARAMETERS:
|
|
4140
|
+
include:
|
|
4141
|
+
Optional Argument.
|
|
4142
|
+
Specifies the model algorithms to be used for model training phase.
|
|
4143
|
+
By default, all 5 models are used for training for this specific binary
|
|
4144
|
+
classification problem.
|
|
4145
|
+
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
|
|
4146
|
+
Types: str OR list of str
|
|
4147
|
+
|
|
4148
|
+
exclude:
|
|
4149
|
+
Optional Argument.
|
|
4150
|
+
Specifies the model algorithms to be excluded from model training phase.
|
|
4151
|
+
No model is excluded by default.
|
|
4152
|
+
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
|
|
4153
|
+
Types: str OR list of str
|
|
4154
|
+
|
|
4155
|
+
verbose:
|
|
4156
|
+
Optional Argument.
|
|
4157
|
+
Specifies the detailed execution steps based on verbose level.
|
|
4158
|
+
Default Value: 0
|
|
4159
|
+
Permitted Values:
|
|
4160
|
+
* 0: prints the progress bar and leaderboard
|
|
4161
|
+
* 1: prints the execution steps of AutoML.
|
|
4162
|
+
* 2: prints the intermediate data between the execution of each step of AutoML.
|
|
4163
|
+
Types: int
|
|
4164
|
+
|
|
4165
|
+
max_runtime_secs:
|
|
4166
|
+
Optional Argument.
|
|
4167
|
+
Specifies the time limit in seconds for model training.
|
|
4168
|
+
Types: int
|
|
4169
|
+
|
|
4170
|
+
stopping_metric:
|
|
4171
|
+
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
4172
|
+
Specifies the stopping metric for stopping tolerance in model training.
|
|
4173
|
+
Permitted Values: 'MICRO-F1','MACRO-F1','MICRO-RECALL','MACRO-RECALL',
|
|
4174
|
+
'MICRO-PRECISION', 'MACRO-PRECISION','WEIGHTED-PRECISION',
|
|
4175
|
+
'WEIGHTED-RECALL', 'WEIGHTED-F1', 'ACCURACY'
|
|
4176
|
+
Types: str
|
|
4177
|
+
|
|
4178
|
+
stopping_tolerance:
|
|
4179
|
+
Required, when "stopping_metric" is set, otherwise optional.
|
|
4180
|
+
Specifies the stopping tolerance for stopping metrics in model training.
|
|
4181
|
+
Types: float
|
|
4182
|
+
|
|
4183
|
+
max_models:
|
|
4184
|
+
Optional Argument.
|
|
4185
|
+
Specifies the maximum number of models to be trained.
|
|
4186
|
+
Types: int
|
|
4187
|
+
|
|
4188
|
+
custom_config_file:
|
|
4189
|
+
Optional Argument.
|
|
4190
|
+
Specifies the path of the JSON file in case of a custom run.
|
|
4191
|
+
Types: str
|
|
4192
|
+
|
|
4193
|
+
**kwargs:
|
|
4194
|
+
Specifies the additional arguments for AutoFraud. Below
|
|
4195
|
+
are the additional arguments:
|
|
4196
|
+
volatile:
|
|
4197
|
+
Optional Argument.
|
|
4198
|
+
Specifies whether to put the interim results of the
|
|
4199
|
+
functions in a volatile table or not. When set to
|
|
4200
|
+
True, results are stored in a volatile table,
|
|
4201
|
+
otherwise not.
|
|
4202
|
+
Default Value: False
|
|
4203
|
+
Types: bool
|
|
4204
|
+
|
|
4205
|
+
persist:
|
|
4206
|
+
Optional Argument.
|
|
4207
|
+
Specifies whether to persist the interim results of the
|
|
4208
|
+
functions in a table or not. When set to True,
|
|
4209
|
+
results are persisted in a table; otherwise,
|
|
4210
|
+
results are garbage collected at the end of the
|
|
4211
|
+
session.
|
|
4212
|
+
Default Value: False
|
|
4213
|
+
Types: bool
|
|
4214
|
+
|
|
4215
|
+
seed:
|
|
4216
|
+
Optional Argument.
|
|
4217
|
+
Specifies the random seed for reproducibility.
|
|
4218
|
+
Default Value: 42
|
|
4219
|
+
Types: int
|
|
4220
|
+
|
|
4221
|
+
imbalance_handling_method:
|
|
4222
|
+
Optional Argument.
|
|
4223
|
+
Specifies which data imbalance method to use.
|
|
4224
|
+
Default Value: SMOTE
|
|
4225
|
+
Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
|
|
4226
|
+
Types: str
|
|
4227
|
+
|
|
4228
|
+
RETURNS:
|
|
4229
|
+
Instance of AutoFraud.
|
|
4230
|
+
|
|
4231
|
+
RAISES:
|
|
4232
|
+
TeradataMlException, TypeError, ValueError
|
|
4233
|
+
|
|
4234
|
+
EXAMPLES:
|
|
4235
|
+
# Notes:
|
|
4236
|
+
# 1. Get the connection to Vantage to execute the function.
|
|
4237
|
+
# 2. One must import the required functions mentioned in
|
|
4238
|
+
# the example from teradataml.
|
|
4239
|
+
# 3. Function will raise an error if not supported on the Vantage
|
|
4240
|
+
# the user is connected to.
|
|
4241
|
+
|
|
4242
|
+
# Load the example data.
|
|
4243
|
+
>>> load_example_data("teradataml", ["credit_fraud_dataset", "payment_fraud_dataset"])
|
|
4244
|
+
|
|
4245
|
+
# Create teradataml DataFrame object.
|
|
4246
|
+
>>> credit_fraud_df = DataFrame.from_table("credit_fraud_dataset")
|
|
4247
|
+
>>> payment_fraud_df = DataFrame.from_table("payment_fraud_dataset")
|
|
4248
|
+
|
|
4249
|
+
# Example 1 : Run AutoFraud for fraud detection problem
|
|
4250
|
+
# Scenario : Predict whether a transaction is fraudulent or not
|
|
4251
|
+
|
|
4252
|
+
# Split the data into train and test.
|
|
4253
|
+
>>> credit_fraud_sample = credit_fraud_df.sample(frac = [0.8, 0.2])
|
|
4254
|
+
>>> credit_fraud_train = credit_fraud_sample[credit_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4255
|
+
>>> credit_fraud_test = credit_fraud_sample[credit_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4256
|
+
|
|
4257
|
+
# Create instance of AutoFraud.
|
|
4258
|
+
>>> automl_obj = AutoFraud()
|
|
4259
|
+
|
|
4260
|
+
# Fit the data.
|
|
4261
|
+
>>> automl_obj.fit(credit_fraud_train, "Credit_Class")
|
|
4262
|
+
|
|
4263
|
+
# Display leaderboard.
|
|
4264
|
+
>>> automl_obj.leaderboard()
|
|
4265
|
+
|
|
4266
|
+
# Display best performing model.
|
|
4267
|
+
>>> automl_obj.leader()
|
|
4268
|
+
|
|
4269
|
+
# Run predict on test data using best performing model.
|
|
4270
|
+
>>> prediction = automl_obj.predict(credit_fraud_test)
|
|
4271
|
+
>>> prediction
|
|
4272
|
+
|
|
4273
|
+
# Run predict on test data using second best performing model.
|
|
4274
|
+
>>> prediction = automl_obj.predict(credit_fraud_test, rank=2)
|
|
4275
|
+
>>> prediction
|
|
4276
|
+
|
|
4277
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
4278
|
+
>>> performance_metrics = automl_obj.evaluate(credit_fraud_test)
|
|
4279
|
+
>>> performance_metrics
|
|
4280
|
+
|
|
4281
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
4282
|
+
>>> performance_metrics = automl_obj.evaluate(credit_fraud_test, 4)
|
|
4283
|
+
>>> performance_metrics
|
|
4284
|
+
|
|
4285
|
+
# Example 2 : Run AutoFraud for fraud detection.
|
|
4286
|
+
# Scenario : Predict whether a transaction is fraudulent or not. Run AutoFraud to get the
|
|
4287
|
+
# best performing model out of available models. Use custom
|
|
4288
|
+
# configuration file to customize different processes of
|
|
4289
|
+
# AutoFraud Run.
|
|
4290
|
+
|
|
4291
|
+
# Split the data into train and test.
|
|
4292
|
+
>>> payment_fraud_sample = payment_fraud_df.sample(frac = [0.8, 0.2])
|
|
4293
|
+
>>> payment_fraud_train = payment_fraud_sample[payment_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4294
|
+
>>> payment_fraud_test = payment_fraud_sample[payment_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4295
|
+
|
|
4296
|
+
# Generate custom configuration file.
|
|
4297
|
+
>>> AutoFraud.generate_custom_config("custom_fraud")
|
|
4298
|
+
|
|
4299
|
+
# Create instance of AutoFraud.
|
|
4300
|
+
>>> automl_obj = AutoFraud(verbose=2,
|
|
4301
|
+
...                        custom_config_file="custom_fraud.json")
|
|
4302
|
+
|
|
4303
|
+
# Fit the data.
|
|
4304
|
+
>>> automl_obj.fit(payment_fraud_train, payment_fraud_train.isFraud)
|
|
4305
|
+
|
|
4306
|
+
# Display leaderboard.
|
|
4307
|
+
>>> automl_obj.leaderboard()
|
|
4308
|
+
|
|
4309
|
+
# Display best performing model.
|
|
4310
|
+
>>> automl_obj.leader()
|
|
4311
|
+
|
|
4312
|
+
# Run predict on test data using best performing model.
|
|
4313
|
+
>>> prediction = automl_obj.predict(payment_fraud_test)
|
|
4314
|
+
>>> prediction
|
|
4315
|
+
|
|
4316
|
+
# Run predict on test data using second best performing model.
|
|
4317
|
+
>>> prediction = automl_obj.predict(payment_fraud_test, rank=2)
|
|
4318
|
+
>>> prediction
|
|
4319
|
+
|
|
4320
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
4321
|
+
>>> performance_metrics = automl_obj.evaluate(payment_fraud_test)
|
|
4322
|
+
>>> performance_metrics
|
|
4323
|
+
|
|
4324
|
+
|
|
4325
|
+
# Example 3 : Run AutoFraud for fraud detection with stopping metric, stopping tolerance, and imbalance handling method.
|
|
4326
|
+
# Scenario : Predict whether a transaction is fraudulent or not. Use custom configuration
|
|
4327
|
+
# file to customize different processes of AutoFraud Run. Define
|
|
4328
|
+
# performance threshold to acquire for the available models, and
|
|
4329
|
+
# terminate training upon meeting the stipulated performance criteria.
|
|
4330
|
+
|
|
4331
|
+
# Split the data into train and test.
|
|
4332
|
+
>>> credit_fraud_sample = credit_fraud_df.sample(frac = [0.8, 0.2])
|
|
4333
|
+
>>> credit_fraud_train = credit_fraud_sample[credit_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4334
|
+
>>> credit_fraud_test = credit_fraud_sample[credit_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4335
|
+
|
|
4336
|
+
# Generate custom configuration file.
|
|
4337
|
+
>>> AutoFraud.generate_custom_config("custom_fraud")
|
|
4338
|
+
|
|
4339
|
+
# Create instance of AutoFraud.
|
|
4340
|
+
>>> automl_obj = AutoFraud(verbose=2,
|
|
4341
|
+
...                        stopping_metric="MACRO-F1",
|
|
4342
|
+
...                        stopping_tolerance=0.7,
|
|
4343
|
+
...                        imbalance_handling_method="ADASYN",
|
|
4344
|
+
...                        custom_config_file="custom_fraud.json")
|
|
4345
|
+
# Fit the data.
|
|
4346
|
+
>>> automl_obj.fit(credit_fraud_train, credit_fraud_train.Credit_Class)
|
|
4347
|
+
|
|
4348
|
+
# Display leaderboard.
|
|
4349
|
+
>>> automl_obj.leaderboard()
|
|
4350
|
+
|
|
4351
|
+
# Run predict on test data using best performing model.
|
|
4352
|
+
>>> prediction = automl_obj.predict(credit_fraud_test)
|
|
4353
|
+
>>> prediction
|
|
4354
|
+
|
|
4355
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
4356
|
+
>>> performance_metrics = automl_obj.evaluate(credit_fraud_test)
|
|
4357
|
+
>>> performance_metrics
|
|
4358
|
+
"""
|
|
4359
|
+
# Validate unsupported 'task_type' argument
|
|
4360
|
+
_Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
|
|
4361
|
+
|
|
4362
|
+
# Validate unsupported 'is_fraud' argument
|
|
4363
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
|
|
4364
|
+
|
|
4365
|
+
# Validate unsupported 'is_churn' argument
|
|
4366
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
|
|
4367
|
+
|
|
4368
|
+
super().__init__(include=include,
|
|
4369
|
+
exclude=exclude,
|
|
4370
|
+
verbose=verbose,
|
|
4371
|
+
max_runtime_secs=max_runtime_secs,
|
|
4372
|
+
stopping_metric=stopping_metric,
|
|
4373
|
+
stopping_tolerance=stopping_tolerance,
|
|
4374
|
+
max_models=max_models,
|
|
4375
|
+
fraud=True,
|
|
4376
|
+
is_fraud=True,
|
|
4377
|
+
task_type="Classification",
|
|
4378
|
+
custom_config_file=custom_config_file,
|
|
4379
|
+
**kwargs)
|
|
4380
|
+
|
|
4381
|
+
class AutoChurn(AutoML):
|
|
4382
|
+
|
|
4383
|
+
def __init__(self,
|
|
4384
|
+
include=None,
|
|
4385
|
+
exclude=None,
|
|
4386
|
+
verbose=0,
|
|
4387
|
+
max_runtime_secs=None,
|
|
4388
|
+
stopping_metric=None,
|
|
4389
|
+
stopping_tolerance=None,
|
|
4390
|
+
max_models=None,
|
|
4391
|
+
custom_config_file=None,
|
|
4392
|
+
**kwargs):
|
|
4393
|
+
|
|
4394
|
+
"""
|
|
4395
|
+
DESCRIPTION:
|
|
4396
|
+
AutoChurn is a dedicated AutoML pipeline designed specifically for churn prediction
|
|
4397
|
+
tasks. It automates the process of building, training, and evaluating models
|
|
4398
|
+
tailored to identify customer churn, streamlining the workflow for churn prediction
|
|
4399
|
+
use cases.
|
|
4400
|
+
|
|
4401
|
+
PARAMETERS:
|
|
4402
|
+
include:
|
|
4403
|
+
Optional Argument.
|
|
4404
|
+
Specifies the model algorithms to be used for model training phase.
|
|
4405
|
+
By default, all 5 models are used for training for this specific binary
|
|
4406
|
+
classification problem.
|
|
4407
|
+
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
|
|
4408
|
+
Types: str OR list of str
|
|
4409
|
+
|
|
4410
|
+
exclude:
|
|
4411
|
+
Optional Argument.
|
|
4412
|
+
Specifies the model algorithms to be excluded from model training phase.
|
|
4413
|
+
No model is excluded by default.
|
|
4414
|
+
Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
|
|
4415
|
+
Types: str OR list of str
|
|
4416
|
+
|
|
4417
|
+
verbose:
|
|
4418
|
+
Optional Argument.
|
|
4419
|
+
Specifies the detailed execution steps based on verbose level.
|
|
4420
|
+
Default Value: 0
|
|
4421
|
+
Permitted Values:
|
|
4422
|
+
* 0: prints the progress bar and leaderboard
|
|
4423
|
+
* 1: prints the execution steps of AutoML.
|
|
4424
|
+
* 2: prints the intermediate data between the execution of each step of AutoML.
|
|
4425
|
+
Types: int
|
|
4426
|
+
|
|
4427
|
+
max_runtime_secs:
|
|
4428
|
+
Optional Argument.
|
|
4429
|
+
Specifies the time limit in seconds for model training.
|
|
4430
|
+
Types: int
|
|
4431
|
+
|
|
4432
|
+
stopping_metric:
|
|
4433
|
+
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
4434
|
+
Specifies the stopping metric for stopping tolerance in model training.
|
|
4435
|
+
Permitted Values: 'MICRO-F1','MACRO-F1','MICRO-RECALL','MACRO-RECALL',
|
|
4436
|
+
'MICRO-PRECISION', 'MACRO-PRECISION','WEIGHTED-PRECISION',
|
|
4437
|
+
'WEIGHTED-RECALL', 'WEIGHTED-F1', 'ACCURACY'
|
|
4438
|
+
Types: str
|
|
4439
|
+
|
|
4440
|
+
stopping_tolerance:
|
|
4441
|
+
Required, when "stopping_metric" is set, otherwise optional.
|
|
4442
|
+
Specifies the stopping tolerance for stopping metrics in model training.
|
|
4443
|
+
Types: float
|
|
4444
|
+
|
|
4445
|
+
max_models:
|
|
4446
|
+
Optional Argument.
|
|
4447
|
+
Specifies the maximum number of models to be trained.
|
|
4448
|
+
Types: int
|
|
4449
|
+
|
|
4450
|
+
custom_config_file:
|
|
4451
|
+
Optional Argument.
|
|
4452
|
+
Specifies the path of the JSON file in case of a custom run.
|
|
4453
|
+
Types: str
|
|
4454
|
+
|
|
4455
|
+
**kwargs:
|
|
4456
|
+
Specifies the additional arguments for AutoChurn. Below
|
|
4457
|
+
are the additional arguments:
|
|
4458
|
+
volatile:
|
|
4459
|
+
Optional Argument.
|
|
4460
|
+
Specifies whether to put the interim results of the
|
|
4461
|
+
functions in a volatile table or not. When set to
|
|
4462
|
+
True, results are stored in a volatile table,
|
|
4463
|
+
otherwise not.
|
|
4464
|
+
Default Value: False
|
|
4465
|
+
Types: bool
|
|
4466
|
+
|
|
4467
|
+
persist:
|
|
4468
|
+
Optional Argument.
|
|
4469
|
+
Specifies whether to persist the interim results of the
|
|
4470
|
+
functions in a table or not. When set to True,
|
|
4471
|
+
results are persisted in a table; otherwise,
|
|
4472
|
+
results are garbage collected at the end of the
|
|
4473
|
+
session.
|
|
4474
|
+
Default Value: False
|
|
4475
|
+
Types: bool
|
|
4476
|
+
|
|
4477
|
+
seed:
|
|
4478
|
+
Optional Argument.
|
|
4479
|
+
Specifies the random seed for reproducibility.
|
|
4480
|
+
Default Value: 42
|
|
4481
|
+
Types: int
|
|
4482
|
+
|
|
4483
|
+
imbalance_handling_method:
|
|
4484
|
+
Optional Argument.
|
|
4485
|
+
Specifies which data imbalance method to use.
|
|
4486
|
+
Default Value: SMOTE
|
|
4487
|
+
Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
|
|
4488
|
+
Types: str
|
|
4489
|
+
|
|
4490
|
+
RETURNS:
|
|
4491
|
+
Instance of AutoChurn.
|
|
4492
|
+
|
|
4493
|
+
RAISES:
|
|
4494
|
+
TeradataMlException, TypeError, ValueError
|
|
4495
|
+
|
|
4496
|
+
EXAMPLES:
|
|
4497
|
+
# Notes:
|
|
4498
|
+
# 1. Get the connection to Vantage to execute the function.
|
|
4499
|
+
# 2. One must import the required functions mentioned in
|
|
4500
|
+
# the example from teradataml.
|
|
4501
|
+
# 3. Function will raise an error if not supported on the Vantage
|
|
4502
|
+
# the user is connected to.
|
|
4503
|
+
|
|
4504
|
+
# Load the example data.
|
|
4505
|
+
>>> load_example_data("teradataml", "bank_churn")
|
|
4506
|
+
|
|
4507
|
+
# Create teradataml DataFrame object.
|
|
4508
|
+
>>> churn_df = DataFrame.from_table("bank_churn")
|
|
4509
|
+
|
|
4510
|
+
# Example 1 : Run AutoChurn for churn prediction problem
|
|
4511
|
+
# Scenario : Predict whether a customer churns from the bank or not
|
|
4512
|
+
|
|
4513
|
+
# Split the data into train and test.
|
|
4514
|
+
>>> churn_sample = churn_df.sample(frac = [0.8, 0.2])
|
|
4515
|
+
>>> churn_train = churn_sample[churn_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4516
|
+
>>> churn_test = churn_sample[churn_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4517
|
+
|
|
4518
|
+
# Create instance of AutoChurn.
|
|
4519
|
+
>>> automl_obj = AutoChurn()
|
|
4520
|
+
|
|
4521
|
+
# Fit the data.
|
|
4522
|
+
>>> automl_obj.fit(churn_train, "churn")
|
|
4523
|
+
|
|
4524
|
+
# Display leaderboard.
|
|
4525
|
+
>>> automl_obj.leaderboard()
|
|
4526
|
+
|
|
4527
|
+
# Display best performing model.
|
|
4528
|
+
>>> automl_obj.leader()
|
|
4529
|
+
|
|
4530
|
+
# Run predict on test data using best performing model.
|
|
4531
|
+
>>> prediction = automl_obj.predict(churn_test)
|
|
4532
|
+
>>> prediction
|
|
4533
|
+
|
|
4534
|
+
# Run predict on test data using second best performing model.
|
|
4535
|
+
>>> prediction = automl_obj.predict(churn_test, rank=2)
|
|
4536
|
+
>>> prediction
|
|
4537
|
+
|
|
4538
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
4539
|
+
>>> performance_metrics = automl_obj.evaluate(churn_test)
|
|
4540
|
+
>>> performance_metrics
|
|
4541
|
+
|
|
4542
|
+
# Run evaluate to get performance metrics using model rank 4.
|
|
4543
|
+
>>> performance_metrics = automl_obj.evaluate(churn_test, 4)
|
|
4544
|
+
>>> performance_metrics
|
|
4545
|
+
|
|
4546
|
+
# Example 2 : Run AutoChurn for churn prediction with stopping metric, stopping tolerance, and imbalance handling method.
|
|
4547
|
+
# Scenario : Predict whether a customer churns from the bank or not. Use custom configuration
|
|
4548
|
+
# file to customize different processes of AutoML Run. Define
|
|
4549
|
+
# performance threshold to acquire for the available models, and
|
|
4550
|
+
# terminate training upon meeting the stipulated performance criteria.
|
|
4551
|
+
|
|
4552
|
+
# Split the data into train and test.
|
|
4553
|
+
>>> churn_sample = churn_df.sample(frac = [0.8, 0.2])
|
|
4554
|
+
>>> churn_train = churn_sample[churn_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4555
|
+
>>> churn_test = churn_sample[churn_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4556
|
+
|
|
4557
|
+
# Generate custom configuration file.
|
|
4558
|
+
>>> AutoChurn.generate_custom_config("custom_churn")
|
|
4559
|
+
|
|
4560
|
+
# Create instance of AutoChurn.
|
|
4561
|
+
>>> automl_obj = AutoChurn(verbose=2,
|
|
4562
|
+
...                        stopping_metric="MACRO-F1",
|
|
4563
|
+
...                        stopping_tolerance=0.7,
|
|
4564
|
+
...                        imbalance_handling_method="ADASYN",
|
|
4565
|
+
...                        custom_config_file="custom_churn.json")
|
|
4566
|
+
# Fit the data.
|
|
4567
|
+
>>> automl_obj.fit(churn_train, churn_train.churn)
|
|
4568
|
+
|
|
4569
|
+
# Display leaderboard.
|
|
4570
|
+
>>> automl_obj.leaderboard()
|
|
4571
|
+
|
|
4572
|
+
# Run predict on test data using best performing model.
|
|
4573
|
+
>>> prediction = automl_obj.predict(churn_test)
|
|
4574
|
+
>>> prediction
|
|
4575
|
+
|
|
4576
|
+
# Run evaluate to get performance metrics using best performing model.
|
|
4577
|
+
>>> performance_metrics = automl_obj.evaluate(churn_test)
|
|
4578
|
+
>>> performance_metrics
|
|
4579
|
+
"""
|
|
4580
|
+
|
|
4581
|
+
# Validate unsupported 'task_type' argument
|
|
4582
|
+
_Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
|
|
4583
|
+
|
|
4584
|
+
# Validate unsupported 'is_churn' argument
|
|
4585
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
|
|
4586
|
+
|
|
4587
|
+
# Validate unsupported 'is_fraud' argument
|
|
4588
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
|
|
4589
|
+
|
|
4590
|
+
super().__init__(include=include,
|
|
4591
|
+
exclude=exclude,
|
|
4592
|
+
verbose=verbose,
|
|
4593
|
+
max_runtime_secs=max_runtime_secs,
|
|
4594
|
+
stopping_metric=stopping_metric,
|
|
4595
|
+
stopping_tolerance=stopping_tolerance,
|
|
4596
|
+
max_models=max_models,
|
|
4597
|
+
churn=True,
|
|
4598
|
+
is_churn=True,
|
|
4599
|
+
task_type="Classification",
|
|
4600
|
+
custom_config_file=custom_config_file,
|
|
4601
|
+
**kwargs)
|
|
4602
|
+
|
|
4603
|
+
class _Clustering(_FeatureExplore, _FeatureEngineering, _DataPreparation, _ModelTraining):
|
|
4604
|
+
|
|
4605
|
+
def __init__(self,
|
|
4606
|
+
data,
|
|
4607
|
+
target_column=None,
|
|
4608
|
+
custom_data=None,
|
|
4609
|
+
**kwargs):
|
|
4610
|
+
"""
|
|
4611
|
+
DESCRIPTION:
|
|
4612
|
+
Function initializes the data for the clustering pipeline using AutoML components.
|
|
4613
|
+
|
|
4614
|
+
PARAMETERS:
|
|
4615
|
+
data:
|
|
4616
|
+
Required Argument.
|
|
4617
|
+
Specifies the input teradataml DataFrame.
|
|
4618
|
+
Types: teradataml DataFrame
|
|
4619
|
+
|
|
4620
|
+
target_column:
|
|
4621
|
+
Set to None as no target column is present for clustering data.
|
|
4622
|
+
Types: str
|
|
4623
|
+
|
|
4624
|
+
custom_data:
|
|
4625
|
+
Optional Argument.
|
|
4626
|
+
Specifies the JSON object containing user-customized input.
|
|
4627
|
+
Types: json object
|
|
4628
|
+
"""
|
|
4629
|
+
# Validate unsupported 'task_type' argument
|
|
4630
|
+
_Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
|
|
4631
|
+
|
|
4632
|
+
self.data = data
|
|
4633
|
+
self.target_column = target_column # Typically None, but kept for compatibility
|
|
4634
|
+
self.custom_data = custom_data
|
|
4635
|
+
self.cluster = True
|
|
4636
|
+
self.task_type = "Clustering"
|
|
4637
|
+
|
|
4638
|
+
super().__init__(data=data,
|
|
4639
|
+
target_column=target_column,
|
|
4640
|
+
custom_data=custom_data,
|
|
4641
|
+
**kwargs)
|
|
4642
|
+
|
|
4643
|
+
def _clustering(self,
|
|
4644
|
+
model_list=None,
|
|
4645
|
+
auto=False,
|
|
4646
|
+
verbose=0,
|
|
4647
|
+
max_runtime_secs=None,
|
|
4648
|
+
stopping_metric=None,
|
|
4649
|
+
stopping_tolerance=None,
|
|
4650
|
+
max_models=None,
|
|
4651
|
+
**kwargs):
|
|
4652
|
+
"""
|
|
4653
|
+
DESCRIPTION:
|
|
4654
|
+
Internal function that runs clustering using AutoML components.
|
|
4655
|
+
|
|
4656
|
+
PARAMETERS:
|
|
4657
|
+
model_list:
|
|
4658
|
+
Optional Argument.
|
|
4659
|
+
Specifies the list of model algorithms to be used for model training phase.
|
|
4660
|
+
Types: list of strings (str)
|
|
4661
|
+
Default Value: ["KMeans", "GaussianMixture"]
|
|
4662
|
+
Permitted Values: "KMeans", "GaussianMixture"
|
|
4663
|
+
auto:
|
|
4664
|
+
Optional Argument.
|
|
4665
|
+
Specifies whether to run AutoML in custom mode or auto mode.
|
|
4666
|
+
When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
|
|
4667
|
+
Types: bool
|
|
4668
|
+
|
|
4669
|
+
verbose:
|
|
4670
|
+
Optional Argument.
|
|
4671
|
+
Specifies the detailed execution steps based on verbose level.
|
|
4672
|
+
Default Value: 0
|
|
4673
|
+
Permitted Values:
|
|
4674
|
+
* 0: prints the progress bar and leaderboard
|
|
4675
|
+
* 1: prints the execution steps of AutoML.
|
|
4676
|
+
* 2: prints the intermediate data between the execution of each step of AutoML.
|
|
4677
|
+
Types: int
|
|
4678
|
+
|
|
4679
|
+
max_runtime_secs:
|
|
4680
|
+
Optional Argument.
|
|
4681
|
+
Specifies the time limit in seconds for model training.
|
|
4682
|
+
Types: int
|
|
4683
|
+
|
|
4684
|
+
stopping_metric:
|
|
4685
|
+
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
4686
|
+
Specifies the stopping metric for stopping tolerance in model training.
|
|
4687
|
+
Types: str
|
|
4688
|
+
|
|
4689
|
+
stopping_tolerance:
|
|
4690
|
+
Required, when "stopping_metric" is set, otherwise optional.
|
|
4691
|
+
Specifies the stopping tolerance for stopping metrics in model training.
|
|
4692
|
+
Types: float
|
|
4693
|
+
|
|
4694
|
+
max_models:
|
|
4695
|
+
Optional Argument.
|
|
4696
|
+
Specifies the maximum number of models to be trained.
|
|
4697
|
+
Types: int
|
|
4698
|
+
|
|
4699
|
+
**kwargs:
|
|
4700
|
+
Specifies the additional arguments for AutoCluster. Below
|
|
4701
|
+
are the additional arguments:
|
|
4702
|
+
volatile:
|
|
4703
|
+
Optional Argument.
|
|
4704
|
+
Specifies whether to put the results of the
|
|
4705
|
+
function in a volatile table or not. When set to
|
|
4706
|
+
True, results are stored in a volatile table,
|
|
4707
|
+
otherwise not.
|
|
4708
|
+
Default Value: False
|
|
4709
|
+
Types: bool
|
|
4710
|
+
|
|
4711
|
+
persist:
|
|
4712
|
+
Optional Argument.
|
|
4713
|
+
Specifies whether to persist the results of the
|
|
4714
|
+
function in a table or not. When set to True,
|
|
4715
|
+
results are persisted in a table; otherwise,
|
|
4716
|
+
results are garbage collected at the end of the
|
|
4717
|
+
session.
|
|
4718
|
+
Default Value: False
|
|
4719
|
+
Types: bool
|
|
4720
|
+
|
|
4721
|
+
RETURNS:
|
|
4722
|
+
A tuple containing model information and the leaderboard.
|
|
4723
|
+
"""
|
|
4724
|
+
|
|
4725
|
+
# Feature Exploration Phase
|
|
4726
|
+
_FeatureExplore.__init__(self,
|
|
4727
|
+
data=self.data,
|
|
4728
|
+
target_column=None,
|
|
4729
|
+
custom_data=self.custom_data,
|
|
4730
|
+
verbose=verbose,
|
|
4731
|
+
cluster=True,
|
|
4732
|
+
**kwargs)
|
|
4733
|
+
if verbose > 0:
|
|
4734
|
+
self._exploration()
|
|
4735
|
+
|
|
4736
|
+
# Feature Engineering Phase
|
|
4737
|
+
_FeatureEngineering.__init__(self,
|
|
4738
|
+
data=self.data,
|
|
4739
|
+
target_column=None,
|
|
4740
|
+
model_list=model_list,
|
|
4741
|
+
verbose=verbose,
|
|
4742
|
+
task_type="Clustering",
|
|
4743
|
+
custom_data=self.custom_data,
|
|
4744
|
+
cluster=True,
|
|
4745
|
+
**kwargs)
|
|
4746
|
+
|
|
4747
|
+
start_time = time.time()
|
|
4748
|
+
data, excluded_columns, _, data_transformation_params, data_mapping = self.feature_engineering(auto)
|
|
4749
|
+
|
|
4750
|
+
# Data Preparation Phase
|
|
4751
|
+
_DataPreparation.__init__(self,
|
|
4752
|
+
data=self.data,
|
|
4753
|
+
target_column=None,
|
|
4754
|
+
verbose=verbose,
|
|
4755
|
+
excluded_columns=excluded_columns,
|
|
4756
|
+
custom_data=self.custom_data,
|
|
4757
|
+
data_transform_dict=data_transformation_params,
|
|
4758
|
+
task_type="Clustering",
|
|
4759
|
+
data_mapping=data_mapping,
|
|
4760
|
+
cluster=True,
|
|
4761
|
+
**kwargs)
|
|
4762
|
+
features, data_transformation_params, data_mapping = self.data_preparation(auto)
|
|
4763
|
+
|
|
4764
|
+
# Adjust time left
|
|
4765
|
+
max_runtime_secs = max_runtime_secs - (time.time() - start_time) \
|
|
4766
|
+
if max_runtime_secs is not None else None
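# Guard against an exhausted budget: if the remaining time drops below 120 seconds,
# the next line resets it to 200 seconds so the model training phase still gets a
# usable window.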
|
|
4767
|
+
max_runtime_secs = 200 if max_runtime_secs is not None and max_runtime_secs < 120 else max_runtime_secs
|
|
4768
|
+
|
|
4769
|
+
# Model Training Phase
|
|
4770
|
+
_ModelTraining.__init__(self,
|
|
4771
|
+
data=self.data,
|
|
4772
|
+
target_column=None,
|
|
4773
|
+
model_list=model_list,
|
|
4774
|
+
verbose=verbose,
|
|
4775
|
+
features=features,
|
|
4776
|
+
task_type="Clustering",
|
|
4777
|
+
custom_data=self.custom_data,
|
|
4778
|
+
cluster=True,
|
|
4779
|
+
**kwargs)
|
|
4780
|
+
models_info, leaderboard, _ = self.model_training(auto=auto,
|
|
4781
|
+
max_runtime_secs=max_runtime_secs,
|
|
4782
|
+
stopping_metric=stopping_metric,
|
|
4783
|
+
stopping_tolerance=stopping_tolerance,
|
|
4784
|
+
max_models=max_models)
|
|
4785
|
+
|
|
4786
|
+
return (models_info, leaderboard, None, None, data_transformation_params, data_mapping)
|
|
4787
|
+
|
|
4788
|
+
class AutoCluster(AutoML):
|
|
4789
|
+
|
|
4790
|
+
def __init__(self,
|
|
4791
|
+
include=None,
|
|
4792
|
+
exclude=None,
|
|
4793
|
+
verbose=0,
|
|
4794
|
+
max_runtime_secs=None,
|
|
4795
|
+
stopping_metric=None,
|
|
4796
|
+
stopping_tolerance=None,
|
|
4797
|
+
max_models=None,
|
|
4798
|
+
custom_config_file=None,
|
|
4799
|
+
**kwargs):
|
|
4800
|
+
|
|
4801
|
+
"""
|
|
4802
|
+
DESCRIPTION:
|
|
4803
|
+
AutoCluster is a dedicated AutoML pipeline designed specifically for clustering tasks.
|
|
4804
|
+
It automates the process of building, training, and evaluating clustering models,
|
|
4805
|
+
streamlining the workflow for unsupervised learning use cases where the goal is
|
|
4806
|
+
to group data into clusters.
|
|
4807
|
+
|
|
4808
|
+
PARAMETERS:
|
|
4809
|
+
include:
|
|
4810
|
+
Optional Argument.
|
|
4811
|
+
Specifies the model algorithms to be used for model training phase.
|
|
4812
|
+
By default, both models are used for training for clustering.
|
|
4813
|
+
Permitted Values: "KMeans", "GaussianMixture"
|
|
4814
|
+
Types: str OR list of str
|
|
4815
|
+
|
|
4816
|
+
exclude:
|
|
4817
|
+
Optional Argument.
|
|
4818
|
+
Specifies the model algorithms to be excluded from model training phase.
|
|
4819
|
+
No model is excluded by default.
|
|
4820
|
+
Permitted Values: "KMeans", "GaussianMixture"
|
|
4821
|
+
Types: str OR list of str
|
|
4822
|
+
|
|
4823
|
+
verbose:
|
|
4824
|
+
Optional Argument.
|
|
4825
|
+
Specifies the detailed execution steps based on verbose level.
|
|
4826
|
+
Default Value: 0
|
|
4827
|
+
Permitted Values:
|
|
4828
|
+
* 0: prints the progress bar and leaderboard
|
|
4829
|
+
* 1: prints the execution steps of AutoML.
|
|
4830
|
+
* 2: prints the intermediate data between the execution of each step of AutoML.
|
|
4831
|
+
Types: int
|
|
4832
|
+
|
|
4833
|
+
max_runtime_secs:
|
|
4834
|
+
Optional Argument.
|
|
4835
|
+
Specifies the time limit in seconds for model training.
|
|
4836
|
+
Types: int
|
|
4837
|
+
|
|
4838
|
+
stopping_metric:
|
|
4839
|
+
Required, when "stopping_tolerance" is set, otherwise optional.
|
|
4840
|
+
Specifies the stopping metric for stopping tolerance in model training.
|
|
4841
|
+
Permitted Values: "SILHOUETTE", "CALINSKI", "DAVIES"
|
|
4842
|
+
|
|
4843
|
+
Types: str
|
|
4844
|
+
|
|
4845
|
+
stopping_tolerance:
|
|
4846
|
+
Required, when "stopping_metric" is set, otherwise optional.
|
|
4847
|
+
Specifies the stopping tolerance for stopping metrics in model training.
|
|
4848
|
+
Types: float
|
|
4849
|
+
|
|
4850
|
+
max_models:
|
|
4851
|
+
Optional Argument.
|
|
4852
|
+
Specifies the maximum number of models to be trained.
|
|
4853
|
+
Types: int
|
|
4854
|
+
|
|
4855
|
+
custom_config_file:
|
|
4856
|
+
Optional Argument.
|
|
4857
|
+
Specifies the path of the JSON file in case of a custom run.
|
|
4858
|
+
Types: str
|
|
4859
|
+
|
|
4860
|
+
**kwargs:
|
|
4861
|
+
Specifies the additional arguments for AutoCluster. Below
|
|
4862
|
+
are the additional arguments:
|
|
4863
|
+
volatile:
|
|
4864
|
+
Optional Argument.
|
|
4865
|
+
Specifies whether to put the interim results of the
|
|
4866
|
+
functions in a volatile table or not. When set to
|
|
4867
|
+
True, results are stored in a volatile table,
|
|
4868
|
+
otherwise not.
|
|
4869
|
+
Default Value: False
|
|
4870
|
+
Types: bool
|
|
4871
|
+
|
|
4872
|
+
persist:
|
|
4873
|
+
Optional Argument.
|
|
4874
|
+
Specifies whether to persist the interim results of the
|
|
4875
|
+
functions in a table or not. When set to True,
|
|
4876
|
+
results are persisted in a table; otherwise,
|
|
4877
|
+
results are garbage collected at the end of the
|
|
4878
|
+
session.
|
|
4879
|
+
Default Value: False
|
|
4880
|
+
Types: bool
|
|
4881
|
+
|
|
4882
|
+
seed:
|
|
4883
|
+
Optional Argument.
|
|
4884
|
+
Specifies the random seed for reproducibility.
|
|
4885
|
+
Default Value: 42
|
|
4886
|
+
Types: int
|
|
4887
|
+
|
|
4888
|
+
RETURNS:
|
|
4889
|
+
Instance of AutoCluster.
|
|
4890
|
+
|
|
4891
|
+
RAISES:
|
|
4892
|
+
TeradataMlException, TypeError, ValueError
|
|
4893
|
+
|
|
4894
|
+
EXAMPLES:
|
|
4895
|
+
# Notes:
|
|
4896
|
+
# 1. Get the connection to Vantage to execute the function.
|
|
4897
|
+
# 2. One must import the required functions mentioned in
|
|
4898
|
+
# the example from teradataml.
|
|
4899
|
+
# 3. Function will raise an error if not supported on the Vantage
|
|
4900
|
+
# the user is connected to.
|
|
4901
|
+
|
|
4902
|
+
# Load the example data.
|
|
4903
|
+
>>> load_example_data("teradataml", ["bank_marketing", "Mall_customer_data"])
|
|
4904
|
+
|
|
4905
|
+
# Create teradataml DataFrame object.
|
|
4906
|
+
>>> bank_df = DataFrame.from_table("bank_marketing")
|
|
4907
|
+
>>> mall_df = DataFrame.from_table("Mall_customer_data")
|
|
4908
|
+
|
|
4909
|
+
# Example 1: Use AutoCluster for unsupervised clustering task based on bank data.
|
|
4910
|
+
# Scenario: Automatically group similar records in the dataset into clusters.
|
|
4911
|
+
|
|
4912
|
+
# Split the data into train and test.
|
|
4913
|
+
>>> bank_sample = bank_df.sample(frac = [0.8, 0.2])
|
|
4914
|
+
>>> bank_train = bank_sample[bank_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4915
|
+
>>> bank_test = bank_sample[bank_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4916
|
+
|
|
4917
|
+
# Create instance of AutoCluster.
|
|
4918
|
+
>>> automl_obj = AutoCluster()
|
|
4919
|
+
|
|
4920
|
+
# Fit the data.
|
|
4921
|
+
>>> automl_obj.fit(bank_train)
|
|
4922
|
+
|
|
4923
|
+
# Display leaderboard.
|
|
4924
|
+
>>> automl_obj.leaderboard()
|
|
4925
|
+
|
|
4926
|
+
# Display best performing model.
|
|
4927
|
+
>>> automl_obj.leader()
|
|
4928
|
+
|
|
4929
|
+
# Run predict on test data using best performing model.
|
|
4930
|
+
>>> prediction = automl_obj.predict(bank_test)
|
|
4931
|
+
>>> prediction
|
|
4932
|
+
|
|
4933
|
+
# Run predict on test data using second best performing model.
|
|
4934
|
+
>>> prediction = automl_obj.predict(bank_test, rank=2)
|
|
4935
|
+
>>> prediction
|
|
4936
|
+
|
|
4937
|
+
|
|
4938
|
+
# Example 2: Use AutoCluster to segment Mall customer data.
|
|
4939
|
+
# Scenario: Automatically identify and group similar customers into clusters.
|
|
4940
|
+
|
|
4941
|
+
# Split the data into train and test.
|
|
4942
|
+
>>> mall_sample = mall_df.sample(frac = [0.8, 0.2])
|
|
4943
|
+
>>> mall_train = mall_sample[mall_sample['sampleid'] == 1].drop('sampleid', axis=1)
|
|
4944
|
+
>>> mall_test = mall_sample[mall_sample['sampleid'] == 2].drop('sampleid', axis=1)
|
|
4945
|
+
|
|
4946
|
+
# Generate custom configuration file.
|
|
4947
|
+
>>> AutoCluster.generate_custom_config("custom_mall_clustering")
|
|
4948
|
+
|
|
4949
|
+
# Create instance of AutoCluster.
|
|
4950
|
+
>>> automl_obj = AutoCluster(verbose=2,
|
|
4951
|
+
...                          custom_config_file="custom_mall_clustering.json")
|
|
4952
|
+
|
|
4953
|
+
# Fit the data.
|
|
4954
|
+
>>> automl_obj.fit(mall_train)
|
|
4955
|
+
|
|
4956
|
+
# Display leaderboard.
|
|
4957
|
+
>>> automl_obj.leaderboard()
|
|
4958
|
+
|
|
4959
|
+
# Display best performing model.
|
|
4960
|
+
>>> automl_obj.leader()
|
|
4961
|
+
|
|
4962
|
+
# Run predict on test data using best performing model.
|
|
4963
|
+
>>> prediction = automl_obj.predict(mall_test)
|
|
4964
|
+
>>> prediction
|
|
4965
|
+
|
|
4966
|
+
# Run predict on test data using second best performing model.
|
|
4967
|
+
>>> prediction = automl_obj.predict(mall_test, rank=2)
|
|
4968
|
+
>>> prediction
|
|
4969
|
+
"""
|
|
4970
|
+
|
|
4971
|
+
# Validate unsupported 'task_type' argument
|
|
4972
|
+
_Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
|
|
4973
|
+
|
|
4974
|
+
# Validate unsupported 'is_churn' argument
|
|
4975
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
|
|
4976
|
+
|
|
4977
|
+
# Validate unsupported 'is_fraud' argument
|
|
4978
|
+
_Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
|
|
4979
|
+
|
|
4980
|
+
super().__init__(include=include,
|
|
4981
|
+
exclude=exclude,
|
|
4982
|
+
verbose=verbose,
|
|
4983
|
+
max_runtime_secs=max_runtime_secs,
|
|
4984
|
+
stopping_metric=stopping_metric,
|
|
4985
|
+
stopping_tolerance=stopping_tolerance,
|
|
4986
|
+
max_models=max_models,
|
|
4987
|
+
task_type="Clustering",
|
|
4988
|
+
custom_config_file=custom_config_file,
|
|
4989
|
+
**kwargs)
|
|
4990
|
+
|
|
4991
|
+
@staticmethod
|
|
4992
|
+
def visualize(**kwargs):
|
|
4993
|
+
# Currently AutoCluster does not support visualize, so raise an exception.
|
|
4994
|
+
raise TeradataMlException(
|
|
4995
|
+
Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
|
|
4996
|
+
MessageCodes.UNSUPPORTED_OPERATION)
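# A minimal sketch of the resulting behaviour (assuming a connected Vantage session;
# the exact message text depends on MessageCodes.UNSUPPORTED_OPERATION):
#
#   >>> try:
#   ...     AutoCluster.visualize()
#   ... except TeradataMlException as err:
#   ...     print("AutoCluster does not support visualize():", err)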
|