teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two published versions.

Potentially problematic release.



Files changed (96)
  1. teradataml/README.md +210 -0
  2. teradataml/__init__.py +1 -1
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +162 -76
  5. teradataml/analytics/byom/__init__.py +1 -1
  6. teradataml/analytics/json_parser/__init__.py +2 -0
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
  8. teradataml/analytics/json_parser/metadata.py +22 -4
  9. teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
  10. teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
  11. teradataml/analytics/sqle/__init__.py +3 -0
  12. teradataml/analytics/utils.py +4 -1
  13. teradataml/automl/__init__.py +2369 -464
  14. teradataml/automl/autodataprep/__init__.py +15 -0
  15. teradataml/automl/custom_json_utils.py +184 -112
  16. teradataml/automl/data_preparation.py +113 -58
  17. teradataml/automl/data_transformation.py +154 -53
  18. teradataml/automl/feature_engineering.py +113 -53
  19. teradataml/automl/feature_exploration.py +548 -25
  20. teradataml/automl/model_evaluation.py +260 -32
  21. teradataml/automl/model_training.py +399 -206
  22. teradataml/clients/auth_client.py +2 -2
  23. teradataml/common/aed_utils.py +11 -2
  24. teradataml/common/bulk_exposed_utils.py +4 -2
  25. teradataml/common/constants.py +62 -2
  26. teradataml/common/garbagecollector.py +50 -21
  27. teradataml/common/messagecodes.py +47 -2
  28. teradataml/common/messages.py +19 -1
  29. teradataml/common/sqlbundle.py +23 -6
  30. teradataml/common/utils.py +116 -10
  31. teradataml/context/aed_context.py +16 -10
  32. teradataml/data/Employee.csv +5 -0
  33. teradataml/data/Employee_Address.csv +4 -0
  34. teradataml/data/Employee_roles.csv +5 -0
  35. teradataml/data/JulesBelvezeDummyData.csv +100 -0
  36. teradataml/data/byom_example.json +5 -0
  37. teradataml/data/creditcard_data.csv +284618 -0
  38. teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
  39. teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
  40. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
  41. teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
  42. teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
  43. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
  44. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
  45. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
  46. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
  47. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
  48. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
  49. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
  50. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
  51. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
  52. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
  53. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
  54. teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
  55. teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
  56. teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
  57. teradataml/data/load_example_data.py +29 -11
  58. teradataml/data/payment_fraud_dataset.csv +10001 -0
  59. teradataml/data/teradataml_example.json +67 -0
  60. teradataml/dataframe/copy_to.py +714 -54
  61. teradataml/dataframe/dataframe.py +1153 -33
  62. teradataml/dataframe/dataframe_utils.py +8 -3
  63. teradataml/dataframe/functions.py +168 -1
  64. teradataml/dataframe/setop.py +4 -1
  65. teradataml/dataframe/sql.py +141 -9
  66. teradataml/dbutils/dbutils.py +470 -35
  67. teradataml/dbutils/filemgr.py +1 -1
  68. teradataml/hyperparameter_tuner/optimizer.py +456 -142
  69. teradataml/lib/aed_0_1.dll +0 -0
  70. teradataml/lib/libaed_0_1.dylib +0 -0
  71. teradataml/lib/libaed_0_1.so +0 -0
  72. teradataml/lib/libaed_0_1_aarch64.so +0 -0
  73. teradataml/scriptmgmt/UserEnv.py +234 -34
  74. teradataml/scriptmgmt/lls_utils.py +43 -17
  75. teradataml/sdk/_json_parser.py +1 -1
  76. teradataml/sdk/api_client.py +9 -6
  77. teradataml/sdk/modelops/_client.py +3 -0
  78. teradataml/series/series.py +12 -7
  79. teradataml/store/feature_store/constants.py +601 -234
  80. teradataml/store/feature_store/feature_store.py +2886 -616
  81. teradataml/store/feature_store/mind_map.py +639 -0
  82. teradataml/store/feature_store/models.py +5831 -214
  83. teradataml/store/feature_store/utils.py +390 -0
  84. teradataml/table_operators/table_operator_util.py +1 -1
  85. teradataml/table_operators/templates/dataframe_register.template +6 -2
  86. teradataml/table_operators/templates/dataframe_udf.template +6 -2
  87. teradataml/utils/docstring.py +527 -0
  88. teradataml/utils/dtypes.py +93 -0
  89. teradataml/utils/internal_buffer.py +2 -2
  90. teradataml/utils/utils.py +41 -2
  91. teradataml/utils/validators.py +694 -17
  92. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
  93. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
  94. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
  95. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
  96. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
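
The source diff that follows appears to be teradataml/automl/__init__.py, the largest change in this release (+2369 -464). Its docstrings document a new "Clustering" task type, new is_fraud and is_churn flags, and a new imbalance_handling_method keyword. As a quick orientation before the hunks, here is a minimal sketch condensed from those docstrings (the DataFrames and column names are the example datasets referenced there, not a verified workflow):

>>> from teradataml import AutoML
>>> # Unsupervised clustering: task_type must be set explicitly and fit() takes no target column.
>>> automl_obj = AutoML(task_type="Clustering")
>>> automl_obj.fit(bank_train)
>>> # Fraud or churn detection: specialized binary-classification workflows toggled by one flag.
>>> automl_obj = AutoML(is_fraud=True)
>>> automl_obj.fit(credit_fraud_train, "Credit_Class")
>>> automl_obj.leaderboard()
>>> prediction = automl_obj.predict(credit_fraud_test)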
@@ -1,6 +1,6 @@
  # ##################################################################
  #
- # Copyright 2024 Teradata. All rights reserved.
+ # Copyright 2025 Teradata. All rights reserved.
  # TERADATA CONFIDENTIAL AND TRADE SECRET
  #
  # Primary Owner: Sweta Shaw
@@ -14,15 +14,18 @@
  # ##################################################################

  # Python libraries
+ import ast
+ from io import BytesIO
+ import joblib
  import json
- import pandas as pd
+ import matplotlib.pyplot as plt
  import numpy as np
+ import pandas as pd
+ import seaborn as sns
  from sklearn.metrics import confusion_matrix
+ from sklearn.decomposition import PCA
  import time
- import ast
  import warnings
- import joblib
- from io import BytesIO

  # Teradata libraries
  from teradataml.dataframe.copy_to import copy_to_sql
@@ -37,7 +40,14 @@ from teradataml import TeradataMlException
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.telemetry_utils.queryband import collect_queryband
  from teradataml import TeradataConstants
- from teradataml import XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table
+ from teradataml import (XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table,
+ OutlierFilterFit, OutlierFilterTransform, SimpleImputeFit, SimpleImputeTransform,
+ ColumnSummary)
+ from teradataml import td_sklearn as skl
+ from teradataml import CategoricalSummary
+ from teradataml import TargetEncodingFit, TargetEncodingTransform
+ from teradataml import Shap
+ from teradataml import GarbageCollector

  # AutoML Internal libraries
  from teradataml.automl.data_preparation import _DataPreparation
@@ -47,20 +57,22 @@ from teradataml.automl.model_evaluation import _ModelEvaluator
  from teradataml.automl.model_training import _ModelTraining
  from teradataml.automl.data_transformation import _DataTransformation
  from teradataml.automl.custom_json_utils import _GenerateCustomJson
-
+ from teradataml.common.constants import AutoMLConstants

  class AutoML:

  def __init__(self,
- task_type = "Default",
- include = None,
- exclude = None,
- verbose = 0,
- max_runtime_secs = None,
- stopping_metric = None,
- stopping_tolerance = None,
- max_models = None,
- custom_config_file = None,
+ task_type="Default",
+ include=None,
+ exclude=None,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
+ custom_config_file=None,
+ is_fraud=False,
+ is_churn=False,
  **kwargs):
  """
  DESCRIPTION:
@@ -72,23 +84,23 @@ class AutoML:
  machine learning models, by automating some of the more time-consuming
  and labor-intensive tasks involved in the process.

- AutoML is designed to handle both regression and classification (binary and
- multiclass) tasks. User can specify the task type whether to apply
- regression OR classification algorithm on the provided dataset. By default, AutoML
- decides the task type.
-
- AutoML by default, trains using all model algorithms applicable for the
- task type problem. For example, "glm" and "svm" does not support multi-class
- classification problem. Thus, only 3 models are available to train in case
- of multi-class classification problem, by default. While for regression and
- binary classification problem, all 5 models i.e., "glm", "svm", "knn",
- "decision_forest", "xgboost" are available to train by default.
-
- AutoML provides functionality to use specific model algorithms for training.
- User can provide either include or exclude model. In case of include,
- only specified models are trained while for exclude, all models except
- specified model are trained.
+ AutoML is designed to handle regression, classification (binary and multiclass),
+ and clustering tasks. The user can specify the task type to apply regression,
+ classification, or clustering algorithms on the provided dataset. By default,
+ AutoML will automatically decide whether the task is regression or classification.
+ For clustering, it is mandatory for the user to specify the task type explicitly.

+ AutoML can also be run specifically for fraud detection and churn prediction
+ scenarios (binary classification). By setting the available parameters, users
+ can leverage specialized workflows and model selection tailored for these usecases,
+ enabling more effective handling of fraud and churn-related datasets.
+
+ By default, AutoML trains using all model algorithms that are applicable to
+ the selected task type. Beside that, AutoML also provides functionality to use
+ specific model algorithms for training. User can provide either include
+ or exclude model. In case of include, only specified models are trained
+ while for exclude, all models except specified model are trained.
+
  AutoML also provides an option to customize the processes within feature
  engineering, data preparation and model training phases. User can customize
  the processes by passing the JSON file path in case of custom run. It also
@@ -100,20 +112,23 @@ class AutoML:

  PARAMETERS:
  task_type:
- Optional Argument.
- Specifies the task type for AutoML, whether to apply regression OR classification
- on the provided dataset. If user wants AutoML to decide the task type automatically,
- then it should be set to "Default".
+ Required when clustering data is involved otherwise optional.
+ Specifies the type of machine learning task for AutoML: regression, classification, or
+ clustering. If set to "Default", AutoML will automatically determine whether to perform
+ regression or classification based on the target column. For clustering tasks, user must
+ explicitly set this parameter to "Clustering".
  Default Value: "Default"
- Permitted Values: "Regression", "Classification", "Default"
+ Permitted Values: "Regression", "Classification", "Default", "Clustering"
  Types: str

  include:
  Optional Argument.
  Specifies the model algorithms to be used for model training phase.
- By default, all 5 models are used for training for regression and binary
- classification problem, while only 3 models are used for multi-class.
- Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
+ By default, all 5 models ("glm", "svm", "knn", "decision_forest", "xgboost") are
+ used for training for regression and binary classification problem, while only 3
+ models ("knn", "decision_forest", "xgboost") are used for multi-class.
+ For clustering, only 2 models ("KMeans", "GaussianMixture") are used.
+ Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost", "KMeans", "GaussianMixture"
  Types: str OR list of str

@@ -121,7 +136,7 @@
  Optional Argument.
  Specifies the model algorithms to be excluded from model training phase.
  No model is excluded by default.
- Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
+ Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost", "KMeans", "GaussianMixture"
  Types: str OR list of str

  verbose:
@@ -143,15 +158,14 @@
  Required, when "stopping_tolerance" is set, otherwise optional.
  Specifies the stopping metrics for stopping tolerance in model training.
  Permitted Values:
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
- "MAPE", "MPE", "RMSE", "RMSLE",
- "ME", "EV", "MPD", "MGD"
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE", "MAPE", "MPE",
+ "RMSE", "RMSLE", "ME", "EV", "MPD", "MGD"

- * For task_type "Classification": 'MICRO-F1','MACRO-F1',
- 'MICRO-RECALL','MACRO-RECALL',
- 'MICRO-PRECISION', 'MACRO-PRECISION',
- 'WEIGHTED-PRECISION','WEIGHTED-RECALL',
- 'WEIGHTED-F1', 'ACCURACY'
+ * For task_type "Classification": "MICRO-F1", "MACRO-F1", "MICRO-RECALL", "MACRO-RECALL",
+ "MICRO-PRECISION", "MACRO-PRECISION", "WEIGHTED-PRECISION",
+ "WEIGHTED-RECALL", "WEIGHTED-F1", "ACCURACY"
+
+ * For task_type "Clustering": "SILHOUETTE", "CALINSKI", "DAVIES"
  Types: str

  stopping_tolerance:
@@ -169,6 +183,18 @@
  Specifies the path of JSON file in case of custom run.
  Types: str

+ is_fraud:
+ Optional Argument.
+ Specifies whether the usecase is for fraud detection.
+ Default Value: False
+ Types: bool
+
+ is_churn:
+ Optional Argument.
+ Specifies whether the usecase is for churn prediction.
+ Default Value: False
+ Types: bool
+
  **kwargs:
  Specifies the additional arguments for AutoML. Below
  are the additional arguments:
@@ -199,6 +225,14 @@
  Specifies the random seed for reproducibility.
  Default Value: 42
  Types: int
+
+ imbalance_handling_method:
+ Optional Argument.
+ Specifies which data imbalance method to use for classification
+ problems.
+ Default Value: SMOTE
+ Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
+ Types: str

  RETURNS:
  Instance of AutoML.
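
For reference, the imbalance_handling_method documented above is read from **kwargs (a later hunk in this diff shows kwargs.get('imbalance_handling_method', "SMOTE")), so it is passed straight to the constructor. A minimal, illustrative call, assuming an imbalanced classification dataset:

>>> # Hypothetical example: switch the balancing strategy from the default SMOTE to ADASYN.
>>> automl_obj = AutoML(task_type="Classification", imbalance_handling_method="ADASYN")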
@@ -218,14 +252,20 @@
  >>> load_example_data("GLMPredict", ["admissions_test", "admissions_train"])
  >>> load_example_data("decisionforestpredict", ["housing_train", "housing_test"])
  >>> load_example_data("teradataml", "iris_input")
-
+ >>> load_example_data("teradataml", "credit_fraud_dataset")
+ >>> load_example_data("teradataml", "bank_churn")
+ >>> load_example_data("teradataml", "bank_marketing")
+
  # Create teradataml DataFrames.
  >>> admissions_train = DataFrame.from_table("admissions_train")
  >>> admissions_test = DataFrame.from_table("admissions_test")
  >>> housing_train = DataFrame.from_table("housing_train")
  >>> housing_test = DataFrame.from_table("housing_test")
  >>> iris_input = DataFrame.from_table("iris_input")
-
+ >>> credit_fraud_df = DataFrame.from_table("credit_fraud_dataset")
+ >>> churn_df = DataFrame.from_table("bank_churn")
+ >>> bank_df = DataFrame.from_table("bank_marketing")
+
  # Example 1: Run AutoML for classification problem.
  # Scenario: Predict whether a student will be admitted to a university
  # based on different factors. Run AutoML to get the best
@@ -307,7 +347,7 @@

  # Split the data into train and test.
  >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
- >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> iris_train = iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
  >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)

  # Generate custom JSON file
@@ -372,7 +412,7 @@

  # Split the data into train and test.
  >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
- >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> iris_train = iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
  >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)

  # Create instance of AutoML.
@@ -404,25 +444,133 @@

  # Run evaluate to get performance metrics using model rank 4.
  >>> performance_metrics = automl_obj.evaluate(iris_test, 4)
- >>> performance_metrics
+ >>> performance_metrics
+
+ # Example 6 : Run AutoML for fraud detection problem.
+ # Scenario : Predict whether credit card transaction is Fraud or not.
+
+ # Split the data into train and test.
+ >>> credit_fraud_sample = credit_fraud_df.sample(frac = [0.8, 0.2])
+ >>> credit_fraud_train = credit_fraud_sample[credit_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> credit_fraud_test = credit_fraud_sample[credit_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Create instance of AutoML with is_fraud set to True.
+ >>> automl_obj = AutoML(is_fraud=True)
+
+ # Fit the data.
+ >>> automl_obj.fit(credit_fraud_train, "Credit_Class")
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(credit_fraud_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(credit_fraud_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(credit_fraud_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using model rank 4.
+ >>> performance_metrics = automl_obj.evaluate(credit_fraud_test, 4)
+ >>> performance_metrics
+
+ # Example 7 : Run AutoML for churn prediction problem.
+ # Scenario : Predict whether a customer churn for bank or not.
+
+ # Split the data into train and test.
+ >>> churn_sample = churn_df.sample(frac = [0.8, 0.2])
+ >>> churn_train = churn_sample[churn_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> churn_test = churn_sample[chrun_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Create instance of AutoML with is_churn=True
+ >>> automl_obj = AutoML(is_churn=True)
+
+ # Fit the data.
+ >>> automl_obj.fit(churn_train, "churn")
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(churn_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(churn_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(churn_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using model rank 4.
+ >>> performance_metrics = automl_obj.evaluate(churn_test, 4)
+ >>> performance_metrics
+
+ # Example 8: Use AutoML for unsupervised clustering task based on bank data.
+ # Scenario: Automatically group similar records in the dataset into clusters.
+
+ # Split the data into train and test.
+ >>> bank_sample = bank_df.sample(frac = [0.8, 0.2])
+ >>> bank_train = bank_sample[bank_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> bank_test = bank_sample[bank_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Create instance of AutoML.
+ >>> automl_obj = AutoML(task_type="Clustering")
+
+ # Fit the data.
+ >>> automl_obj.fit(bank_train)
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(bank_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(bank_test, rank=2)
+ >>> prediction
  """
+ # Validate task_type first before using it in conditional logic
+ task_type_arg_info = [["task_type", task_type, True, (str), True, ["Regression", "Classification", "Clustering", "Default"]]]
+ _Validators._validate_function_arguments(task_type_arg_info)
+
  # Appending arguments to list for validation
  arg_info_matrix = []
- arg_info_matrix.append(["task_type", task_type, True, (str), True, ["Regression", "Classification", "Default"]])
- arg_info_matrix.append(["include", include, True, (str, list), True, ["glm", "svm", "knn",
- "decision_forest", "xgboost"]])
- arg_info_matrix.append(["exclude", exclude, True, (str, list), True, ["glm", "svm", "knn",
- "decision_forest", "xgboost"]])
+
+ if task_type.lower() == 'clustering':
+ arg_info_matrix.append(["include", include, True, (str, list), True, AutoMLConstants.CLUSTERING_MODELS.value])
+ arg_info_matrix.append(["exclude", exclude, True, (str, list), True, AutoMLConstants.CLUSTERING_MODELS.value])
+ arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.CLUSTERING_METRICS.value])
+ else:
+ arg_info_matrix.append(["include", include, True, (str, list), True, AutoMLConstants.SUPERVISED_MODELS.value])
+ arg_info_matrix.append(["exclude", exclude, True, (str, list), True, AutoMLConstants.SUPERVISED_MODELS.value])
+ if task_type.lower() == "classification" or is_fraud or is_churn:
+ arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.CLASSIFICATION_METRICS.value])
+ elif task_type.lower() == "regression":
+ arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.REGRESSION_METRICS.value])
+ else:
+ arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, AutoMLConstants.ALL_METRICS.value])
+
+
  arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
  arg_info_matrix.append(["max_runtime_secs", max_runtime_secs, True, (int, float)])
- arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", "MAE", "MSE", "MSLE",
- "MAPE", "MPE", "RMSE", "RMSLE",
- "ME", "EV", "MPD", "MGD",
- 'MICRO-F1','MACRO-F1',
- 'MICRO-RECALL','MACRO-RECALL',
- 'MICRO-PRECISION', 'MACRO-PRECISION',
- 'WEIGHTED-PRECISION','WEIGHTED-RECALL',
- 'WEIGHTED-F1', 'ACCURACY']])
+
  arg_info_matrix.append(["stopping_tolerance", stopping_tolerance, True, (float, int)])
  arg_info_matrix.append(["max_models", max_models, True, (int)])
  arg_info_matrix.append(["custom_config_file", custom_config_file, True, (str), True])
@@ -430,10 +578,14 @@
  volatile = kwargs.get('volatile', False)
  persist = kwargs.get('persist', False)
  seed = kwargs.get('seed', 42)
+ imbalance_handling_method = kwargs.get('imbalance_handling_method', "SMOTE")

  arg_info_matrix.append(["volatile", volatile, True, (bool)])
  arg_info_matrix.append(["persist", persist, True, (bool)])
  arg_info_matrix.append(["seed", seed, True, (int)])
+ arg_info_matrix.append(["imbalance_handling_method", imbalance_handling_method, True, (str), True, ["SMOTE", "ADASYN", "SMOTETomek", "NearMiss"]])
+ arg_info_matrix.append(["is_fraud", is_fraud, True, (bool)])
+ arg_info_matrix.append(["is_churn", is_churn, True, (bool)])

  # Validate argument types
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -447,7 +599,24 @@
  _Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric", stopping_tolerance, "stopping_tolerance")
  # Validate lower range for max_models
  _Validators._validate_argument_range(max_models, "max_models", lbound=1, lbound_inclusive=True)
-
+ # Either is_fraud or is_churn can be used.
+ if is_fraud or is_churn:
+ _Validators._validate_mutually_exclusive_arguments(is_fraud, "is_fraud", is_churn, "is_churn")
+ # Validate mutually exclusive arguments for clustering and is_fraud
+ if task_type.lower() == 'clustering' and is_fraud:
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH,
+ f"task_type={task_type}",
+ f"is_fraud={is_fraud}"),
+ MessageCodes.CANNOT_USE_TOGETHER_WITH)
+ # Validate mutually exclusive arguments for clustering and is_churn
+ if task_type.lower() == 'clustering' and is_churn:
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH,
+ f"task_type = {task_type}",
+ f"is_churn = {is_churn}"),
+ MessageCodes.CANNOT_USE_TOGETHER_WITH)
+
  custom_data = None
  self.auto = True
  # Validate custom file
@@ -474,27 +643,43 @@
  self.stopping_metric = stopping_metric
  self.stopping_tolerance = stopping_tolerance
  self.max_models = max_models
- self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
  self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
  self._is_fit_called = False
  self._is_load_model_called = False
- self.kwargs = kwargs
- self.table_name_mapping = {}
+
+ self.table_name_mapping = {}
  # Stores the table name of all intermediate datas
- self._intermediate_table_names={}
+ self._intermediate_table_names = {}
  self._auto_dataprep = False
  self._phases = None
  self._progressbar_prefix = "AutoML Running:"

+ self.cluster = self.task_type.lower() == 'clustering'
+ self.fraud = is_fraud or kwargs.get("fraud", False)
+ self.churn = is_churn or kwargs.get("churn", False)
+
+ if self.cluster:
+ self.model_list = AutoMLConstants.CLUSTERING_MODELS.value
+ else:
+ self.model_list = AutoMLConstants.SUPERVISED_MODELS.value
+ kwargs.pop("churn", None)
+ kwargs.pop("fraud", None)
+ kwargs.pop("cluster", None)
+ self.kwargs = kwargs
+
+ self.volatile = volatile
+ self.persist = persist
+
+
  @collect_queryband(queryband="AutoML_fit")
  def fit(self,
  data,
- target_column):
+ target_column=None):
  """
  DESCRIPTION:
- Function triggers the AutoML run. It is designed to handle both
- regression and classification tasks depending on the specified "task_type".
-
+ Function triggers the AutoML run. It is designed to handle regression ,
+ classification and clustering tasks depending on the specified "task_type".
+
  PARAMETERS:
  data:
@@ -502,7 +687,7 @@
  Types: teradataml Dataframe

  target_column:
- Required Argument.
+ Required Argument. Optional only for clustering tasks.
  Specifies target column of dataset.
  Types: str or ColumnExpression

@@ -513,41 +698,49 @@
  TeradataMlException, TypeError, ValueError

  EXAMPLES:
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Perform fit() operation on the "automl_obj".

- # Example 1: Passing column expression for target column.
+ # Example 1: Fit AutoML by passing column expression for target column.
  >>> automl_obj.fit(data = housing_train, target_col = housing_train.price)
-
- # Example 2: Passing name of target column.
- >>> automl_obj.fit(data = housing_train, target_col = "price")
- """

- self._is_fit_called = True
- # Checking if target column is of type ColumnExpression
- if isinstance(target_column, ColumnExpression):
- target_column = target_column.name
-
- # Appending fit arguments to list for validation
- arg_info_fit_matrix = []
- arg_info_fit_matrix.append(["data", data, False, (DataFrame), True])
- arg_info_fit_matrix.append(["target_column", target_column, False, (str), True])
+ # Example 2: Fit AutoML by passing name of target column.
+ >>> automl_obj.fit(data = housing_train, target_col = "price")
+
+ # Example 3: Fit fraud detection model on credit_fraud_df.
+ >>> automl_obj.fit(data=credit_fraud_df, target_column="Credit_Class")
+
+ # Example 4: Fit churn prediction model on churn_df.
+ >>> automl_obj.fit(data=churn_df, target_column="churn")
+
+ # Example 5: Passing clustering data for training,
+ # without specifying target column.
+ >>> automl_obj.fit(data = bank_train)
+ """

+ self._is_fit_called = True
+ # Prepare argument validation matrix
+ arg_info_fit_matrix = [["data", data, False, (DataFrame), True]]
+ if not self.cluster:
+ # Checking if target column is of type ColumnExpression
+ if isinstance(target_column, ColumnExpression):
+ target_column = target_column.name
+ arg_info_fit_matrix.append(["target_column", target_column, False, (str), True])
  # Validate argument types
  _Validators._validate_function_arguments(arg_info_fit_matrix)

  # Initializing class variables
  self.data = data
- self.target_column = target_column
-
+ if not self.cluster:
+ self.target_column = target_column
  # Checking if include model list is present
  if self.include_model:
  # Converting to list if passed as string
  self.include_model = UtilFuncs._as_list(self.include_model)
  # Updating model list based on include list
  self.model_list = list(set(self.include_model))
- self.model_list = [model.lower() for model in self.model_list]

  # Checking if exclude model list is present
  if self.exclude_model:
@@ -555,40 +748,40 @@
  self.exclude_model = UtilFuncs._as_list(self.exclude_model)
  # Updating model list based on exclude list
  self.model_list = list(set(self.model_list) - set(self.exclude_model))
- self.model_list = [model.lower() for model in self.model_list]
-
- # Checking if target column is present in data
- _Validators._validate_dataframe_has_argument_columns(self.target_column, "target_column", self.data, "df")
-
- # Handling default task type
- if self.task_type.casefold() == "default":
- # if target column is having distinct values less than or equal to 20,
- # then it will be mapped to classification problem else regression problem
- if self.data.drop_duplicate(self.target_column).size <= 20:
- print("\nTask type is set to Classification as target column "
- "is having distinct values less than or equal to 20.")
- self.task_type = "Classification"
- else:
- print("\nTask type is set to Regression as target column is "
- "having distinct values greater than 20.")
- self.task_type = "Regression"
-
- if self.is_classification_type():
- if self.stopping_metric is not None:
- permitted_values = ["MICRO-F1", "MACRO-F1",
- "MICRO-RECALL", "MACRO-RECALL",
- "MICRO-PRECISION", "MACRO-PRECISION",
- "WEIGHTED-PRECISION", "WEIGHTED-RECALL",
- "WEIGHTED-F1", "ACCURACY"]
- _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
+
+ # Normalize model names: lowercase for non-cluster, original for cluster
+ if self.include_model or self.exclude_model:
+ self.model_list = [model if self.cluster else model.lower() for model in self.model_list]
+
+
+ if not self.cluster:
+ # Checking if target column is present in data
+ _Validators._validate_dataframe_has_argument_columns(self.target_column, "target_column", self.data, "df")
+
+ # Handling default task type
+ if self.task_type.casefold() == "default":
+ # if target column is having distinct values less than or equal to 20,
+ # then it will be mapped to classification problem else regression problem
+ if self.data.drop_duplicate(self.target_column).size <= 20:
+ print("\nTask type is set to Classification as target column "
+ "is having distinct values less than or equal to 20.")
+ self.task_type = "Classification"
+ else:
+ print("\nTask type is set to Regression as target column is "
+ "having distinct values greater than 20.")
+ self.task_type = "Regression"
+
+ if self.is_classification_type():
+ if self.stopping_metric is not None:
+ _Validators._validate_permitted_values(self.stopping_metric, AutoMLConstants.CLASSIFICATION_METRICS.value, "stopping_metric")
+ elif self.task_type.lower() == "regression":
+ if self.stopping_metric is not None:
+ _Validators._validate_permitted_values(self.stopping_metric, AutoMLConstants.REGRESSION_METRICS.value, "stopping_metric")
  else:
  if self.stopping_metric is not None:
- permitted_values = ["R2", "MAE", "MSE", "MSLE",
- "MAPE", "MPE", "RMSE", "RMSLE",
- "ME", "EV", "MPD", "MGD"]
- _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
+ _Validators._validate_permitted_values(self.stopping_metric, AutoMLConstants.CLUSTERING_METRICS.value, "stopping_metric")

- if not self.is_classification_type():
+ if not self.is_classification_type() and not self.cluster:
  _Validators._validate_column_type(self.data, self.target_column, 'target_column',
  expected_types=UtilFuncs()._get_numeric_datatypes())

@@ -597,33 +790,40 @@
  print("\nReceived below input for customization : ")
  print(json.dumps(self.custom_data, indent=4))

- # Classification probelm
  task_cls = _Classification
  cls_method = "_classification"
-
+ if self.fraud:
+ task_cls = _AutoSpecific
+ cls_method = "fit"
+ elif self.churn:
+ task_cls = _AutoSpecific
+ cls_method = "fit"
  # Regression problem
- if self.task_type.casefold() == "regression":
+ elif self.task_type.casefold() == "regression":
  task_cls = _Regression
  cls_method = "_regression"
-
- # Running AutoML
- clf = task_cls(self.data, self.target_column, self.custom_data)
+ elif self.cluster:
+ task_cls = _Clustering
+ cls_method = "_clustering"
+

+ # Running AutoML
+ clf = task_cls(data=self.data, target_column=self.target_column, custom_data=self.custom_data,
+ fraud=self.fraud, churn=self.churn, cluster=self.cluster, **self.kwargs)
+
  self.model_info, self.leader_board, self.target_count, self.target_label, \
  self.data_transformation_params, self._intermediate_table_names = getattr(clf, cls_method)(
- model_list = self.model_list,
- auto = self.auto,
- verbose = self.verbose,
- max_runtime_secs = self.max_runtime_secs,
- stopping_metric = self.stopping_metric,
- stopping_tolerance = self.stopping_tolerance,
- max_models = self.max_models,
- auto_dataprep = self._auto_dataprep,
- automl_phases = self._phases,
- progress_prefix = self._progressbar_prefix,
+ model_list=self.model_list,
+ auto=self.auto,
+ verbose=self.verbose,
+ max_runtime_secs=self.max_runtime_secs,
+ stopping_metric=self.stopping_metric,
+ stopping_tolerance=self.stopping_tolerance,
+ max_models=self.max_models,
+ auto_dataprep=self._auto_dataprep,
+ automl_phases=self._phases,
+ progress_prefix=self._progressbar_prefix,
  **self.kwargs)
-
-
  # table_name_mapping stores the table name of all intermediate datas (lasso, rfe, pca)
  # used for training models
  keys_to_extract = ['lasso_train', 'rfe_train', 'pca_train']
@@ -633,13 +833,14 @@
  # Model Evaluation Phase
  self.m_evaluator = _ModelEvaluator(self.model_info,
  self.target_column,
- self.task_type)
+ self.task_type,
+ cluster=self.cluster)

  @collect_queryband(queryband="AutoML_predict")
  def predict(self,
  data,
- rank = 1,
- use_loaded_models = False):
+ rank=1,
+ use_loaded_models=False):
  """
  DESCRIPTION:
  Function generates prediction on data using model rank in
@@ -673,9 +874,10 @@
  RAISES:
  TeradataMlException, TypeError, ValueError

- EXAMPLES:
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ EXAMPLES:
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Perform fit() operation on the "automl_obj".
  # Perform predict() operation on the "automl_obj".

@@ -730,97 +932,107 @@
  rank = rank-1

  # Setting indicator to False if target column doesn't exist
- if self.target_column not in data.columns:
+ if self.cluster or self.target_column not in data.columns:
  self.target_column_ind = False

  # Checking if data is already transformed before or not
  data_node_id = data._nodeid
+
+ selected_model_info = self.leader_board.iloc[rank]
+ feature_selection_method = selected_model_info.get("FEATURE_SELECTION", "pca")
  if not self.table_name_mapping.get(data_node_id):
  # At first data transformation will be performed on raw test data
  # then evaluation will happen.
- self.transform_data(data)
+ self._transform_data(data, feature_selection_mtd=feature_selection_method)
  else:
  print("\nSkipping data transformation as data is already transformed.")
-
+
  # Generating prediction
- pred = self.m_evaluator.model_evaluation(rank = rank,
- table_name_mapping = self.table_name_mapping,
- data_node_id = data_node_id,
- target_column_ind = self.target_column_ind)
-
+ pred = self.m_evaluator.model_evaluation(rank=rank,
+ table_name_mapping=self.table_name_mapping,
+ data_node_id=data_node_id,
+ target_column_ind=self.target_column_ind,
+ is_predict=True)
+
  # Checking if problem type is classification and target label is present.
- if self.is_classification_type() and self.target_label is not None:
- # Displaying target column labels
- tar_dct = {}
- print('\nTarget Column Mapping:')
- # Iterating rows
- for row in self.target_label.result.itertuples():
- # Retrieving the category names of encoded target column
- # row[1] contains the orginal name of cateogry
- # row[2] contains the encoded value
- if row[1] != 'TD_CATEGORY_COUNT':
- tar_dct[row[1]] = row[2]
+ if not self.cluster:
+ self._display_target_column_mapping()
+
+ # Renaming probability column if any
+ prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
+ if len(prob_lst) > 0:
+ rename_dict = {}
+ for col in pred.result.columns:
+ if col not in prob_lst:
+ rename_dict[col] = getattr(pred.result, col)
+ else:
+ indx = int(col.split('_')[1])
+ rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
+ rename_dict['drop_columns'] = True
+ pred.result = pred.result.assign(**rename_dict)
+
+ print("\nPrediction : ")
+ print(pred.result)
+
+ if self.target_column_ind:
+ prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
+ probability_column = 'prob_1'
+ # Displaying confusion matrix and ROC-AUC for classification problem
+ if self.is_classification_type():
+ print_data = lambda data: print(data) if _is_terminal() else display(data)
+ # Displaying ROC-AUC for binary classification
+ if self.target_count == 2:
+ fit_params = {
+ "probability_column" : probability_column,
+ "observation_column" : self.target_column,
+ "positive_class" : "1",
+ "data" : pred.result
+ }
+ # ROC can fail if the data is imbalanced. to handle it,
+ # we are skipping ROC calculation and giving warning.
+ try:
+ # Fitting ROC
+ roc_out = ROC(**fit_params)
+ print("\nROC-AUC : ")
+ print_data(roc_out.result)
+ print_data(roc_out.output_data)
+ except TeradataMlException as e:
+ msg = f"ROC fitting skipped: {e}"
+ warnings.warn(message=msg, stacklevel=2)

- for key, value in tar_dct.items():
- print(f"{key}: {value}")
-
- # Renaming probability column if any
- prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
- if len(prob_lst) > 0:
- rename_dict ={}
- for col in pred.result.columns:
- if col not in prob_lst:
- rename_dict[col] = getattr(pred.result, col)
- else:
- indx = int(col.split('_')[1])
- rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
- rename_dict['drop_columns'] = True
- pred.result = pred.result.assign(**rename_dict)
-
- print("\nPrediction : ")
- print(pred.result)
-
- if self.target_column_ind:
- prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
- probability_column = 'prob_1'
- pred_target_count = pred.result.drop_duplicate(self.target_column).size
- # Displaying confusion matrix and ROC-AUC for classification problem
- if self.is_classification_type():
- print_data = lambda data: print(data) if _is_terminal() else display(data)
- # Displaying ROC-AUC for binary classification
- if self.target_count == 2 and pred_target_count == 2:
- fit_params = {
- "probability_column" : probability_column,
- "observation_column" : self.target_column,
- "positive_class" : "1",
- "data" : pred.result
- }
- # Fitting ROC
- roc_out = ROC(**fit_params)
- print("\nROC-AUC : ")
- print_data(roc_out.result)
- print_data(roc_out.output_data)
-
- # Displaying confusion matrix for binary and multiclass classification
- prediction_df=pred.result.to_pandas()
- target_col = self.target_column
- print("\nConfusion Matrix : ")
- print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
-
+ # Displaying confusion matrix for binary and multiclass classification
+ prediction_df = pred.result.to_pandas()
+ target_col = self.target_column
+ print("\nConfusion Matrix : ")
+ print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
+ else:
+ print("\n Cluster Assignment:")
+ pred_cols = pred.columns
+ # Auto-detect cluster prediction column
+ cluster_col = [col for col in pred_cols if "predict" in col.lower()][0]
+
+ # Select and rename for pretty output
+
+ pred = pred.assign(cluster_assignment=getattr(pred, cluster_col))
+ pred = pred.drop(columns=[cluster_col])
+ prediction = pred.select(["id", "cluster_assignment"])
+ # Display result
+ print(prediction)
  # Returning prediction
- return pred.result
+ return pred.result if not self.cluster else prediction

  @collect_queryband(queryband="AutoML_evaluate")
  def evaluate(self,
  data,
- rank = 1,
- use_loaded_models = False
+ rank=1,
+ use_loaded_models=False
  ):
  """
  DESCRIPTION:
  Function evaluates on data using model rank in leaderboard
  and generates performance metrics.
  Note:
+ * AutoCluster does not support evaluate method, so it raises an exception.
  * If both fit and load method are called before predict, then fit method model will be used
  for prediction by default unless 'use_loaded_models' is set to True in predict.

@@ -852,8 +1064,9 @@
  TeradataMlException.

  EXAMPLES:
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" method.
  # Perform fit() operation on the "automl_obj".
  # Perform evaluate() operation on the "automl_obj".

@@ -876,6 +1089,12 @@
  >>> evaluation = automl_obj.evaluate(admissions_test, rank=3, use_loaded_models=True)
  >>> evaluation
  """
+ # Currently AutoCluster does not support evaluate so raising the exception
+ if self.cluster:
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
+ MessageCodes.UNSUPPORTED_OPERATION)
+
  # Raising exception if fit or load model is not called before evaluate
  _Validators._validate_dependent_method("evaluate", ["fit", "load"],
  [self._is_fit_called, self._is_load_model_called])
@@ -907,7 +1126,7 @@

  # Raising exception if target column is not present in data
  # as it is required for evaluation.
- if self.target_column not in data.columns:
+ if not self.cluster and self.target_column not in data.columns:
  raise TeradataMlException(
  Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
  MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
@@ -917,47 +1136,41 @@
  if not self.table_name_mapping.get(data_node_id):
  # At first data transformation will be performed on raw test data
  # then evaluation will happen.
- self.transform_data(data)
+ self._transform_data(data)
  else:
  print("\nSkipping data transformation as data is already transformed.")

- metrics = self.m_evaluator.model_evaluation(rank = rank,
+ metrics = self.m_evaluator.model_evaluation(rank=rank,
  table_name_mapping=self.table_name_mapping,
- data_node_id = data_node_id,
- get_metrics = True)
+ data_node_id=data_node_id,
+ get_metrics=True,
+ is_predict=False)

  # Checking if problem type is classification and target label is present.
- if self.is_classification_type() and self.target_label is not None:
- # Displaying target column labels
- tar_dct = {}
- print('\nTarget Column Mapping:')
- # Iterating rows
- for row in self.target_label.result.itertuples():
- # Retrieving the category names of encoded target column
- # row[1] contains the orginal name of cateogry
- # row[2] contains the encoded value
- if row[1] != 'TD_CATEGORY_COUNT':
- tar_dct[row[1]] = row[2]
-
- for key, value in tar_dct.items():
- print(f"{key}: {value}")
-
- # Showing performance metrics
- print("\nPerformance Metrics : ")
- print(metrics.result)
- if self.is_classification_type():
- print("-"*80)
- print(metrics.output_data)
+ if not self.cluster:
+ self._display_target_column_mapping()
+
+ # Showing performance metrics
+ print("\nPerformance Metrics : ")
+ print(metrics.result)
+ if self.is_classification_type():
+ print("-"*80)
+ print(metrics.output_data)
+
+ # Returning performance metrics
+ return metrics.result
+ else:
+ print("\nClustering Evaluation Metrics : ")
+ print(metrics)
+ return metrics

- # Returning performance metrics
- return metrics.result
-
- def transform_data(self,
- data,
- data_params = None,
- auto = None,
- verbose = None,
- target_column_ind = None):
+ def _transform_data(self,
+ data,
+ feature_selection_mtd=None,
+ data_params=None,
+ auto=None,
+ verbose=None,
+ target_column_ind=None):
  """
  DESCRIPTION:
  Function transforms the data based on the data transformation parameters
@@ -968,7 +1181,13 @@
  Required Argument.
  Specifies the dataset to be transformed.
  Types: teradataml DataFrame
-
+
+ feature_selection_mtd:
+ Optional Argument.
+ Specifies the feature selection method to be applied.
+ Default Value: None
+ Types: str
+
  data_params:
  Optional Argument.
  Specifies the data transformation parameters.
@@ -997,14 +1216,16 @@
  None
  """
  # Creating instance of DataTransformation
- data_transform_instance = _DataTransformation(data = data,
+ data_transform_instance = _DataTransformation(data=data,
  data_transformation_params=data_params if data_params is not None else \
  self.data_transformation_params,
  auto=auto if data_params is not None else self.auto,
  verbose=verbose if verbose is not None else self.verbose,
  target_column_ind=target_column_ind if target_column_ind is not None else \
  self.target_column_ind,
- table_name_mapping=self.table_name_mapping)
+ table_name_mapping=self.table_name_mapping,
+ cluster=self.cluster,
+ feature_selection_method=feature_selection_mtd)

  # Storing mapping of table names for transformed data
  self.table_name_mapping = data_transform_instance.data_transformation()
@@ -1022,8 +1243,9 @@
  TeradataMlException.

  EXAMPLES:
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Perform fit() operation on the "automl_obj".
  # Generate leaderboard using leaderboard() method on "automl_obj".
  >>> automl_obj.leaderboard()
@@ -1046,8 +1268,9 @@
  TeradataMlException.

  EXAMPLES:
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Perform fit() operation on the "automl_obj".
  # Generate leaderboard using leaderboard() method on "automl_obj".
  # Display best performing model using leader() method on "automl_obj".
@@ -1095,8 +1318,9 @@

  EXAMPLES:
  # Example 1: Get hyperparameters of the model using fit models.
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Perform fit() operation on the "automl_obj".
  # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
  >>> automl_obj = AutoML(task_type="Classification")
@@ -1104,8 +1328,9 @@
  >>> automl_obj.model_hyperparameters(rank=1)

  # Example 2: Get hyperparameters of the model using loaded models.
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Load models from the specified table.
  # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
  >>> automl_obj = AutoML()
@@ -1113,8 +1338,9 @@
  >>> automl_obj.model_hyperparameters(rank=1)

  # Example 3: Get hyperparameters of the model when both fit and load method are called.
- # Create an instance of the AutoML called "automl_obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "automl_obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
  # Fit the data.
  # Load models from the specified table.
  # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
@@ -1152,7 +1378,9 @@
  hyperparams = leaderboard.loc[leaderboard['RANK'] == rank, 'PARAMETERS'].values[0]

  # Deserializing hyperparameters
- hyperparams = ast.literal_eval(hyperparams)
+
+ if isinstance(hyperparams, str):
+ hyperparams = ast.literal_eval(hyperparams)

  # Removing 'data' from hyperparameters
  keys_to_remove = ['input_columns', 'data', 'train_data', 'test_data']
@@ -1167,7 +1395,8 @@
  """
  DESCRIPTION:
  Function loads models information from the specified table.
-
+ Note:
+ * AutoCluster does not support load method, so it raises an exception.
  PARAMETERS:
  table_name:
  Required Argument.
@@ -1181,12 +1410,19 @@
  TeradataMlException.

  EXAMPLES:
- # Create an instance of the AutoML called "obj"
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
+ # Create an instance of the AutoML called "obj" by referring
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
+ # "AutoFraud()" or "AutoChurn()" method.
  >>> obj = AutoML()
  # Load models from the specified table.
  >>> tab = obj.load("model_table")
  """
+ # Currently AutoCluster does not support load so raising the exception
+ if self.cluster:
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
+ MessageCodes.UNSUPPORTED_OPERATION)
+
  # Appending arguments to list for validation
  arg_info_matrix = []
  arg_info_matrix.append(["table_name", table_name, True, (str), True])
@@ -1196,6 +1432,19 @@ class AutoML:
1196
1432
 
1197
1433
  # Loading models
1198
1434
  self.loaded_models_info = DataFrame(table_name).to_pandas()
1435
+ cols = self.loaded_models_info.columns
1436
+
1437
+ # Scan column names to determine task_type based on presence of "ACCURACY"
1438
+ if any("ACCURACY" in col.upper() for col in cols):
1439
+ self.task_type = "Classification"
1440
+ else:
1441
+ self.task_type = "Regression"
1442
+
1443
+ if not hasattr(self, "m_evaluator") or self.m_evaluator is None:
1444
+ self.m_evaluator = _ModelEvaluator(df=self.loaded_models_info,
1445
+ target_column=self.target_column,
1446
+ task_type=self.task_type,
1447
+ cluster=self.cluster)
1199
1448
 
1200
1449
  self._load_data_transform_params()
1201
1450
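Because a deployed table carries no explicit task marker, load() re-derives the task type from the leaderboard columns: an ACCURACY column implies classification, otherwise regression is assumed. A small sketch of that inference on a plain pandas frame (column names assumed to match a deployed leaderboard):
>>> import pandas as pd
>>> loaded_models_info = pd.DataFrame(columns=['RANK', 'MODEL_ID', 'ACCURACY', 'PARAMETERS'])
>>> cols = loaded_models_info.columns
>>> task_type = "Classification" if any("ACCURACY" in c.upper() for c in cols) else "Regression"
>>> task_type
'Classification'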
 
@@ -1208,8 +1457,6 @@ class AutoML:
1208
1457
  DESCRIPTION:
1209
1458
  Internal Function loads data transformation parameters from the specified table.
1210
1459
  """
1211
- from sklearn.decomposition import PCA
1212
-
1213
1460
  # Getting data transformation row
1214
1461
  data_transform_row = self.loaded_models_info[self.loaded_models_info['RANK'] == -1].iloc[0]
1215
1462
 
@@ -1236,22 +1483,23 @@ class AutoML:
1236
1483
  data_params[fit_obj_name] = DataFrame(f'{data_params[fit_obj_name]}')
1237
1484
 
1238
1485
  # Manually deserializing and reconstructing PCA object
1239
- load_pca_info = data_params['pca_fit_instance']
1240
- pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
1241
- pca.components_ = np.array(load_pca_info['components'])
1242
- pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
1243
- pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
1244
- pca.mean_ = np.array(load_pca_info['mean'])
1245
- pca.n_components_ = load_pca_info['n_components']
1246
- pca.noise_variance_ = load_pca_info['noise_variance']
1247
- pca.singular_values_ = np.array(load_pca_info['singular_values'])
1248
- pca.feature_names_in_ = data_params['pca_fit_columns']
1249
- pca.n_features_in_ = len(data_params['pca_fit_columns'])
1250
-
1251
- data_params['pca_fit_instance'] = pca
1486
+ if 'pca_fit_instance' in data_params:
1487
+ load_pca_info = data_params['pca_fit_instance']
1488
+ pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
1489
+ pca.components_ = np.array(load_pca_info['components'])
1490
+ pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
1491
+ pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
1492
+ pca.mean_ = np.array(load_pca_info['mean'])
1493
+ pca.n_components_ = load_pca_info['n_components']
1494
+ pca.noise_variance_ = load_pca_info['noise_variance']
1495
+ pca.singular_values_ = np.array(load_pca_info['singular_values'])
1496
+ pca.feature_names_in_ = data_params['pca_fit_columns']
1497
+ pca.n_features_in_ = len(data_params['pca_fit_columns'])
1498
+
1499
+ data_params['pca_fit_instance'] = pca
1252
1500
 
1253
1501
  self.loaded_data_transformation_params = data_params
1254
-
1502
+
1255
1503
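Only the learned PCA attributes are persisted, so the estimator is rebuilt attribute by attribute on load. A minimal standalone sketch of that reconstruction with scikit-learn; the dictionary keys mirror the ones used above and the numbers are illustrative:
>>> import numpy as np
>>> from sklearn.decomposition import PCA
>>> load_pca_info = {'n_components': 2,
...                  'components': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
...                  'mean': [0.0, 0.0, 0.0],
...                  'explained_variance': [1.0, 0.5],
...                  'explained_variance_ratio': [0.6, 0.3],
...                  'singular_values': [2.0, 1.0],
...                  'noise_variance': 0.1}
>>> pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
>>> pca.components_ = np.array(load_pca_info['components'])
>>> pca.mean_ = np.array(load_pca_info['mean'])
>>> pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
>>> pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
>>> pca.singular_values_ = np.array(load_pca_info['singular_values'])
>>> pca.noise_variance_ = load_pca_info['noise_variance']
>>> pca.n_components_ = load_pca_info['n_components']
>>> pca.n_features_in_ = 3
>>> pca.transform(np.array([[1.0, 2.0, 3.0]]))
array([[1., 2.]])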
  def _validate_ranks(self, ranks):
1256
1504
  """
1257
1505
  DESCRIPTION:
@@ -1284,16 +1532,42 @@ class AutoML:
1284
1532
 
1285
1533
  return start_rank, end_rank
1286
1534
 
1535
+ def _display_target_column_mapping(self):
1536
+ """
1537
+ DESCRIPTION:
1538
+ Internal method to display target column mapping for classification problems.
1539
+ This method displays the mapping between original target column values and
1540
+ their encoded values.
1541
+
1542
+ RETURNS:
1543
+ None
1544
+ """
1545
+ if not self.cluster and self.is_classification_type() and self.target_label is not None:
1546
+ # Displaying target column labels
1547
+ tar_dct = {}
1548
+ print('\nTarget Column Mapping:')
1549
+ # Iterating rows
1550
+ for row in self.target_label.result.itertuples():
1551
+ # Retrieving the category names of encoded target column
1552
+ # row[1] contains the original name of the category
1553
+ # row[2] contains the encoded value
1554
+ if row[1] != 'TD_CATEGORY_COUNT':
1555
+ tar_dct[row[1]] = row[2]
1556
+
1557
+ for key, value in tar_dct.items():
1558
+ print(f"{key}: {value}")
1559
+
1287
1560
  @collect_queryband(queryband="AutoML_deploy")
1288
1561
  def deploy(self,
1289
1562
  table_name,
1290
- top_n = 3,
1291
- ranks = None
1563
+ top_n=3,
1564
+ ranks=None
1292
1565
  ):
1293
1566
  """
1294
1567
  DESCRIPTION:
1295
1568
  Function saves models to the specified table name.
1296
1569
  Note:
1570
+ * AutoCluster does not support deploy method, so it raises an exception.
1297
1571
  * If 'ranks' is provided, specified models in 'ranks' will be saved
1298
1572
  and ranks will be reassigned to specified models based
1299
1573
  on the order of the leaderboard, non-specified models will be ignored.
@@ -1327,8 +1601,9 @@ class AutoML:
1327
1601
  TeradataMlException.
1328
1602
 
1329
1603
  EXAMPLES:
1330
- # Create an instance of the AutoML called "obj"
1331
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1604
+ # Create an instance of the AutoML called "obj" by referring
1605
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
1606
+ # "AutoFraud()" or "AutoChurn()" method.
1332
1607
  >>> obj = AutoML(task_type="Classification")
1333
1608
  >>> obj.fit(data = data, target_column = target_column)
1334
1609
 
@@ -1344,6 +1619,11 @@ class AutoML:
1344
1619
  # Save models based on specified rank range to the specified table.
1345
1620
  >>> obj.deploy("model_table", ranks=range(2,6))
1346
1621
  """
1622
+ # Currently AutoCluster does not support deploy so raising the exception
1623
+ if self.cluster:
1624
+ raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
1625
+ MessageCodes.UNSUPPORTED_OPERATION)
1626
+
1347
1627
  # raise Error if fit is not called
1348
1628
  _Validators._validate_dependent_method("deploy", "fit", self._is_fit_called)
1349
1629
 
@@ -1390,13 +1670,14 @@ class AutoML:
1390
1670
  # Example: {'lasso': 'ml__survived_lasso_1717475362789542',
1391
1671
  # 'rfe': 'ml__survived_rfe_1717474570567062',
1392
1672
  # 'pca': 'ml__survived_pca_1717475375119752'}
1393
- fs_to_data_dict ={fs:self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs, \
1673
+ fs_to_data_dict = {fs:self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs, \
1394
1674
  'DATA_TABLE'].iloc[0] for fs in feature_selections}
1395
1675
 
1396
1676
  # Saving temporary training data to permanent table
1397
1677
  # We are replacing DATA_TABLE with permanent table name in model_info
1398
1678
  for key, val in fs_to_data_dict.items():
1399
- per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, key),
1679
+ prefix = 'cluster_{}'.format(key) if self.cluster else '{}_{}'.format(self.target_column, key)
1680
+ per_name = self._create_per_result_table(prefix=prefix,
1400
1681
  persist_result_table=val)
1401
1682
  fs_to_data_dict[key] = per_name
1402
1683
 
@@ -1407,10 +1688,22 @@ class AutoML:
1407
1688
  if ranks is None or len(ranks) == 0:
1408
1689
  # Saving only top 'top_n' models
1409
1690
  for index, row in self.model_info.iterrows():
1691
+ model_id = row['MODEL_ID']
1692
+ result_table = row['RESULT_TABLE']
1693
+
1694
+ if result_table is None:
1695
+ print(f" Skipping model {model_id} because RESULT_TABLE is None.")
1696
+ continue
1697
+
1698
+ if self.cluster:
1699
+ prefix = f"cluster_{model_id}"
1700
+ else:
1701
+ prefix = f"{self.target_column}_{model_id}"
1702
+
1410
1703
  if index < top_n:
1411
1704
  self.model_info.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
1412
1705
  if not persist:
1413
- per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
1706
+ per_name = self._create_per_result_table(prefix=prefix,
1414
1707
  persist_result_table=row['RESULT_TABLE'])
1415
1708
  self.model_info.loc[index, 'RESULT_TABLE'] = per_name
1416
1709
  else:
@@ -1430,7 +1723,8 @@ class AutoML:
1430
1723
  sv_models.loc[index, 'RANK'] = index + 1
1431
1724
  sv_models.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
1432
1725
  if not persist:
1433
- per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
1726
+ prefix = 'cluster_{}'.format(row['MODEL_ID']) if self.cluster else '{}_{}'.format(self.target_column, row['MODEL_ID'])
1727
+ per_name = self._create_per_result_table(prefix=prefix,
1434
1728
  persist_result_table=row['RESULT_TABLE'])
1435
1729
  sv_models.loc[index, 'RESULT_TABLE'] = per_name
1436
1730
 
@@ -1439,6 +1733,9 @@ class AutoML:
1439
1733
 
1440
1734
  # Saving data transformation parameters to the specified table
1441
1735
  sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
1736
+
1737
+ if "PARAMETERS" in sv_models.columns:
1738
+ sv_models["PARAMETERS"] = sv_models["PARAMETERS"].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
1442
1739
 
1443
1740
  copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
1444
1741
  'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})
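copy_to_sql() writes PARAMETERS into a VARCHAR column, so any dict entries are stringified first. A short sketch of the same normalization on a pandas frame (values are illustrative):
>>> import json
>>> import pandas as pd
>>> sv_models = pd.DataFrame({'MODEL_ID': ['xgb_1'],
...                           'PARAMETERS': [{'max_depth': 6, 'iternum': 100}]})
>>> sv_models['PARAMETERS'] = sv_models['PARAMETERS'].apply(
...     lambda x: json.dumps(x) if isinstance(x, dict) else x)
>>> sv_models['PARAMETERS'].iloc[0]
'{"max_depth": 6, "iternum": 100}'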
@@ -1477,7 +1774,6 @@ class AutoML:
1477
1774
  volatile=False)
1478
1775
  return table_name
1479
1776
 
1480
-
1481
1777
  def _deploy_data_transformation_params(self):
1482
1778
  """
1483
1779
  DESCRIPTION:
@@ -1538,7 +1834,7 @@ class AutoML:
1538
1834
  data_params[aml_step_name] = val._table_name
1539
1835
  else:
1540
1836
  per_name = self._create_per_result_table(prefix='{}'.format(aml_step_name),
1541
- persist_result_table= val._table_name)
1837
+ persist_result_table=val._table_name)
1542
1838
  data_params[aml_step_name] = per_name
1543
1839
  elif isinstance(val, dict) and 'fit_obj' in aml_step_name:
1544
1840
  for key, val in val.items():
@@ -1548,7 +1844,7 @@ class AutoML:
1548
1844
  data_params[aml_step_name][key] = val._table_name
1549
1845
  else:
1550
1846
  per_name = self._create_per_result_table(prefix='{}'.format(key),
1551
- persist_result_table= val._table_name)
1847
+ persist_result_table=val._table_name)
1552
1848
  data_params[aml_step_name][key] = per_name
1553
1849
  elif aml_step_name == 'pca_fit_instance':
1554
1850
  # Serializing PCA object
@@ -1629,14 +1925,22 @@ class AutoML:
1629
1925
  fs = self.loaded_models_info.loc[rank, 'FEATURE_SELECTION']
1630
1926
 
1631
1927
  # Checking task type
1928
+ if 'SILHOUETTE' in self.loaded_models_info.columns or self.cluster:
1929
+ task_type = 'Clustering'
1632
1930
  elif 'R2' in self.loaded_models_info.columns:
1633
- task_type='Regression'
1931
+ task_type = 'Regression'
1634
1932
  else:
1635
- task_type='Classification'
1933
+ task_type = 'Classification'
1636
1934
 
1637
1935
  # Model names mapping to Analytic Functions
1638
- func_map = {
1639
- 'XGBOOST': lambda params: XGBoost(**params),
1936
+ if self.cluster:
1937
+ func_map = {
1938
+ 'KMeans': lambda params: skl.KMeans(**params),
1939
+ 'GaussianMixture': lambda params: skl.GaussianMixture(**params)
1940
+ }
1941
+ else:
1942
+ func_map = {
1943
+ 'XGBOOST': lambda params: XGBoost(**params),
1640
1944
  'GLM': lambda params: GLM(**params),
1641
1945
  'SVM': lambda params: SVM(**params),
1642
1946
  'DECISIONFOREST': lambda params: DecisionForest(**params),
@@ -1651,32 +1955,37 @@ class AutoML:
1651
1955
  print(f"Feature Selection: {fs}")
1652
1956
 
1653
1957
  # Generating evaluation parameters
1654
- eval_params = _ModelTraining._eval_params_generation(model_name,
1655
- parameters['response_column'],
1656
- task_type)
1657
- if task_type == 'Classification':
1658
- eval_params['output_responses'] = parameters['output_responses']
1659
-
1660
- # Checking if response column is present in test data
1661
- if parameters['response_column'] not in test_data.columns:
1662
- # Checking if output type is evaluation
1663
- if output_type == 'evaluation':
1664
- # Response column is rqeuired for evaluation, raise error if not present
1665
- raise ValueError(f"Response column '{parameters['response_column']}' is not present in test data for evaluation.")
1666
- eval_params.pop('accumulate', None)
1667
- reponse_col_present = False
1958
+ if not self.cluster:
1959
+ eval_params = _ModelTraining._eval_params_generation(model_name,
1960
+ parameters['response_column'],
1961
+ task_type)
1962
+ if task_type == 'Classification':
1963
+ eval_params['output_responses'] = parameters['output_responses']
1964
+
1965
+ # Checking if response column is present in test data
1966
+ if parameters['response_column'] not in test_data.columns:
1967
+ # Checking if output type is evaluation
1968
+ if output_type == 'evaluation':
1969
+ # Response column is required for evaluation, raise error if not present
1970
+ raise ValueError(f"Response column '{parameters['response_column']}' is not present in test data for evaluation.")
1971
+ eval_params.pop('accumulate', None)
1972
+ reponse_col_present = False
1973
+ else:
1974
+ reponse_col_present = True
1668
1975
  else:
1669
- reponse_col_present = True
1976
+ eval_params = {}
1977
+ reponse_col_present = False
1670
1978
 
1671
1979
  # Checking if data is already transformed before or not
1672
1980
  data_node_id = test_data._nodeid
1673
1981
  if not self.table_name_mapping.get(data_node_id):
1674
1982
  # Data transformation will be performed on raw test data
1675
- self.transform_data(data=test_data,
1676
- data_params=self.loaded_data_transformation_params,
1677
- auto=self.loaded_data_transformation_params['auto_mode'],
1678
- verbose=0,
1679
- target_column_ind=reponse_col_present)
1983
+ self._transform_data(data=test_data,
1984
+ data_params=self.loaded_data_transformation_params,
1985
+ feature_selection_mtd=fs,
1986
+ auto=self.loaded_data_transformation_params['auto_mode'],
1987
+ verbose=0,
1988
+ target_column_ind=reponse_col_present)
1680
1989
 
1681
1990
  # Extracting test data
1682
1991
  for feature_selection, table_name in self.table_name_mapping[data_node_id].items():
@@ -1684,6 +1993,49 @@ class AutoML:
1684
1993
  test_data = DataFrame(table_name)
1685
1994
  break
1686
1995
 
1996
+ if self.cluster:
1997
+ # Only PCA is used in clustering
1998
+ X = test_data
1999
+
2000
+ if 'model-obj' in self.loaded_models_info.columns:
2001
+ model = self.loaded_models_info.loc[rank, 'model-obj']
2002
+ else:
2003
+ # Recreate model from parameters
2004
+ if model_name == "KMeans":
2005
+ model = skl.KMeans(**parameters)
2006
+ elif model_name == "GaussianMixture":
2007
+ model = skl.GaussianMixture(**parameters)
2008
+ else:
2009
+ raise ValueError(f"Unsupported clustering model: {model_name}")
2010
+ model.fit(X)
2011
+ result = model.predict(X)
2012
+
2013
+ if output_type != "prediction":
2014
+ silhouette = skl.silhouette_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
2015
+ calinski = skl.calinski_harabasz_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
2016
+ davies = skl.davies_bouldin_score(X=result.select(X.columns), labels=result.select(["gridsearchcv_predict_1"]))
2017
+ return {
2018
+ "SILHOUETTE": silhouette,
2019
+ "CALINSKI": calinski,
2020
+ "DAVIES": davies
2021
+ }
2022
+
2023
+ pred_cols = result.columns
2024
+ cluster_col = [col for col in pred_cols if "predict" in col.lower()][0]
2025
+
2026
+ result = result.assign(cluster_assignment=getattr(result, cluster_col))
2027
+ result = result.drop(columns=[cluster_col])
2028
+ prediction = result.select(["id", "cluster_assignment"])
2029
+
2030
+ # Visualization
2031
+
2032
+ if hasattr(self, "m_evaluator") and self.m_evaluator:
2033
+ self.m_evaluator.table_name_mapping = self.table_name_mapping
2034
+ self.m_evaluator.data_node_id = list(self.table_name_mapping.keys())[0]
2035
+
2036
+
2037
+ return prediction
2038
+
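For clustering models the evaluation branch returns internal cluster-quality scores instead of accuracy-style metrics. A minimal in-memory sketch of the three scores with scikit-learn (the skl wrappers used above expose the same call signatures; data is synthetic):
>>> import numpy as np
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
>>> X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]])
>>> labels = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(X)
>>> silhouette_score(X, labels) > 0.9
True
>>> calinski_harabasz_score(X, labels) > 1.0
True
>>> davies_bouldin_score(X, labels) < 1.0
True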
1687
2039
  if model_name == 'KNN':
1688
2040
  train_data = DataFrame(self.loaded_models_info.loc[rank, 'DATA_TABLE'])
1689
2041
 
@@ -1723,6 +2075,16 @@ class AutoML:
1723
2075
  if reponse_col_present and output_type != 'prediction':
1724
2076
  return metrics
1725
2077
 
2078
+ if not self.cluster and hasattr(self, "m_evaluator") and self.m_evaluator:
2079
+ permitted_models = ["XGBOOST", "DECISIONFOREST"]
2080
+ if model_name.upper() in permitted_models and output_type == 'prediction':
2081
+ print("\nApplying SHAP for Model Interpretation (Load)...")
2082
+ self.m_evaluator.table_name_mapping = self.table_name_mapping
2083
+ self.m_evaluator.data_node_id = list(self.table_name_mapping.keys())[0]
2084
+
2085
+ self.m_evaluator._apply_shap(rank, isload=True)
2086
+ else:
2087
+ print(f"\nShap is not applicable for {model_name}")
1726
2088
  # Return prediction, when output type is prediction
1727
2089
  return predictions if model_name == 'KNN' else predictions.result
1728
2090
 
@@ -1749,8 +2111,9 @@ class AutoML:
1749
2111
  TeradataMlException.
1750
2112
 
1751
2113
  EXAMPLES:
1752
- # Create an instance of the AutoML called "obj"
1753
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
2114
+ # Create an instance of the AutoML called "obj" by referring
2115
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
2116
+ # "AutoFraud()" or "AutoChurn()" method.
1754
2117
  >>> obj = AutoML()
1755
2118
  # Remove saved models from the specified table.
1756
2119
  >>> obj.remove_saved_models("model_table")
@@ -1812,8 +2175,9 @@ class AutoML:
1812
2175
  TeradataMlException.
1813
2176
 
1814
2177
  EXAMPLES:
1815
- # Create an instance of the AutoML called "obj"
1816
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
2178
+ # Create an instance of the AutoML called "obj" by referring
2179
+ # "AutoML()" or "AutoRegressor()" or "AutoClassifier()" or
2180
+ # "AutoFraud()" or "AutoChurn()" or "AutoCluster()" method.
1817
2181
  # 'persist' argument must be set to True in the AutoML object.
1818
2182
  >>> obj = AutoML(verbose=2, max_models=10, persist=True)
1819
2183
 
@@ -1940,8 +2304,8 @@ class AutoML:
1940
2304
  TeradataMlException.
1941
2305
 
1942
2306
  EXAMPLES:
1943
- # Import either of AutoML or AutoClassifier or AutoRegressor or Autodataprep
1944
- # from teradataml.
2307
+ # Import either of "AutoML" or "AutoClassifier" or "AutoRegressor" or
2308
+ # or "AutoFraud" or "AutoChurn" or "AutoDataPrep" from teradataml.
1945
2309
  >>> from teradataml import AutoML
1946
2310
  >>> from teradataml import DataFrame
1947
2311
  >>> load_example_data("teradataml", "titanic")
@@ -1969,10 +2333,10 @@ class AutoML:
1969
2333
  ... length = 20,
1970
2334
  ... breadth = 15)
1971
2335
  """
1972
- _FeatureExplore._visualize(**kwargs)
2336
+ _FeatureExplore._visualize(**kwargs)
1973
2337
 
1974
2338
  @staticmethod
1975
- def generate_custom_config(file_name = "custom"):
2339
+ def generate_custom_config(file_name="custom", cluster=False):
1976
2340
  """
1977
2341
  DESCRIPTION:
1978
2342
  Function generates custom JSON file containing user customized input under current
@@ -1985,12 +2349,20 @@ class AutoML:
1985
2349
  with extension. Extension '.json' is automatically added to specified file name.
1986
2350
  Default Value: "custom"
1987
2351
  Types: str
2352
+
2353
+ cluster:
2354
+ Optional Argument.
2355
+ Specifies whether to generate configuration for clustering tasks.
2356
+ When set to True, generates clustering-specific configuration options.
2357
+ Default Value: False
2358
+ Types: bool
1988
2359
 
1989
2360
  RETURNS:
1990
2361
  None
1991
2362
 
1992
2363
  EXAMPLES:
1993
- # Import either of AutoML or AutoClassifier or AutoRegressor from teradataml.
2364
+ # Import either of "AutoML" or "AutoClassifier" or "AutoRegressor" or
2365
+ # or "AutoFraud" or "AutoChurn" or "AutoCluster" from teradataml.
1994
2366
  # As per requirement, generate json file using generate_custom_config() method.
1995
2367
 
1996
2368
  # Generate a default file named "custom.json" file using either of below options.
@@ -1999,6 +2371,12 @@ class AutoML:
1999
2371
  >>> AutoClassifier.generate_custom_config()
2000
2372
  or
2001
2373
  >>> AutoRegressor.generate_custom_config()
2374
+ or
2375
+ >>> AutoFraud.generate_custom_config()
2376
+ or
2377
+ >>> AutoChurn.generate_custom_config()
2378
+ or
2379
+ >>> AutoCluster.generate_custom_config()
2002
2380
  # The above code will generate "custom.json" file under the current working directory.
2003
2381
 
2004
2382
  # Generate different file name using "file_name" argument.
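# A hedged illustration of the new "cluster" flag (file name is arbitrary):
>>> AutoML.generate_custom_config(file_name="custom_cluster", cluster=True)
# The above code will generate "custom_cluster.json" with clustering-specific
# configuration options under the current working directory.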
@@ -2011,7 +2389,7 @@ class AutoML:
2011
2389
 
2012
2390
  """
2013
2391
  # Initializing class
2014
- generator = _GenerateCustomJson()
2392
+ generator = _GenerateCustomJson(cluster=cluster)
2015
2393
  # Generating custom JSON data
2016
2394
  data = generator._generate_custom_json()
2017
2395
  # Converting to JSON
@@ -2022,13 +2400,13 @@ class AutoML:
2022
2400
  file.write(custom_json)
2023
2401
  print(f"\n'{json_file}' file is generated successfully under the current working directory.")
2024
2402
 
2025
-
2026
2403
  class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _ModelTraining):
2027
2404
 
2028
2405
  def __init__(self,
2029
2406
  data,
2030
2407
  target_column,
2031
- custom_data = None):
2408
+ custom_data=None,
2409
+ **kwargs):
2032
2410
  """
2033
2411
  DESCRIPTION:
2034
2412
  Function initializes the data, target column for Regression.
@@ -2052,16 +2430,17 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
2052
2430
  self.data = data
2053
2431
  self.target_column = target_column
2054
2432
  self.custom_data = custom_data
2055
-
2056
-
2433
+
2434
+ super().__init__(data=data, target_column=target_column, custom_data=custom_data, **kwargs)
2435
+
2057
2436
  def _regression(self,
2058
2437
  model_list=None,
2059
- auto = False,
2060
- verbose = 0,
2061
- max_runtime_secs = None,
2062
- stopping_metric = None,
2063
- stopping_tolerance = None,
2064
- max_models = None,
2438
+ auto=False,
2439
+ verbose=0,
2440
+ max_runtime_secs=None,
2441
+ stopping_metric=None,
2442
+ stopping_tolerance=None,
2443
+ max_models=None,
2065
2444
  **kwargs):
2066
2445
  """
2067
2446
  DESCRIPTION:
@@ -2121,8 +2500,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
2121
2500
  results are garbage collected at the end of the
2122
2501
  session.
2123
2502
  Default Value: False
2124
- Types: bool
2125
-
2503
+ Types: bool
2504
+
2126
2505
  seed:
2127
2506
  Optional Argument.
2128
2507
  Specifies the random seed for reproducibility.
@@ -2132,21 +2511,22 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
2132
2511
  RETURNS:
2133
2512
  a tuple containing, model information and leaderboard.
2134
2513
  """
2135
-
2136
2514
  # Feature Exploration Phase
2137
2515
  _FeatureExplore.__init__(self,
2138
- data = self.data,
2139
- target_column = self.target_column,
2140
- verbose=verbose)
2516
+ data=self.data,
2517
+ target_column=self.target_column,
2518
+ custom_data=self.custom_data,
2519
+ verbose=verbose,
2520
+ **kwargs)
2141
2521
  if verbose > 0:
2142
2522
  self._exploration(**kwargs)
2143
2523
  # Feature Engineering Phase
2144
2524
  _FeatureEngineering.__init__(self,
2145
- data = self.data,
2146
- target_column = self.target_column,
2147
- model_list = model_list,
2148
- verbose = verbose,
2149
- custom_data = self.custom_data,
2525
+ data=self.data,
2526
+ target_column=self.target_column,
2527
+ model_list=model_list,
2528
+ verbose=verbose,
2529
+ custom_data=self.custom_data,
2150
2530
  **kwargs)
2151
2531
  # Start time
2152
2532
  start_time = time.time()
@@ -2155,13 +2535,13 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
2155
2535
 
2156
2536
  # Data preparation Phase
2157
2537
  _DataPreparation.__init__(self,
2158
- data = self.data,
2159
- target_column = self.target_column,
2160
- verbose = verbose,
2161
- excluded_columns = excluded_columns,
2162
- custom_data = self.custom_data,
2163
- data_transform_dict = data_transformation_params,
2164
- data_mapping = data_mapping,
2538
+ data=self.data,
2539
+ target_column=self.target_column,
2540
+ verbose=verbose,
2541
+ excluded_columns=excluded_columns,
2542
+ custom_data=self.custom_data,
2543
+ data_transform_dict=data_transformation_params,
2544
+ data_mapping=data_mapping,
2165
2545
  **kwargs)
2166
2546
  features, data_transformation_params,\
2167
2547
  data_mapping = self.data_preparation(auto)
@@ -2185,19 +2565,19 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
2185
2565
 
2186
2566
  # Model Training
2187
2567
  _ModelTraining.__init__(self,
2188
- data = self.data,
2189
- target_column = self.target_column,
2190
- model_list = model_list,
2191
- verbose = verbose,
2192
- features = features,
2193
- task_type = "Regression",
2194
- custom_data = self.custom_data,
2568
+ data=self.data,
2569
+ target_column=self.target_column,
2570
+ model_list=model_list,
2571
+ verbose=verbose,
2572
+ features=features,
2573
+ task_type="Regression",
2574
+ custom_data=self.custom_data,
2195
2575
  **kwargs)
2196
- models_info, leaderboard, target_count = self.model_training(auto = auto,
2197
- max_runtime_secs = max_runtime_secs,
2198
- stopping_metric = stopping_metric,
2199
- stopping_tolerance = stopping_tolerance,
2200
- max_models = max_models)
2576
+ models_info, leaderboard, target_count = self.model_training(auto=auto,
2577
+ max_runtime_secs=max_runtime_secs,
2578
+ stopping_metric=stopping_metric,
2579
+ stopping_tolerance=stopping_tolerance,
2580
+ max_models=max_models)
2201
2581
 
2202
2582
  return (models_info, leaderboard,
2203
2583
  target_count, target_label,
@@ -2208,7 +2588,10 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2208
2588
  def __init__(self,
2209
2589
  data,
2210
2590
  target_column,
2211
- custom_data = None):
2591
+ custom_data=None,
2592
+ fraud=False,
2593
+ churn=False,
2594
+ **kwargs):
2212
2595
  """
2213
2596
  DESCRIPTION:
2214
2597
  Function initializes the data, target column for Classification.
@@ -2228,19 +2611,37 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2228
2611
  Optional Argument.
2229
2612
  Specifies json object containing user customized input.
2230
2613
  Types: json object
2614
+
2615
+ fraud:
2616
+ Optional Argument.
2617
+ Specifies whether to run fraud detection or not.
2618
+ Default Value: False
2619
+ Types: bool
2620
+
2621
+ churn:
2622
+ Optional Argument.
2623
+ Specifies whether to run churn prediction or not.
2624
+ Default Value: False
2625
+ Types: bool
2231
2626
  """
2232
2627
  self.data = data
2233
2628
  self.target_column = target_column
2234
2629
  self.custom_data = custom_data
2235
2630
 
2631
+ self.fraud = fraud
2632
+ self.churn = churn
2633
+
2634
+ super().__init__(data=data, target_column=target_column, custom_data=custom_data,
2635
+ fraud=fraud, churn=churn, **kwargs)
2636
+
2236
2637
  def _classification(self,
2237
2638
  model_list=None,
2238
- auto = False,
2239
- verbose = 0,
2240
- max_runtime_secs = None,
2241
- stopping_metric = None,
2242
- stopping_tolerance = None,
2243
- max_models = None,
2639
+ auto=False,
2640
+ verbose=0,
2641
+ max_runtime_secs=None,
2642
+ stopping_metric=None,
2643
+ stopping_tolerance=None,
2644
+ max_models=None,
2244
2645
  **kwargs):
2245
2646
  """
2246
2647
  DESCRIPTION:
@@ -2312,23 +2713,28 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2312
2713
  a tuple containing, model information and leaderboard.
2313
2714
  """
2314
2715
 
2315
-
2316
2716
  # Feature Exploration Phase
2317
2717
  _FeatureExplore.__init__(self,
2318
- data = self.data,
2319
- target_column = self.target_column,
2718
+ data=self.data,
2719
+ target_column=self.target_column,
2720
+ custom_data=self.custom_data,
2320
2721
  verbose=verbose,
2321
- task_type = "classification")
2722
+ task_type="classification",
2723
+ fraud=self.fraud,
2724
+ churn=self.churn,
2725
+ **kwargs)
2322
2726
  if verbose > 0:
2323
2727
  self._exploration(**kwargs)
2324
- # Feature Engineeting Phase
2728
+ # Feature Engineering Phase
2325
2729
  _FeatureEngineering.__init__(self,
2326
- data = self.data,
2327
- target_column = self.target_column,
2328
- model_list = model_list,
2329
- verbose = verbose,
2330
- task_type = "Classification",
2331
- custom_data = self.custom_data,
2730
+ data=self.data,
2731
+ target_column=self.target_column,
2732
+ model_list=model_list,
2733
+ verbose=verbose,
2734
+ task_type="Classification",
2735
+ custom_data=self.custom_data,
2736
+ fraud=self.fraud,
2737
+ churn=self.churn,
2332
2738
  **kwargs)
2333
2739
  # Start time
2334
2740
  start_time = time.time()
@@ -2337,16 +2743,18 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2337
2743
 
2338
2744
  # Data Preparation Phase
2339
2745
  _DataPreparation.__init__(self,
2340
- data = self.data,
2341
- target_column = self.target_column,
2342
- verbose = verbose,
2343
- excluded_columns = excluded_columns,
2344
- custom_data = self.custom_data,
2345
- data_transform_dict = data_transformation_params,
2346
- task_type = "Classification",
2347
- data_mapping = data_mapping,
2746
+ data=self.data,
2747
+ target_column=self.target_column,
2748
+ verbose=verbose,
2749
+ excluded_columns=excluded_columns,
2750
+ custom_data=self.custom_data,
2751
+ data_transform_dict=data_transformation_params,
2752
+ task_type="Classification",
2753
+ data_mapping=data_mapping,
2754
+ fraud=self.fraud,
2755
+ churn=self.churn,
2348
2756
  **kwargs)
2349
-
2757
+
2350
2758
  features, data_transformation_params, \
2351
2759
  data_mapping = self.data_preparation(auto)
2352
2760
 
@@ -2366,26 +2774,42 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2366
2774
  # Setting max_runtime_secs to 60 seconds if it is less than 0
2367
2775
  max_runtime_secs = 60 if max_runtime_secs is not None and \
2368
2776
  max_runtime_secs < 0 else max_runtime_secs
2369
-
2777
+
2370
2778
  # Model training
2371
2779
  _ModelTraining.__init__(self,
2372
- data = self.data,
2373
- target_column = self.target_column,
2374
- model_list = model_list,
2375
- verbose = verbose,
2376
- features = features,
2377
- task_type = "Classification",
2378
- custom_data = self.custom_data,
2780
+ data=self.data,
2781
+ target_column=self.target_column,
2782
+ model_list=self.model_list,
2783
+ verbose=verbose,
2784
+ features=features,
2785
+ task_type="Classification",
2786
+ custom_data=self.custom_data,
2787
+ fraud=self.fraud,
2788
+ churn=self.churn,
2379
2789
  **kwargs)
2380
- models_info, leaderboard, target_count = self.model_training(auto = auto,
2381
- max_runtime_secs = max_runtime_secs,
2382
- stopping_metric = stopping_metric,
2383
- stopping_tolerance = stopping_tolerance,
2384
- max_models = max_models)
2790
+ models_info, leaderboard, target_count = self.model_training(auto=auto,
2791
+ max_runtime_secs=max_runtime_secs,
2792
+ stopping_metric=stopping_metric,
2793
+ stopping_tolerance=stopping_tolerance,
2794
+ max_models=max_models)
2385
2795
 
2386
2796
  return (models_info, leaderboard,
2387
2797
  target_count, target_label,
2388
2798
  data_transformation_params, data_mapping)
2799
+
2800
+ def _target_column_details(self):
2801
+ """
2802
+ DESCRIPTION:
2803
+ Internal function displays the distribution of the target (response) column.
2804
+ """
2805
+ # If data visualization libraries are available
2806
+ if self._check_visualization_libraries() and not _is_terminal():
2807
+ self._display_msg(msg='\nTarget Column Distribution:',
2808
+ show_data=True)
2809
+ plt.figure(figsize=(6, 6))
2810
+ # Plotting the distribution of the target column
2811
+ sns.countplot(data=self.data.select([self.target_column]).to_pandas(), x=self.target_column)
2812
+ plt.show()
2389
2813
 
2390
2814
  def _check_data_imbalance(self,
2391
2815
  data=None):
@@ -2468,7 +2892,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2468
2892
  show_data=True)
2469
2893
 
2470
2894
  # Importing required libraries
2471
- from imblearn.over_sampling import SMOTE
2895
+ from imblearn.over_sampling import SMOTE, ADASYN
2896
+ from imblearn.combine import SMOTETomek
2472
2897
  from imblearn.under_sampling import NearMiss
2473
2898
 
2474
2899
  st = time.time()
@@ -2480,10 +2905,18 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2480
2905
  # Fetching the minimum target column label count and
2481
2906
  # accordingly setting the number of neighbors for the sampler
2482
2907
  min_label_count = min(data[self.target_column].value_counts())
2483
- if self._data_sampling_method == 'SMOTE':
2908
+ self._display_msg(msg=f"\nApplying {self._data_sampling_method}...",
2909
+ progress_bar=self.progress_bar,
2910
+ show_data=True)
2911
+ if self._data_sampling_method.lower() == 'smote':
2484
2912
  n_neighbors = min(5, min_label_count - 1)
2485
2913
  sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=42)
2486
- else:
2914
+ elif self._data_sampling_method.lower() == 'adasyn':
2915
+ n_neighbors = min(5, min_label_count - 1)
2916
+ sampling_method = ADASYN(n_neighbors=n_neighbors, random_state=42)
2917
+ elif self._data_sampling_method.lower() == 'smotetomek':
2918
+ sampling_method = SMOTETomek(random_state=42)
2919
+ elif self._data_sampling_method.lower() == 'nearmiss':
2487
2920
  n_neighbors = min(3, min_label_count)
2488
2921
  sampling_method = NearMiss(version=1, n_neighbors=n_neighbors)
2489
2922
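Which imbalanced-learn sampler runs is chosen by the (case-insensitive) "imbalance_handling_method" setting. A standalone sketch of the same dispatch on synthetic data, exercising the SMOTE branch (class counts are balanced exactly by the default sampling strategy):
>>> from imblearn.over_sampling import SMOTE, ADASYN
>>> from imblearn.combine import SMOTETomek
>>> from imblearn.under_sampling import NearMiss
>>> from sklearn.datasets import make_classification
>>> X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=42)
>>> samplers = {'smote': SMOTE(random_state=42),
...             'adasyn': ADASYN(random_state=42),
...             'smotetomek': SMOTETomek(random_state=42),
...             'nearmiss': NearMiss(version=1)}
>>> X_res, y_res = samplers['smote'].fit_resample(X, y)
>>> (y_res == 0).sum() == (y_res == 1).sum()
True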
 
@@ -2516,11 +2949,11 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
2516
2949
  class AutoRegressor(AutoML):
2517
2950
 
2518
2951
  def __init__(self,
2519
- include = None,
2520
- exclude = None,
2952
+ include=None,
2953
+ exclude=None,
2521
2954
  verbose=0,
2522
2955
  max_runtime_secs=None,
2523
- stopping_metric=None,
2956
+ stopping_metric=None,
2524
2957
  stopping_tolerance=None,
2525
2958
  max_models=None,
2526
2959
  custom_config_file=None,
@@ -2532,7 +2965,6 @@ class AutoRegressor(AutoML):
2532
2965
  Note:
2533
2966
  * configure.temp_object_type="VT" follows sequential execution.
2534
2967
 
2535
-
2536
2968
  PARAMETERS:
2537
2969
  include:
2538
2970
  Optional Argument.
@@ -2736,34 +3168,35 @@ class AutoRegressor(AutoML):
2736
3168
  >>> performance_metrics = automl_obj.evaluate(housing_test)
2737
3169
  >>> performance_metrics
2738
3170
  """
2739
- self.verbose = verbose
2740
- self.max_runtime_secs = max_runtime_secs
2741
- self.stopping_metric = stopping_metric
2742
- self.stopping_tolerance = stopping_tolerance
2743
- self.max_models = max_models
2744
- self.custom_config_file = custom_config_file
2745
- self.task_type = "Regression"
2746
- self.include = include
2747
- self.exclude = exclude
2748
-
2749
- super(AutoRegressor, self).__init__(task_type=self.task_type,
2750
- include = self.include,
2751
- exclude = self.exclude,
2752
- verbose=self.verbose,
2753
- max_runtime_secs=self.max_runtime_secs,
2754
- stopping_metric=self.stopping_metric,
2755
- stopping_tolerance=self.stopping_tolerance,
2756
- max_models=self.max_models,
2757
- custom_config_file=self.custom_config_file,
2758
- **kwargs)
3171
+
3172
+ # Validate unsupported 'task_type' argument
3173
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
3174
+
3175
+ # Validate unsupported 'is_churn' argument
3176
+ _Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
3177
+
3178
+ # Validate unsupported 'is_fraud' argument
3179
+ _Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
3180
+
3181
+ super(AutoRegressor, self).__init__(task_type="Regression",
3182
+ include=include,
3183
+ exclude=exclude,
3184
+ verbose=verbose,
3185
+ max_runtime_secs=max_runtime_secs,
3186
+ stopping_metric=stopping_metric,
3187
+ stopping_tolerance=stopping_tolerance,
3188
+ max_models=max_models,
3189
+ custom_config_file=custom_config_file,
3190
+ **kwargs)
3191
+
2759
3192
  class AutoClassifier(AutoML):
2760
3193
 
2761
3194
  def __init__(self,
2762
- include = None,
2763
- exclude = None,
3195
+ include=None,
3196
+ exclude=None,
2764
3197
  verbose=0,
2765
3198
  max_runtime_secs=None,
2766
- stopping_metric=None,
3199
+ stopping_metric=None,
2767
3200
  stopping_tolerance=None,
2768
3201
  max_models=None,
2769
3202
  custom_config_file=None,
@@ -2774,7 +3207,6 @@ class AutoClassifier(AutoML):
2774
3207
  AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
2775
3208
  Note:
2776
3209
  * configure.temp_object_type="VT" follows sequential execution.
2777
-
2778
3210
 
2779
3211
  PARAMETERS:
2780
3212
  include:
@@ -2867,6 +3299,13 @@ class AutoClassifier(AutoML):
2867
3299
  Specifies the random seed for reproducibility.
2868
3300
  Default Value: 42
2869
3301
  Types: int
3302
+
3303
+ imbalance_handling_method:
3304
+ Optional Argument.
3305
+ Specifies which data imbalance handling method to use.
3306
+ Default Value: SMOTE
3307
+ Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
3308
+ Types: str
2870
3309
 
2871
3310
  RETURNS:
2872
3311
  Instance of AutoClassifier.
@@ -3069,23 +3508,1489 @@ class AutoClassifier(AutoML):
3069
3508
  >>> performance_metrics = automl_obj.evaluate(iris_test, 3)
3070
3509
  >>> performance_metrics
3071
3510
  """
3072
- self.verbose = verbose
3073
- self.max_runtime_secs = max_runtime_secs
3074
- self.stopping_metric = stopping_metric
3075
- self.stopping_tolerance = stopping_tolerance
3076
- self.max_models = max_models
3077
- self.custom_config_file = custom_config_file
3078
- self.task_type = "Classification"
3079
- self.include = include
3080
- self.exclude = exclude
3081
-
3082
- super(AutoClassifier, self).__init__(task_type=self.task_type,
3083
- include = self.include,
3084
- exclude = self.exclude,
3085
- verbose=self.verbose,
3086
- max_runtime_secs=self.max_runtime_secs,
3087
- stopping_metric=self.stopping_metric,
3088
- stopping_tolerance=self.stopping_tolerance,
3089
- max_models=self.max_models,
3090
- custom_config_file=self.custom_config_file,
3511
+
3512
+ # Validate unsupported 'task_type' argument
3513
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
3514
+
3515
+ # Validate unsupported 'is_churn' argument
3516
+ _Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
3517
+
3518
+ # Validate unsupported 'is_fraud' argument
3519
+ _Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
3520
+
3521
+ super(AutoClassifier, self).__init__(task_type="Classification",
3522
+ include=include,
3523
+ exclude=exclude,
3524
+ verbose=verbose,
3525
+ max_runtime_secs=max_runtime_secs,
3526
+ stopping_metric=stopping_metric,
3527
+ stopping_tolerance=stopping_tolerance,
3528
+ max_models=max_models,
3529
+ custom_config_file=custom_config_file,
3091
3530
  **kwargs)
3531
+
3532
+ class _AutoSpecific(_Classification):
3533
+
3534
+ def __init__(self,
3535
+ data,
3536
+ target_column,
3537
+ custom_data,
3538
+ fraud=False,
3539
+ churn=False,
3540
+ **kwargs):
3541
+ """
3542
+
3543
+ DESCRIPTION:
3544
+ Function initializes the data and target column for AutoFraud or AutoChurn.
3545
+
3546
+ PARAMETERS:
3547
+ data:
3548
+ Required Argument.
3549
+ Specifies the input teradataml Dataframe.
3550
+ Types: teradataml Dataframe
3551
+
3552
+ target_column:
3553
+ Required Argument.
3554
+ Specifies the name of the target column in "data".
3555
+ Types: str
3556
+
3557
+ custom_data:
3558
+ Optional Argument.
3559
+ Specifies json object containing user customized input.
3560
+ Types: json object
3561
+
3562
+ fraud:
3563
+ Optional Argument.
3564
+ Specifies whether to run AutoFraud or not.
3565
+ Default Value: False
3566
+ Types: bool
3567
+
3568
+ churn:
3569
+ Optional Argument.
3570
+ Specifies whether to run AutoChurn or not.
3571
+ Default Value: False
3572
+ Types: bool
3573
+
3574
+ **kwargs:
3575
+ Specifies the additional arguments for AutoChurn or AutoFraud. Below
3576
+ are the additional arguments:
3577
+ volatile:
3578
+ Optional Argument.
3579
+ Specifies whether to put the interim results of the
3580
+ functions in a volatile table or not. When set to
3581
+ True, results are stored in a volatile table,
3582
+ otherwise not.
3583
+ Default Value: False
3584
+ Types: bool
3585
+
3586
+ persist:
3587
+ Optional Argument.
3588
+ Specifies whether to persist the interim results of the
3589
+ functions in a table or not. When set to True,
3590
+ results are persisted in a table; otherwise,
3591
+ results are garbage collected at the end of the
3592
+ session.
3593
+ Note:
3594
+ * User is responsible for cleanup of the persisted tables. List of persisted tables
3595
+ in current session can be viewed using get_persisted_tables() method.
3596
+ Default Value: False
3597
+ Types: bool
3598
+
3599
+ seed:
3600
+ Optional Argument.
3601
+ Specifies the random seed for reproducibility.
3602
+ Default Value: 42
3603
+ Types: int
3604
+ """
3605
+ self.fraud = fraud
3606
+ self.churn = churn
3607
+
3608
+ self.volatile = kwargs.get("volatile", False)
3609
+ self.persist = kwargs.get("persist", False)
3610
+
3611
+ super().__init__(data, target_column, custom_data, fraud=fraud, churn=churn, **kwargs)
3612
+
3613
+ def fit(self, **kwargs):
3614
+ """
3615
+ DESCRIPTION:
3616
+ Function triggers the AutoFraud or AutoChurn run.
3617
+ PARAMETERS:
3618
+ **kwargs:
3619
+ Specifies the additional arguments for AutoChurn or AutoFraud. Below
3620
+ are the additional arguments:
3621
+ volatile:
3622
+ Optional Argument.
3623
+ Specifies whether to put the interim results of the
3624
+ functions in a volatile table or not. When set to
3625
+ True, results are stored in a volatile table,
3626
+ otherwise not.
3627
+ Default Value: False
3628
+ Types: bool
3629
+
3630
+ persist:
3631
+ Optional Argument.
3632
+ Specifies whether to persist the interim results of the
3633
+ functions in a table or not. When set to True,
3634
+ results are persisted in a table; otherwise,
3635
+ results are garbage collected at the end of the
3636
+ session.
3637
+ Note:
3638
+ * User is responsible for cleanup of the persisted tables. List of persisted tables
3639
+ in current session can be viewed using get_persisted_tables() method.
3640
+ Default Value: False
3641
+ Types: bool
3642
+
3643
+ seed:
3644
+ Optional Argument.
3645
+ Specifies the random seed for reproducibility.
3646
+ Default Value: 42
3647
+ Types: int
3648
+ """
3649
+ self.model_info, self.leader_board, self.target_count, self.target_label, \
3650
+ self.data_transformation_params, self.table_name_mapping = super()._classification(**kwargs)
3651
+ self.m_evaluator = _ModelEvaluator(self.model_info,
3652
+ self.target_column,
3653
+ self.task_type)
3654
+ return (self.model_info, self.leader_board, self.target_count, self.target_label, \
3655
+ self.data_transformation_params, self.table_name_mapping)
3656
+
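At the user level, AutoFraud and AutoChurn follow the same fit/leaderboard/deploy flow as the other Auto* classes; only the internal preprocessing overrides below differ. A hedged usage sketch, assuming AutoFraud accepts the same constructor arguments as AutoClassifier and that "txn_data" is a teradataml DataFrame with a binary "is_fraud" column:
>>> from teradataml import AutoFraud
>>> fraud_obj = AutoFraud(verbose=1, max_models=5)
>>> fraud_obj.fit(txn_data, "is_fraud")
>>> fraud_obj.leaderboard()
>>> fraud_obj.deploy("fraud_models", top_n=2)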
3657
+ def _handling_missing_value(self):
3658
+ """
3659
+ DESCRIPTION:
3660
+ Override function for handling missing values in the dataset specifically for fraud detection.
3661
+ This function ensures that rows are flagged for imputation instead of being dropped while retaining
3662
+ the column-dropping behavior for columns with excessive missing values.
3663
+ """
3664
+ fn_name = "AutoFraud " if self.fraud else ("AutoChurn " if self.churn else "")
3665
+
3666
+ self._display_msg(msg=f"\nChecking Missing values in dataset using {fn_name}function...",
3667
+ progress_bar=self.progress_bar,
3668
+ show_data=True)
3669
+ start_time = time.time()
3670
+
3671
+ # Number of rows
3672
+ d_size = self.data.shape[0]
3673
+
3674
+ drop_cols = []
3675
+ self.imputation_cols = {}
3676
+
3677
+ # Get count of missing values per column
3678
+ cols_miss_val = self._missing_count_per_column()
3679
+
3680
+ if len(cols_miss_val) != 0:
3681
+ self._display_msg(msg="Columns with their missing values:",
3682
+ col_lst=cols_miss_val,
3683
+ progress_bar=self.progress_bar)
3684
+
3685
+ # Get distinct value in each column
3686
+ self._get_distinct_count()
3687
+
3688
+ # Iterating over columns with missing values
3689
+ for col, val in cols_miss_val.items():
3690
+
3691
+ # Drop column if >60% values are missing
3692
+ if val > 0.6 * d_size:
3693
+ drop_cols.append(col)
3694
+ continue
3695
+
3696
+ # For numerical columns
3697
+ if self.data_types[col] in ['float', 'int']:
3698
+ corr_df = self.data[col].corr(self.data[self.target_column])
3699
+ corr_val = self.data.assign(True, corr_=corr_df)
3700
+ related = next(corr_val.itertuples())[0]
3701
+
3702
+ # Flag column for imputation instead of row deletion
3703
+ if val < 0.02 * d_size and related <= 0.25:
3704
+ self.imputation_cols[col] = val
3705
+ continue
3706
+
3707
+ # For categorical columns
3708
+ elif self.data_types[col] in ['str']:
3709
+ # Flag column for imputation instead of row deletion
3710
+ if val < 0.04 * d_size:
3711
+ self.imputation_cols[col] = val
3712
+ continue
3713
+ # Drop column if unique count >75%
3714
+ elif self.counts_dict[f'count_{col}'] > 0.75 * (d_size - val):
3715
+ drop_cols.append(col)
3716
+ continue
3717
+
3718
+ # Default: Flag column for imputation
3719
+ self.imputation_cols[col] = val
3720
+
3721
+ # Drop columns
3722
+ if len(drop_cols) != 0:
3723
+ self.data = self.data.drop(drop_cols, axis=1)
3724
+ # Store dropped columns in the data transform dictionary
3725
+ self.data_transform_dict['drop_missing_columns'] = drop_cols
3726
+ self._display_msg(msg='Dropping these columns for handling missing values:',
3727
+ col_lst=drop_cols,
3728
+ progress_bar=self.progress_bar)
3729
+ self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
3730
+ data=self.data,
3731
+ progress_bar=self.progress_bar)
3732
+
3733
+ # Display imputation details
3734
+ if len(self.imputation_cols) != 0:
3735
+ # Store imputation columns in the data transform dictionary
3736
+ self.data_transform_dict['imputation_columns'] = self.imputation_cols
3737
+ self._display_msg(msg="Flagging these columns for imputation:",
3738
+ col_lst=list(self.imputation_cols.keys()),
3739
+ progress_bar=self.progress_bar)
3740
+
3741
+ # If no missing values are detected
3742
+ if len(self.imputation_cols) == 0 and len(drop_cols) == 0:
3743
+ self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
3744
+ progress_bar=self.progress_bar)
3745
+
3746
+ end_time = time.time()
3747
+ self._display_msg(msg=f"Total time to find missing values in data using {fn_name}: {{:.2f}} sec ".format(end_time - start_time),
3748
+ progress_bar=self.progress_bar,
3749
+ show_data=True)
3750
+
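The override keeps the drop rule for very sparse columns (more than 60% missing) but converts the row-deletion cases into imputation flags. A compact sketch of the thresholding on plain dictionaries; the numeric-correlation check used above is omitted here and the counts are illustrative:
>>> d_size = 1000
>>> cols_miss_val = {'txn_amount': 15, 'merchant_code': 30, 'free_text_note': 700}
>>> drop_cols, imputation_cols = [], {}
>>> for col, val in cols_miss_val.items():
...     if val > 0.6 * d_size:
...         drop_cols.append(col)        # too sparse to keep
...     else:
...         imputation_cols[col] = val   # flag for imputation instead of dropping rows
>>> drop_cols, sorted(imputation_cols)
(['free_text_note'], ['merchant_code', 'txn_amount'])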
3751
+ def _impute_missing_value(self):
3752
+ """
3753
+ DESCRIPTION:
3754
+ Override Function performs the imputation on columns/features with missing values in the dataset
3755
+ using Partition column argument in SimpleImputeFit.
3756
+ """
3757
+
3758
+ start_time = time.time()
3759
+ self._display_msg(msg="\nImputing Missing Values using SimpleImputeFit partition column...",
3760
+ progress_bar=self.progress_bar,
3761
+ show_data=True)
3762
+
3763
+ if len(self.imputation_cols) != 0:
3764
+
3765
+ # List of columns and imputation Method
3766
+ col_stat, stat = self._impute_helper()
3767
+ ## Workaround done for bug https://teradata-pe.atlassian.net/browse/TDAF-15617.
3768
+ ## Temporarily commenting out partition_column arguments.
3769
+ fit_obj = SimpleImputeFit(data=self.data,
3770
+ stats_columns=col_stat,
3771
+ #partition_column=self.target_column,
3772
+ stats=stat,
3773
+ volatile=self.volatile,
3774
+ persist=self.persist)
3775
+
3776
+
3777
+ # Storing fit object for imputation in data transform dictionary
3778
+ self.data_transform_dict['imputation_fit_object'] = fit_obj.output
3779
+ #self.data_transform_dict['imputation_partition_column'] = self.target_column
3780
+ sm = SimpleImputeTransform(data=self.data,
3781
+ object=fit_obj.output,
3782
+ #data_partition_column = self.target_column,
3783
+ #object_partition_column = self.target_column,
3784
+ volatile=self.volatile,
3785
+ persist=self.persist)
3786
+
3787
+ self.data = sm.result
3788
+ self._display_msg(msg="Sample of dataset after Imputation:",
3789
+ data=self.data,
3790
+ progress_bar=self.progress_bar)
3791
+ else:
3792
+ self._display_msg(inline_msg="Analysis completed. No imputation required.",
3793
+ progress_bar=self.progress_bar)
3794
+
3795
+ end_time = time.time()
3796
+ self._display_msg(msg="Time taken to perform imputation: {:.2f} sec ".format(end_time - start_time),
3797
+ progress_bar=self.progress_bar,
3798
+ show_data=True)
3799
+
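Imputation itself is pushed into the database through the SimpleImputeFit/SimpleImputeTransform pair. A hedged standalone sketch, assuming "df" is a teradataml DataFrame with a numeric "age" and a categorical "city" column (the partition-column arguments stay commented out above because of the noted workaround):
>>> from teradataml import SimpleImputeFit, SimpleImputeTransform
>>> fit_obj = SimpleImputeFit(data=df,
...                           stats_columns=["age", "city"],
...                           stats=["MEDIAN", "MODE"])
>>> imputed = SimpleImputeTransform(data=df, object=fit_obj.output).result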
3800
+ def _outlier_detection(self,
3801
+ column_list,
3802
+ outlier_method="percentile",
3803
+ lower_percentile=0.01,
3804
+ upper_percentile=0.99):
3805
+ """
3806
+ DESCRIPTION:
3807
+ Function detects outliers in numerical columns and displays their percentage.
3808
+
3809
+ PARAMETERS:
3810
+ column_list:
3811
+ Required Argument.
3812
+ Specifies the numeric columns for outlier percentage calculation.
3813
+ Types: str or list of strings (str)
3814
+
3815
+ outlier_method:
3816
+ Required Argument.
3817
+ Specifies the method used for outlier detection.
3818
+ Types: str
3819
+ Default Value: "percentile"
3820
+ Permitted Values: "percentile", "tukey", "carling"
3821
+
3822
+ lower_percentile:
3823
+ Optional Argument.
3824
+ Specifies the lower percentile value for outlier detection in case of percentile method.
3825
+ Types: float
3826
+
3827
+ upper_percentile:
3828
+ Optional Argument.
3829
+ Specifies the upper percentile value for outlier detection in case of percentile method.
3830
+ Types: float
3831
+
3832
+ RETURNS:
3833
+ teradataml DataFrame containing the column names with their outlier percentages.
3834
+
3835
+ """
3836
+ # Performing outlier fit on the data for replacing outliers with NULL value
3837
+ fit_params = {
3838
+ "data" : self.data,
3839
+ "target_columns" : column_list,
3840
+ "outlier_method" : "percentile",
3841
+ "lower_percentile" : lower_percentile,
3842
+ "upper_percentile" : upper_percentile,
3843
+ "replacement_value" : 'NULL'
3844
+ }
3845
+ OutlierFilterFit_out = OutlierFilterFit(**fit_params)
3846
+ transform_params = {
3847
+ "data" : self.data,
3848
+ "object" : OutlierFilterFit_out.result
3849
+ }
3850
+ # Performing outlier transformation on each column
3851
+ OutlierTransform_obj = OutlierFilterTransform(**transform_params)
3852
+
3853
+ # Column summary of each column of the data
3854
+ fit_params = {
3855
+ "data" : OutlierTransform_obj.result,
3856
+ "target_columns" : column_list
3857
+ }
3858
+ colSummary = ColumnSummary(**fit_params)
3859
+
3860
+ null_count_expr = colSummary.result.NullCount
3861
+ non_null_count_expr = colSummary.result.NonNullCount
3862
+
3863
+ # Calculating outlier percentage
3864
+ df = colSummary.result.assign(True,
3865
+ ColumnName = colSummary.result.ColumnName,
3866
+ OutlierPercentage = (null_count_expr/(non_null_count_expr+null_count_expr))*100)
3867
+
3868
+ # Displaying non-zero containing outlier percentage for columns
3869
+ df = df[df['OutlierPercentage']>0]
3870
+ if self.verbose > 0:
3871
+ print(" "*500, end='\r')
3872
+ if df.shape[0] > 0:
3873
+ self._display_msg(msg='Columns with outlier percentage :-',
3874
+ show_data=True)
3875
+ print(df)
3876
+ else:
3877
+ print("\nNo outlier found!")
3878
+
3879
+ return df
3880
+
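The outlier share is measured indirectly: outliers are first replaced with NULL by an OutlierFilterFit/OutlierFilterTransform pass, and ColumnSummary's null counts then give the percentage per column. A hedged sketch of the same pipeline, assuming "df" is a teradataml DataFrame with a numeric "amount" column:
>>> from teradataml import OutlierFilterFit, OutlierFilterTransform, ColumnSummary
>>> fit_out = OutlierFilterFit(data=df, target_columns=["amount"],
...                            outlier_method="percentile",
...                            lower_percentile=0.01, upper_percentile=0.99,
...                            replacement_value="NULL")
>>> flagged = OutlierFilterTransform(data=df, object=fit_out.result).result
>>> summary = ColumnSummary(data=flagged, target_columns=["amount"]).result
>>> pct = summary.assign(OutlierPercentage=(summary.NullCount /
...                      (summary.NullCount + summary.NonNullCount)) * 100)
>>> pct.select(["ColumnName", "OutlierPercentage"])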
3881
+ def _outlier_handling_techniques(self):
3882
+ """
3883
+ DESCRIPTION:
3884
+ Override function to determine outlier handling techniques in AutoFraud.
3885
+ Ensures no rows are removed; all outlier-affected columns are flagged for imputation.
3886
+ """
3887
+ columns_to_impute = []
3888
+
3889
+ # List of columns for outlier processing
3890
+ outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
3891
+
3892
+ # Detecting outlier percentage in each column using the percentile method
3893
+ outlier_percentage_df = self._outlier_detection(outlier_columns)
3894
+
3895
+ # Flag all columns for imputation (no row deletion in AutoFraud)
3896
+ for i in outlier_percentage_df.itertuples():
3897
+ col = i[0] # Column name
3898
+ columns_to_impute.append(col)
3899
+
3900
+ return [], columns_to_impute # No columns will be marked for row deletion
3901
+
3902
+ def _outlier_handling(self, target_columns, outlier_method, replacement_value="MEDIAN"):
3903
+ """
3904
+ DESCRIPTION:
3905
+ Override function to handle outliers in AutoFraud.
3906
+ Ensures no rows are removed while handling outliers by imputing them instead.
3907
+ """
3908
+ # Enforce imputation strategy instead of row deletion
3909
+ replacement_value = "MEDIAN" if replacement_value == "DELETE" else replacement_value
3910
+
3911
+ # Setting volatile and persist parameters for Outlier handling function
3912
+ volatile, persist = self._get_generic_parameters(func_indicator='OutlierFilterIndicator',
3913
+ param_name='OutlierFilterParam')
3914
+
3915
+ # Performing fit on dataset for outlier handling
3916
+ fit_params = {
3917
+ "data": self.data,
3918
+ "target_columns": target_columns,
3919
+ "outlier_method": outlier_method,
3920
+ "replacement_value": replacement_value,
3921
+ "volatile": volatile,
3922
+ "persist": persist
3923
+ }
3924
+ outlier_fit_out = OutlierFilterFit(**fit_params)
3925
+
3926
+ # Performing transform on dataset for outlier handling
3927
+ transform_params = {
3928
+ "data": self.data,
3929
+ "object": outlier_fit_out.result,
3930
+ "persist": True
3931
+ }
3932
+
3933
+ # Disabling print if persist is True by default
3934
+ if not volatile and not persist:
3935
+ transform_params["display_table_name"] = False
3936
+
3937
+ if volatile:
3938
+ transform_params["volatile"] = True
3939
+ transform_params["persist"] = False
3940
+
3941
+ self.data = OutlierFilterTransform(**transform_params).result
3942
+
3943
+ if not volatile and not persist:
3944
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
3945
+
3946
+ # Returning outlier fit object to store in data mapping dictionary
3947
+ return outlier_fit_out
3948
+
3949
+ def _outlier_processing(self):
3950
+ """
3951
+ DESCRIPTION:
3952
+ Override function to perform outlier processing in AutoFraud.
3953
+ Ensures no rows are removed while handling outliers.
3954
+ Instead, affected columns are flagged for imputation.
3955
+ """
3956
+
3957
+ fn_name = "AutoFraud " if self.fraud else ("AutoChurn " if self.churn else "")
3958
+
3959
+ self._display_msg(msg=f"\n{fn_name}Outlier preprocessing using Percentile...",
3960
+ progress_bar=self.progress_bar,
3961
+ show_data=True)
3962
+ start_time = time.time()
3963
+
3964
+ # List of columns for imputation (No row deletion in AutoFraud)
3965
+ _, columns_to_impute = self._outlier_handling_techniques()
3966
+
3967
+ # Keeping default method for outlier handling as "Percentile"
3968
+ outlier_handling_method = "Percentile"
3969
+
3970
+ # Imputing Median value in place of outliers (No deletion)
3971
+ if len(columns_to_impute) != 0:
3972
+ self._display_msg(msg="Replacing outliers with median:",
3973
+ col_lst=columns_to_impute,
3974
+ progress_bar=self.progress_bar)
3975
+ target_columns = columns_to_impute
3976
+ replacement_strategy = "MEDIAN"
3977
+ fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
3978
+ self._display_msg(msg="Sample of dataset after replacing outliers with MEDIAN:",
3979
+ data=self.data,
3980
+ progress_bar=self.progress_bar)
3981
+
3982
+ if len(columns_to_impute) == 0:
3983
+ self._display_msg(msg='Analysis indicates no significant outliers in the dataset. No Action Taken.',
3984
+ progress_bar=self.progress_bar)
3985
+
3986
+ end_time = time.time()
3987
+ self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),
3988
+ progress_bar=self.progress_bar,
3989
+ show_data=True)
3990
+
3991
+ def _encoding_categorical_columns(self):
+ """
+ DESCRIPTION:
+ Override function that detects categorical columns and performs target encoding,
+ instead of one-hot encoding, on categorical columns in AutoFraud.
+ """
+ self._display_msg(msg="\nPerforming target encoding for categorical columns ...",
+ progress_bar=self.progress_bar,
+ show_data=True)
+ start_time = time.time()
+
+ target_encoding_list = {}
+
+ # List of columns before target encoding
+ col_bf_encoding = self.data.columns
+
+ # Get distinct value count in each column
+ self._get_distinct_count()
+
+ # Detect categorical columns and prepare for target encoding
+ for col, d_type in self.data._column_names_and_types:
+ if d_type in ['str']:
+ target_encoding_list[col] = {"encoder_method": "CBM_BETA",
+ "response_column": self.target_column}
+
+ if len(target_encoding_list) == 0:
+ self._display_msg(inline_msg="Analysis completed without target encoding. No categorical columns were found.",
+ progress_bar=self.progress_bar)
+ return
+
+ self._auto_target_encoding(target_encoding_list)
+
+ self._display_msg(msg="Target Encoding these Columns:",
+ col_lst=list(target_encoding_list.keys()),
+ progress_bar=self.progress_bar)
+ self._display_msg(msg="Sample of dataset after performing target encoding:",
+ data=self.data,
+ progress_bar=self.progress_bar)
+
+ # List of columns after target encoding
+ col_af_encoding = self.data.columns
+
+ # List of excluded columns from outlier processing and scaling
+ self.excluded_cols = self._extract_list(col_af_encoding, col_bf_encoding)
+
+ end_time = time.time()
+ self._display_msg(msg="Time taken to encode the columns: {:.2f} sec".format(end_time - start_time),
+ progress_bar=self.progress_bar,
+ show_data=True)
+
+ def _auto_target_encoding(self, target_encoding_list):
+ """
+ DESCRIPTION:
+ Function performs target encoding on categorical columns for AutoFraud.
+ This function is separate from the custom target encoding method.
+
+ PARAMETERS:
+ target_encoding_list:
+ Required Argument.
+ Dictionary specifying the categorical columns for which target encoding will be performed.
+ Each key is a column name, and values contain encoding parameters.
+ """
+
+ # Fetching all columns on which target encoding will be performed
+ target_columns = list(target_encoding_list.keys())
+
+ # Checking for column presence in dataset
+ _Validators._validate_dataframe_has_argument_columns(target_columns, "TargetEncodingList", self.data, "df")
+
+ # Finding distinct values and counts for columns
+ cat_sum = CategoricalSummary(data=self.data, target_columns=target_columns)
+ category_data = cat_sum.result.groupby("ColumnName").count()
+ category_data = category_data.assign(drop_columns=True,
+ ColumnName=category_data.ColumnName,
+ CategoryCount=category_data.count_DistinctValue)
+
+ # Storing encoding metadata
+ self.data_transform_dict["auto_target_encoding_ind"] = True
+
+ # Setting volatile and persist parameters for performing encoding
+ volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
+ param_name="CategoricalEncodingParam")
+
+ # Perform target encoding for each categorical column
+ fit_params = {
+ "data": self.data,
+ "category_data": category_data,
+ "encoder_method": "CBM_BETA",
+ "target_columns": target_columns,
+ "response_column": self.target_column,
+ "volatile": volatile,
+ "persist": persist,
+ "default_values": [-1]
+ }
+
+ # Perform target encoding
+ tar_fit_obj = TargetEncodingFit(**fit_params)
+ self.data_transform_dict["auto_target_encoding_fit_obj"] = tar_fit_obj.result
+
+ # Extracting accumulate columns
+ accumulate_columns = self._extract_list(self.data.columns, target_columns)
+ self.data_transform_dict["target_encoding_accumulate_columns"] = accumulate_columns
+
+ # Apply the transformation
+ transform_params = {
+ "data": self.data,
+ "object": tar_fit_obj,
+ "accumulate": accumulate_columns,
+ "persist": True
+ }
+
+ # Disabling display table name if persist is True by default
+ if not volatile and not persist:
+ transform_params["display_table_name"] = False
+
+ if volatile:
+ transform_params["volatile"] = True
+ transform_params["persist"] = False
+
+ self.data = TargetEncodingTransform(**transform_params).result
+
+ if not volatile and not persist:
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
+
+ self._display_msg(msg="Target Encoding completed for categorical columns using CBM_BETA.",
+ progress_bar=self.progress_bar)
+
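To make the encoding flow above easier to follow: CategoricalSummary produces the per-column distinct-value counts that TargetEncodingFit expects as category_data, the fit uses the CBM_BETA encoder against the response column, and TargetEncodingTransform applies the encoding while accumulating the untouched columns. Below is a minimal sketch that mirrors those calls; the table name, column names, and response column are hypothetical placeholders, not part of this release.

    from teradataml import (DataFrame, CategoricalSummary,
                            TargetEncodingFit, TargetEncodingTransform)

    # Hypothetical input with one categorical column and a binary response.
    df = DataFrame.from_table("demo_transactions")

    # Distinct-value counts per categorical column, shaped the way
    # TargetEncodingFit expects its category_data input.
    cat_sum = CategoricalSummary(data=df, target_columns=["merchant_type"])
    category_data = cat_sum.result.groupby("ColumnName").count()
    category_data = category_data.assign(drop_columns=True,
                                         ColumnName=category_data.ColumnName,
                                         CategoryCount=category_data.count_DistinctValue)

    # Fit CBM_BETA target encoding against the response column, then apply it
    # while carrying the remaining columns through unchanged.
    fit = TargetEncodingFit(data=df,
                            category_data=category_data,
                            encoder_method="CBM_BETA",
                            target_columns=["merchant_type"],
                            response_column="is_fraud",
                            default_values=[-1])
    encoded = TargetEncodingTransform(data=df,
                                      object=fit,
                                      accumulate=["txn_id", "amount"]).result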
+ class AutoFraud(AutoML):
+
+ def __init__(self,
+ include=None,
+ exclude=None,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
+ custom_config_file=None,
+ **kwargs
+ ):
+
+ """
+ DESCRIPTION:
+ AutoFraud is a dedicated AutoML pipeline designed specifically for fraud detection
+ tasks. It automates the process of building, training, and evaluating models
+ tailored to identify fraudulent activities, streamlining the workflow for
+ fraud detection use cases.
+
+ PARAMETERS:
+ include:
+ Optional Argument.
+ Specifies the model algorithms to be used for model training phase.
+ By default, all 5 models are used for training for this specific binary
+ classification problem.
+ Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
+ Types: str OR list of str
+
+ exclude:
+ Optional Argument.
+ Specifies the model algorithms to be excluded from model training phase.
+ No model is excluded by default.
+ Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
+ Types: str OR list of str
+
+ verbose:
+ Optional Argument.
+ Specifies the detailed execution steps based on verbose level.
+ Default Value: 0
+ Permitted Values:
+ * 0: prints the progress bar and leaderboard.
+ * 1: prints the execution steps of AutoML.
+ * 2: prints the intermediate data between the execution of each step of AutoML.
+ Types: int
+
+ max_runtime_secs:
+ Optional Argument.
+ Specifies the time limit in seconds for model training.
+ Types: int
+
+ stopping_metric:
+ Required, when "stopping_tolerance" is set, otherwise optional.
+ Specifies the stopping metric for stopping tolerance in model training.
+ Permitted Values: 'MICRO-F1','MACRO-F1','MICRO-RECALL','MACRO-RECALL',
+ 'MICRO-PRECISION', 'MACRO-PRECISION','WEIGHTED-PRECISION',
+ 'WEIGHTED-RECALL', 'WEIGHTED-F1', 'ACCURACY'
+ Types: str
+
+ stopping_tolerance:
+ Required, when "stopping_metric" is set, otherwise optional.
+ Specifies the stopping tolerance for stopping metric in model training.
+ Types: float
+
+ max_models:
+ Optional Argument.
+ Specifies the maximum number of models to be trained.
+ Types: int
+
+ custom_config_file:
+ Optional Argument.
+ Specifies the path of JSON file in case of custom run.
+ Types: str
+
+ **kwargs:
+ Specifies the additional arguments for AutoFraud. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the interim results of the
+ functions in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the interim results of the
+ functions in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
+
+ seed:
+ Optional Argument.
+ Specifies the random seed for reproducibility.
+ Default Value: 42
+ Types: int
+
+ imbalance_handling_method:
+ Optional Argument.
+ Specifies which data imbalance method to use.
+ Default Value: SMOTE
+ Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
+ Types: str
+
+ RETURNS:
+ Instance of AutoFraud.
+
+ RAISES:
+ TeradataMlException, TypeError, ValueError
+
+ EXAMPLES:
+ # Notes:
+ # 1. Get the connection to Vantage to execute the function.
+ # 2. One must import the required functions mentioned in
+ # the example from teradataml.
+ # 3. Function will raise error if not supported on the Vantage
+ # user is connected to.
+
+ # Load the example data.
+ >>> load_example_data("teradataml", ["credit_fraud_dataset", "payment_fraud_dataset"])
+
+ # Create teradataml DataFrame object.
+ >>> credit_fraud_df = DataFrame.from_table("credit_fraud_dataset")
+ >>> payment_fraud_df = DataFrame.from_table("payment_fraud_dataset")
+
+ # Example 1 : Run AutoFraud for fraud detection problem.
+ # Scenario : Predict whether a transaction is fraudulent or not.
+
+ # Split the data into train and test.
+ >>> credit_fraud_sample = credit_fraud_df.sample(frac = [0.8, 0.2])
+ >>> credit_fraud_train = credit_fraud_sample[credit_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> credit_fraud_test = credit_fraud_sample[credit_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Create instance of AutoFraud.
+ >>> automl_obj = AutoFraud()
+
+ # Fit the data.
+ >>> automl_obj.fit(credit_fraud_train, "Credit_Class")
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(credit_fraud_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(credit_fraud_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(credit_fraud_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using model rank 4.
+ >>> performance_metrics = automl_obj.evaluate(credit_fraud_test, 4)
+ >>> performance_metrics
+
+ # Example 2 : Run AutoFraud for fraud detection.
+ # Scenario : Predict whether a transaction is fraudulent or not. Run AutoFraud to get the
+ # best performing model out of available models. Use custom
+ # configuration file to customize different processes of
+ # AutoFraud run.
+
+ # Split the data into train and test.
+ >>> payment_fraud_sample = payment_fraud_df.sample(frac = [0.8, 0.2])
+ >>> payment_fraud_train = payment_fraud_sample[payment_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> payment_fraud_test = payment_fraud_sample[payment_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Generate custom configuration file.
+ >>> AutoFraud.generate_custom_config("custom_fraud")
+
+ # Create instance of AutoFraud.
+ >>> automl_obj = AutoFraud(verbose=2,
+ >>> custom_config_file="custom_fraud.json")
+
+ # Fit the data.
+ >>> automl_obj.fit(payment_fraud_train, payment_fraud_train.isFraud)
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(payment_fraud_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(payment_fraud_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(payment_fraud_test)
+ >>> performance_metrics
+
+ # Example 3 : Run AutoFraud for fraud detection with stopping metric, stopping tolerance, and imbalance handling method.
+ # Scenario : Predict whether a transaction is fraudulent or not. Use custom configuration
+ # file to customize different processes of AutoFraud run. Define
+ # performance threshold to acquire for the available models, and
+ # terminate training upon meeting the stipulated performance criteria.
+
+ # Split the data into train and test.
+ >>> credit_fraud_sample = credit_fraud_df.sample(frac = [0.8, 0.2])
+ >>> credit_fraud_train = credit_fraud_sample[credit_fraud_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> credit_fraud_test = credit_fraud_sample[credit_fraud_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Generate custom configuration file.
+ >>> AutoFraud.generate_custom_config("custom_fraud")
+
+ # Create instance of AutoFraud.
+ >>> automl_obj = AutoFraud(verbose=2,
+ >>> stopping_metric="MACRO-F1",
+ >>> stopping_tolerance=0.7,
+ >>> imbalance_handling_method="ADASYN",
+ >>> custom_config_file="custom_fraud.json")
+
+ # Fit the data.
+ >>> automl_obj.fit(credit_fraud_train, credit_fraud_train.Credit_Class)
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(credit_fraud_test)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(credit_fraud_test)
+ >>> performance_metrics
+ """
+ # Validate unsupported 'task_type' argument
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
+
+ # Validate unsupported 'is_fraud' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
+
+ # Validate unsupported 'is_churn' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
+
+ super().__init__(include=include,
+ exclude=exclude,
+ verbose=verbose,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models,
+ fraud=True,
+ is_fraud=True,
+ task_type="Classification",
+ custom_config_file=custom_config_file,
+ **kwargs)
+
+ class AutoChurn(AutoML):
+
+ def __init__(self,
+ include=None,
+ exclude=None,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
+ custom_config_file=None,
+ **kwargs):
+
+ """
+ DESCRIPTION:
+ AutoChurn is a dedicated AutoML pipeline designed specifically for churn prediction
+ tasks. It automates the process of building, training, and evaluating models
+ tailored to identify customer churn, streamlining the workflow for churn prediction
+ use cases.
+
+ PARAMETERS:
+ include:
+ Optional Argument.
+ Specifies the model algorithms to be used for model training phase.
+ By default, all 5 models are used for training for this specific binary
+ classification problem.
+ Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
+ Types: str OR list of str
+
+ exclude:
+ Optional Argument.
+ Specifies the model algorithms to be excluded from model training phase.
+ No model is excluded by default.
+ Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
+ Types: str OR list of str
+
+ verbose:
+ Optional Argument.
+ Specifies the detailed execution steps based on verbose level.
+ Default Value: 0
+ Permitted Values:
+ * 0: prints the progress bar and leaderboard.
+ * 1: prints the execution steps of AutoML.
+ * 2: prints the intermediate data between the execution of each step of AutoML.
+ Types: int
+
+ max_runtime_secs:
+ Optional Argument.
+ Specifies the time limit in seconds for model training.
+ Types: int
+
+ stopping_metric:
+ Required, when "stopping_tolerance" is set, otherwise optional.
+ Specifies the stopping metric for stopping tolerance in model training.
+ Permitted Values: 'MICRO-F1','MACRO-F1','MICRO-RECALL','MACRO-RECALL',
+ 'MICRO-PRECISION', 'MACRO-PRECISION','WEIGHTED-PRECISION',
+ 'WEIGHTED-RECALL', 'WEIGHTED-F1', 'ACCURACY'
+ Types: str
+
+ stopping_tolerance:
+ Required, when "stopping_metric" is set, otherwise optional.
+ Specifies the stopping tolerance for stopping metric in model training.
+ Types: float
+
+ max_models:
+ Optional Argument.
+ Specifies the maximum number of models to be trained.
+ Types: int
+
+ custom_config_file:
+ Optional Argument.
+ Specifies the path of JSON file in case of custom run.
+ Types: str
+
+ **kwargs:
+ Specifies the additional arguments for AutoChurn. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the interim results of the
+ functions in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the interim results of the
+ functions in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
+
+ seed:
+ Optional Argument.
+ Specifies the random seed for reproducibility.
+ Default Value: 42
+ Types: int
+
+ imbalance_handling_method:
+ Optional Argument.
+ Specifies which data imbalance method to use.
+ Default Value: SMOTE
+ Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
+ Types: str
+
+ RETURNS:
+ Instance of AutoChurn.
+
+ RAISES:
+ TeradataMlException, TypeError, ValueError
+
+ EXAMPLES:
+ # Notes:
+ # 1. Get the connection to Vantage to execute the function.
+ # 2. One must import the required functions mentioned in
+ # the example from teradataml.
+ # 3. Function will raise error if not supported on the Vantage
+ # user is connected to.
+
+ # Load the example data.
+ >>> load_example_data("teradataml", "bank_churn")
+
+ # Create teradataml DataFrame object.
+ >>> churn_df = DataFrame.from_table("bank_churn")
+
+ # Example 1 : Run AutoChurn for churn prediction problem.
+ # Scenario : Predict whether a bank customer churns or not.
+
+ # Split the data into train and test.
+ >>> churn_sample = churn_df.sample(frac = [0.8, 0.2])
+ >>> churn_train = churn_sample[churn_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> churn_test = churn_sample[churn_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Create instance of AutoChurn.
+ >>> automl_obj = AutoChurn()
+
+ # Fit the data.
+ >>> automl_obj.fit(churn_train, "churn")
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(churn_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(churn_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(churn_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using model rank 4.
+ >>> performance_metrics = automl_obj.evaluate(churn_test, 4)
+ >>> performance_metrics
+
+ # Example 2 : Run AutoChurn for churn prediction with stopping metric, stopping tolerance, and imbalance handling method.
+ # Scenario : Predict whether a bank customer churns or not. Use custom configuration
+ # file to customize different processes of AutoML run. Define
+ # performance threshold to acquire for the available models, and
+ # terminate training upon meeting the stipulated performance criteria.
+
+ # Split the data into train and test.
+ >>> churn_sample = churn_df.sample(frac = [0.8, 0.2])
+ >>> churn_train = churn_sample[churn_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> churn_test = churn_sample[churn_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Generate custom configuration file.
+ >>> AutoChurn.generate_custom_config("custom_churn")
+
+ # Create instance of AutoChurn.
+ >>> automl_obj = AutoChurn(verbose=2,
+ >>> stopping_metric="MACRO-F1",
+ >>> stopping_tolerance=0.7,
+ >>> imbalance_handling_method="ADASYN",
+ >>> custom_config_file="custom_churn.json")
+
+ # Fit the data.
+ >>> automl_obj.fit(churn_train, churn_train.churn)
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(churn_test)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(churn_test)
+ >>> performance_metrics
+ """
+
+ # Validate unsupported 'task_type' argument
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
+
+ # Validate unsupported 'is_churn' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
+
+ # Validate unsupported 'is_fraud' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
+
+ super().__init__(include=include,
+ exclude=exclude,
+ verbose=verbose,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models,
+ churn=True,
+ is_churn=True,
+ task_type="Classification",
+ custom_config_file=custom_config_file,
+ **kwargs)
+
+ class _Clustering(_FeatureExplore, _FeatureEngineering, _DataPreparation, _ModelTraining):
+
+ def __init__(self,
+ data,
+ target_column=None,
+ custom_data=None,
+ **kwargs):
+ """
+ DESCRIPTION:
+ Function initializes the data for clustering pipeline using AutoML components.
+
+ PARAMETERS:
+ data:
+ Required Argument.
+ Specifies the input teradataml DataFrame.
+ Types: teradataml DataFrame
+
+ target_column:
+ Set to None as no target column is present for clustering data.
+ Types: str
+
+ custom_data:
+ Optional Argument.
+ Specifies the JSON object containing user customized input.
+ Types: JSON object
+ """
+ # Validate unsupported 'task_type' argument
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
+
+ self.data = data
+ self.target_column = target_column # Typically None, but kept for compatibility
+ self.custom_data = custom_data
+ self.cluster = True
+ self.task_type = "Clustering"
+
+ super().__init__(data=data,
+ target_column=target_column,
+ custom_data=custom_data,
+ **kwargs)
+
+ def _clustering(self,
+ model_list=None,
+ auto=False,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
+ **kwargs):
+ """
+ DESCRIPTION:
+ Internal function that runs clustering using AutoML components.
+
+ PARAMETERS:
+ model_list:
+ Optional Argument.
+ Specifies the list of model algorithms to be used for model training phase.
+ Default Value: ["KMeans", "GaussianMixture"]
+ Permitted Values: "KMeans", "GaussianMixture"
+ Types: list of str
+
+ auto:
+ Optional Argument.
+ Specifies whether to run AutoML in custom mode or auto mode.
+ When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
+ Types: bool
+
+ verbose:
+ Optional Argument.
+ Specifies the detailed execution steps based on verbose level.
+ Default Value: 0
+ Permitted Values:
+ * 0: prints the progress bar and leaderboard.
+ * 1: prints the execution steps of AutoML.
+ * 2: prints the intermediate data between the execution of each step of AutoML.
+ Types: int
+
+ max_runtime_secs:
+ Optional Argument.
+ Specifies the time limit in seconds for model training.
+ Types: int
+
+ stopping_metric:
+ Required, when "stopping_tolerance" is set, otherwise optional.
+ Specifies the stopping metric for stopping tolerance in model training.
+ Types: str
+
+ stopping_tolerance:
+ Required, when "stopping_metric" is set, otherwise optional.
+ Specifies the stopping tolerance for stopping metric in model training.
+ Types: float
+
+ max_models:
+ Optional Argument.
+ Specifies the maximum number of models to be trained.
+ Types: int
+
+ **kwargs:
+ Specifies the additional arguments for AutoCluster. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the results of the
+ function in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the results of the
+ function in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
+
+ RETURNS:
+ A tuple containing model information and leaderboard.
+ """
+
+ # Feature Exploration Phase
+ _FeatureExplore.__init__(self,
+ data=self.data,
+ target_column=None,
+ custom_data=self.custom_data,
+ verbose=verbose,
+ cluster=True,
+ **kwargs)
+ if verbose > 0:
+ self._exploration()
+
+ # Feature Engineering Phase
+ _FeatureEngineering.__init__(self,
+ data=self.data,
+ target_column=None,
+ model_list=model_list,
+ verbose=verbose,
+ task_type="Clustering",
+ custom_data=self.custom_data,
+ cluster=True,
+ **kwargs)
+
+ start_time = time.time()
+ data, excluded_columns, _, data_transformation_params, data_mapping = self.feature_engineering(auto)
+
+ # Data Preparation Phase
+ _DataPreparation.__init__(self,
+ data=self.data,
+ target_column=None,
+ verbose=verbose,
+ excluded_columns=excluded_columns,
+ custom_data=self.custom_data,
+ data_transform_dict=data_transformation_params,
+ task_type="Clustering",
+ data_mapping=data_mapping,
+ cluster=True,
+ **kwargs)
+ features, data_transformation_params, data_mapping = self.data_preparation(auto)
+
+ # Adjust time left
+ max_runtime_secs = max_runtime_secs - (time.time() - start_time) \
+ if max_runtime_secs is not None else None
+ max_runtime_secs = 200 if max_runtime_secs is not None and max_runtime_secs < 120 else max_runtime_secs
+
+ # Model Training Phase
+ _ModelTraining.__init__(self,
+ data=self.data,
+ target_column=None,
+ model_list=model_list,
+ verbose=verbose,
+ features=features,
+ task_type="Clustering",
+ custom_data=self.custom_data,
+ cluster=True,
+ **kwargs)
+ models_info, leaderboard, _ = self.model_training(auto=auto,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models)
+
+ return (models_info, leaderboard, None, None, data_transformation_params, data_mapping)
+
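One detail worth noting in the "Adjust time left" step of _clustering above: the training budget is whatever remains of max_runtime_secs after feature engineering and data preparation, and if fewer than 120 seconds remain the budget is reset to 200 seconds rather than letting model training run with an impractically small limit. A short worked sketch of that expression, with illustrative numbers only:

    max_runtime_secs = 300   # user-supplied budget (illustrative)
    elapsed = 250            # seconds spent on feature engineering + data preparation
    remaining = max_runtime_secs - elapsed              # 50 seconds left
    remaining = 200 if remaining < 120 else remaining   # floor applied -> 200 seconds for training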
+ class AutoCluster(AutoML):
+
+ def __init__(self,
+ include=None,
+ exclude=None,
+ verbose=0,
+ max_runtime_secs=None,
+ stopping_metric=None,
+ stopping_tolerance=None,
+ max_models=None,
+ custom_config_file=None,
+ **kwargs):
+
+ """
+ DESCRIPTION:
+ AutoCluster is a dedicated AutoML pipeline designed specifically for clustering tasks.
+ It automates the process of building, training, and evaluating clustering models,
+ streamlining the workflow for unsupervised learning use cases where the goal is
+ to group data into clusters.
+
+ PARAMETERS:
+ include:
+ Optional Argument.
+ Specifies the model algorithms to be used for model training phase.
+ By default, both models are used for training for clustering.
+ Permitted Values: "KMeans", "GaussianMixture"
+ Types: str OR list of str
+
+ exclude:
+ Optional Argument.
+ Specifies the model algorithms to be excluded from model training phase.
+ No model is excluded by default.
+ Permitted Values: "KMeans", "GaussianMixture"
+ Types: str OR list of str
+
+ verbose:
+ Optional Argument.
+ Specifies the detailed execution steps based on verbose level.
+ Default Value: 0
+ Permitted Values:
+ * 0: prints the progress bar and leaderboard.
+ * 1: prints the execution steps of AutoML.
+ * 2: prints the intermediate data between the execution of each step of AutoML.
+ Types: int
+
+ max_runtime_secs:
+ Optional Argument.
+ Specifies the time limit in seconds for model training.
+ Types: int
+
+ stopping_metric:
+ Required, when "stopping_tolerance" is set, otherwise optional.
+ Specifies the stopping metric for stopping tolerance in model training.
+ Permitted Values: "SILHOUETTE", "CALINSKI", "DAVIES"
+ Types: str
+
+ stopping_tolerance:
+ Required, when "stopping_metric" is set, otherwise optional.
+ Specifies the stopping tolerance for stopping metric in model training.
+ Types: float
+
+ max_models:
+ Optional Argument.
+ Specifies the maximum number of models to be trained.
+ Types: int
+
+ custom_config_file:
+ Optional Argument.
+ Specifies the path of JSON file in case of custom run.
+ Types: str
+
+ **kwargs:
+ Specifies the additional arguments for AutoCluster. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the interim results of the
+ functions in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the interim results of the
+ functions in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
+
+ seed:
+ Optional Argument.
+ Specifies the random seed for reproducibility.
+ Default Value: 42
+ Types: int
+
+ RETURNS:
+ Instance of AutoCluster.
+
+ RAISES:
+ TeradataMlException, TypeError, ValueError
+
+ EXAMPLES:
+ # Notes:
+ # 1. Get the connection to Vantage to execute the function.
+ # 2. One must import the required functions mentioned in
+ # the example from teradataml.
+ # 3. Function will raise error if not supported on the Vantage
+ # user is connected to.
+
+ # Load the example data.
+ >>> load_example_data("teradataml", ["bank_marketing", "Mall_customer_data"])
+
+ # Create teradataml DataFrame object.
+ >>> bank_df = DataFrame.from_table("bank_marketing")
+ >>> mall_df = DataFrame.from_table("Mall_customer_data")
+
+ # Example 1: Use AutoCluster for unsupervised clustering task based on bank data.
+ # Scenario: Automatically group similar records in the dataset into clusters.
+
+ # Split the data into train and test.
+ >>> bank_sample = bank_df.sample(frac = [0.8, 0.2])
+ >>> bank_train = bank_sample[bank_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> bank_test = bank_sample[bank_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Create instance of AutoCluster.
+ >>> automl_obj = AutoCluster()
+
+ # Fit the data.
+ >>> automl_obj.fit(bank_train)
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(bank_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(bank_test, rank=2)
+ >>> prediction
+
+ # Example 2: Use AutoCluster to segment Mall customer data.
+ # Scenario: Automatically identify and group similar customers into clusters.
+
+ # Split the data into train and test.
+ >>> mall_sample = mall_df.sample(frac = [0.8, 0.2])
+ >>> mall_train = mall_sample[mall_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> mall_test = mall_sample[mall_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
+ # Generate custom configuration file.
+ >>> AutoCluster.generate_custom_config("custom_mall_clustering")
+
+ # Create instance of AutoCluster.
+ >>> automl_obj = AutoCluster(verbose=2,
+ >>> custom_config_file="custom_mall_clustering.json")
+
+ # Fit the data.
+ >>> automl_obj.fit(mall_train)
+
+ # Display leaderboard.
+ >>> automl_obj.leaderboard()
+
+ # Display best performing model.
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(mall_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(mall_test, rank=2)
+ >>> prediction
+ """
+
+ # Validate unsupported 'task_type' argument
+ _Validators._validate_unsupported_argument(kwargs.get("task_type", None), "task_type")
+
+ # Validate unsupported 'is_churn' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_churn", None), "is_churn")
+
+ # Validate unsupported 'is_fraud' argument
+ _Validators._validate_unsupported_argument(kwargs.get("is_fraud", None), "is_fraud")
+
+ super().__init__(include=include,
+ exclude=exclude,
+ verbose=verbose,
+ max_runtime_secs=max_runtime_secs,
+ stopping_metric=stopping_metric,
+ stopping_tolerance=stopping_tolerance,
+ max_models=max_models,
+ task_type="Clustering",
+ custom_config_file=custom_config_file,
+ **kwargs)
+
+ @staticmethod
+ def visualize(**kwargs):
+ # Currently AutoCluster does not support visualize, so raise the exception
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.UNSUPPORTED_OPERATION),
+ MessageCodes.UNSUPPORTED_OPERATION)
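Because visualize() unconditionally raises an unsupported-operation error for AutoCluster in this release, code that is shared across AutoML variants may want to guard the call. A minimal sketch under the assumption that automl_obj is a fitted AutoCluster instance and that TeradataMlException is importable from teradataml.common.exceptions (as used elsewhere in the package):

    from teradataml.common.exceptions import TeradataMlException

    try:
        automl_obj.visualize()
    except TeradataMlException:
        # AutoCluster does not support visualize in this release; fall back
        # to inspecting the leaderboard instead.
        automl_obj.leaderboard()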