teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/automl/model_training.py

@@ -1,6 +1,6 @@
 # ##################################################################
 #
-# Copyright
+# Copyright 2025 Teradata. All rights reserved.
 # TERADATA CONFIDENTIAL AND TRADE SECRET
 #
 # Primary Owner: Sweta Shaw
@@ -29,7 +29,7 @@ from teradataml import execute_sql, get_connection
 from teradataml import configure, SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
 from teradataml.utils.validators import _Validators
 from teradataml.common.utils import UtilFuncs
-from teradataml.common.constants import TeradataConstants
+from teradataml.common.constants import TeradataConstants, AutoMLConstants

 class _ModelTraining:

@@ -54,7 +54,7 @@ class _ModelTraining:
                 Types: teradataml Dataframe

             target_column:
-                Required Argument.
+                Required Argument. (Not required for Clustering task_type)
                 Specifies the target column present inside the dataset.
                 Types: str

@@ -83,9 +83,9 @@ class _ModelTraining:
             task_type:
                 Required Argument.
                 Specifies the task type for AutoML, whether to apply regresion
-                or classification on the provived dataset.
+                or classification or clustering on the provived dataset.
                 Default Value: "Regression"
-                Permitted Values: "Regression", "Classification"
+                Permitted Values: "Regression", "Classification", "Clustering"
                 Types: str

             custom_data:
@@ -120,12 +120,17 @@ class _ModelTraining:
                 Specifies the random seed for reproducibility.
                 Default Value: 42
                 Types: int
+
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
         """
         self.data = data
         self.target_column = target_column
         self.model_list = model_list
         self.verbose = verbose
-        self.features = (features[1], features[0], features[2])
         self.task_type = task_type
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
@@ -133,14 +138,19 @@ class _ModelTraining:
         self.persist = kwargs.get("persist", False)
         self.volatile = kwargs.get("volatile", False)
         self.seed = kwargs.get("seed", 42)
-
+        self.cluster = kwargs.get("cluster", False)
+
+        if not self.cluster:
+            self.features = (features[1], features[0], features[2])
+        else:
+            self.features = (features[1], features[0])
+
     def model_training(self,
                        auto=True,
                        max_runtime_secs=None,
                        stopping_metric=None,
                        stopping_tolerance=0,
-                       max_models=None
-                       ):
+                       max_models=None):
         """
         DESCRIPTION:
             Function to perform following tasks:-
@@ -231,7 +241,12 @@ class _ModelTraining:
             int containing, total number of models available for training.
         """
         # Creating all possible combinations of hyperparameters
-
+        if 'param_grid' in hyperparameters:
+            grid = hyperparameters['param_grid']
+        else:
+            # AutoML style: full dict is hyperparameter space
+            grid = hyperparameters
+        all_combinations = list(product(*[v if isinstance(v, (list, tuple)) else [v] for v in grid.values()]))
         # Getting total number of models for each model model training function
         total_models = len(all_combinations)
         return total_models
@@ -279,21 +294,34 @@ class _ModelTraining:
             None
         """
         self._display_msg(msg="\nHyperparameters used for model training: ",
-                          progress_bar
+                          progress_bar=self.progress_bar,
                           show_data=True)
         print(" " *150, end='\r', flush=True)

         # Iterating over hyperparameters_list
         for hyperparameter_dct in hyperparameters_list:
- [2 deleted lines not captured]
-            # Displaying hyperparameters
-            print(f"{key} : {str(val)}")
+            name = hyperparameter_dct.get("name", "Unnamed Model")
+            print(f"Model: {name}")

-
+            if self.cluster and "param_grid" in hyperparameter_dct:
+                # Also show metadata outside param_grid
+                for meta_key, meta_val in hyperparameter_dct.items():
+                    if meta_key != "param_grid":
+                        print(f"{meta_key}: {meta_val}")
+
+                print("Hyperparameter Grid:")
+                for key, val in hyperparameter_dct["param_grid"].items():
+                    print(f" {key}: {val}")
+
+            else:
+                print("Hyperparameters:")
+                for key, val in hyperparameter_dct.items():
+                    print(f" {key}: {val}")
+
             total_models = self._get_model_param_space(hyperparameter_dct)
-
-            print(f"
+
+            print(f"Total number of models for {name}: {total_models}")
+            print(f"--" * 100 + "\n")

     def _display_leaderboard(self,
                              trained_models_info):
@@ -311,14 +339,20 @@ class _ModelTraining:
             pandas Dataframe.
         """
         # Creating a copy to avoid use of same reference of memory
- [3 deleted lines not captured]
+
+
+        if not self.cluster:
+            if self.task_type != "Regression":
+                sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
+                                                                  ascending=[False, False]).reset_index(drop=True)
+            else:
+                sorted_model_df = trained_models_info.sort_values(by='R2',
+                                                                  ascending=False).reset_index(drop=True)
         else:
-            sorted_model_df = trained_models_info.sort_values(by='
-                                                              ascending=False).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by=['SILHOUETTE', 'CALINSKI', 'DAVIES'],
+                                                              ascending=[False, False, True]).reset_index(drop=True)
+

-
         # Adding rank to leaderboard
         sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)

@@ -326,7 +360,7 @@ class _ModelTraining:
         dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]

         # Excluding the model object and model name from leaderboard
-        leaderboard = sorted_model_df.drop(dp_lst
+        leaderboard = sorted_model_df.drop(columns=[col for col in dp_lst if col in sorted_model_df.columns])

         # filtering the rows based on the max_models
         if self.max_models is not None:
@@ -363,24 +397,42 @@ class _ModelTraining:
         """
         # Iterating over new hyperparameters and performing required operation
         # based on passed method ADD or REPLACE
- [11 deleted lines not captured]
+        if self.cluster:
+            # Clustering: use param_grid
+            param_grid = existing_params.get("param_grid", {})
+            for feature, param_list in new_params.items():
+                if feature in param_grid:
+                    if param_list["Method"] == "ADD":
+                        param_grid[feature] = list(param_grid[feature])
+                        param_grid[feature].extend(param_list["Value"])
+                        param_grid[feature] = tuple(set(param_grid[feature]))
+                    elif param_list["Method"] == "REPLACE":
+                        param_grid[feature] = tuple(param_list["Value"])
+                    else:
+                        self._display_msg(inline_msg="Passed method is not valid.")
                 else:
- [6 deleted lines not captured]
+                    param_grid[feature] = tuple(param_list["Value"])
+            existing_params["param_grid"] = param_grid
+
+        else:
+            for feature, param_list in new_params.items():
+                if feature in existing_params.keys():
+                    if param_list["Method"] == "ADD":
+                        # Extending existing list
+                        existing_params[feature] = list(existing_params[feature])
+                        existing_params[feature].extend(param_list["Value"])
+                        # Updating list with unique values.
+                        existing_params[feature]=tuple(set(existing_params[feature]))
+                    elif param_list["Method"] == "REPLACE":
+                        # Replacing with entirely new value
+                        existing_params[feature] = tuple(param_list["Value"])
+                    else:
+                        self._display_msg(inline_msg="Passed method is not valid.")
+                else:
+                    self._display_msg(inline_msg="\nPassed model argument {} is not"
+                                                 " available for model {}. Skipping it."
+                                                 .format(feature,existing_params['name']))
+                    continue
         # Returning updated hyperparamter
         return existing_params

@@ -422,13 +474,13 @@ class _ModelTraining:
                 hyperparameters[model_index]=self._update_hyperparameters(hyperparameters[model_index],hyp_list)
                 # Displaying it after update
                 self._display_msg(inline_msg="\nCompleted customized hyperparameter update.",
-
+                                  progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No information provided for custom hyperparameters. AutoML will proceed with default values.",
-
+                                  progress_bar=self.progress_bar)
         else:
             self._display_msg(inline_msg="\nSkipping customized hyperparameter tuning",
-
+                              progress_bar=self.progress_bar)
         # Retunring updated hyperparameters for all models
         return hyperparameters

@@ -506,7 +558,7 @@ class _ModelTraining:
             'max_depth': tuple(max_depth),
             'min_node_size': tuple(min_node_size),
             'iter_num': tuple(iter_num),
-            'seed':self.seed
+            'seed': self.seed
         }
         # Hyperparameters for Decision Forest model
         df_params = {
@@ -517,7 +569,7 @@ class _ModelTraining:
             'max_depth': tuple(max_depth),
             'min_node_size': tuple(min_node_size),
             'num_trees': tuple(num_trees),
-            'seed':self.seed
+            'seed': self.seed
         }

         # Updating model type in case of classification
@@ -663,6 +715,47 @@ class _ModelTraining:
         else:
             return None

+    def _get_kmeans_hyperparameters(self):
+        """
+        DESCRIPTION:
+            Generates hyperparameters for KMeans clustering.
+
+        RETURNS:
+            dict containing hyperparameters for KMeans.
+        """
+        params = {
+            "name": "KMeans",
+            "param_grid": {
+                'n_clusters': (2,3,4,5,6,7,8,9,10),
+                'init': ('k-means++', 'random'),
+                'n_init': (5, 10),
+                'max_iter': (100, 200),
+                'tol': (0.001, 0.01),
+                'algorithm': ('auto', 'full')
+            }
+        }
+
+        return params
+
+    def _get_gmm_hyperparameters(self):
+        """
+        DESCRIPTION:
+            Generates hyperparameters for Gaussian Mixture Model (GMM).
+
+        RETURNS:
+            dict containing hyperparameters for GMM.
+        """
+        params = {
+            "name": "GaussianMixture",
+            "param_grid": {
+                "n_components": (2,3,4,5,6,7,8,9,10),
+                "covariance_type": ("full", "tied", "diag", "spherical"),
+                "max_iter": (100, 300)
+            }
+        }
+
+        return params
+
     def _generate_parameter(self):
         """
         DESCRIPTION:
@@ -672,46 +765,54 @@ class _ModelTraining:
             list containing, dict of hyperparameters for different ML models.
         """
         # list for storing hyperparameters
-        parameters=[]
+        parameters = []
         # Index for model mapping
-        model_index=0
+        model_index = 0
         # Dictionary for mapping model with index
         self.model_mapping={}
- [29 deleted lines not captured]
-                parameters.append(model_functions[model](num_rows, num_cols))
-            else:
-                parameters.append(model_functions[model](num_rows, num_cols, model))
-            model_index += 1
+        if not self.cluster:
+            # Getting number of rows and columns
+            num_rows = self.data.shape[0]
+            num_cols = self.data.shape[1]
+
+            # Model functions mapping for hyperparameter generation
+            model_functions = {
+                'decision_forest': self._get_tree_model_hyperparameters,
+                'xgboost': self._get_tree_model_hyperparameters,
+                'knn': self._get_knn_hyperparameters,
+                'glm': self._get_linear_model_hyperparameters,
+                'svm': self._get_linear_model_hyperparameters,
+            }
+
+            if not self.cluster:
+                supported_models = AutoMLConstants.SUPERVISED_MODELS.value
+                self.model_list = [model for model in self.model_list if model in supported_models]
+
+            # Generating hyperparameters for each model
+            if self.model_list:
+                for model in self.model_list:
+                    self.model_mapping[model] = model_index
+                    if model == 'knn':
+                        parameters.append(model_functions[model](num_rows, num_cols))
+                    else:
+                        parameters.append(model_functions[model](num_rows, num_cols, model))
+                    model_index += 1
+            else:
+                raise ValueError("No model is selected for training.")
         else:
-
+            model_functions = {
+                'KMeans': self._get_kmeans_hyperparameters,
+                'GaussianMixture': self._get_gmm_hyperparameters,
+            }
+            supported_models = AutoMLConstants.CLUSTERING_MODELS.value
+            self.model_list = [model for model in self.model_list if model in supported_models]
+            if self.model_list:
+                for model in self.model_list:
+                    self.model_mapping[model] = model_index
+                    parameters.append(model_functions[model]())
+                    model_index += 1
+            else:
+                raise ValueError("No model is selected for training.")

         return parameters

@@ -723,8 +824,12 @@ class _ModelTraining:
         RETURNS:
             dictionary containing max_models distribution and list of models to remove.
         """
+        if self.cluster:
+            models = [model for model in self.model_list if model in AutoMLConstants.CLUSTERING_MODELS.value]
+        else:
+            models = [model for model in self.model_list if model in AutoMLConstants.SUPERVISED_MODELS.value]
         # Getting total number of models
-        model_count=len(
+        model_count = len(models)
         # Evenly distributing max_models across models
         base_assign = self.max_models // model_count
         # Creating list of max_models for each model
@@ -739,17 +844,20 @@ class _ModelTraining:
             distribution[i] += 1

         # Creating dictionary for model distribution
-        model_distribution = dict(zip(
+        model_distribution = dict(zip(models, distribution))
         # Getting list of models with 0 distribution and removing them from model list
         # While for model having distribution greater than 0, updating distribution with
         # 1/3rd of original value as we are training with 3 different feature selection methods.
         models_to_remove = []
- [6 deleted lines not captured]
+        if not self.cluster:
+            for model in models:
+                initial_count = model_distribution[model]
+                if initial_count == 0:
+                    models_to_remove.append(model)
+                else:
+                    model_distribution[model] = math.ceil(initial_count / 3)
+        else:
+            models_to_remove = [model for model, count in model_distribution.items() if count == 0]

         return model_distribution, models_to_remove

@@ -768,22 +876,31 @@ class _ModelTraining:
         RETURNS:
             Pandas DataFrame containing, trained models information.
         """
-
+        self.model_id_counters = {}
         # Hyperparameters for each model
         model_params = parameters[:min(len(parameters), 5)]
         self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)

         # Defining training data
- [2 deleted lines not captured]
+        if not self.cluster:
+            data_types = ['lasso', 'rfe', 'pca']
+            training_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)
+        else:
+            data_types = ['pca', 'non_pca']
+            training_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)

- [2 deleted lines not captured]
+
+
+        if self.task_type == "Classification" and not self.cluster:
+            response_values = training_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
             self.output_response = [str(i) for i in response_values]

         if self.stopping_metric is None:
- [2 deleted lines not captured]
+            if not self.cluster:
+                self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
+                    if self.is_classification_type() else 'R2'
+            else:
+                self.stopping_tolerance, self.stopping_metric = 1.0, 'SILHOUETTE'

         self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
             if self.max_runtime_secs is not None else None
@@ -798,16 +915,17 @@ class _ModelTraining:
             # Updating progress bar as we are removing model
             self.progress_bar.update()

-        if self.is_classification_type():
+        if self.is_classification_type() and not self.cluster:
             self.startify_col = self.target_column

         trained_models = []
+
         for param in model_params:
-            result = self._hyperparameter_tunning(param,
+            result = self._hyperparameter_tunning(param, training_datas)
             if result is not None:
                 trained_models.append(result)
-
         models_df = pd.concat(trained_models, ignore_index=True)
+
         return models_df

     def _hyperparameter_tunning(self,
@@ -816,7 +934,7 @@ class _ModelTraining:
         """
         DESCRIPTION:
            Internal function performs hyperparameter tuning on
-           ML models for regression/classification problems.
+           ML models for regression/classification/clustering problems.

         PARAMETERS:
             model_param
@@ -832,121 +950,196 @@ class _ModelTraining:
         RETURNS:
             pandas DataFrame containing, trained models information.
         """
-        #
- [2 deleted lines not captured]
+        # Passing verbose value based on user input
+        if self.verbose > 0:
+            print(" " *200, end='\r', flush=True)
+            verbose = 1
+        else:
+            verbose = 0
+
+        if not self.cluster:
+            # Mapping model names to functions
+            model_to_func = {"glm": GLM, "svm": SVM,
+                             "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}

- [4 deleted lines not captured]
+            # Setting eval_params for hpt.
+            eval_params = _ModelTraining._eval_params_generation(model_param['name'],
+                                                                 self.target_column,
+                                                                 self.task_type)

- [2 deleted lines not captured]
+            # Input columns for model
+            model_param['input_columns'] = self.features

- [2 deleted lines not captured]
+            # Setting persist for model
+            model_param['persist'] = self.persist

- [8 deleted lines not captured]
+            self._display_msg(msg=model_param['name'],
+                              progress_bar=self.progress_bar,
+                              show_data=True)
+
+            # As we are using entire data for HPT training. So,
+            # passing prepared training data as test_data for KNN.
+            if model_param['name'] == 'knn':
+                model_param['test_data'] = train_data

- [3 deleted lines not captured]
+            if self.task_type == "Classification":
+                model_param['output_prob'] = True
+                model_param['output_responses'] = self.output_response

- [19 deleted lines not captured]
+            # Using RandomSearch for hyperparameter tunning when max_models is given.
+            # Otherwise, using GridSearch for hyperparameter tunning.
+            if self.max_models is not None:
+                # Setting max_models for RandomSearch based on model name
+                model_param['max_models'] = self.max_models_distribution[model_param['name']]
+                # Defining RandomSearch with ML model based on Name, and max_models
+                _obj = RandomSearch(func=model_to_func[model_param['name']],
+                                    params=model_param,
+                                    n_iter=model_param['max_models'])
+            else:
+                # Defining Gridsearch with ML model based on Name
+                _obj = GridSearch(func=model_to_func[model_param['name']],
+                                  params=model_param)
+
+            # Hyperparameter tunning
+            # Parallel run opens multiple connections for parallel execution,
+            # but volatile tables are not accessible across different sessions.
+            # Therefore, execution is performed sequentially by setting run_parallel=False.
+
+            run_parallel = configure.temp_object_type != TeradataConstants.TERADATA_VOLATILE_TABLE
+
+            common_params = {
+                "data": train_data,
+                "evaluation_metric": self.stopping_metric,
+                "early_stop": self.stopping_tolerance,
+                "run_parallel": run_parallel,
+                "sample_seed": self.seed,
+                "sample_id_column": "id",
+                "discard_invalid_column_params": True,
+                "stratify_column": self.startify_col,
+                "verbose": verbose,
+                "max_time": self.max_runtime_secs,
+                "suppress_refer_msg": True
+            }

- [23 deleted lines not captured]
+            if model_param['name'] == 'knn':
+                _obj.fit(**common_params)
+            else:
+                _obj.fit(**common_params, **eval_params)
+
+            # Getting all passed models
+            model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                                on='MODEL_ID', how='inner')
+            if not model_info.empty:
+                # Creating mapping data ID to feature selection method
+                data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
+                                        "DF_1": ('rfe', train_data[1]._table_name),
+                                        "DF_2": ('pca', train_data[2]._table_name)}
+
+                # Updating model stats with feature selection method and result table
+                for index, row in model_info.iterrows():
+                    model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+                    model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+                    model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
+                    model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+
+                # Dropping column 'DATA_ID'
+                model_info.drop(['DATA_ID'], axis=1, inplace=True)
+
+                model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+                if not self.is_classification_type():
+                    # Calculating Adjusted-R2 for regression
+                    # Getting size and feature count for each feature selection method
+                    methods = ["lasso", "rfe", "pca"]
+                    size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
+                    feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
+                    model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
+                        1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
+                        (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
+
+                self._display_msg(msg="-"*100,
+                                  progress_bar=self.progress_bar,
+                                  show_data=True)
+                self.progress_bar.update()
+
+                return model_info
+            # Returning None, if no model is passed
+            return None
         else:
- [3 deleted lines not captured]
-        model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
-                                            on='MODEL_ID', how='inner')
-        if not model_info.empty:
-            # Creating mapping data ID to feature selection method
-            data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
-                                    "DF_1": ('rfe', train_data[1]._table_name),
-                                    "DF_2": ('pca', train_data[2]._table_name)}
+            import time
+            from teradataml import td_sklearn as skl

-
-            for index, row in model_info.iterrows():
-                model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
-                model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
-                model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
-                model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+            model_name = model_param['name']

-            # Dropping column 'DATA_ID'
-            model_info.drop(['DATA_ID'], axis=1, inplace=True)

-
+            self._display_msg(msg=model_name,
+                              progress_bar=self.progress_bar, show_data=True)
+
+            if model_name == "KMeans":
+                model_func = skl.KMeans()
+                param_key = "n_clusters"
+                pred_col = "kmeans_predict_1"
+            elif model_name == "GaussianMixture":
+                model_func = skl.GaussianMixture()
+                param_key = "n_components"
+                pred_col = "gaussianmixture_predict_1"
+            else:
+                raise ValueError(f"Unsupported model: {model_name}")
+
+            model_param["input_columns"] = self.features
+            model_param["persist"] = self.persist

-            if not
- [18 deleted lines not captured]
+            if self.max_models is not None:
+                model_param['max_models'] = self.max_models_distribution[model_name]
+
+                search_obj = RandomSearch(func=model_func,
+                                          params=model_param['param_grid'],
+                                          n_iter=model_param['max_models'])
+            else:
+                search_obj = GridSearch(func=model_func, params=model_param["param_grid"])
+
+            search_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
+                           early_stop=self.stopping_tolerance, run_parallel=True,
+                           sample_seed=self.seed, verbose=verbose, max_time=self.max_runtime_secs)
+
+            model_df = search_obj.models[search_obj.models["STATUS"] == "PASS"]
+            if model_df.empty:
+                print("No models passed. Exiting.")
+                self.progress_bar.update()
+                return None
+
+            model_stats = search_obj.model_stats
+            model_info = model_stats.merge(model_df[['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                           on="MODEL_ID", how="inner")
+
+            if not model_info.empty:
+                # Creating mapping data ID to feature selection method
+                data_id_to_table_map = {"DF_0": ('pca', train_data[1]._table_name),
+                                        "DF_1": ('non_pca', train_data[0]._table_name)}
+
+                # Updating model stats with feature selection method and result table
+                for index, row in model_info.iterrows():
+                    model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+                    model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+                    model_info.loc[index, 'model-obj'] = search_obj.get_model(row['MODEL_ID'])
+
+                # Dropping column 'DATA_ID'
+                model_info.drop(['DATA_ID'], axis=1, inplace=True)

+                model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+
+                self._display_msg(msg="-"*100,
+                                  progress_bar=self.progress_bar,
+                                  show_data=True)
+                self.progress_bar.update()
+
+                return model_info
+
+            return None
+
+
     @staticmethod
     def _eval_params_generation(ml_name,
                                 target_column,
@@ -980,7 +1173,7 @@ class _ModelTraining:
         """
         # Setting the eval_params
         eval_params = {"id_column": "id",
-
+                       "accumulate": target_column}

         model_type = {
             'xgboost': 'model_type',
@@ -1013,4 +1206,4 @@ class _ModelTraining:
         elif ml_name == 'glm':
             eval_params['family'] = 'GAUSSIAN'

-        return eval_params
+        return eval_params
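For reference, the model-count arithmetic introduced in _get_model_param_space() above treats every value in a hyperparameter grid as a list/tuple of candidates and takes their Cartesian product. Below is a minimal standalone sketch of that counting in plain Python; it needs no Teradata connection, the two grids are copied from the new _get_kmeans_hyperparameters and _get_gmm_hyperparameters helpers shown in the diff, and count_models is an illustrative stand-in rather than a teradataml API.

# Standalone sketch of the combination counting performed by the new
# _get_model_param_space(): each grid value is normalised to a list/tuple of
# candidates and the model count is the product of their lengths.
# (count_models is illustrative only; it is not part of the teradataml API.)
from itertools import product

# Grids copied from _get_kmeans_hyperparameters / _get_gmm_hyperparameters above.
kmeans_grid = {
    'n_clusters': (2, 3, 4, 5, 6, 7, 8, 9, 10),
    'init': ('k-means++', 'random'),
    'n_init': (5, 10),
    'max_iter': (100, 200),
    'tol': (0.001, 0.01),
    'algorithm': ('auto', 'full'),
}
gmm_grid = {
    "n_components": (2, 3, 4, 5, 6, 7, 8, 9, 10),
    "covariance_type": ("full", "tied", "diag", "spherical"),
    "max_iter": (100, 300),
}

def count_models(grid):
    # Scalars (for example a fixed 'seed') count as a single candidate, matching
    # the `v if isinstance(v, (list, tuple)) else [v]` normalisation in the diff.
    combos = product(*[v if isinstance(v, (list, tuple)) else [v] for v in grid.values()])
    return len(list(combos))

print(count_models(kmeans_grid))  # 9 * 2 * 2 * 2 * 2 * 2 = 288 KMeans candidates
print(count_models(gmm_grid))     # 9 * 4 * 2 = 72 GaussianMixture candidates

With the default grids, the clustering path therefore enumerates 288 KMeans and 72 GaussianMixture candidates before the max_models distribution or RandomSearch sampling shown in the diff trims them.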