teradataml-20.0.0.6-py3-none-any.whl → teradataml-20.0.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (96)
  1. teradataml/README.md +210 -0
  2. teradataml/__init__.py +1 -1
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +162 -76
  5. teradataml/analytics/byom/__init__.py +1 -1
  6. teradataml/analytics/json_parser/__init__.py +2 -0
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
  8. teradataml/analytics/json_parser/metadata.py +22 -4
  9. teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
  10. teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
  11. teradataml/analytics/sqle/__init__.py +3 -0
  12. teradataml/analytics/utils.py +4 -1
  13. teradataml/automl/__init__.py +2369 -464
  14. teradataml/automl/autodataprep/__init__.py +15 -0
  15. teradataml/automl/custom_json_utils.py +184 -112
  16. teradataml/automl/data_preparation.py +113 -58
  17. teradataml/automl/data_transformation.py +154 -53
  18. teradataml/automl/feature_engineering.py +113 -53
  19. teradataml/automl/feature_exploration.py +548 -25
  20. teradataml/automl/model_evaluation.py +260 -32
  21. teradataml/automl/model_training.py +399 -206
  22. teradataml/clients/auth_client.py +2 -2
  23. teradataml/common/aed_utils.py +11 -2
  24. teradataml/common/bulk_exposed_utils.py +4 -2
  25. teradataml/common/constants.py +62 -2
  26. teradataml/common/garbagecollector.py +50 -21
  27. teradataml/common/messagecodes.py +47 -2
  28. teradataml/common/messages.py +19 -1
  29. teradataml/common/sqlbundle.py +23 -6
  30. teradataml/common/utils.py +116 -10
  31. teradataml/context/aed_context.py +16 -10
  32. teradataml/data/Employee.csv +5 -0
  33. teradataml/data/Employee_Address.csv +4 -0
  34. teradataml/data/Employee_roles.csv +5 -0
  35. teradataml/data/JulesBelvezeDummyData.csv +100 -0
  36. teradataml/data/byom_example.json +5 -0
  37. teradataml/data/creditcard_data.csv +284618 -0
  38. teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
  39. teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
  40. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
  41. teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
  42. teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
  43. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
  44. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
  45. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
  46. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
  47. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
  48. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
  49. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
  50. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
  51. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
  52. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
  53. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
  54. teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
  55. teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
  56. teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
  57. teradataml/data/load_example_data.py +29 -11
  58. teradataml/data/payment_fraud_dataset.csv +10001 -0
  59. teradataml/data/teradataml_example.json +67 -0
  60. teradataml/dataframe/copy_to.py +714 -54
  61. teradataml/dataframe/dataframe.py +1153 -33
  62. teradataml/dataframe/dataframe_utils.py +8 -3
  63. teradataml/dataframe/functions.py +168 -1
  64. teradataml/dataframe/setop.py +4 -1
  65. teradataml/dataframe/sql.py +141 -9
  66. teradataml/dbutils/dbutils.py +470 -35
  67. teradataml/dbutils/filemgr.py +1 -1
  68. teradataml/hyperparameter_tuner/optimizer.py +456 -142
  69. teradataml/lib/aed_0_1.dll +0 -0
  70. teradataml/lib/libaed_0_1.dylib +0 -0
  71. teradataml/lib/libaed_0_1.so +0 -0
  72. teradataml/lib/libaed_0_1_aarch64.so +0 -0
  73. teradataml/scriptmgmt/UserEnv.py +234 -34
  74. teradataml/scriptmgmt/lls_utils.py +43 -17
  75. teradataml/sdk/_json_parser.py +1 -1
  76. teradataml/sdk/api_client.py +9 -6
  77. teradataml/sdk/modelops/_client.py +3 -0
  78. teradataml/series/series.py +12 -7
  79. teradataml/store/feature_store/constants.py +601 -234
  80. teradataml/store/feature_store/feature_store.py +2886 -616
  81. teradataml/store/feature_store/mind_map.py +639 -0
  82. teradataml/store/feature_store/models.py +5831 -214
  83. teradataml/store/feature_store/utils.py +390 -0
  84. teradataml/table_operators/table_operator_util.py +1 -1
  85. teradataml/table_operators/templates/dataframe_register.template +6 -2
  86. teradataml/table_operators/templates/dataframe_udf.template +6 -2
  87. teradataml/utils/docstring.py +527 -0
  88. teradataml/utils/dtypes.py +93 -0
  89. teradataml/utils/internal_buffer.py +2 -2
  90. teradataml/utils/utils.py +41 -2
  91. teradataml/utils/validators.py +694 -17
  92. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
  93. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
  94. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
  95. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
  96. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
@@ -1,6 +1,6 @@
  # ##################################################################
  #
- # Copyright 2024 Teradata. All rights reserved.
+ # Copyright 2025 Teradata. All rights reserved.
  # TERADATA CONFIDENTIAL AND TRADE SECRET
  #
  # Primary Owner: Sweta Shaw
@@ -29,7 +29,7 @@ from teradataml import execute_sql, get_connection
  from teradataml import configure, SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
  from teradataml.utils.validators import _Validators
  from teradataml.common.utils import UtilFuncs
- from teradataml.common.constants import TeradataConstants
+ from teradataml.common.constants import TeradataConstants, AutoMLConstants

  class _ModelTraining:

@@ -54,7 +54,7 @@ class _ModelTraining:
  Types: teradataml Dataframe

  target_column:
- Required Argument.
+ Required Argument. (Not required for Clustering task_type)
  Specifies the target column present inside the dataset.
  Types: str

@@ -83,9 +83,9 @@ class _ModelTraining:
  task_type:
  Required Argument.
  Specifies the task type for AutoML, whether to apply regresion
- or classification on the provived dataset.
+ or classification or clustering on the provived dataset.
  Default Value: "Regression"
- Permitted Values: "Regression", "Classification"
+ Permitted Values: "Regression", "Classification", "Clustering"
  Types: str

  custom_data:
@@ -120,12 +120,17 @@
  Specifies the random seed for reproducibility.
  Default Value: 42
  Types: int
+
+ cluster:
+ Optional Argument.
+ Specifies whether to apply clustering techniques.
+ Default Value: False
+ Types: bool
  """
  self.data = data
  self.target_column = target_column
  self.model_list = model_list
  self.verbose = verbose
- self.features = (features[1], features[0], features[2])
  self.task_type = task_type
  self.custom_data = custom_data
  self.labels = self.data.drop_duplicate(self.target_column).size
@@ -133,14 +138,19 @@
  self.persist = kwargs.get("persist", False)
  self.volatile = kwargs.get("volatile", False)
  self.seed = kwargs.get("seed", 42)
-
+ self.cluster = kwargs.get("cluster", False)
+
+ if not self.cluster:
+ self.features = (features[1], features[0], features[2])
+ else:
+ self.features = (features[1], features[0])
+
  def model_training(self,
  auto=True,
  max_runtime_secs=None,
  stopping_metric=None,
  stopping_tolerance=0,
- max_models=None
- ):
+ max_models=None):
  """
  DESCRIPTION:
  Function to perform following tasks:-
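Note: the constructor hunks above introduce an optional `cluster` flag (read from `**kwargs`, default `False`) in `_ModelTraining` and make the `features` tuple two-slot instead of three-slot when clustering is requested. A standalone sketch of just that selection logic, using made-up feature lists rather than AutoML's internal state:

```python
# Illustrative sketch only -- mirrors the kwargs handling in the hunk above,
# detached from the _ModelTraining class.
def resolve_features(features, **kwargs):
    cluster = kwargs.get("cluster", False)   # new optional flag, defaults to False
    if not cluster:
        # supervised path keeps three feature-selection views
        return (features[1], features[0], features[2])
    # clustering path keeps only two views
    return (features[1], features[0])

print(resolve_features((["a"], ["b"], ["c"])))                # reordered 3-tuple
print(resolve_features((["a"], ["b"], ["c"]), cluster=True))  # 2-tuple
```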
@@ -231,7 +241,12 @@
  int containing, total number of models available for training.
  """
  # Creating all possible combinations of hyperparameters
- all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
+ if 'param_grid' in hyperparameters:
+ grid = hyperparameters['param_grid']
+ else:
+ # AutoML style: full dict is hyperparameter space
+ grid = hyperparameters
+ all_combinations = list(product(*[v if isinstance(v, (list, tuple)) else [v] for v in grid.values()]))
  # Getting total number of models for each model model training function
  total_models = len(all_combinations)
  return total_models
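Note: with the change above, the parameter space can come either from a flat hyperparameter dict (the existing AutoML style) or from a nested `param_grid` (the new clustering style), and lists are now accepted alongside tuples. A self-contained re-implementation of that count, for illustration only:

```python
from itertools import product

def model_param_space(hyperparameters: dict) -> int:
    # Nested "param_grid" wins if present; otherwise the whole dict is the space.
    grid = hyperparameters["param_grid"] if "param_grid" in hyperparameters else hyperparameters
    axes = [v if isinstance(v, (list, tuple)) else [v] for v in grid.values()]
    return len(list(product(*axes)))

print(model_param_space({"param_grid": {"n_clusters": (2, 3, 4), "init": ("k-means++", "random")}}))  # 3 * 2 = 6
print(model_param_space({"max_depth": (3, 5, 7), "seed": 42}))                                        # 3 * 1 = 3
```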
@@ -279,21 +294,34 @@
  None
  """
  self._display_msg(msg="\nHyperparameters used for model training: ",
- progress_bar = self.progress_bar,
+ progress_bar=self.progress_bar,
  show_data=True)
  print(" " *150, end='\r', flush=True)

  # Iterating over hyperparameters_list
  for hyperparameter_dct in hyperparameters_list:
- # Extracting hyperparameter and their value from hyperparameters dictionary
- for key, val in hyperparameter_dct.items():
- # Displaying hyperparameters
- print(f"{key} : {str(val)}")
+ name = hyperparameter_dct.get("name", "Unnamed Model")
+ print(f"Model: {name}")

- # Displaying total number of models for each model
+ if self.cluster and "param_grid" in hyperparameter_dct:
+ # Also show metadata outside param_grid
+ for meta_key, meta_val in hyperparameter_dct.items():
+ if meta_key != "param_grid":
+ print(f"{meta_key}: {meta_val}")
+
+ print("Hyperparameter Grid:")
+ for key, val in hyperparameter_dct["param_grid"].items():
+ print(f" {key}: {val}")
+
+ else:
+ print("Hyperparameters:")
+ for key, val in hyperparameter_dct.items():
+ print(f" {key}: {val}")
+
  total_models = self._get_model_param_space(hyperparameter_dct)
- print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
- print(f"--"*100+'\n')
+
+ print(f"Total number of models for {name}: {total_models}")
+ print(f"--" * 100 + "\n")

  def _display_leaderboard(self,
  trained_models_info):
@@ -311,14 +339,20 @@
  pandas Dataframe.
  """
  # Creating a copy to avoid use of same reference of memory
- if self.task_type != "Regression":
- sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
- ascending=[False, False]).reset_index(drop=True)
+
+
+ if not self.cluster:
+ if self.task_type != "Regression":
+ sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
+ ascending=[False, False]).reset_index(drop=True)
+ else:
+ sorted_model_df = trained_models_info.sort_values(by='R2',
+ ascending=False).reset_index(drop=True)
  else:
- sorted_model_df = trained_models_info.sort_values(by='R2',
- ascending=False).reset_index(drop=True)
+ sorted_model_df = trained_models_info.sort_values(by=['SILHOUETTE', 'CALINSKI', 'DAVIES'],
+ ascending=[False, False, True]).reset_index(drop=True)
+

-
  # Adding rank to leaderboard
  sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)
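Note: the leaderboard sort is now task-dependent: MICRO-F1 then WEIGHTED-F1 (both descending) for classification, R2 (descending) for regression, and SILHOUETTE and CALINSKI descending with DAVIES ascending for clustering, since Davies-Bouldin is a lower-is-better index. A small pandas illustration with fabricated scores (not real results):

```python
import pandas as pd

# Fabricated clustering scores, purely to show the sort direction used above.
scores = pd.DataFrame({
    "MODEL_ID": ["KMEANS_0", "KMEANS_1", "GMM_0"],
    "SILHOUETTE": [0.41, 0.55, 0.55],
    "CALINSKI": [830.0, 910.0, 905.0],
    "DAVIES": [0.92, 0.71, 0.75],
})
leaderboard = scores.sort_values(by=["SILHOUETTE", "CALINSKI", "DAVIES"],
                                 ascending=[False, False, True]).reset_index(drop=True)
leaderboard.insert(0, "RANK", leaderboard.index + 1)
print(leaderboard)
```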

@@ -326,7 +360,7 @@
  dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]

  # Excluding the model object and model name from leaderboard
- leaderboard = sorted_model_df.drop(dp_lst, axis=1)
+ leaderboard = sorted_model_df.drop(columns=[col for col in dp_lst if col in sorted_model_df.columns])

  # filtering the rows based on the max_models
  if self.max_models is not None:
@@ -363,24 +397,42 @@
  """
  # Iterating over new hyperparameters and performing required operation
  # based on passed method ADD or REPLACE
- for feature, param_list in new_params.items():
- if feature in existing_params.keys():
- if param_list["Method"] == "ADD":
- # Extending existing list
- existing_params[feature] = list(existing_params[feature])
- existing_params[feature].extend(param_list["Value"])
- # Updating list with unique values.
- existing_params[feature]=tuple(set(existing_params[feature]))
- elif param_list["Method"] == "REPLACE":
- # Replacing with entirely new value
- existing_params[feature] = tuple(param_list["Value"])
+ if self.cluster:
+ # Clustering: use param_grid
+ param_grid = existing_params.get("param_grid", {})
+ for feature, param_list in new_params.items():
+ if feature in param_grid:
+ if param_list["Method"] == "ADD":
+ param_grid[feature] = list(param_grid[feature])
+ param_grid[feature].extend(param_list["Value"])
+ param_grid[feature] = tuple(set(param_grid[feature]))
+ elif param_list["Method"] == "REPLACE":
+ param_grid[feature] = tuple(param_list["Value"])
+ else:
+ self._display_msg(inline_msg="Passed method is not valid.")
  else:
- self._display_msg(inline_msg="Passed method is not valid.")
- else:
- self._display_msg(inline_msg="\nPassed model argument {} is not"
- "available for model {}. Skipping it."
- .format(feature,existing_params['name']))
- continue
+ param_grid[feature] = tuple(param_list["Value"])
+ existing_params["param_grid"] = param_grid
+
+ else:
+ for feature, param_list in new_params.items():
+ if feature in existing_params.keys():
+ if param_list["Method"] == "ADD":
+ # Extending existing list
+ existing_params[feature] = list(existing_params[feature])
+ existing_params[feature].extend(param_list["Value"])
+ # Updating list with unique values.
+ existing_params[feature]=tuple(set(existing_params[feature]))
+ elif param_list["Method"] == "REPLACE":
+ # Replacing with entirely new value
+ existing_params[feature] = tuple(param_list["Value"])
+ else:
+ self._display_msg(inline_msg="Passed method is not valid.")
+ else:
+ self._display_msg(inline_msg="\nPassed model argument {} is not"
+ " available for model {}. Skipping it."
+ .format(feature,existing_params['name']))
+ continue
  # Returning updated hyperparamter
  return existing_params
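Note: `_update_hyperparameters` keeps the old ADD/REPLACE semantics for supervised models and applies the same semantics inside `param_grid` for clustering, with one difference visible above: an unknown clustering key is added to the grid instead of being skipped with a warning. A simplified, standalone sketch of the two methods:

```python
# Sketch of the ADD/REPLACE merge semantics shown above, detached from AutoML state.
def update_param(grid: dict, feature: str, values, method: str) -> dict:
    if method == "ADD":
        merged = list(grid.get(feature, ())) + list(values)
        grid[feature] = tuple(set(merged))      # extend, then de-duplicate
    elif method == "REPLACE":
        grid[feature] = tuple(values)           # overwrite entirely
    else:
        print("Passed method is not valid.")
    return grid

grid = {"n_clusters": (2, 3, 4)}
print(update_param(grid, "n_clusters", [4, 5], "ADD"))     # union of old and new values
print(update_param(grid, "n_clusters", [8], "REPLACE"))    # only the new value remains
```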

@@ -422,13 +474,13 @@
  hyperparameters[model_index]=self._update_hyperparameters(hyperparameters[model_index],hyp_list)
  # Displaying it after update
  self._display_msg(inline_msg="\nCompleted customized hyperparameter update.",
- progress_bar=self.progress_bar)
+ progress_bar=self.progress_bar)
  else:
  self._display_msg(inline_msg="No information provided for custom hyperparameters. AutoML will proceed with default values.",
- progress_bar=self.progress_bar)
+ progress_bar=self.progress_bar)
  else:
  self._display_msg(inline_msg="\nSkipping customized hyperparameter tuning",
- progress_bar=self.progress_bar)
+ progress_bar=self.progress_bar)
  # Retunring updated hyperparameters for all models
  return hyperparameters
@@ -506,7 +558,7 @@
  'max_depth': tuple(max_depth),
  'min_node_size': tuple(min_node_size),
  'iter_num': tuple(iter_num),
- 'seed':self.seed
+ 'seed': self.seed
  }
  # Hyperparameters for Decision Forest model
  df_params = {
@@ -517,7 +569,7 @@
  'max_depth': tuple(max_depth),
  'min_node_size': tuple(min_node_size),
  'num_trees': tuple(num_trees),
- 'seed':self.seed
+ 'seed': self.seed
  }

  # Updating model type in case of classification
@@ -663,6 +715,47 @@
  else:
  return None

+ def _get_kmeans_hyperparameters(self):
+ """
+ DESCRIPTION:
+ Generates hyperparameters for KMeans clustering.
+
+ RETURNS:
+ dict containing hyperparameters for KMeans.
+ """
+ params = {
+ "name": "KMeans",
+ "param_grid": {
+ 'n_clusters': (2,3,4,5,6,7,8,9,10),
+ 'init': ('k-means++', 'random'),
+ 'n_init': (5, 10),
+ 'max_iter': (100, 200),
+ 'tol': (0.001, 0.01),
+ 'algorithm': ('auto', 'full')
+ }
+ }
+
+ return params
+
+ def _get_gmm_hyperparameters(self):
+ """
+ DESCRIPTION:
+ Generates hyperparameters for Gaussian Mixture Model (GMM).
+
+ RETURNS:
+ dict containing hyperparameters for GMM.
+ """
+ params = {
+ "name": "GaussianMixture",
+ "param_grid": {
+ "n_components": (2,3,4,5,6,7,8,9,10),
+ "covariance_type": ("full", "tied", "diag", "spherical"),
+ "max_iter": (100, 300)
+ }
+ }
+
+ return params
+
  def _generate_parameter(self):
  """
  DESCRIPTION:
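Note: the two new helpers above return fixed search grids rather than data-dependent ones. Their sizes matter once `max_models` is set, because RandomSearch later samples from these spaces: the KMeans grid has 9 x 2 x 2 x 2 x 2 x 2 = 288 combinations and the GMM grid 9 x 4 x 2 = 72. A quick check of those counts (sketch only):

```python
from itertools import product

kmeans_grid = {"n_clusters": range(2, 11), "init": ("k-means++", "random"),
               "n_init": (5, 10), "max_iter": (100, 200),
               "tol": (0.001, 0.01), "algorithm": ("auto", "full")}
gmm_grid = {"n_components": range(2, 11),
            "covariance_type": ("full", "tied", "diag", "spherical"),
            "max_iter": (100, 300)}

grid_size = lambda grid: len(list(product(*grid.values())))
print(grid_size(kmeans_grid))   # 288
print(grid_size(gmm_grid))      # 72
```

One caveat: recent scikit-learn releases renamed KMeans's `algorithm` options ('auto'/'full' gave way to 'lloyd'/'elkan'), so whether these grid values are accepted depends on the scikit-learn version backing `td_sklearn`.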
@@ -672,46 +765,54 @@
  list containing, dict of hyperparameters for different ML models.
  """
  # list for storing hyperparameters
- parameters=[]
+ parameters = []
  # Index for model mapping
- model_index=0
+ model_index = 0
  # Dictionary for mapping model with index
  self.model_mapping={}
-
- # Getting number of rows and columns
- num_rows = self.data.shape[0]
- num_cols = self.data.shape[1]
-
- # Updating model list for multi-class classification
- if self.task_type.casefold() == "classification" and self.labels > 2:
- for model in ['glm','svm']:
- if model in self.model_list:
- self._display_msg(inline_msg="\nMulti-class classification is "
- "not supported by {} model. Skipping {} model."
- .format(model, model),
- progress_bar=self.progress_bar)
- self.model_list.remove(model)
-
- # Model functions mapping for hyperparameter generation
- model_functions = {
- 'decision_forest': self._get_tree_model_hyperparameters,
- 'xgboost': self._get_tree_model_hyperparameters,
- 'knn': self._get_knn_hyperparameters,
- 'glm': self._get_linear_model_hyperparameters,
- 'svm': self._get_linear_model_hyperparameters,
- }
-
- # Generating hyperparameters for each model
- if self.model_list:
- for model in self.model_list:
- self.model_mapping[model] = model_index
- if model == 'knn':
- parameters.append(model_functions[model](num_rows, num_cols))
- else:
- parameters.append(model_functions[model](num_rows, num_cols, model))
- model_index += 1
+ if not self.cluster:
+ # Getting number of rows and columns
+ num_rows = self.data.shape[0]
+ num_cols = self.data.shape[1]
+
+ # Model functions mapping for hyperparameter generation
+ model_functions = {
+ 'decision_forest': self._get_tree_model_hyperparameters,
+ 'xgboost': self._get_tree_model_hyperparameters,
+ 'knn': self._get_knn_hyperparameters,
+ 'glm': self._get_linear_model_hyperparameters,
+ 'svm': self._get_linear_model_hyperparameters,
+ }
+
+ if not self.cluster:
+ supported_models = AutoMLConstants.SUPERVISED_MODELS.value
+ self.model_list = [model for model in self.model_list if model in supported_models]
+
+ # Generating hyperparameters for each model
+ if self.model_list:
+ for model in self.model_list:
+ self.model_mapping[model] = model_index
+ if model == 'knn':
+ parameters.append(model_functions[model](num_rows, num_cols))
+ else:
+ parameters.append(model_functions[model](num_rows, num_cols, model))
+ model_index += 1
+ else:
+ raise ValueError("No model is selected for training.")
  else:
- raise ValueError("No model is selected for training.")
+ model_functions = {
+ 'KMeans': self._get_kmeans_hyperparameters,
+ 'GaussianMixture': self._get_gmm_hyperparameters,
+ }
+ supported_models = AutoMLConstants.CLUSTERING_MODELS.value
+ self.model_list = [model for model in self.model_list if model in supported_models]
+ if self.model_list:
+ for model in self.model_list:
+ self.model_mapping[model] = model_index
+ parameters.append(model_functions[model]())
+ model_index += 1
+ else:
+ raise ValueError("No model is selected for training.")

  return parameters
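Note: `_generate_parameter` now filters `self.model_list` against `AutoMLConstants.SUPERVISED_MODELS` or `AutoMLConstants.CLUSTERING_MODELS` (the enum newly imported from `teradataml.common.constants`) before dispatching to the per-model grid builders. The member values of that enum are not visible in this hunk, so the sketch below uses assumed contents purely for illustration:

```python
from enum import Enum

# Hypothetical stand-in for teradataml.common.constants.AutoMLConstants;
# the real member values are not shown in this diff.
class AutoMLConstants(Enum):
    SUPERVISED_MODELS = ["decision_forest", "xgboost", "knn", "glm", "svm"]
    CLUSTERING_MODELS = ["KMeans", "GaussianMixture"]

def filter_models(model_list, cluster=False):
    allowed = (AutoMLConstants.CLUSTERING_MODELS if cluster
               else AutoMLConstants.SUPERVISED_MODELS).value
    kept = [m for m in model_list if m in allowed]
    if not kept:
        raise ValueError("No model is selected for training.")
    return kept

print(filter_models(["xgboost", "KMeans"]))                # ['xgboost']
print(filter_models(["xgboost", "KMeans"], cluster=True))  # ['KMeans']
```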

@@ -723,8 +824,12 @@
  RETURNS:
  dictionary containing max_models distribution and list of models to remove.
  """
+ if self.cluster:
+ models = [model for model in self.model_list if model in AutoMLConstants.CLUSTERING_MODELS.value]
+ else:
+ models = [model for model in self.model_list if model in AutoMLConstants.SUPERVISED_MODELS.value]
  # Getting total number of models
- model_count=len(self.model_list)
+ model_count = len(models)
  # Evenly distributing max_models across models
  base_assign = self.max_models // model_count
  # Creating list of max_models for each model
@@ -739,17 +844,20 @@
  distribution[i] += 1

  # Creating dictionary for model distribution
- model_distribution = dict(zip(self.model_list, distribution))
+ model_distribution = dict(zip(models, distribution))
  # Getting list of models with 0 distribution and removing them from model list
  # While for model having distribution greater than 0, updating distribution with
  # 1/3rd of original value as we are training with 3 different feature selection methods.
  models_to_remove = []
- for model in self.model_list:
- initial_count = model_distribution[model]
- if initial_count == 0:
- models_to_remove.append(model)
- else:
- model_distribution[model] = math.ceil(initial_count / 3)
+ if not self.cluster:
+ for model in models:
+ initial_count = model_distribution[model]
+ if initial_count == 0:
+ models_to_remove.append(model)
+ else:
+ model_distribution[model] = math.ceil(initial_count / 3)
+ else:
+ models_to_remove = [model for model, count in model_distribution.items() if count == 0]

  return model_distribution, models_to_remove
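Note: the distribution helper above splits `max_models` evenly across the surviving model families, gives the remainder to the first families, and, for supervised runs only, divides each share by 3 because every model is trained against the lasso/rfe/pca feature-selection views. A standalone sketch of that arithmetic (not the package's actual method signature):

```python
import math

def distribute_max_models(models, max_models, cluster=False):
    base, remainder = divmod(max_models, len(models))
    distribution = {m: base + (1 if i < remainder else 0) for i, m in enumerate(models)}
    models_to_remove = [m for m, count in distribution.items() if count == 0]
    if not cluster:
        # each supervised model trains on 3 feature-selection views
        distribution = {m: math.ceil(c / 3) for m, c in distribution.items() if c > 0}
    return distribution, models_to_remove

print(distribute_max_models(["decision_forest", "xgboost", "knn"], 10))
# ({'decision_forest': 2, 'xgboost': 1, 'knn': 1}, [])
print(distribute_max_models(["KMeans", "GaussianMixture"], 5, cluster=True))
# ({'KMeans': 3, 'GaussianMixture': 2}, [])
```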

@@ -768,22 +876,31 @@
  RETURNS:
  Pandas DataFrame containing, trained models information.
  """
-
+ self.model_id_counters = {}
  # Hyperparameters for each model
  model_params = parameters[:min(len(parameters), 5)]
  self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)

  # Defining training data
- data_types = ['lasso', 'rfe', 'pca']
- trainng_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)
+ if not self.cluster:
+ data_types = ['lasso', 'rfe', 'pca']
+ training_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)
+ else:
+ data_types = ['pca', 'non_pca']
+ training_datas = tuple(DataFrame(self.data_mapping[f'{data_type}_train']) for data_type in data_types)

- if self.task_type == "Classification":
- response_values = trainng_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
+
+
+ if self.task_type == "Classification" and not self.cluster:
+ response_values = training_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
  self.output_response = [str(i) for i in response_values]

  if self.stopping_metric is None:
- self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
- if self.is_classification_type() else 'R2'
+ if not self.cluster:
+ self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
+ if self.is_classification_type() else 'R2'
+ else:
+ self.stopping_tolerance, self.stopping_metric = 1.0, 'SILHOUETTE'

  self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
  if self.max_runtime_secs is not None else None
@@ -798,16 +915,17 @@
  # Updating progress bar as we are removing model
  self.progress_bar.update()

- if self.is_classification_type():
+ if self.is_classification_type() and not self.cluster:
  self.startify_col = self.target_column

  trained_models = []
+
  for param in model_params:
- result = self._hyperparameter_tunning(param, trainng_datas)
+ result = self._hyperparameter_tunning(param, training_datas)
  if result is not None:
  trained_models.append(result)
-
  models_df = pd.concat(trained_models, ignore_index=True)
+
  return models_df

  def _hyperparameter_tunning(self,
@@ -816,7 +934,7 @@
  """
  DESCRIPTION:
  Internal function performs hyperparameter tuning on
- ML models for regression/classification problems.
+ ML models for regression/classification/clustering problems.

  PARAMETERS:
  model_param
@@ -832,121 +950,196 @@
  RETURNS:
  pandas DataFrame containing, trained models information.
  """
- # Mapping model names to functions
- model_to_func = {"glm": GLM, "svm": SVM,
- "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}
+ # Passing verbose value based on user input
+ if self.verbose > 0:
+ print(" " *200, end='\r', flush=True)
+ verbose = 1
+ else:
+ verbose = 0
+
+ if not self.cluster:
+ # Mapping model names to functions
+ model_to_func = {"glm": GLM, "svm": SVM,
+ "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}

- # Setting eval_params for hpt.
- eval_params = _ModelTraining._eval_params_generation(model_param['name'],
- self.target_column,
- self.task_type)
+ # Setting eval_params for hpt.
+ eval_params = _ModelTraining._eval_params_generation(model_param['name'],
+ self.target_column,
+ self.task_type)

- # Input columns for model
- model_param['input_columns'] = self.features
+ # Input columns for model
+ model_param['input_columns'] = self.features

- # Setting persist for model
- model_param['persist'] = self.persist
+ # Setting persist for model
+ model_param['persist'] = self.persist

- self._display_msg(msg=model_param['name'],
- progress_bar=self.progress_bar,
- show_data=True)
-
- # As we are using entire data for HPT training. So,
- # passing prepared training data as test_data for KNN.
- if model_param['name'] == 'knn':
- model_param['test_data'] = train_data
+ self._display_msg(msg=model_param['name'],
+ progress_bar=self.progress_bar,
+ show_data=True)
+
+ # As we are using entire data for HPT training. So,
+ # passing prepared training data as test_data for KNN.
+ if model_param['name'] == 'knn':
+ model_param['test_data'] = train_data

- if self.task_type == "Classification":
- model_param['output_prob'] = True
- model_param['output_responses'] = self.output_response
+ if self.task_type == "Classification":
+ model_param['output_prob'] = True
+ model_param['output_responses'] = self.output_response

- # Using RandomSearch for hyperparameter tunning when max_models is given.
- # Otherwise, using GridSearch for hyperparameter tunning.
- if self.max_models is not None:
- # Setting max_models for RandomSearch based on model name
- model_param['max_models'] = self.max_models_distribution[model_param['name']]
- # Defining RandomSearch with ML model based on Name, and max_models
- _obj = RandomSearch(func=model_to_func[model_param['name']],
- params=model_param,
- n_iter=model_param['max_models'])
- else:
- # Defining Gridsearch with ML model based on Name
- _obj = GridSearch(func=model_to_func[model_param['name']],
- params=model_param)
-
- if self.verbose > 0:
- print(" " *200, end='\r', flush=True)
- verbose = 1
- else:
- verbose = 0
+ # Using RandomSearch for hyperparameter tunning when max_models is given.
+ # Otherwise, using GridSearch for hyperparameter tunning.
+ if self.max_models is not None:
+ # Setting max_models for RandomSearch based on model name
+ model_param['max_models'] = self.max_models_distribution[model_param['name']]
+ # Defining RandomSearch with ML model based on Name, and max_models
+ _obj = RandomSearch(func=model_to_func[model_param['name']],
+ params=model_param,
+ n_iter=model_param['max_models'])
+ else:
+ # Defining Gridsearch with ML model based on Name
+ _obj = GridSearch(func=model_to_func[model_param['name']],
+ params=model_param)
+
+ # Hyperparameter tunning
+ # Parallel run opens multiple connections for parallel execution,
+ # but volatile tables are not accessible across different sessions.
+ # Therefore, execution is performed sequentially by setting run_parallel=False.
+
+ run_parallel = configure.temp_object_type != TeradataConstants.TERADATA_VOLATILE_TABLE
+
+ common_params = {
+ "data": train_data,
+ "evaluation_metric": self.stopping_metric,
+ "early_stop": self.stopping_tolerance,
+ "run_parallel": run_parallel,
+ "sample_seed": self.seed,
+ "sample_id_column": "id",
+ "discard_invalid_column_params": True,
+ "stratify_column": self.startify_col,
+ "verbose": verbose,
+ "max_time": self.max_runtime_secs,
+ "suppress_refer_msg": True
+ }

- # Hyperparameter tunning
- # Parallel run opens multiple connections for parallel execution,
- # but volatile tables are not accessible across different sessions.
- # Therefore, execution is performed sequentially by setting run_parallel=False.
-
- run_parallel = configure.temp_object_type != TeradataConstants.TERADATA_VOLATILE_TABLE
-
- common_params = {
- "data": train_data,
- "evaluation_metric": self.stopping_metric,
- "early_stop": self.stopping_tolerance,
- "run_parallel": run_parallel,
- "sample_seed": self.seed,
- "sample_id_column": "id",
- "discard_invalid_column_params": True,
- "stratify_column": self.startify_col,
- "verbose": verbose,
- "max_time": self.max_runtime_secs,
- "suppress_refer_msg": True
- }
-
- if model_param['name'] == 'knn':
- _obj.fit(**common_params)
+ if model_param['name'] == 'knn':
+ _obj.fit(**common_params)
+ else:
+ _obj.fit(**common_params, **eval_params)
+
+ # Getting all passed models
+ model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+ on='MODEL_ID', how='inner')
+ if not model_info.empty:
+ # Creating mapping data ID to feature selection method
+ data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
+ "DF_1": ('rfe', train_data[1]._table_name),
+ "DF_2": ('pca', train_data[2]._table_name)}
+
+ # Updating model stats with feature selection method and result table
+ for index, row in model_info.iterrows():
+ model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+ model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+ model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
+ model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+
+ # Dropping column 'DATA_ID'
+ model_info.drop(['DATA_ID'], axis=1, inplace=True)
+
+ model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+ if not self.is_classification_type():
+ # Calculating Adjusted-R2 for regression
+ # Getting size and feature count for each feature selection method
+ methods = ["lasso", "rfe", "pca"]
+ size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
+ feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
+ model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
+ 1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
+ (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
+
+ self._display_msg(msg="-"*100,
+ progress_bar=self.progress_bar,
+ show_data=True)
+ self.progress_bar.update()
+
+ return model_info
+ # Returning None, if no model is passed
+ return None
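Note: for regression candidates the supervised branch above derives ADJUSTED_R2 with the usual formula Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the row count of the feature-selection view and p its predictor count (`len(df.columns) - 2`, excluding the id and target columns). A small numeric check:

```python
# Worked example of the Adjusted R-squared formula applied above.
def adjusted_r2(r2: float, n_rows: int, n_features: int) -> float:
    return 1 - ((1 - r2) * (n_rows - 1) / (n_rows - n_features - 1))

# e.g. R2 = 0.90 on 1000 rows with 15 predictors
print(round(adjusted_r2(0.90, 1000, 15), 5))   # 0.89848
```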
  else:
- _obj.fit(**common_params, **eval_params)
-
- # Getting all passed models
- model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
- on='MODEL_ID', how='inner')
- if not model_info.empty:
- # Creating mapping data ID to feature selection method
- data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
- "DF_1": ('rfe', train_data[1]._table_name),
- "DF_2": ('pca', train_data[2]._table_name)}
+ import time
+ from teradataml import td_sklearn as skl
+

- # Updating model stats with feature selection method and result table
- for index, row in model_info.iterrows():
- model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
- model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
- model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
- model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+ model_name = model_param['name']

- # Dropping column 'DATA_ID'
- model_info.drop(['DATA_ID'], axis=1, inplace=True)

- model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+ self._display_msg(msg=model_name,
+ progress_bar=self.progress_bar, show_data=True)
+
+ if model_name == "KMeans":
+ model_func = skl.KMeans()
+ param_key = "n_clusters"
+ pred_col = "kmeans_predict_1"
+ elif model_name == "GaussianMixture":
+ model_func = skl.GaussianMixture()
+ param_key = "n_components"
+ pred_col = "gaussianmixture_predict_1"
+ else:
+ raise ValueError(f"Unsupported model: {model_name}")
+
+ model_param["input_columns"] = self.features
+ model_param["persist"] = self.persist

- if not self.is_classification_type():
- # Calculating Adjusted-R2 for regression
- # Getting size and feature count for each feature selection method
- methods = ["lasso", "rfe", "pca"]
- size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
- feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
- model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
- 1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
- (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
-
- self._display_msg(msg="-"*100,
- progress_bar=self.progress_bar,
- show_data=True)
- self.progress_bar.update()
-
- return model_info
-
- # Returning None, if no model is passed
- return None
+ if self.max_models is not None:
+ model_param['max_models'] = self.max_models_distribution[model_name]
+
+ search_obj = RandomSearch(func=model_func,
+ params=model_param['param_grid'],
+ n_iter=model_param['max_models'])
+ else:
+ search_obj = GridSearch(func=model_func, params=model_param["param_grid"])
+
+ search_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
+ early_stop=self.stopping_tolerance, run_parallel=True,
+ sample_seed=self.seed, verbose=verbose, max_time=self.max_runtime_secs)
+
+ model_df = search_obj.models[search_obj.models["STATUS"] == "PASS"]
+ if model_df.empty:
+ print("No models passed. Exiting.")
+ self.progress_bar.update()
+ return None
+
+ model_stats = search_obj.model_stats
+ model_info = model_stats.merge(model_df[['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+ on="MODEL_ID", how="inner")
+
+ if not model_info.empty:
+ # Creating mapping data ID to feature selection method
+ data_id_to_table_map = {"DF_0": ('pca', train_data[1]._table_name),
+ "DF_1": ('non_pca', train_data[0]._table_name)}
+
+ # Updating model stats with feature selection method and result table
+ for index, row in model_info.iterrows():
+ model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+ model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+ model_info.loc[index, 'model-obj'] = search_obj.get_model(row['MODEL_ID'])
+
+ # Dropping column 'DATA_ID'
+ model_info.drop(['DATA_ID'], axis=1, inplace=True)

+ model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+
+ self._display_msg(msg="-"*100,
+ progress_bar=self.progress_bar,
+ show_data=True)
+ self.progress_bar.update()
+
+ return model_info
+
+ return None
+
+
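Note: the clustering branch above replaces the in-database SQLE estimators with teradataml's scikit-learn interface (`td_sklearn`) while reusing the same GridSearch/RandomSearch machinery over `param_grid`, scored by SILHOUETTE by default. Reduced to its essentials, the call pattern looks roughly like the sketch below; the table name is a placeholder and this has not been verified against a live Vantage system:

```python
# Sketch of the clustering search flow introduced above. Assumes an active
# teradataml connection and an existing prepared table; not runnable offline.
from teradataml import DataFrame, GridSearch, td_sklearn as skl

train = DataFrame("automl_pca_train")   # hypothetical table name
param_grid = {"n_clusters": (2, 3, 4), "init": ("k-means++", "random")}

search = GridSearch(func=skl.KMeans(), params=param_grid)
search.fit(data=train, evaluation_metric="SILHOUETTE", verbose=1)

passed = search.models[search.models["STATUS"] == "PASS"]
best = search.get_model(passed.iloc[0]["MODEL_ID"]) if not passed.empty else None
```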
  @staticmethod
  def _eval_params_generation(ml_name,
  target_column,
@@ -980,7 +1173,7 @@
  """
  # Setting the eval_params
  eval_params = {"id_column": "id",
- "accumulate": target_column}
+ "accumulate": target_column}

  model_type = {
  'xgboost': 'model_type',
@@ -1013,4 +1206,4 @@
  elif ml_name == 'glm':
  eval_params['family'] = 'GAUSSIAN'

- return eval_params
+ return eval_params
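Note: the final two hunks appear to be whitespace-only, but they do show the fixed keys every evaluation call receives (`id_column`, `accumulate`) and the GAUSSIAN family used for GLM. A partial sketch covering only the behaviour visible in these hunks (other model-specific keys set in the unchanged part of `_eval_params_generation` are not reproduced):

```python
# Partial sketch: only the pieces of _eval_params_generation visible in this diff.
def eval_params_generation(ml_name: str, target_column: str) -> dict:
    eval_params = {"id_column": "id", "accumulate": target_column}
    if ml_name == "glm":
        eval_params["family"] = "GAUSSIAN"   # regression default shown above
    return eval_params

print(eval_params_generation("glm", "price"))
```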