teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (108)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +71 -0
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +51 -24
  6. teradataml/analytics/json_parser/utils.py +11 -17
  7. teradataml/automl/__init__.py +103 -48
  8. teradataml/automl/data_preparation.py +55 -37
  9. teradataml/automl/data_transformation.py +131 -69
  10. teradataml/automl/feature_engineering.py +117 -185
  11. teradataml/automl/feature_exploration.py +9 -2
  12. teradataml/automl/model_evaluation.py +13 -25
  13. teradataml/automl/model_training.py +214 -75
  14. teradataml/catalog/model_cataloging_utils.py +1 -1
  15. teradataml/clients/auth_client.py +133 -0
  16. teradataml/common/aed_utils.py +3 -2
  17. teradataml/common/constants.py +11 -6
  18. teradataml/common/garbagecollector.py +5 -0
  19. teradataml/common/messagecodes.py +3 -1
  20. teradataml/common/messages.py +2 -1
  21. teradataml/common/utils.py +6 -0
  22. teradataml/context/context.py +49 -29
  23. teradataml/data/advertising.csv +201 -0
  24. teradataml/data/bank_marketing.csv +11163 -0
  25. teradataml/data/bike_sharing.csv +732 -0
  26. teradataml/data/boston2cols.csv +721 -0
  27. teradataml/data/breast_cancer.csv +570 -0
  28. teradataml/data/customer_segmentation_test.csv +2628 -0
  29. teradataml/data/customer_segmentation_train.csv +8069 -0
  30. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  31. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  32. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  33. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  34. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  35. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  36. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  37. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  38. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  39. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  40. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  41. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  42. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  43. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  44. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  45. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  46. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  47. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  48. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  49. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  50. teradataml/data/glm_example.json +28 -1
  51. teradataml/data/housing_train_segment.csv +201 -0
  52. teradataml/data/insect2Cols.csv +61 -0
  53. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  54. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  55. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  56. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  57. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  58. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  59. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  60. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  61. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  62. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  63. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  64. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  65. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  66. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  67. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  68. teradataml/data/kmeans_example.json +5 -0
  69. teradataml/data/kmeans_table.csv +10 -0
  70. teradataml/data/onehot_encoder_train.csv +4 -0
  71. teradataml/data/openml_example.json +29 -0
  72. teradataml/data/scale_attributes.csv +3 -0
  73. teradataml/data/scale_example.json +52 -1
  74. teradataml/data/scale_input_part_sparse.csv +31 -0
  75. teradataml/data/scale_input_partitioned.csv +16 -0
  76. teradataml/data/scale_input_sparse.csv +11 -0
  77. teradataml/data/scale_parameters.csv +3 -0
  78. teradataml/data/scripts/deploy_script.py +20 -1
  79. teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
  80. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
  81. teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
  82. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  83. teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
  84. teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
  85. teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
  86. teradataml/data/teradataml_example.json +77 -0
  87. teradataml/data/ztest_example.json +16 -0
  88. teradataml/dataframe/copy_to.py +8 -3
  89. teradataml/dataframe/data_transfer.py +120 -61
  90. teradataml/dataframe/dataframe.py +102 -17
  91. teradataml/dataframe/dataframe_utils.py +47 -9
  92. teradataml/dataframe/fastload.py +272 -89
  93. teradataml/dataframe/sql.py +84 -0
  94. teradataml/dbutils/dbutils.py +2 -2
  95. teradataml/lib/aed_0_1.dll +0 -0
  96. teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
  97. teradataml/options/__init__.py +13 -4
  98. teradataml/options/configure.py +27 -6
  99. teradataml/scriptmgmt/UserEnv.py +19 -16
  100. teradataml/scriptmgmt/lls_utils.py +117 -14
  101. teradataml/table_operators/Script.py +2 -3
  102. teradataml/table_operators/TableOperator.py +58 -10
  103. teradataml/utils/validators.py +40 -2
  104. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
  105. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
  106. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
  107. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
  108. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0

teradataml/automl/model_training.py

@@ -16,6 +16,7 @@
 # Python libraries
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
+import math
 import pandas as pd
 from itertools import product

@@ -24,7 +25,8 @@ from teradataml.context import context as tdmlctx
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml import execute_sql, get_connection
-from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN
+from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
+from teradataml.utils.validators import _Validators


 class _ModelTraining:
@@ -49,12 +51,12 @@ class _ModelTraining:
             Types: teradataml Dataframe

         target_column:
-            Required Arugment.
+            Required Argument.
             Specifies the target column present inside the dataset.
             Types: str

         model_list:
-            Required Arugment.
+            Required Argument.
             Specifies the list of models to be used for model training.
             Types: list

@@ -70,13 +72,13 @@ class _ModelTraining:
             Types: int

         features:
-            Required Arugment.
+            Required Argument.
             Specifies the list of selected feature by rfe, lasso and pca
             respectively in this order.
             Types: list of list of strings (str)

         task_type:
-            Required Arugment.
+            Required Argument.
             Specifies the task type for AutoML, whether to apply regresion
             or classification on the provived dataset.
             Default Value: "Regression"
@@ -84,7 +86,7 @@ class _ModelTraining:
             Types: str

         custom_data:
-            Optional Arugment.
+            Optional Argument.
             Specifies json object containing user customized input.
             Types: json object
         """
@@ -96,12 +98,14 @@ class _ModelTraining:
         self.task_type = task_type
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
+        self.startify_col = None

     def model_training(self,
                        auto=True,
                        max_runtime_secs=None,
                        stopping_metric=None,
-                       stopping_tolerance=0
+                       stopping_tolerance=0,
+                       max_models=None
                        ):
         """
         DESCRIPTION:
@@ -112,14 +116,14 @@ class _ModelTraining:

         PARAMETERS:
             auto:
-                Optional Arugment.
+                Optional Argument.
                 Specifies whether to run data preparation in auto mode or custom mode.
                 When set to True, runs automtically otherwise, it take user inputs.
                 Default Value: True
                 Types: boolean

             max_runtime_secs:
-                Optional Arugment.
+                Optional Argument.
                 Specifies the time limit in seconds for model training.
                 Types: int

@@ -132,6 +136,11 @@ class _ModelTraining:
                 Required, when "stopping_metric" is set, otherwise optional.
                 Specifies the stopping tolerance for stopping metrics in model training.
                 Types: float
+
+            max_models:
+                Optional Argument.
+                Specifies the maximum number of models to be trained.
+                Types: int

         RETURNS:
             pandas dataframes containing model information, leaderboard and target
@@ -140,6 +149,7 @@ class _ModelTraining:
         self.stopping_metric = stopping_metric
         self.stopping_tolerance = stopping_tolerance
         self.max_runtime_secs = max_runtime_secs
+        self.max_models = max_models

         self._display_heading(phase=3, progress_bar=self.progress_bar)
         self._display_msg(msg='Model Training started ...',
@@ -152,6 +162,10 @@ class _ModelTraining:
         if not auto:
             parameters = self._custom_hyperparameters(parameters)

+        # Validates the upper limit of max_models based on total model combinations
+        if self.max_models is not None:
+            self._validate_upper_limit_for_max_models(parameters)
+
         if self.verbose == 2:
             self._display_hyperparameters(parameters)

@@ -167,6 +181,54 @@ class _ModelTraining:

         return models, leader_board, self.labels

+    def _get_model_param_space(self,
+                               hyperparameters):
+        """
+        DESCRIPTION:
+            Internal function to calculate the total number of models to be trained for specific model.
+
+        PARAMETERS:
+            hyperparameters:
+                Required Argument.
+                Specifies the hyperparameters availables for ML model.
+                Types: list of dict
+
+        RETURNS:
+            int containing, total number of models available for training.
+        """
+        # Creating all possible combinations of hyperparameters
+        all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
+        # Getting total number of models for each model model training function
+        total_models = len(all_combinations)
+        return total_models
+
+    def _validate_upper_limit_for_max_models(self,
+                                             hyperparameters_list):
+        """
+        DESCRIPTION:
+            Internal function to validate the upper limit of max_models.
+
+        PARAMETERS:
+            hyperparameters_list:
+                Required Argument.
+                Specifies the hyperparameters for different ML models.
+                Types: list of dict
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, ValueError
+        """
+        model_param_space = 0
+        for hyperparameter_dct in hyperparameters_list:
+            # getting total number of models for each model
+            total_models = self._get_model_param_space(hyperparameter_dct)
+            model_param_space += total_models
+
+        # Validating upper range for max_models
+        _Validators._validate_argument_range(self.max_models, "max_models", ubound=model_param_space, ubound_inclusive=True)
+
     def _display_hyperparameters(self,
                                  hyperparameters_list):
         """
@@ -175,7 +237,7 @@
         PARAMETERS:
             hyperparameters_list:
-                Required Arugment.
+                Required Argument.
                 Specifies the hyperparameters for different ML models.
                 Types: list of dict

@@ -189,16 +251,13 @@
         # Iterating over hyperparameters_list
         for hyperparameter_dct in hyperparameters_list:
-            # Extracting hyperparameter and thier value from hyperparameters dictionary
+            # Extracting hyperparameter and their value from hyperparameters dictionary
             for key, val in hyperparameter_dct.items():
                 # Displaying hyperparameters
                 print(f"{key} : {str(val)}")

-            # Creating all possible combinations of hyperparameters
-            all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameter_dct.values()]))
-
             # Displaying total number of models for each model
-            total_models = len(all_combinations)
+            total_models = self._get_model_param_space(hyperparameter_dct)
             print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
             print(f"--"*100+'\n')

@@ -210,7 +269,7 @@
         PARAMETERS:
             trained_models_info:
-                Required Arugment.
+                Required Argument.
                 Specifies the trained models inforamtion to display.
                 Types: pandas Dataframe

@@ -227,10 +286,12 @@
         # Adding rank to leaderboard
         sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
-
-        # Assuming 'sorted_df' is your DataFrame
-        # Excluding the "last_col"
-        leaderboard = sorted_model_df.drop("model-obj", axis=1)
+
+        # Excluding the model object and model name from leaderboard
+        leaderboard = sorted_model_df.drop(["model-obj","Name"], axis=1)
+        # filtering the rows based on the max_models
+        if self.max_models is not None:
+            leaderboard = leaderboard[leaderboard["Rank"] <= self.max_models]

         self._display_msg(msg="Leaderboard",
                           progress_bar=self.progress_bar,
@@ -343,12 +404,12 @@
         PARAMETERS:
             num_rows:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int

             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int

@@ -409,7 +470,8 @@
             'shrinkage_factor': tuple(shrinkage_factor),
             'max_depth': tuple(max_depth),
             'min_node_size': tuple(min_node_size),
-            'iter_num': tuple(iter_num)
+            'iter_num': tuple(iter_num),
+            'seed':42
         }
         # Hyperparameters for Decision Forest model
         df_params = {
@@ -419,7 +481,8 @@
             'min_impurity': tuple(min_impurity),
             'max_depth': tuple(max_depth),
             'min_node_size': tuple(min_node_size),
-            'num_trees': tuple(num_trees)
+            'num_trees': tuple(num_trees),
+            'seed':42
         }

         # Updating model type in case of classification
@@ -445,12 +508,12 @@
         PARAMETERS:
             num_rows
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int

             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int

@@ -482,12 +545,12 @@
         PARAMETERS:
             num_rows:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of rows in dataset.
                 Types: int

             num_cols:
-                Required Arugment.
+                Required Argument.
                 Specifies the number of columns in dataset.
                 Types: int

@@ -616,6 +679,44 @@
             raise ValueError("No model is selected for training.")

         return parameters
+
+    def distribute_max_models(self):
+        """
+        DESCRIPTION:
+            Internal function to distribute max_models across available model functions.
+
+        RETURNS:
+            dictionary containing max_models distribution and list of models to remove.
+        """
+        # Getting total number of models
+        model_count=len(self.model_list)
+        # Evenly distributing max_models across models
+        base_assign = self.max_models // model_count
+        # Creating list of max_models for each model
+        distribution = [base_assign] * model_count
+
+        # Calculating remaining models
+        remaining_model_count = self.max_models % model_count
+        if remaining_model_count:
+            # distributing remaining model across models.
+            # Starting from first model in list and distributing remaining models by 1 each.
+            for i in range(remaining_model_count):
+                distribution[i] += 1
+
+        # Creating dictionary for model distribution
+        model_distribution = dict(zip(self.model_list, distribution))
+        # Getting list of models with 0 distribution and removing them from model list
+        # While for model having distribution greater than 0, updating distribution with
+        # 1/3rd of original value as we are training with 3 different feature selection methods.
+        models_to_remove = []
+        for model in self.model_list:
+            initial_count = model_distribution[model]
+            if initial_count == 0:
+                models_to_remove.append(model)
+            else:
+                model_distribution[model] = math.ceil(initial_count / 3)
+
+        return model_distribution, models_to_remove

     def _parallel_training(self, parameters):
         """
@@ -648,6 +749,19 @@
         self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
             if self.max_runtime_secs is not None else None
+
+        if self.max_models is not None:
+            # Getting model distribution and models to remove
+            self.max_models_distribution, models_to_remove = self.distribute_max_models()
+            # Removing model parameters with 0 distribution
+            if len(models_to_remove):
+                for model in models_to_remove:
+                    model_params = [param for param in model_params if param['name'] != model]
+                    # Updating progress bar as we are removing model
+                    self.progress_bar.update()
+
+        if self.is_classification_type():
+            self.startify_col = self.target_column

         trained_models = []
         for param in model_params:
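
For classification tasks _parallel_training now records the target column in self.startify_col and forwards it to fit() as stratify_column, so the tuner's internal train/validation sampling preserves class proportions. The effect of stratification, sketched with plain pandas rather than the package's sampler:

    import pandas as pd

    df = pd.DataFrame({'id': range(10),
                       'label': ['a'] * 8 + ['b'] * 2})

    # Sampling 80% within each class keeps the 8:2 ratio in the split.
    train = df.groupby('label', group_keys=False).sample(frac=0.8, random_state=42)
    print(train['label'].value_counts().to_dict())   # {'a': 6, 'b': 2}
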
@@ -677,12 +791,12 @@
             Types: tuple of Teradataml DataFrame

             model_info
-                Required Arugment.
+                Required Argument.
                 Specifies the trained models information.
                 Types: Pandas DataFrame

         RETURNS:
-            Pandas DataFrame containing, trained models with thier performance metrics.
+            Pandas DataFrame containing, trained models with their performance metrics.
         """
         self._display_msg(msg="Evaluating models performance ...",
                           progress_bar = self.progress_bar,
@@ -697,9 +811,9 @@
         # Iterating over models
         for index, model_row in model_info.iterrows():
-            # Extracting model name, feature selection method, and model object
-            model_name, feature_selection, model_object = model_row['Name'], \
-                model_row['Feature selection'], model_row['obj']
+            # Extracting model name, model id, feature selection method, and model object
+            model_name, model_id, feature_selection, model_object = model_row['Name'], \
+                model_row['Model-ID'], model_row['Feature-Selection'], model_row['obj']

             # Selecting test data based on feature selection method
             test_set = feature_selection_to_test_data[feature_selection]
@@ -708,7 +822,9 @@
             if model_name == 'knn':
                 performance_metrics = model_object.evaluate(test_data=test_set)
             else:
-                eval_params = self._eval_params_generation(model_name)
+                eval_params = _ModelTraining._eval_params_generation(model_name,
+                                                                     self.target_column,
+                                                                     self.task_type)
                 performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)

             # Extracting performance metrics
@@ -718,7 +834,7 @@
                 performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]

                 # Combine all the elements to form a new row
-                new_row = [model_name, feature_selection] + performance_metrics_list + [model_object]
+                new_row = [model_name, model_id, feature_selection] + performance_metrics_list + [model_object]
             else:
                 # Regression
                 regression_metrics = next(performance_metrics.result.itertuples())
@@ -726,22 +842,23 @@
                 feature_count = len(test_set.columns) - 2
                 r2_score = regression_metrics[8]
                 adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
-                new_row = [model_name, feature_selection, regression_metrics[0], regression_metrics[1], regression_metrics[2],
-                           regression_metrics[5], regression_metrics[6], r2_score, adjusted_r2_score, model_object]
+                new_row = [model_name, model_id, feature_selection, regression_metrics[0],
+                           regression_metrics[1], regression_metrics[2], regression_metrics[5],
+                           regression_metrics[6], r2_score, adjusted_r2_score, model_object]

             model_performance_data.append(new_row)

         if self.is_classification_type():
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Feature selection',
-                                                                             'Accuracy','Micro-Precision',
+            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Model-ID',
+                                                                             'Feature-Selection','Accuracy','Micro-Precision',
                                                                              'Micro-Recall','Micro-F1',
                                                                              'Macro-Precision','Macro-Recall',
                                                                              'Macro-F1','Weighted-Precision',
                                                                              'Weighted-Recall','Weighted-F1',
                                                                              'model-obj'])
         else:
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name',
-                                                                             'Feature selection',
+            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name', 'Model-ID',
+                                                                             'Feature-Selection',
                                                                              'MAE', 'MSE', 'MSLE',
                                                                              'RMSE', 'RMSLE',
                                                                              'R2-score',
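
The adjusted R-squared computed above applies the standard small-sample correction: adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the test-set row count and p the feature count (here len(test_set.columns) - 2, which appears to exclude the id and target columns). A quick numeric check:

    def adjusted_r2(r2, n, p):
        # Penalizes R2 for the number of predictors p at sample size n.
        return 1 - (1 - r2) * (n - 1) / (n - p - 1)

    print(round(adjusted_r2(0.90, 101, 10), 4))   # 0.8889
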
@@ -764,12 +881,12 @@
         PARAMETERS:
             model_param
-                Required Arugment.
+                Required Argument.
                 Specifies the eval_params argument for GridSearch.
                 Types: dict

             train_data:
-                Required Arugment.
+                Required Argument.
                 Specifies the training datasets.
                 Types: tuple of Teradataml DataFrame

@@ -786,7 +903,9 @@
             "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}

         # Setting eval_params for hpt.
-        eval_params = self._eval_params_generation(model_param['name'])
+        eval_params = _ModelTraining._eval_params_generation(model_param['name'],
+                                                             self.target_column,
+                                                             self.task_type)

         # Input columns for model
         model_param['input_columns'] = self.features
@@ -799,8 +918,19 @@
         if model_param['name'] == 'knn':
             model_param['test_data'] = test_data

-        # Defining Gridsearch with ML model based on Name
-        _obj = GridSearch(func=model_to_func[model_param['name']], params=model_param)
+        # Using RandomSearch for hyperparameter tunning when max_models is given.
+        # Otherwise, using GridSearch for hyperparameter tunning.
+        if self.max_models is not None:
+            # Setting max_models for RandomSearch based on model name
+            model_param['max_models'] = self.max_models_distribution[model_param['name']]
+            # Defining RandomSearch with ML model based on Name, and max_models
+            _obj = RandomSearch(func=model_to_func[model_param['name']],
+                                params=model_param,
+                                n_iter=model_param['max_models'])
+        else:
+            # Defining Gridsearch with ML model based on Name
+            _obj = GridSearch(func=model_to_func[model_param['name']],
+                              params=model_param)

         if self.verbose > 0:
             print(" " *200, end='\r', flush=True)
@@ -813,46 +943,39 @@
             _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
                      early_stop=self.stopping_tolerance, run_parallel=True,
                      sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
-                     verbose=verbose, max_time=self.max_runtime_secs)
+                     stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
         else:
             _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
                      early_stop=self.stopping_tolerance, **eval_params,
                      run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
-                     sample_id_column='id', verbose=verbose, max_time=self.max_runtime_secs)
+                     sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)

         # Getting all passed models
         _df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
+        # Creating mapping data ID to feature selection method
+        data_id_to_method_map = {"DF_0": "lasso", "DF_1": "rfe", "DF_2": "pca"}
+
+        # Mapping data ID to feature selection method
+        _df['Feature-Selection'] = _df['DATA_ID'].map(data_id_to_method_map)
+        # Getting model details
+        _df['Name'] = model_param['name']
+        _df['Model-ID'] = _df['MODEL_ID']
+        _df['obj'] = _df['MODEL_ID'].apply(lambda x: _obj.get_model(x))
+
+        # Extracting needed columns
+        model_info = _df[["Name", "Model-ID", "Feature-Selection", "obj"]]

-        # Mapping data ID to DataFrame
-        data_id_to_df = {"DF_0": _df[_df['DATA_ID']=='DF_0'],
-                         "DF_1": _df[_df['DATA_ID']=='DF_1'],
-                         "DF_2": _df[_df['DATA_ID']=='DF_2']}
-
-        # Returns best model within a Data_ID group
-        # get_best_model = lambda df: df.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'], ascending=[False, False]).iloc[0]['MODEL_ID']\
-        #     if self.task_type != 'Regression' else df.sort_values(by=['R2', 'MAE'], ascending=[False, False]).iloc[0]['MODEL_ID']
-        get_best_model = lambda df, stats: df.sort_values(by=stats, ascending=[False, False]).iloc[0]['MODEL_ID']
-
-        # best_model = get_best_model(data_id_to_df[data_id], stats)
-        stats = ['MICRO-F1', 'WEIGHTED-F1'] if self.task_type != 'Regression' else ['R2', 'MAE']
-        model_info_data = []
-        # Extracting best model
-        for data_id, df_name in zip(["DF_0", "DF_1", "DF_2"], ["lasso", "rfe", "pca"]):
-            if not data_id_to_df[data_id].empty:
-                best_model = get_best_model(data_id_to_df[data_id], stats)
-                model_info_data.append([model_param['name'], df_name, _obj.get_model(best_model)])
-                self._display_msg(inline_msg=best_model, progress_bar=self.progress_bar)
-
-        model_info = pd.DataFrame(data=model_info_data, columns=["Name",'Feature selection', "obj"])
         self._display_msg(msg="-"*100,
                           progress_bar=self.progress_bar,
                           show_data=True)
         self.progress_bar.update()

         return model_info
-
-    def _eval_params_generation(self,
-                                ml_name):
+
+    @staticmethod
+    def _eval_params_generation(ml_name,
+                                target_column,
+                                task_type):
         """
         DESCRIPTION:
             Internal function generates the eval_params for
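
The reworked block above no longer picks a single best model per feature-selection dataset; it keeps every model with STATUS 'PASS' and labels each one by mapping the tuner's DATA_ID back to the feature-selection method that produced its training data. The shape of that transformation in plain pandas (toy frames standing in for _obj.model_stats and _obj.models):

    import pandas as pd

    model_stats = pd.DataFrame({'MODEL_ID': ['XGBOOST_0', 'XGBOOST_1', 'XGBOOST_2'],
                                'R2': [0.81, 0.77, 0.84]})
    models = pd.DataFrame({'MODEL_ID': ['XGBOOST_0', 'XGBOOST_1', 'XGBOOST_2'],
                           'STATUS': ['PASS', 'FAIL', 'PASS'],
                           'DATA_ID': ['DF_0', 'DF_1', 'DF_2']})

    # Keep passing models, then translate DATA_ID to its selection method.
    _df = model_stats.merge(models[models['STATUS'] == 'PASS'][['MODEL_ID', 'DATA_ID']],
                            on='MODEL_ID', how='inner')
    _df['Feature-Selection'] = _df['DATA_ID'].map({'DF_0': 'lasso',
                                                   'DF_1': 'rfe',
                                                   'DF_2': 'pca'})
    print(_df[['MODEL_ID', 'Feature-Selection']])
    #     MODEL_ID Feature-Selection
    # 0  XGBOOST_0             lasso
    # 1  XGBOOST_2               pca
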
@@ -860,23 +983,39 @@
         PARAMETERS:
             ml_name
-                Required Arugment.
+                Required Argument.
                 Specifies the ML name for eval_params generation.
                 Types: str
+
+            target_column
+                Required Argument.
+                Specifies the target column.
+                Types: str

+            task_type:
+                Required Argument.
+                Specifies the task type for AutoML, whether to apply regresion
+                or classification on the provived dataset.
+                Default Value: "Regression"
+                Permitted Values: "Regression", "Classification"
+                Types: str
+
         RETURNS:
             dict containing, eval_params for ML model.
         """
         # Setting the eval_params
         eval_params = {"id_column": "id",
-                       "accumulate": self.target_column}
+                       "accumulate": target_column}

         # For Classification
-        if self.task_type != "Regression":
+        if task_type.lower() != "regression":
             if ml_name == 'xgboost':
                 eval_params['model_type'] = 'Classification'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
             else:
+                if ml_name == 'glm':
+                    eval_params['family'] = 'BINOMIAL'
+
                 eval_params['output_prob'] = True
         else:
             # For Regression
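
Making _eval_params_generation a @staticmethod works because, after this change, the result depends only on its three arguments. Tracing the classification branch for a hypothetical call with ml_name='glm', target_column='churn', task_type='Classification':

    eval_params = {'id_column': 'id', 'accumulate': 'churn'}
    # task_type.lower() != 'regression' and ml_name is not 'xgboost',
    # so the GLM-specific family plus output probabilities are added:
    eval_params['family'] = 'BINOMIAL'
    eval_params['output_prob'] = True
    # -> {'id_column': 'id', 'accumulate': 'churn',
    #     'family': 'BINOMIAL', 'output_prob': True}
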
teradataml/catalog/model_cataloging_utils.py

@@ -179,7 +179,7 @@ def __get_model_inputs_outputs(model, function_arg_map):
     tdp = preparer(td_dialect)
     nrows, ncols = member.shape
     db_schema = UtilFuncs._extract_db_name(member._table_name)
-    # Add quotes around the DB name in case we are getting it using _get_current_databasename()
+    # Add quotes around the DB name in case we are getting it using _get_current_databasename().
     db_schema = tdp.quote(_get_current_databasename()) if db_schema is None else db_schema
     db_table_name = UtilFuncs._extract_table_name(member._table_name)