teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +71 -0
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +51 -24
- teradataml/analytics/json_parser/utils.py +11 -17
- teradataml/automl/__init__.py +103 -48
- teradataml/automl/data_preparation.py +55 -37
- teradataml/automl/data_transformation.py +131 -69
- teradataml/automl/feature_engineering.py +117 -185
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +13 -25
- teradataml/automl/model_training.py +214 -75
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +11 -6
- teradataml/common/garbagecollector.py +5 -0
- teradataml/common/messagecodes.py +3 -1
- teradataml/common/messages.py +2 -1
- teradataml/common/utils.py +6 -0
- teradataml/context/context.py +49 -29
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/glm_example.json +28 -1
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +20 -1
- teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
- teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
- teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
- teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
- teradataml/data/teradataml_example.json +77 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +120 -61
- teradataml/dataframe/dataframe.py +102 -17
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +272 -89
- teradataml/dataframe/sql.py +84 -0
- teradataml/dbutils/dbutils.py +2 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
- teradataml/options/__init__.py +13 -4
- teradataml/options/configure.py +27 -6
- teradataml/scriptmgmt/UserEnv.py +19 -16
- teradataml/scriptmgmt/lls_utils.py +117 -14
- teradataml/table_operators/Script.py +2 -3
- teradataml/table_operators/TableOperator.py +58 -10
- teradataml/utils/validators.py +40 -2
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
# Python libraries
|
|
17
17
|
import concurrent.futures
|
|
18
18
|
from concurrent.futures import ThreadPoolExecutor
|
|
19
|
+
import math
|
|
19
20
|
import pandas as pd
|
|
20
21
|
from itertools import product
|
|
21
22
|
|
|
@@ -24,7 +25,8 @@ from teradataml.context import context as tdmlctx
|
|
|
24
25
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
25
26
|
from teradataml.dataframe.dataframe import DataFrame
|
|
26
27
|
from teradataml import execute_sql, get_connection
|
|
27
|
-
from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN
|
|
28
|
+
from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
|
|
29
|
+
from teradataml.utils.validators import _Validators
|
|
28
30
|
|
|
29
31
|
|
|
30
32
|
class _ModelTraining:
|
|
@@ -49,12 +51,12 @@ class _ModelTraining:
|
|
|
49
51
|
Types: teradataml Dataframe
|
|
50
52
|
|
|
51
53
|
target_column:
|
|
52
|
-
Required
|
|
54
|
+
Required Argument.
|
|
53
55
|
Specifies the target column present inside the dataset.
|
|
54
56
|
Types: str
|
|
55
57
|
|
|
56
58
|
model_list:
|
|
57
|
-
Required
|
|
59
|
+
Required Argument.
|
|
58
60
|
Specifies the list of models to be used for model training.
|
|
59
61
|
Types: list
|
|
60
62
|
|
|
@@ -70,13 +72,13 @@ class _ModelTraining:
|
|
|
70
72
|
Types: int
|
|
71
73
|
|
|
72
74
|
features:
|
|
73
|
-
Required
|
|
75
|
+
Required Argument.
|
|
74
76
|
Specifies the list of selected feature by rfe, lasso and pca
|
|
75
77
|
respectively in this order.
|
|
76
78
|
Types: list of list of strings (str)
|
|
77
79
|
|
|
78
80
|
task_type:
|
|
79
|
-
Required
|
|
81
|
+
Required Argument.
|
|
80
82
|
Specifies the task type for AutoML, whether to apply regression
|
|
81
83
|
or classification on the provided dataset.
|
|
82
84
|
Default Value: "Regression"
|
|
@@ -84,7 +86,7 @@ class _ModelTraining:
|
|
|
84
86
|
Types: str
|
|
85
87
|
|
|
86
88
|
custom_data:
|
|
87
|
-
Optional
|
|
89
|
+
Optional Argument.
|
|
88
90
|
Specifies json object containing user customized input.
|
|
89
91
|
Types: json object
|
|
90
92
|
"""
|
|
@@ -96,12 +98,14 @@ class _ModelTraining:
|
|
|
96
98
|
self.task_type = task_type
|
|
97
99
|
self.custom_data = custom_data
|
|
98
100
|
self.labels = self.data.drop_duplicate(self.target_column).size
|
|
101
|
+
self.startify_col = None
|
|
99
102
|
|
|
100
103
|
def model_training(self,
|
|
101
104
|
auto=True,
|
|
102
105
|
max_runtime_secs=None,
|
|
103
106
|
stopping_metric=None,
|
|
104
|
-
stopping_tolerance=0
|
|
107
|
+
stopping_tolerance=0,
|
|
108
|
+
max_models=None
|
|
105
109
|
):
|
|
106
110
|
"""
|
|
107
111
|
DESCRIPTION:
|
|
@@ -112,14 +116,14 @@ class _ModelTraining:
|
|
|
112
116
|
|
|
113
117
|
PARAMETERS:
|
|
114
118
|
auto:
|
|
115
|
-
Optional
|
|
119
|
+
Optional Argument.
|
|
116
120
|
Specifies whether to run data preparation in auto mode or custom mode.
|
|
117
121
|
When set to True, runs automatically; otherwise, it takes user inputs.
|
|
118
122
|
Default Value: True
|
|
119
123
|
Types: boolean
|
|
120
124
|
|
|
121
125
|
max_runtime_secs:
|
|
122
|
-
Optional
|
|
126
|
+
Optional Argument.
|
|
123
127
|
Specifies the time limit in seconds for model training.
|
|
124
128
|
Types: int
|
|
125
129
|
|
|
@@ -132,6 +136,11 @@ class _ModelTraining:
|
|
|
132
136
|
Required, when "stopping_metric" is set, otherwise optional.
|
|
133
137
|
Specifies the stopping tolerance for stopping metrics in model training.
|
|
134
138
|
Types: float
|
|
139
|
+
|
|
140
|
+
max_models:
|
|
141
|
+
Optional Argument.
|
|
142
|
+
Specifies the maximum number of models to be trained.
|
|
143
|
+
Types: int
|
|
135
144
|
|
|
136
145
|
RETURNS:
|
|
137
146
|
pandas dataframes containing model information, leaderboard and target
|
|
@@ -140,6 +149,7 @@ class _ModelTraining:
|
|
|
140
149
|
self.stopping_metric = stopping_metric
|
|
141
150
|
self.stopping_tolerance = stopping_tolerance
|
|
142
151
|
self.max_runtime_secs = max_runtime_secs
|
|
152
|
+
self.max_models = max_models
|
|
143
153
|
|
|
144
154
|
self._display_heading(phase=3, progress_bar=self.progress_bar)
|
|
145
155
|
self._display_msg(msg='Model Training started ...',
|
|
@@ -152,6 +162,10 @@ class _ModelTraining:
|
|
|
152
162
|
if not auto:
|
|
153
163
|
parameters = self._custom_hyperparameters(parameters)
|
|
154
164
|
|
|
165
|
+
# Validates the upper limit of max_models based on total model combinations
|
|
166
|
+
if self.max_models is not None:
|
|
167
|
+
self._validate_upper_limit_for_max_models(parameters)
|
|
168
|
+
|
|
155
169
|
if self.verbose == 2:
|
|
156
170
|
self._display_hyperparameters(parameters)
|
|
157
171
|
|
|
@@ -167,6 +181,54 @@ class _ModelTraining:
|
|
|
167
181
|
|
|
168
182
|
return models, leader_board, self.labels
|
|
169
183
|
|
|
184
|
+
def _get_model_param_space(self,
|
|
185
|
+
hyperparameters):
|
|
186
|
+
"""
|
|
187
|
+
DESCRIPTION:
|
|
188
|
+
Internal function to calculate the total number of models to be trained for specific model.
|
|
189
|
+
|
|
190
|
+
PARAMETERS:
|
|
191
|
+
hyperparameters:
|
|
192
|
+
Required Argument.
|
|
193
|
+
Specifies the hyperparameters available for the ML model.
|
|
194
|
+
Types: dict
|
|
195
|
+
|
|
196
|
+
RETURNS:
|
|
197
|
+
int containing, total number of models available for training.
|
|
198
|
+
"""
|
|
199
|
+
# Creating all possible combinations of hyperparameters
|
|
200
|
+
all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
|
|
201
|
+
# Getting total number of models for each model training function
|
|
202
|
+
total_models = len(all_combinations)
|
|
203
|
+
return total_models
|
|
204
|
+
|
|
205
|
+
def _validate_upper_limit_for_max_models(self,
|
|
206
|
+
hyperparameters_list):
|
|
207
|
+
"""
|
|
208
|
+
DESCRIPTION:
|
|
209
|
+
Internal function to validate the upper limit of max_models.
|
|
210
|
+
|
|
211
|
+
PARAMETERS:
|
|
212
|
+
hyperparameters_list:
|
|
213
|
+
Required Argument.
|
|
214
|
+
Specifies the hyperparameters for different ML models.
|
|
215
|
+
Types: list of dict
|
|
216
|
+
|
|
217
|
+
RETURNS:
|
|
218
|
+
None
|
|
219
|
+
|
|
220
|
+
RAISES:
|
|
221
|
+
TeradataMlException, ValueError
|
|
222
|
+
"""
|
|
223
|
+
model_param_space = 0
|
|
224
|
+
for hyperparameter_dct in hyperparameters_list:
|
|
225
|
+
# getting total number of models for each model
|
|
226
|
+
total_models = self._get_model_param_space(hyperparameter_dct)
|
|
227
|
+
model_param_space += total_models
|
|
228
|
+
|
|
229
|
+
# Validating upper range for max_models
|
|
230
|
+
_Validators._validate_argument_range(self.max_models, "max_models", ubound=model_param_space, ubound_inclusive=True)
|
|
231
|
+
|
|
170
232
|
def _display_hyperparameters(self,
|
|
171
233
|
hyperparameters_list):
|
|
172
234
|
"""
|
|
@@ -175,7 +237,7 @@ class _ModelTraining:
|
|
|
175
237
|
|
|
176
238
|
PARAMETERS:
|
|
177
239
|
hyperparameters_list:
|
|
178
|
-
Required
|
|
240
|
+
Required Argument.
|
|
179
241
|
Specifies the hyperparameters for different ML models.
|
|
180
242
|
Types: list of dict
|
|
181
243
|
|
|
@@ -189,16 +251,13 @@ class _ModelTraining:
|
|
|
189
251
|
|
|
190
252
|
# Iterating over hyperparameters_list
|
|
191
253
|
for hyperparameter_dct in hyperparameters_list:
|
|
192
|
-
# Extracting hyperparameter and
|
|
254
|
+
# Extracting hyperparameter and their value from hyperparameters dictionary
|
|
193
255
|
for key, val in hyperparameter_dct.items():
|
|
194
256
|
# Displaying hyperparameters
|
|
195
257
|
print(f"{key} : {str(val)}")
|
|
196
258
|
|
|
197
|
-
# Creating all possible combinations of hyperparameters
|
|
198
|
-
all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameter_dct.values()]))
|
|
199
|
-
|
|
200
259
|
# Displaying total number of models for each model
|
|
201
|
-
total_models = len(all_combinations)
|
|
260
|
+
total_models = self._get_model_param_space(hyperparameter_dct)
|
|
202
261
|
print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
|
|
203
262
|
print(f"--"*100+'\n')
|
|
204
263
|
|
|
@@ -210,7 +269,7 @@ class _ModelTraining:
|
|
|
210
269
|
|
|
211
270
|
PARAMETERS:
|
|
212
271
|
trained_models_info:
|
|
213
|
-
Required
|
|
272
|
+
Required Argument.
|
|
214
273
|
Specifies the trained models information to display.
|
|
215
274
|
Types: pandas Dataframe
|
|
216
275
|
|
|
@@ -227,10 +286,12 @@ class _ModelTraining:
|
|
|
227
286
|
|
|
228
287
|
# Adding rank to leaderboard
|
|
229
288
|
sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
|
|
230
|
-
|
|
231
|
-
#
|
|
232
|
-
|
|
233
|
-
|
|
289
|
+
|
|
290
|
+
# Excluding the model object and model name from leaderboard
|
|
291
|
+
leaderboard = sorted_model_df.drop(["model-obj","Name"], axis=1)
|
|
292
|
+
# filtering the rows based on the max_models
|
|
293
|
+
if self.max_models is not None:
|
|
294
|
+
leaderboard = leaderboard[leaderboard["Rank"] <= self.max_models]
|
|
234
295
|
|
|
235
296
|
self._display_msg(msg="Leaderboard",
|
|
236
297
|
progress_bar=self.progress_bar,
|
|
@@ -343,12 +404,12 @@ class _ModelTraining:
|
|
|
343
404
|
|
|
344
405
|
PARAMETERS:
|
|
345
406
|
num_rows:
|
|
346
|
-
Required
|
|
407
|
+
Required Argument.
|
|
347
408
|
Specifies the number of rows in dataset.
|
|
348
409
|
Types: int
|
|
349
410
|
|
|
350
411
|
num_cols:
|
|
351
|
-
Required
|
|
412
|
+
Required Argument.
|
|
352
413
|
Specifies the number of columns in dataset.
|
|
353
414
|
Types: int
|
|
354
415
|
|
|
@@ -409,7 +470,8 @@ class _ModelTraining:
|
|
|
409
470
|
'shrinkage_factor': tuple(shrinkage_factor),
|
|
410
471
|
'max_depth': tuple(max_depth),
|
|
411
472
|
'min_node_size': tuple(min_node_size),
|
|
412
|
-
'iter_num': tuple(iter_num)
|
|
473
|
+
'iter_num': tuple(iter_num),
|
|
474
|
+
'seed':42
|
|
413
475
|
}
|
|
414
476
|
# Hyperparameters for Decision Forest model
|
|
415
477
|
df_params = {
|
|
@@ -419,7 +481,8 @@ class _ModelTraining:
|
|
|
419
481
|
'min_impurity': tuple(min_impurity),
|
|
420
482
|
'max_depth': tuple(max_depth),
|
|
421
483
|
'min_node_size': tuple(min_node_size),
|
|
422
|
-
'num_trees': tuple(num_trees)
|
|
484
|
+
'num_trees': tuple(num_trees),
|
|
485
|
+
'seed':42
|
|
423
486
|
}
|
|
424
487
|
|
|
425
488
|
# Updating model type in case of classification
|
|
@@ -445,12 +508,12 @@ class _ModelTraining:
|
|
|
445
508
|
|
|
446
509
|
PARAMETERS:
|
|
447
510
|
num_rows
|
|
448
|
-
Required
|
|
511
|
+
Required Argument.
|
|
449
512
|
Specifies the number of rows in dataset.
|
|
450
513
|
Types: int
|
|
451
514
|
|
|
452
515
|
num_cols:
|
|
453
|
-
Required
|
|
516
|
+
Required Argument.
|
|
454
517
|
Specifies the number of columns in dataset.
|
|
455
518
|
Types: int
|
|
456
519
|
|
|
@@ -482,12 +545,12 @@ class _ModelTraining:
|
|
|
482
545
|
|
|
483
546
|
PARAMETERS:
|
|
484
547
|
num_rows:
|
|
485
|
-
Required
|
|
548
|
+
Required Argument.
|
|
486
549
|
Specifies the number of rows in dataset.
|
|
487
550
|
Types: int
|
|
488
551
|
|
|
489
552
|
num_cols:
|
|
490
|
-
Required
|
|
553
|
+
Required Argument.
|
|
491
554
|
Specifies the number of columns in dataset.
|
|
492
555
|
Types: int
|
|
493
556
|
|
|
@@ -616,6 +679,44 @@ class _ModelTraining:
|
|
|
616
679
|
raise ValueError("No model is selected for training.")
|
|
617
680
|
|
|
618
681
|
return parameters
|
|
682
|
+
|
|
683
|
+
def distribute_max_models(self):
|
|
684
|
+
"""
|
|
685
|
+
DESCRIPTION:
|
|
686
|
+
Internal function to distribute max_models across available model functions.
|
|
687
|
+
|
|
688
|
+
RETURNS:
|
|
689
|
+
dictionary containing max_models distribution and list of models to remove.
|
|
690
|
+
"""
|
|
691
|
+
# Getting total number of models
|
|
692
|
+
model_count=len(self.model_list)
|
|
693
|
+
# Evenly distributing max_models across models
|
|
694
|
+
base_assign = self.max_models // model_count
|
|
695
|
+
# Creating list of max_models for each model
|
|
696
|
+
distribution = [base_assign] * model_count
|
|
697
|
+
|
|
698
|
+
# Calculating remaining models
|
|
699
|
+
remaining_model_count = self.max_models % model_count
|
|
700
|
+
if remaining_model_count:
|
|
701
|
+
# distributing remaining model across models.
|
|
702
|
+
# Starting from first model in list and distributing remaining models by 1 each.
|
|
703
|
+
for i in range(remaining_model_count):
|
|
704
|
+
distribution[i] += 1
|
|
705
|
+
|
|
706
|
+
# Creating dictionary for model distribution
|
|
707
|
+
model_distribution = dict(zip(self.model_list, distribution))
|
|
708
|
+
# Getting list of models with 0 distribution and removing them from model list
|
|
709
|
+
# While for model having distribution greater than 0, updating distribution with
|
|
710
|
+
# 1/3rd of original value as we are training with 3 different feature selection methods.
|
|
711
|
+
models_to_remove = []
|
|
712
|
+
for model in self.model_list:
|
|
713
|
+
initial_count = model_distribution[model]
|
|
714
|
+
if initial_count == 0:
|
|
715
|
+
models_to_remove.append(model)
|
|
716
|
+
else:
|
|
717
|
+
model_distribution[model] = math.ceil(initial_count / 3)
|
|
718
|
+
|
|
719
|
+
return model_distribution, models_to_remove
|
|
619
720
|
|
|
620
721
|
def _parallel_training(self, parameters):
|
|
621
722
|
"""
|
|
@@ -648,6 +749,19 @@ class _ModelTraining:
|
|
|
648
749
|
|
|
649
750
|
self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
|
|
650
751
|
if self.max_runtime_secs is not None else None
|
|
752
|
+
|
|
753
|
+
if self.max_models is not None:
|
|
754
|
+
# Getting model distribution and models to remove
|
|
755
|
+
self.max_models_distribution, models_to_remove = self.distribute_max_models()
|
|
756
|
+
# Removing model parameters with 0 distribution
|
|
757
|
+
if len(models_to_remove):
|
|
758
|
+
for model in models_to_remove:
|
|
759
|
+
model_params = [param for param in model_params if param['name'] != model]
|
|
760
|
+
# Updating progress bar as we are removing model
|
|
761
|
+
self.progress_bar.update()
|
|
762
|
+
|
|
763
|
+
if self.is_classification_type():
|
|
764
|
+
self.startify_col = self.target_column
|
|
651
765
|
|
|
652
766
|
trained_models = []
|
|
653
767
|
for param in model_params:
|
|
@@ -677,12 +791,12 @@ class _ModelTraining:
|
|
|
677
791
|
Types: tuple of Teradataml DataFrame
|
|
678
792
|
|
|
679
793
|
model_info
|
|
680
|
-
Required
|
|
794
|
+
Required Argument.
|
|
681
795
|
Specifies the trained models information.
|
|
682
796
|
Types: Pandas DataFrame
|
|
683
797
|
|
|
684
798
|
RETURNS:
|
|
685
|
-
Pandas DataFrame containing, trained models with
|
|
799
|
+
Pandas DataFrame containing, trained models with their performance metrics.
|
|
686
800
|
"""
|
|
687
801
|
self._display_msg(msg="Evaluating models performance ...",
|
|
688
802
|
progress_bar = self.progress_bar,
|
|
@@ -697,9 +811,9 @@ class _ModelTraining:
|
|
|
697
811
|
|
|
698
812
|
# Iterating over models
|
|
699
813
|
for index, model_row in model_info.iterrows():
|
|
700
|
-
# Extracting model name, feature selection method, and model object
|
|
701
|
-
model_name, feature_selection, model_object = model_row['Name'], \
|
|
702
|
-
|
|
814
|
+
# Extracting model name, model id, feature selection method, and model object
|
|
815
|
+
model_name, model_id, feature_selection, model_object = model_row['Name'], \
|
|
816
|
+
model_row['Model-ID'], model_row['Feature-Selection'], model_row['obj']
|
|
703
817
|
|
|
704
818
|
# Selecting test data based on feature selection method
|
|
705
819
|
test_set = feature_selection_to_test_data[feature_selection]
|
|
@@ -708,7 +822,9 @@ class _ModelTraining:
|
|
|
708
822
|
if model_name == 'knn':
|
|
709
823
|
performance_metrics = model_object.evaluate(test_data=test_set)
|
|
710
824
|
else:
|
|
711
|
-
eval_params =
|
|
825
|
+
eval_params = _ModelTraining._eval_params_generation(model_name,
|
|
826
|
+
self.target_column,
|
|
827
|
+
self.task_type)
|
|
712
828
|
performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
|
|
713
829
|
|
|
714
830
|
# Extracting performance metrics
|
|
@@ -718,7 +834,7 @@ class _ModelTraining:
|
|
|
718
834
|
performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
|
|
719
835
|
|
|
720
836
|
# Combine all the elements to form a new row
|
|
721
|
-
new_row = [model_name, feature_selection] + performance_metrics_list + [model_object]
|
|
837
|
+
new_row = [model_name, model_id, feature_selection] + performance_metrics_list + [model_object]
|
|
722
838
|
else:
|
|
723
839
|
# Regression
|
|
724
840
|
regression_metrics = next(performance_metrics.result.itertuples())
|
|
@@ -726,22 +842,23 @@ class _ModelTraining:
|
|
|
726
842
|
feature_count = len(test_set.columns) - 2
|
|
727
843
|
r2_score = regression_metrics[8]
|
|
728
844
|
adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
|
|
729
|
-
new_row = [model_name, feature_selection, regression_metrics[0],
|
|
730
|
-
|
|
845
|
+
new_row = [model_name, model_id, feature_selection, regression_metrics[0],
|
|
846
|
+
regression_metrics[1], regression_metrics[2], regression_metrics[5],
|
|
847
|
+
regression_metrics[6], r2_score, adjusted_r2_score, model_object]
|
|
731
848
|
|
|
732
849
|
model_performance_data.append(new_row)
|
|
733
850
|
|
|
734
851
|
if self.is_classification_type():
|
|
735
|
-
model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','
|
|
736
|
-
'Accuracy','Micro-Precision',
|
|
852
|
+
model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Model-ID',
|
|
853
|
+
'Feature-Selection','Accuracy','Micro-Precision',
|
|
737
854
|
'Micro-Recall','Micro-F1',
|
|
738
855
|
'Macro-Precision','Macro-Recall',
|
|
739
856
|
'Macro-F1','Weighted-Precision',
|
|
740
857
|
'Weighted-Recall','Weighted-F1',
|
|
741
858
|
'model-obj'])
|
|
742
859
|
else:
|
|
743
|
-
model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name',
|
|
744
|
-
'Feature
|
|
860
|
+
model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name', 'Model-ID',
|
|
861
|
+
'Feature-Selection',
|
|
745
862
|
'MAE', 'MSE', 'MSLE',
|
|
746
863
|
'RMSE', 'RMSLE',
|
|
747
864
|
'R2-score',
|
|
@@ -764,12 +881,12 @@ class _ModelTraining:
|
|
|
764
881
|
|
|
765
882
|
PARAMETERS:
|
|
766
883
|
model_param
|
|
767
|
-
Required
|
|
884
|
+
Required Argument.
|
|
768
885
|
Specifies the eval_params argument for GridSearch.
|
|
769
886
|
Types: dict
|
|
770
887
|
|
|
771
888
|
train_data:
|
|
772
|
-
Required
|
|
889
|
+
Required Argument.
|
|
773
890
|
Specifies the training datasets.
|
|
774
891
|
Types: tuple of Teradataml DataFrame
|
|
775
892
|
|
|
@@ -786,7 +903,9 @@ class _ModelTraining:
|
|
|
786
903
|
"xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}
|
|
787
904
|
|
|
788
905
|
# Setting eval_params for hpt.
|
|
789
|
-
eval_params =
|
|
906
|
+
eval_params = _ModelTraining._eval_params_generation(model_param['name'],
|
|
907
|
+
self.target_column,
|
|
908
|
+
self.task_type)
|
|
790
909
|
|
|
791
910
|
# Input columns for model
|
|
792
911
|
model_param['input_columns'] = self.features
|
|
@@ -799,8 +918,19 @@ class _ModelTraining:
|
|
|
799
918
|
if model_param['name'] == 'knn':
|
|
800
919
|
model_param['test_data'] = test_data
|
|
801
920
|
|
|
802
|
-
#
|
|
803
|
-
|
|
921
|
+
# Using RandomSearch for hyperparameter tuning when max_models is given.
|
|
922
|
+
# Otherwise, using GridSearch for hyperparameter tuning.
|
|
923
|
+
if self.max_models is not None:
|
|
924
|
+
# Setting max_models for RandomSearch based on model name
|
|
925
|
+
model_param['max_models'] = self.max_models_distribution[model_param['name']]
|
|
926
|
+
# Defining RandomSearch with ML model based on Name, and max_models
|
|
927
|
+
_obj = RandomSearch(func=model_to_func[model_param['name']],
|
|
928
|
+
params=model_param,
|
|
929
|
+
n_iter=model_param['max_models'])
|
|
930
|
+
else:
|
|
931
|
+
# Defining Gridsearch with ML model based on Name
|
|
932
|
+
_obj = GridSearch(func=model_to_func[model_param['name']],
|
|
933
|
+
params=model_param)
|
|
804
934
|
|
|
805
935
|
if self.verbose > 0:
|
|
806
936
|
print(" " *200, end='\r', flush=True)
|
|
@@ -813,46 +943,39 @@ class _ModelTraining:
|
|
|
813
943
|
_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
|
|
814
944
|
early_stop=self.stopping_tolerance, run_parallel=True,
|
|
815
945
|
sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
|
|
816
|
-
verbose=verbose, max_time=self.max_runtime_secs)
|
|
946
|
+
stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
|
|
817
947
|
else:
|
|
818
948
|
_obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
|
|
819
949
|
early_stop=self.stopping_tolerance, **eval_params,
|
|
820
950
|
run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
|
|
821
|
-
sample_id_column='id', verbose=verbose, max_time=self.max_runtime_secs)
|
|
951
|
+
sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
|
|
822
952
|
|
|
823
953
|
# Getting all passed models
|
|
824
954
|
_df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
|
|
955
|
+
# Creating mapping data ID to feature selection method
|
|
956
|
+
data_id_to_method_map = {"DF_0": "lasso", "DF_1": "rfe", "DF_2": "pca"}
|
|
957
|
+
|
|
958
|
+
# Mapping data ID to feature selection method
|
|
959
|
+
_df['Feature-Selection'] = _df['DATA_ID'].map(data_id_to_method_map)
|
|
960
|
+
# Getting model details
|
|
961
|
+
_df['Name'] = model_param['name']
|
|
962
|
+
_df['Model-ID'] = _df['MODEL_ID']
|
|
963
|
+
_df['obj'] = _df['MODEL_ID'].apply(lambda x: _obj.get_model(x))
|
|
964
|
+
|
|
965
|
+
# Extracting needed columns
|
|
966
|
+
model_info = _df[["Name", "Model-ID", "Feature-Selection", "obj"]]
|
|
825
967
|
|
|
826
|
-
# Mapping data ID to DataFrame
|
|
827
|
-
data_id_to_df = {"DF_0": _df[_df['DATA_ID']=='DF_0'],
|
|
828
|
-
"DF_1": _df[_df['DATA_ID']=='DF_1'],
|
|
829
|
-
"DF_2": _df[_df['DATA_ID']=='DF_2']}
|
|
830
|
-
|
|
831
|
-
# Returns best model within a Data_ID group
|
|
832
|
-
# get_best_model = lambda df: df.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'], ascending=[False, False]).iloc[0]['MODEL_ID']\
|
|
833
|
-
# if self.task_type != 'Regression' else df.sort_values(by=['R2', 'MAE'], ascending=[False, False]).iloc[0]['MODEL_ID']
|
|
834
|
-
get_best_model = lambda df, stats: df.sort_values(by=stats, ascending=[False, False]).iloc[0]['MODEL_ID']
|
|
835
|
-
|
|
836
|
-
# best_model = get_best_model(data_id_to_df[data_id], stats)
|
|
837
|
-
stats = ['MICRO-F1', 'WEIGHTED-F1'] if self.task_type != 'Regression' else ['R2', 'MAE']
|
|
838
|
-
model_info_data = []
|
|
839
|
-
# Extracting best model
|
|
840
|
-
for data_id, df_name in zip(["DF_0", "DF_1", "DF_2"], ["lasso", "rfe", "pca"]):
|
|
841
|
-
if not data_id_to_df[data_id].empty:
|
|
842
|
-
best_model = get_best_model(data_id_to_df[data_id], stats)
|
|
843
|
-
model_info_data.append([model_param['name'], df_name, _obj.get_model(best_model)])
|
|
844
|
-
self._display_msg(inline_msg=best_model, progress_bar=self.progress_bar)
|
|
845
|
-
|
|
846
|
-
model_info = pd.DataFrame(data=model_info_data, columns=["Name",'Feature selection', "obj"])
|
|
847
968
|
self._display_msg(msg="-"*100,
|
|
848
969
|
progress_bar=self.progress_bar,
|
|
849
970
|
show_data=True)
|
|
850
971
|
self.progress_bar.update()
|
|
851
972
|
|
|
852
973
|
return model_info
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
974
|
+
|
|
975
|
+
@staticmethod
|
|
976
|
+
def _eval_params_generation(ml_name,
|
|
977
|
+
target_column,
|
|
978
|
+
task_type):
|
|
856
979
|
"""
|
|
857
980
|
DESCRIPTION:
|
|
858
981
|
Internal function generates the eval_params for
|
|
@@ -860,23 +983,39 @@ class _ModelTraining:
|
|
|
860
983
|
|
|
861
984
|
PARAMETERS:
|
|
862
985
|
ml_name
|
|
863
|
-
Required
|
|
986
|
+
Required Argument.
|
|
864
987
|
Specifies the ML name for eval_params generation.
|
|
865
988
|
Types: str
|
|
989
|
+
|
|
990
|
+
target_column
|
|
991
|
+
Required Argument.
|
|
992
|
+
Specifies the target column.
|
|
993
|
+
Types: str
|
|
866
994
|
|
|
995
|
+
task_type:
|
|
996
|
+
Required Argument.
|
|
997
|
+
Specifies the task type for AutoML, whether to apply regression
|
|
998
|
+
or classification on the provided dataset.
|
|
999
|
+
Default Value: "Regression"
|
|
1000
|
+
Permitted Values: "Regression", "Classification"
|
|
1001
|
+
Types: str
|
|
1002
|
+
|
|
867
1003
|
RETURNS:
|
|
868
1004
|
dict containing, eval_params for ML model.
|
|
869
1005
|
"""
|
|
870
1006
|
# Setting the eval_params
|
|
871
1007
|
eval_params = {"id_column": "id",
|
|
872
|
-
"accumulate":
|
|
1008
|
+
"accumulate": target_column}
|
|
873
1009
|
|
|
874
1010
|
# For Classification
|
|
875
|
-
if
|
|
1011
|
+
if task_type.lower() != "regression":
|
|
876
1012
|
if ml_name == 'xgboost':
|
|
877
1013
|
eval_params['model_type'] = 'Classification'
|
|
878
1014
|
eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
|
|
879
1015
|
else:
|
|
1016
|
+
if ml_name == 'glm':
|
|
1017
|
+
eval_params['family'] = 'BINOMIAL'
|
|
1018
|
+
|
|
880
1019
|
eval_params['output_prob'] = True
|
|
881
1020
|
else:
|
|
882
1021
|
# For Regression
|
|
@@ -179,7 +179,7 @@ def __get_model_inputs_outputs(model, function_arg_map):
|
|
|
179
179
|
tdp = preparer(td_dialect)
|
|
180
180
|
nrows, ncols = member.shape
|
|
181
181
|
db_schema = UtilFuncs._extract_db_name(member._table_name)
|
|
182
|
-
# Add quotes around the DB name in case we are getting it using _get_current_databasename()
|
|
182
|
+
# Add quotes around the DB name in case we are getting it using _get_current_databasename().
|
|
183
183
|
db_schema = tdp.quote(_get_current_databasename()) if db_schema is None else db_schema
|
|
184
184
|
db_table_name = UtilFuncs._extract_table_name(member._table_name)
|
|
185
185
|
|