teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of teradataml might be problematic. (The original page linked to more details here; the link target is not preserved in this text.)

Files changed (263)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +183 -0
  4. teradataml/__init__.py +6 -3
  5. teradataml/_version.py +2 -2
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +275 -40
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +1 -0
  11. teradataml/analytics/json_parser/utils.py +17 -21
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +10 -2
  15. teradataml/analytics/table_operator/__init__.py +3 -2
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +62 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1553 -319
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +276 -319
  22. teradataml/automl/data_transformation.py +163 -81
  23. teradataml/automl/feature_engineering.py +402 -239
  24. teradataml/automl/feature_exploration.py +9 -2
  25. teradataml/automl/model_evaluation.py +48 -51
  26. teradataml/automl/model_training.py +291 -189
  27. teradataml/catalog/byom.py +8 -8
  28. teradataml/catalog/model_cataloging_utils.py +1 -1
  29. teradataml/clients/auth_client.py +133 -0
  30. teradataml/clients/pkce_client.py +1 -1
  31. teradataml/common/aed_utils.py +3 -2
  32. teradataml/common/constants.py +48 -6
  33. teradataml/common/deprecations.py +13 -7
  34. teradataml/common/garbagecollector.py +156 -120
  35. teradataml/common/messagecodes.py +6 -1
  36. teradataml/common/messages.py +3 -1
  37. teradataml/common/sqlbundle.py +1 -1
  38. teradataml/common/utils.py +103 -11
  39. teradataml/common/wrapper_utils.py +1 -1
  40. teradataml/context/context.py +121 -31
  41. teradataml/data/advertising.csv +201 -0
  42. teradataml/data/bank_marketing.csv +11163 -0
  43. teradataml/data/bike_sharing.csv +732 -0
  44. teradataml/data/boston2cols.csv +721 -0
  45. teradataml/data/breast_cancer.csv +570 -0
  46. teradataml/data/complaints_test_tokenized.csv +353 -0
  47. teradataml/data/complaints_tokens_model.csv +348 -0
  48. teradataml/data/covid_confirm_sd.csv +83 -0
  49. teradataml/data/customer_segmentation_test.csv +2628 -0
  50. teradataml/data/customer_segmentation_train.csv +8069 -0
  51. teradataml/data/dataframe_example.json +10 -0
  52. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  53. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  54. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  55. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  56. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  57. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  58. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  59. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  60. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  61. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  62. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  63. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  64. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  65. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  66. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  67. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  68. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  69. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  70. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  71. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  72. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  73. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  74. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  75. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  76. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  77. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  78. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  79. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  80. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  81. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  82. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  83. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  84. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  85. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  86. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  87. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  88. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  89. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  90. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  91. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  92. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  93. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  94. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  95. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  96. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  97. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  98. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  99. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  100. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  101. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  102. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  103. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  104. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  105. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  106. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  107. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  108. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  109. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  110. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  111. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  112. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  113. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  114. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  115. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  116. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  117. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  118. teradataml/data/dwt2d_dataTable.csv +65 -0
  119. teradataml/data/dwt_dataTable.csv +8 -0
  120. teradataml/data/dwt_filterTable.csv +3 -0
  121. teradataml/data/finance_data4.csv +13 -0
  122. teradataml/data/glm_example.json +28 -1
  123. teradataml/data/grocery_transaction.csv +19 -0
  124. teradataml/data/housing_train_segment.csv +201 -0
  125. teradataml/data/idwt2d_dataTable.csv +5 -0
  126. teradataml/data/idwt_dataTable.csv +8 -0
  127. teradataml/data/idwt_filterTable.csv +3 -0
  128. teradataml/data/insect2Cols.csv +61 -0
  129. teradataml/data/interval_data.csv +5 -0
  130. teradataml/data/jsons/paired_functions.json +14 -0
  131. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  132. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  133. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  134. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  135. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  136. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  137. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  138. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  139. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  140. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  141. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  142. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  143. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  144. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  145. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  146. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  147. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  148. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  149. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  150. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  151. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  152. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  153. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  154. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  155. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  156. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  157. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  158. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  159. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  160. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  161. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  162. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  163. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  164. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  165. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  166. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  167. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  168. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  169. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  170. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  171. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  172. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  173. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  174. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  175. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  176. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  177. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  178. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  179. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  180. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  181. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  182. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  183. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  184. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  185. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  186. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  187. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  188. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  189. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  190. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  191. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  192. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  193. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  194. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  195. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  196. teradataml/data/kmeans_example.json +5 -0
  197. teradataml/data/kmeans_table.csv +10 -0
  198. teradataml/data/load_example_data.py +8 -2
  199. teradataml/data/naivebayestextclassifier_example.json +1 -1
  200. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  201. teradataml/data/onehot_encoder_train.csv +4 -0
  202. teradataml/data/openml_example.json +29 -0
  203. teradataml/data/peppers.png +0 -0
  204. teradataml/data/real_values.csv +14 -0
  205. teradataml/data/sax_example.json +8 -0
  206. teradataml/data/scale_attributes.csv +3 -0
  207. teradataml/data/scale_example.json +52 -1
  208. teradataml/data/scale_input_part_sparse.csv +31 -0
  209. teradataml/data/scale_input_partitioned.csv +16 -0
  210. teradataml/data/scale_input_sparse.csv +11 -0
  211. teradataml/data/scale_parameters.csv +3 -0
  212. teradataml/data/scripts/deploy_script.py +21 -2
  213. teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
  214. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
  215. teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
  216. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  217. teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
  218. teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
  219. teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
  220. teradataml/data/star_pivot.csv +8 -0
  221. teradataml/data/templates/open_source_ml.json +2 -1
  222. teradataml/data/teradataml_example.json +97 -1
  223. teradataml/data/timestamp_data.csv +4 -0
  224. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  225. teradataml/data/uaf_example.json +55 -1
  226. teradataml/data/unpivot_example.json +15 -0
  227. teradataml/data/url_data.csv +9 -0
  228. teradataml/data/windowdfft.csv +16 -0
  229. teradataml/data/ztest_example.json +16 -0
  230. teradataml/dataframe/copy_to.py +9 -4
  231. teradataml/dataframe/data_transfer.py +125 -64
  232. teradataml/dataframe/dataframe.py +575 -57
  233. teradataml/dataframe/dataframe_utils.py +47 -9
  234. teradataml/dataframe/fastload.py +273 -90
  235. teradataml/dataframe/functions.py +339 -0
  236. teradataml/dataframe/row.py +160 -0
  237. teradataml/dataframe/setop.py +2 -2
  238. teradataml/dataframe/sql.py +740 -18
  239. teradataml/dataframe/window.py +1 -1
  240. teradataml/dbutils/dbutils.py +324 -18
  241. teradataml/geospatial/geodataframe.py +1 -1
  242. teradataml/geospatial/geodataframecolumn.py +1 -1
  243. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  244. teradataml/lib/aed_0_1.dll +0 -0
  245. teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
  246. teradataml/options/__init__.py +16 -5
  247. teradataml/options/configure.py +39 -6
  248. teradataml/options/display.py +2 -2
  249. teradataml/plot/axis.py +4 -4
  250. teradataml/scriptmgmt/UserEnv.py +26 -19
  251. teradataml/scriptmgmt/lls_utils.py +120 -16
  252. teradataml/table_operators/Script.py +4 -5
  253. teradataml/table_operators/TableOperator.py +160 -26
  254. teradataml/table_operators/table_operator_util.py +88 -41
  255. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  256. teradataml/telemetry_utils/__init__.py +0 -0
  257. teradataml/telemetry_utils/queryband.py +52 -0
  258. teradataml/utils/validators.py +41 -3
  259. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
  260. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
  261. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  262. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  263. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
@@ -16,15 +16,18 @@
16
16
  # Python libraries
17
17
  import concurrent.futures
18
18
  from concurrent.futures import ThreadPoolExecutor
19
+ import math
19
20
  import pandas as pd
20
21
  from itertools import product
22
+ import numpy as np
21
23
 
22
24
  # Teradata libraries
23
25
  from teradataml.context import context as tdmlctx
24
26
  from teradataml.dataframe.copy_to import copy_to_sql
25
27
  from teradataml.dataframe.dataframe import DataFrame
26
28
  from teradataml import execute_sql, get_connection
27
- from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN
29
+ from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
30
+ from teradataml.utils.validators import _Validators
28
31
 
29
32
 
30
33
  class _ModelTraining:
@@ -36,7 +39,8 @@ class _ModelTraining:
36
39
  verbose=0,
37
40
  features=None,
38
41
  task_type="Regression",
39
- custom_data = None):
42
+ custom_data = None,
43
+ **kwargs):
40
44
  """
41
45
  DESCRIPTION:
42
46
  Function initializes the data, target column, features and models
@@ -49,12 +53,12 @@ class _ModelTraining:
49
53
  Types: teradataml Dataframe
50
54
 
51
55
  target_column:
52
- Required Arugment.
56
+ Required Argument.
53
57
  Specifies the target column present inside the dataset.
54
58
  Types: str
55
59
 
56
60
  model_list:
57
- Required Arugment.
61
+ Required Argument.
58
62
  Specifies the list of models to be used for model training.
59
63
  Types: list
60
64
 
@@ -70,13 +74,13 @@ class _ModelTraining:
70
74
  Types: int
71
75
 
72
76
  features:
73
- Required Arugment.
77
+ Required Argument.
74
78
  Specifies the list of selected feature by rfe, lasso and pca
75
79
  respectively in this order.
76
80
  Types: list of list of strings (str)
77
81
 
78
82
  task_type:
79
- Required Arugment.
83
+ Required Argument.
80
84
  Specifies the task type for AutoML, whether to apply regresion
81
85
  or classification on the provived dataset.
82
86
  Default Value: "Regression"
@@ -84,9 +88,31 @@ class _ModelTraining:
84
88
  Types: str
85
89
 
86
90
  custom_data:
87
- Optional Arugment.
91
+ Optional Argument.
88
92
  Specifies json object containing user customized input.
89
93
  Types: json object
94
+
95
+ **kwargs:
96
+ Specifies the additional arguments for model training. Below
97
+ are the additional arguments:
98
+ volatile:
99
+ Optional Argument.
100
+ Specifies whether to put the interim results of the
101
+ functions in a volatile table or not. When set to
102
+ True, results are stored in a volatile table,
103
+ otherwise not.
104
+ Default Value: False
105
+ Types: bool
106
+
107
+ persist:
108
+ Optional Argument.
109
+ Specifies whether to persist the interim results of the
110
+ functions in a table or not. When set to True,
111
+ results are persisted in a table; otherwise,
112
+ results are garbage collected at the end of the
113
+ session.
114
+ Default Value: False
115
+ Types: bool
90
116
  """
91
117
  self.data = data
92
118
  self.target_column = target_column
@@ -96,12 +122,16 @@ class _ModelTraining:
96
122
  self.task_type = task_type
97
123
  self.custom_data = custom_data
98
124
  self.labels = self.data.drop_duplicate(self.target_column).size
125
+ self.startify_col = None
126
+ self.persist = kwargs.get("persist", False)
127
+ self.volatile = kwargs.get("volatile", False)
99
128
 
100
129
  def model_training(self,
101
130
  auto=True,
102
131
  max_runtime_secs=None,
103
132
  stopping_metric=None,
104
- stopping_tolerance=0
133
+ stopping_tolerance=0,
134
+ max_models=None
105
135
  ):
106
136
  """
107
137
  DESCRIPTION:
@@ -112,14 +142,14 @@ class _ModelTraining:
112
142
 
113
143
  PARAMETERS:
114
144
  auto:
115
- Optional Arugment.
145
+ Optional Argument.
116
146
  Specifies whether to run data preparation in auto mode or custom mode.
117
147
  When set to True, runs automtically otherwise, it take user inputs.
118
148
  Default Value: True
119
149
  Types: boolean
120
150
 
121
151
  max_runtime_secs:
122
- Optional Arugment.
152
+ Optional Argument.
123
153
  Specifies the time limit in seconds for model training.
124
154
  Types: int
125
155
 
@@ -132,6 +162,11 @@ class _ModelTraining:
132
162
  Required, when "stopping_metric" is set, otherwise optional.
133
163
  Specifies the stopping tolerance for stopping metrics in model training.
134
164
  Types: float
165
+
166
+ max_models:
167
+ Optional Argument.
168
+ Specifies the maximum number of models to be trained.
169
+ Types: int
135
170
 
136
171
  RETURNS:
137
172
  pandas dataframes containing model information, leaderboard and target
@@ -140,6 +175,7 @@ class _ModelTraining:
140
175
  self.stopping_metric = stopping_metric
141
176
  self.stopping_tolerance = stopping_tolerance
142
177
  self.max_runtime_secs = max_runtime_secs
178
+ self.max_models = max_models
143
179
 
144
180
  self._display_heading(phase=3, progress_bar=self.progress_bar)
145
181
  self._display_msg(msg='Model Training started ...',
@@ -152,6 +188,10 @@ class _ModelTraining:
152
188
  if not auto:
153
189
  parameters = self._custom_hyperparameters(parameters)
154
190
 
191
+ # Validates the upper limit of max_models based on total model combinations
192
+ if self.max_models is not None:
193
+ self._validate_upper_limit_for_max_models(parameters)
194
+
155
195
  if self.verbose == 2:
156
196
  self._display_hyperparameters(parameters)
157
197
 
@@ -167,6 +207,54 @@ class _ModelTraining:
167
207
 
168
208
  return models, leader_board, self.labels
169
209
 
210
+ def _get_model_param_space(self,
211
+ hyperparameters):
212
+ """
213
+ DESCRIPTION:
214
+ Internal function to calculate the total number of models to be trained for specific model.
215
+
216
+ PARAMETERS:
217
+ hyperparameters:
218
+ Required Argument.
219
+ Specifies the hyperparameters availables for ML model.
220
+ Types: list of dict
221
+
222
+ RETURNS:
223
+ int containing, total number of models available for training.
224
+ """
225
+ # Creating all possible combinations of hyperparameters
226
+ all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameters.values()]))
227
+ # Getting total number of models for each model model training function
228
+ total_models = len(all_combinations)
229
+ return total_models
230
+
231
+ def _validate_upper_limit_for_max_models(self,
232
+ hyperparameters_list):
233
+ """
234
+ DESCRIPTION:
235
+ Internal function to validate the upper limit of max_models.
236
+
237
+ PARAMETERS:
238
+ hyperparameters_list:
239
+ Required Argument.
240
+ Specifies the hyperparameters for different ML models.
241
+ Types: list of dict
242
+
243
+ RETURNS:
244
+ None
245
+
246
+ RAISES:
247
+ TeradataMlException, ValueError
248
+ """
249
+ model_param_space = 0
250
+ for hyperparameter_dct in hyperparameters_list:
251
+ # getting total number of models for each model
252
+ total_models = self._get_model_param_space(hyperparameter_dct)
253
+ model_param_space += total_models
254
+
255
+ # Validating upper range for max_models
256
+ _Validators._validate_argument_range(self.max_models, "max_models", ubound=model_param_space, ubound_inclusive=True)
257
+
170
258
  def _display_hyperparameters(self,
171
259
  hyperparameters_list):
172
260
  """
@@ -175,7 +263,7 @@ class _ModelTraining:
175
263
 
176
264
  PARAMETERS:
177
265
  hyperparameters_list:
178
- Required Arugment.
266
+ Required Argument.
179
267
  Specifies the hyperparameters for different ML models.
180
268
  Types: list of dict
181
269
 
@@ -189,16 +277,13 @@ class _ModelTraining:
189
277
 
190
278
  # Iterating over hyperparameters_list
191
279
  for hyperparameter_dct in hyperparameters_list:
192
- # Extracting hyperparameter and thier value from hyperparameters dictionary
280
+ # Extracting hyperparameter and their value from hyperparameters dictionary
193
281
  for key, val in hyperparameter_dct.items():
194
282
  # Displaying hyperparameters
195
283
  print(f"{key} : {str(val)}")
196
284
 
197
- # Creating all possible combinations of hyperparameters
198
- all_combinations = list(product(*[v if isinstance(v, tuple) else [v] for v in hyperparameter_dct.values()]))
199
-
200
285
  # Displaying total number of models for each model
201
- total_models = len(all_combinations)
286
+ total_models = self._get_model_param_space(hyperparameter_dct)
202
287
  print(f"Total number of models for {hyperparameter_dct['name']} : {total_models}")
203
288
  print(f"--"*100+'\n')
204
289
 
@@ -210,7 +295,7 @@ class _ModelTraining:
210
295
 
211
296
  PARAMETERS:
212
297
  trained_models_info:
213
- Required Arugment.
298
+ Required Argument.
214
299
  Specifies the trained models inforamtion to display.
215
300
  Types: pandas Dataframe
216
301
 
@@ -219,18 +304,25 @@ class _ModelTraining:
219
304
  """
220
305
  # Creating a copy to avoid use of same reference of memory
221
306
  if self.task_type != "Regression":
222
- sorted_model_df = trained_models_info.sort_values(by=['Micro-F1', 'Weighted-F1'],
223
- ascending=[False, False]).reset_index(drop=True)
307
+ sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
308
+ ascending=[False, False]).reset_index(drop=True)
224
309
  else:
225
- sorted_model_df = trained_models_info.sort_values(by='R2-score',
226
- ascending=False).reset_index(drop=True)
310
+ sorted_model_df = trained_models_info.sort_values(by='R2',
311
+ ascending=False).reset_index(drop=True)
312
+
227
313
 
228
314
  # Adding rank to leaderboard
229
- sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
230
-
231
- # Assuming 'sorted_df' is your DataFrame
232
- # Excluding the "last_col"
233
- leaderboard = sorted_model_df.drop("model-obj", axis=1)
315
+ sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)
316
+
317
+ # Internal Data list for leaderboard
318
+ dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]
319
+
320
+ # Excluding the model object and model name from leaderboard
321
+ leaderboard = sorted_model_df.drop(dp_lst, axis=1)
322
+
323
+ # filtering the rows based on the max_models
324
+ if self.max_models is not None:
325
+ leaderboard = leaderboard[leaderboard["RANK"] <= self.max_models]
234
326
 
235
327
  self._display_msg(msg="Leaderboard",
236
328
  progress_bar=self.progress_bar,
@@ -343,12 +435,12 @@ class _ModelTraining:
343
435
 
344
436
  PARAMETERS:
345
437
  num_rows:
346
- Required Arugment.
438
+ Required Argument.
347
439
  Specifies the number of rows in dataset.
348
440
  Types: int
349
441
 
350
442
  num_cols:
351
- Required Arugment.
443
+ Required Argument.
352
444
  Specifies the number of columns in dataset.
353
445
  Types: int
354
446
 
@@ -375,28 +467,24 @@ class _ModelTraining:
375
467
  max_depth.extend([6, 7, 8])
376
468
  min_node_size.extend([2])
377
469
  iter_num.extend([20])
378
- num_trees.extend([10, 20])
379
470
  elif num_rows < 10000 and num_cols < 15:
380
471
  min_impurity.extend([0.1, 0.2])
381
472
  shrinkage_factor.extend([0.1, 0.3])
382
473
  max_depth.extend([6, 8, 10])
383
474
  min_node_size.extend([2, 3])
384
475
  iter_num.extend([20, 30])
385
- num_trees.extend([20, 30])
386
476
  elif num_rows < 100000 and num_cols < 20:
387
477
  min_impurity.extend([0.2, 0.3])
388
478
  shrinkage_factor.extend([0.01, 0.1, 0.2])
389
479
  max_depth.extend([4, 6, 7])
390
480
  min_node_size.extend([3, 4])
391
481
  iter_num.extend([30, 40])
392
- num_trees.extend([30, 40])
393
482
  else:
394
483
  min_impurity.extend([0.1, 0.2, 0.3])
395
484
  shrinkage_factor.extend([0.01, 0.05, 0.1])
396
485
  max_depth.extend([3, 4, 7, 8])
397
486
  min_node_size.extend([2, 3, 4])
398
487
  iter_num.extend([20, 30, 40])
399
- num_trees.extend([20, 30, 40])
400
488
 
401
489
  # Hyperparameters for XGBoost model
402
490
  xgb_params = {
@@ -409,7 +497,8 @@ class _ModelTraining:
409
497
  'shrinkage_factor': tuple(shrinkage_factor),
410
498
  'max_depth': tuple(max_depth),
411
499
  'min_node_size': tuple(min_node_size),
412
- 'iter_num': tuple(iter_num)
500
+ 'iter_num': tuple(iter_num),
501
+ 'seed':42
413
502
  }
414
503
  # Hyperparameters for Decision Forest model
415
504
  df_params = {
@@ -419,7 +508,8 @@ class _ModelTraining:
419
508
  'min_impurity': tuple(min_impurity),
420
509
  'max_depth': tuple(max_depth),
421
510
  'min_node_size': tuple(min_node_size),
422
- 'num_trees': tuple(num_trees)
511
+ 'num_trees': tuple(num_trees),
512
+ 'seed':42
423
513
  }
424
514
 
425
515
  # Updating model type in case of classification
@@ -445,12 +535,12 @@ class _ModelTraining:
445
535
 
446
536
  PARAMETERS:
447
537
  num_rows
448
- Required Arugment.
538
+ Required Argument.
449
539
  Specifies the number of rows in dataset.
450
540
  Types: int
451
541
 
452
542
  num_cols:
453
- Required Arugment.
543
+ Required Argument.
454
544
  Specifies the number of columns in dataset.
455
545
  Types: int
456
546
 
@@ -482,12 +572,12 @@ class _ModelTraining:
482
572
 
483
573
  PARAMETERS:
484
574
  num_rows:
485
- Required Arugment.
575
+ Required Argument.
486
576
  Specifies the number of rows in dataset.
487
577
  Types: int
488
578
 
489
579
  num_cols:
490
- Required Arugment.
580
+ Required Argument.
491
581
  Specifies the number of columns in dataset.
492
582
  Types: int
493
583
 
@@ -616,6 +706,44 @@ class _ModelTraining:
616
706
  raise ValueError("No model is selected for training.")
617
707
 
618
708
  return parameters
709
+
710
+ def distribute_max_models(self):
711
+ """
712
+ DESCRIPTION:
713
+ Internal function to distribute max_models across available model functions.
714
+
715
+ RETURNS:
716
+ dictionary containing max_models distribution and list of models to remove.
717
+ """
718
+ # Getting total number of models
719
+ model_count=len(self.model_list)
720
+ # Evenly distributing max_models across models
721
+ base_assign = self.max_models // model_count
722
+ # Creating list of max_models for each model
723
+ distribution = [base_assign] * model_count
724
+
725
+ # Calculating remaining models
726
+ remaining_model_count = self.max_models % model_count
727
+ if remaining_model_count:
728
+ # distributing remaining model across models.
729
+ # Starting from first model in list and distributing remaining models by 1 each.
730
+ for i in range(remaining_model_count):
731
+ distribution[i] += 1
732
+
733
+ # Creating dictionary for model distribution
734
+ model_distribution = dict(zip(self.model_list, distribution))
735
+ # Getting list of models with 0 distribution and removing them from model list
736
+ # While for model having distribution greater than 0, updating distribution with
737
+ # 1/3rd of original value as we are training with 3 different feature selection methods.
738
+ models_to_remove = []
739
+ for model in self.model_list:
740
+ initial_count = model_distribution[model]
741
+ if initial_count == 0:
742
+ models_to_remove.append(model)
743
+ else:
744
+ model_distribution[model] = math.ceil(initial_count / 3)
745
+
746
+ return model_distribution, models_to_remove
619
747
 
620
748
  def _parallel_training(self, parameters):
621
749
  """
@@ -635,12 +763,15 @@ class _ModelTraining:
635
763
 
636
764
  # Hyperparameters for each model
637
765
  model_params = parameters[:min(len(parameters), 5)]
638
- self._display_msg(msg="\nPerforming hyperParameter tuning ...", progress_bar=self.progress_bar)
766
+ self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)
639
767
 
640
- # Defining training and testing data
768
+ # Defining training data
641
769
  data_types = ['lasso', 'rfe', 'pca']
642
770
  trainng_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_train']) for data_type in data_types)
643
- testing_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_test']) for data_type in data_types)
771
+
772
+ if self.task_type == "Classification":
773
+ response_values = trainng_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
774
+ self.output_response = [str(i) for i in response_values]
644
775
 
645
776
  if self.stopping_metric is None:
646
777
  self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
@@ -648,115 +779,31 @@ class _ModelTraining:
648
779
 
649
780
  self.max_runtime_secs = self.max_runtime_secs/len(model_params) \
650
781
  if self.max_runtime_secs is not None else None
782
+
783
+ if self.max_models is not None:
784
+ # Getting model distribution and models to remove
785
+ self.max_models_distribution, models_to_remove = self.distribute_max_models()
786
+ # Removing model parameters with 0 distribution
787
+ if len(models_to_remove):
788
+ for model in models_to_remove:
789
+ model_params = [param for param in model_params if param['name'] != model]
790
+ # Updating progress bar as we are removing model
791
+ self.progress_bar.update()
792
+
793
+ if self.is_classification_type():
794
+ self.startify_col = self.target_column
651
795
 
652
796
  trained_models = []
653
797
  for param in model_params:
654
- result = self._hyperparameter_tunning(param, trainng_datas, testing_datas)
798
+ result = self._hyperparameter_tunning(param, trainng_datas)
655
799
  trained_models.append(result)
656
800
 
657
801
  models_df = pd.concat(trained_models, ignore_index=True)
658
-
659
- # Score the model and combine the results into a single DataFrame
660
- trained_models_info = self._model_scoring(testing_datas, models_df)
661
- trained_models_info = trained_models_info.reset_index(drop=True)
662
-
663
- return trained_models_info
664
-
665
- def _model_scoring(self,
666
- test_data,
667
- model_info):
668
- """
669
- DESCRIPTION:
670
- Internal function generates the performance metrics for
671
- trained ML models using testing dataset.
672
-
673
- PARAMETERS:
674
- test_data
675
- Required Argument.
676
- Specifies the testing datasets
677
- Types: tuple of Teradataml DataFrame
678
-
679
- model_info
680
- Required Arugment.
681
- Specifies the trained models information.
682
- Types: Pandas DataFrame
683
-
684
- RETURNS:
685
- Pandas DataFrame containing, trained models with thier performance metrics.
686
- """
687
- self._display_msg(msg="Evaluating models performance ...",
688
- progress_bar = self.progress_bar,
689
- show_data=True)
690
- # Empty list for storing model performance metrics
691
- model_performance_data = []
692
-
693
- # Mapping feature selection methods to corresponding test data
694
- feature_selection_to_test_data = {"lasso": test_data[0],
695
- "rfe": test_data[1],
696
- "pca": test_data[2]}
697
-
698
- # Iterating over models
699
- for index, model_row in model_info.iterrows():
700
- # Extracting model name, feature selection method, and model object
701
- model_name, feature_selection, model_object = model_row['Name'], \
702
- model_row['Feature selection'], model_row['obj']
703
-
704
- # Selecting test data based on feature selection method
705
- test_set = feature_selection_to_test_data[feature_selection]
706
-
707
- # Model evaluation
708
- if model_name == 'knn':
709
- performance_metrics = model_object.evaluate(test_data=test_set)
710
- else:
711
- eval_params = self._eval_params_generation(model_name)
712
- performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
713
-
714
- # Extracting performance metrics
715
- if self.is_classification_type():
716
- # Classification
717
- # Extract performance metrics from the output data
718
- performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
719
-
720
- # Combine all the elements to form a new row
721
- new_row = [model_name, feature_selection] + performance_metrics_list + [model_object]
722
- else:
723
- # Regression
724
- regression_metrics = next(performance_metrics.result.itertuples())
725
- sample_size = test_set.select('id').size
726
- feature_count = len(test_set.columns) - 2
727
- r2_score = regression_metrics[8]
728
- adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
729
- new_row = [model_name, feature_selection, regression_metrics[0], regression_metrics[1], regression_metrics[2],
730
- regression_metrics[5], regression_metrics[6], r2_score, adjusted_r2_score, model_object]
731
-
732
- model_performance_data.append(new_row)
733
-
734
- if self.is_classification_type():
735
- model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Feature selection',
736
- 'Accuracy','Micro-Precision',
737
- 'Micro-Recall','Micro-F1',
738
- 'Macro-Precision','Macro-Recall',
739
- 'Macro-F1','Weighted-Precision',
740
- 'Weighted-Recall','Weighted-F1',
741
- 'model-obj'])
742
- else:
743
- model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name',
744
- 'Feature selection',
745
- 'MAE', 'MSE', 'MSLE',
746
- 'RMSE', 'RMSLE',
747
- 'R2-score',
748
- 'Adjusted R2-score',
749
- 'model-obj'])
750
- self._display_msg(msg="Evaluation completed.",
751
- progress_bar = self.progress_bar,
752
- show_data=True)
753
-
754
- return model_metrics_df
755
-
802
+ return models_df
803
+
756
804
  def _hyperparameter_tunning(self,
757
805
  model_param,
758
- train_data,
759
- test_data):
806
+ train_data):
760
807
  """
761
808
  DESCRIPTION:
762
809
  Internal function performs hyperparameter tuning on
@@ -764,18 +811,13 @@ class _ModelTraining:
764
811
 
765
812
  PARAMETERS:
766
813
  model_param
767
- Required Arugment.
814
+ Required Argument.
768
815
  Specifies the eval_params argument for GridSearch.
769
816
  Types: dict
770
817
 
771
818
  train_data:
772
- Required Arugment.
773
- Specifies the training datasets.
774
- Types: tuple of Teradataml DataFrame
775
-
776
- test_data
777
819
  Required Argument.
778
- Specifies the testing datasets
820
+ Specifies the training datasets.
779
821
  Types: tuple of Teradataml DataFrame
780
822
 
781
823
  RETURNS:
@@ -786,21 +828,42 @@ class _ModelTraining:
786
828
  "xgboost": XGBoost, "decision_forest": DecisionForest, "knn": KNN}
787
829
 
788
830
  # Setting eval_params for hpt.
789
- eval_params = self._eval_params_generation(model_param['name'])
831
+ eval_params = _ModelTraining._eval_params_generation(model_param['name'],
832
+ self.target_column,
833
+ self.task_type)
790
834
 
791
835
  # Input columns for model
792
836
  model_param['input_columns'] = self.features
793
837
 
838
+ # Setting persist for model
839
+ model_param['persist'] = self.persist
840
+
794
841
  self._display_msg(msg=model_param['name'],
795
842
  progress_bar=self.progress_bar,
796
843
  show_data=True)
797
844
 
798
- # Defining test data for KNN
845
+ # The entire prepared training data is used for HPT training, so the
846
+ # same training data is passed as test_data for KNN.
799
847
  if model_param['name'] == 'knn':
800
- model_param['test_data'] = test_data
848
+ model_param['test_data'] = train_data
801
849
 
802
- # Defining Gridsearch with ML model based on Name
803
- _obj = GridSearch(func=model_to_func[model_param['name']], params=model_param)
850
+ if self.task_type == "Classification":
851
+ model_param['output_prob'] = True
852
+ model_param['output_responses'] = self.output_response
853
+
854
+ # Using RandomSearch for hyperparameter tuning when max_models is given.
855
+ # Otherwise, using GridSearch for hyperparameter tuning.
856
+ if self.max_models is not None:
857
+ # Setting max_models for RandomSearch based on model name
858
+ model_param['max_models'] = self.max_models_distribution[model_param['name']]
859
+ # Defining RandomSearch with ML model based on Name, and max_models
860
+ _obj = RandomSearch(func=model_to_func[model_param['name']],
861
+ params=model_param,
862
+ n_iter=model_param['max_models'])
863
+ else:
864
+ # Defining Gridsearch with ML model based on Name
865
+ _obj = GridSearch(func=model_to_func[model_param['name']],
866
+ params=model_param)
804
867
 
805
868
  if self.verbose > 0:
806
869
  print(" " *200, end='\r', flush=True)
@@ -813,46 +876,54 @@ class _ModelTraining:
813
876
  _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
814
877
  early_stop=self.stopping_tolerance, run_parallel=True,
815
878
  sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
816
- verbose=verbose, max_time=self.max_runtime_secs)
879
+ stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
817
880
  else:
818
881
  _obj.fit(data=train_data, evaluation_metric=self.stopping_metric,
819
882
  early_stop=self.stopping_tolerance, **eval_params,
820
883
  run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
821
- sample_id_column='id', verbose=verbose, max_time=self.max_runtime_secs)
884
+ sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
822
885
 
823
886
  # Getting all passed models
824
- _df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
825
-
826
- # Mapping data ID to DataFrame
827
- data_id_to_df = {"DF_0": _df[_df['DATA_ID']=='DF_0'],
828
- "DF_1": _df[_df['DATA_ID']=='DF_1'],
829
- "DF_2": _df[_df['DATA_ID']=='DF_2']}
830
-
831
- # Returns best model within a Data_ID group
832
- # get_best_model = lambda df: df.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'], ascending=[False, False]).iloc[0]['MODEL_ID']\
833
- # if self.task_type != 'Regression' else df.sort_values(by=['R2', 'MAE'], ascending=[False, False]).iloc[0]['MODEL_ID']
834
- get_best_model = lambda df, stats: df.sort_values(by=stats, ascending=[False, False]).iloc[0]['MODEL_ID']
835
-
836
- # best_model = get_best_model(data_id_to_df[data_id], stats)
837
- stats = ['MICRO-F1', 'WEIGHTED-F1'] if self.task_type != 'Regression' else ['R2', 'MAE']
838
- model_info_data = []
839
- # Extracting best model
840
- for data_id, df_name in zip(["DF_0", "DF_1", "DF_2"], ["lasso", "rfe", "pca"]):
841
- if not data_id_to_df[data_id].empty:
842
- best_model = get_best_model(data_id_to_df[data_id], stats)
843
- model_info_data.append([model_param['name'], df_name, _obj.get_model(best_model)])
844
- self._display_msg(inline_msg=best_model, progress_bar=self.progress_bar)
845
-
846
- model_info = pd.DataFrame(data=model_info_data, columns=["Name",'Feature selection', "obj"])
887
+ model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
888
+ on='MODEL_ID', how='inner')
889
+ # Creating mapping of data ID to feature selection method and training table name
890
+ data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
891
+ "DF_1": ('rfe', train_data[1]._table_name),
892
+ "DF_2": ('pca', train_data[2]._table_name)}
893
+
894
+ # Updating model stats with feature selection method and result table
895
+ for index, row in model_info.iterrows():
896
+ model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
897
+ model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
898
+ model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
899
+ model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
900
+
901
+ # Dropping column 'DATA_ID'
902
+ model_info.drop(['DATA_ID'], axis=1, inplace=True)
903
+
904
+ model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
905
+
906
+ if not self.is_classification_type():
907
+ # Calculating Adjusted-R2 for regression
908
+ # Getting size and feature count for each feature selection method
909
+ methods = ["lasso", "rfe", "pca"]
910
+ size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
911
+ feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
912
+ model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
913
+ 1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
914
+ (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
915
+
847
916
  self._display_msg(msg="-"*100,
848
917
  progress_bar=self.progress_bar,
849
918
  show_data=True)
850
919
  self.progress_bar.update()
851
920
 
852
921
  return model_info
853
-
854
- def _eval_params_generation(self,
855
- ml_name):
922
+
923
+ @staticmethod
924
+ def _eval_params_generation(ml_name,
925
+ target_column,
926
+ task_type):
856
927
  """
857
928
  DESCRIPTION:
858
929
  Internal function generates the eval_params for
@@ -860,28 +931,59 @@ class _ModelTraining:
860
931
 
861
932
  PARAMETERS:
862
933
  ml_name
863
- Required Arugment.
934
+ Required Argument.
864
935
  Specifies the ML name for eval_params generation.
865
936
  Types: str
937
+
938
+ target_column
939
+ Required Argument.
940
+ Specifies the target column.
941
+ Types: str
866
942
 
943
+ task_type:
944
+ Required Argument.
945
+ Specifies the task type for AutoML, whether to apply regression
946
+ or classification on the provided dataset.
947
+ Default Value: "Regression"
948
+ Permitted Values: "Regression", "Classification"
949
+ Types: str
950
+
867
951
  RETURNS:
868
952
  dict containing, eval_params for ML model.
869
953
  """
870
954
  # Setting the eval_params
871
955
  eval_params = {"id_column": "id",
872
- "accumulate": self.target_column}
956
+ "accumulate": target_column}
957
+
958
+ model_type = {
959
+ 'xgboost': 'model_type',
960
+ 'glm': 'model_type',
961
+ 'decisionforest': 'tree_type',
962
+ 'svm': 'model_type',
963
+ 'knn': 'model_type'
964
+ }
965
+
966
+ ml_name = ml_name.replace('_', '').lower()
873
967
 
874
968
  # For Classification
875
- if self.task_type != "Regression":
969
+ if task_type.lower() != "regression":
970
+ eval_params[model_type[ml_name]] = 'Classification'
971
+ eval_params['output_prob'] = True
972
+
876
973
  if ml_name == 'xgboost':
877
- eval_params['model_type'] = 'Classification'
878
974
  eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
879
- else:
880
- eval_params['output_prob'] = True
975
+
976
+ elif ml_name == 'glm':
977
+ eval_params['family'] = 'BINOMIAL'
978
+
881
979
  else:
882
980
  # For Regression
981
+ eval_params[model_type[ml_name]] = 'Regression'
982
+
883
983
  if ml_name == 'xgboost':
884
- eval_params['model_type'] = 'Regression'
885
984
  eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
985
+
986
+ elif ml_name == 'glm':
987
+ eval_params['family'] = 'GAUSSIAN'
886
988
 
887
989
  return eval_params