teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of teradataml might be problematic. Click here for more details.

Files changed (263)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +183 -0
  4. teradataml/__init__.py +6 -3
  5. teradataml/_version.py +2 -2
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +275 -40
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +1 -0
  11. teradataml/analytics/json_parser/utils.py +17 -21
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +10 -2
  15. teradataml/analytics/table_operator/__init__.py +3 -2
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +62 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1553 -319
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +276 -319
  22. teradataml/automl/data_transformation.py +163 -81
  23. teradataml/automl/feature_engineering.py +402 -239
  24. teradataml/automl/feature_exploration.py +9 -2
  25. teradataml/automl/model_evaluation.py +48 -51
  26. teradataml/automl/model_training.py +291 -189
  27. teradataml/catalog/byom.py +8 -8
  28. teradataml/catalog/model_cataloging_utils.py +1 -1
  29. teradataml/clients/auth_client.py +133 -0
  30. teradataml/clients/pkce_client.py +1 -1
  31. teradataml/common/aed_utils.py +3 -2
  32. teradataml/common/constants.py +48 -6
  33. teradataml/common/deprecations.py +13 -7
  34. teradataml/common/garbagecollector.py +156 -120
  35. teradataml/common/messagecodes.py +6 -1
  36. teradataml/common/messages.py +3 -1
  37. teradataml/common/sqlbundle.py +1 -1
  38. teradataml/common/utils.py +103 -11
  39. teradataml/common/wrapper_utils.py +1 -1
  40. teradataml/context/context.py +121 -31
  41. teradataml/data/advertising.csv +201 -0
  42. teradataml/data/bank_marketing.csv +11163 -0
  43. teradataml/data/bike_sharing.csv +732 -0
  44. teradataml/data/boston2cols.csv +721 -0
  45. teradataml/data/breast_cancer.csv +570 -0
  46. teradataml/data/complaints_test_tokenized.csv +353 -0
  47. teradataml/data/complaints_tokens_model.csv +348 -0
  48. teradataml/data/covid_confirm_sd.csv +83 -0
  49. teradataml/data/customer_segmentation_test.csv +2628 -0
  50. teradataml/data/customer_segmentation_train.csv +8069 -0
  51. teradataml/data/dataframe_example.json +10 -0
  52. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  53. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  54. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  55. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  56. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  57. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  58. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  59. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  60. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  61. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  62. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  63. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  64. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  65. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  66. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  67. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  68. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  69. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  70. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  71. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  72. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  73. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  74. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  75. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  76. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  77. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  78. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  79. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  80. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  81. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  82. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  83. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  84. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  85. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  86. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  87. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  88. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  89. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  90. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  91. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  92. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  93. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  94. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  95. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  96. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  97. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  98. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  99. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  100. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  101. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  102. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  103. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  104. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  105. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  106. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  107. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  108. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  109. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  110. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  111. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  112. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  113. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  114. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  115. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  116. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  117. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  118. teradataml/data/dwt2d_dataTable.csv +65 -0
  119. teradataml/data/dwt_dataTable.csv +8 -0
  120. teradataml/data/dwt_filterTable.csv +3 -0
  121. teradataml/data/finance_data4.csv +13 -0
  122. teradataml/data/glm_example.json +28 -1
  123. teradataml/data/grocery_transaction.csv +19 -0
  124. teradataml/data/housing_train_segment.csv +201 -0
  125. teradataml/data/idwt2d_dataTable.csv +5 -0
  126. teradataml/data/idwt_dataTable.csv +8 -0
  127. teradataml/data/idwt_filterTable.csv +3 -0
  128. teradataml/data/insect2Cols.csv +61 -0
  129. teradataml/data/interval_data.csv +5 -0
  130. teradataml/data/jsons/paired_functions.json +14 -0
  131. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  132. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  133. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  134. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  135. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  136. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  137. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  138. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  139. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  140. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  141. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  142. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  143. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  144. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  145. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  146. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  147. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  148. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  149. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  150. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  151. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  152. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  153. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  154. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  155. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  156. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  157. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  158. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  159. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  160. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  161. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  162. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  163. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  164. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  165. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  166. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  167. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  168. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  169. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  170. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  171. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  172. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  173. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  174. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  175. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  176. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  177. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  178. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  179. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  180. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  181. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  182. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  183. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  184. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  185. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  186. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  187. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  188. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  189. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  190. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  191. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  192. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  193. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  194. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  195. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  196. teradataml/data/kmeans_example.json +5 -0
  197. teradataml/data/kmeans_table.csv +10 -0
  198. teradataml/data/load_example_data.py +8 -2
  199. teradataml/data/naivebayestextclassifier_example.json +1 -1
  200. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  201. teradataml/data/onehot_encoder_train.csv +4 -0
  202. teradataml/data/openml_example.json +29 -0
  203. teradataml/data/peppers.png +0 -0
  204. teradataml/data/real_values.csv +14 -0
  205. teradataml/data/sax_example.json +8 -0
  206. teradataml/data/scale_attributes.csv +3 -0
  207. teradataml/data/scale_example.json +52 -1
  208. teradataml/data/scale_input_part_sparse.csv +31 -0
  209. teradataml/data/scale_input_partitioned.csv +16 -0
  210. teradataml/data/scale_input_sparse.csv +11 -0
  211. teradataml/data/scale_parameters.csv +3 -0
  212. teradataml/data/scripts/deploy_script.py +21 -2
  213. teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
  214. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
  215. teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
  216. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  217. teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
  218. teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
  219. teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
  220. teradataml/data/star_pivot.csv +8 -0
  221. teradataml/data/templates/open_source_ml.json +2 -1
  222. teradataml/data/teradataml_example.json +97 -1
  223. teradataml/data/timestamp_data.csv +4 -0
  224. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  225. teradataml/data/uaf_example.json +55 -1
  226. teradataml/data/unpivot_example.json +15 -0
  227. teradataml/data/url_data.csv +9 -0
  228. teradataml/data/windowdfft.csv +16 -0
  229. teradataml/data/ztest_example.json +16 -0
  230. teradataml/dataframe/copy_to.py +9 -4
  231. teradataml/dataframe/data_transfer.py +125 -64
  232. teradataml/dataframe/dataframe.py +575 -57
  233. teradataml/dataframe/dataframe_utils.py +47 -9
  234. teradataml/dataframe/fastload.py +273 -90
  235. teradataml/dataframe/functions.py +339 -0
  236. teradataml/dataframe/row.py +160 -0
  237. teradataml/dataframe/setop.py +2 -2
  238. teradataml/dataframe/sql.py +740 -18
  239. teradataml/dataframe/window.py +1 -1
  240. teradataml/dbutils/dbutils.py +324 -18
  241. teradataml/geospatial/geodataframe.py +1 -1
  242. teradataml/geospatial/geodataframecolumn.py +1 -1
  243. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  244. teradataml/lib/aed_0_1.dll +0 -0
  245. teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
  246. teradataml/options/__init__.py +16 -5
  247. teradataml/options/configure.py +39 -6
  248. teradataml/options/display.py +2 -2
  249. teradataml/plot/axis.py +4 -4
  250. teradataml/scriptmgmt/UserEnv.py +26 -19
  251. teradataml/scriptmgmt/lls_utils.py +120 -16
  252. teradataml/table_operators/Script.py +4 -5
  253. teradataml/table_operators/TableOperator.py +160 -26
  254. teradataml/table_operators/table_operator_util.py +88 -41
  255. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  256. teradataml/telemetry_utils/__init__.py +0 -0
  257. teradataml/telemetry_utils/queryband.py +52 -0
  258. teradataml/utils/validators.py +41 -3
  259. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
  260. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
  261. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  262. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  263. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
@@ -15,21 +15,29 @@
15
15
 
16
16
  # Python libraries
17
17
  import json
18
+ import pandas as pd
18
19
  import numpy as np
19
20
  from sklearn.metrics import confusion_matrix
20
21
  import time
22
+ import ast
23
+ import warnings
24
+ import joblib
25
+ from io import BytesIO
21
26
 
22
27
  # Teradata libraries
23
28
  from teradataml.dataframe.copy_to import copy_to_sql
24
29
  from teradataml import ColumnExpression
25
30
  from teradataml.dataframe.dataframe import DataFrame
31
+ from teradataml.utils.utils import execute_sql
26
32
  from teradataml.utils.validators import _Validators
27
- from teradataml import ROC
28
- from teradataml.common.utils import UtilFuncs
33
+ from teradataml import ROC, BLOB
29
34
  from teradataml.utils.dtypes import _Dtypes
30
35
  from teradataml.common.utils import UtilFuncs
31
36
  from teradataml import TeradataMlException
32
37
  from teradataml.common.messages import Messages, MessageCodes
38
+ from teradataml.telemetry_utils.queryband import collect_queryband
39
+ from teradataml import TeradataConstants
40
+ from teradataml import XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table
33
41
 
34
42
  # AutoML Internal libraries
35
43
  from teradataml.automl.data_preparation import _DataPreparation
@@ -51,7 +59,9 @@ class AutoML:
51
59
  max_runtime_secs = None,
52
60
  stopping_metric = None,
53
61
  stopping_tolerance = None,
54
- custom_config_file = None):
62
+ max_models = None,
63
+ custom_config_file = None,
64
+ **kwargs):
55
65
  """
56
66
  DESCRIPTION:
57
67
  AutoML (Automated Machine Learning) is an approach that automates the process
@@ -82,12 +92,12 @@ class AutoML:
82
92
  AutoML also provides an option to customize the processes within feature
83
93
  engineering, data preparation and model training phases. User can customize
84
94
  the processes by passing the JSON file path in case of custom run. It also
85
- supports early stopping of model training based on stopping metrics and
86
- maximum running time.
95
+ supports early stopping of model training based on stopping metrics,
96
+ maximum running time and maximum models to be trained.
87
97
 
88
98
  PARAMETERS:
89
99
  task_type:
90
- Optional Arugment.
100
+ Optional Argument.
91
101
  Specifies the task type for AutoML, whether to apply regression OR classification
92
102
  on the provided dataset. If user wants AutoML to decide the task type automatically,
93
103
  then it should be set to "Default".
@@ -122,7 +132,7 @@ class AutoML:
122
132
  Types: int
123
133
 
124
134
  max_runtime_secs:
125
- Optional Arugment.
135
+ Optional Argument.
126
136
  Specifies the time limit in seconds for model training.
127
137
  Types: int
128
138
 
@@ -130,8 +140,10 @@ class AutoML:
130
140
  Required, when "stopping_tolerance" is set, otherwise optional.
131
141
  Specifies the stopping metrics for stopping tolerance in model training.
132
142
  Permitted Values:
133
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
134
- "RMSE", "RMSLE"
143
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
144
+ "MAPE", "MPE", "RMSE", "RMSLE",
145
+ "ME", "EV", "MPD", "MGD"
146
+
135
147
  * For task_type "Classification": 'MICRO-F1','MACRO-F1',
136
148
  'MICRO-RECALL','MACRO-RECALL',
137
149
  'MICRO-PRECISION', 'MACRO-PRECISION',
@@ -143,11 +155,38 @@ class AutoML:
143
155
  Required, when "stopping_metric" is set, otherwise optional.
144
156
  Specifies the stopping tolerance for stopping metrics in model training.
145
157
  Types: float
158
+
159
+ max_models:
160
+ Optional Argument.
161
+ Specifies the maximum number of models to be trained.
162
+ Types: int
146
163
 
147
164
  custom_config_file:
148
165
  Optional Argument.
149
166
  Specifies the path of JSON file in case of custom run.
150
167
  Types: str
168
+
169
+ **kwargs:
170
+ Specifies the additional arguments for AutoML. Below
171
+ are the additional arguments:
172
+ volatile:
173
+ Optional Argument.
174
+ Specifies whether to put the interim results of the
175
+ functions in a volatile table or not. When set to
176
+ True, results are stored in a volatile table,
177
+ otherwise not.
178
+ Default Value: False
179
+ Types: bool
180
+
181
+ persist:
182
+ Optional Argument.
183
+ Specifies whether to persist the interim results of the
184
+ functions in a table or not. When set to True,
185
+ results are persisted in a table; otherwise,
186
+ results are garbage collected at the end of the
187
+ session.
188
+ Default Value: False
189
+ Types: bool
151
190
 
152
191
  RETURNS:
153
192
  Instance of AutoML.
@@ -185,24 +224,28 @@ class AutoML:
185
224
 
186
225
  # Fit the data.
187
226
  >>> automl_obj.fit(admissions_train, "admitted")
188
-
189
- # Run predict with best performing model.
190
- >>> prediction = automl_obj.predict()
191
- >>> prediction
192
-
193
- # Run predict for new test data with best performing model.
194
- >>> prediction = automl_obj.predict(admissions_test)
195
- >>> prediction
196
227
 
197
- # Run predict for new test data with second best performing model.
198
- >>> prediction = automl_obj.predict(admissions_test, rank=2)
199
- >>> prediction
200
-
201
228
  # Display leaderboard.
202
229
  >>> automl_obj.leaderboard()
203
230
 
204
231
  # Display best performing model.
205
232
  >>> automl_obj.leader()
233
+
234
+ # Run predict on test data using best performing model.
235
+ >>> prediction = automl_obj.predict(admissions_test)
236
+ >>> prediction
237
+
238
+ # Run predict on test data using second best performing model.
239
+ >>> prediction = automl_obj.predict(admissions_test, rank=2)
240
+ >>> prediction
241
+
242
+ # Run evaluate to get performance metrics using best performing model.
243
+ >>> performance_metrics = automl_obj.evaluate(admissions_test)
244
+ >>> performance_metrics
245
+
246
+ # Run evaluate to get performance metrics using model rank 3.
247
+ >>> performance_metrics = automl_obj.evaluate(admissions_test, rank=3)
248
+ >>> performance_metrics
206
249
 
207
250
  # Example 2 : Run AutoML for regression problem.
208
251
  # Scenario : Predict the price of house based on different factors.
@@ -221,24 +264,28 @@ class AutoML:
221
264
  >>> custom_config_file="custom_housing.json")
222
265
  # Fit the data.
223
266
  >>> automl_obj.fit(housing_train, "price")
224
-
225
- # Run predict with best performing model.
226
- >>> prediction = automl_obj.predict()
227
- >>> prediction
228
267
 
229
- # Run predict for new test data with best performing model.
230
- >>> prediction = automl_obj.predict(housing_test)
231
- >>> prediction
232
-
233
- # Run predict for new test data with second best performing model.
234
- >>> prediction = automl_obj.predict(housing_test, rank=2)
235
- >>> prediction
236
-
237
268
  # Display leaderboard.
238
269
  >>> automl_obj.leaderboard()
239
270
 
240
271
  # Display best performing model.
241
272
  >>> automl_obj.leader()
273
+
274
+ # Run predict on test data using best performing model.
275
+ >>> prediction = automl_obj.predict(housing_test)
276
+ >>> prediction
277
+
278
+ # Run predict on test data using second best performing model.
279
+ >>> prediction = automl_obj.predict(housing_test, rank=2)
280
+ >>> prediction
281
+
282
+ # Run evaluate to get performance metrics using best performing model.
283
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
284
+ >>> performance_metrics
285
+
286
+ # Run evaluate to get performance metrics using second best performing model.
287
+ >>> performance_metrics = automl_obj.evaluate(housing_test, rank=2)
288
+ >>> performance_metrics
242
289
 
243
290
  # Example 3 : Run AutoML for multiclass classification problem.
244
291
  # Scenario : Predict the species of iris flower based on different
@@ -246,6 +293,11 @@ class AutoML:
246
293
  # different processes of AutoML Run to get the best
247
294
  # performing model out of available models.
248
295
 
296
+ # Split the data into train and test.
297
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
298
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
299
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
300
+
249
301
  # Generate custom JSON file
250
302
  >>> AutoML.generate_custom_config()
251
303
 
@@ -253,22 +305,23 @@ class AutoML:
253
305
  >>> automl_obj = AutoML(verbose=2,
254
306
  >>> exclude="xgboost",
255
307
  >>> custom_config_file="custom.json")
308
+
256
309
  # Fit the data.
257
- >>> automl_obj.fit(iris_input, iris_input.species)
258
-
259
- # Run predict with best performing model.
260
- >>> prediction = automl_obj.predict()
261
- >>> prediction
262
-
263
- # Run predict with second best performing model.
264
- >>> prediction = automl_obj.predict(rank=2)
265
- >>> prediction
310
+ >>> automl_obj.fit(iris_train, iris_train.species)
266
311
 
267
312
  # Display leaderboard.
268
313
  >>> automl_obj.leaderboard()
269
314
 
270
315
  # Display best performing model.
271
316
  >>> automl_obj.leader()
317
+
318
+ # Run predict on test data using second best performing model.
319
+ >>> prediction = automl_obj.predict(iris_test, rank=2)
320
+ >>> prediction
321
+
322
+ # Run evaluate to get performance metrics using best performing model.
323
+ >>> performance_metrics = automl_obj.evaluate(iris_test)
324
+ >>> performance_metrics
272
325
 
273
326
  # Example 4 : Run AutoML for regression problem with early stopping metric and tolerance.
274
327
  # Scenario : Predict the price of house based on different factors.
@@ -285,41 +338,61 @@ class AutoML:
285
338
  >>> exclude="xgboost",
286
339
  >>> stopping_metric="R2",
287
340
  >>> stopping_tolerance=0.7,
341
+ >>> max_models=10,
288
342
  >>> custom_config_file="custom_housing.json")
289
343
  # Fit the data.
290
344
  >>> automl_obj.fit(housing_train, "price")
291
-
292
- # Run predict with best performing model.
293
- >>> prediction = automl_obj.predict()
294
- >>> prediction
295
-
345
+
296
346
  # Display leaderboard.
297
347
  >>> automl_obj.leaderboard()
348
+
349
+ # Run predict on test data using best performing model.
350
+ >>> prediction = automl_obj.predict(housing_test)
351
+ >>> prediction
352
+
353
+ # Run evaluate to get performance metrics using best performing model.
354
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
355
+ >>> performance_metrics
298
356
 
299
357
  # Example 5 : Run AutoML for regression problem with maximum runtime.
300
358
  # Scenario : Predict the species of iris flower based on different factors.
301
359
  # Run AutoML to get the best performing model in specified time.
302
360
 
361
+ # Split the data into train and test.
362
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
363
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
364
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
365
+
303
366
  # Create instance of AutoML.
304
367
  >>> automl_obj = AutoML(verbose=2,
305
368
  >>> exclude="xgboost",
306
- >>> max_runtime_secs=500)
369
+ >>> max_runtime_secs=500,
370
+ >>> max_models=3)
371
+
307
372
  # Fit the data.
308
- >>> automl_obj.fit(iris_input, iris_input.species)
309
-
310
- # Run predict with best performing model.
311
- >>> prediction = automl_obj.predict()
312
- >>> prediction
313
-
314
- # Run predict with second best performing model.
315
- >>> prediction = automl_obj.predict(rank=2)
316
- >>> prediction
317
-
373
+ >>> automl_obj.fit(iris_train, iris_train.species)
374
+
318
375
  # Display leaderboard.
319
376
  >>> automl_obj.leaderboard()
320
377
 
321
378
  # Display best performing model.
322
- >>> automl_obj.leader()
379
+ >>> automl_obj.leader()
380
+
381
+ # Run predict on test data using best performing model.
382
+ >>> prediction = automl_obj.predict(iris_test)
383
+ >>> prediction
384
+
385
+ # Run predict on test data using second best performing model.
386
+ >>> prediction = automl_obj.predict(iris_test, rank=2)
387
+ >>> prediction
388
+
389
+ # Run evaluate to get performance metrics using best performing model.
390
+ >>> performance_metrics = automl_obj.evaluate(iris_test)
391
+ >>> performance_metrics
392
+
393
+ # Run evaluate to get performance metrics using model rank 4.
394
+ >>> performance_metrics = automl_obj.evaluate(iris_test, 4)
395
+ >>> performance_metrics
323
396
  """
324
397
  # Appending arguments to list for validation
325
398
  arg_info_matrix = []
@@ -330,25 +403,36 @@ class AutoML:
330
403
  "decision_forest", "xgboost"]])
331
404
  arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
332
405
  arg_info_matrix.append(["max_runtime_secs", max_runtime_secs, True, (int, float)])
333
- arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", 'MAE',
334
- 'MSE', 'MSLE',
335
- 'RMSE', 'RMSLE',
406
+ arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", "MAE", "MSE", "MSLE",
407
+ "MAPE", "MPE", "RMSE", "RMSLE",
408
+ "ME", "EV", "MPD", "MGD",
336
409
  'MICRO-F1','MACRO-F1',
337
410
  'MICRO-RECALL','MACRO-RECALL',
338
411
  'MICRO-PRECISION', 'MACRO-PRECISION',
339
412
  'WEIGHTED-PRECISION','WEIGHTED-RECALL',
340
413
  'WEIGHTED-F1', 'ACCURACY']])
341
414
  arg_info_matrix.append(["stopping_tolerance", stopping_tolerance, True, (float, int)])
415
+ arg_info_matrix.append(["max_models", max_models, True, (int)])
342
416
  arg_info_matrix.append(["custom_config_file", custom_config_file, True, (str), True])
343
-
417
+
418
+ volatile = kwargs.get('volatile', False)
419
+ persist = kwargs.get('persist', False)
420
+
421
+ arg_info_matrix.append(["volatile", volatile, True, (bool)])
422
+ arg_info_matrix.append(["persist", persist, True, (bool)])
344
423
 
345
424
  # Validate argument types
346
425
  _Validators._validate_function_arguments(arg_info_matrix)
347
426
  # Either include or exclude can be used.
348
427
  if include is not None or exclude is not None:
349
428
  _Validators._validate_mutually_exclusive_arguments(include, "include", exclude, "exclude")
429
+ # Either volatile or persist can be used.
430
+ if volatile and persist:
431
+ _Validators._validate_mutually_exclusive_arguments(volatile, "volatlie", persist, "persist")
350
432
  # Validate mutually inclusive arguments
351
433
  _Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric", stopping_tolerance, "stopping_tolerance")
434
+ # Validate lower range for max_models
435
+ _Validators._validate_argument_range(max_models, "max_models", lbound=1, lbound_inclusive=True)
352
436
 
353
437
  custom_data = None
354
438
  self.auto = True
@@ -375,10 +459,15 @@ class AutoML:
375
459
  self.max_runtime_secs = max_runtime_secs
376
460
  self.stopping_metric = stopping_metric
377
461
  self.stopping_tolerance = stopping_tolerance
462
+ self.max_models = max_models
378
463
  self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
379
464
  self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
380
465
  self._is_fit_called = False
466
+ self._is_load_model_called = False
467
+ self.kwargs = kwargs
468
+ self.table_name_mapping={}
381
469
 
470
+ @collect_queryband(queryband="AutoML_fit")
382
471
  def fit(self,
383
472
  data,
384
473
  target_column):
@@ -394,7 +483,7 @@ class AutoML:
394
483
  Types: teradataml Dataframe
395
484
 
396
485
  target_column:
397
- Required Arugment.
486
+ Required Argument.
398
487
  Specifies target column of dataset.
399
488
  Types: str or ColumnExpression
400
489
 
@@ -475,7 +564,9 @@ class AutoML:
475
564
  _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
476
565
  else:
477
566
  if self.stopping_metric is not None:
478
- permitted_values = ["R2", 'MAE', 'MSE', 'MSLE','RMSE', 'RMSLE']
567
+ permitted_values = ["R2", "MAE", "MSE", "MSLE",
568
+ "MAPE", "MPE", "RMSE", "RMSLE",
569
+ "ME", "EV", "MPD", "MGD"]
479
570
  _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
480
571
 
481
572
  if not self.is_classification_type():
@@ -484,7 +575,7 @@ class AutoML:
484
575
 
485
576
  # Displaying received custom input
486
577
  if self.custom_data:
487
- print("\n Received below input for customization : ")
578
+ print("\nReceived below input for customization : ")
488
579
  print(json.dumps(self.custom_data, indent=4))
489
580
 
490
581
  # Classification problem
@@ -500,38 +591,39 @@ class AutoML:
500
591
  clf = task_cls(self.data, self.target_column, self.custom_data)
501
592
 
502
593
  self.model_info, self.leader_board, self.target_count, self.target_label, \
503
- self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
504
- model_list = self.model_list,
505
- auto = self.auto,
506
- verbose = self.verbose,
507
- max_runtime_secs = self.max_runtime_secs,
508
- stopping_metric = self.stopping_metric,
509
- stopping_tolerance = self.stopping_tolerance
510
- )
594
+ self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
595
+ model_list = self.model_list,
596
+ auto = self.auto,
597
+ verbose = self.verbose,
598
+ max_runtime_secs = self.max_runtime_secs,
599
+ stopping_metric = self.stopping_metric,
600
+ stopping_tolerance = self.stopping_tolerance,
601
+ max_models = self.max_models,
602
+ **self.kwargs)
603
+
511
604
  # Model Evaluation Phase
512
605
  self.m_evaluator = _ModelEvaluator(self.model_info,
513
606
  self.target_column,
514
607
  self.task_type)
515
608
 
609
+ @collect_queryband(queryband="AutoML_predict")
516
610
  def predict(self,
517
- data = None,
518
- rank = 1):
611
+ data,
612
+ rank = 1,
613
+ use_loaded_models = False):
519
614
  """
520
615
  DESCRIPTION:
521
- Function generates prediction on either default test data or any other data
522
- using model rank in leaderboard and displays performance metrics
523
- of the specified model.
524
-
525
- If test data contains target column, then it displays both prediction
526
- and performance metrics, otherwise displays only prediction.
616
+ Function generates prediction on data using model rank in
617
+ leaderboard.
618
+ Note:
619
+ * If both fit and load method are called before predict, then fit method model will be used
620
+ for prediction by default unless 'use_loaded_models' is set to True in predict.
527
621
 
528
622
  PARAMETERS:
529
623
  data:
530
- Optional Argument.
531
- Specifies the dataset on which prediction and performance
532
- metrices needs to be generated using model rank in leaderboard.
533
- When "data" is not specified default test data is used. Default
534
- test data is the dataset generated at the time of training.
624
+ Required Argument.
625
+ Specifies the dataset on which prediction needs to be generated
626
+ using model rank in leaderboard.
535
627
  Types: teradataml DataFrame
536
628
 
537
629
  rank:
@@ -539,6 +631,12 @@ class AutoML:
539
631
  Specifies the rank of the model in the leaderboard to be used for prediction.
540
632
  Default Value: 1
541
633
  Types: int
634
+
635
+ use_loaded_models:
636
+ Optional Argument.
637
+ Specifies whether to use loaded models from database for prediction or not.
638
+ Default Value: False
639
+ Types: bool
542
640
 
543
641
  RETURNS:
544
642
  Pandas DataFrame with predictions.
@@ -552,88 +650,84 @@ class AutoML:
552
650
  # Perform fit() operation on the "automl_obj".
553
651
  # Perform predict() operation on the "automl_obj".
554
652
 
555
- # Example 1: Run predict with best performing model.
556
- >>> prediction = automl_obj.predict()
557
- >>> prediction
558
-
559
- # Example 2: Run predict with second best performing model.
560
- >>> prediction = automl_obj.predict(rank=2)
561
- >>> prediction
562
-
563
- # Example 3: Run predict for new test data with best performing model.
653
+ # Example 1: Run predict on test data using best performing model.
564
654
  >>> prediction = automl_obj.predict(admissions_test)
565
655
  >>> prediction
566
656
 
567
- # Example 4: Run predict for new test data with second best performing model.
657
+ # Example 2: Run predict on test data using second best performing model.
568
658
  >>> prediction = automl_obj.predict(admissions_test, rank=2)
569
659
  >>> prediction
660
+
661
+ # Example 3: Run predict on test data using loaded model.
662
+ >>> automl_obj.load("model_table")
663
+ >>> prediction = automl_obj.predict(admissions_test, rank=3)
664
+ >>> prediction
665
+
666
+ # Example 4: Run predict on test data using loaded model when fit is also called.
667
+ >>> automl_obj.fit(admissions_train, "admitted")
668
+ >>> automl_obj.load("model_table")
669
+ >>> prediction = automl_obj.predict(admissions_test, rank=3, use_loaded_models=True)
670
+ >>> prediction
570
671
  """
571
- if not self._is_fit_called:
572
- # raise ValueError("fit() method must be called before generating prediction.")
672
+ # Checking if fit or load model is called before predict, If not raise error
673
+ if not self._is_fit_called and not self._is_load_model_called:
573
674
  err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
574
675
  "'predict' method", \
575
- "'fit' method must be called before" \
676
+ "'fit' or 'load' method must be called before" \
576
677
  " running predict.")
577
678
  raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
679
+
578
680
  # Appending predict arguments to list for validation.
579
681
  arg_info_pred_matrix = []
580
- arg_info_pred_matrix.append(["data", data, True, (DataFrame), True])
682
+ arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
581
683
  arg_info_pred_matrix.append(["rank", rank, True, (int), True])
684
+ arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
582
685
 
583
686
  # Validate argument types
584
687
  _Validators._validate_function_arguments(arg_info_pred_matrix)
688
+
689
+ # Run predict using loaded model
690
+ if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
691
+ # Validate range for model rank
692
+ _Validators._validate_argument_range(rank, "rank", lbound=1,
693
+ ubound=self.loaded_models_info.RANK.max(),
694
+ lbound_inclusive=True, ubound_inclusive=True)
695
+ return self._run_loaded_model(data, rank)
696
+
697
+ # Validate range for model rank
698
+ _Validators._validate_argument_range(rank, "rank", lbound=1,
699
+ ubound=self.leader_board.RANK.max(),
700
+ lbound_inclusive=True, ubound_inclusive=True)
585
701
 
586
- # Setting test data indicator to default value, i.e., False.
587
- self.test_data_ind = False
588
- # Setting target column indicator to default value, i.e., False.
589
- self.target_column_ind = False
702
+ # Setting target column indicator to default value, i.e., True.
703
+ self.target_column_ind = True
590
704
  # Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
591
705
  rank = rank-1
706
+
707
+ # Setting indicator to False if target column doesn't exist
708
+ if self.target_column not in data.columns:
709
+ self.target_column_ind = False
592
710
 
593
- # Checking if there is test data provided or not.
594
- # If no, then model will generate predicion on default test data.
595
- # If yes, then at first data transformation will happen then prediction will be generated.
596
- if data is None:
597
- metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
598
- table_name_mapping=self.table_name_mapping)
711
+ # Checking if data is already transformed before or not
712
+ data_node_id = data._nodeid
713
+ if not self.table_name_mapping.get(data_node_id):
714
+ # At first data transformation will be performed on raw test data
715
+ # then evaluation will happen.
716
+ self.transform_data(data)
599
717
  else:
600
- # Setting test data indicator to True
601
- self.test_data_ind = True
602
- # Setting indicator to True if target column exists
603
- if self.target_column in data.columns:
604
- self.target_column_ind = True
605
-
606
- # Data Transformation Phase
607
- data_transform_instance = _DataTransformation(data = data,
608
- data_transformation_params = \
609
- self.data_transformation_params,
610
- auto = self.auto,
611
- verbose = self.verbose,
612
- target_column_ind = self.target_column_ind,
613
- table_name_mapping=self.table_name_mapping)
614
-
615
- self.table_name_mapping = data_transform_instance.data_transformation()
616
-
617
- # Checking for target column presence in passed test data.
618
- # If present, then both prediction and evaluation metrics will be generated.
619
- # If not present, then only prediction will be generated.
620
- if self.target_column_ind:
621
- metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
622
- test_data_ind = \
623
- self.test_data_ind,
624
- target_column_ind = \
625
- self.target_column_ind,
626
- table_name_mapping=self.table_name_mapping)
627
- else:
628
- pred = self.m_evaluator.model_evaluation(rank = rank,
629
- test_data_ind = \
630
- self.test_data_ind,
631
- table_name_mapping=self.table_name_mapping)
718
+ print("\nSkipping data transformation as data is already transformed.")
719
+
720
+ # Generating prediction
721
+ pred = self.m_evaluator.model_evaluation(rank = rank,
722
+ table_name_mapping = self.table_name_mapping,
723
+ data_node_id = data_node_id,
724
+ target_column_ind = self.target_column_ind)
725
+
632
726
  # Checking if problem type is classification and target label is present.
633
727
  if self.is_classification_type() and self.target_label is not None:
634
728
  # Displaying target column labels
635
729
  tar_dct = {}
636
- print('Target Column Mapping:')
730
+ print('\nTarget Column Mapping:')
637
731
  # Iterating rows
638
732
  for row in self.target_label.result.itertuples():
639
733
  # Retrieving the category names of encoded target column
@@ -644,76 +738,1011 @@ class AutoML:
644
738
 
645
739
  for key, value in tar_dct.items():
646
740
  print(f"{key}: {value}")
647
-
648
- print("\n Prediction : ")
741
+
742
+ # Renaming probability column if any
743
+ prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
744
+ if len(prob_lst) > 0:
745
+ rename_dict ={}
746
+ for col in pred.result.columns:
747
+ if col not in prob_lst:
748
+ rename_dict[col] = getattr(pred.result, col)
749
+ else:
750
+ indx = int(col.split('_')[1])
751
+ rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
752
+ rename_dict['drop_columns'] = True
753
+ pred.result = pred.result.assign(**rename_dict)
754
+
755
+ print("\nPrediction : ")
649
756
  print(pred.result)
650
757
 
651
- # Showing performance metrics if there is no test data
652
- # Or if target column is present in test data.
653
- if not self.test_data_ind or self.target_column_ind:
654
- print("\n Performance Metrics : ")
655
- print(metrics.result)
656
-
758
+ if self.target_column_ind:
657
759
  prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
658
-
760
+ probability_column = 'prob_1'
659
761
  # Displaying confusion matrix and ROC-AUC for classification problem
660
762
  if self.is_classification_type():
661
763
  print_data = lambda data: print(data) if _is_terminal() else display(data)
662
764
  # Displaying ROC-AUC for binary classification
663
765
  if self.target_count == 2:
664
766
  fit_params = {
665
- "probability_column" : prediction_column,
767
+ "probability_column" : probability_column,
666
768
  "observation_column" : self.target_column,
667
769
  "positive_class" : "1",
668
770
  "data" : pred.result
669
771
  }
670
772
  # Fitting ROC
671
773
  roc_out = ROC(**fit_params)
672
- print("\n ROC-AUC : ")
774
+ print("\nROC-AUC : ")
673
775
  print_data(roc_out.result)
674
776
  print_data(roc_out.output_data)
675
777
 
676
778
  # Displaying confusion matrix for binary and multiclass classification
677
779
  prediction_df=pred.result.to_pandas()
678
780
  target_col = self.target_column
679
- print("\n Confusion Matrix : ")
781
+ print("\nConfusion Matrix : ")
680
782
  print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
681
783
 
682
784
  # Returning prediction
683
- return pred.result
785
+ return pred.result
786
+
787
@collect_queryband(queryband="AutoML_evaluate")
def evaluate(self,
             data,
             rank = 1,
             use_loaded_models = False
             ):
    """
    DESCRIPTION:
        Function evaluates on data using model rank in leaderboard
        and generates performance metrics.
        Note:
            * If both fit and load method are called before evaluate, then fit method model will be used
              for evaluation by default unless 'use_loaded_models' is set to True in evaluate.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the dataset on which performance metrics needs to be generated.
            Types: teradataml DataFrame

            Note:
                * Target column used for generating model is mandatory in "data" for evaluation.

        rank:
            Optional Argument.
            Specifies the rank of the model available in the leaderboard to be used for evaluation.
            Default Value: 1
            Types: int

        use_loaded_models:
            Optional Argument.
            Specifies whether to use loaded models from database for evaluation or not.
            Default Value: False
            Types: bool

    RETURNS:
        Pandas DataFrame with performance metrics.

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Perform evaluate() operation on the "automl_obj".

        # Example 1: Run evaluate on test data using best performing model.
        >>> performance_metrics = automl_obj.evaluate(admissions_test)
        >>> performance_metrics

        # Example 2: Run evaluate on test data using second best performing model.
        >>> performance_metrics = automl_obj.evaluate(admissions_test, rank=2)
        >>> performance_metrics

        # Example 3: Run evaluate on test data using loaded model.
        >>> automl_obj.load("model_table")
        >>> evaluation = automl_obj.evaluate(admissions_test, rank=3)
        >>> evaluation

        # Example 4: Run evaluate on test data using loaded model when fit is also called.
        >>> automl_obj.fit(admissions_train, "admitted")
        >>> automl_obj.load("model_table")
        >>> evaluation = automl_obj.evaluate(admissions_test, rank=3, use_loaded_models=True)
        >>> evaluation
    """
    # Models must be available, either from fit() or from load().
    if not self._is_fit_called and not self._is_load_model_called:
        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'evaluate' method", \
                                   "'fit' or 'load' method must be called before" \
                                   " running evaluate.")
        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

    # Appending evaluate arguments to list for validation.
    arg_info_pred_matrix = []
    arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
    arg_info_pred_matrix.append(["rank", rank, True, (int), True])
    arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_pred_matrix)

    # Run evaluate using loaded model
    if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
        # Validate range for model rank
        _Validators._validate_argument_range(rank, "rank", lbound=1,
                                             ubound=self.loaded_models_info.RANK.max(),
                                             lbound_inclusive=True, ubound_inclusive=True)
        return self._run_loaded_model(data, rank, output_type="evaluate")

    # Validate range for model rank
    _Validators._validate_argument_range(rank, "rank", lbound=1,
                                         ubound=self.leader_board.RANK.max(),
                                         lbound_inclusive=True, ubound_inclusive=True)

    # Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
    rank = rank-1

    # Raising exception if target column is not present in data
    # as it is required for evaluation.
    if self.target_column not in data.columns:
        raise TeradataMlException(
            Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
            MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)

    # Checking if data is already transformed before or not
    data_node_id = data._nodeid
    if not self.table_name_mapping.get(data_node_id):
        # At first data transformation will be performed on raw test data
        # then evaluation will happen.
        # The target column was verified to exist just above, so the indicator
        # is passed explicitly instead of relying on whatever value a previous
        # predict() call left in "self.target_column_ind".
        self.transform_data(data, target_column_ind=True)
    else:
        print("\nSkipping data transformation as data is already transformed.")

    metrics = self.m_evaluator.model_evaluation(rank = rank,
                                                table_name_mapping=self.table_name_mapping,
                                                data_node_id = data_node_id,
                                                get_metrics = True)

    # Checking if problem type is classification and target label is present.
    if self.is_classification_type() and self.target_label is not None:
        # Displaying target column labels
        tar_dct = {}
        print('\nTarget Column Mapping:')
        # Iterating rows
        for row in self.target_label.result.itertuples():
            # Retrieving the category names of encoded target column
            # row[1] contains the original name of category
            # row[2] contains the encoded value
            if row[1] != 'TD_CATEGORY_COUNT':
                tar_dct[row[1]] = row[2]

        for key, value in tar_dct.items():
            print(f"{key}: {value}")

    # Showing performance metrics
    print("\nPerformance Metrics : ")
    print(metrics.result)
    if self.is_classification_type():
        print("-"*80)
        print(metrics.output_data)

    # Returning performance metrics
    return metrics.result
684
931
 
932
def transform_data(self,
                   data,
                   data_params = None,
                   auto = None,
                   verbose = None,
                   target_column_ind = None):
    """
    DESCRIPTION:
        Function transforms the data based on the data transformation parameters
        generated during the fit phase, and stores the resulting table name
        mapping in "self.table_name_mapping".

    PARAMETERS:
        data:
            Required Argument.
            Specifies the dataset to be transformed.
            Types: teradataml DataFrame

        data_params:
            Optional Argument.
            Specifies the data transformation parameters.
            When not specified, "self.data_transformation_params" is used.
            Default Value: None
            Types: dict

        auto:
            Optional Argument.
            Specifies whether AutoML ran in auto or custom mode.
            When specified, the value is always used as is.
            When neither "auto" nor "data_params" is specified, "self.auto" is used.
            Default Value: None
            Types: bool

        verbose:
            Optional Argument.
            Specifies the verbosity level.
            When not specified, "self.verbose" is used.
            Default Value: None
            Types: int

        target_column_ind:
            Optional Argument.
            Specifies whether target column is present in data or not.
            When not specified, "self.target_column_ind" is used.
            Default Value: None
            Types: bool

    RETURNS:
        None
    """
    # An explicitly supplied "auto" must never be discarded. The instance level
    # setting is only the fallback when the caller supplied neither "auto" nor
    # its own "data_params"; with caller supplied "data_params" and no "auto",
    # the value is forwarded unchanged.
    # This check has to run before "data_params" is defaulted below.
    if auto is None and data_params is None:
        auto = self.auto

    # Remaining optional arguments fall back to the state stored on the instance.
    if data_params is None:
        data_params = self.data_transformation_params
    if verbose is None:
        verbose = self.verbose
    if target_column_ind is None:
        target_column_ind = self.target_column_ind

    # Creating instance of DataTransformation
    data_transform_instance = _DataTransformation(data=data,
                                                  data_transformation_params=data_params,
                                                  auto=auto,
                                                  verbose=verbose,
                                                  target_column_ind=target_column_ind,
                                                  table_name_mapping=self.table_name_mapping)

    # Storing mapping of table names for transformed data
    self.table_name_mapping = data_transform_instance.data_transformation()
988
+
989
@collect_queryband(queryband="AutoML_leaderboard")
def leaderboard(self):
    """
    DESCRIPTION:
        Function returns the leaderboard generated by fit().

    RETURNS:
        Pandas DataFrame with Leaderboard information.

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Generate leaderboard using leaderboard() method on "automl_obj".
        >>> automl_obj.leaderboard()
    """
    # Leaderboard exists only once fit() has completed.
    if self._is_fit_called:
        return self.leader_board

    message = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'leaderboard' method",
                                   "'fit' method must be called before"
                                   " generating leaderboard.")
    raise TeradataMlException(message, MessageCodes.EXECUTION_FAILED)
1016
+
1017
@collect_queryband(queryband="AutoML_leader")
def leader(self):
    """
    DESCRIPTION:
        Function displays the leaderboard entry having RANK 1,
        i.e., the best performing model.

    RETURNS:
        None

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Display best performing model using leader() method on "automl_obj".
        >>> automl_obj.leader()
    """
    if not self._is_fit_called:
        raise TeradataMlException(
            Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                 "'leader' method",
                                 "'fit' method must be called before"
                                 " generating leader."),
            MessageCodes.EXECUTION_FAILED)

    board = self.leader_board
    # Plain print on a terminal, rich display otherwise.
    show = print if _is_terminal() else display
    show(board[board['RANK'] == 1])
1049
+
1050
@collect_queryband(queryband="AutoML_hyperparameter")
def model_hyperparameters(self,
                          rank=1,
                          use_loaded_models=False):
    """
    DESCRIPTION:
        Get hyperparameters of the model based on rank in leaderboard.
        Note:
            * If both the fit() and load() methods are invoked before calling model_hyperparameters(),
              by default hyperparameters are retrieved from the fit leaderboard.
              To retrieve hyperparameters from the loaded models, set "use_loaded_models" to True
              in the model_hyperparameters call.

    PARAMETERS:
        rank:
            Optional Argument.
            Specifies the rank of the model in the leaderboard.
            Default Value: 1
            Types: int

        use_loaded_models:
            Optional Argument.
            Specifies whether to use loaded models from database to get hyperparameters or not.
            Default Value: False
            Types: bool

    RETURNS:
        Dictionary, containing hyperparameters.

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Example 1: Get hyperparameters of the model using fit models.
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
        >>> automl_obj = AutoML(task_type="Classification")
        >>> automl_obj.fit(admissions_train, "admitted")
        >>> automl_obj.model_hyperparameters(rank=1)

        # Example 2: Get hyperparameters of the model using loaded models.
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Load models from the specified table.
        # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
        >>> automl_obj = AutoML()
        >>> automl_obj.load("model_table")
        >>> automl_obj.model_hyperparameters(rank=1)

        # Example 3: Get hyperparameters of the model when both fit and load method are called.
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Fit the data.
        # Load models from the specified table.
        >>> automl_obj = AutoML(task_type="Classification")
        >>> automl_obj.fit(admissions_train, "admitted")
        >>> automl_obj.load("model_table")

        # Get hyperparameters of the model using loaded models.
        >>> automl_obj.model_hyperparameters(rank=1, use_loaded_models=True)
        # Get hyperparameters of the model using fit models.
        >>> automl_obj.model_hyperparameters(rank=1)
    """
    # Neither fit() nor load() has supplied any model yet.
    if not (self._is_fit_called or self._is_load_model_called):
        message = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                       "'model_hyperparameters' method",
                                       "No models available to get hyperparameters. "
                                       "Run 'fit()' or 'load()' methods to get models.")
        raise TeradataMlException(message, MessageCodes.EXECUTION_FAILED)

    # Validate argument types
    _Validators._validate_function_arguments([
        ["rank", rank, True, (int), True],
        ["use_loaded_models", use_loaded_models, True, (bool)],
    ])

    # Loaded models are used when they are the only ones available,
    # or when the caller asks for them explicitly.
    from_loaded = self._is_load_model_called and (not self._is_fit_called or use_loaded_models)
    models = self.loaded_models_info if from_loaded else self.model_info

    # Validate range for model rank against the selected models
    _Validators._validate_argument_range(rank, "rank", lbound=1,
                                         ubound=models.RANK.max(),
                                         lbound_inclusive=True, ubound_inclusive=True)
    serialized = models.loc[models['RANK'] == rank, 'PARAMETERS'].values[0]

    # Deserializing hyperparameters
    hyperparams = ast.literal_eval(serialized)

    # Dropping the data related entries, keeping only the hyperparameters
    excluded = ('input_columns', 'data', 'train_data', 'test_data')
    return {name: value for name, value in hyperparams.items() if name not in excluded}
1152
+
1153
@collect_queryband(queryband="AutoML_load")
def load(self,
         table_name):
    """
    DESCRIPTION:
        Function loads models information from the specified table.

    PARAMETERS:
        table_name:
            Required Argument.
            Specifies the table name from which models are to be loaded.
            Types: str

    RETURNS:
        Pandas DataFrame with loaded models information.

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        >>> obj = AutoML()
        # Load models from the specified table.
        >>> tab = obj.load("model_table")
    """
    # Appending arguments to list for validation.
    # "table_name" is a required argument, so the third entry is False, the same
    # way the required "data" argument is registered by predict() and evaluate().
    # NOTE(review): presumably this entry is the "is optional" flag - confirm
    # against _Validators._validate_function_arguments.
    arg_info_matrix = []
    arg_info_matrix.append(["table_name", table_name, False, (str), True])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    # Loading models
    self.loaded_models_info = DataFrame(table_name).to_pandas()

    # Splits the data transformation row from the model rows and
    # rebuilds the data transformation parameters.
    self._load_data_transform_params()

    # Flag is raised only after everything above succeeded.
    self._is_load_model_called = True

    return self.loaded_models_info.drop(['RESULT_TABLE', 'PARAMETERS'], axis=1)
1194
+
1195
def _load_data_transform_params(self):
    """
    DESCRIPTION:
        Internal function which rebuilds the data transformation parameters from
        "self.loaded_models_info". The row having RANK -1 carries the serialized
        parameters; it is removed from "self.loaded_models_info" along with the
        'DATA_PARAMS' column, and the rebuilt parameters are stored in
        "self.loaded_data_transformation_params".

    RETURNS:
        None
    """
    from sklearn.decomposition import PCA

    models_info = self.loaded_models_info

    # Getting data transformation row
    data_transform_row = models_info[models_info['RANK'] == -1].iloc[0]

    # Removing data transformation row and dropping 'DATA_PARAMS' column
    # from loaded models info.
    # drop() returns a new DataFrame here; dropping in place on the filtered
    # slice is a chained assignment and triggers pandas' SettingWithCopyWarning.
    self.loaded_models_info = models_info[models_info['RANK'] != -1].drop('DATA_PARAMS', axis=1)

    # Loading data transformation parameters by deserializing.
    # NOTE(review): joblib.load unpickles arbitrary objects, so the table being
    # loaded must come from a trusted source.
    buffer = BytesIO(data_transform_row['DATA_PARAMS'])
    data_params = joblib.load(buffer)

    # JSON list of the keys in data_params whose values are table names
    fit_obj_lst = json.loads(data_transform_row['PARAMETERS'])

    # Iterating over fit_obj_lst and converting table names to DataFrame
    for fit_obj_name in fit_obj_lst:
        fit_obj = data_params[fit_obj_name]
        if isinstance(fit_obj, dict):
            # Key: automl transformation step name, val: table name
            for key, val in fit_obj.items():
                fit_obj[key] = DataFrame(f'{val}')
        else:
            data_params[fit_obj_name] = DataFrame(f'{fit_obj}')

    # Manually reconstructing PCA object from its stored attributes
    load_pca_info = data_params['pca_fit_instance']
    pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
    pca.components_ = np.array(load_pca_info['components'])
    pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
    pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
    pca.mean_ = np.array(load_pca_info['mean'])
    pca.n_components_ = load_pca_info['n_components']
    pca.noise_variance_ = load_pca_info['noise_variance']
    pca.singular_values_ = np.array(load_pca_info['singular_values'])

    data_params['pca_fit_instance'] = pca

    self.loaded_data_transformation_params = data_params
1241
+
1242
def _validate_ranks(self, ranks):
    """
    DESCRIPTION:
        Function validates the start and stop values of the "ranks" argument
        passed to deploy.

    PARAMETERS:
        ranks:
            Required Argument.
            Specifies the ranks for the models to be saved.
            Note:
                * Only "ranks.start" and "ranks.stop" are read, and
                  "ranks.stop" is compared as is with the highest
                  rank available in the leaderboard.
            Types: range object

    RETURNS:
        Tuple of (start rank, end rank).

    RAISES:
        TeradataMlException.
    """
    first, last = ranks.start, ranks.stop

    # Checks run in order and the first failing one decides the message.
    problem = None
    if first <= 0 or last <= 0:
        # Both bounds must be positive integers
        problem = "Provided start and end rank in 'ranks' "\
                  "must be positive non-zero integers."
    elif first > last:
        # Start rank must not exceed end rank
        problem = "Provided start rank in 'ranks' must be less than"\
                  " or equal to end rank in 'ranks'."
    elif last > self.leader_board.RANK.max():
        # End rank must not exceed the highest rank in the leaderboard
        problem = "Provided end rank in 'ranks' must be less than"\
                  " or equal to total models available."

    if problem is not None:
        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'deploy' method",
                                   problem)
        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

    return first, last
1283
+
1284
+ @collect_queryband(queryband="AutoML_deploy")
1285
+ def deploy(self,
1286
+ table_name,
1287
+ top_n = 3,
1288
+ ranks = None
1289
+ ):
1290
+ """
1291
+ DESCRIPTION:
1292
+ Function saves models to the specified table name.
1293
+ Note:
1294
+ * If 'ranks' is provided, specified models in 'ranks' will be saved
1295
+ and ranks will be reassigned to specified models based
1296
+ on the order of the leaderboard, non-specified models will be ignored.
1297
+
1298
+ PARAMETERS:
1299
+ table_name:
1300
+ Required Argument.
1301
+ Specifies the table name to which models information is to be saved.
1302
+ Types: str
1303
+
1304
+ top_n:
1305
+ Optional Argument.
1306
+ Specifies the top n models to be saved.
1307
+ Note:
1308
+ * If 'ranks' is not provided, the function saves the top 'top_n' models.
1309
+
1310
+ Default Value: 3
1311
+ Types: int
1312
+
1313
+ ranks:
1314
+ Optional Argument.
1315
+ Specifies the ranks for the models to be saved.
1316
+ Note:
1317
+ * If 'ranks' is provided, then 'top_n' is ignored.
1318
+ Types: int or list of int or range object
1319
+
1320
+ RETURNS:
1321
+ None
1322
+
1323
+ RAISES:
1324
+ TeradataMlException.
1325
+
1326
+ EXAMPLES:
1327
+ # Create an instance of the AutoML called "obj"
1328
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1329
+ >>> obj = AutoML(task_type="Classification")
1330
+ >>> obj.fit(data = data, target_column = target_column)
1331
+
1332
+ # Save top 3 models to the specified table.
1333
+ >>> obj.deploy("model_table")
1334
+
1335
+ # Save top n models to the specified table.
1336
+ >>> obj.deploy("model_table", top_n=5)
1337
+
1338
+ # Save models based on specified ranks to the specified table.
1339
+ >>> obj.deploy("model_table", ranks=[1, 3, 5])
1340
+
1341
+ # Save models based on specified rank range to the specified table.
1342
+ >>> obj.deploy("model_table", ranks=range(2,6))
1343
+ """
1344
+ # raise Error if fit is not called
1345
+ if not self._is_fit_called:
1346
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1347
+ "'deploy' method", \
1348
+ "'fit' method must be called before" \
1349
+ " 'deploy'.")
1350
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1351
+
1352
+ # Appending arguments to list for validation
1353
+ arg_info_matrix = []
1354
+ arg_info_matrix.append(["table_name", table_name, True, (str), True])
1355
+ arg_info_matrix.append(["top_n", top_n, True, (int)])
1356
+ if not isinstance(ranks, range):
1357
+ arg_info_matrix.append(["ranks", ranks, True, (int, list)])
1358
+
1359
+ # Validate argument types
1360
+ _Validators._validate_function_arguments(arg_info_matrix)
1361
+
1362
+ if isinstance(ranks, int):
1363
+ ranks = [ranks]
1364
+ elif isinstance(ranks, range):
1365
+ start_rank, end_rank = self._validate_ranks(ranks)
1366
+
1367
+ if ranks is None or len(ranks) == 0:
1368
+ # If total models are greater than available models or less than 1
1369
+ try:
1370
+ _Validators._validate_argument_range(top_n, "top_n", lbound=1,
1371
+ ubound=self.leader_board.RANK.max(),
1372
+ lbound_inclusive=True, ubound_inclusive=True)
1373
+ except ValueError as e:
1374
+ msg = "\n'top_n' should be equal or less than the available models or greater than 0. " \
1375
+ "Deploying all available models to the table."
1376
+ warnings.warn(message=msg, stacklevel=2)
1377
+ top_n = self.leader_board.shape[0]
1378
+ elif isinstance(ranks, list):
1379
+ # If ranks is provided, then validating the ranks elements
1380
+ for ele in ranks:
1381
+ _Validators._validate_argument_range(ele, "element in ranks", lbound=1,
1382
+ ubound=self.leader_board.RANK.max(),
1383
+ lbound_inclusive=True, ubound_inclusive=True)
1384
+
1385
+ feature_selections = self.model_info['FEATURE_SELECTION'].unique().tolist()
1386
+
1387
+ # Mapping feature selection to training data,
1388
+ # we are creating a dictionary with key as feature selection and
1389
+ # value as temporary training data table name, so that we can copy
1390
+ # temporary training data to permanent table.
1391
+ # Here's an example of mapping:
1392
+ # Example: {'lasso': 'ml__survived_lasso_1717475362789542',
1393
+ # 'rfe': 'ml__survived_rfe_1717474570567062',
1394
+ # 'pca': 'ml__survived_pca_1717475375119752'}
1395
+ fs_to_data_dict ={fs:self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs, \
1396
+ 'DATA_TABLE'].iloc[0] for fs in feature_selections}
1397
+
1398
+ # Saving temporary training data to permanent table
1399
+ # We are replacing DATA_TABLE with permanent table name in model_info
1400
+ for key, val in fs_to_data_dict.items():
1401
+ per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, key),
1402
+ persist_result_table=val)
1403
+ fs_to_data_dict[key] = per_name
1404
+
1405
+ # Persist flag
1406
+ persist = self.kwargs.get('persist', False)
1407
+ # If ranks is provided, then saving models based on specified rank
1408
+ # in list will be prioritized over 'top_n'.
1409
+ if ranks is None or len(ranks) == 0:
1410
+ # Saving only top 'top_n' models
1411
+ for index, row in self.model_info.iterrows():
1412
+ if index < top_n:
1413
+ self.model_info.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
1414
+ if not persist:
1415
+ per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
1416
+ persist_result_table=row['RESULT_TABLE'])
1417
+ self.model_info.loc[index, 'RESULT_TABLE'] = per_name
1418
+ else:
1419
+ break
1420
+ sv_models = self.model_info.drop('model-obj', axis=1).head(top_n)
1421
+ else:
1422
+ if isinstance(ranks, range):
1423
+ # Saving models based on start and end rank.
1424
+ sv_models = self.model_info[start_rank-1:end_rank].copy()
1425
+ else:
1426
+ # Saving models based on specified rank in list
1427
+ sv_models = self.model_info[self.model_info['RANK'].isin(ranks)].copy()
1428
+ sv_models.drop('model-obj', axis=1, inplace=True)
1429
+ sv_models.reset_index(drop=True, inplace=True)
1430
+
1431
+ for index, row in sv_models.iterrows():
1432
+ sv_models.loc[index, 'RANK'] = index + 1
1433
+ sv_models.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
1434
+ if not persist:
1435
+ per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
1436
+ persist_result_table=row['RESULT_TABLE'])
1437
+ sv_models.loc[index, 'RESULT_TABLE'] = per_name
1438
+
1439
+ # Data Transformation Parameters
1440
+ df = self._deploy_data_transformation_params()
1441
+
1442
+ # Saving data transformation parameters to the specified table
1443
+ sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
1444
+
1445
+ copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})
1446
+
1447
+ print('Model Deployment Completed Successfully.')
1448
+
1449
+ def _create_per_result_table(self, prefix, persist_result_table):
1450
+ """
1451
+ DESCRIPTION:
1452
+ Internal Function creates permanent table for the specified result table.
1453
+
1454
+ PARAMETERS:
1455
+ prefix:
1456
+ Required Argument.
1457
+ Specifies the prefix for the permanent table name.
1458
+ Types: str
1459
+
1460
+ persist_result_table:
1461
+ Required Argument.
1462
+ Specifies the result table name.
1463
+ Types: str
1464
+
1465
+ RETURNS:
1466
+ Permanent table name.
1467
+
1468
+ RAISES:
1469
+ TeradataMlException.
1470
+ """
1471
+
1472
+ table_name = UtilFuncs._generate_temp_table_name(prefix=prefix,
1473
+ table_type=TeradataConstants.TERADATA_TABLE,
1474
+ gc_on_quit=False)
1475
+ qry = f"SELECT * FROM {persist_result_table}"
1476
+ UtilFuncs._create_table(table_name=table_name,
1477
+ query=qry,
1478
+ volatile=False)
1479
+ return table_name
1480
+
1481
+
1482
+ def _deploy_data_transformation_params(self):
1483
+ """
1484
+ DESCRIPTION:
1485
+ Internal Function converts data transformation parameters dictionary (information of each step of automl)
1486
+ to DataFrame with rank as -1 and return the DataFrame that can be concatenated with model_info DataFrame
1487
+ and saved to the user specified table in database.
1488
+
1489
+ PARAMETERS:
1490
+ None
689
1491
 
690
1492
  RETURNS:
691
- Pandas DataFrame with Leaderboard information.
1493
+ pandas DataFrame with a single row (RANK as -1) holding the serialized data transformation parameters.
692
1494
 
693
1495
  RAISES:
694
1496
  TeradataMlException.
695
-
696
- EXAMPLES:
697
- # Create an instance of the AutoML called "automl_obj"
698
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
699
- # Perform fit() operation on the "automl_obj".
700
- # Generate leaderboard using leaderboard() method on "automl_obj".
701
- >>> automl_obj.leaderboard()
702
1497
  """
703
- if not self._is_fit_called:
704
- # raise ValueError("fit() method must be called before generating leaderboard.")
705
- err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
706
- "'leaderboard' method", \
707
- "'fit' method must be called before" \
708
- " generating leaderboard.")
709
- raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
710
- return self.leader_board
1498
+ # Create a new dictionary to store the deep copy
1499
+ data_params = {}
1500
+
1501
+ # Define a recursive function to deep copy dictionaries
1502
+ def deep_copy_dict(d):
1503
+ if not isinstance(d, dict):
1504
+ return d # Base case: if it's not a dictionary, return the value directly
1505
+ return {k: deep_copy_dict(v) for k, v in d.items()} # Recursively copy each item
711
1506
 
712
- def leader(self):
1507
+ # Deep copy is needed as the original dictionary contains nested dictionaries
1508
+ # and we want to avoid modifying the original dictionary when changes are made.
1509
+ # The .copy() method creates a shallow copy, which does not suffice for nested dictionaries.
1510
+ # Iterate through the original dictionary to handle deep copying.
1511
+ for key, value in self.data_transformation_params.items():
1512
+ # Check if value is a dictionary
1513
+ if isinstance(value, dict):
1514
+ # If the value is a dictionary, create a deep copy of the dictionary
1515
+ # This ensures that nested dictionaries are also copied, not just referenced.
1516
+ data_params[key] = deep_copy_dict(value)
1517
+ else:
1518
+ # If the value is not a dictionary, perform a shallow copy (direct assignment)
1519
+ data_params[key] = value
1520
+
1521
+ # Names of fit objects that contain the table names
1522
+ # pointing to tables in the database.
1523
+ fit_obj_names = []
1524
+
1525
+ # Persist flag
1526
+ persist = self.kwargs.get('persist', False)
1527
+
1528
+ data_params['auto_mode'] = False if self.custom_data is not None else True
1529
+
1530
+ # Iterating over data transformation parameters
1531
+ # aml_step_name is the name of transformation step taken and val is the value
1532
+ for aml_step_name,val in data_params.items():
1533
+ # Checking if value is of type teradataml DataFrame
1534
+ # If yes, then creating permanent table for the same
1535
+ # and storing the table_name in data_params instead of dataframe.
1536
+ if isinstance(val, DataFrame):
1537
+ fit_obj_names.append(aml_step_name)
1538
+ if persist:
1539
+ data_params[aml_step_name] = val._table_name
1540
+ else:
1541
+ per_name = self._create_per_result_table(prefix='{}'.format(aml_step_name),
1542
+ persist_result_table= val._table_name)
1543
+ data_params[aml_step_name] = per_name
1544
+ elif isinstance(val, dict) and 'fit_obj' in aml_step_name:
1545
+ for key, val in val.items():
1546
+ if isinstance(val, DataFrame):
1547
+ fit_obj_names.append(aml_step_name)
1548
+ if persist:
1549
+ data_params[aml_step_name][key] = val._table_name
1550
+ else:
1551
+ per_name = self._create_per_result_table(prefix='{}'.format(key),
1552
+ persist_result_table= val._table_name)
1553
+ data_params[aml_step_name][key] = per_name
1554
+ elif aml_step_name == 'pca_fit_instance':
1555
+ # Serializing PCA object
1556
+ pca = data_params[aml_step_name]
1557
+ # Extract pca parameters
1558
+ pca_params = {
1559
+ 'n_components': pca.n_components_,
1560
+ 'components': pca.components_.tolist(),
1561
+ 'explained_variance': pca.explained_variance_.tolist(),
1562
+ 'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
1563
+ 'mean': pca.mean_.tolist(),
1564
+ 'singular_values': pca.singular_values_.tolist(),
1565
+ 'noise_variance': pca.noise_variance_
1566
+ }
1567
+ data_params[aml_step_name] = pca_params
1568
+
1569
+ # Serializing data transformation parameters
1570
+ buffer = BytesIO()
1571
+ joblib.dump(data_params, buffer)
1572
+ buffer.seek(0)
1573
+ serialized_data = buffer.getvalue()
1574
+
1575
+ # Creating a string representation of fit object names
1576
+ param = json.dumps(fit_obj_names)
1577
+
1578
+ # Creating a DataFrame of data transformation information
1579
+ row = {
1580
+ 'RANK':-1,
1581
+ 'PARAMETERS':param,
1582
+ 'DATA_PARAMS':serialized_data,
1583
+ }
1584
+ df = pd.DataFrame([row])
1585
+
1586
+ return df
1587
+
1588
+ def _run_loaded_model(self,
1589
+ test_data,
1590
+ rank=1,
1591
+ output_type='prediction'):
713
1592
  """
714
1593
  DESCRIPTION:
715
- Function displays best performing model.
1594
+ Internal Function generates prediction and performance metrics using the specified model rank
1595
+ in the loaded models leaderboard.
1596
+
1597
+ PARAMETERS:
1598
+ test_data:
1599
+ Required Argument.
1600
+ Specifies the test data on which prediction and performance metrics needs to be generated.
1601
+ Types: teradataml DataFrame
1602
+
1603
+ rank:
1604
+ Optional Argument.
1605
+ Specifies the rank of the model in the leaderboard to be used for prediction.
1606
+ Default Value: 1
1607
+ Types: int
1608
+
1609
+ output_type:
1610
+ Optional Argument.
1611
+ Specifies the type of output to be generated.
1612
+ Default Value: 'prediction'
1613
+ Types: str
1614
+ Permitted Values: 'prediction', 'metrics'
1615
+
1616
+ RETURNS:
1617
+ Performance metrics when the response column is present and 'output_type' is not 'prediction', otherwise the predictions.
1618
+
1619
+ RAISES:
1620
+ TeradataMlException.
1621
+
1622
+ """
1623
+ # Indexing starts from 0
1624
+ rank = rank - 1
1625
+ # Extracting parameters
1626
+ parameters = ast.literal_eval(self.loaded_models_info.loc[rank, 'PARAMETERS'])
1627
+ # Model name
1628
+ model_name = self.loaded_models_info.loc[rank, 'MODEL_ID'].split('_')[0]
1629
+ # Feature selection
1630
+ fs = self.loaded_models_info.loc[rank, 'FEATURE_SELECTION']
1631
+
1632
+ # Checking task type
1633
+ if 'R2' in self.loaded_models_info.columns:
1634
+ task_type='Regression'
1635
+ else:
1636
+ task_type='Classification'
1637
+
1638
+ # Model names mapping to Analytic Functions
1639
+ func_map = {
1640
+ 'XGBOOST': lambda params: XGBoost(**params),
1641
+ 'GLM': lambda params: GLM(**params),
1642
+ 'SVM': lambda params: SVM(**params),
1643
+ 'DECISIONFOREST': lambda params: DecisionForest(**params),
1644
+ 'KNN': lambda params: KNN(**params)
1645
+ }
1646
+
1647
+ if output_type == 'prediction':
1648
+ print('Generating prediction using:')
1649
+ else:
1650
+ print('Generating performance metrics using:')
1651
+ print(f"Model Name: {model_name}")
1652
+ print(f"Feature Selection: {fs}")
1653
+
1654
+ # Generating evaluation parameters
1655
+ eval_params = _ModelTraining._eval_params_generation(model_name,
1656
+ parameters['response_column'],
1657
+ task_type)
1658
+ if task_type == 'Classification':
1659
+ eval_params['output_responses'] = parameters['output_responses']
1660
+
1661
+ # Checking if response column is present in test data
1662
+ if parameters['response_column'] not in test_data.columns:
1663
+ # Checking if output type is evaluation
1664
+ if output_type == 'evaluation':
1665
+ # Response column is required for evaluation, raise error if not present
1666
+ raise ValueError(f"Response column '{parameters['response_column']}' is not present in test data for evaluation.")
1667
+ eval_params.pop('accumulate', None)
1668
+ reponse_col_present = False
1669
+ else:
1670
+ reponse_col_present = True
1671
+
1672
+ # Checking if data is already transformed before or not
1673
+ data_node_id = test_data._nodeid
1674
+ if not self.table_name_mapping.get(data_node_id):
1675
+ # Data transformation will be performed on raw test data
1676
+ self.transform_data(data=test_data,
1677
+ data_params=self.loaded_data_transformation_params,
1678
+ auto=self.loaded_data_transformation_params['auto_mode'],
1679
+ verbose=0,
1680
+ target_column_ind=reponse_col_present)
1681
+
1682
+ # Extracting test data
1683
+ for feature_selection, table_name in self.table_name_mapping[data_node_id].items():
1684
+ if fs in feature_selection:
1685
+ test_data = DataFrame(table_name)
1686
+ break
1687
+
1688
+ if model_name == 'KNN':
1689
+ train_data = DataFrame(self.loaded_models_info.loc[rank, 'DATA_TABLE'])
1690
+
1691
+ parameters['train_data'] = train_data
1692
+ parameters['test_data'] = test_data
716
1693
 
1694
+ if parameters['response_column'] in test_data.columns:
1695
+ parameters['accumulate'] = parameters['response_column']
1696
+
1697
+ knn = func_map[model_name](parameters)
1698
+
1699
+ # Checking if response column is present in test data
1700
+ if reponse_col_present and output_type != 'prediction':
1701
+ metrics = knn.evaluate(test_data=test_data, **eval_params)
1702
+ else:
1703
+ predictions = knn.result
1704
+ else:
1705
+ # Extracting result table name
1706
+ result_table_name = self.loaded_models_info.loc[rank, 'RESULT_TABLE']
1707
+ result_table = DataFrame(result_table_name)
1708
+ params = {
1709
+ "skip_input_arg_processing":True,
1710
+ "skip_output_arg_processing":True,
1711
+ "skip_other_arg_processing":True,
1712
+ "skip_func_output_processing":True,
1713
+ "_result_data":result_table,
1714
+ "response_column": parameters['response_column']
1715
+ }
1716
+ model = func_map[model_name](params)
1717
+ # Checking if response column is present in test data
1718
+ if reponse_col_present and output_type != 'prediction':
1719
+ metrics = model.evaluate(newdata=test_data, **eval_params)
1720
+ else:
1721
+ predictions = model.predict(newdata=test_data, **eval_params)
1722
+
1723
+ # Return metrics, when response column is present and output type is not prediction
1724
+ if reponse_col_present and output_type != 'prediction':
1725
+ return metrics
1726
+
1727
+ # Return prediction, when output type is prediction
1728
+ return predictions if model_name == 'KNN' else predictions.result
1729
+
1730
+ @collect_queryband(queryband="AutoML_remove_saved_models")
1731
+ def remove_saved_models(self,
1732
+ table_name):
1733
+ """
1734
+ DESCRIPTION:
1735
+ Function removes the specified table containing saved models.
1736
+ Note:
1737
+ * If any data table or result table is not present inside the database,
1738
+ then it will be skipped.
1739
+
1740
+ PARAMETERS:
1741
+ table_name:
1742
+ Required Argument.
1743
+ Specifies the table name containing saved models.
1744
+ Types: str
1745
+
717
1746
  RETURNS:
718
1747
  None
719
1748
 
@@ -721,25 +1750,48 @@ class AutoML:
721
1750
  TeradataMlException.
722
1751
 
723
1752
  EXAMPLES:
724
- # Create an instance of the AutoML called "automl_obj"
1753
+ # Create an instance of the AutoML called "obj"
725
1754
  # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
726
- # Perform fit() operation on the "automl_obj".
727
- # Generate leaderboard using leaderboard() method on "automl_obj".
728
- # Display best performing model using leader() method on "automl_obj".
729
- >>> automl_obj.leader()
1755
+ >>> obj = AutoML()
1756
+ # Remove saved models from the specified table.
1757
+ >>> obj.remove_saved_models("model_table")
730
1758
  """
731
- if not self._is_fit_called:
732
- # raise ValueError("fit() method must be called before generating leader.")
733
- err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
734
- "'leader' method", \
735
- "'fit' method must be called before" \
736
- " generating leader.")
737
- raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
738
- record = self.leader_board
739
- if not _is_terminal():
740
- display(record[record['Rank'] == 1])
741
- else:
742
- print(record[record['Rank'] == 1])
1759
+ # Appending arguments to list for validation
1760
+ arg_info_matrix = []
1761
+ arg_info_matrix.append(["table_name", table_name, True, (str), True])
1762
+
1763
+ # Validate argument types
1764
+ _Validators._validate_function_arguments(arg_info_matrix)
1765
+
1766
+ df = DataFrame(table_name).to_pandas()
1767
+
1768
+ drop_list = df['DATA_TABLE'].dropna().unique().tolist()
1769
+ drop_list.extend(df['RESULT_TABLE'].dropna().unique().tolist())
1770
+
1771
+ # Removing data transformation parameters tables
1772
+ data=df[df['RANK'] == -1].iloc[0]
1773
+ buffer = BytesIO(data['DATA_PARAMS'])
1774
+ data_params = joblib.load(buffer)
1775
+ fit_obj_lst = json.loads(data['PARAMETERS'])
1776
+ for i in fit_obj_lst:
1777
+ if isinstance(data_params[i], dict):
1778
+ drop_list.extend(data_params[i].values())
1779
+ else:
1780
+ drop_list.append(data_params[i])
1781
+
1782
+ non_existent_tables = []
1783
+ for table in drop_list:
1784
+ try:
1785
+ execute_sql(f"DROP TABLE {table};")
1786
+ except Exception as e:
1787
+ non_existent_tables.append(table)
1788
+ continue
1789
+
1790
+ if len(non_existent_tables) > 0:
1791
+ warnings.warn(message=f"\nThe following tables '{non_existent_tables}' do not exist in the database and have been skipped.",
1792
+ stacklevel=2)
1793
+
1794
+ db_drop_table(table_name)
743
1795
 
744
1796
  @staticmethod
745
1797
  def generate_custom_config(file_name = "custom"):
@@ -810,12 +1862,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
810
1862
  Types: teradataml Dataframe
811
1863
 
812
1864
  target_column:
813
- Required Arugment.
1865
+ Required Argument.
814
1866
  Specifies the name of the target column in "data".
815
1867
  Types: str
816
1868
 
817
1869
  custom_data:
818
- Optional Arugment.
1870
+ Optional Argument.
819
1871
  Specifies json object containing user customized input.
820
1872
  Types: json object
821
1873
  """
@@ -830,14 +1882,16 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
830
1882
  verbose = 0,
831
1883
  max_runtime_secs = None,
832
1884
  stopping_metric = None,
833
- stopping_tolerance = None):
1885
+ stopping_tolerance = None,
1886
+ max_models = None,
1887
+ **kwargs):
834
1888
  """
835
1889
  DESCRIPTION:
836
1890
  Internal Function runs Regression.
837
1891
 
838
1892
  PARAMETERS:
839
1893
  auto:
840
- Optional Arugment.
1894
+ Optional Argument.
841
1895
  Specifies whether to run AutoML in custom mode or auto mode.
842
1896
  When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
843
1897
  Types: bool
@@ -853,20 +1907,44 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
853
1907
  Types: int
854
1908
 
855
1909
  max_runtime_secs:
856
- Optional Arugment.
1910
+ Optional Argument.
857
1911
  Specifies the time limit in seconds for model training.
858
1912
  Types: int
859
1913
 
860
1914
  stopping_metric:
861
1915
  Required, when "stopping_tolerance" is set, otherwise optional.
862
- Specifies the stopping mertics for stopping tolerance in model training.
1916
+ Specifies the stopping metrics for stopping tolerance in model training.
863
1917
  Types: str
864
1918
 
865
1919
  stopping_tolerance:
866
1920
  Required, when "stopping_metric" is set, otherwise optional.
867
- Specifies the stopping tolerance for stopping metrics in model training.
1921
+ Specifies the stopping tolerance for stopping metrics in model training.
868
1922
  Types: float
1923
+
1924
+ max_models:
1925
+ Optional Argument.
1926
+ Specifies the maximum number of models to be trained.
1927
+ Types: int
869
1928
 
1929
+ volatile:
1930
+ Optional Argument.
1931
+ Specifies whether to put the results of the
1932
+ function in a volatile table or not. When set to
1933
+ True, results are stored in a volatile table,
1934
+ otherwise not.
1935
+ Default Value: False
1936
+ Types: bool
1937
+
1938
+ persist:
1939
+ Optional Argument.
1940
+ Specifies whether to persist the results of the
1941
+ function in a table or not. When set to True,
1942
+ results are persisted in a table; otherwise,
1943
+ results are garbage collected at the end of the
1944
+ session.
1945
+ Default Value: False
1946
+ Types: bool
1947
+
870
1948
  RETURNS:
871
1949
  a tuple containing, model information and leaderboard.
872
1950
  """
@@ -883,7 +1961,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
883
1961
  target_column = self.target_column,
884
1962
  model_list = model_list,
885
1963
  verbose = verbose,
886
- custom_data = self.custom_data)
1964
+ custom_data = self.custom_data,
1965
+ **kwargs)
887
1966
  # Start time
888
1967
  start_time = time.time()
889
1968
  data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
@@ -895,7 +1974,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
895
1974
  verbose = verbose,
896
1975
  excluded_columns = excluded_columns,
897
1976
  custom_data = self.custom_data,
898
- data_transform_dict = data_transformation_params)
1977
+ data_transform_dict = data_transformation_params,
1978
+ **kwargs)
899
1979
  features, data_transformation_params = self.data_preparation(auto)
900
1980
 
901
1981
  # Calculating max_runtime_secs for model training by,
@@ -915,11 +1995,13 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
915
1995
  verbose = verbose,
916
1996
  features = features,
917
1997
  task_type = "Regression",
918
- custom_data = self.custom_data)
1998
+ custom_data = self.custom_data,
1999
+ **kwargs)
919
2000
  models_info, leaderboard, target_count = self.model_training(auto = auto,
920
2001
  max_runtime_secs = max_runtime_secs,
921
2002
  stopping_metric = stopping_metric,
922
- stopping_tolerance = stopping_tolerance)
2003
+ stopping_tolerance = stopping_tolerance,
2004
+ max_models = max_models)
923
2005
 
924
2006
  return (models_info, leaderboard, target_count, target_label, data_transformation_params, self.table_name_mapping)
925
2007
 
@@ -940,12 +2022,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
940
2022
  Types: teradataml Dataframe
941
2023
 
942
2024
  target_column:
943
- Required Arugment.
2025
+ Required Argument.
944
2026
  Specifies the name of the target column in "data".
945
2027
  Types: str
946
2028
 
947
2029
  custom_data:
948
- Optional Arugment.
2030
+ Optional Argument.
949
2031
  Specifies json object containing user customized input.
950
2032
  Types: json object
951
2033
  """
@@ -959,14 +2041,16 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
959
2041
  verbose = 0,
960
2042
  max_runtime_secs = None,
961
2043
  stopping_metric = None,
962
- stopping_tolerance = None):
2044
+ stopping_tolerance = None,
2045
+ max_models = None,
2046
+ **kwargs):
963
2047
  """
964
2048
  DESCRIPTION:
965
2049
  Internal Function runs Classification.
966
2050
 
967
2051
  PARAMETERS:
968
2052
  auto:
969
- Optional Arugment.
2053
+ Optional Argument.
970
2054
  Specifies whether to run AutoML in custom mode or auto mode.
971
2055
  When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
972
2056
  Types: bool
@@ -982,7 +2066,7 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
982
2066
  Types: int
983
2067
 
984
2068
  max_runtime_secs:
985
- Optional Arugment.
2069
+ Optional Argument.
986
2070
  Specifies the time limit in seconds for model training.
987
2071
  Types: int
988
2072
 
@@ -995,12 +2079,35 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
995
2079
  Required, when "stopping_metric" is set, otherwise optional.
996
2080
  Specifies the stopping tolerance for stopping metrics in model training.
997
2081
  Types: float
998
-
2082
+
2083
+ max_models:
2084
+ Optional Argument.
2085
+ Specifies the maximum number of models to be trained.
2086
+ Types: int
2087
+
2088
+ volatile:
2089
+ Optional Argument.
2090
+ Specifies whether to put the results of the
2091
+ function in a volatile table or not. When set to
2092
+ True, results are stored in a volatile table,
2093
+ otherwise not.
2094
+ Default Value: False
2095
+ Types: bool
2096
+
2097
+ persist:
2098
+ Optional Argument.
2099
+ Specifies whether to persist the results of the
2100
+ function in a table or not. When set to True,
2101
+ results are persisted in a table; otherwise,
2102
+ results are garbage collected at the end of the
2103
+ session.
2104
+ Default Value: False
2105
+ Types: bool
2106
+
999
2107
  RETURNS:
1000
2108
  a tuple containing, model information and leaderboard.
1001
2109
  """
1002
-
1003
-
2110
+
1004
2111
  # Feature Exploration Phase
1005
2112
  _FeatureExplore.__init__(self,
1006
2113
  data = self.data,
@@ -1015,7 +2122,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1015
2122
  model_list = model_list,
1016
2123
  verbose = verbose,
1017
2124
  task_type = "Classification",
1018
- custom_data = self.custom_data)
2125
+ custom_data = self.custom_data,
2126
+ **kwargs)
1019
2127
  # Start time
1020
2128
  start_time = time.time()
1021
2129
  data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
@@ -1027,7 +2135,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1027
2135
  excluded_columns = excluded_columns,
1028
2136
  custom_data = self.custom_data,
1029
2137
  data_transform_dict = data_transformation_params,
1030
- task_type = "Classification")
2138
+ task_type = "Classification",
2139
+ **kwargs)
1031
2140
  features, data_transformation_params = self.data_preparation(auto)
1032
2141
 
1033
2142
  # Calculating max_runtime_secs for model training by,
@@ -1047,11 +2156,13 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1047
2156
  verbose = verbose,
1048
2157
  features = features,
1049
2158
  task_type = "Classification",
1050
- custom_data = self.custom_data)
2159
+ custom_data = self.custom_data,
2160
+ **kwargs)
1051
2161
  models_info, leaderboard, target_count = self.model_training(auto = auto,
1052
2162
  max_runtime_secs = max_runtime_secs,
1053
2163
  stopping_metric = stopping_metric,
1054
- stopping_tolerance = stopping_tolerance)
2164
+ stopping_tolerance = stopping_tolerance,
2165
+ max_models = max_models)
1055
2166
 
1056
2167
  return (models_info, leaderboard, target_count, target_label, data_transformation_params, self.table_name_mapping)
1057
2168
 
@@ -1166,7 +2277,7 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1166
2277
  min_label_count = min(data[self.target_column].value_counts())
1167
2278
  if self._data_sampling_method == 'SMOTE':
1168
2279
  n_neighbors = min(5, min_label_count - 1)
1169
- sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=5)
2280
+ sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=42)
1170
2281
  else:
1171
2282
  n_neighbors = min(3, min_label_count)
1172
2283
  sampling_method = NearMiss(version=1, n_neighbors=n_neighbors)
@@ -1206,7 +2317,9 @@ class AutoRegressor(AutoML):
1206
2317
  max_runtime_secs=None,
1207
2318
  stopping_metric=None,
1208
2319
  stopping_tolerance=None,
1209
- custom_config_file=None
2320
+ max_models=None,
2321
+ custom_config_file=None,
2322
+ **kwargs
1210
2323
  ):
1211
2324
  """
1212
2325
  DESCRIPTION:
@@ -1239,7 +2352,7 @@ class AutoRegressor(AutoML):
1239
2352
  Types: int
1240
2353
 
1241
2354
  max_runtime_secs:
1242
- Optional Arugment.
2355
+ Optional Argument.
1243
2356
  Specifies the time limit in seconds for model training.
1244
2357
  Types: int
1245
2358
 
@@ -1247,8 +2360,10 @@ class AutoRegressor(AutoML):
1247
2360
  Required, when "stopping_tolerance" is set, otherwise optional.
1248
2361
  Specifies the stopping metrics for stopping tolerance in model training.
1249
2362
  Permitted Values:
1250
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
1251
- "RMSE", "RMSLE"
2363
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
2364
+ "MAPE", "MPE", "RMSE", "RMSLE",
2365
+ "ME", "EV", "MPD", "MGD"
2366
+
1252
2367
  * For task_type "Classification": 'MICRO-F1','MACRO-F1',
1253
2368
  'MICRO-RECALL','MACRO-RECALL',
1254
2369
  'MICRO-PRECISION', 'MACRO-PRECISION',
@@ -1260,12 +2375,39 @@ class AutoRegressor(AutoML):
1260
2375
  Required, when "stopping_metric" is set, otherwise optional.
1261
2376
  Specifies the stopping tolerance for stopping metrics in model training.
1262
2377
  Types: float
2378
+
2379
+ max_models:
2380
+ Optional Argument.
2381
+ Specifies the maximum number of models to be trained.
2382
+ Types: int
1263
2383
 
1264
2384
  custom_config_file:
1265
2385
  Optional Argument.
1266
2386
  Specifies the path of JSON file in case of custom run.
1267
2387
  Types: str
1268
-
2388
+
2389
+ **kwargs:
2390
+ Specifies the additional arguments for AutoRegressor. Below
2391
+ are the additional arguments:
2392
+ volatile:
2393
+ Optional Argument.
2394
+ Specifies whether to put the interim results of the
2395
+ functions in a volatile table or not. When set to
2396
+ True, results are stored in a volatile table,
2397
+ otherwise not.
2398
+ Default Value: False
2399
+ Types: bool
2400
+
2401
+ persist:
2402
+ Optional Argument.
2403
+ Specifies whether to persist the interim results of the
2404
+ functions in a table or not. When set to True,
2405
+ results are persisted in a table; otherwise,
2406
+ results are garbage collected at the end of the
2407
+ session.
2408
+ Default Value: False
2409
+ Types: bool
2410
+
1269
2411
  RETURNS:
1270
2412
  Instance of AutoRegressor.
1271
2413
 
@@ -1294,24 +2436,28 @@ class AutoRegressor(AutoML):
1294
2436
 
1295
2437
  # Fit the data.
1296
2438
  >>> automl_obj.fit(housing_train, "price")
2439
+
2440
+ # Display leaderboard.
2441
+ >>> automl_obj.leaderboard()
1297
2442
 
1298
- # Predict using best performing model.
1299
- >>> prediction = automl_obj.predict()
1300
- >>> prediction
2443
+ # Display best performing model.
2444
+ >>> automl_obj.leader()
1301
2445
 
1302
- # Run predict for new test data with best performing model.
2446
+ # Run predict on test data using best performing model.
1303
2447
  >>> prediction = automl_obj.predict(housing_test)
1304
2448
  >>> prediction
1305
2449
 
1306
- # Run predict for new test data with second best performing model.
2450
+ # Run predict on test data using second best performing model.
1307
2451
  >>> prediction = automl_obj.predict(housing_test, rank=2)
1308
2452
  >>> prediction
1309
-
1310
- # Display leaderboard.
1311
- >>> automl_obj.leaderboard()
1312
-
1313
- # Display best performing model.
1314
- >>> automl_obj.leader()
2453
+
2454
+ # Run evaluate to get performance metrics using best performing model.
2455
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
2456
+ >>> performance_metrics
2457
+
2458
+ # Run evaluate to get performance metrics using second best performing model.
2459
+ >>> performance_metrics = automl_obj.evaluate(housing_test, 2)
2460
+ >>> performance_metrics
1315
2461
 
1316
2462
  # Example 2 : Run AutoRegressor for regression problem with early stopping metric and tolerance.
1317
2463
  # Scenario : Predict the price of house based on different factors.
@@ -1325,19 +2471,24 @@ class AutoRegressor(AutoML):
1325
2471
 
1326
2472
  # Create instance of AutoRegressor.
1327
2473
  >>> automl_obj = AutoRegressor(verbose=2,
1328
- >>> exclude="xgboost",
1329
- >>> stopping_metric="R2",
1330
- >>> stopping_tolerance=0.7,
1331
- >>> custom_config_file="custom_housing.json")
2474
+ >>> exclude="xgboost",
2475
+ >>> stopping_metric="R2",
2476
+ >>> stopping_tolerance=0.7,
2477
+ >>> max_models=10,
2478
+ >>> custom_config_file="custom_housing.json")
1332
2479
  # Fit the data.
1333
2480
  >>> automl_obj.fit(housing_train, "price")
1334
-
1335
- # Run predict with best performing model.
1336
- >>> prediction = automl_obj.predict()
1337
- >>> prediction
1338
-
2481
+
1339
2482
  # Display leaderboard.
1340
2483
  >>> automl_obj.leaderboard()
2484
+
2485
+ # Run predict on test data using best performing model.
2486
+ >>> prediction = automl_obj.predict(housing_test)
2487
+ >>> prediction
2488
+
2489
+ # Run evaluate to get performance metrics using best performing model.
2490
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
2491
+ >>> performance_metrics
1341
2492
 
1342
2493
  # Example 3 : Run AutoRegressor for regression problem with maximum runtime.
1343
2494
  # Scenario : Predict the price of house based on different factors.
@@ -1345,29 +2496,34 @@ class AutoRegressor(AutoML):
1345
2496
 
1346
2497
  # Create instance of AutoRegressor.
1347
2498
  >>> automl_obj = AutoRegressor(verbose=2,
1348
- >>> exclude="xgboost",
1349
- >>> max_runtime_secs=500)
2499
+ >>> exclude="xgboost",
2500
+ >>> max_runtime_secs=500)
1350
2501
  # Fit the data.
1351
2502
  >>> automl_obj.fit(housing_train, "price")
1352
-
1353
- # Run predict with best performing model.
1354
- >>> prediction = automl_obj.predict()
1355
- >>> prediction
1356
-
1357
- # Run predict with second best performing model.
1358
- >>> prediction = automl_obj.predict(rank=2)
1359
- >>> prediction
1360
-
2503
+
1361
2504
  # Display leaderboard.
1362
2505
  >>> automl_obj.leaderboard()
1363
2506
 
1364
2507
  # Display best performing model.
1365
2508
  >>> automl_obj.leader()
2509
+
2510
+ # Run predict on test data using best performing model.
2511
+ >>> prediction = automl_obj.predict(housing_test)
2512
+ >>> prediction
2513
+
2514
+ # Run predict on test data using second best performing model.
2515
+ >>> prediction = automl_obj.predict(housing_test, 2)
2516
+ >>> prediction
2517
+
2518
+ # Run evaluate to get performance metrics using best performing model.
2519
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
2520
+ >>> performance_metrics
1366
2521
  """
1367
2522
  self.verbose = verbose
1368
2523
  self.max_runtime_secs = max_runtime_secs
1369
2524
  self.stopping_metric = stopping_metric
1370
2525
  self.stopping_tolerance = stopping_tolerance
2526
+ self.max_models = max_models
1371
2527
  self.custom_config_file = custom_config_file
1372
2528
  self.task_type = "Regression"
1373
2529
  self.include = include
@@ -1380,7 +2536,9 @@ class AutoRegressor(AutoML):
1380
2536
  max_runtime_secs=self.max_runtime_secs,
1381
2537
  stopping_metric=self.stopping_metric,
1382
2538
  stopping_tolerance=self.stopping_tolerance,
1383
- custom_config_file=self.custom_config_file)
2539
+ max_models=self.max_models,
2540
+ custom_config_file=self.custom_config_file,
2541
+ **kwargs)
1384
2542
  class AutoClassifier(AutoML):
1385
2543
 
1386
2544
  def __init__(self,
@@ -1390,7 +2548,9 @@ class AutoClassifier(AutoML):
1390
2548
  max_runtime_secs=None,
1391
2549
  stopping_metric=None,
1392
2550
  stopping_tolerance=None,
1393
- custom_config_file=None
2551
+ max_models=None,
2552
+ custom_config_file=None,
2553
+ **kwargs
1394
2554
  ):
1395
2555
  """
1396
2556
  DESCRIPTION:
@@ -1423,32 +2583,61 @@ class AutoClassifier(AutoML):
1423
2583
  Types: int
1424
2584
 
1425
2585
  max_runtime_secs:
1426
- Optional Arugment.
2586
+ Optional Argument.
1427
2587
  Specifies the time limit in seconds for model training.
1428
2588
  Types: int
1429
2589
 
1430
2590
  stopping_metric:
1431
2591
  Required, when "stopping_tolerance" is set, otherwise optional.
1432
2592
  Specifies the stopping metrics for stopping tolerance in model training.
1433
- Types: str
1434
-
1435
- stopping_tolerance:
1436
- Required, when "stopping_metric" is set, otherwise optional.
1437
- Specifies the stopping tolerance for stopping metrics in model training.
1438
2593
  Permitted Values:
1439
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
1440
- "RMSE", "RMSLE"
2594
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
2595
+ "MAPE", "MPE", "RMSE", "RMSLE",
2596
+ "ME", "EV", "MPD", "MGD"
2597
+
1441
2598
  * For task_type "Classification": 'MICRO-F1','MACRO-F1',
1442
2599
  'MICRO-RECALL','MACRO-RECALL',
1443
2600
  'MICRO-PRECISION', 'MACRO-PRECISION',
1444
2601
  'WEIGHTED-PRECISION','WEIGHTED-RECALL',
1445
2602
  'WEIGHTED-F1', 'ACCURACY'
2603
+ Types: str
2604
+
2605
+ stopping_tolerance:
2606
+ Required, when "stopping_metric" is set, otherwise optional.
2607
+ Specifies the stopping tolerance for stopping metrics in model training.
1446
2608
  Types: float
2609
+
2610
+ max_models:
2611
+ Optional Argument.
2612
+ Specifies the maximum number of models to be trained.
2613
+ Types: int
1447
2614
 
1448
2615
  custom_config_file:
1449
2616
  Optional Argument.
1450
2617
  Specifies the path of json file in case of custom run.
1451
2618
  Types: str
2619
+
2620
+ **kwargs:
2621
+ Specifies the additional arguments for AutoClassifier. Below
2622
+ are the additional arguments:
2623
+ volatile:
2624
+ Optional Argument.
2625
+ Specifies whether to put the interim results of the
2626
+ functions in a volatile table or not. When set to
2627
+ True, results are stored in a volatile table,
2628
+ otherwise not.
2629
+ Default Value: False
2630
+ Types: bool
2631
+
2632
+ persist:
2633
+ Optional Argument.
2634
+ Specifies whether to persist the interim results of the
2635
+ functions in a table or not. When set to True,
2636
+ results are persisted in a table; otherwise,
2637
+ results are garbage collected at the end of the
2638
+ session.
2639
+ Default Value: False
2640
+ Types: bool
1452
2641
 
1453
2642
  RETURNS:
1454
2643
  Instance of AutoClassifier.
@@ -1484,24 +2673,28 @@ class AutoClassifier(AutoML):
1484
2673
 
1485
2674
  # Fit the data.
1486
2675
  >>> automl_obj.fit(admissions_train, "admitted")
2676
+
2677
+ # Display leaderboard.
2678
+ >>> automl_obj.leaderboard()
1487
2679
 
1488
- # Predict using best performing model.
1489
- >>> prediction = automl_obj.predict()
1490
- >>> prediction
2680
+ # Display best performing model.
2681
+ >>> automl_obj.leader()
1491
2682
 
1492
- # Run predict for new test data with best performing model.
2683
+ # Run predict on test data using best performing model.
1493
2684
  >>> prediction = automl_obj.predict(admissions_test)
1494
2685
  >>> prediction
1495
2686
 
1496
- # Run predict for new test data with second best performing model.
2687
+ # Run predict on test data using second best performing model.
1497
2688
  >>> prediction = automl_obj.predict(admissions_test, rank=2)
1498
2689
  >>> prediction
1499
-
1500
- # Display leaderboard.
1501
- >>> automl_obj.leaderboard()
1502
-
1503
- # Display best performing model.
1504
- >>> automl_obj.leader()
2690
+
2691
+ # Run evaluate to get performance metrics using best performing model.
2692
+ >>> performance_metrics = automl_obj.evaluate(admissions_test)
2693
+ >>> performance_metrics
2694
+
2695
+ # Run evaluate to get performance metrics using model rank 4.
2696
+ >>> performance_metrics = automl_obj.evaluate(admissions_test, 4)
2697
+ >>> performance_metrics
1505
2698
 
1506
2699
  # Example 2 : Run AutoClassifier for binary classification.
1507
2700
  # Scenario : Predict whether passenger aboard the RMS Titanic survived
@@ -1510,6 +2703,11 @@ class AutoClassifier(AutoML):
1510
2703
  # configuration file to customize different processes of
1511
2704
  # AutoML Run.
1512
2705
 
2706
+ # Split the data into train and test.
2707
+ >>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
2708
+ >>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
2709
+ >>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
2710
+
1513
2711
  # Generate custom configuration file.
1514
2712
  >>> AutoClassifier.generate_custom_config("custom_titanic")
1515
2713
 
@@ -1517,21 +2715,25 @@ class AutoClassifier(AutoML):
1517
2715
  >>> automl_obj = AutoClassifier(verbose=2,
1518
2716
  >>> custom_config_file="custom_titanic.json")
1519
2717
  # Fit the data.
1520
- >>> automl_obj.fit(titanic, titanic.survived)
1521
-
1522
- # Run predict with best performing model.
1523
- >>> prediction = automl_obj.predict()
1524
- >>> prediction
1525
-
1526
- # Run predict with second best performing model.
1527
- >>> prediction = automl_obj.predict(rank=2)
1528
- >>> prediction
2718
+ >>> automl_obj.fit(titanic_train, titanic_train.survived)
1529
2719
 
1530
2720
  # Display leaderboard.
1531
2721
  >>> automl_obj.leaderboard()
1532
2722
 
1533
2723
  # Display best performing model.
1534
2724
  >>> automl_obj.leader()
2725
+
2726
+ # Run predict on test data using best performing model.
2727
+ >>> prediction = automl_obj.predict(titanic_test)
2728
+ >>> prediction
2729
+
2730
+ # Run predict on test data using second best performing model.
2731
+ >>> prediction = automl_obj.predict(titanic_test, rank=2)
2732
+ >>> prediction
2733
+
2734
+ # Run evaluate to get performance metrics using best performing model.
2735
+ >>> performance_metrics = automl_obj.evaluate(titanic_test)
2736
+ >>> performance_metrics
1535
2737
 
1536
2738
  # Example 3 : Run AutoClassifier for multiclass classification problem.
1537
2739
  # Scenario : Predict the species of iris flower based on different factors.
@@ -1539,6 +2741,11 @@ class AutoClassifier(AutoML):
1539
2741
  # models. Use custom configuration file to customize different
1540
2742
  # processes of AutoML Run.
1541
2743
 
2744
+ # Split the data into train and test.
2745
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
2746
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
2747
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
2748
+
1542
2749
  # Generate custom configuration file.
1543
2750
  >>> AutoClassifier.generate_custom_config("custom_iris")
1544
2751
 
@@ -1546,18 +2753,22 @@ class AutoClassifier(AutoML):
1546
2753
  >>> automl_obj = AutoClassifier(verbose=1,
1547
2754
  >>> custom_config_file="custom_iris.json")
1548
2755
  # Fit the data.
1549
- >>> automl_obj.fit(iris_input, "species")
1550
-
1551
- # Predict using best performing model.
1552
- >>> prediction = automl_obj.predict()
1553
- >>> prediction
1554
-
2756
+ >>> automl_obj.fit(iris_train, "species")
2757
+
1555
2758
  # Display leaderboard.
1556
2759
  >>> automl_obj.leaderboard()
1557
2760
 
1558
2761
  # Display best performing model.
1559
2762
  >>> automl_obj.leader()
1560
2763
 
2764
+ # Predict on test data using best performing model.
2765
+ >>> prediction = automl_obj.predict(iris_test)
2766
+ >>> prediction
2767
+
2768
+ # Run evaluate to get performance metrics using best performing model.
2769
+ >>> performance_metrics = automl_obj.evaluate(iris_test)
2770
+ >>> performance_metrics
2771
+
1561
2772
  # Example 4 : Run AutoClassifier for classification problem with stopping metric and tolerance.
1562
2773
  # Scenario : Predict whether passenger aboard the RMS Titanic survived
1563
2774
  # or not based on different factors. Use custom configuration
@@ -1565,64 +2776,87 @@ class AutoClassifier(AutoML):
1565
2776
  # performance threshold to acquire for the available models, and
1566
2777
  # terminate training upon meeting the stipulated performance criteria.
1567
2778
 
2779
+ # Split the data into train and test.
2780
+ >>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
2781
+ >>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
2782
+ >>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
2783
+
1568
2784
  # Generate custom configuration file.
1569
2785
  >>> AutoClassifier.generate_custom_config("custom_titanic")
1570
2786
 
1571
2787
  # Create instance of AutoClassifier.
1572
2788
  >>> automl_obj = AutoClassifier(verbose=2,
1573
- >>> exclude="xgboost",
1574
- >>> stopping_metric="MICRO-F1",
1575
- >>> stopping_tolerance=0.7,
1576
- >>> custom_config_file="custom_titanic.json")
2789
+ >>> exclude="xgboost",
2790
+ >>> stopping_metric="MICRO-F1",
2791
+ >>> stopping_tolerance=0.7,
2792
+ >>> max_models=8,
2793
+ >>> custom_config_file="custom_titanic.json")
1577
2794
  # Fit the data.
1578
- >>> automl_obj.fit(titanic, titanic.survived)
1579
-
1580
- # Run predict with best performing model.
1581
- >>> prediction = automl_obj.predict()
1582
- >>> prediction
1583
-
2795
+ >>> automl_obj.fit(titanic_train, titanic_train.survived)
2796
+
1584
2797
  # Display leaderboard.
1585
2798
  >>> automl_obj.leaderboard()
2799
+
2800
+ # Run predict on test data using best performing model.
2801
+ >>> prediction = automl_obj.predict(titanic_test)
2802
+ >>> prediction
2803
+
2804
+ # Run evaluate to get performance metrics using best performing model.
2805
+ >>> performance_metrics = automl_obj.evaluate(titanic_test)
2806
+ >>> performance_metrics
1586
2807
 
1587
2808
  # Example 5 : Run AutoClassifier for classification problem with maximum runtime.
1588
2809
  # Scenario : Predict the species of iris flower based on different factors.
1589
2810
  # Run AutoML to get the best performing model in specified time.
2811
+
2812
+ # Split the data into train and test.
2813
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
2814
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
2815
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
1590
2816
 
1591
2817
  # Create instance of AutoClassifier.
1592
2818
  >>> automl_obj = AutoClassifier(verbose=2,
1593
- >>> exclude="xgboost",
1594
- >>> max_runtime_secs=500)
2819
+ >>> exclude="xgboost",
2820
+ >>> max_runtime_secs=500,
2821
+ >>> max_models=3)
1595
2822
  # Fit the data.
1596
- >>> automl_obj.fit(iris_input, iris_input.species)
1597
-
1598
- # Run predict with best performing model.
1599
- >>> prediction = automl_obj.predict()
1600
- >>> prediction
1601
-
1602
- # Run predict with second best performing model.
1603
- >>> prediction = automl_obj.predict(rank=2)
1604
- >>> prediction
1605
-
2823
+ >>> automl_obj.fit(iris_train, iris_train.species)
2824
+
1606
2825
  # Display leaderboard.
1607
2826
  >>> automl_obj.leaderboard()
1608
2827
 
1609
2828
  # Display best performing model.
1610
- >>> automl_obj.leader()
2829
+ >>> automl_obj.leader()
2830
+
2831
+ # Run predict on test data using best performing model.
2832
+ >>> prediction = automl_obj.predict(iris_test)
2833
+ >>> prediction
2834
+
2835
+ # Run predict on test data using second best performing model.
2836
+ >>> prediction = automl_obj.predict(iris_test, rank=2)
2837
+ >>> prediction
2838
+
2839
+ # Run evaluate to get performance metrics using model rank 3.
2840
+ >>> performance_metrics = automl_obj.evaluate(iris_test, 3)
2841
+ >>> performance_metrics
1611
2842
  """
1612
2843
  self.verbose = verbose
1613
2844
  self.max_runtime_secs = max_runtime_secs
1614
2845
  self.stopping_metric = stopping_metric
1615
2846
  self.stopping_tolerance = stopping_tolerance
2847
+ self.max_models = max_models
1616
2848
  self.custom_config_file = custom_config_file
1617
2849
  self.task_type = "Classification"
1618
2850
  self.include = include
1619
2851
  self.exclude = exclude
1620
2852
 
1621
2853
  super(AutoClassifier, self).__init__(task_type=self.task_type,
1622
- include = self.include,
1623
- exclude = self.exclude,
1624
- verbose=self.verbose,
1625
- max_runtime_secs=self.max_runtime_secs,
1626
- stopping_metric=self.stopping_metric,
1627
- stopping_tolerance=self.stopping_tolerance,
1628
- custom_config_file=self.custom_config_file)
2854
+ include = self.include,
2855
+ exclude = self.exclude,
2856
+ verbose=self.verbose,
2857
+ max_runtime_secs=self.max_runtime_secs,
2858
+ stopping_metric=self.stopping_metric,
2859
+ stopping_tolerance=self.stopping_tolerance,
2860
+ max_models=self.max_models,
2861
+ custom_config_file=self.custom_config_file,
2862
+ **kwargs)