teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the teradataml package, as published to their public registry. It is provided for informational purposes only.

Note: this release of teradataml has been flagged as potentially problematic.

Files changed (240)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +306 -0
  4. teradataml/__init__.py +10 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +299 -16
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +13 -3
  11. teradataml/analytics/json_parser/utils.py +13 -6
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +11 -2
  15. teradataml/analytics/table_operator/__init__.py +4 -3
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +66 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1502 -323
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +247 -307
  22. teradataml/automl/data_transformation.py +32 -12
  23. teradataml/automl/feature_engineering.py +325 -86
  24. teradataml/automl/model_evaluation.py +44 -35
  25. teradataml/automl/model_training.py +122 -153
  26. teradataml/catalog/byom.py +8 -8
  27. teradataml/clients/pkce_client.py +1 -1
  28. teradataml/common/__init__.py +2 -1
  29. teradataml/common/constants.py +72 -0
  30. teradataml/common/deprecations.py +13 -7
  31. teradataml/common/garbagecollector.py +152 -120
  32. teradataml/common/messagecodes.py +11 -2
  33. teradataml/common/messages.py +4 -1
  34. teradataml/common/sqlbundle.py +26 -4
  35. teradataml/common/utils.py +225 -14
  36. teradataml/common/wrapper_utils.py +1 -1
  37. teradataml/context/context.py +82 -2
  38. teradataml/data/SQL_Fundamentals.pdf +0 -0
  39. teradataml/data/complaints_test_tokenized.csv +353 -0
  40. teradataml/data/complaints_tokens_model.csv +348 -0
  41. teradataml/data/covid_confirm_sd.csv +83 -0
  42. teradataml/data/dataframe_example.json +27 -1
  43. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  44. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  45. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  46. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  47. teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
  48. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  49. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  50. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  51. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  52. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  53. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  54. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  55. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  56. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  57. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  58. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  59. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  60. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  61. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  62. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  63. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  64. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  65. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  66. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  67. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  68. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  69. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  70. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  71. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  72. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  74. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  75. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  76. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  77. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  78. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  79. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  80. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  81. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  82. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  83. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  84. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  85. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  86. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  87. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  88. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  89. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  90. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  91. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  92. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  93. teradataml/data/dwt2d_dataTable.csv +65 -0
  94. teradataml/data/dwt_dataTable.csv +8 -0
  95. teradataml/data/dwt_filterTable.csv +3 -0
  96. teradataml/data/finance_data4.csv +13 -0
  97. teradataml/data/grocery_transaction.csv +19 -0
  98. teradataml/data/idwt2d_dataTable.csv +5 -0
  99. teradataml/data/idwt_dataTable.csv +8 -0
  100. teradataml/data/idwt_filterTable.csv +3 -0
  101. teradataml/data/interval_data.csv +5 -0
  102. teradataml/data/jsons/paired_functions.json +14 -0
  103. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  104. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  105. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  106. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  107. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  108. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  109. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  110. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  111. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  112. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  113. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  114. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  115. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  116. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  117. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  118. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  119. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  120. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  121. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  122. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  123. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  124. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  125. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  126. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  127. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  128. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  129. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  130. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  131. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  132. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  133. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  134. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  135. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  136. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  137. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  138. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  139. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  140. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  141. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  142. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  143. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  144. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  145. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  146. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  147. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  148. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  149. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  150. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  151. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  152. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  153. teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
  154. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  155. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  156. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  157. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  158. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  159. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
  160. teradataml/data/load_example_data.py +8 -2
  161. teradataml/data/medical_readings.csv +101 -0
  162. teradataml/data/naivebayestextclassifier_example.json +1 -1
  163. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  164. teradataml/data/patient_profile.csv +101 -0
  165. teradataml/data/peppers.png +0 -0
  166. teradataml/data/real_values.csv +14 -0
  167. teradataml/data/sax_example.json +8 -0
  168. teradataml/data/scripts/deploy_script.py +1 -1
  169. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  170. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  171. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  172. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  173. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
  174. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  175. teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
  176. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  177. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  178. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  179. teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
  180. teradataml/data/star_pivot.csv +8 -0
  181. teradataml/data/target_udt_data.csv +8 -0
  182. teradataml/data/templates/open_source_ml.json +3 -1
  183. teradataml/data/teradataml_example.json +20 -1
  184. teradataml/data/timestamp_data.csv +4 -0
  185. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  186. teradataml/data/uaf_example.json +55 -1
  187. teradataml/data/unpivot_example.json +15 -0
  188. teradataml/data/url_data.csv +9 -0
  189. teradataml/data/vectordistance_example.json +4 -0
  190. teradataml/data/windowdfft.csv +16 -0
  191. teradataml/dataframe/copy_to.py +1 -1
  192. teradataml/dataframe/data_transfer.py +5 -3
  193. teradataml/dataframe/dataframe.py +1002 -201
  194. teradataml/dataframe/fastload.py +3 -3
  195. teradataml/dataframe/functions.py +867 -0
  196. teradataml/dataframe/row.py +160 -0
  197. teradataml/dataframe/setop.py +2 -2
  198. teradataml/dataframe/sql.py +840 -33
  199. teradataml/dataframe/window.py +1 -1
  200. teradataml/dbutils/dbutils.py +878 -34
  201. teradataml/dbutils/filemgr.py +48 -1
  202. teradataml/geospatial/geodataframe.py +1 -1
  203. teradataml/geospatial/geodataframecolumn.py +1 -1
  204. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  205. teradataml/lib/aed_0_1.dll +0 -0
  206. teradataml/opensource/__init__.py +1 -1
  207. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  208. teradataml/opensource/_lightgbm.py +950 -0
  209. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  210. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  211. teradataml/opensource/sklearn/__init__.py +0 -1
  212. teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
  213. teradataml/options/__init__.py +9 -23
  214. teradataml/options/configure.py +42 -4
  215. teradataml/options/display.py +2 -2
  216. teradataml/plot/axis.py +4 -4
  217. teradataml/scriptmgmt/UserEnv.py +13 -9
  218. teradataml/scriptmgmt/lls_utils.py +77 -23
  219. teradataml/store/__init__.py +13 -0
  220. teradataml/store/feature_store/__init__.py +0 -0
  221. teradataml/store/feature_store/constants.py +291 -0
  222. teradataml/store/feature_store/feature_store.py +2223 -0
  223. teradataml/store/feature_store/models.py +1505 -0
  224. teradataml/store/vector_store/__init__.py +1586 -0
  225. teradataml/table_operators/Script.py +2 -2
  226. teradataml/table_operators/TableOperator.py +106 -20
  227. teradataml/table_operators/query_generator.py +3 -0
  228. teradataml/table_operators/table_operator_query_generator.py +3 -1
  229. teradataml/table_operators/table_operator_util.py +102 -56
  230. teradataml/table_operators/templates/dataframe_register.template +69 -0
  231. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  232. teradataml/telemetry_utils/__init__.py +0 -0
  233. teradataml/telemetry_utils/queryband.py +52 -0
  234. teradataml/utils/dtypes.py +4 -2
  235. teradataml/utils/validators.py +34 -2
  236. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
  237. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
  238. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  239. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  240. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/automl/__init__.py
@@ -15,22 +15,29 @@
 
  # Python libraries
  import json
+ import pandas as pd
  import numpy as np
  from sklearn.metrics import confusion_matrix
  import time
+ import ast
+ import warnings
+ import joblib
+ from io import BytesIO
 
  # Teradata libraries
  from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml import ColumnExpression
  from teradataml.dataframe.dataframe import DataFrame
+ from teradataml.utils.utils import execute_sql
  from teradataml.utils.validators import _Validators
- from teradataml import ROC
- from teradataml.common.utils import UtilFuncs
+ from teradataml import ROC, BLOB
  from teradataml.utils.dtypes import _Dtypes
  from teradataml.common.utils import UtilFuncs
  from teradataml import TeradataMlException
  from teradataml.common.messages import Messages, MessageCodes
- from teradatasqlalchemy.telemetry.queryband import collect_queryband
+ from teradataml.telemetry_utils.queryband import collect_queryband
+ from teradataml import TeradataConstants
+ from teradataml import XGBoost, DecisionForest, KNN, SVM, GLM, db_drop_table
 
  # AutoML Internal libraries
  from teradataml.automl.data_preparation import _DataPreparation
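The queryband collector now ships inside teradataml itself (teradataml/telemetry_utils/queryband.py, new in the file list above) instead of being imported from teradatasqlalchemy. A minimal sketch of the decorator shape this import implies; the stub below is illustrative only, and the shipped collect_queryband almost certainly does more, such as pushing the label into the Teradata session:

import functools

def collect_queryband(queryband):
    # Illustrative stub: a real implementation would set `queryband`
    # on the database session before running the wrapped method.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return wrapper
    return decorator

@collect_queryband(queryband="AutoML_fit")
def fit(data, target_column):
    ...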
@@ -53,7 +60,8 @@ class AutoML:
  stopping_metric = None,
  stopping_tolerance = None,
  max_models = None,
- custom_config_file = None):
+ custom_config_file = None,
+ **kwargs):
  """
  DESCRIPTION:
  AutoML (Automated Machine Learning) is an approach that automates the process
@@ -132,8 +140,10 @@ class AutoML:
  Required, when "stopping_tolerance" is set, otherwise optional.
  Specifies the stopping metrics for stopping tolerance in model training.
  Permitted Values:
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
- "RMSE", "RMSLE"
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
+ "MAPE", "MPE", "RMSE", "RMSLE",
+ "ME", "EV", "MPD", "MGD"
+
  * For task_type "Classification": 'MICRO-F1','MACRO-F1',
  'MICRO-RECALL','MACRO-RECALL',
  'MICRO-PRECISION', 'MACRO-PRECISION',
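The regression list for "stopping_metric" grows from six to twelve permitted values in this hunk. A short sketch of opting into one of the newly permitted metrics; "stopping_tolerance" must accompany "stopping_metric", and the table and column names are illustrative:

from teradataml import AutoML, DataFrame

# Assumes an active connection; "housing_train" is an illustrative table name.
housing_train = DataFrame("housing_train")

automl_obj = AutoML(task_type="Regression",
                    stopping_metric="MAPE",   # newly permitted in 20.0.0.3
                    stopping_tolerance=0.05,  # required together with stopping_metric
                    max_models=5)
automl_obj.fit(housing_train, "price")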
@@ -155,6 +165,28 @@ class AutoML:
  Optional Argument.
  Specifies the path of JSON file in case of custom run.
  Types: str
+
+ **kwargs:
+ Specifies the additional arguments for AutoML. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the interim results of the
+ functions in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the interim results of the
+ functions in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
 
  RETURNS:
  Instance of AutoML.
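These two flags are the only kwargs documented here, and they are mutually exclusive (the validation added later in this diff enforces that). A minimal usage sketch, assuming an active connection and illustrative table and column names:

from teradataml import AutoML, DataFrame

admissions_train = DataFrame("admissions_train")  # illustrative table name

# Interim results go to a volatile table and disappear with the session.
automl_obj = AutoML(task_type="Classification", verbose=1, volatile=True)
automl_obj.fit(admissions_train, "admitted")

# Alternatively, persist interim results in a permanent table instead.
automl_persisted = AutoML(task_type="Classification", persist=True)
automl_persisted.fit(admissions_train, "admitted")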
@@ -192,24 +224,28 @@ class AutoML:
 
  # Fit the data.
  >>> automl_obj.fit(admissions_train, "admitted")
-
- # Run predict with best performing model.
- >>> prediction = automl_obj.predict()
- >>> prediction
-
- # Run predict for new test data with best performing model.
- >>> prediction = automl_obj.predict(admissions_test)
- >>> prediction
 
- # Run predict for new test data with second best performing model.
- >>> prediction = automl_obj.predict(admissions_test, rank=2)
- >>> prediction
-
  # Display leaderboard.
  >>> automl_obj.leaderboard()
 
  # Display best performing model.
  >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(admissions_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(admissions_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(admissions_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using model rank 3.
+ >>> performance_metrics = automl_obj.evaluate(admissions_test, rank=3)
+ >>> performance_metrics
 
  # Example 2 : Run AutoML for regression problem.
  # Scenario : Predict the price of house based on different factors.
@@ -228,24 +264,28 @@ class AutoML:
  >>> custom_config_file="custom_housing.json")
  # Fit the data.
  >>> automl_obj.fit(housing_train, "price")
-
- # Run predict with best performing model.
- >>> prediction = automl_obj.predict()
- >>> prediction
-
- # Run predict for new test data with best performing model.
- >>> prediction = automl_obj.predict(housing_test)
- >>> prediction
 
- # Run predict for new test data with second best performing model.
- >>> prediction = automl_obj.predict(housing_test, rank=2)
- >>> prediction
-
  # Display leaderboard.
  >>> automl_obj.leaderboard()
 
  # Display best performing model.
  >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(housing_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(housing_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using second best performing model.
+ >>> performance_metrics = automl_obj.evaluate(housing_test, rank=2)
+ >>> performance_metrics
 
  # Example 3 : Run AutoML for multiclass classification problem.
  # Scenario : Predict the species of iris flower based on different
@@ -253,6 +293,11 @@ class AutoML:
  # different processes of AutoML Run to get the best
  # performing model out of available models.
 
+ # Split the data into train and test.
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
  # Generate custom JSON file
  >>> AutoML.generate_custom_config()
 
@@ -260,22 +305,23 @@ class AutoML:
  >>> automl_obj = AutoML(verbose=2,
  >>> exclude="xgboost",
  >>> custom_config_file="custom.json")
+
  # Fit the data.
- >>> automl_obj.fit(iris_input, iris_input.species)
-
- # Run predict with best performing model.
- >>> prediction = automl_obj.predict()
- >>> prediction
-
- # Run predict with second best performing model.
- >>> prediction = automl_obj.predict(rank=2)
- >>> prediction
+ >>> automl_obj.fit(iris_train, iris_train.species)
 
  # Display leaderboard.
  >>> automl_obj.leaderboard()
 
  # Display best performing model.
  >>> automl_obj.leader()
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(iris_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(iris_test)
+ >>> performance_metrics
 
  # Example 4 : Run AutoML for regression problem with early stopping metric and tolerance.
  # Scenario : Predict the price of house based on different factors.
@@ -296,39 +342,57 @@ class AutoML:
  >>> custom_config_file="custom_housing.json")
  # Fit the data.
  >>> automl_obj.fit(housing_train, "price")
-
- # Run predict with best performing model.
- >>> prediction = automl_obj.predict()
- >>> prediction
-
+
  # Display leaderboard.
  >>> automl_obj.leaderboard()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(housing_test)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
+ >>> performance_metrics
 
  # Example 5 : Run AutoML for regression problem with maximum runtime.
  # Scenario : Predict the species of iris flower based on different factors.
  # Run AutoML to get the best performing model in specified time.
 
+ # Split the data into train and test.
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
+
  # Create instance of AutoML.
  >>> automl_obj = AutoML(verbose=2,
  >>> exclude="xgboost",
  >>> max_runtime_secs=500,
  >>> max_models=3)
+
  # Fit the data.
- >>> automl_obj.fit(iris_input, iris_input.species)
-
- # Run predict with best performing model.
- >>> prediction = automl_obj.predict()
- >>> prediction
-
- # Run predict with second best performing model.
- >>> prediction = automl_obj.predict(rank=2)
- >>> prediction
-
+ >>> automl_obj.fit(iris_train, iris_train.species)
+
  # Display leaderboard.
  >>> automl_obj.leaderboard()
 
  # Display best performing model.
- >>> automl_obj.leader()
+ >>> automl_obj.leader()
+
+ # Run predict on test data using best performing model.
+ >>> prediction = automl_obj.predict(iris_test)
+ >>> prediction
+
+ # Run predict on test data using second best performing model.
+ >>> prediction = automl_obj.predict(iris_test, rank=2)
+ >>> prediction
+
+ # Run evaluate to get performance metrics using best performing model.
+ >>> performance_metrics = automl_obj.evaluate(iris_test)
+ >>> performance_metrics
+
+ # Run evaluate to get performance metrics using model rank 4.
+ >>> performance_metrics = automl_obj.evaluate(iris_test, 4)
+ >>> performance_metrics
  """
  # Appending arguments to list for validation
  arg_info_matrix = []
@@ -339,9 +403,9 @@ class AutoML:
  "decision_forest", "xgboost"]])
  arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
  arg_info_matrix.append(["max_runtime_secs", max_runtime_secs, True, (int, float)])
- arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", 'MAE',
- 'MSE', 'MSLE',
- 'RMSE', 'RMSLE',
+ arg_info_matrix.append(["stopping_metric", stopping_metric, True, (str), True, ["R2", "MAE", "MSE", "MSLE",
+ "MAPE", "MPE", "RMSE", "RMSLE",
+ "ME", "EV", "MPD", "MGD",
  'MICRO-F1','MACRO-F1',
  'MICRO-RECALL','MACRO-RECALL',
  'MICRO-PRECISION', 'MACRO-PRECISION',
@@ -350,13 +414,21 @@ class AutoML:
  arg_info_matrix.append(["stopping_tolerance", stopping_tolerance, True, (float, int)])
  arg_info_matrix.append(["max_models", max_models, True, (int)])
  arg_info_matrix.append(["custom_config_file", custom_config_file, True, (str), True])
-
+
+ volatile = kwargs.get('volatile', False)
+ persist = kwargs.get('persist', False)
+
+ arg_info_matrix.append(["volatile", volatile, True, (bool)])
+ arg_info_matrix.append(["persist", persist, True, (bool)])
 
  # Validate argument types
  _Validators._validate_function_arguments(arg_info_matrix)
  # Either include or exclude can be used.
  if include is not None or exclude is not None:
  _Validators._validate_mutually_exclusive_arguments(include, "include", exclude, "exclude")
+ # Either volatile or persist can be used.
+ if volatile and persist:
+ _Validators._validate_mutually_exclusive_arguments(volatile, "volatlie", persist, "persist")
  # Validate mutually inclusive arguments
  _Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric", stopping_tolerance, "stopping_tolerance")
  # Validate lower range for max_models
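The hunk above wires volatile and persist into the same mutually exclusive validation used for include/exclude. A hedged sketch of the expected failure mode when both are set, assuming the validator raises TeradataMlException as it does elsewhere in teradataml:

from teradataml import AutoML, TeradataMlException

try:
    AutoML(task_type="Regression", volatile=True, persist=True)
except TeradataMlException as err:
    # "volatile" and "persist" are validated as mutually exclusive.
    print(err)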
@@ -391,6 +463,9 @@
  self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
  self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
  self._is_fit_called = False
+ self._is_load_model_called = False
+ self.kwargs = kwargs
+ self.table_name_mapping={}
 
  @collect_queryband(queryband="AutoML_fit")
  def fit(self,
@@ -489,7 +564,9 @@
  _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
  else:
  if self.stopping_metric is not None:
- permitted_values = ["R2", 'MAE', 'MSE', 'MSLE','RMSE', 'RMSLE']
+ permitted_values = ["R2", "MAE", "MSE", "MSLE",
+ "MAPE", "MPE", "RMSE", "RMSLE",
+ "ME", "EV", "MPD", "MGD"]
  _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")
 
  if not self.is_classification_type():
@@ -514,40 +591,39 @@
  clf = task_cls(self.data, self.target_column, self.custom_data)
 
  self.model_info, self.leader_board, self.target_count, self.target_label, \
- self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
- model_list = self.model_list,
- auto = self.auto,
- verbose = self.verbose,
- max_runtime_secs = self.max_runtime_secs,
- stopping_metric = self.stopping_metric,
- stopping_tolerance = self.stopping_tolerance,
- max_models = self.max_models)
+ self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
+ model_list = self.model_list,
+ auto = self.auto,
+ verbose = self.verbose,
+ max_runtime_secs = self.max_runtime_secs,
+ stopping_metric = self.stopping_metric,
+ stopping_tolerance = self.stopping_tolerance,
+ max_models = self.max_models,
+ **self.kwargs)
 
  # Model Evaluation Phase
  self.m_evaluator = _ModelEvaluator(self.model_info,
  self.target_column,
  self.task_type)
 
- @collect_queryband(queryband="AutoML_predict")
+ @collect_queryband(queryband="AutoML_predict")
  def predict(self,
- data = None,
- rank = 1):
+ data,
+ rank = 1,
+ use_loaded_models = False):
  """
  DESCRIPTION:
- Function generates prediction on either default test data or any other data
- using model rank in leaderboard and displays performance metrics
- of the specified model.
-
- If test data contains target column, then it displays both prediction
- and performance metrics, otherwise displays only prediction.
+ Function generates prediction on data using model rank in
+ leaderboard.
+ Note:
+ * If both fit and load method are called before predict, then fit method model will be used
+ for prediction by default unless 'use_loaded_models' is set to True in predict.
 
  PARAMETERS:
  data:
- Optional Argument.
- Specifies the dataset on which prediction and performance
- metrices needs to be generated using model rank in leaderboard.
- When "data" is not specified default test data is used. Default
- test data is the dataset generated at the time of training.
+ Required Argument.
+ Specifies the dataset on which prediction needs to be generated
+ using model rank in leaderboard.
  Types: teradataml DataFrame
 
  rank:
@@ -555,6 +631,12 @@ class AutoML:
  Specifies the rank of the model in the leaderboard to be used for prediction.
  Default Value: 1
  Types: int
+
+ use_loaded_models:
+ Optional Argument.
+ Specifies whether to use loaded models from database for prediction or not.
+ Default Value: False
+ Types: bool
 
  RETURNS:
  Pandas DataFrame with predictions.
@@ -568,174 +650,1099 @@ class AutoML:
568
650
  # Perform fit() operation on the "automl_obj".
569
651
  # Perform predict() operation on the "automl_obj".
570
652
 
571
- # Example 1: Run predict with best performing model.
572
- >>> prediction = automl_obj.predict()
573
- >>> prediction
574
-
575
- # Example 2: Run predict with second best performing model.
576
- >>> prediction = automl_obj.predict(rank=2)
577
- >>> prediction
578
-
579
- # Example 3: Run predict for new test data with best performing model.
653
+ # Example 1: Run predict on test data using best performing model.
580
654
  >>> prediction = automl_obj.predict(admissions_test)
581
655
  >>> prediction
582
656
 
583
- # Example 4: Run predict for new test data with second best performing model.
657
+ # Example 2: Run predict on test data using second best performing model.
584
658
  >>> prediction = automl_obj.predict(admissions_test, rank=2)
585
659
  >>> prediction
660
+
661
+ # Example 3: Run predict on test data using loaded model.
662
+ >>> automl_obj.load("model_table")
663
+ >>> prediction = automl_obj.predict(admissions_test, rank=3)
664
+ >>> prediction
665
+
666
+ # Example 4: Run predict on test data using loaded model when fit is also called.
667
+ >>> automl_obj.fit(admissions_train, "admitted")
668
+ >>> automl_obj.load("model_table")
669
+ >>> prediction = automl_obj.predict(admissions_test, rank=3, use_loaded_models=True)
670
+ >>> prediction
586
671
  """
587
- if not self._is_fit_called:
588
- # raise ValueError("fit() method must be called before generating prediction.")
672
+ # Checking if fit or load model is called before predict, If not raise error
673
+ if not self._is_fit_called and not self._is_load_model_called:
589
674
  err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
590
675
  "'predict' method", \
591
- "'fit' method must be called before" \
676
+ "'fit' or 'load' method must be called before" \
592
677
  " running predict.")
593
678
  raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
679
+
594
680
  # Appending predict arguments to list for validation.
595
681
  arg_info_pred_matrix = []
596
- arg_info_pred_matrix.append(["data", data, True, (DataFrame), True])
682
+ arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
597
683
  arg_info_pred_matrix.append(["rank", rank, True, (int), True])
684
+ arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
598
685
 
599
686
  # Validate argument types
600
687
  _Validators._validate_function_arguments(arg_info_pred_matrix)
688
+
689
+ # Run predict using loaded model
690
+ if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
691
+ # Validate range for model rank
692
+ _Validators._validate_argument_range(rank, "rank", lbound=1,
693
+ ubound=self.loaded_models_info.RANK.max(),
694
+ lbound_inclusive=True, ubound_inclusive=True)
695
+ return self._run_loaded_model(data, rank)
696
+
601
697
  # Validate range for model rank
602
698
  _Validators._validate_argument_range(rank, "rank", lbound=1,
603
- ubound=self.leader_board.Rank.max(),
699
+ ubound=self.leader_board.RANK.max(),
604
700
  lbound_inclusive=True, ubound_inclusive=True)
605
701
 
606
- # Setting test data indicator to default value, i.e., False.
607
- self.test_data_ind = False
608
- # Setting target column indicator to default value, i.e., False.
609
- self.target_column_ind = False
702
+ # Setting target column indicator to default value, i.e., True.
703
+ self.target_column_ind = True
704
+ # Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
705
+ rank = rank-1
706
+
707
+ # Setting indicator to False if target column doesn't exist
708
+ if self.target_column not in data.columns:
709
+ self.target_column_ind = False
710
+
711
+ # Checking if data is already transformed before or not
712
+ data_node_id = data._nodeid
713
+ if not self.table_name_mapping.get(data_node_id):
714
+ # At first data transformation will be performed on raw test data
715
+ # then evaluation will happen.
716
+ self.transform_data(data)
717
+ else:
718
+ print("\nSkipping data transformation as data is already transformed.")
719
+
720
+ # Generating prediction
721
+ pred = self.m_evaluator.model_evaluation(rank = rank,
722
+ table_name_mapping = self.table_name_mapping,
723
+ data_node_id = data_node_id,
724
+ target_column_ind = self.target_column_ind)
725
+
726
+ # Checking if problem type is classification and target label is present.
727
+ if self.is_classification_type() and self.target_label is not None:
728
+ # Displaying target column labels
729
+ tar_dct = {}
730
+ print('\nTarget Column Mapping:')
731
+ # Iterating rows
732
+ for row in self.target_label.result.itertuples():
733
+ # Retrieving the category names of encoded target column
734
+ # row[1] contains the orginal name of cateogry
735
+ # row[2] contains the encoded value
736
+ if row[1] != 'TD_CATEGORY_COUNT':
737
+ tar_dct[row[1]] = row[2]
738
+
739
+ for key, value in tar_dct.items():
740
+ print(f"{key}: {value}")
741
+
742
+ # Renaming probability column if any
743
+ prob_lst = [item for item in pred.result.columns if item.startswith('Prob_')]
744
+ if len(prob_lst) > 0:
745
+ rename_dict ={}
746
+ for col in pred.result.columns:
747
+ if col not in prob_lst:
748
+ rename_dict[col] = getattr(pred.result, col)
749
+ else:
750
+ indx = int(col.split('_')[1])
751
+ rename_dict[f'prob_{indx}'] = getattr(pred.result, f'Prob_{indx}')
752
+ rename_dict['drop_columns'] = True
753
+ pred.result = pred.result.assign(**rename_dict)
754
+
755
+ print("\nPrediction : ")
756
+ print(pred.result)
757
+
758
+ if self.target_column_ind:
759
+ prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
760
+ probability_column = 'prob_1'
761
+ # Displaying confusion matrix and ROC-AUC for classification problem
762
+ if self.is_classification_type():
763
+ print_data = lambda data: print(data) if _is_terminal() else display(data)
764
+ # Displaying ROC-AUC for binary classification
765
+ if self.target_count == 2:
766
+ fit_params = {
767
+ "probability_column" : probability_column,
768
+ "observation_column" : self.target_column,
769
+ "positive_class" : "1",
770
+ "data" : pred.result
771
+ }
772
+ # Fitting ROC
773
+ roc_out = ROC(**fit_params)
774
+ print("\nROC-AUC : ")
775
+ print_data(roc_out.result)
776
+ print_data(roc_out.output_data)
777
+
778
+ # Displaying confusion matrix for binary and multiclass classification
779
+ prediction_df=pred.result.to_pandas()
780
+ target_col = self.target_column
781
+ print("\nConfusion Matrix : ")
782
+ print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
783
+
784
+ # Returning prediction
785
+ return pred.result
786
+
787
+ @collect_queryband(queryband="AutoML_evaluate")
788
+ def evaluate(self,
789
+ data,
790
+ rank = 1,
791
+ use_loaded_models = False
792
+ ):
793
+ """
794
+ DESCRIPTION:
795
+ Function evaluates on data using model rank in leaderboard
796
+ and generates performance metrics.
797
+ Note:
798
+ * If both fit and load method are called before predict, then fit method model will be used
799
+ for prediction by default unless 'use_loaded_models' is set to True in predict.
800
+
801
+ PARAMETERS:
802
+ data:
803
+ Required Argument.
804
+ Specifies the dataset on which performance metrics needs to be generated.
805
+ Types: teradataml DataFrame
806
+
807
+ Note:
808
+ * Target column used for generating model is mandatory in "data" for evaluation.
809
+
810
+ rank:
811
+ Optional Argument.
812
+ Specifies the rank of the model available in the leaderboard to be used for evaluation.
813
+ Default Value: 1
814
+ Types: int
815
+
816
+ use_loaded_models:
817
+ Optional Argument.
818
+ Specifies whether to use loaded models from database for prediction or not.
819
+ Default Value: False
820
+ Types: bool
821
+
822
+ RETURNS:
823
+ Pandas DataFrame with performance metrics.
824
+
825
+ RAISES:
826
+ TeradataMlException.
827
+
828
+ EXAMPLES:
829
+ # Create an instance of the AutoML called "automl_obj"
830
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
831
+ # Perform fit() operation on the "automl_obj".
832
+ # Perform evaluate() operation on the "automl_obj".
833
+
834
+ # Example 1: Run evaluate on test data using best performing model.
835
+ >>> performance_metrics = automl_obj.evaluate(admissions_test)
836
+ >>> performance_metrics
837
+
838
+ # Example 2: Run evaluate on test data using second best performing model.
839
+ >>> performance_metrics = automl_obj.evaluate(admissions_test, rank=2)
840
+ >>> performance_metrics
841
+
842
+ # Example 3: Run evaluate on test data using loaded model.
843
+ >>> automl_obj.load("model_table")
844
+ >>> evaluation = automl_obj.evaluate(admissions_test, rank=3)
845
+ >>> evaluation
846
+
847
+ # Example 4: Run predict on test data using loaded model when fit is also called.
848
+ >>> automl_obj.fit(admissions_train, "admitted")
849
+ >>> automl_obj.load("model_table")
850
+ >>> evaluation = automl_obj.evaluate(admissions_test, rank=3, use_loaded_models=True)
851
+ >>> evaluation
852
+ """
853
+ if not self._is_fit_called and not self._is_load_model_called:
854
+ # raise ValueError("fit() method must be called before evaluating.")
855
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
856
+ "'evaluate' method", \
857
+ "'fit' or 'load' method must be called before" \
858
+ " running evaluate.")
859
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
860
+ # Appending evaluate arguments to list for validation.
861
+ arg_info_pred_matrix = []
862
+ arg_info_pred_matrix.append(["data", data, False, (DataFrame), True])
863
+ arg_info_pred_matrix.append(["rank", rank, True, (int), True])
864
+ arg_info_pred_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
865
+
866
+ # Validate argument types
867
+ _Validators._validate_function_arguments(arg_info_pred_matrix)
868
+
869
+ # Run evaluate using loaded model
870
+ if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
871
+ # Validate range for model rank
872
+ _Validators._validate_argument_range(rank, "rank", lbound=1,
873
+ ubound=self.loaded_models_info.RANK.max(),
874
+ lbound_inclusive=True, ubound_inclusive=True)
875
+ return self._run_loaded_model(data, rank, output_type="evaluate")
876
+
877
+ # Validate range for model rank
878
+ _Validators._validate_argument_range(rank, "rank", lbound=1,
879
+ ubound=self.leader_board.RANK.max(),
880
+ lbound_inclusive=True, ubound_inclusive=True)
881
+
610
882
  # Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
611
883
  rank = rank-1
884
+
885
+ # Raising exception if target column is not present in data
886
+ # as it is required for evaluation.
887
+ if self.target_column not in data.columns:
888
+ raise TeradataMlException(
889
+ Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
890
+ MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
891
+
892
+ # Checking if data is already transformed before or not
893
+ data_node_id = data._nodeid
894
+ if not self.table_name_mapping.get(data_node_id):
895
+ # At first data transformation will be performed on raw test data
896
+ # then evaluation will happen.
897
+ self.transform_data(data)
898
+ else:
899
+ print("\nSkipping data transformation as data is already transformed.")
900
+
901
+ metrics = self.m_evaluator.model_evaluation(rank = rank,
902
+ table_name_mapping=self.table_name_mapping,
903
+ data_node_id = data_node_id,
904
+ get_metrics = True)
905
+
906
+ # Checking if problem type is classification and target label is present.
907
+ if self.is_classification_type() and self.target_label is not None:
908
+ # Displaying target column labels
909
+ tar_dct = {}
910
+ print('\nTarget Column Mapping:')
911
+ # Iterating rows
912
+ for row in self.target_label.result.itertuples():
913
+ # Retrieving the category names of encoded target column
914
+ # row[1] contains the orginal name of cateogry
915
+ # row[2] contains the encoded value
916
+ if row[1] != 'TD_CATEGORY_COUNT':
917
+ tar_dct[row[1]] = row[2]
918
+
919
+ for key, value in tar_dct.items():
920
+ print(f"{key}: {value}")
921
+
922
+ # Showing performance metrics
923
+ print("\nPerformance Metrics : ")
924
+ print(metrics.result)
925
+ if self.is_classification_type():
926
+ print("-"*80)
927
+ print(metrics.output_data)
928
+
929
+ # Returning performance metrics
930
+ return metrics.result
931
+
932
+ def transform_data(self,
933
+ data,
934
+ data_params = None,
935
+ auto = None,
936
+ verbose = None,
937
+ target_column_ind = None):
938
+ """
939
+ DESCRIPTION:
940
+ Function transforms the data based on the data transformation parameters
941
+ generated during the fit phase.
942
+
943
+ PARAMETERS:
944
+ data:
945
+ Required Argument.
946
+ Specifies the dataset to be transformed.
947
+ Types: teradataml DataFrame
948
+
949
+ data_params:
950
+ Optional Argument.
951
+ Specifies the data transformation parameters.
952
+ Default Value: None
953
+ Types: dict
954
+
955
+ auto:
956
+ Optional Argument.
957
+ Specifies whether to AutoML ran in auto or custom mode.
958
+ Default Value: None
959
+ Types: bool
960
+
961
+ verbose:
962
+ Optional Argument.
963
+ Specifies the verbosity level.
964
+ Default Value: None
965
+ Types: int
966
+
967
+ target_column_ind:
968
+ Optional Argument.
969
+ Specifies whether target column is present in data or not.
970
+ Default Value: None
971
+ Types: bool
972
+
973
+ RETURNS:
974
+ None
975
+ """
976
+ # Creating instance of DataTransformation
977
+ data_transform_instance = _DataTransformation(data = data,
978
+ data_transformation_params=data_params if data_params is not None else \
979
+ self.data_transformation_params,
980
+ auto=auto if data_params is not None else self.auto,
981
+ verbose=verbose if verbose is not None else self.verbose,
982
+ target_column_ind=target_column_ind if target_column_ind is not None else \
983
+ self.target_column_ind,
984
+ table_name_mapping=self.table_name_mapping)
985
+
986
+ # Storing mapping of table names for transformed data
987
+ self.table_name_mapping = data_transform_instance.data_transformation()
988
+
989
+ @collect_queryband(queryband="AutoML_leaderboard")
990
+ def leaderboard(self):
991
+ """
992
+ DESCRIPTION:
993
+ Function displays leaderboard.
994
+
995
+ RETURNS:
996
+ Pandas DataFrame with Leaderboard information.
997
+
998
+ RAISES:
999
+ TeradataMlException.
1000
+
1001
+ EXAMPLES:
1002
+ # Create an instance of the AutoML called "automl_obj"
1003
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1004
+ # Perform fit() operation on the "automl_obj".
1005
+ # Generate leaderboard using leaderboard() method on "automl_obj".
1006
+ >>> automl_obj.leaderboard()
1007
+ """
1008
+ if not self._is_fit_called:
1009
+ # raise ValueError("fit() method must be called before generating leaderboard.")
1010
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1011
+ "'leaderboard' method", \
1012
+ "'fit' method must be called before" \
1013
+ " generating leaderboard.")
1014
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1015
+ return self.leader_board
1016
+
1017
+ @collect_queryband(queryband="AutoML_leader")
1018
+ def leader(self):
1019
+ """
1020
+ DESCRIPTION:
1021
+ Function displays best performing model.
1022
+
1023
+ RETURNS:
1024
+ None
1025
+
1026
+ RAISES:
1027
+ TeradataMlException.
1028
+
1029
+ EXAMPLES:
1030
+ # Create an instance of the AutoML called "automl_obj"
1031
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1032
+ # Perform fit() operation on the "automl_obj".
1033
+ # Generate leaderboard using leaderboard() method on "automl_obj".
1034
+ # Display best performing model using leader() method on "automl_obj".
1035
+ >>> automl_obj.leader()
1036
+ """
1037
+ if not self._is_fit_called:
1038
+ # raise ValueError("fit() method must be called before generating leader.")
1039
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1040
+ "'leader' method", \
1041
+ "'fit' method must be called before" \
1042
+ " generating leader.")
1043
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1044
+ record = self.leader_board
1045
+ if not _is_terminal():
1046
+ display(record[record['RANK'] == 1])
1047
+ else:
1048
+ print(record[record['RANK'] == 1])
1049
+
1050
+ @collect_queryband(queryband="AutoML_hyperparameter")
1051
+ def model_hyperparameters(self,
1052
+ rank=1,
1053
+ use_loaded_models=False):
1054
+ """
1055
+ DESCRIPTION:
1056
+ Get hyperparameters of the model based on rank in leaderboard.
1057
+ Note:
1058
+ * If both the fit() and load() methods are invoked before calling model_hyperparameters(),
1059
+ by default hyperparameters are retrieved from the fit leaderboard.
1060
+ To retrieve hyperparameters from the loaded models, set "use_loaded_models" to True in the model_hyperparameters call.
1061
+
1062
+ PARAMETERS:
1063
+ rank:
1064
+ Required Argument.
1065
+ Specifies the rank of the model in the leaderboard.
1066
+ Default Value: 1
1067
+ Types: int
1068
+
1069
+ use_loaded_models:
1070
+ Optional Argument.
1071
+ Specifies whether to use loaded models from database to get hyperparameters or not.
1072
+ Default Value: False
1073
+ Types: bool
1074
+
1075
+ RETURNS:
1076
+ Dictionary, containing hyperparameters.
1077
+
1078
+ RAISES:
1079
+ TeradataMlException.
1080
+
1081
+ EXAMPLES:
1082
+ # Example 1: Get hyperparameters of the model using fit models.
1083
+ # Create an instance of the AutoML called "automl_obj"
1084
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1085
+ # Perform fit() operation on the "automl_obj".
1086
+ # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
1087
+ >>> automl_obj = AutoML(task_type="Classification")
1088
+ >>> automl_obj.fit(admissions_train, "admitted")
1089
+ >>> automl_obj.model_hyperparameters(rank=1)
1090
+
1091
+ # Example 2: Get hyperparameters of the model using loaded models.
1092
+ # Create an instance of the AutoML called "automl_obj"
1093
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1094
+ # Load models from the specified table.
1095
+ # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
1096
+ >>> automl_obj = AutoML()
1097
+ >>> automl_obj.load("model_table")
1098
+ >>> automl_obj.model_hyperparameters(rank=1)
1099
+
1100
+ # Example 3: Get hyperparameters of the model when both fit and load method are called.
1101
+ # Create an instance of the AutoML called "automl_obj"
1102
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1103
+ # Fit the data.
1104
+ # Load models from the specified table.
1105
+ # Get hyperparameters of the model using model_hyperparameters() method on "automl_obj".
1106
+ >>> automl_obj = AutoML(task_type="Classification")
1107
+ >>> automl_obj.fit(admissions_train, "admitted")
1108
+ >>> automl_obj.load("model_table")
1109
+
1110
+ # Get hyperparameters of the model using loaded models.
1111
+ >>> automl_obj.model_hyperparameters(rank=1, use_loaded_models=True)
1112
+ # Get hyperparameters of the model using fit models.
1113
+ >>> automl_obj.model_hyperparameters(rank=1)
1114
+ """
1115
+
1116
+ if not self._is_fit_called and not self._is_load_model_called:
1117
+ # raise ValueError("fit() or load() method must be called before getting hyperparameters.")
1118
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1119
+ "'model_hyperparameters' method",
1120
+ "No models available to get hyperparameters. " \
1121
+ "Run 'fit()' or 'load()' methods to get models.")
1122
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1123
+
1124
+ arg_info_matrix = []
1125
+ arg_info_matrix.append(["rank", rank, True, (int), True])
1126
+ arg_info_matrix.append(["use_loaded_models", use_loaded_models, True, (bool)])
1127
+
1128
+ # Validate argument types
1129
+ _Validators._validate_function_arguments(arg_info_matrix)
1130
+
1131
+ leaderboard = None
1132
+ if self._is_load_model_called and (not self._is_fit_called or use_loaded_models):
1133
+ leaderboard = self.loaded_models_info
1134
+ else:
1135
+ leaderboard = self.model_info
1136
+
1137
+ # Validate range for model rank from loaded models
1138
+ _Validators._validate_argument_range(rank, "rank", lbound=1,
1139
+ ubound=leaderboard.RANK.max(),
1140
+ lbound_inclusive=True, ubound_inclusive=True)
1141
+ hyperparams = leaderboard.loc[leaderboard['RANK'] == rank, 'PARAMETERS'].values[0]
1142
+
1143
+ # Deserializing hyperparameters
1144
+ hyperparams = ast.literal_eval(hyperparams)
1145
+
1146
+ # Removing 'data' from hyperparameters
1147
+ keys_to_remove = ['input_columns', 'data', 'train_data', 'test_data']
1148
+ for key in keys_to_remove:
1149
+ hyperparams.pop(key, None)
1150
+
1151
+ return hyperparams
1152
+
1153
+ @collect_queryband(queryband="AutoML_load")
1154
+ def load(self,
1155
+ table_name):
1156
+ """
1157
+ DESCRIPTION:
1158
+ Function loads models information from the specified table.
1159
+
1160
+ PARAMETERS:
1161
+ table_name:
1162
+ Required Argument.
1163
+ Specifies the table name from which models are to be loaded.
1164
+ Types: str
1165
+
1166
+ RETURNS:
1167
+ Pandas DataFrame with loaded models information.
1168
+
1169
+ RAISES:
1170
+ TeradataMlException.
1171
+
1172
+ EXAMPLES:
1173
+ # Create an instance of the AutoML called "obj"
1174
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1175
+ >>> obj = AutoML()
1176
+ # Load models from the specified table.
1177
+ >>> tab = obj.load("model_table")
1178
+ """
1179
+ # Appending arguments to list for validation
1180
+ arg_info_matrix = []
1181
+ arg_info_matrix.append(["table_name", table_name, True, (str), True])
1182
+
1183
+ # Validate argument types
1184
+ _Validators._validate_function_arguments(arg_info_matrix)
1185
+
1186
+ # Loading models
1187
+ self.loaded_models_info = DataFrame(table_name).to_pandas()
1188
+
1189
+ self._load_data_transform_params()
1190
+
1191
+ self._is_load_model_called = True
1192
+
1193
+ return self.loaded_models_info.drop(['RESULT_TABLE', 'PARAMETERS'], axis=1)
1194
+
1195
+ def _load_data_transform_params(self):
1196
+ """
1197
+ DESCRIPTION:
1198
+ Internal Function loads data transformation parameters from the specified table.
1199
+ """
1200
+ from sklearn.decomposition import PCA
1201
+
1202
+ # Getting data transformation row
1203
+ data_transform_row = self.loaded_models_info[self.loaded_models_info['RANK'] == -1].iloc[0]
1204
+
1205
+ # Removing data transformation row and dropping 'DATA_PARAMS' column
1206
+ # from loaded models info
1207
+ self.loaded_models_info = self.loaded_models_info[self.loaded_models_info['RANK'] != -1]
1208
+ self.loaded_models_info.drop('DATA_PARAMS', axis=1, inplace=True)
1209
+
1210
+ # Loading data transformation parameters by deserializing
1211
+ buffer = BytesIO(data_transform_row['DATA_PARAMS'])
1212
+ data_params = joblib.load(buffer)
1213
+
1214
+ fit_obj_lst = json.loads(data_transform_row['PARAMETERS'])
1215
+
1216
+ # Generating Dataframe from table_names in data params
1217
+ # fit_obj_lst contain : ['one_hot_encoding_fit_obj', 'lasso_scale_fit_obj', 'pca_scale_fit_obj', imputation_fit_object]
1218
+ # Iterating over fit_obj_lst and converting table names to DataFrame
1219
+ for fit_obj_name in fit_obj_lst:
1220
+ if isinstance(data_params[fit_obj_name], dict):
1221
+ for key, val in data_params[fit_obj_name].items():
1222
+ # Key: automl transformation step name, val: table name
1223
+ data_params[fit_obj_name][key] = DataFrame(f'{val}')
1224
+ else:
1225
+ data_params[fit_obj_name] = DataFrame(f'{data_params[fit_obj_name]}')
1226
+
1227
+ # Manually deserializing and reconstructing PCA object
1228
+ load_pca_info = data_params['pca_fit_instance']
1229
+ pca = PCA(n_components=load_pca_info['n_components'], random_state=42)
1230
+ pca.components_ = np.array(load_pca_info['components'])
1231
+ pca.explained_variance_ = np.array(load_pca_info['explained_variance'])
1232
+ pca.explained_variance_ratio_ = np.array(load_pca_info['explained_variance_ratio'])
1233
+ pca.mean_ = np.array(load_pca_info['mean'])
1234
+ pca.n_components_ = load_pca_info['n_components']
1235
+ pca.noise_variance_ = load_pca_info['noise_variance']
1236
+ pca.singular_values_ = np.array(load_pca_info['singular_values'])
1237
+
1238
+ data_params['pca_fit_instance'] = pca
1239
+
1240
+ self.loaded_data_transformation_params = data_params
1241
+
1242
+ def _validate_ranks(self, ranks):
1243
+ """
1244
+ DESCRIPTION:
1245
+ Function validates the ranks argument.
1246
+
1247
+ PARAMETERS:
1248
+ ranks:
1249
+ Required Argument.
1250
+ Specifies the ranks for the models to be saved.
1251
+ Types: int or list of int
1252
+
1253
+ RAISES:
1254
+ TeradataMlException.
1255
+ """
1256
+ start_rank, end_rank = ranks.start, ranks.stop
1257
+
1258
+ # Check if both parts are non-negative integers
1259
+ if not (start_rank > 0 and end_rank > 0):
1260
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1261
+ "'deploy' method", \
1262
+ "Provided start and end rank in 'ranks' "\
1263
+ "must be positive non-zero integers.")
1264
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1265
+
1266
+ # Check if start_rank is less than or equal to end_rank
1267
+ if start_rank > end_rank:
1268
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1269
+ "'deploy' method", \
1270
+ "Provided start rank in 'ranks' must be less than"\
1271
+ " or equal to end rank in 'ranks'.")
1272
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1273
+
1274
+ # Check end rank is less than or equal to total models
1275
+ if end_rank > self.leader_board.RANK.max():
1276
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1277
+ "'deploy' method", \
1278
+ "Provided end rank in 'ranks' must be less than"\
1279
+ " or equal to total models available.")
1280
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1281
+
1282
+ return start_rank, end_rank
1283
+
1284
+ @collect_queryband(queryband="AutoML_deploy")
1285
+ def deploy(self,
1286
+ table_name,
1287
+ top_n = 3,
1288
+ ranks = None
1289
+ ):
1290
+ """
1291
+ DESCRIPTION:
1292
+ Function saves models to the specified table name.
1293
+ Note:
1294
+ * If 'ranks' is provided, specified models in 'ranks' will be saved
1295
+ and ranks will be reassigned to specified models based
1296
+ on the order of the leaderboard; non-specified models will be ignored.
1297
+
1298
+ PARAMETERS:
1299
+ table_name:
1300
+ Required Argument.
1301
+ Specifies the table name to which models information is to be saved.
1302
+ Types: str
1303
+
1304
+ top_n:
1305
+ Optional Argument.
1306
+ Specifies the top n models to be saved.
1307
+ Note:
1308
+ * If 'ranks' is not provided, the function saves the top 'top_n' models.
1309
+
1310
+ Default Value: 3
1311
+ Types: int
1312
+
1313
+ ranks:
1314
+ Optional Argument.
1315
+ Specifies the ranks for the models to be saved.
1316
+ Note:
1317
+ * If 'ranks' is provided, then 'top_n' is ignored.
1318
+ Types: int or list of int or range object
1319
+
1320
+ RETURNS:
1321
+ None
1322
+
1323
+ RAISES:
1324
+ TeradataMlException.
1325
+
1326
+ EXAMPLES:
1327
+ # Create an instance of the AutoML called "obj"
1328
+ # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
1329
+ >>> obj = AutoML(task_type="Classification")
1330
+ >>> obj.fit(data = data, target_column = target_column)
1331
+
1332
+ # Save top 3 models to the specified table.
1333
+ >>> obj.deploy("model_table")
1334
+
1335
+ # Save top n models to the specified table.
1336
+ >>> obj.deploy("model_table", top_n=5)
1337
+
1338
+ # Save models based on specified ranks to the specified table.
1339
+ >>> obj.deploy("model_table", ranks=[1, 3, 5])
1340
+
1341
+ # Save models based on specified rank range to the specified table.
1342
+ >>> obj.deploy("model_table", ranks=range(2,6))
1343
+ """
1344
+ # raise Error if fit is not called
1345
+ if not self._is_fit_called:
1346
+ err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
1347
+ "'deploy' method", \
1348
+ "'fit' method must be called before" \
1349
+ " 'deploy'.")
1350
+ raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
1351
+
1352
+ # Appending arguments to list for validation
1353
+ arg_info_matrix = []
1354
+ arg_info_matrix.append(["table_name", table_name, True, (str), True])
1355
+ arg_info_matrix.append(["top_n", top_n, True, (int)])
1356
+ if not isinstance(ranks, range):
1357
+ arg_info_matrix.append(["ranks", ranks, True, (int, list)])
1358
+
1359
+ # Validate argument types
1360
+ _Validators._validate_function_arguments(arg_info_matrix)
1361
+
1362
+ if isinstance(ranks, int):
1363
+ ranks = [ranks]
1364
+ elif isinstance(ranks, range):
1365
+ start_rank, end_rank = self._validate_ranks(ranks)
1366
+
1367
+ if ranks is None or len(ranks) == 0:
1368
+ # If total models are greater than available models or less than 1
1369
+ try:
1370
+ _Validators._validate_argument_range(top_n, "top_n", lbound=1,
1371
+ ubound=self.leader_board.RANK.max(),
1372
+ lbound_inclusive=True, ubound_inclusive=True)
1373
+ except ValueError as e:
1374
+ msg = "\n'top_n' should be equal or less than the available models or greater than 0. " \
1375
+ "Deploying all available models to the table."
1376
+ warnings.warn(message=msg, stacklevel=2)
1377
+ top_n = self.leader_board.shape[0]
1378
+ elif isinstance(ranks, list):
1379
+ # If ranks is provided, then validating the ranks elements
1380
+ for ele in ranks:
1381
+ _Validators._validate_argument_range(ele, "element in ranks", lbound=1,
1382
+ ubound=self.leader_board.RANK.max(),
1383
+ lbound_inclusive=True, ubound_inclusive=True)
1384
+
1385
+ feature_selections = self.model_info['FEATURE_SELECTION'].unique().tolist()
1386
+
1387
+ # Mapping feature selection to training data:
1388
+ # we create a dictionary with feature selection as key and
1389
+ # the temporary training data table name as value, so that we can copy
1390
+ # the temporary training data to a permanent table.
1391
+ # Here's an example of mapping:
1392
+ # Example: {'lasso': 'ml__survived_lasso_1717475362789542',
1393
+ # 'rfe': 'ml__survived_rfe_1717474570567062',
1394
+ # 'pca': 'ml__survived_pca_1717475375119752'}
1395
+ fs_to_data_dict = {fs: self.model_info.loc[self.model_info['FEATURE_SELECTION'] == fs, \
1396
+ 'DATA_TABLE'].iloc[0] for fs in feature_selections}
1397
+
1398
+ # Saving temporary training data to permanent table
1399
+ # We are replacing DATA_TABLE with permanent table name in model_info
1400
+ for key, val in fs_to_data_dict.items():
1401
+ per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, key),
1402
+ persist_result_table=val)
1403
+ fs_to_data_dict[key] = per_name
1404
+
1405
+ # Persist flag
1406
+ persist = self.kwargs.get('persist', False)
1407
+ # If ranks is provided, then saving models based on specified rank
1408
+ # in list will be prioritized over 'top_n'.
1409
+ if ranks is None or len(ranks) == 0:
1410
+ # Saving only top 'top_n' models
1411
+ for index, row in self.model_info.iterrows():
1412
+ if index < top_n:
1413
+ self.model_info.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
1414
+ if not persist:
1415
+ per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
1416
+ persist_result_table=row['RESULT_TABLE'])
1417
+ self.model_info.loc[index, 'RESULT_TABLE'] = per_name
1418
+ else:
1419
+ break
1420
+ sv_models = self.model_info.drop('model-obj', axis=1).head(top_n)
1421
+ else:
1422
+ if isinstance(ranks, range):
1423
+ # Saving models based on start and end rank.
1424
+ sv_models = self.model_info[start_rank-1:end_rank].copy()
1425
+ else:
1426
+ # Saving models based on specified rank in list
1427
+ sv_models = self.model_info[self.model_info['RANK'].isin(ranks)].copy()
1428
+ sv_models.drop('model-obj', axis=1, inplace=True)
1429
+ sv_models.reset_index(drop=True, inplace=True)
1430
+
1431
+ for index, row in sv_models.iterrows():
1432
+ sv_models.loc[index, 'RANK'] = index + 1
1433
+ sv_models.loc[index, 'DATA_TABLE'] = fs_to_data_dict[row['FEATURE_SELECTION']]
1434
+ if not persist:
1435
+ per_name = self._create_per_result_table(prefix='{}_{}'.format(self.target_column, row['MODEL_ID']),
1436
+ persist_result_table=row['RESULT_TABLE'])
1437
+ sv_models.loc[index, 'RESULT_TABLE'] = per_name
1438
+
1439
+ # Data Transformation Parameters
1440
+ df = self._deploy_data_transformation_params()
1441
+
1442
+ # Saving data transformation parameters to the specified table
1443
+ sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
1444
+
1445
+ copy_to_sql(df=sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS': BLOB})
1446
+
1447
+ print('Model Deployment Completed Successfully.')
1448
+
1449
+ def _create_per_result_table(self, prefix, persist_result_table):
1450
+ """
1451
+ DESCRIPTION:
1452
+ Internal Function creates permanent table for the specified result table.
1453
+
1454
+ PARAMETERS:
1455
+ prefix:
1456
+ Required Argument.
1457
+ Specifies the prefix for the permanent table name.
1458
+ Types: str
1459
+
1460
+ persist_result_table:
1461
+ Required Argument.
1462
+ Specifies the result table name.
1463
+ Types: str
1464
+
1465
+ RETURNS:
1466
+ Permanent table name.
1467
+
1468
+ RAISES:
1469
+ TeradataMlException.
1470
+ """
1471
+
1472
+ table_name = UtilFuncs._generate_temp_table_name(prefix=prefix,
1473
+ table_type=TeradataConstants.TERADATA_TABLE,
1474
+ gc_on_quit=False)
1475
+ qry = f"SELECT * FROM {persist_result_table}"
1476
+ UtilFuncs._create_table(table_name=table_name,
1477
+ query=qry,
1478
+ volatile=False)
1479
+ return table_name
1480
+
1481
+
1482
+ def _deploy_data_transformation_params(self):
1483
+ """
1484
+ DESCRIPTION:
1485
+ Internal Function converts data transformation parameters dictionary (information of each step of AutoML)
1486
+ to DataFrame with rank as -1 and returns the DataFrame that can be concatenated with model_info DataFrame
1487
+ and saved to the user specified table in database.
612
1488
 
613
- # Checking if there is test data provided or not.
614
- # If no, then model will generate predicion on default test data.
615
- # If yes, then at first data transformation will happen then prediction will be generated.
616
- if data is None:
617
- metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
618
- table_name_mapping=self.table_name_mapping)
619
- else:
620
- # Setting test data indicator to True
621
- self.test_data_ind = True
622
- # Setting indicator to True if target column exists
623
- if self.target_column in data.columns:
624
- self.target_column_ind = True
1489
+ PARAMETERS:
1490
+ None
1491
+
1492
+ RETURNS:
1493
+ pandas DataFrame containing the data transformation information with RANK -1.
1494
+
1495
+ RAISES:
1496
+ TeradataMlException.
1497
+ """
1498
+ # Create a new dictionary to store the deep copy
1499
+ data_params = {}
1500
+
1501
+ # Define a recursive function to deep copy dictionaries
1502
+ def deep_copy_dict(d):
1503
+ if not isinstance(d, dict):
1504
+ return d # Base case: if it's not a dictionary, return the value directly
1505
+ return {k: deep_copy_dict(v) for k, v in d.items()} # Recursively copy each item
625
1506
 
626
- # Data Transformation Phase
627
- data_transform_instance = _DataTransformation(data = data,
628
- data_transformation_params = \
629
- self.data_transformation_params,
630
- auto = self.auto,
631
- verbose = self.verbose,
632
- target_column_ind = self.target_column_ind,
633
- table_name_mapping=self.table_name_mapping)
634
-
635
- self.table_name_mapping = data_transform_instance.data_transformation()
636
-
637
- # Checking for target column presence in passed test data.
638
- # If present, then both prediction and evaluation metrics will be generated.
639
- # If not present, then only prediction will be generated.
640
- if self.target_column_ind:
641
- metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
642
- test_data_ind = \
643
- self.test_data_ind,
644
- target_column_ind = \
645
- self.target_column_ind,
646
- table_name_mapping=self.table_name_mapping)
1507
+ # Deep copy is needed as the original dictionary contains nested dictionaries
1508
+ # and we want to avoid modifying the original dictionary when changes are made.
1509
+ # The .copy() method creates a shallow copy, which does not suffice for nested dictionaries.
1510
+ # Iterate through the original dictionary to handle deep copying.
1511
+ for key, value in self.data_transformation_params.items():
1512
+ # Check if value is a dictionary
1513
+ if isinstance(value, dict):
1514
+ # If the value is a dictionary, create a deep copy of the dictionary
1515
+ # This ensures that nested dictionaries are also copied, not just referenced.
1516
+ data_params[key] = deep_copy_dict(value)
647
1517
  else:
648
- pred = self.m_evaluator.model_evaluation(rank = rank,
649
- test_data_ind = \
650
- self.test_data_ind,
651
- table_name_mapping=self.table_name_mapping)
652
- # Checking if problem type is classification and target label is present.
653
- if self.is_classification_type() and self.target_label is not None:
654
- # Displaying target column labels
655
- tar_dct = {}
656
- print('\nTarget Column Mapping:')
657
- # Iterating rows
658
- for row in self.target_label.result.itertuples():
659
- # Retrieving the category names of encoded target column
660
- # row[1] contains the orginal name of cateogry
661
- # row[2] contains the encoded value
662
- if row[1] != 'TD_CATEGORY_COUNT':
663
- tar_dct[row[1]] = row[2]
664
-
665
- for key, value in tar_dct.items():
666
- print(f"{key}: {value}")
667
-
668
- print("\nPrediction : ")
669
- print(pred.result)
1518
+ # If the value is not a dictionary, keep a direct reference (no copy needed)
1519
+ data_params[key] = value
1520
+
1521
+ # Names of fit objects that contain the table names
1522
+ # pointing to tables in the database.
1523
+ fit_obj_names = []
1524
+
1525
+ # Persist flag
1526
+ persist = self.kwargs.get('persist', False)
1527
+
1528
+ data_params['auto_mode'] = self.custom_data is None
1529
+
1530
+ # Iterating over data transformation parameters
1531
+ # aml_step_name is the name of the transformation step and val is its value
1532
+ for aml_step_name, val in data_params.items():
1533
+ # Checking if value is of type teradataml DataFrame
1534
+ # If yes, then creating permanent table for the same
1535
+ # and storing the table_name in data_params instead of dataframe.
1536
+ if isinstance(val, DataFrame):
1537
+ fit_obj_names.append(aml_step_name)
1538
+ if persist:
1539
+ data_params[aml_step_name] = val._table_name
1540
+ else:
1541
+ per_name = self._create_per_result_table(prefix='{}'.format(aml_step_name),
1542
+ persist_result_table=val._table_name)
1543
+ data_params[aml_step_name] = per_name
1544
+ elif isinstance(val, dict) and 'fit_obj' in aml_step_name:
1545
+ for key, fit_df in val.items():
1546
+ if isinstance(fit_df, DataFrame):
1547
+ fit_obj_names.append(aml_step_name)
1548
+ if persist:
1549
+ data_params[aml_step_name][key] = fit_df._table_name
1550
+ else:
1551
+ per_name = self._create_per_result_table(prefix='{}'.format(key),
1552
+ persist_result_table=fit_df._table_name)
1553
+ data_params[aml_step_name][key] = per_name
1554
+ elif aml_step_name == 'pca_fit_instance':
1555
+ # Converting PCA object into a plain dict of its fitted attributes
1556
+ pca = data_params[aml_step_name]
1557
+ # Extract pca parameters
1558
+ pca_params = {
1559
+ 'n_components': pca.n_components_,
1560
+ 'components': pca.components_.tolist(),
1561
+ 'explained_variance': pca.explained_variance_.tolist(),
1562
+ 'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
1563
+ 'mean': pca.mean_.tolist(),
1564
+ 'singular_values': pca.singular_values_.tolist(),
1565
+ 'noise_variance': pca.noise_variance_
1566
+ }
1567
+ data_params[aml_step_name] = pca_params
670
1568
 
671
- # Showing performance metrics if there is no test data
672
- # Or if target column is present in test data.
673
- if not self.test_data_ind or self.target_column_ind:
674
- print("\nPerformance Metrics : ")
675
- print(metrics.result)
1569
+ # Serializing data transformation parameters
1570
+ buffer = BytesIO()
1571
+ joblib.dump(data_params, buffer)
1572
+ buffer.seek(0)
1573
+ serialized_data = buffer.getvalue()
1574
+
1575
+ # Creating a string representation of fit object names
1576
+ param = json.dumps(fit_obj_names)
1577
+
1578
+ # Creating a DataFrame of data transformation information
1579
+ row = {
1580
+ 'RANK':-1,
1581
+ 'PARAMETERS':param,
1582
+ 'DATA_PARAMS':serialized_data,
1583
+ }
1584
+ df = pd.DataFrame([row])
1585
+
1586
+ return df
676
1587
 
677
- prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
678
-
679
- # Displaying confusion matrix and ROC-AUC for classification problem
680
- if self.is_classification_type():
681
- print_data = lambda data: print(data) if _is_terminal() else display(data)
682
- # Displaying ROC-AUC for binary classification
683
- if self.target_count == 2:
684
- fit_params = {
685
- "probability_column" : prediction_column,
686
- "observation_column" : self.target_column,
687
- "positive_class" : "1",
688
- "data" : pred.result
689
- }
690
- # Fitting ROC
691
- roc_out = ROC(**fit_params)
692
- print("\nROC-AUC : ")
693
- print_data(roc_out.result)
694
- print_data(roc_out.output_data)
695
-
696
- # Displaying confusion matrix for binary and multiclass classification
697
- prediction_df=pred.result.to_pandas()
698
- target_col = self.target_column
699
- print("\nConfusion Matrix : ")
700
- print_data(confusion_matrix(prediction_df[target_col], prediction_df[prediction_column]))
701
-
702
- # Returning prediction
703
- return pred.result
704
-
705
- @collect_queryband(queryband="AutoML_leaderboard")
706
- def leaderboard(self):
1588
+ def _run_loaded_model(self,
1589
+ test_data,
1590
+ rank=1,
1591
+ output_type='prediction'):
707
1592
  """
708
1593
  DESCRIPTION:
709
- Function displays leaderboard.
1594
+ Internal Function generates prediction or performance metrics using the model at the specified rank
1595
+ in the loaded models leaderboard.
1596
+
1597
+ PARAMETERS:
1598
+ test_data:
1599
+ Required Argument.
1600
+ Specifies the test data on which prediction and performance metrics needs to be generated.
1601
+ Types: teradataml DataFrame
1602
+
1603
+ rank:
1604
+ Optional Argument.
1605
+ Specifies the rank of the model in the leaderboard to be used for prediction.
1606
+ Default Value: 1
1607
+ Types: int
1608
+
1609
+ output_type:
1610
+ Optional Argument.
1611
+ Specifies the type of output to be generated.
1612
+ Default Value: 'prediction'
1613
+ Types: str
1614
+ Permitted Values: 'prediction', 'metrics'
710
1615
 
711
1616
  RETURNS:
712
- Pandas DataFrame with Leaderboard information.
1617
+ Predictions or performance metrics, depending on "output_type".
713
1618
 
714
1619
  RAISES:
715
1620
  TeradataMlException.
716
1621
 
717
- EXAMPLES:
718
- # Create an instance of the AutoML called "automl_obj"
719
- # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
720
- # Perform fit() operation on the "automl_obj".
721
- # Generate leaderboard using leaderboard() method on "automl_obj".
722
- >>> automl_obj.leaderboard()
723
1622
  """
724
- if not self._is_fit_called:
725
- # raise ValueError("fit() method must be called before generating leaderboard.")
726
- err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
727
- "'leaderboard' method", \
728
- "'fit' method must be called before" \
729
- " generating leaderboard.")
730
- raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
731
- return self.leader_board
1623
+ # Indexing starts from 0
1624
+ rank = rank - 1
1625
+ # Extracting parameters
1626
+ parameters = ast.literal_eval(self.loaded_models_info.loc[rank, 'PARAMETERS'])
1627
+ # Model name
1628
+ model_name = self.loaded_models_info.loc[rank, 'MODEL_ID'].split('_')[0]
1629
+ # Feature selection
1630
+ fs = self.loaded_models_info.loc[rank, 'FEATURE_SELECTION']
732
1631
 
733
- @collect_queryband(queryband="AutoML_leader")
734
- def leader(self):
1632
+ # Checking task type
1633
+ if 'R2' in self.loaded_models_info.columns:
1634
+ task_type='Regression'
1635
+ else:
1636
+ task_type='Classification'
1637
+
1638
+ # Model names mapping to Analytic Functions
1639
+ func_map = {
1640
+ 'XGBOOST': lambda params: XGBoost(**params),
1641
+ 'GLM': lambda params: GLM(**params),
1642
+ 'SVM': lambda params: SVM(**params),
1643
+ 'DECISIONFOREST': lambda params: DecisionForest(**params),
1644
+ 'KNN': lambda params: KNN(**params)
1645
+ }
1646
+
1647
+ if output_type == 'prediction':
1648
+ print('Generating prediction using:')
1649
+ else:
1650
+ print('Generating performance metrics using:')
1651
+ print(f"Model Name: {model_name}")
1652
+ print(f"Feature Selection: {fs}")
1653
+
1654
+ # Generating evaluation parameters
1655
+ eval_params = _ModelTraining._eval_params_generation(model_name,
1656
+ parameters['response_column'],
1657
+ task_type)
1658
+ if task_type == 'Classification':
1659
+ eval_params['output_responses'] = parameters['output_responses']
1660
+
1661
+ # Checking if response column is present in test data
1662
+ if parameters['response_column'] not in test_data.columns:
1663
+ # Checking if output type is metrics
1664
+ if output_type == 'metrics':
1665
+ # Response column is required for evaluation, raise error if not present
1666
+ raise ValueError(f"Response column '{parameters['response_column']}' is not present in test data for evaluation.")
1667
+ eval_params.pop('accumulate', None)
1668
+ response_col_present = False
1669
+ else:
1670
+ response_col_present = True
1671
+
1672
+ # Checking whether the data has already been transformed
1673
+ data_node_id = test_data._nodeid
1674
+ if not self.table_name_mapping.get(data_node_id):
1675
+ # Data transformation will be performed on raw test data
1676
+ self.transform_data(data=test_data,
1677
+ data_params=self.loaded_data_transformation_params,
1678
+ auto=self.loaded_data_transformation_params['auto_mode'],
1679
+ verbose=0,
1680
+ target_column_ind=response_col_present)
1681
+
1682
+ # Extracting test data
1683
+ for feature_selection, table_name in self.table_name_mapping[data_node_id].items():
1684
+ if fs in feature_selection:
1685
+ test_data = DataFrame(table_name)
1686
+ break
1687
+
1688
+ if model_name == 'KNN':
1689
+ train_data = DataFrame(self.loaded_models_info.loc[rank, 'DATA_TABLE'])
1690
+
1691
+ parameters['train_data'] = train_data
1692
+ parameters['test_data'] = test_data
1693
+
1694
+ if parameters['response_column'] in test_data.columns:
1695
+ parameters['accumulate'] = parameters['response_column']
1696
+
1697
+ knn = func_map[model_name](parameters)
1698
+
1699
+ # Checking if response column is present in test data
1700
+ if response_col_present and output_type != 'prediction':
1701
+ metrics = knn.evaluate(test_data=test_data, **eval_params)
1702
+ else:
1703
+ predictions = knn.result
1704
+ else:
1705
+ # Extracting result table name
1706
+ result_table_name = self.loaded_models_info.loc[rank, 'RESULT_TABLE']
1707
+ result_table = DataFrame(result_table_name)
1708
+ params = {
1709
+ "skip_input_arg_processing":True,
1710
+ "skip_output_arg_processing":True,
1711
+ "skip_other_arg_processing":True,
1712
+ "skip_func_output_processing":True,
1713
+ "_result_data":result_table,
1714
+ "response_column": parameters['response_column']
1715
+ }
1716
+ model = func_map[model_name](params)
1717
+ # Checking if response column is present in test data
1718
+ if response_col_present and output_type != 'prediction':
1719
+ metrics = model.evaluate(newdata=test_data, **eval_params)
1720
+ else:
1721
+ predictions = model.predict(newdata=test_data, **eval_params)
1722
+
1723
+ # Return metrics when output type is 'metrics'
1724
+ if response_col_present and output_type != 'prediction':
1725
+ return metrics
1726
+
1727
+ # Return prediction when output type is 'prediction'
1728
+ return predictions if model_name == 'KNN' else predictions.result
1729
+
1730
+ @collect_queryband(queryband="AutoML_remove_saved_models")
1731
+ def remove_saved_models(self,
1732
+ table_name):
735
1733
  """
736
1734
  DESCRIPTION:
737
- Function displays best performing model.
738
-
1735
+ Function removes the specified table containing saved models.
1736
+ Note:
1737
+ * If any data table or result table is not present in the database,
1738
+ then it will be skipped.
1739
+
1740
+ PARAMETERS:
1741
+ table_name:
1742
+ Required Argument.
1743
+ Specifies the table name containing saved models.
1744
+ Types: str
1745
+
739
1746
  RETURNS:
740
1747
  None
741
1748
 
@@ -743,25 +1750,48 @@ class AutoML:
743
1750
  TeradataMlException.
744
1751
 
745
1752
  EXAMPLES:
746
- # Create an instance of the AutoML called "automl_obj"
1753
+ # Create an instance of the AutoML called "obj"
747
1754
  # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
748
- # Perform fit() operation on the "automl_obj".
749
- # Generate leaderboard using leaderboard() method on "automl_obj".
750
- # Display best performing model using leader() method on "automl_obj".
751
- >>> automl_obj.leader()
1755
+ >>> obj = AutoML()
1756
+ # Remove saved models from the specified table.
1757
+ >>> obj.remove_saved_models("model_table")
752
1758
  """
753
- if not self._is_fit_called:
754
- # raise ValueError("fit() method must be called before generating leader.")
755
- err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
756
- "'leader' method", \
757
- "'fit' method must be called before" \
758
- " generating leader.")
759
- raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
760
- record = self.leader_board
761
- if not _is_terminal():
762
- display(record[record['Rank'] == 1])
763
- else:
764
- print(record[record['Rank'] == 1])
1759
+ # Appending arguments to list for validation
1760
+ arg_info_matrix = []
1761
+ arg_info_matrix.append(["table_name", table_name, True, (str), True])
1762
+
1763
+ # Validate argument types
1764
+ _Validators._validate_function_arguments(arg_info_matrix)
1765
+
1766
+ df = DataFrame(table_name).to_pandas()
1767
+
1768
+ drop_list = df['DATA_TABLE'].dropna().unique().tolist()
1769
+ drop_list.extend(df['RESULT_TABLE'].dropna().unique().tolist())
1770
+
1771
+ # Removing data transformation parameters tables
1772
+ data = df[df['RANK'] == -1].iloc[0]
1773
+ buffer = BytesIO(data['DATA_PARAMS'])
1774
+ data_params = joblib.load(buffer)
1775
+ fit_obj_lst = json.loads(data['PARAMETERS'])
1776
+ for i in fit_obj_lst:
1777
+ if isinstance(data_params[i], dict):
1778
+ drop_list.extend(data_params[i].values())
1779
+ else:
1780
+ drop_list.append(data_params[i])
1781
+
1782
+ non_existent_tables = []
1783
+ for table in drop_list:
1784
+ try:
1785
+ execute_sql(f"DROP TABLE {table};")
1786
+ except Exception:
1787
+ non_existent_tables.append(table)
1789
+
1790
+ if len(non_existent_tables) > 0:
1791
+ warnings.warn(message=f"\nThe following tables '{non_existent_tables}' do not exist in the database and have been skipped.",
1792
+ stacklevel=2)
1793
+
1794
+ db_drop_table(table_name)
765
1795
 
766
1796
  @staticmethod
767
1797
  def generate_custom_config(file_name = "custom"):
@@ -853,7 +1883,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
853
1883
  max_runtime_secs = None,
854
1884
  stopping_metric = None,
855
1885
  stopping_tolerance = None,
856
- max_models = None):
1886
+ max_models = None,
1887
+ **kwargs):
857
1888
  """
858
1889
  DESCRIPTION:
859
1890
  Internal Function runs Regression.
@@ -895,6 +1926,25 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
895
1926
  Specifies the maximum number of models to be trained.
896
1927
  Types: int
897
1928
 
1929
+ volatile:
1930
+ Optional Argument.
1931
+ Specifies whether to put the results of the
1932
+ function in a volatile table or not. When set to
1933
+ True, results are stored in a volatile table,
1934
+ otherwise not.
1935
+ Default Value: False
1936
+ Types: bool
1937
+
1938
+ persist:
1939
+ Optional Argument.
1940
+ Specifies whether to persist the results of the
1941
+ function in a table or not. When set to True,
1942
+ results are persisted in a table; otherwise,
1943
+ results are garbage collected at the end of the
1944
+ session.
1945
+ Default Value: False
1946
+ Types: bool
1947
+
898
1948
  RETURNS:
899
1949
  a tuple containing, model information and leaderboard.
900
1950
  """
@@ -911,7 +1961,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
911
1961
  target_column = self.target_column,
912
1962
  model_list = model_list,
913
1963
  verbose = verbose,
914
- custom_data = self.custom_data)
1964
+ custom_data = self.custom_data,
1965
+ **kwargs)
915
1966
  # Start time
916
1967
  start_time = time.time()
917
1968
  data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
@@ -923,7 +1974,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
923
1974
  verbose = verbose,
924
1975
  excluded_columns = excluded_columns,
925
1976
  custom_data = self.custom_data,
926
- data_transform_dict = data_transformation_params)
1977
+ data_transform_dict = data_transformation_params,
1978
+ **kwargs)
927
1979
  features, data_transformation_params = self.data_preparation(auto)
928
1980
 
929
1981
  # Calculating max_runtime_secs for model training by,
@@ -943,7 +1995,8 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
943
1995
  verbose = verbose,
944
1996
  features = features,
945
1997
  task_type = "Regression",
946
- custom_data = self.custom_data)
1998
+ custom_data = self.custom_data,
1999
+ **kwargs)
947
2000
  models_info, leaderboard, target_count = self.model_training(auto = auto,
948
2001
  max_runtime_secs = max_runtime_secs,
949
2002
  stopping_metric = stopping_metric,
@@ -989,7 +2042,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
989
2042
  max_runtime_secs = None,
990
2043
  stopping_metric = None,
991
2044
  stopping_tolerance = None,
992
- max_models = None):
2045
+ max_models = None,
2046
+ **kwargs):
993
2047
  """
994
2048
  DESCRIPTION:
995
2049
  Interal Function runs Classification.
@@ -1030,12 +2084,30 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1030
2084
  Optional Argument.
1031
2085
  Specifies the maximum number of models to be trained.
1032
2086
  Types: int
1033
-
2087
+
2088
+ volatile:
2089
+ Optional Argument.
2090
+ Specifies whether to put the results of the
2091
+ function in a volatile table or not. When set to
2092
+ True, results are stored in a volatile table,
2093
+ otherwise not.
2094
+ Default Value: False
2095
+ Types: bool
2096
+
2097
+ persist:
2098
+ Optional Argument.
2099
+ Specifies whether to persist the results of the
2100
+ function in a table or not. When set to True,
2101
+ results are persisted in a table; otherwise,
2102
+ results are garbage collected at the end of the
2103
+ session.
2104
+ Default Value: False
2105
+ Types: bool
2106
+
1034
2107
  RETURNS:
1035
2108
  a tuple containing model information and leaderboard.
1036
2109
  """
1037
-
1038
-
2110
+
1039
2111
  # Feature Exploration Phase
1040
2112
  _FeatureExplore.__init__(self,
1041
2113
  data = self.data,
@@ -1050,7 +2122,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1050
2122
  model_list = model_list,
1051
2123
  verbose = verbose,
1052
2124
  task_type = "Classification",
1053
- custom_data = self.custom_data)
2125
+ custom_data = self.custom_data,
2126
+ **kwargs)
1054
2127
  # Start time
1055
2128
  start_time = time.time()
1056
2129
  data, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)
@@ -1062,7 +2135,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1062
2135
  excluded_columns = excluded_columns,
1063
2136
  custom_data = self.custom_data,
1064
2137
  data_transform_dict = data_transformation_params,
1065
- task_type = "Classification")
2138
+ task_type = "Classification",
2139
+ **kwargs)
1066
2140
  features, data_transformation_params = self.data_preparation(auto)
1067
2141
 
1068
2142
  # Calculating max_runtime_secs for model training by,
@@ -1082,7 +2156,8 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
1082
2156
  verbose = verbose,
1083
2157
  features = features,
1084
2158
  task_type = "Classification",
1085
- custom_data = self.custom_data)
2159
+ custom_data = self.custom_data,
2160
+ **kwargs)
1086
2161
  models_info, leaderboard, target_count = self.model_training(auto = auto,
1087
2162
  max_runtime_secs = max_runtime_secs,
1088
2163
  stopping_metric = stopping_metric,
@@ -1243,7 +2318,8 @@ class AutoRegressor(AutoML):
1243
2318
  stopping_metric=None,
1244
2319
  stopping_tolerance=None,
1245
2320
  max_models=None,
1246
- custom_config_file=None
2321
+ custom_config_file=None,
2322
+ **kwargs
1247
2323
  ):
1248
2324
  """
1249
2325
  DESCRIPTION:
@@ -1284,8 +2360,10 @@ class AutoRegressor(AutoML):
1284
2360
  Required, when "stopping_tolerance" is set, otherwise optional.
1285
2361
  Specifies the stopping metrics for stopping tolerance in model training.
1286
2362
  Permitted Values:
1287
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
1288
- "RMSE", "RMSLE"
2363
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
2364
+ "MAPE", "MPE", "RMSE", "RMSLE",
2365
+ "ME", "EV", "MPD", "MGD"
2366
+
1289
2367
  * For task_type "Classification": 'MICRO-F1','MACRO-F1',
1290
2368
  'MICRO-RECALL','MACRO-RECALL',
1291
2369
  'MICRO-PRECISION', 'MACRO-PRECISION',
@@ -1307,7 +2385,29 @@ class AutoRegressor(AutoML):
1307
2385
  Optional Argument.
1308
2386
  Specifies the path of JSON file in case of custom run.
1309
2387
  Types: str
1310
-
2388
+
2389
+ **kwargs:
2390
+ Specifies the additional arguments for AutoRegressor. Below
2391
+ are the additional arguments:
2392
+ volatile:
2393
+ Optional Argument.
2394
+ Specifies whether to put the interim results of the
2395
+ functions in a volatile table or not. When set to
2396
+ True, results are stored in a volatile table,
2397
+ otherwise not.
2398
+ Default Value: False
2399
+ Types: bool
2400
+
2401
+ persist:
2402
+ Optional Argument.
2403
+ Specifies whether to persist the interim results of the
2404
+ functions in a table or not. When set to True,
2405
+ results are persisted in a table; otherwise,
2406
+ results are garbage collected at the end of the
2407
+ session.
2408
+ Default Value: False
2409
+ Types: bool
2410
+
1311
2411
  RETURNS:
1312
2412
  Instance of AutoRegressor.
1313
2413
 
@@ -1336,24 +2436,28 @@ class AutoRegressor(AutoML):
1336
2436
 
1337
2437
  # Fit the data.
1338
2438
  >>> automl_obj.fit(housing_train, "price")
2439
+
2440
+ # Display leaderboard.
2441
+ >>> automl_obj.leaderboard()
1339
2442
 
1340
- # Predict using best performing model.
1341
- >>> prediction = automl_obj.predict()
1342
- >>> prediction
2443
+ # Display best performing model.
2444
+ >>> automl_obj.leader()
1343
2445
 
1344
- # Run predict for new test data with best performing model.
2446
+ # Run predict on test data using best performing model.
1345
2447
  >>> prediction = automl_obj.predict(housing_test)
1346
2448
  >>> prediction
1347
2449
 
1348
- # Run predict for new test data with second best performing model.
2450
+ # Run predict on test data using second best performing model.
1349
2451
  >>> prediction = automl_obj.predict(housing_test, rank=2)
1350
2452
  >>> prediction
1351
-
1352
- # Display leaderboard.
1353
- >>> automl_obj.leaderboard()
1354
-
1355
- # Display best performing model.
1356
- >>> automl_obj.leader()
2453
+
2454
+ # Run evaluate to get performance metrics using best performing model.
2455
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
2456
+ >>> performance_metrics
2457
+
2458
+ # Run evaluate to get performance metrics using second best performing model.
2459
+ >>> performance_metrics = automl_obj.evaluate(housing_test, 2)
2460
+ >>> performance_metrics
1357
2461
 
1358
2462
  # Example 2 : Run AutoRegressor for regression problem with early stopping metric and tolerance.
1359
2463
  # Scenario : Predict the price of house based on different factors.
@@ -1374,13 +2478,17 @@ class AutoRegressor(AutoML):
1374
2478
  >>> custom_config_file="custom_housing.json")
1375
2479
  # Fit the data.
1376
2480
  >>> automl_obj.fit(housing_train, "price")
1377
-
1378
- # Run predict with best performing model.
1379
- >>> prediction = automl_obj.predict()
1380
- >>> prediction
1381
-
2481
+
1382
2482
  # Display leaderboard.
1383
2483
  >>> automl_obj.leaderboard()
2484
+
2485
+ # Run predict on test data using best performing model.
2486
+ >>> prediction = automl_obj.predict(housing_test)
2487
+ >>> prediction
2488
+
2489
+ # Run evaluate to get performance metrics using best performing model.
2490
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
2491
+ >>> performance_metrics
1384
2492
 
1385
2493
  # Example 3 : Run AutoRegressor for regression problem with maximum runtime.
1386
2494
  # Scenario : Predict the price of house based on different factors.
@@ -1392,20 +2500,24 @@ class AutoRegressor(AutoML):
1392
2500
  >>> max_runtime_secs=500)
1393
2501
  # Fit the data.
1394
2502
  >>> automl_obj.fit(housing_train, "price")
1395
-
1396
- # Run predict with best performing model.
1397
- >>> prediction = automl_obj.predict()
1398
- >>> prediction
1399
-
1400
- # Run predict with second best performing model.
1401
- >>> prediction = automl_obj.predict(rank=2)
1402
- >>> prediction
1403
-
2503
+
1404
2504
  # Display leaderboard.
1405
2505
  >>> automl_obj.leaderboard()
1406
2506
 
1407
2507
  # Display best performing model.
1408
2508
  >>> automl_obj.leader()
2509
+
2510
+ # Run predict on test data using best performing model.
2511
+ >>> prediction = automl_obj.predict(housing_test)
2512
+ >>> prediction
2513
+
2514
+ # Run predict on test data using second best performing model.
2515
+ >>> prediction = automl_obj.predict(housing_test, 2)
2516
+ >>> prediction
2517
+
2518
+ # Run evaluate to get performance metrics using best performing model.
2519
+ >>> performance_metrics = automl_obj.evaluate(housing_test)
2520
+ >>> performance_metrics
1409
2521
  """
1410
2522
  self.verbose = verbose
1411
2523
  self.max_runtime_secs = max_runtime_secs
@@ -1425,7 +2537,8 @@ class AutoRegressor(AutoML):
1425
2537
  stopping_metric=self.stopping_metric,
1426
2538
  stopping_tolerance=self.stopping_tolerance,
1427
2539
  max_models=self.max_models,
1428
- custom_config_file=self.custom_config_file)
2540
+ custom_config_file=self.custom_config_file,
2541
+ **kwargs)
1429
2542
  class AutoClassifier(AutoML):
1430
2543
 
1431
2544
  def __init__(self,
@@ -1436,7 +2549,8 @@ class AutoClassifier(AutoML):
1436
2549
  stopping_metric=None,
1437
2550
  stopping_tolerance=None,
1438
2551
  max_models=None,
1439
- custom_config_file=None
2552
+ custom_config_file=None,
2553
+ **kwargs
1440
2554
  ):
1441
2555
  """
1442
2556
  DESCRIPTION:
@@ -1477,8 +2591,10 @@ class AutoClassifier(AutoML):
1477
2591
  Required, when "stopping_tolerance" is set, otherwise optional.
1478
2592
  Specifies the stopping metrics for stopping tolerance in model training.
1479
2593
  Permitted Values:
1480
- * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
1481
- "RMSE", "RMSLE"
2594
+ * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
2595
+ "MAPE", "MPE", "RMSE", "RMSLE",
2596
+ "ME", "EV", "MPD", "MGD"
2597
+
1482
2598
  * For task_type "Classification": 'MICRO-F1','MACRO-F1',
1483
2599
  'MICRO-RECALL','MACRO-RECALL',
1484
2600
  'MICRO-PRECISION', 'MACRO-PRECISION',
@@ -1500,6 +2616,28 @@ class AutoClassifier(AutoML):
1500
2616
  Optional Argument.
1501
2617
  Specifies the path of json file in case of custom run.
1502
2618
  Types: str
2619
+
2620
+ **kwargs:
2621
+ Specifies the additional arguments for AutoClassifier. Below
2622
+ are the additional arguments:
2623
+ volatile:
2624
+ Optional Argument.
2625
+ Specifies whether to put the interim results of the
2626
+ functions in a volatile table or not. When set to
2627
+ True, results are stored in a volatile table,
2628
+ otherwise not.
2629
+ Default Value: False
2630
+ Types: bool
2631
+
2632
+ persist:
2633
+ Optional Argument.
2634
+ Specifies whether to persist the interim results of the
2635
+ functions in a table or not. When set to True,
2636
+ results are persisted in a table; otherwise,
2637
+ results are garbage collected at the end of the
2638
+ session.
2639
+ Default Value: False
2640
+ Types: bool
1503
2641
 
1504
2642
  RETURNS:
1505
2643
  Instance of AutoClassifier.
@@ -1535,24 +2673,28 @@ class AutoClassifier(AutoML):
1535
2673
 
1536
2674
  # Fit the data.
1537
2675
  >>> automl_obj.fit(admissions_train, "admitted")
2676
+
2677
+ # Display leaderboard.
2678
+ >>> automl_obj.leaderboard()
1538
2679
 
1539
- # Predict using best performing model.
1540
- >>> prediction = automl_obj.predict()
1541
- >>> prediction
2680
+ # Display best performing model.
2681
+ >>> automl_obj.leader()
1542
2682
 
1543
- # Run predict for new test data with best performing model.
2683
+ # Run predict on test data using best performing model.
1544
2684
  >>> prediction = automl_obj.predict(admissions_test)
1545
2685
  >>> prediction
1546
2686
 
1547
- # Run predict for new test data with second best performing model.
2687
+ # Run predict on test data using second best performing model.
1548
2688
  >>> prediction = automl_obj.predict(admissions_test, rank=2)
1549
2689
  >>> prediction
1550
-
1551
- # Display leaderboard.
1552
- >>> automl_obj.leaderboard()
1553
-
1554
- # Display best performing model.
1555
- >>> automl_obj.leader()
2690
+
2691
+ # Run evaluate to get performance metrics using best performing model.
2692
+ >>> performance_metrics = automl_obj.evaluate(admissions_test)
2693
+ >>> performance_metrics
2694
+
2695
+ # Run evaluate to get performance metrics using model rank 4.
2696
+ >>> performance_metrics = automl_obj.evaluate(admissions_test, 4)
2697
+ >>> performance_metrics
1556
2698
 
1557
2699
  # Example 2 : Run AutoClassifier for binary classification.
1558
2700
  # Scenario : Predict whether passenger aboard the RMS Titanic survived
@@ -1561,6 +2703,11 @@ class AutoClassifier(AutoML):
1561
2703
  # configuration file to customize different processes of
1562
2704
  # AutoML Run.
1563
2705
 
2706
+ # Split the data into train and test.
2707
+ >>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
2708
+ >>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
2709
+ >>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
2710
+
1564
2711
  # Generate custom configuration file.
1565
2712
  >>> AutoClassifier.generate_custom_config("custom_titanic")
1566
2713
 
@@ -1568,21 +2715,25 @@ class AutoClassifier(AutoML):
1568
2715
  >>> automl_obj = AutoClassifier(verbose=2,
1569
2716
  >>> custom_config_file="custom_titanic.json")
1570
2717
  # Fit the data.
1571
- >>> automl_obj.fit(titanic, titanic.survived)
1572
-
1573
- # Run predict with best performing model.
1574
- >>> prediction = automl_obj.predict()
1575
- >>> prediction
1576
-
1577
- # Run predict with second best performing model.
1578
- >>> prediction = automl_obj.predict(rank=2)
1579
- >>> prediction
2718
+ >>> automl_obj.fit(titanic_train, titanic_train.survived)
1580
2719
 
1581
2720
  # Display leaderboard.
1582
2721
  >>> automl_obj.leaderboard()
1583
2722
 
1584
2723
  # Display best performing model.
1585
2724
  >>> automl_obj.leader()
2725
+
2726
+ # Run predict on test data using best performing model.
2727
+ >>> prediction = automl_obj.predict(titanic_test)
2728
+ >>> prediction
2729
+
2730
+ # Run predict on test data using second best performing model.
2731
+ >>> prediction = automl_obj.predict(titanic_test, rank=2)
2732
+ >>> prediction
2733
+
2734
+ # Run evaluate to get performance metrics using best performing model.
2735
+ >>> performance_metrics = automl_obj.evaluate(titanic_test)
2736
+ >>> performance_metrics
1586
2737
 
1587
2738
  # Example 3 : Run AutoClassifier for multiclass classification problem.
1588
2739
  # Scenario : Predict the species of iris flower based on different factors.
@@ -1590,6 +2741,11 @@ class AutoClassifier(AutoML):
1590
2741
  # models. Use custom configuration file to customize different
1591
2742
  # processes of AutoML Run.
1592
2743
 
2744
+ # Split the data into train and test.
2745
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
2746
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
2747
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
2748
+
1593
2749
  # Generate custom configuration file.
1594
2750
  >>> AutoClassifier.generate_custom_config("custom_iris")
1595
2751
 
@@ -1597,18 +2753,22 @@ class AutoClassifier(AutoML):
1597
2753
  >>> automl_obj = AutoClassifier(verbose=1,
1598
2754
  >>> custom_config_file="custom_iris.json")
1599
2755
  # Fit the data.
1600
- >>> automl_obj.fit(iris_input, "species")
1601
-
1602
- # Predict using best performing model.
1603
- >>> prediction = automl_obj.predict()
1604
- >>> prediction
1605
-
2756
+ >>> automl_obj.fit(iris_train, "species")
2757
+
1606
2758
  # Display leaderboard.
1607
2759
  >>> automl_obj.leaderboard()
1608
2760
 
1609
2761
  # Display best performing model.
1610
2762
  >>> automl_obj.leader()
1611
2763
 
2764
+ # Predict on test data using best performing model.
2765
+ >>> prediction = automl_obj.predict(iris_test)
2766
+ >>> prediction
2767
+
2768
+ # Run evaluate to get performance metrics using best performing model.
2769
+ >>> performance_metrics = automl_obj.evaluate(iris_test)
2770
+ >>> performance_metrics
2771
+
1612
2772
  # Example 4 : Run AutoClassifier for classification problem with stopping metric and tolerance.
1613
2773
  # Scenario : Predict whether passenger aboard the RMS Titanic survived
1614
2774
  # or not based on different factors. Use custom configuration
@@ -1616,6 +2776,11 @@ class AutoClassifier(AutoML):
1616
2776
  # performance threshold to acquire for the available models, and
1617
2777
  # terminate training upon meeting the stipulated performance criteria.
1618
2778
 
2779
+ # Split the data into train and test.
2780
+ >>> titanic_sample = titanic.sample(frac = [0.8, 0.2])
2781
+ >>> titanic_train= titanic_sample[titanic_sample['sampleid'] == 1].drop('sampleid', axis=1)
2782
+ >>> titanic_test = titanic_sample[titanic_sample['sampleid'] == 2].drop('sampleid', axis=1)
2783
+
1619
2784
  # Generate custom configuration file.
1620
2785
  >>> AutoClassifier.generate_custom_config("custom_titanic")
1621
2786
 
@@ -1627,18 +2792,27 @@ class AutoClassifier(AutoML):
1627
2792
  >>> max_models=8
1628
2793
  >>> custom_config_file="custom_titanic.json")
1629
2794
  # Fit the data.
1630
- >>> automl_obj.fit(titanic, titanic.survived)
1631
-
1632
- # Run predict with best performing model.
1633
- >>> prediction = automl_obj.predict()
1634
- >>> prediction
1635
-
2795
+ >>> automl_obj.fit(titanic_train, titanic_train.survived)
2796
+
1636
2797
  # Display leaderboard.
1637
2798
  >>> automl_obj.leaderboard()
2799
+
2800
+ # Run predict on test data using best performing model.
2801
+ >>> prediction = automl_obj.predict(titanic_test)
2802
+ >>> prediction
2803
+
2804
+ # Run evaluate to get performance metrics using best performing model.
2805
+ >>> performance_metrics = automl_obj.evaluate(titanic_test)
2806
+ >>> performance_metrics
1638
2807
 
1639
2808
  # Example 5 : Run AutoClassifier for classification problem with maximum runtime.
1640
2809
  # Scenario : Predict the species of iris flower based on different factors.
1641
2810
  # Run AutoML to get the best performing model in specified time.
2811
+
2812
+ # Split the data into train and test.
2813
+ >>> iris_sample = iris_input.sample(frac = [0.8, 0.2])
2814
+ >>> iris_train= iris_sample[iris_sample['sampleid'] == 1].drop('sampleid', axis=1)
2815
+ >>> iris_test = iris_sample[iris_sample['sampleid'] == 2].drop('sampleid', axis=1)
1642
2816
 
1643
2817
  # Create instance of AutoClassifier.
1644
2818
  >>> automl_obj = AutoClassifier(verbose=2,
@@ -1646,21 +2820,25 @@ class AutoClassifier(AutoML):
1646
2820
  >>> max_runtime_secs=500,
1647
2821
  >>> max_models=3)
1648
2822
  # Fit the data.
1649
- >>> automl_obj.fit(iris_input, iris_input.species)
1650
-
1651
- # Run predict with best performing model.
1652
- >>> prediction = automl_obj.predict()
1653
- >>> prediction
1654
-
1655
- # Run predict with second best performing model.
1656
- >>> prediction = automl_obj.predict(rank=2)
1657
- >>> prediction
1658
-
2823
+ >>> automl_obj.fit(iris_train, iris_train.species)
2824
+
1659
2825
  # Display leaderboard.
1660
2826
  >>> automl_obj.leaderboard()
1661
2827
 
1662
2828
  # Display best performing model.
1663
- >>> automl_obj.leader()
2829
+ >>> automl_obj.leader()
2830
+
2831
+ # Run predict on test data using best performing model.
2832
+ >>> prediction = automl_obj.predict(iris_test)
2833
+ >>> prediction
2834
+
2835
+ # Run predict on test data using second best performing model.
2836
+ >>> prediction = automl_obj.predict(iris_test, rank=2)
2837
+ >>> prediction
2838
+
2839
+ # Run evaluate to get performance metrics using model rank 3.
2840
+ >>> performance_metrics = automl_obj.evaluate(iris_test, 3)
2841
+ >>> performance_metrics
1664
2842
  """
1665
2843
  self.verbose = verbose
1666
2844
  self.max_runtime_secs = max_runtime_secs
@@ -1673,11 +2851,12 @@ class AutoClassifier(AutoML):
1673
2851
  self.exclude = exclude
1674
2852
 
1675
2853
  super(AutoClassifier, self).__init__(task_type=self.task_type,
1676
- include = self.include,
1677
- exclude = self.exclude,
1678
- verbose=self.verbose,
1679
- max_runtime_secs=self.max_runtime_secs,
1680
- stopping_metric=self.stopping_metric,
1681
- stopping_tolerance=self.stopping_tolerance,
1682
- max_models=self.max_models,
1683
- custom_config_file=self.custom_config_file)
2854
+ include = self.include,
2855
+ exclude = self.exclude,
2856
+ verbose=self.verbose,
2857
+ max_runtime_secs=self.max_runtime_secs,
2858
+ stopping_metric=self.stopping_metric,
2859
+ stopping_tolerance=self.stopping_tolerance,
2860
+ max_models=self.max_models,
2861
+ custom_config_file=self.custom_config_file,
2862
+ **kwargs)