teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (240)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +306 -0
  4. teradataml/__init__.py +10 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +299 -16
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +13 -3
  11. teradataml/analytics/json_parser/utils.py +13 -6
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +11 -2
  15. teradataml/analytics/table_operator/__init__.py +4 -3
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +66 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1502 -323
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +247 -307
  22. teradataml/automl/data_transformation.py +32 -12
  23. teradataml/automl/feature_engineering.py +325 -86
  24. teradataml/automl/model_evaluation.py +44 -35
  25. teradataml/automl/model_training.py +122 -153
  26. teradataml/catalog/byom.py +8 -8
  27. teradataml/clients/pkce_client.py +1 -1
  28. teradataml/common/__init__.py +2 -1
  29. teradataml/common/constants.py +72 -0
  30. teradataml/common/deprecations.py +13 -7
  31. teradataml/common/garbagecollector.py +152 -120
  32. teradataml/common/messagecodes.py +11 -2
  33. teradataml/common/messages.py +4 -1
  34. teradataml/common/sqlbundle.py +26 -4
  35. teradataml/common/utils.py +225 -14
  36. teradataml/common/wrapper_utils.py +1 -1
  37. teradataml/context/context.py +82 -2
  38. teradataml/data/SQL_Fundamentals.pdf +0 -0
  39. teradataml/data/complaints_test_tokenized.csv +353 -0
  40. teradataml/data/complaints_tokens_model.csv +348 -0
  41. teradataml/data/covid_confirm_sd.csv +83 -0
  42. teradataml/data/dataframe_example.json +27 -1
  43. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  44. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  45. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  46. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  47. teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
  48. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  49. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  50. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  51. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  52. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  53. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  54. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  55. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  56. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  57. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  58. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  59. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  60. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  61. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  62. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  63. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  64. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  65. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  66. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  67. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  68. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  69. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  70. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  71. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  72. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  74. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  75. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  76. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  77. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  78. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  79. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  80. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  81. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  82. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  83. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  84. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  85. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  86. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  87. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  88. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  89. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  90. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  91. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  92. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  93. teradataml/data/dwt2d_dataTable.csv +65 -0
  94. teradataml/data/dwt_dataTable.csv +8 -0
  95. teradataml/data/dwt_filterTable.csv +3 -0
  96. teradataml/data/finance_data4.csv +13 -0
  97. teradataml/data/grocery_transaction.csv +19 -0
  98. teradataml/data/idwt2d_dataTable.csv +5 -0
  99. teradataml/data/idwt_dataTable.csv +8 -0
  100. teradataml/data/idwt_filterTable.csv +3 -0
  101. teradataml/data/interval_data.csv +5 -0
  102. teradataml/data/jsons/paired_functions.json +14 -0
  103. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  104. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  105. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  106. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  107. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  108. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  109. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  110. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  111. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  112. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  113. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  114. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  115. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  116. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  117. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  118. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  119. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  120. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  121. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  122. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  123. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  124. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  125. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  126. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  127. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  128. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  129. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  130. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  131. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  132. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  133. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  134. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  135. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  136. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  137. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  138. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  139. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  140. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  141. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  142. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  143. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  144. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  145. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  146. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  147. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  148. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  149. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  150. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  151. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  152. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  153. teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
  154. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  155. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  156. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  157. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  158. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  159. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
  160. teradataml/data/load_example_data.py +8 -2
  161. teradataml/data/medical_readings.csv +101 -0
  162. teradataml/data/naivebayestextclassifier_example.json +1 -1
  163. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  164. teradataml/data/patient_profile.csv +101 -0
  165. teradataml/data/peppers.png +0 -0
  166. teradataml/data/real_values.csv +14 -0
  167. teradataml/data/sax_example.json +8 -0
  168. teradataml/data/scripts/deploy_script.py +1 -1
  169. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  170. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  171. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  172. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  173. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
  174. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  175. teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
  176. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  177. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  178. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  179. teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
  180. teradataml/data/star_pivot.csv +8 -0
  181. teradataml/data/target_udt_data.csv +8 -0
  182. teradataml/data/templates/open_source_ml.json +3 -1
  183. teradataml/data/teradataml_example.json +20 -1
  184. teradataml/data/timestamp_data.csv +4 -0
  185. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  186. teradataml/data/uaf_example.json +55 -1
  187. teradataml/data/unpivot_example.json +15 -0
  188. teradataml/data/url_data.csv +9 -0
  189. teradataml/data/vectordistance_example.json +4 -0
  190. teradataml/data/windowdfft.csv +16 -0
  191. teradataml/dataframe/copy_to.py +1 -1
  192. teradataml/dataframe/data_transfer.py +5 -3
  193. teradataml/dataframe/dataframe.py +1002 -201
  194. teradataml/dataframe/fastload.py +3 -3
  195. teradataml/dataframe/functions.py +867 -0
  196. teradataml/dataframe/row.py +160 -0
  197. teradataml/dataframe/setop.py +2 -2
  198. teradataml/dataframe/sql.py +840 -33
  199. teradataml/dataframe/window.py +1 -1
  200. teradataml/dbutils/dbutils.py +878 -34
  201. teradataml/dbutils/filemgr.py +48 -1
  202. teradataml/geospatial/geodataframe.py +1 -1
  203. teradataml/geospatial/geodataframecolumn.py +1 -1
  204. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  205. teradataml/lib/aed_0_1.dll +0 -0
  206. teradataml/opensource/__init__.py +1 -1
  207. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  208. teradataml/opensource/_lightgbm.py +950 -0
  209. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  210. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  211. teradataml/opensource/sklearn/__init__.py +0 -1
  212. teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
  213. teradataml/options/__init__.py +9 -23
  214. teradataml/options/configure.py +42 -4
  215. teradataml/options/display.py +2 -2
  216. teradataml/plot/axis.py +4 -4
  217. teradataml/scriptmgmt/UserEnv.py +13 -9
  218. teradataml/scriptmgmt/lls_utils.py +77 -23
  219. teradataml/store/__init__.py +13 -0
  220. teradataml/store/feature_store/__init__.py +0 -0
  221. teradataml/store/feature_store/constants.py +291 -0
  222. teradataml/store/feature_store/feature_store.py +2223 -0
  223. teradataml/store/feature_store/models.py +1505 -0
  224. teradataml/store/vector_store/__init__.py +1586 -0
  225. teradataml/table_operators/Script.py +2 -2
  226. teradataml/table_operators/TableOperator.py +106 -20
  227. teradataml/table_operators/query_generator.py +3 -0
  228. teradataml/table_operators/table_operator_query_generator.py +3 -1
  229. teradataml/table_operators/table_operator_util.py +102 -56
  230. teradataml/table_operators/templates/dataframe_register.template +69 -0
  231. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  232. teradataml/telemetry_utils/__init__.py +0 -0
  233. teradataml/telemetry_utils/queryband.py +52 -0
  234. teradataml/utils/dtypes.py +4 -2
  235. teradataml/utils/validators.py +34 -2
  236. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
  237. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
  238. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  239. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  240. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/automl/model_evaluation.py

@@ -15,6 +15,7 @@
 
 # Python libraries
 import time
+import ast
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -56,7 +57,12 @@ class _ModelEvaluator:
         self.target_column = target_column
         self.task_type = task_type
 
-    def model_evaluation(self, rank, table_name_mapping, test_data_ind = False, target_column_ind = False):
+    def model_evaluation(self,
+                         rank,
+                         table_name_mapping,
+                         data_node_id,
+                         target_column_ind = True,
+                         get_metrics = False):
         """
         DESCRIPTION:
             Function performs the model evaluation on the specified rank in leaderborad.
@@ -72,25 +78,32 @@ class _ModelEvaluator:
                 Specifies the mapping of train,test table names.
                 Types: dict
 
-            test_data_ind:
-                Optional Argument.
-                Specifies whether test data is present or not.
-                Default Value: False
-                Types: bool
+            data_node_id:
+                Required Argument.
+                Specifies the test data node id.
+                Types: str
 
             target_column_ind:
                 Optional Argument.
                 Specifies whether target column is present in the dataset or not.
+                Default Value: True
+                Types: bool
+
+            get_metrics:
+                Optional Argument.
+                Specifies whether to return metrics or not.
                 Default Value: False
+                Types: bool
 
         RETURNS:
             tuple containing, performance metrics and predicitions of specified rank ML model.
 
         """
-        # Setting test data indicator and target column indicator
-        self.test_data_ind = test_data_ind
+        # Setting target column indicator
         self.target_column_ind = target_column_ind
         self.table_name_mapping = table_name_mapping
+        self.data_node_id = data_node_id
+        self.get_metrics = get_metrics
 
         # Return predictions only if test data is present and target column is not present
         return self._evaluator(rank)
@@ -114,38 +127,34 @@ class _ModelEvaluator:
         """
         # Extracting model using rank
         model = self.model_info.loc[rank]
+
+        ml_name = self.model_info.loc[rank]['MODEL_ID'].split('_')[0]
 
         # Defining eval_params
-        eval_params = _ModelTraining._eval_params_generation(model['Name'],
+        eval_params = _ModelTraining._eval_params_generation(ml_name,
                                                              self.target_column,
                                                              self.task_type)
 
-        # Test Data
-        test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
-
-        # Getting test data from table
-        if not self.test_data_ind:
-            # Test Data
-            test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
-        else:
-            test = DataFrame(self.table_name_mapping['{}_new_test'.format(model['Feature-Selection'])])
-
-        print("\nFollowing model is being used for generating prediction :")
-        print("Model ID :", model['Model-ID'],
-              "\nFeature Selection Method :",model['Feature-Selection'])
+        # Extracting test data for evaluation based on data node id
+        test = DataFrame(self.table_name_mapping[self.data_node_id]['{}_new_test'.format(model['FEATURE_SELECTION'])])
 
-        # Evaluation and predictions
-        if model['Name'] == 'knn':
-            metrics = model['model-obj'].evaluate(test_data=test)
-            pred = model['model-obj'].predict(test_data=test)
+        print("\nFollowing model is being picked for evaluation:")
+        print("Model ID :", model['MODEL_ID'],
+              "\nFeature Selection Method :",model['FEATURE_SELECTION'])
+
+        if self.task_type.lower() == 'classification':
+            params = ast.literal_eval(model['PARAMETERS'])
+            eval_params['output_responses'] = params['output_responses']
+
+        # Mapping data according to model type
+        data_map = 'test_data' if ml_name == 'KNN' else 'newdata'
+        # Performing evaluation if get_metrics is True else returning predictions
+        if self.get_metrics:
+            metrics = model['model-obj'].evaluate(**{data_map: test}, **eval_params)
+            return metrics
         else:
-            # Return predictions only if test data is present and target column is not present
-            if self.test_data_ind and not self.target_column_ind:
+            # Removing accumulate parameter if target column is not present
+            if not self.target_column_ind:
                 eval_params.pop("accumulate")
-                pred = model['model-obj'].predict(newdata=test, **eval_params)
-                return pred
-            # Return both metrics and predictions for all other cases
-            metrics = model['model-obj'].evaluate(newdata=test, **eval_params)
-            pred = model['model-obj'].predict(newdata=test, **eval_params)
-
-            return (metrics, pred)
+            pred = model['model-obj'].predict(**{data_map: test}, **eval_params)
+            return pred
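The net effect of the _ModelEvaluator hunks: the boolean test_data_ind is gone, callers identify the test data through a data_node_id that keys into a now-nested table_name_mapping, and get_metrics chooses between returning metrics and returning predictions. A minimal sketch of the new call shape, assuming an active Vantage session; _ModelEvaluator is internal to AutoML, and the constructor arguments plus all node and table names below are illustrative assumptions inferred from the hunks, not part of the diff:

# Hypothetical sketch; not a public API.
from teradataml.automl.model_evaluation import _ModelEvaluator

# table_name_mapping is now keyed by data node id first, then by
# '<feature_selection>_new_test' (placeholder names throughout).
table_name_mapping = {
    "node_0": {"lasso_new_test": "t_lasso",
               "rfe_new_test": "t_rfe",
               "pca_new_test": "t_pca"},
}

# model_info: the leaderboard pandas DataFrame produced by model training.
evaluator = _ModelEvaluator(model_info,
                            target_column="price",
                            task_type="Regression")

# get_metrics=True runs evaluate() and returns metrics;
# the default (False) runs predict() and returns predictions.
metrics = evaluator.model_evaluation(rank=1,
                                     table_name_mapping=table_name_mapping,
                                     data_node_id="node_0",
                                     get_metrics=True)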
teradataml/automl/model_training.py

@@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor
 import math
 import pandas as pd
 from itertools import product
+import numpy as np
 
 # Teradata libraries
 from teradataml.context import context as tdmlctx
@@ -27,6 +28,7 @@ from teradataml.dataframe.dataframe import DataFrame
 from teradataml import execute_sql, get_connection
 from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
 from teradataml.utils.validators import _Validators
+from teradataml.common.utils import UtilFuncs
 
 
 class _ModelTraining:
@@ -38,7 +40,8 @@ class _ModelTraining:
                  verbose=0,
                  features=None,
                  task_type="Regression",
-                 custom_data = None):
+                 custom_data = None,
+                 **kwargs):
         """
         DESCRIPTION:
             Function initializes the data, target column, features and models
@@ -89,6 +92,28 @@
                 Optional Argument.
                 Specifies json object containing user customized input.
                 Types: json object
+
+            **kwargs:
+                Specifies the additional arguments for model training. Below
+                are the additional arguments:
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the interim results of the
+                    functions in a volatile table or not. When set to
+                    True, results are stored in a volatile table,
+                    otherwise not.
+                    Default Value: False
+                    Types: bool
+
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the interim results of the
+                    functions in a table or not. When set to True,
+                    results are persisted in a table; otherwise,
+                    results are garbage collected at the end of the
+                    session.
+                    Default Value: False
+                    Types: bool
         """
         self.data = data
         self.target_column = target_column
@@ -99,6 +124,8 @@
         self.custom_data = custom_data
         self.labels = self.data.drop_duplicate(self.target_column).size
         self.startify_col = None
+        self.persist = kwargs.get("persist", False)
+        self.volatile = kwargs.get("volatile", False)
 
     def model_training(self,
                        auto=True,
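The new volatile/persist keywords documented above control where the interim result tables live. A hedged sketch of how they would be passed from the public interface, assuming AutoML forwards fit-time kwargs down to _ModelTraining as this diff suggests; table and column names are placeholders, so check the 20.0.0.3 user guide for the exact entry point:

# Sketch under assumptions; verify against the released documentation.
from teradataml import AutoML, DataFrame

train = DataFrame("housing_train")      # placeholder table
aml = AutoML(task_type="Regression")
# persist=True keeps interim result tables instead of letting the
# garbage collector drop them at session end; volatile=True would
# route them to volatile tables instead.
aml.fit(train, "price", persist=True)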
@@ -278,20 +305,25 @@
         """
         # Creating a copy to avoid use of same reference of memory
         if self.task_type != "Regression":
-            sorted_model_df = trained_models_info.sort_values(by=['Micro-F1', 'Weighted-F1'],
-                                                              ascending=[False, False]).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by=['MICRO-F1', 'WEIGHTED-F1'],
+                                                              ascending=[False, False]).reset_index(drop=True)
         else:
-            sorted_model_df = trained_models_info.sort_values(by='R2-score',
-                                                              ascending=False).reset_index(drop=True)
+            sorted_model_df = trained_models_info.sort_values(by='R2',
+                                                              ascending=False).reset_index(drop=True)
+
 
         # Adding rank to leaderboard
-        sorted_model_df.insert(0, 'Rank', sorted_model_df.index + 1)
+        sorted_model_df.insert(0, 'RANK', sorted_model_df.index + 1)
+
+        # Internal Data list for leaderboard
+        dp_lst = ["model-obj", "DATA_TABLE", "RESULT_TABLE", "PARAMETERS"]
 
         # Excluding the model object and model name from leaderboard
-        leaderboard = sorted_model_df.drop(["model-obj","Name"], axis=1)
+        leaderboard = sorted_model_df.drop(dp_lst, axis=1)
+
         # filtering the rows based on the max_models
         if self.max_models is not None:
-            leaderboard = leaderboard[leaderboard["Rank"] <= self.max_models]
+            leaderboard = leaderboard[leaderboard["RANK"] <= self.max_models]
 
         self._display_msg(msg="Leaderboard",
                           progress_bar=self.progress_bar,
@@ -436,28 +468,24 @@
             max_depth.extend([6, 7, 8])
             min_node_size.extend([2])
             iter_num.extend([20])
-            num_trees.extend([10, 20])
         elif num_rows < 10000 and num_cols < 15:
             min_impurity.extend([0.1, 0.2])
             shrinkage_factor.extend([0.1, 0.3])
             max_depth.extend([6, 8, 10])
             min_node_size.extend([2, 3])
             iter_num.extend([20, 30])
-            num_trees.extend([20, 30])
         elif num_rows < 100000 and num_cols < 20:
             min_impurity.extend([0.2, 0.3])
             shrinkage_factor.extend([0.01, 0.1, 0.2])
             max_depth.extend([4, 6, 7])
             min_node_size.extend([3, 4])
             iter_num.extend([30, 40])
-            num_trees.extend([30, 40])
         else:
             min_impurity.extend([0.1, 0.2, 0.3])
             shrinkage_factor.extend([0.01, 0.05, 0.1])
             max_depth.extend([3, 4, 7, 8])
             min_node_size.extend([2, 3, 4])
             iter_num.extend([20, 30, 40])
-            num_trees.extend([20, 30, 40])
 
         # Hyperparameters for XGBoost model
         xgb_params = {
@@ -736,12 +764,15 @@
 
         # Hyperparameters for each model
         model_params = parameters[:min(len(parameters), 5)]
-        self._display_msg(msg="\nPerforming hyperParameter tuning ...", progress_bar=self.progress_bar)
+        self._display_msg(msg="\nPerforming hyperparameter tuning ...", progress_bar=self.progress_bar)
 
-        # Defining training and testing data
+        # Defining training data
         data_types = ['lasso', 'rfe', 'pca']
         trainng_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_train']) for data_type in data_types)
-        testing_datas = tuple(DataFrame(self.table_name_mapping[f'{data_type}_test']) for data_type in data_types)
+
+        if self.task_type == "Classification":
+            response_values = trainng_datas[0].get(self.target_column).drop_duplicate().get_values().flatten().tolist()
+            self.output_response = [str(i) for i in response_values]
 
         if self.stopping_metric is None:
             self.stopping_tolerance, self.stopping_metric = 1.0, 'MICRO-F1' \
@@ -765,115 +796,16 @@
 
         trained_models = []
         for param in model_params:
-            result = self._hyperparameter_tunning(param, trainng_datas, testing_datas)
-            trained_models.append(result)
+            result = self._hyperparameter_tunning(param, trainng_datas)
+            if result is not None:
+                trained_models.append(result)
 
         models_df = pd.concat(trained_models, ignore_index=True)
-
-        # Score the model and combine the results into a single DataFrame
-        trained_models_info = self._model_scoring(testing_datas, models_df)
-        trained_models_info = trained_models_info.reset_index(drop=True)
-
-        return trained_models_info
-
-    def _model_scoring(self,
-                       test_data,
-                       model_info):
-        """
-        DESCRIPTION:
-            Internal function generates the performance metrics for
-            trained ML models using testing dataset.
-
-        PARAMETERS:
-            test_data
-                Required Argument.
-                Specifies the testing datasets
-                Types: tuple of Teradataml DataFrame
-
-            model_info
-                Required Argument.
-                Specifies the trained models information.
-                Types: Pandas DataFrame
-
-        RETURNS:
-            Pandas DataFrame containing, trained models with their performance metrics.
-        """
-        self._display_msg(msg="Evaluating models performance ...",
-                          progress_bar = self.progress_bar,
-                          show_data=True)
-        # Empty list for storing model performance metrics
-        model_performance_data = []
-
-        # Mapping feature selection methods to corresponding test data
-        feature_selection_to_test_data = {"lasso": test_data[0],
-                                          "rfe": test_data[1],
-                                          "pca": test_data[2]}
-
-        # Iterating over models
-        for index, model_row in model_info.iterrows():
-            # Extracting model name, model id, feature selection method, and model object
-            model_name, model_id, feature_selection, model_object = model_row['Name'], \
-                model_row['Model-ID'], model_row['Feature-Selection'], model_row['obj']
-
-            # Selecting test data based on feature selection method
-            test_set = feature_selection_to_test_data[feature_selection]
-
-            # Model evaluation
-            if model_name == 'knn':
-                performance_metrics = model_object.evaluate(test_data=test_set)
-            else:
-                eval_params = _ModelTraining._eval_params_generation(model_name,
-                                                                     self.target_column,
-                                                                     self.task_type)
-                performance_metrics = model_object.evaluate(newdata=test_set, **eval_params)
-
-            # Extracting performance metrics
-            if self.is_classification_type():
-                # Classification
-                # Extract performance metrics from the output data
-                performance_metrics_list = [metric[2] for metric in performance_metrics.output_data.itertuples()]
-
-                # Combine all the elements to form a new row
-                new_row = [model_name, model_id, feature_selection] + performance_metrics_list + [model_object]
-            else:
-                # Regression
-                regression_metrics = next(performance_metrics.result.itertuples())
-                sample_size = test_set.select('id').size
-                feature_count = len(test_set.columns) - 2
-                r2_score = regression_metrics[8]
-                adjusted_r2_score = 1 - ((1 - r2_score) * (sample_size - 1) / (sample_size - feature_count - 1))
-                new_row = [model_name, model_id, feature_selection, regression_metrics[0],
-                           regression_metrics[1], regression_metrics[2], regression_metrics[5],
-                           regression_metrics[6], r2_score, adjusted_r2_score, model_object]
-
-            model_performance_data.append(new_row)
-
-        if self.is_classification_type():
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name','Model-ID',
-                                            'Feature-Selection','Accuracy','Micro-Precision',
-                                            'Micro-Recall','Micro-F1',
-                                            'Macro-Precision','Macro-Recall',
-                                            'Macro-F1','Weighted-Precision',
-                                            'Weighted-Recall','Weighted-F1',
-                                            'model-obj'])
-        else:
-            model_metrics_df = pd.DataFrame(model_performance_data, columns=['Name', 'Model-ID',
-                                            'Feature-Selection',
-                                            'MAE', 'MSE', 'MSLE',
-                                            'RMSE', 'RMSLE',
-                                            'R2-score',
-                                            'Adjusted R2-score',
-                                            'model-obj'])
-        self._display_msg(msg="Evaluation completed.",
-                          progress_bar = self.progress_bar,
-                          show_data=True)
-
-        return model_metrics_df
-
+        return models_df
+
     def _hyperparameter_tunning(self,
                                 model_param,
-                                train_data,
-                                test_data):
+                                train_data):
         """
         DESCRIPTION:
             Internal function performs hyperparameter tuning on
@@ -890,11 +822,6 @@
                 Specifies the training datasets.
                 Types: tuple of Teradataml DataFrame
 
-            test_data
-                Required Argument.
-                Specifies the testing datasets
-                Types: tuple of Teradataml DataFrame
-
         RETURNS:
             pandas DataFrame containing, trained models information.
         """
@@ -910,13 +837,21 @@
         # Input columns for model
         model_param['input_columns'] = self.features
 
+        # Setting persist for model
+        model_param['persist'] = self.persist
+
         self._display_msg(msg=model_param['name'],
                           progress_bar=self.progress_bar,
                           show_data=True)
 
-        # Defining test data for KNN
+        # As we are using entire data for HPT training. So,
+        # passing prepared training data as test_data for KNN.
         if model_param['name'] == 'knn':
-            model_param['test_data'] = test_data
+            model_param['test_data'] = train_data
+
+        if self.task_type == "Classification":
+            model_param['output_prob'] = True
+            model_param['output_responses'] = self.output_response
 
         # Using RandomSearch for hyperparameter tunning when max_models is given.
         # Otherwise, using GridSearch for hyperparameter tunning.
@@ -951,26 +886,45 @@
             sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
 
         # Getting all passed models
-        _df = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID']], on='MODEL_ID', how='inner')
-        # Creating mapping data ID to feature selection method
-        data_id_to_method_map = {"DF_0": "lasso", "DF_1": "rfe", "DF_2": "pca"}
-
-        # Mapping data ID to feature selection method
-        _df['Feature-Selection'] = _df['DATA_ID'].map(data_id_to_method_map)
-        # Getting model details
-        _df['Name'] = model_param['name']
-        _df['Model-ID'] = _df['MODEL_ID']
-        _df['obj'] = _df['MODEL_ID'].apply(lambda x: _obj.get_model(x))
-
-        # Extracting needed columns
-        model_info = _df[["Name", "Model-ID", "Feature-Selection", "obj"]]
-
-        self._display_msg(msg="-"*100,
-                          progress_bar=self.progress_bar,
-                          show_data=True)
-        self.progress_bar.update()
+        model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
+                                            on='MODEL_ID', how='inner')
+        if not model_info.empty:
+            # Creating mapping data ID to feature selection method
+            data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
+                                    "DF_1": ('rfe', train_data[1]._table_name),
+                                    "DF_2": ('pca', train_data[2]._table_name)}
+
+            # Updating model stats with feature selection method and result table
+            for index, row in model_info.iterrows():
+                model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
+                model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
+                model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
+                model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
+
+            # Dropping column 'DATA_ID'
+            model_info.drop(['DATA_ID'], axis=1, inplace=True)
 
-        return model_info
+            model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
+
+            if not self.is_classification_type():
+                # Calculating Adjusted-R2 for regression
+                # Getting size and feature count for each feature selection method
+                methods = ["lasso", "rfe", "pca"]
+                size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
+                feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
+                model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
+                    1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
+                    (size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
+
+            self._display_msg(msg="-"*100,
+                              progress_bar=self.progress_bar,
+                              show_data=True)
+            self.progress_bar.update()
+
+            return model_info
+
+        # Returning None, if no model is passed
+        return None
 
     @staticmethod
     def _eval_params_generation(ml_name,
@@ -1006,21 +960,36 @@
         # Setting the eval_params
         eval_params = {"id_column": "id",
                        "accumulate": target_column}
+
+        model_type = {
+            'xgboost': 'model_type',
+            'glm': 'model_type',
+            'decisionforest': 'tree_type',
+            'svm': 'model_type',
+            'knn': 'model_type'
+        }
+
+        ml_name = ml_name.replace('_', '').lower()
 
         # For Classification
         if task_type.lower() != "regression":
+            eval_params[model_type[ml_name]] = 'Classification'
+            eval_params['output_prob'] = True
+
             if ml_name == 'xgboost':
-                eval_params['model_type'] = 'Classification'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
-            else:
-                if ml_name == 'glm':
-                    eval_params['family'] = 'BINOMIAL'
-
-                eval_params['output_prob'] = True
+
+            elif ml_name == 'glm':
+                eval_params['family'] = 'BINOMIAL'
+
         else:
             # For Regression
+            eval_params[model_type[ml_name]] = 'Regression'
+
             if ml_name == 'xgboost':
-                eval_params['model_type'] = 'Regression'
                 eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
+
+            elif ml_name == 'glm':
+                eval_params['family'] = 'GAUSSIAN'
 
         return eval_params
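Because _eval_params_generation is a @staticmethod with no session dependency, the rewritten branching can be checked in isolation. A short illustration read directly off the hunk above (internal helper, so the import path may change between releases):

from teradataml.automl.model_training import _ModelTraining

params = _ModelTraining._eval_params_generation("XGBoost", "price", "Classification")
# -> {'id_column': 'id', 'accumulate': 'price',
#     'model_type': 'Classification', 'output_prob': True,
#     'object_order_column': ['task_index', 'tree_num', 'iter',
#                             'class_num', 'tree_order']}

params = _ModelTraining._eval_params_generation("DecisionForest", "price", "Regression")
# -> {'id_column': 'id', 'accumulate': 'price', 'tree_type': 'Regression'}
# 'decisionforest' maps to 'tree_type' rather than 'model_type'; a GLM
# regression would additionally get family='GAUSSIAN'.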
teradataml/catalog/byom.py

@@ -26,7 +26,7 @@ from teradataml.options.display import display
 from teradataml.common.constants import ModelCatalogingConstants as mac
 from teradataml.options.configure import configure
 from teradataml.utils.utils import execute_sql
-from teradatasqlalchemy.telemetry.queryband import collect_queryband
+from teradataml.telemetry_utils.queryband import collect_queryband
 
 validator = _Validators()
 
@@ -541,13 +541,12 @@ def save_byom(model_id,
     # If exists, extract required information about table columns types
     # else extract from additional_columns_types.
     # Also validate model_id against allowed length.
-    table_exists = connection.dialect.has_table(connection, table_name=table_name, schema=schema_name)
+    table_exists = connection.dialect.has_table(connection, table_name=table_name,
+                                                schema=schema_name, table_only=True)
     if table_exists:
         # Check if model exists or not. If exists, raise error.
         __check_if_model_exists(
             model_id, table_name, schema_name, raise_error_if_model_found=True)
-        if len(additional_columns_types) != 0:
-            warnings.warn("Argument additional_columns_types is ignored since table already exists.", stacklevel=2)
 
         # Gather column name and type information from existing table
         existing_table_df = DataFrame(in_schema(schema_name, table_name))
@@ -807,7 +806,7 @@ def delete_byom(model_id, table_name=None, schema_name=None):
 
     # Before proceed further, check whether table exists or not.
     conn = get_connection()
-    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name):
+    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name, table_only=True):
         error_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
         error_msg = Messages.get_message(
             error_code, "delete", 'Table "{}.{}" does not exist.'.format(schema_name, table_name))
@@ -1472,7 +1471,7 @@ def retrieve_byom(model_id,
 
     # Before proceeding further, check whether table exists or not.
     conn = get_connection()
-    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name):
+    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name, table_only=True):
         error_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
         error_msg = Messages.get_message(
             error_code, "retrieve", 'Table "{}.{}" does not exist.'.format(schema_name, table_name))
@@ -1535,7 +1534,8 @@ def retrieve_byom(model_id,
         license_table = in_schema(license_schema_name, license_table_name)
 
         # Check whether license table exists or not before proceed further.
-        if not conn.dialect.has_table(conn, table_name=license_table_name, schema=license_schema_name):
+        if not conn.dialect.has_table(conn, table_name=license_table_name, schema=license_schema_name,
+                                      table_only=True):
             error_code = MessageCodes.EXECUTION_FAILED
             error_msg = Messages.get_message(
                 error_code, "retrieve the model", 'Table "{}" does not exist.'.format(license_table))
@@ -1723,7 +1723,7 @@ def list_byom(table_name=None, schema_name=None, model_id=None):
 
     # Before proceeding further, check whether table exists or not.
     conn = get_connection()
-    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name):
+    if not conn.dialect.has_table(conn, table_name=table_name, schema=schema_name, table_only=True):
         error_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
         error_msg = Messages.get_message(
             error_code, "list", 'Table "{}.{}" does not exist.'.format(schema_name, table_name))
teradataml/clients/pkce_client.py

@@ -425,7 +425,7 @@ class _DAWorkflow:
         """
         device_cfg = requests.post(
             url=self.device_auth_end_point,
-            data={'client_id': self.__client_id})
+            data={'client_id': self.__client_id, 'scope': 'openid'})
 
         # Check the status. If response is not 200, raise error.
         _Validators._validate_http_response(device_cfg, 200, "get the device metadata")
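The one-line pkce_client.py change adds scope=openid to the device-authorization request, which OpenID Connect providers generally require before they will issue an ID token. A standalone sketch of the equivalent request; the endpoint URL and client id are placeholders:

import requests

device_cfg = requests.post(
    url="https://idp.example.com/oauth2/device_authorization",
    data={"client_id": "example-client", "scope": "openid"})
# The client validates for HTTP 200 before using the payload
# (device_code, user_code, verification_uri, ...).
device_cfg.raise_for_status()
print(device_cfg.json())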
teradataml/common/__init__.py

@@ -1 +1,2 @@
-from teradataml.common.formula import as_categorical
+from teradataml.common.formula import as_categorical
+from teradataml.common.constants import Action, Permission