teradataml 20.0.0.1-py3-none-any.whl → 20.0.0.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.
Files changed (240)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +306 -0
  4. teradataml/__init__.py +10 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +299 -16
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +13 -3
  11. teradataml/analytics/json_parser/utils.py +13 -6
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +11 -2
  15. teradataml/analytics/table_operator/__init__.py +4 -3
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +66 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1502 -323
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +247 -307
  22. teradataml/automl/data_transformation.py +32 -12
  23. teradataml/automl/feature_engineering.py +325 -86
  24. teradataml/automl/model_evaluation.py +44 -35
  25. teradataml/automl/model_training.py +122 -153
  26. teradataml/catalog/byom.py +8 -8
  27. teradataml/clients/pkce_client.py +1 -1
  28. teradataml/common/__init__.py +2 -1
  29. teradataml/common/constants.py +72 -0
  30. teradataml/common/deprecations.py +13 -7
  31. teradataml/common/garbagecollector.py +152 -120
  32. teradataml/common/messagecodes.py +11 -2
  33. teradataml/common/messages.py +4 -1
  34. teradataml/common/sqlbundle.py +26 -4
  35. teradataml/common/utils.py +225 -14
  36. teradataml/common/wrapper_utils.py +1 -1
  37. teradataml/context/context.py +82 -2
  38. teradataml/data/SQL_Fundamentals.pdf +0 -0
  39. teradataml/data/complaints_test_tokenized.csv +353 -0
  40. teradataml/data/complaints_tokens_model.csv +348 -0
  41. teradataml/data/covid_confirm_sd.csv +83 -0
  42. teradataml/data/dataframe_example.json +27 -1
  43. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  44. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  45. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  46. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  47. teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
  48. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  49. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  50. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  51. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  52. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  53. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  54. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  55. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  56. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  57. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  58. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  59. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  60. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  61. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  62. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  63. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  64. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  65. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  66. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  67. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  68. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  69. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  70. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  71. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  72. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  74. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  75. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  76. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  77. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  78. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  79. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  80. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  81. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  82. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  83. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  84. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  85. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  86. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  87. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  88. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  89. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  90. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  91. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  92. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  93. teradataml/data/dwt2d_dataTable.csv +65 -0
  94. teradataml/data/dwt_dataTable.csv +8 -0
  95. teradataml/data/dwt_filterTable.csv +3 -0
  96. teradataml/data/finance_data4.csv +13 -0
  97. teradataml/data/grocery_transaction.csv +19 -0
  98. teradataml/data/idwt2d_dataTable.csv +5 -0
  99. teradataml/data/idwt_dataTable.csv +8 -0
  100. teradataml/data/idwt_filterTable.csv +3 -0
  101. teradataml/data/interval_data.csv +5 -0
  102. teradataml/data/jsons/paired_functions.json +14 -0
  103. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  104. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  105. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  106. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  107. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  108. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  109. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  110. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  111. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  112. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  113. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  114. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  115. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  116. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  117. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  118. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  119. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  120. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  121. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  122. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  123. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  124. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  125. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  126. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  127. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  128. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  129. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  130. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  131. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  132. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  133. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  134. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  135. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  136. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  137. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  138. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  139. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  140. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  141. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  142. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  143. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  144. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  145. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  146. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  147. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  148. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  149. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  150. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  151. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  152. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  153. teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
  154. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  155. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  156. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  157. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  158. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  159. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
  160. teradataml/data/load_example_data.py +8 -2
  161. teradataml/data/medical_readings.csv +101 -0
  162. teradataml/data/naivebayestextclassifier_example.json +1 -1
  163. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  164. teradataml/data/patient_profile.csv +101 -0
  165. teradataml/data/peppers.png +0 -0
  166. teradataml/data/real_values.csv +14 -0
  167. teradataml/data/sax_example.json +8 -0
  168. teradataml/data/scripts/deploy_script.py +1 -1
  169. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  170. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  171. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  172. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  173. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
  174. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  175. teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
  176. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  177. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  178. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  179. teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
  180. teradataml/data/star_pivot.csv +8 -0
  181. teradataml/data/target_udt_data.csv +8 -0
  182. teradataml/data/templates/open_source_ml.json +3 -1
  183. teradataml/data/teradataml_example.json +20 -1
  184. teradataml/data/timestamp_data.csv +4 -0
  185. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  186. teradataml/data/uaf_example.json +55 -1
  187. teradataml/data/unpivot_example.json +15 -0
  188. teradataml/data/url_data.csv +9 -0
  189. teradataml/data/vectordistance_example.json +4 -0
  190. teradataml/data/windowdfft.csv +16 -0
  191. teradataml/dataframe/copy_to.py +1 -1
  192. teradataml/dataframe/data_transfer.py +5 -3
  193. teradataml/dataframe/dataframe.py +1002 -201
  194. teradataml/dataframe/fastload.py +3 -3
  195. teradataml/dataframe/functions.py +867 -0
  196. teradataml/dataframe/row.py +160 -0
  197. teradataml/dataframe/setop.py +2 -2
  198. teradataml/dataframe/sql.py +840 -33
  199. teradataml/dataframe/window.py +1 -1
  200. teradataml/dbutils/dbutils.py +878 -34
  201. teradataml/dbutils/filemgr.py +48 -1
  202. teradataml/geospatial/geodataframe.py +1 -1
  203. teradataml/geospatial/geodataframecolumn.py +1 -1
  204. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  205. teradataml/lib/aed_0_1.dll +0 -0
  206. teradataml/opensource/__init__.py +1 -1
  207. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  208. teradataml/opensource/_lightgbm.py +950 -0
  209. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  210. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  211. teradataml/opensource/sklearn/__init__.py +0 -1
  212. teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
  213. teradataml/options/__init__.py +9 -23
  214. teradataml/options/configure.py +42 -4
  215. teradataml/options/display.py +2 -2
  216. teradataml/plot/axis.py +4 -4
  217. teradataml/scriptmgmt/UserEnv.py +13 -9
  218. teradataml/scriptmgmt/lls_utils.py +77 -23
  219. teradataml/store/__init__.py +13 -0
  220. teradataml/store/feature_store/__init__.py +0 -0
  221. teradataml/store/feature_store/constants.py +291 -0
  222. teradataml/store/feature_store/feature_store.py +2223 -0
  223. teradataml/store/feature_store/models.py +1505 -0
  224. teradataml/store/vector_store/__init__.py +1586 -0
  225. teradataml/table_operators/Script.py +2 -2
  226. teradataml/table_operators/TableOperator.py +106 -20
  227. teradataml/table_operators/query_generator.py +3 -0
  228. teradataml/table_operators/table_operator_query_generator.py +3 -1
  229. teradataml/table_operators/table_operator_util.py +102 -56
  230. teradataml/table_operators/templates/dataframe_register.template +69 -0
  231. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  232. teradataml/telemetry_utils/__init__.py +0 -0
  233. teradataml/telemetry_utils/queryband.py +52 -0
  234. teradataml/utils/dtypes.py +4 -2
  235. teradataml/utils/validators.py +34 -2
  236. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
  237. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
  238. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  239. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  240. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
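The code hunks that follow appear to come from teradataml/opensource/sklearn/_sklearn_wrapper.py (entry 212 above). Entries 207-210 show the opensource wrapper modules moving up one level, out of the sklearn subpackage, and entry 208 adds a LightGBM backend alongside scikit-learn. A minimal sketch of what the relocation means for imports, using only paths that appear in the import hunk below:

    # 20.0.0.1 layout (removed in the hunk below):
    # from teradataml.opensource.sklearn._wrapper_utils import _validate_fit_run
    # from teradataml.opensource.sklearn.constants import OpenSourcePackage

    # 20.0.0.3 layout (added in the hunk below):
    from teradataml.opensource._wrapper_utils import _validate_fit_run
    from teradataml.opensource.constants import OpenSourcePackage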
@@ -19,7 +19,6 @@ from collections import OrderedDict, defaultdict
 from importlib import import_module
 
 import base64
-import functools
 import json
 import numpy
 import os
@@ -28,7 +27,7 @@ import time
 import inspect
 import warnings
 import json
-import random
+import math
 import pandas as pd
 from teradatasqlalchemy import BLOB, CLOB, FLOAT, TIMESTAMP, VARCHAR, INTEGER
 import pandas.api.types as pt
@@ -41,19 +40,18 @@ from teradataml.context.context import _get_current_databasename, get_connection
 from teradataml.dbutils.filemgr import install_file, remove_file
 from teradataml.utils.utils import execute_sql
 from teradataml.options.configure import configure
-from teradataml.opensource.sklearn._wrapper_utils import _validate_fit_run, _generate_new_name,\
+from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
     _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
-from teradataml.opensource.sklearn.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
+from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
     _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
     _OSML_ADDITIONAL_COLUMN_TYPES
 from teradataml.common.messagecodes import MessageCodes
 from teradataml.common.messages import Messages
 from teradataml.catalog.byom import save_byom, retrieve_byom, delete_byom
-from teradataml.dbutils.dbutils import _create_table
+from teradataml.dbutils.dbutils import _create_table, set_session_param
 from teradataml.utils.validators import _Validators
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
-from teradataml.scriptmgmt.lls_utils import create_env, get_env
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.constants import TeradataConstants
 
@@ -64,8 +62,15 @@ validator = _Validators()
 
 installed_model_files = defaultdict(int)
 
+## Flag to ensure the sklearn script
+## installation occurs only once.
+_file_installed = False
+
 class _GenericObjectWrapper:
     def __init__(self) -> None:
+        if not get_connection():
+            raise TeradataMlException(Messages.get_message(MessageCodes.INVALID_CONTEXT_CONNECTION),
+                                      MessageCodes.INVALID_CONTEXT_CONNECTION)
         self._db_name = _get_current_databasename()
 
         self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "sklearn")
@@ -86,43 +91,24 @@ class _GenericObjectWrapper:
             if configure.openml_user_env is not None:
                 self._env = configure.openml_user_env
             else:
-                self._create_or_get_env()
+                self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
         else:
-            execute_sql(f"SET SESSION SEARCHUIFDBPATH = {self._db_name};")
-
-    def _create_or_get_env(self):
-        """
-        Internal function to return the env if already exists else
-        creates the environment using template file and return the env.
-        """
-        # Get the template file path.
-        template_dir_path = os.path.join(_TDML_DIRECTORY, "data", "templates",
-                                         "open_source_ml.json")
+            set_session_param("searchuifdbpath",self._db_name)
 
-        # Read template file.
-        with open(template_dir_path, "r") as r_file:
-            data = json.load(r_file)
+        global _file_installed
+        ## Flag to check whether trained model is installed or not.
+        self._is_trained_model_installed = False
 
-        # Get env_name.
-        _env_name = data["env_specs"][0]["env_name"]
+        ## Install all sklearn script files on Vantage.
+        if not _file_installed:
+            sklearn_script_files = ["sklearn_fit.py", "sklearn_score.py",
+                                    "sklearn_transform.py", "sklearn_fit_predict.py",
+                                    "sklearn_neighbors.py", "sklearn_model_selection_split.py"]
+            for script_file in sklearn_script_files:
+                self._install_script_file(file_identifier=script_file.split(".")[0],
+                                          file_name=script_file)
 
-        try:
-            # Call function to 'openml_env' get env.
-            self._env = get_env(_env_name)
-        except TeradataMlException as tdml_e:
-            # We will get here when error says, env does not exist otherwise raise the exception as is.
-            # Env does not exist so create one.
-
-            exc_msg = "Failed to execute get_env(). User environment '{}' not " \
-                      "found.".format(_env_name)
-            if exc_msg in tdml_e.args[0]:
-                print(f"No OpenAF environment with name '{_env_name}' found. Creating one with "\
-                      "latest supported python and required packages.")
-                _env = create_env(template=template_dir_path)
-            else:
-                raise tdml_e
-        except Exception as exc:
-            raise exc
+            _file_installed = True
 
     def _get_columns_as_list(self, cols):
         """
@@ -205,13 +191,32 @@ class _GenericObjectWrapper:
                                            is_binary=is_binary)
         else:
             status = self._env.install_file(file_path=new_script,
-                                           replace=True,
-                                           suppress_output=True)
+                                            replace=True,
+                                            suppress_output=True)
         if not status:
             raise TeradataMlException(
                 f"Script file '{file_name}' failed to get installed/replaced in Vantage."
             )
 
+    def _remove_script_file(self, file_name):
+        """
+        Internal function to remove script file in Vantage.
+        """
+        # _env is set while object creation
+        # If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
+
+        if not self._is_lake_system:
+            status = remove_file(file_identifier=file_name.split(".")[0],
+                                 force_remove=True,
+                                 suppress_output=True)
+        else:
+            status = self._env.remove_file(file_name=file_name,
+                                           suppress_output=True)
+        if not status:
+            raise TeradataMlException(
+                f"Script file '{file_name}' failed to remove in Vantage."
+            )
+
     def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
                                                                 idx_delim=",",
                                                                 types_delim="--"):
@@ -261,7 +266,7 @@ class _GenericObjectWrapper:
             args_str += f" {strr}"
         return args_str
 
-    def extract_sklearn_obj(self, n_unique_partitions = 1, n_partition_cols = 1):
+    def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1):
         """
         Internal function to extract sklearn object from the model(s) depending on the number of
         partitions. When it is only one model, it is directly used as sklearn object (modelObj).
@@ -294,33 +299,130 @@
 
         warnings.filterwarnings("default")
 
+    def _validate_existence_of_partition_columns(self, partition_columns, all_columns, arg_names_for_dfs):
+        """
+        Validate if columns in "partition_columns" argument are present in any of the given
+        dataframes.
+        """
+        invalid_part_cols = [c for c in partition_columns if c not in all_columns]
 
-class _OpenSourceObjectWrapper(_GenericObjectWrapper):
-    # This has to be set for every package which subclasses this class.
-    OPENSOURCE_PACKAGE_NAME = None
+        if invalid_part_cols:
+            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
+                                                  ", ".join(invalid_part_cols),
+                                                  "', '".join(arg_names_for_dfs))
+                             )
 
-    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
-        if not model and not module_name and not class_name:
-            raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
-                                                           "module_name and class_name"),
-                                      MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)
+    def _prepare_data_args_string(self, kwargs):
+        """
+        Get column indices and types of each data related arguments in the format:
+        "{<arg_name>-<comma separated indices>-<comma separated types>}--
+         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        """
+        data_args_str = []
+        for arg_name in list(self._data_args.keys()):
+            # Remove DataFrame arguments from kwargs, which will be passed to Script.
+            kwargs.pop(arg_name)
 
-        validator._validate_mutually_inclusive_arguments(module_name, "module_name",
-                                                         class_name, "class_name")
+            # Get column indices and their types for each dataframe from parent dataframe.
+            _, partition_indices_str, partition_types_str, _ = \
+                self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                             self._data_args[arg_name].columns,
+                                                                             idx_delim=",",
+                                                                             types_delim=",")
+
+            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
+            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
+
+        # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
+        #         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        return "--".join(data_args_str)
 
-        super().__init__()
+    def _prepare_and_install_file(self, replace_dict):
+        """
+        Prepare function script file from template file and install it in Vantage.
+        Takes the dictionary with keys as strings to be replaced in script and values as
+        strings which should be added in place of keys.
+        """
 
-        self.module_name = module_name
-        self.class_name = class_name
-        self.kwargs = kwargs if kwargs is not None else {}
-        self.pos_args = pos_args if pos_args is not None else tuple()
+        with open(os.path.join(self._scripts_path, self._template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)
 
-        self._fit_label_columns_types = None
-        self._table_name_prefix = None
+        self._script_file_local = os.path.join(self._tdml_tmp_dir, self._script_file_name)
 
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-        self._fit_partition_colums_non_default = None
-        self._is_default_partition_value_predict = True  # False when the user provides partition columns.
+        with open(self._script_file_local, "w") as fp:
+            fp.write(script_data)
+
+        self._install_script_file(file_identifier=self._script_file_name.split(".")[0],
+                                  file_name=self._script_file_name,
+                                  file_location=self._tdml_tmp_dir)
+
+    def _get_dataframe_related_args_and_their_columns(self, kwargs):
+        """
+        Get dataframe related arguments and return all their column names from kwargs.
+        """
+        __data_columns = []
+        __data_args_dict = OrderedDict()
+
+        # Separate dataframe related arguments and their column names from actual kwargs.
+        for k, v in kwargs.items():
+            if isinstance(v, DataFrame):
+                # All dataframes should be select of parent dataframe.
+                _validate_df_query_type(v, "select", k)
+
+                # Save all columns in dataframe related arguments.
+                __data_columns.extend(v.columns)
+
+                __data_args_dict[k] = v
+
+        return __data_args_dict, __data_columns
+
+    def _process_data_for_funcs_returning_objects(self, kwargs):
+        """
+        Internal function to process all arguments and assign self._data_args, self._tdml_df
+        and return
+        1. dictionary of elements (needed to replace in the script template file)
+        2. partition columns list.
+        """
+        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
+        if partition_cols:
+            kwargs.pop("partition_columns")
+
+        self._data_args, __data_columns = self._get_dataframe_related_args_and_their_columns(kwargs)
+
+        arg_names_for_dfs = list(self._data_args.keys())
+
+        # Get common parent dataframe from all dataframes.
+        self._tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self._data_args.values()))
+
+        self._tdml_df = self._tdml_df.select(__data_columns + partition_cols)
+
+        self._validate_existence_of_partition_columns(partition_cols, self._tdml_df.columns, arg_names_for_dfs)
+
+        self._tdml_df, partition_cols = self._get_data_and_data_partition_columns(self._tdml_df,
+                                                                                  __data_columns,
+                                                                                  [],
+                                                                                  partition_cols
+                                                                                  )
+
+        # Prepare string of data arguments with name, indices where columns of that argument resides
+        # and types of each of the column.
+        data_args_str = self._prepare_data_args_string(kwargs)
+
+        # Get indices of partition_columns and types of all columns.
+        data_column_types_str, partition_indices_str, _, partition_cols = \
+            self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                         partition_cols,
+                                                                         types_delim=None,
+                                                                         idx_delim=None)
+
+        replace_dict = {"<partition_cols_indices>": str(partition_indices_str),
+                        "<types_of_data_cols>": str(data_column_types_str),
+                        "<data_args_info_str>": f"'{data_args_str}'"}
+
+        return replace_dict, partition_cols
 
     def _validate_equality_of_partition_values(self, fit_values, trans_values):
         """
@@ -335,294 +437,139 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
         return True
 
-    def _validate_unique_partition_values(self, data, partition_columns):
+    def _get_non_data_related_args_from_kwargs(self, kwargs):
         """
-        Internal function to validate if the partition values in partition_columns used in fit()
-        and predict() are same.
+        Get all non-data related arguments from kwargs.
         """
-        data._index_label = None
-        unique_values = data.drop_duplicate(partition_columns).get_values()
-
-        trans_unique_values = sorted(unique_values.tolist(), key=lambda x: tuple(x))
-        fit_unique_values = sorted(self._fit_partition_unique_values.tolist() \
-                                   if not isinstance(self._fit_partition_unique_values, list) \
-                                   else self._fit_partition_unique_values, key=lambda x: tuple(x))
-        default_unique_values = [[self._default_data_partition_value]]
-
-        if fit_unique_values == default_unique_values and \
-                trans_unique_values != default_unique_values:
-            error_msg = Messages.get_message(MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT,
-                                             "without", "with")
-            msg_code = MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT
-            raise TeradataMlException(error_msg, msg_code)
+        non_data_related_args = {}
+        for k, v in kwargs.items():
+            if not isinstance(v, DataFrame):
+                non_data_related_args[k] = v
+        non_data_related_args.pop("partition_columns", None)
+        return non_data_related_args
 
-        if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
-            raise TeradataMlException(
-                Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING),
-                MessageCodes.PARTITION_VALUES_NOT_MATCHING
-            )
+    def _read_from_template_and_write_dict_to_file(self, template_file, replace_dict,
+                                                   output_script_file_name=None):
+        """
+        Read template file, replace the keys with values and write to new file.
+        """
+        with open(os.path.join(self._scripts_path, template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)
 
-    def fit(self, **kwargs):
-        pass
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as fp:
+            fp.write(script_data)
 
-    def __get_obj_attributes_multi_model(self, name):
+    def _generate_script_file_from_template_file(self, kwargs, template_file, func_name,
+                                                 output_script_file_name=None):
         """
-        Internal function to get attributes of all sklearn model objects when multiple models are
-        generated by fit.
+        Internal function to generate script file from template file. It just adds the non-data
+        related arguments to the template file and writes the contents to new file, so that these
+        arguments are available in the script file for running this function "func_name".
         """
-        # Wrapper function to invoke dynamic method, using arguments
-        # passed by user, on model in each row.
-        def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
-            multi_models = self.modelObj.copy()
-            for i in range(multi_models.shape[0]):
-                curr_model = multi_models.iloc[i]["model"]
-                multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
-            return multi_models.rename(columns={"model": name})
+        # Take out all non-data related arguments to write to template file.
+        non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)
 
-        # Identify if attribute is callable or not to avoid
-        # this check in loop for every model.
-        is_attr_callable = False
-        # Assuming that self.modelObj will have at least 1 row.
-        is_attr_callable = callable(getattr(self.modelObj.iloc[0]["model"], name))
+        # Read template file and write the contents to new file with non-data related arguments.
+        template_f = os.path.join(self._scripts_path, template_file)
+        with open(template_f, "r") as f:
+            template = f.read()
 
-        # If attribute is callable, it should be applied on model in each row
-        # using passed arguments.
-        if is_attr_callable:
-            return __sklearn_method_invoker_for_multimodel
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as f:
+            f.write("import json\n")
+            f.write(f"params = json.loads('{json.dumps(non_data_related_args)}')\n")
+            f.write(template)
 
-        output_attributes = self.modelObj.copy()
-        for i in range(output_attributes.shape[0]):
-            model = output_attributes.iloc[i]["model"]
-            output_attributes.at[i, "model"] = getattr(model, name)
-        return output_attributes.rename(columns={"model": name})
+        kwargs["file_name"] = output_script_file_name
+        kwargs["name"] = func_name
 
-    def __getattr__(self, name):
-        # This just run attributes (functions and properties) from sklearn object.
-        def __sklearn_method_invoker(*c, **kwargs):
-            return atrribute_instance(*c, **kwargs)
-        if isinstance(self.modelObj, pd.DataFrame):
-            return self.__get_obj_attributes_multi_model(name)
+    def _remove_data_related_args_from_kwargs(self, kwargs):
+        """
+        Internal function to remove data related arguments from kwargs.
+        """
+        kwargs.pop("data", None)
+        kwargs.pop("feature_columns", None)
+        kwargs.pop("group_columns", None)
+        kwargs.pop("partition_columns", None)
+        kwargs.pop("label_columns", None)
 
-        atrribute_instance = getattr(self.modelObj, name)
-        if callable(atrribute_instance):
-            return __sklearn_method_invoker
-        return atrribute_instance
+    def _convert_pos_args_to_kwargs_for_function(self, pos_args, kwargs, func_name):
+        """
+        Internal function to convert positional arguments to keyword arguments.
+        """
+        fn = getattr(getattr(import_module(self.module_name), self.class_name), func_name)
+        kwargs.update(zip(fn.__code__.co_varnames[1:], pos_args))
 
-    @classmethod
-    def _validate_model_supportability(cls, model):
+    def _install_model_and_script_files(self, file_name, file_location):
         """
-        Internal function to validate if the model provided for deployment is supported by
-        teradataml's opensourceML.
+        Internal function to install model and script files to Vantage.
         """
-        error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
-                                         "The given model is not a supported opensource model.")
-        msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
-        try:
-            # For scikit-learn, model.__module__ is similar to 'sklearn.linear_model._base'.
-            # TODO: check for other supported packages.
-            if model.__module__.split(".")[0] not in OpenSourcePackage.values():
-                raise TeradataMlException(error_msg, msg_code)
-        except Exception as ex:
-            # If in case, model.__module__ fails.
-            raise TeradataMlException(error_msg, msg_code) from ex
+        self._install_initial_model_file()
+        self._install_script_file(file_identifier=file_name.split(".")[0],
+                                  file_name=file_name,
+                                  is_binary=False,
+                                  file_location=file_location)
 
-    def _save_model(self, model_name, replace_if_exists=False):
+    def _assign_fit_variables_after_execution(self, data, partition_columns, label_columns):
         """
-        Internal function to save the model stored in file at location mentioned by class variable
-        "model_file_path_local" to Vantage using BYOM methods save_byom() and delete_byom() based
-        on the value of "replace_if_exists" argument.
+        Internal function to assign fit related variables.
         """
-        # Creating a table, if doesn't exist, in Vantage to store the model info.
-        conn = get_connection()
-        osml_models_table_exists = conn.dialect.has_table(conn,
-                                                          table_name=_OSML_MODELS_TABLE_NAME,
-                                                          schema=self._db_name)
-        if not osml_models_table_exists:
-            all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
-            all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
-            _create_table(table_name=_OSML_MODELS_TABLE_NAME, columns=all_columns,
-                          primary_index=_OSML_MODELS_PRIMARY_INDEX, schema_name=self._db_name)
+        # Extract sklearn object(s) from the depending on the number of unique partitioning values.
+        self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
+                                 n_partition_cols=len(partition_columns))
 
-        model_obj = OpensourceModels(is_default_partition_value=self._is_default_partition_value_fit,
-                                     partition_file_prefix=self._model_file_name_prefix,
-                                     fit_partition_columns_non_default=self._fit_partition_colums_non_default,
-                                     model=self.modelObj,
-                                     pos_args=self.pos_args,
-                                     key_args=self.kwargs)
+        # Need this label columns types in prediction.
+        self._fit_label_columns_types = []
+        self._fit_label_columns_python_types = []
 
-        # Saved the model object to a file to be used in save_byom() for writing to Vantage table.
-        file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
-        with open(file_name, "wb+") as fp:
-            fp.write(pickle.dumps(model_obj))
+        for l_c in label_columns:
+            column_data = data._td_column_names_and_sqlalchemy_types[l_c.lower()]
+            self._fit_label_columns_types.append(column_data)
+            self._fit_label_columns_python_types.append(column_data.python_type.__name__)
 
-        try:
-            save_byom(model_id=model_name,
-                      model_file=file_name,
-                      table_name=_OSML_MODELS_TABLE_NAME,
-                      additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
-                      additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
-        except TeradataMlException as ex:
-            model_exists_msg = Messages.get_message(MessageCodes.MODEL_ALREADY_EXISTS, model_name)
-            if not replace_if_exists and model_exists_msg == str(ex):
-                raise
-            elif replace_if_exists and model_exists_msg == str(ex):
-                # Delete the model from Model table and save again.
-                delete_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME)
-                save_byom(model_id=model_name,
-                          model_file=file_name,
-                          table_name=_OSML_MODELS_TABLE_NAME,
-                          additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
-                          additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
-            else:
-                raise
-        finally:
-            os.remove(file_name)
+        # If the model is trained a second time after the object creation,
+        # or if set_params() is called after the first model training,
+        # this flag will reset to False. So that for subsequent predict/score
+        # operations, the newly trained model will be installed.
+        if self._is_trained_model_installed:
+            self._is_trained_model_installed = False
 
-    @classmethod
-    def _deploy(cls, model_name, model, replace_if_exists=False):
-        """
-        Internal function to create an instance of the class using the model and deploy
-        the model to Vantage.
-        """
-        cls._validate_model_supportability(model=model)
 
-        cls = cls(model=model)
-        # Load the model file into Vantage node as file can be used in
-        # predict or other operations.
-        cls._install_initial_model_file()
+class _OpenSourceObjectWrapper(_GenericObjectWrapper):
+    # This has to be set for every package which subclasses this class.
+    OPENSOURCE_PACKAGE_NAME = None
 
-        cls._save_model(model_name, replace_if_exists)
-
-        return cls
-
-    @classmethod
-    def _load(cls, model_name):
-        """
-        Internal function to load model corresponding to the package (like sklearn etc)
-        from Vantage to client using retrieve_byom() and create an instance of the class if
-        the model is from the same package.
-        """
-        try:
-            model = retrieve_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME,
-                                  return_addition_columns=True)
-        except TeradataMlException as ex:
-            # Not showing table name in error message as it is an internal table.
-            part_msg = f"Model '{model_name}' not found in the table "
-            if part_msg in str(ex):
-                raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name, ""),
-                                          MessageCodes.MODEL_NOT_FOUND)
-            raise
-
-        model_vals_list = model.get_values()[0]
-        # List of 3 elements -
-        # - model name as index column,
-        # - 1st contains model object with fields: is_default_partition_value, partition_file_prefix, model. etc
-        # - 2nd contains package name.
-        model_obj = pickle.loads(model_vals_list[0])
-        model = model_obj.model
-        package = model_vals_list[1]
-
-        if package != cls.OPENSOURCE_PACKAGE_NAME.value:
-            # Raise error if trying to access model of different package.
-            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
-                                                           f". Requested model is from '{package}' package"),
-                                      MessageCodes.MODEL_NOT_FOUND)
-
-        if isinstance(model, pd.DataFrame):
-            # Create a new instance of the class and set the model object to the instance.
-            # Instantiation can take only model, not model object. Hence, passing one of the model
-            # from pandas df. Updating modelObj and other fields later
-            cls = cls(model=model.iloc[1,2])
-            cls.modelObj = model
-            cls._fit_partition_unique_values = [lst[:len(lst)-1] for lst in model.values.tolist()]
-        else:
-            cls = cls(model=model)
-
-        cls._model_file_name_prefix = model_obj.partition_file_prefix
-        cls._is_default_partition_value_fit = model_obj.is_default_partition_value
-        cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
-        cls.pos_args = model_obj.pos_args
-        cls.kwargs = model_obj.key_args
-
-        # Load the model file into Vantage node as file can be used in
-        # predict or other operations.
-        cls._install_initial_model_file()
-
-        return cls
-
-    def deploy(self, model_name, replace_if_exists=False):
-        """
-        DESCRIPTION:
-            Deploys the model held by interface object to Vantage.
-
-        PARAMETERS:
-            model_name:
-                Required Argument.
-                Specifies the unique name of the model to be deployed.
-                Types: str
-
-            replace_if_exists:
-                Optional Argument.
-                Specifies whether to replace the model if a model with the same name already
-                exists in Vantage. If this argument is set to False and a model with the same
-                name already exists, then the function raises an exception.
-                Default Value: False
-                Types: bool
-
-        RETURNS:
-            The opensource object wrapper.
-
-        RAISES:
-            TeradataMLException if model with "model_name" already exists and the argument
-            "replace_if_exists" is set to False.
-
-        EXAMPLES:
-            >>> from teradataml import td_sklearn
-            >>> model = td_sklearn.LinearRegression(normalize=True)
-            >>> model
-            LinearRegression(normalize=True)
-
-            # Example 1: Deploy the model held by interface object to Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2")
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-
-            # Example 2: Deploy the model held by interface object to Vantage with the name same
-            # as that of model that already existed in Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
-            Model is deleted.
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-        """
-
-        # Install model file into Vantage, if not installed.
-        self._install_initial_model_file()
-
-        self._save_model(model_name, replace_if_exists)
-        return self
+    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+        if model is None and not module_name and not class_name:
+            raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
+                                                           "module_name and class_name"),
+                                      MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)
 
+        validator._validate_mutually_inclusive_arguments(module_name, "module_name",
+                                                         class_name, "class_name")
 
-class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
+        super().__init__()
 
-    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+        self.module_name = module_name
+        self.class_name = class_name
+        self.kwargs = kwargs if kwargs is not None else {}
+        self.pos_args = pos_args if pos_args is not None else tuple()
 
-    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
-        super().__init__(model=model, module_name=module_name, class_name=class_name,
-                         pos_args=pos_args, kwargs=kwargs)
+        self._fit_label_columns_types = None
+        self._fit_label_columns_python_types = None
+        self._table_name_prefix = None
 
-        self._initialize_variables()
-        if model:
-            self.modelObj = model
-            self.module_name = model.__module__.split("._")[0]
-            self.class_name = model.__class__.__name__
-            # __dict__ gets all the arguments as dictionary including default ones and positional
-            # args.
-            self.kwargs = model.__dict__
-            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
-        else:
-            self._initialize_object()
+        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
+        self._fit_partition_colums_non_default = None
+        self._is_default_partition_value_predict = True  # False when the user provides partition columns.
 
     def __repr__(self):
         if self._is_default_partition_value_fit:
@@ -636,19 +583,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         pd.reset_option("display.max_colwidth")
         return opt
 
-    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
-                                    skip_either_or_that=False):
-        """
-        Internal function to validate arguments passed to exposed opensource APIs and return
-        parent DataFrame, feature columns, label columns, group columns, data partition columns.
-        """
-        _validate_opensource_func_args(X=X, y=y, groups=groups,
-                                       fit_partition_cols=self._fit_partition_colums_non_default,
-                                       kwargs=kwargs,
-                                       skip_either_or_that=skip_either_or_that)
-        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
-                                               fit_partition_cols=self._fit_partition_colums_non_default)
-
     def _initialize_object(self):
         """
         Internal function to initialize sklearn object from module name and class name.
@@ -657,6 +591,13 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         imported_args = {}
         # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
        # corresponding sklearn object.
+        _partition_column_names = None
+        if "partition_columns" in self.kwargs:
+            self._fit_partition_colums_non_default = self.kwargs["partition_columns"]
+            self._is_default_partition_value_fit = False
+            _partition_column_names = self._fit_partition_colums_non_default
+
+
         new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
         new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
 
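Note on the hunk above: _initialize_object now recognizes a partition_columns entry in self.kwargs and switches the wrapper into non-default, one-model-per-partition mode; modify_args (see the hunk near the end of this diff) filters that key out before the remaining kwargs reach the underlying estimator. A hypothetical sketch of what this permits (estimator and column name are invented for illustration):

    from teradataml import td_sklearn

    # partition_columns is consumed by the wrapper, not passed to scikit-learn.
    dt = td_sklearn.DecisionTreeClassifier(max_depth=3, partition_columns=["group_id"])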
@@ -681,19 +622,33 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                 # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
                 # are not supported yet due to pickling issue.
                 continue
-            if k in self.get_params():
-                self.kwargs[k] = v
+            if self.get_params():
+                if k in self.get_params():
+                    self.kwargs[k] = v
+                else:
+                    _model_init_arguments = None
+                    try:
+                        _model_init_arguments = self.modelObj.__init__.__code__.co_varnames
+                    except AttributeError:
+                        pass
+                    if _model_init_arguments:
+                        self.kwargs = dict((k, v) for k, v in _arguments.items() if k in _model_init_arguments)
+                    else:
+                        self.kwargs = _arguments
             else:
                 # Model selection classes will not have `get_params`, in which case modelObj's __dict__
                 # is saved as kwargs.
                 self.kwargs = _arguments
 
-    def _initialize_variables(self):
+        if _partition_column_names:
+            self.kwargs["partition_columns"] = _partition_column_names
+
+    def _initialize_variables(self, table_name_prefix):
         """
         Internal function to initialize variables used in this class.
         """
         self.feature_names_in_ = None
-        self._table_name_prefix = "td_sklearn_"
+        self._table_name_prefix = table_name_prefix
         self._model_file_name_prefix = _generate_new_name(type="file")
         self.model_file_paths_local = set()
 
@@ -710,6 +665,20 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._is_model_installed = False
         self._fit_partition_unique_values = [[self._default_data_partition_value]]
 
+    def _get_returning_df(self, script_df, partition_column, returns):
+        """
+        Internal function to return the teradataml Dataframe except
+        partition_column.
+        """
+        if self._is_default_partition_value_fit:
+            # For single model case, partition column is internally generated
+            # and no point in returning it to the user.
+
+            # Extract columns from return types.
+            returning_cols = [col[0] for col in returns[len(partition_column):]]
+            return script_df.select(returning_cols)
+        return script_df
+
     def modify_args(self, fp1, arg, imported_args):
         """
         Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
@@ -752,61 +721,480 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
752
721
  self.modify_args(fp1, k, imported_args),
753
722
  self.modify_args(fp1, v, imported_args),
754
723
  )
755
- for k, v in arg.items()
724
+ for k, v in arg.items() if k != "partition_columns"
756
725
  )
726
+ # elif arg == "partition_columns":
727
+
757
728
  else:
758
729
  return arg
759
730
 
760
- def _install_initial_model_file(self):
761
- """
762
- If model file(s) is/are not installed in Vantage, then install it/them.
731
+ def _install_initial_model_file(self, use_dummy_initial_file=False):
732
+ """
733
+ If model file(s) is/are not installed in Vantage, then install it/them.
734
+ """
735
+ if isinstance(self.modelObj, pd.DataFrame):
736
+ # Get list of unique partition values and corresponding model object as dict.
737
+ partition_values_model_dict = {}
738
+ obj_list = self.modelObj.values.tolist()
739
+ for lst in obj_list:
740
+ partition_values_model_dict[tuple(lst[:len(self._fit_partition_colums_non_default)])] = \
741
+ lst[len(self._fit_partition_colums_non_default)]
742
+
743
+ for partition in self._fit_partition_unique_values:
744
+ # Create a new file with file name with partition values and
745
+ # dump sklearn object into it. Finally install the file to Vantage.
746
+ partition_join = "_".join([str(x) for x in partition])
747
+ file_name = f"{self._model_file_name_prefix}_{partition_join}"
748
+ # Replace '-' with '_' as '-' can't be present in file identifier.
749
+ # Needed this replace because partition_columns can be negative.
750
+ file_name = file_name.replace("-", "_")
751
+ full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
752
+ with open(full_file_name, "wb+") as fp:
753
+ # Write sklearn object to file.
754
+ if isinstance(self.modelObj, pd.DataFrame):
755
+ # If multiple models, then write the model corresponding to the partition value.
756
+ fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
757
+ else:
758
+ if use_dummy_initial_file:
759
+ fp.write(pickle.dumps("abc"))
760
+ else:
761
+ fp.write(pickle.dumps(self.modelObj))
762
+ self.model_file_paths_local.add(file_name)
763
+
764
+ self._install_script_file(file_identifier=file_name,
765
+ file_name=file_name,
766
+ is_binary=True,
767
+ file_location=self._tdml_tmp_dir)
768
+
769
+ if self._is_lake_system:
770
+ # Need to pass env_name along with file_name for cleaning up the files in env.
771
+ obj = f"{self._env.env_name}::{file_name}"
772
+ if installed_model_files[obj] == 0:
773
+ # Add to GC for the first time the model file (along with env name) is encountered.
774
+ installed_model_files[obj] = 1
775
+ GarbageCollector._add_to_garbagecollector(object_name=obj,
776
+ object_type=TeradataConstants.TERADATA_APPLY)
777
+ else:
778
+ if installed_model_files[file_name] == 0:
779
+ # Add to GC for the first time the model file is encountered.
780
+ installed_model_files[file_name] = 1
781
+ GarbageCollector._add_to_garbagecollector(object_name=file_name,
782
+ object_type=TeradataConstants.TERADATA_SCRIPT)
783
+
784
+ self._is_model_installed = True
785
+
786
+ def _validate_unique_partition_values(self, data, partition_columns):
787
+ """
788
+ Internal function to validate if the partition values in partition_columns used in fit()
789
+ and predict() are same.
790
+ """
791
+ data._index_label = None
792
+ unique_values = data.drop_duplicate(partition_columns).get_values()
793
+
794
+ trans_unique_values = sorted(unique_values.tolist(), key=lambda x: tuple(x))
795
+ fit_unique_values = sorted(self._fit_partition_unique_values.tolist() \
796
+ if not isinstance(self._fit_partition_unique_values, list) \
797
+ else self._fit_partition_unique_values, key=lambda x: tuple(x))
798
+ default_unique_values = [[self._default_data_partition_value]]
799
+
800
+ if fit_unique_values == default_unique_values and \
801
+ trans_unique_values != default_unique_values:
802
+ error_msg = Messages.get_message(MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT,
803
+ "without", "with")
804
+ msg_code = MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT
805
+ raise TeradataMlException(error_msg, msg_code)
806
+
807
+ if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
808
+ raise TeradataMlException(
809
+ Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING, "training", "test"),
810
+ MessageCodes.PARTITION_VALUES_NOT_MATCHING
811
+ )
812
+
813
+ def fit(self, **kwargs):
814
+ pass
815
+
816
+ def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
817
+ """
818
+ Internal function to convert all OpensourceML related objects in arguments to
819
+ underlying model objects.
820
+ """
821
+ if isinstance(args, dict):
822
+ new_args = args.copy() # To avoid updating
823
+ for k, v in new_args.items():
824
+ if isinstance(v, type(self)):
825
+ if idx_multi_model is not None:
826
+ # single model. This argument is set only when modelObj is single model.
827
+ new_args[k] = v.modelObj
828
+ else:
829
+ # multi-model. Get appropriate model from modelObj.
830
+ new_args[k] = v.modelObj.iloc[idx_multi_model]["model"]
831
+ else:
832
+ new_args[k] = v
833
+ return new_args
834
+
835
+ # If args is tuple, convert all elements to underlying model object.
836
+ elif isinstance(args, tuple):
837
+ new_args = tuple()
838
+ for arg in args:
839
+ if isinstance(arg, type(self)):
840
+ if idx_multi_model is None:
841
+ # single model. This argument is set only when modelObj is single model.
842
+ new_args += (arg.modelObj,)
843
+ else:
844
+ # multi-model. Get appropriate model from modelObj.
845
+ new_args += (arg.modelObj.iloc[idx_multi_model]["model"],)
846
+ else:
847
+ new_args += (arg,)
848
+ return new_args
849
+ return args
850
+
851
+ def __get_obj_attributes_multi_model(self, name):
+ """
+ Internal function to get attributes of all sklearn model objects when multiple models are
+ generated by fit.
+ """
+
+ def __generate_model_object(model_obj_value, init_model_obj):
+ """
+ Internal function to generate a _SkLearnObjectWrapper model object from model_obj_value.
+ """
+ # Create _SkLearnObjectWrapper object from opensource model object.
+ model_obj = self.__class__(model=init_model_obj)
+
+ model_obj.modelObj = model_obj_value
+ model_obj._is_model_installed = True
+
+ # Setting other model attributes.
+ model_obj._is_default_partition_value_fit = self._is_default_partition_value_fit
+ model_obj._is_default_partition_value_predict = self._is_default_partition_value_predict
+ model_obj._fit_partition_colums_non_default = self._fit_partition_colums_non_default
+ model_obj._fit_partition_unique_values = self._fit_partition_unique_values
+ return model_obj
+
+ # Wrapper function to invoke the dynamic method, using the arguments
+ # passed by the user, on the model in each row.
+ def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
+ multi_models = self.modelObj.copy()
+ for i in range(multi_models.shape[0]):
+ curr_model = multi_models.iloc[i]["model"]
+ partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
+ partition_values = "_".join([str(x) for x in partition_values])
+ if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
+ # filename is the first argument.
+ kwargs1 = kwargs.copy()
+ c1 = c
+
+ if len(c) > 0:
+ c1 = list(c1)
+ c1[0] = f"{c1[0]}_{partition_values}"
+ c1 = tuple(c1)
+ if len(kwargs) > 0 and kwargs.get("filename", None):
+ kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values}"
+
+ multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c1, i),
+ **self._convert_arguments_to_modelObj(kwargs1, i))
+ else:
+ multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c, i),
+ **self._convert_arguments_to_modelObj(kwargs, i))
+
+ first_function_value = multi_models.at[0, "model"]
+ if self.__class__._validate_model_supportability(first_function_value):
+ return __generate_model_object(multi_models, init_model_obj=first_function_value)
+
+ multi_models = multi_models.rename(columns={"model": name})
+
+ # Select only partition columns and the attribute column.
+ return multi_models[self._fit_partition_colums_non_default + [name]]
+
+ # Assuming that self.modelObj will have at least 1 row.
+
+ # Get the attribute instance from the first model object.
+ first_attribute_instance = getattr(self.modelObj.iloc[0]["model"], name)
+
+ # If first_attribute_instance is callable, it should be applied on the model in each row
+ # using the passed arguments.
+ if callable(first_attribute_instance):
+ return __sklearn_method_invoker_for_multimodel
+
+ output_attributes = self.modelObj.copy()
+ for i in range(output_attributes.shape[0]):
+ model = output_attributes.iloc[i]["model"]
+ output_attributes.at[i, "model"] = getattr(model, name)
+
+ if self.__class__._validate_model_supportability(first_attribute_instance):
+ return __generate_model_object(output_attributes, init_model_obj=first_attribute_instance)
+
+ return output_attributes.rename(columns={"model": name})
+
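# Illustrative sketch (assumed output shape): for a multi-model fit partitioned on
# "gender", a non-callable attribute is collected per partition, e.g.:
#   model.coef_
#   #   gender               coef_
#   # 0      F  [[0.42, -0.11, ...]]
#   # 1      M  [[0.37, -0.09, ...]]
# A callable attribute instead returns __sklearn_method_invoker_for_multimodel, which
# applies the method to each row's model with the user's arguments.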
+ def __getattr__(self, name):
+ # This just runs attributes (functions and properties) from opensource (sklearn/lightgbm) objects.
+ def __sklearn_method_invoker(*c, **kwargs):
+ # Opensource model is returned from the function call. Create _OpensourceObjectWrapper object.
+ model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c), **self._convert_arguments_to_modelObj(kwargs))
+ if self.__class__._validate_model_supportability(model_obj):
+ model_obj = self.__class__(model=model_obj)
+ model_obj._is_model_installed = True # Trained model is returned by the function call.
+ return model_obj
+
+ if isinstance(self.modelObj, pd.DataFrame):
+ return self.__get_obj_attributes_multi_model(name)
+
+ attribute_instance = getattr(self.modelObj, name)
+
+ if callable(attribute_instance):
+ return __sklearn_method_invoker
+
+ if self.__class__._validate_model_supportability(attribute_instance):
+ # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
+ model_obj = self.__class__(model=attribute_instance)
+ model_obj._is_model_installed = True # Trained model is returned as an attribute.
+ return model_obj
+
+ return attribute_instance
+
+ @classmethod
+ def _validate_model_supportability(cls, model):
+ """
+ Internal function to validate if the model provided for deployment is supported by
+ teradataml's opensourceML.
+ """
+ error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
+ "The given model is not a supported opensource model.")
+ msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
+ package_name = None
+ class_name = None
+ try:
+ # For scikit-learn, model.__module__ looks like 'sklearn.linear_model._base'.
+ # TODO: check for other supported packages.
+ if hasattr(model, "__module__"):
+ package_name = model.__module__.split(".")[0]
+ if package_name not in OpenSourcePackage.values():
+ return False
+ if hasattr(model, "__class__"):
+ class_name = model.__class__.__name__
+ except Exception as ex:
+ # In case access to model.__module__ fails.
+ raise TeradataMlException(error_msg, msg_code) from ex
+
+ # True only if the package name is the opensource package name and the class name is not an internal class.
+ return True if package_name and class_name and \
+ package_name == cls.OPENSOURCE_PACKAGE_NAME.value and not class_name.startswith("_") else False
+
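# Illustrative sketch (assumed behaviour): only public classes of the wrapper's own
# package pass this check, e.g. for the sklearn wrapper:
#   cls._validate_model_supportability(LinearRegression())   # True
#   cls._validate_model_supportability(lightgbm_booster)     # False - different package
#   cls._validate_model_supportability(_SomePrivateClass())  # False - leading "_"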
+ def _save_model(self, model_name, replace_if_exists=False):
+ """
+ Internal function to save the model stored in the file at the location mentioned by class variable
+ "model_file_path_local" to Vantage using BYOM methods save_byom() and delete_byom(), based
+ on the value of the "replace_if_exists" argument.
+ """
+ # Create a table, if it doesn't exist, in Vantage to store the model info.
+ conn = get_connection()
+ osml_models_table_exists = conn.dialect.has_table(conn,
+ table_name=_OSML_MODELS_TABLE_NAME,
+ schema=self._db_name,
+ table_only=True)
+ if not osml_models_table_exists:
+ all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
+ all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
+ _create_table(table_name=_OSML_MODELS_TABLE_NAME, columns=all_columns,
+ primary_index=_OSML_MODELS_PRIMARY_INDEX, schema_name=self._db_name)
+
+ model_obj = OpensourceModels(is_default_partition_value=self._is_default_partition_value_fit,
+ partition_file_prefix=self._model_file_name_prefix,
+ fit_partition_columns_non_default=self._fit_partition_colums_non_default,
+ model=self.modelObj,
+ pos_args=self.pos_args,
+ key_args=self.kwargs)
+
+ # Save the model object to a file to be used by save_byom() for writing to the Vantage table.
+ file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
+ with open(file_name, "wb+") as fp:
+ fp.write(pickle.dumps(model_obj))
+
+ try:
+ save_byom(model_id=model_name,
+ model_file=file_name,
+ table_name=_OSML_MODELS_TABLE_NAME,
+ additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
+ additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
+ except TeradataMlException as ex:
+ model_exists_msg = Messages.get_message(MessageCodes.MODEL_ALREADY_EXISTS, model_name)
+ if not replace_if_exists and model_exists_msg == str(ex):
+ raise
+ elif replace_if_exists and model_exists_msg == str(ex):
+ # Delete the model from the models table and save again.
+ delete_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME)
+ save_byom(model_id=model_name,
+ model_file=file_name,
+ table_name=_OSML_MODELS_TABLE_NAME,
+ additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
+ additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
+ else:
+ raise
+ finally:
+ os.remove(file_name)
+
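# Minimal sketch of the serialization round trip used above (names from this diff):
#   model_obj = OpensourceModels(..., model=self.modelObj, ...)
#   payload = pickle.dumps(model_obj)   # written to deployed_file.pickle, then save_byom()
#   restored = pickle.loads(payload)    # what _load() does after retrieve_byom()
#   restored.model, restored.pos_args, restored.key_args   # fields read back in _load()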
+ @classmethod
+ def _deploy(cls, model_name, model, replace_if_exists=False):
+ """
+ Internal function to create an instance of the class using the model and deploy
+ the model to Vantage.
+ """
+ is_model_supportable = cls._validate_model_supportability(model=model)
+ if not is_model_supportable:
+ raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED,
+ "deploy", "The given model is not a supported opensource model."),
+ MessageCodes.MODEL_CATALOGING_OPERATION_FAILED)
+
+ cls = cls(model=model)
+ # Load the model file into the Vantage node as the file can be used in
+ # predict or other operations.
+ cls._install_initial_model_file()
+
+ cls._save_model(model_name, replace_if_exists)
+
+ return cls
+
+ @classmethod
+ def _load(cls, model_name):
+ """
+ Internal function to load the model corresponding to the package (like sklearn etc.)
+ from Vantage to the client using retrieve_byom() and create an instance of the class if
+ the model is from the same package.
+ """
+ try:
+ model = retrieve_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME,
+ return_addition_columns=True)
+ except TeradataMlException as ex:
+ # Not showing the table name in the error message as it is an internal table.
+ part_msg = f"Model '{model_name}' not found in the table "
+ if part_msg in str(ex):
+ raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name, ""),
+ MessageCodes.MODEL_NOT_FOUND)
+ raise
+
+ model_vals_list = model.get_values()[0]
+ # List of 3 elements -
+ # - model name as index column,
+ # - 1st contains the model object with fields: is_default_partition_value, partition_file_prefix, model, etc.,
+ # - 2nd contains the package name.
+ model_obj = pickle.loads(model_vals_list[0])
+ model = model_obj.model
+ package = model_vals_list[1]
+
+ if package != cls.OPENSOURCE_PACKAGE_NAME.value:
+ # Raise an error if trying to access a model of a different package.
+ raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
+ f". Requested model is from '{package}' package"),
+ MessageCodes.MODEL_NOT_FOUND)
+
+ if isinstance(model, pd.DataFrame):
+ # Create a new instance of the class and set the model object on the instance.
+ # Instantiation can take only a model, not a model object. Hence, passing one of the
+ # models from the pandas df; modelObj and other fields are updated later.
+ cls = cls(model=model.iloc[1,2])
+ cls.modelObj = model
+ cls._fit_partition_unique_values = [lst[:len(lst)-1] for lst in model.values.tolist()]
+ else:
+ cls = cls(model=model)
+
+ cls._model_file_name_prefix = model_obj.partition_file_prefix
+ cls._is_default_partition_value_fit = model_obj.is_default_partition_value
+ cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
+ cls.pos_args = model_obj.pos_args
+ cls.kwargs = model_obj.key_args
+
+ # Load the model file into the Vantage node as the file can be used in
+ # predict or other operations.
+ cls._install_initial_model_file()
+
+ return cls
+
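# Illustrative usage (an assumption - the public entry point is presumed to delegate
# to _load(); name and call shape are not confirmed by this diff):
#   lin_reg = td_sklearn.load("linreg_model_ver_2")
#   lin_reg.predict(X=test_df)   # model file is re-installed on the Vantage node first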
+ def deploy(self, model_name, replace_if_exists=False):
+ """
+ DESCRIPTION:
+ Deploys the model held by the interface object to Vantage.
+
+ PARAMETERS:
+ model_name:
+ Required Argument.
+ Specifies the unique name of the model to be deployed.
+ Types: str
+
+ replace_if_exists:
+ Optional Argument.
+ Specifies whether to replace the model if a model with the same name already
+ exists in Vantage. If this argument is set to False and a model with the same
+ name already exists, then the function raises an exception.
+ Default Value: False
+ Types: bool
+
+ RETURNS:
+ The opensource object wrapper.
+
+ RAISES:
+ TeradataMlException if a model with "model_name" already exists and the argument
+ "replace_if_exists" is set to False.
+
+ EXAMPLES:
+ >>> from teradataml import td_sklearn
+ >>> model = td_sklearn.LinearRegression(normalize=True)
+ >>> model
+ LinearRegression(normalize=True)
+
+ # Example 1: Deploy the model held by the interface object to Vantage.
+ >>> lin_reg = model.deploy("linreg_model_ver_2")
+ Model is saved.
+ >>> lin_reg
+ LinearRegression(normalize=True)
+
+ # Example 2: Deploy the model held by the interface object to Vantage with the same
+ # name as a model that already exists in Vantage.
+ >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
+ Model is deleted.
+ Model is saved.
+ >>> lin_reg
+ LinearRegression(normalize=True)
  """
- if isinstance(self.modelObj, pd.DataFrame):
- # Get list of unique partition values and corresponding model object as dict.
- partition_values_model_dict = {}
- obj_list = self.modelObj.values.tolist()
- for lst in obj_list:
- partition_values_model_dict[tuple(lst[:len(lst)-1])] = lst[len(lst)-1]
 
- for partition in self._fit_partition_unique_values:
- # Create a new file with file name with partition values and
- # dump sklearn object into it. Finally install the file to Vantage.
- partition_join = "_".join([str(x) for x in partition])
- file_name = f"{self._model_file_name_prefix}_{partition_join}"
- # Replace '-' with '_' as '-' can't be present in file identifier.
- # Needed this replace because partition_columns can be negative.
- file_name = file_name.replace("-", "_")
- full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
- with open(full_file_name, "wb+") as fp:
- # Write sklearn object to file.
- if isinstance(self.modelObj, pd.DataFrame):
- # If multiple models, then write the model corresponding to the partition value.
- fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
- else:
- fp.write(pickle.dumps(self.modelObj))
- self.model_file_paths_local.add(file_name)
+ # Install model file into Vantage, if not installed.
+ self._install_initial_model_file()
 
- self._install_script_file(file_identifier=file_name,
- file_name=file_name,
- is_binary=True,
- file_location=self._tdml_tmp_dir)
+ self._save_model(model_name, replace_if_exists)
+ return self
 
- if self._is_lake_system:
- # Need to pass env_name along with file_name for cleaning up the files in env.
- obj = f"{self._env.env_name}::{file_name}"
- if installed_model_files[obj] == 0:
- # Add to GC for the first time the model file (along with env name) is encountered.
- installed_model_files[obj] = 1
- GarbageCollector._add_to_garbagecollector(object_name=obj,
- object_type=TeradataConstants.TERADATA_APPLY)
- else:
- if installed_model_files[file_name] == 0:
- # Add to GC for the first time the model file is encountered.
- installed_model_files[file_name] = 1
- GarbageCollector._add_to_garbagecollector(object_name=file_name,
- object_type=TeradataConstants.TERADATA_SCRIPT)
 
- self._is_model_installed = True
+ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
+
+ OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+
+ def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+ super().__init__(model=model, module_name=module_name, class_name=class_name,
+ pos_args=pos_args, kwargs=kwargs)
+
+ self._initialize_variables(table_name_prefix="td_sklearn_")
+ if model is not None:
+ self.modelObj = model
+ self.module_name = model.__module__.split("._")[0]
+ self.class_name = model.__class__.__name__
+ # __dict__ gets all the arguments as a dictionary, including default ones and positional
+ # args.
+ self.kwargs = model.__dict__
+ self.pos_args = tuple() # Kept empty as all are moved to kwargs.
+ else:
+ self._initialize_object()
+
+ def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
+ skip_either_or_that=False):
+ """
+ Internal function to validate arguments passed to exposed opensource APIs and return
+ parent DataFrame, feature columns, label columns, group columns, data partition columns.
+ """
+ _validate_opensource_func_args(X=X, y=y, groups=groups,
+ fit_partition_cols=self._fit_partition_colums_non_default,
+ kwargs=kwargs,
+ skip_either_or_that=skip_either_or_that)
+ return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
+ fit_partition_cols=self._fit_partition_colums_non_default)
 
  def _run_fit_related_functions(self,
  data,
@@ -814,7 +1202,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  label_columns,
  partition_columns,
  func,
- classes=None):
+ classes=None,
+ file_name="sklearn_fit.py"):
  """
  Internal function to run fit() and partial_fit() functions.
  """
@@ -829,9 +1218,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
  for col in new_partition_columns] + [("model", model_type)]
 
- file_name = "sklearn_fit.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
-
  if classes:
  class_type = type(classes[0]).__name__
  classes = "--".join([str(x) for x in classes])
@@ -857,13 +1243,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  self._model_data = self._run_script(data, script_command, new_partition_columns,
  return_types)
 
- # Extract sklearn object(s) depending on the number of unique partitioning values.
- self.extract_sklearn_obj(n_unique_partitions=len(self._fit_partition_unique_values),
- n_partition_cols=len(new_partition_columns))
-
- # Need these label column types in prediction.
- self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
- for l_c in label_columns]
+ self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
 
  def partial_fit(self, X=None, y=None, classes=None, **kwargs):
  """
@@ -911,11 +1291,19 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  self._is_default_partition_value_fit = False
  self._fit_partition_colums_non_default = partition_columns
 
- self._run_fit_related_functions(data,
- feature_columns,
- label_columns,
- partition_columns,
- inspect.stack()[0][3])
+ file_name = kwargs.pop("file_name", None)
+ func_name = kwargs.pop("name", "fit")
+
+ args = {"data": data,
+ "feature_columns": feature_columns,
+ "label_columns": label_columns,
+ "partition_columns": partition_columns,
+ "func": func_name}
+
+ if file_name is not None:
+ args["file_name"] = file_name
+
+ self._run_fit_related_functions(**args)
 
  self._fit_execution_time = time.time() - st_time
 
@@ -980,10 +1368,130 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
  return super().__getattr__(name)
 
+ def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
+ func_name, **kwargs):
+ """
+ Internal function to handle the multi-model case of the transform function for functions
+ ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of the feature_selection module
+ and "Birch" of the cluster module.
+ These functions generate multiple models, and when transform is applied to each model, it generates
+ output with a different number of columns.
+ """
+ skl_objs_dict = {}
+ no_of_unique_partitions = len(self._fit_partition_unique_values)
+ no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
+
+ # Run on 10 rows of data individually using the corresponding scikit-learn objects based on partition value
+ # and get the maximum number of columns and their types.
+ for i in range(no_of_unique_partitions):
+ skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
+
+
+ data = data.select(feature_columns + label_columns + partition_columns)
+ ten_row_data = data.head(10).get_values()
+ X = numpy.array(ten_row_data)
+
+ # For the multi-model case, a model on one AMP can produce more columns than models on other AMPs.
+ # The RETURNS clause can't contain a different number of columns on different AMPs. Hence, taking the
+ # maximum number of columns and their types across all models.
+ max_no_of_columns = 0
+ max_col_names = []
+ max_col_types = []
+
+ def _get_input_row_without_nans(row):
+ """
+ The input to `inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
+ """
+ X1 = []
+ for _, v in enumerate(row):
+ if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
+ # Add to the list when:
+ # - v is None or
+ # - v is a string or
+ # - v is not nan or
+ # - the module is impute (which transforms nan values) even though v is nan.
+ X1.append(v)
+ else:
+ # Skip nan values.
+ pass
+ return X1
+
+ for i in range(X.shape[0]):
+ # Run `transform` or `inverse_transform` on each row with the corresponding scikit-learn model object.
+ partition_values = tuple(X[i, -no_of_partitioning_cols:])
+ skl_obj = skl_objs_dict[partition_values]
+
+ X1 = X[i, :-no_of_partitioning_cols]
+ # Since NaNs/NULLs are added in transform for the last columns where some models generated
+ # fewer columns, removing NaNs/NULLs from the input row for inverse_transform
+ # using the function _get_input_row_without_nans().
+ X1 = numpy.array([_get_input_row_without_nans(X1)])
+
+ trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
+
+ no_of_columns = 1
+
+ if trans_opt.shape == (X1.shape[0],):
+ trans_opt = trans_opt.reshape(X1.shape[0], 1)
+
+ if isinstance(trans_opt[0], numpy.ndarray) \
+ or isinstance(trans_opt[0], list) \
+ or isinstance(trans_opt[0], tuple):
+ no_of_columns = len(trans_opt[0])
+
+ col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
+
+ # Get new column sqlalchemy types for the pandas df columns of the transform output.
+ opt_pd = pd.DataFrame(trans_opt)
+
+ # Get output column types for each column in the pandas df from the output of transform
+ # type functions.
+ types = {}
+ for idx in range(no_of_columns):
+ col = list(opt_pd.columns)[idx]
+
+ # Only one row in trans_opt.
+ if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
+ type_ = type(trans_opt[0][idx])
+ else:
+ # Only one value in the output.
+ type_ = type(trans_opt[0])
+
+ # If the type of the output value (trans_opt) is None, then use `str` as the type since
+ # pandas astype() does not accept None type.
+ if type_ is type(None):
+ type_ = str
+
+ # numpy integer columns with nan values can't be typecast using pd.astype() to int64.
+ # It raises an error like "Cannot convert non-finite values (NA or inf) to integer:
+ # Error while type casting for column '2'"
+ # Hence, using pd.Int64Dtype() for integer columns with nan values.
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
+
+ # Without this, all columns will be of object type and get converted to VARCHAR in Vantage.
+ opt_pd = opt_pd.astype(types)
+
+ # If the datatype is datetime64 and a timezone is present, then map it to
+ # TIMESTAMP(timezone=True); else map it according to the default mapping.
+ col_types = [TIMESTAMP(timezone=True)
+ if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
+ else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
+ for key, col_name in enumerate(list(opt_pd.columns))]
+
+ # Different models in the multi-model case can generate a different number of output columns, for example in
+ # SelectFpr. Hence, taking the model which generates the maximum number of columns.
+ if no_of_columns > max_no_of_columns:
+ max_no_of_columns = no_of_columns
+ max_col_names = col_names
+ max_col_types = col_types
+
+ return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
+
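# Illustrative note (assumed numbers): with SelectFpr fit per partition, transform may
# keep a different number of features per model, e.g. partition A -> 4 columns,
# partition B -> 2 columns. The RETURNS clause is then built for 4 columns and
# partition B's rows are padded with NULLs/NaNs in the trailing two columns.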
  def _get_return_columns_for_function_(self,
  data,
  feature_columns,
  label_columns,
+ partition_columns,
  func_name,
  kwargs):
  """
@@ -997,7 +1505,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
  data._td_column_names_and_sqlalchemy_types[col.lower()])
  for i, col in enumerate(label_columns)]
- if func_name == "predict":
+
+ if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
  """
  Return predict columns using either label_columns (if provided) or
  self._fit_label_columns_types (if the function is trained using label columns).
@@ -1012,8 +1521,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
  for i, col_type in enumerate(self._fit_label_columns_types)]
 
- data = data.select(feature_columns + label_columns)
-
  ## If function is not `fit_predict`:
  # then take one row of transform/other functions to execute in client
  # to get number of columns in return clause and their Vantage types.
@@ -1027,8 +1534,20 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  skl_obj = self.modelObj
  else:
  # Multi model case.
+ if (func_name in ["transform", "inverse_transform"] and \
+ self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
+ (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
+ # Special handling for multi model case for transform function as these classes
+ # generate transform output with different number of columns for each model.
+ # Hence, need to add Nulls/Nans to columns which are not present in the transform output of
+ # some models.
+ return self._special_handling_multimodel_(data, feature_columns, label_columns,
+ partition_columns, func_name, **kwargs)
+
  skl_obj = self.modelObj.iloc[0]["model"]
 
+ data = data.select(feature_columns + label_columns)
+
  ten_row_data = data.head(10).get_values()
  X = numpy.array(ten_row_data)
  if label_columns:
@@ -1122,7 +1641,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
  # Error while type casting for column '2'"
  # Hence, using pd.Int64Dtype() for integer columns with nan values.
- types[col] = type_ if type_ != numpy.int64 else pd.Int64Dtype()
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
 
  # Without this, all columns will be of object type and get converted to VARCHAR in Vantage.
  opt_pd = opt_pd.astype(types)
@@ -1137,7 +1656,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
 
  @_validate_fit_run
- def _run_function_needing_all_rows(self, X=None, y=None, **kwargs):
+ def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
  """
  Internal function to run functions like score, aic, bic, which need all rows and return
  one floating-point number as the result.
@@ -1160,9 +1679,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  label_columns,
  partition_columns)
 
- file_name = "sklearn_score.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
-
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
 
@@ -1180,7 +1696,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
  for col in new_partition_columns] + [(func_name, FLOAT())]
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1194,7 +1714,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return opt
 
  @_validate_fit_run
- def _transform(self, X=None, y=None, **kwargs):
+ def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
  """
  Internal function to run predict/transform and similar functions, which return
  multiple columns. This function returns the data row along with the generated
@@ -1217,19 +1737,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  partition_columns)
 
  # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
- if "data" in kwargs:
- kwargs.pop("data")
- if "feature_columns" in kwargs:
- kwargs.pop("feature_columns")
- if "group_columns" in kwargs:
- kwargs.pop("group_columns")
- if "partition_columns" in kwargs:
- kwargs.pop("partition_columns")
- if "label_columns" in kwargs:
- kwargs.pop("label_columns")
-
- file_name = "sklearn_transform.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
+ self._remove_data_related_args_from_kwargs(kwargs)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
@@ -1239,26 +1747,42 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
  self._validate_unique_partition_values(data, new_partition_columns)
 
- py_exc = UtilFuncs._get_python_execution_path()
- script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
- f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
- f"{self._model_file_name_prefix} {self._is_lake_system}"
+ return_columns_python_types = None
+ if self._fit_label_columns_python_types:
+ return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
 
  # Returning feature columns also along with transformed columns because we don't know the
  # mapping of feature columns to the transformed columns.
- return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
- for col in (new_partition_columns + feature_columns)]
+ ## 'correct_covariance()' returns the (n_features, n_features)
+ if func_name == "correct_covariance":
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+ for col in new_partition_columns]
+ else:
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+ for col in (new_partition_columns + feature_columns)]
  if func_name in ["predict", "decision_function"] and label_columns:
  return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
  for col in label_columns]
- return_types += self._get_return_columns_for_function_(data,
- feature_columns,
- label_columns,
- func_name,
- kwargs)
 
- # Installing model files before running sklearn_transform.py.
- self._install_initial_model_file()
+ output_cols_types = self._get_return_columns_for_function_(data,
+ feature_columns,
+ label_columns,
+ new_partition_columns,
+ func_name,
+ kwargs)
+ return_types += output_cols_types
+
+ py_exc = UtilFuncs._get_python_execution_path()
+ script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+ f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
+ f"{return_columns_python_types}"
+
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1294,6 +1818,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return_types += self._get_return_columns_for_function_(data,
  feature_columns,
  label_columns,
+ new_partition_columns,
  func_name,
  {})
  else:
@@ -1302,7 +1827,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
 
  file_name = "sklearn_fit_predict.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  data_column_types_str, partition_indices_str, _, new_partition_columns = \
  self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
@@ -1317,7 +1841,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  # Get unique values in partitioning columns.
  self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1376,14 +1904,10 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  skip_either_or_that=True)
 
  # Remove the kwargs data.
- input_data = kwargs.pop("data", None)
- partition_cols = kwargs.pop("partition_columns", None)
- feature_cols = kwargs.pop("feature_columns", None)
- label_cols = kwargs.pop("label_columns", None)
+ self._remove_data_related_args_from_kwargs(kwargs)
 
  if partition_columns:
  # kwargs are passed to kneighbors function. So, removing them from kwargs.
- kwargs.pop("partition_columns")
  self._is_default_partition_value_fit = False
 
  # Generating new partition column name.
@@ -1395,7 +1919,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  args_str = self._get_kwargs_str(kwargs)
 
  file_name = "sklearn_neighbors.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
@@ -1429,7 +1952,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  # Get unique values in partitioning columns.
  self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1513,7 +2040,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  group_columns)
 
  file_name = "sklearn_model_selection_split.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
@@ -1548,7 +2074,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  # Get unique values in partitioning columns.
  self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1562,154 +2092,69 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
  return opt
 
- def _get_returning_df(self, script_df, partition_column, returns):
- """
- Internal function to return the teradataml DataFrame except
- partition_column.
- """
- if self._is_default_partition_value_fit:
- # For single model case, partition column is internally generated
- # and no point in returning it to the user.
-
- # Extract columns from return types.
- returning_cols = [col[0] for col in returns[len(partition_column):]]
- return script_df.select(returning_cols)
- return script_df
-
 
- class _SKLearnFunctionWrapper(_GenericObjectWrapper):
- def __init__(self, module_name, func_name):
+ class _FunctionWrapper(_GenericObjectWrapper):
+ def __init__(self, module_name, func_name, file_type, template_file):
  super().__init__()
- self.__module_name = module_name
- self.__func_name = func_name
- self.__params = None
- self.__data_args = OrderedDict()
- self._model_file_name = _generate_new_name(type="file_function", extension="py")
+ self._module_name = module_name
+ self._func_name = func_name
+ self._params = None
+ self._data_args = OrderedDict()
+ self._template_file = template_file
+ self._script_file_name = _generate_new_name(type=file_type, extension="py")
 
  def __call__(self, **kwargs):
  """
  Run the function with all the arguments passed from `td_sklearn.<function_name>` function.
  """
- __data_columns = []
-
- partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
- if partition_cols:
- kwargs.pop("partition_columns")
-
- # Separate dataframe related arguments and their column names from actual kwargs.
- for k, v in kwargs.items():
- if isinstance(v, DataFrame):
- # All dataframes should be select of parent dataframe.
- _validate_df_query_type(v, "select", k)
-
- # Save all columns in dataframe related arguments.
- __data_columns.extend(v.columns)
-
- self.__data_args[k] = v
+ replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)
 
+ script_file_path = f"{self._script_file_name}" if self._is_lake_system \
+ else f"./{self._db_name}/{self._script_file_name}"
 
- # Get common parent dataframe from all dataframes.
- self.__tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self.__data_args.values()))
-
- self._validate_existence_of_partition_columns(partition_cols, self.__tdml_df.columns)
-
- self.__tdml_df = self.__tdml_df.select(__data_columns + partition_cols)
-
- self.__tdml_df, partition_cols = self._get_data_and_data_partition_columns(self.__tdml_df,
- __data_columns,
- [],
- partition_cols
- )
-
- # Prepare string of data arguments with name, indices where columns of that argument resides
- # and types of each of the column.
- data_args_str = self._prepare_data_args_string(kwargs)
+ model_file_prefix = None
+ if self._is_lake_system:
+ model_file_prefix = self._script_file_name.replace(".py", "")
 
- self.__params = kwargs
+ py_exc = UtilFuncs._get_python_execution_path()
+ script_command = f"{py_exc} {script_file_path} {model_file_prefix} {self._is_lake_system}"
 
- # Get indices of partition_columns and types of all columns.
- data_column_types_str, partition_indices_str, _, partition_cols = \
- self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)
+ model_type = BLOB() if self._is_lake_system else CLOB()
 
- script_file_path = f"{self._model_file_name}" if self._is_lake_system \
- else f"./{self._db_name}/{self._model_file_name}"
- py_exc = UtilFuncs._get_python_execution_path()
- script_command = f"{py_exc} {script_file_path} {partition_indices_str} {data_column_types_str} {data_args_str}"
+ return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
+ for col in partition_cols] + [(self._func_name, model_type)]
 
- return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
- for col in partition_cols] + [(self.__func_name, CLOB())]
+ replace_dict.update({"<module_name>": self._module_name,
+ "<func_name>": self._func_name,
+ "<params>": json.dumps(kwargs)})
 
  # Generate new file in .teradataml directory and install it to Vantage.
- self._prepare_and_install_file()
+ self._prepare_and_install_file(replace_dict=replace_dict)
+
+ try:
+ self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
+ self._model_data._index_label = None
 
- self._model_data = self._run_script(self.__tdml_df, script_command, partition_cols, return_types)
- self._model_data._index_label = None
+ fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()
 
- fit_partition_unique_values = self.__tdml_df.drop_duplicate(partition_cols).get_values()
+ self._extract_model_objs(n_unique_partitions=len(fit_partition_unique_values),
+ n_partition_cols=len(partition_cols))
 
- self.extract_sklearn_obj(n_unique_partitions=len(fit_partition_unique_values),
- n_partition_cols=len(partition_cols))
+ except Exception as ex:
+ # File cleanup if script execution fails or unable to fetch modelObj.
+ os.remove(self._script_file_local)
+ self._remove_script_file(self._script_file_name)
+ raise
 
  # File cleanup after processing.
- os.remove(self._model_file_local)
- remove_file(file_identifier=self._model_file_name.split(".")[0], suppress_output=True,
- force_remove=True)
+ os.remove(self._script_file_local)
+ self._remove_script_file(self._script_file_name)
 
  return self.modelObj
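# Illustrative usage (an assumption based on the docstring above; the exact exposed
# function set is not confirmed by this diff): module-level functions are proxied
# like estimator methods, e.g.:
#   from teradataml import td_sklearn
#   result = td_sklearn.train_test_split(X=df_features, y=df_label)
# The call installs a script generated from the template, runs it per partition on
# Vantage, and returns the deserialized result object(s).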
 
1660
- def _prepare_data_args_string(self, kwargs):
1661
- """
1662
- Get column indices and types of each data related arguments in the format:
1663
- "{<arg_name>-<comma separated indices>-<comma separated types>}--
1664
- {<arg_name>-<comma separated indices>-<comma separated types>}"
1665
- """
1666
- data_args_str = []
1667
- for arg_name in list(self.__data_args.keys()):
1668
- # Remove DataFrame arguments from kwargs, which will be passed to Script.
1669
- kwargs.pop(arg_name)
1670
-
1671
- # Get column indices and their types for each dataframe from parent dataframe.
1672
- _, partition_indices_str, partition_types_str, _ = \
1673
- self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
1674
- self.__data_args[arg_name].columns,
1675
- idx_delim=",",
1676
- types_delim=",")
1677
-
1678
- # Format "<arg_name>-<comma separated indices>-<comma separated types>"
1679
- data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
1680
-
1681
- # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
1682
- # {<arg_name>-<comma separated indices>-<comma separated types>}"
1683
- return "--".join(data_args_str)
1684
-
1685
- def _validate_existence_of_partition_columns(self, partition_columns, all_columns):
1686
- """
1687
- Validate if columns in "partition_columns" argument are present in any of the given
1688
- dataframes.
1689
- """
1690
- invalid_part_cols = [c for c in partition_columns if c not in all_columns]
1691
-
1692
- if invalid_part_cols:
1693
- raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
1694
- ", ".join(invalid_part_cols),
1695
- "', '".join(list(self.__data_args.keys())))
1696
- )
1697
-
1698
- def _prepare_and_install_file(self):
1699
- """
1700
- Prepare function script file from template file and install it in Vaantage.
1701
- """
1702
- with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
1703
- script_data = fp.read()
1704
- script_data = script_data.replace("<module_name>",self.__module_name).\
1705
- replace("<func_name>",self.__func_name).replace("<params>", json.dumps(self.__params))
1706
-
1707
- self._model_file_local = os.path.join(self._tdml_tmp_dir, self._model_file_name)
1708
-
1709
- with open(self._model_file_local, "w") as fp:
1710
- fp.write(script_data)
1711
-
1712
- self._install_script_file(file_identifier=self._model_file_name.split(".")[0],
1713
- file_name=self._model_file_name,
1714
- file_location=self._tdml_tmp_dir)
1715
2155
 
2156
+ class _SKLearnFunctionWrapper(_FunctionWrapper):
2157
+ def __init__(self, module_name, func_name):
2158
+ file_type = "file_fn_sklearn"
2159
+ template_file = "sklearn_function.template"
2160
+ super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)