teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of teradataml might be problematic. Click here for more details.

Files changed (126) hide show
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +315 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +95 -8
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/metadata.py +12 -3
  8. teradataml/analytics/json_parser/utils.py +7 -2
  9. teradataml/analytics/sqle/__init__.py +5 -1
  10. teradataml/analytics/table_operator/__init__.py +1 -1
  11. teradataml/analytics/uaf/__init__.py +1 -1
  12. teradataml/analytics/utils.py +4 -0
  13. teradataml/analytics/valib.py +18 -4
  14. teradataml/automl/__init__.py +51 -6
  15. teradataml/automl/data_preparation.py +59 -35
  16. teradataml/automl/data_transformation.py +58 -33
  17. teradataml/automl/feature_engineering.py +27 -12
  18. teradataml/automl/model_training.py +73 -46
  19. teradataml/common/constants.py +88 -29
  20. teradataml/common/garbagecollector.py +2 -1
  21. teradataml/common/messagecodes.py +19 -3
  22. teradataml/common/messages.py +6 -1
  23. teradataml/common/sqlbundle.py +64 -12
  24. teradataml/common/utils.py +246 -47
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +161 -27
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/byom_example.json +11 -0
  29. teradataml/data/dataframe_example.json +18 -2
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  37. teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
  38. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  39. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  40. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  41. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  42. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  43. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  44. teradataml/data/hnsw_alter_data.csv +5 -0
  45. teradataml/data/hnsw_data.csv +10 -0
  46. teradataml/data/jsons/byom/h2opredict.json +1 -1
  47. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  48. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  49. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  50. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  51. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  52. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  53. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  54. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  55. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  56. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  57. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  58. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  59. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  60. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  61. teradataml/data/medical_readings.csv +101 -0
  62. teradataml/data/patient_profile.csv +101 -0
  63. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  64. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  65. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  66. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  67. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  68. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  69. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  70. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  71. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  72. teradataml/data/target_udt_data.csv +8 -0
  73. teradataml/data/templates/open_source_ml.json +3 -2
  74. teradataml/data/teradataml_example.json +8 -0
  75. teradataml/data/vectordistance_example.json +4 -0
  76. teradataml/dataframe/copy_to.py +8 -3
  77. teradataml/dataframe/data_transfer.py +11 -1
  78. teradataml/dataframe/dataframe.py +1049 -285
  79. teradataml/dataframe/dataframe_utils.py +152 -20
  80. teradataml/dataframe/functions.py +578 -35
  81. teradataml/dataframe/setop.py +11 -6
  82. teradataml/dataframe/sql.py +185 -16
  83. teradataml/dbutils/dbutils.py +1049 -115
  84. teradataml/dbutils/filemgr.py +48 -1
  85. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  86. teradataml/lib/aed_0_1.dll +0 -0
  87. teradataml/opensource/__init__.py +1 -1
  88. teradataml/opensource/_base.py +1466 -0
  89. teradataml/opensource/_class.py +464 -0
  90. teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
  91. teradataml/opensource/_lightgbm.py +949 -0
  92. teradataml/opensource/_sklearn.py +1008 -0
  93. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
  94. teradataml/options/__init__.py +54 -38
  95. teradataml/options/configure.py +131 -27
  96. teradataml/options/display.py +13 -2
  97. teradataml/plot/axis.py +47 -8
  98. teradataml/plot/figure.py +33 -0
  99. teradataml/plot/plot.py +63 -13
  100. teradataml/scriptmgmt/UserEnv.py +5 -5
  101. teradataml/scriptmgmt/lls_utils.py +130 -40
  102. teradataml/store/__init__.py +12 -0
  103. teradataml/store/feature_store/__init__.py +0 -0
  104. teradataml/store/feature_store/constants.py +291 -0
  105. teradataml/store/feature_store/feature_store.py +2318 -0
  106. teradataml/store/feature_store/models.py +1505 -0
  107. teradataml/table_operators/Apply.py +32 -18
  108. teradataml/table_operators/Script.py +3 -1
  109. teradataml/table_operators/TableOperator.py +3 -1
  110. teradataml/table_operators/query_generator.py +3 -0
  111. teradataml/table_operators/table_operator_query_generator.py +3 -1
  112. teradataml/table_operators/table_operator_util.py +37 -38
  113. teradataml/table_operators/templates/dataframe_register.template +69 -0
  114. teradataml/utils/dtypes.py +51 -2
  115. teradataml/utils/internal_buffer.py +18 -0
  116. teradataml/utils/validators.py +99 -8
  117. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
  118. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
  119. teradataml/libaed_0_1.dylib +0 -0
  120. teradataml/libaed_0_1.so +0 -0
  121. teradataml/opensource/sklearn/__init__.py +0 -1
  122. teradataml/opensource/sklearn/_class.py +0 -255
  123. teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
  124. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
  125. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
  126. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
@@ -1,1800 +0,0 @@
1
- # ##################################################################
2
- #
3
- # Copyright 2023 Teradata. All rights reserved.
4
- # TERADATA CONFIDENTIAL AND TRADE SECRET
5
- #
6
- # Primary Owner: Adithya Avvaru (adithya.avvaru@teradata.com)
7
- # Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
8
- #
9
- # Version: 1.0
10
- # Function Version: 1.0
11
- #
12
- # This file contains object wrapper class for opensource packages and child object
13
- # wrapper classes for each opensource package. Currently, we have child object
14
- # wrapper class for scikit-learn.
15
- #
16
- # ##################################################################
17
-
18
- from collections import OrderedDict, defaultdict
19
- from importlib import import_module
20
-
21
- import base64
22
- import functools
23
- import json
24
- import numpy
25
- import os
26
- import pickle
27
- import time
28
- import inspect
29
- import warnings
30
- import json
31
- import random
32
- import pandas as pd
33
- from teradatasqlalchemy import BLOB, CLOB, FLOAT, TIMESTAMP, VARCHAR, INTEGER
34
- import pandas.api.types as pt
35
-
36
- from teradataml import _TDML_DIRECTORY, Script, TeradataMlException, Apply
37
- from teradataml.dataframe.copy_to import _get_sqlalchemy_mapping
38
- from teradataml.common import pylogger
39
- from teradataml.common.utils import UtilFuncs
40
- from teradataml.context.context import _get_current_databasename, get_connection
41
- from teradataml.dbutils.filemgr import install_file, remove_file
42
- from teradataml.utils.utils import execute_sql
43
- from teradataml.options.configure import configure
44
- from teradataml.opensource.sklearn._wrapper_utils import _validate_fit_run, _generate_new_name,\
45
- _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
46
- from teradataml.opensource.sklearn.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
47
- _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
48
- _OSML_ADDITIONAL_COLUMN_TYPES
49
- from teradataml.common.messagecodes import MessageCodes
50
- from teradataml.common.messages import Messages
51
- from teradataml.catalog.byom import save_byom, retrieve_byom, delete_byom
52
- from teradataml.dbutils.dbutils import _create_table, set_session_param
53
- from teradataml.utils.validators import _Validators
54
- from teradataml.dataframe.dataframe import DataFrame
55
- from teradataml.dataframe.dataframe_utils import DataFrameUtils
56
- from teradataml.scriptmgmt.lls_utils import create_env, get_env
57
- from teradataml.common.garbagecollector import GarbageCollector
58
- from teradataml.common.constants import TeradataConstants
59
-
60
-
61
- logger = pylogger.getLogger()
62
-
63
- validator = _Validators()
64
-
65
- installed_model_files = defaultdict(int)
66
-
67
- ## Flag to ensure the sklearn script
68
- ## installation occurs only once.
69
- _file_installed = False
70
-
71
- class _GenericObjectWrapper:
72
- def __init__(self) -> None:
73
- self._db_name = _get_current_databasename()
74
-
75
- self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "sklearn")
76
-
77
- # Some random number to be used as partition value if partition_columns is None for fit().
78
- self._default_data_partition_value = -1001
79
-
80
- self.modelObj = None
81
- self._model_data = None
82
-
83
- self._tdml_tmp_dir = GarbageCollector._get_temp_dir_name()
84
-
85
- self._env = None
86
-
87
- self._is_lake_system = UtilFuncs._is_lake()
88
-
89
- if self._is_lake_system:
90
- if configure.openml_user_env is not None:
91
- self._env = configure.openml_user_env
92
- else:
93
- self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
94
- else:
95
- set_session_param("searchuifdbpath",self._db_name)
96
-
97
- global _file_installed
98
- ## Flag to check whether trained model is installed or not.
99
- self._is_trained_model_installed = False
100
-
101
- ## Install all sklearn script files on Vantage.
102
- if not _file_installed:
103
- sklearn_script_files = ["sklearn_fit.py", "sklearn_score.py",
104
- "sklearn_transform.py", "sklearn_fit_predict.py",
105
- "sklearn_neighbors.py", "sklearn_model_selection_split.py"]
106
- for script_file in sklearn_script_files:
107
- self._install_script_file(file_identifier=script_file.split(".")[0],
108
- file_name=script_file)
109
-
110
- _file_installed = True
111
-
112
- def _get_columns_as_list(self, cols):
113
- """
114
- Internal function to get columns as list of strings.
115
- Empty list is returned if cols is None.
116
- """
117
- if cols is None:
118
- return []
119
- if not isinstance(cols, list) and not isinstance(cols, tuple):
120
- return [cols]
121
- return cols
122
-
123
- def _get_data_and_data_partition_columns(self, data, feature_columns, label_columns,
124
- partition_columns=None, group_columns=[]):
125
- """
126
- Internal function to generate one new partition column (if not provided) and return
127
- data and partition columns (either generated or passed one).
128
- """
129
- new_partition_columns = self._get_columns_as_list(partition_columns)
130
-
131
- if not partition_columns:
132
- # If partition column is not specified, create a partition column and run Script.
133
- # This runs the Script in one AMP as we are partitioning data using this column
134
- # which contains only one value.
135
- new_partition_columns = [_generate_new_name(type="column")]
136
- data = data.assign(**{new_partition_columns[0]: self._default_data_partition_value})
137
-
138
- # Filter out partition columns from feature columns and label columns.
139
- new_partition_columns_filtered = [col for col in new_partition_columns
140
- if col not in (feature_columns + label_columns + group_columns)]
141
-
142
- all_columns = feature_columns + label_columns + group_columns + new_partition_columns_filtered
143
- return data.select(all_columns), new_partition_columns
144
-
145
- def _run_script(self, data, command, partition_columns, return_types):
146
- """
147
- Internal function to run Script(), given the argument needed by STO's or
148
- Apply's Script.
149
- """
150
- if isinstance(partition_columns, list) and len(partition_columns) == 0:
151
- partition_columns = None
152
-
153
- if self._is_lake_system:
154
- obj = Apply(data=data,
155
- returns=OrderedDict(return_types),
156
- apply_command=command,
157
- data_partition_column=partition_columns,
158
- env_name=self._env,
159
- delimiter="\t")
160
- else:
161
- obj = Script(data=data,
162
- returns=OrderedDict(return_types),
163
- script_command=command,
164
- data_partition_column=partition_columns)
165
- obj.check_reserved_keyword = False
166
-
167
- obj.skip_argument_validation = True
168
- return obj.execute_script(output_style="TABLE")
169
-
170
- def _install_script_file(self,
171
- file_identifier=None,
172
- file_name=None,
173
- is_binary=False,
174
- file_location=None):
175
- """
176
- Internal function to install script file in Vantage.
177
- """
178
- if file_location is None:
179
- file_location = self._scripts_path
180
- new_script = os.path.join(file_location, file_name)
181
-
182
- # _env is set while object creation
183
- # If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
184
-
185
- if not self._is_lake_system:
186
- status = install_file(file_identifier=file_identifier,
187
- file_path=new_script,
188
- replace=True,
189
- suppress_output=True,
190
- is_binary=is_binary)
191
- else:
192
- status = self._env.install_file(file_path=new_script,
193
- replace=True,
194
- suppress_output=True)
195
- if not status:
196
- raise TeradataMlException(
197
- f"Script file '{file_name}' failed to get installed/replaced in Vantage."
198
- )
199
-
200
- def _remove_script_file(self, file_name):
201
- """
202
- Internal function to remove script file in Vantage.
203
- """
204
- # _env is set while object creation
205
- # If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
206
-
207
- if not self._is_lake_system:
208
- status = remove_file(file_identifier=file_name.split(".")[0],
209
- force_remove=True,
210
- suppress_output=True)
211
- else:
212
- status = self._env.remove_file(file_name=file_name,
213
- suppress_output=True)
214
- if not status:
215
- raise TeradataMlException(
216
- f"Script file '{file_name}' failed to remove in Vantage."
217
- )
218
- def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
219
- idx_delim=",",
220
- types_delim="--"):
221
- """
222
- Internal function to get the data column types and partition column names, indices and types.
223
- Function returns delimiter separated string of types and indices if idx_delim and
224
- types_delim are provided. Otherwise, it returns list of types and indices. Partition names
225
- are returned as list always.
226
- """
227
- data_column_types = "" if types_delim else []
228
- partition_indices = "" if idx_delim else []
229
- partition_types = "" if types_delim else []
230
- new_partition_columns = []
231
- j = 0
232
- for i, col in enumerate(data.columns):
233
- _type = data._td_column_names_and_sqlalchemy_types[col.lower()].python_type.__name__
234
- if types_delim:
235
- data_column_types += (_type if i == 0 else f"{types_delim}{_type}")
236
- else:
237
- data_column_types.append(_type)
238
- if col in partition_columns:
239
- new_partition_columns.append(col)
240
- if idx_delim:
241
- partition_indices += (str(i) if j == 0 else f"{idx_delim}{str(i)}")
242
- else:
243
- partition_indices.append(i)
244
- if types_delim:
245
- partition_types += (_type if j == 0 else f"{types_delim}{_type}")
246
- else:
247
- partition_types.append(_type)
248
- j += 1
249
- # Return types of all columns (as list or str), partition column indices (as list or str)
250
- # and partition column types (as list or str).
251
- return data_column_types, partition_indices, partition_types, new_partition_columns
252
-
253
- def _get_kwargs_str(self, kwargs):
254
- """
255
- Returns string of kwargs in the format:
256
- key1 val1-type1 key2 val2-type2 ...
257
- """
258
- args_str = ""
259
- for key, val in kwargs.items():
260
- strr = f"{key} {str(val)}-{type(val).__name__}"
261
- if args_str == "":
262
- args_str += strr
263
- else:
264
- args_str += f" {strr}"
265
- return args_str
266
-
267
- def extract_sklearn_obj(self, n_unique_partitions = 1, n_partition_cols = 1):
268
- """
269
- Internal function to extract sklearn object from the model(s) depending on the number of
270
- partitions. When it is only one model, it is directly used as sklearn object (modelObj).
271
- When it is multiple models, it is converted to pandas DataFrame and stored in sklearn
272
- object.
273
- """
274
- vals = execute_sql("select * from {}".format(self._model_data._table_name)).fetchall()
275
-
276
- # pickle will issue a caution warning, if model pickling was done with
277
- # different library version than used here. The following disables any warnings
278
- # that might otherwise show in the scriptlog files on the Advanced SQL Engine
279
- # nodes in this case. Yet, do keep an eye for incompatible pickle versions.
280
- warnings.filterwarnings("ignore")
281
-
282
- model_obj = None
283
- # Extract and unpickle last column which is the model object.
284
- for i, row in enumerate(vals):
285
- if self._is_lake_system:
286
- model_obj = pickle.loads(row[n_partition_cols])
287
- else:
288
- model_obj = pickle.loads(base64.b64decode(row[n_partition_cols].partition("'")[2]))
289
- row[n_partition_cols] = model_obj
290
- vals[i] = row
291
- if n_unique_partitions == 1:
292
- self.modelObj = model_obj
293
- elif n_unique_partitions > 1:
294
- self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
295
- else:
296
- ValueError("Number of partitions should be greater than 0.")
297
-
298
- warnings.filterwarnings("default")
299
-
300
-
301
- class _OpenSourceObjectWrapper(_GenericObjectWrapper):
302
- # This has to be set for every package which subclasses this class.
303
- OPENSOURCE_PACKAGE_NAME = None
304
-
305
- def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
306
- if not model and not module_name and not class_name:
307
- raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
308
- "module_name and class_name"),
309
- MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)
310
-
311
- validator._validate_mutually_inclusive_arguments(module_name, "module_name",
312
- class_name, "class_name")
313
-
314
- super().__init__()
315
-
316
- self.module_name = module_name
317
- self.class_name = class_name
318
- self.kwargs = kwargs if kwargs is not None else {}
319
- self.pos_args = pos_args if pos_args is not None else tuple()
320
-
321
- self._fit_label_columns_types = None
322
- self._table_name_prefix = None
323
-
324
- self._is_default_partition_value_fit = True # False when the user provides partition columns.
325
- self._fit_partition_colums_non_default = None
326
- self._is_default_partition_value_predict = True # False when the user provides partition columns.
327
-
328
- def _validate_equality_of_partition_values(self, fit_values, trans_values):
329
- """
330
- Internal function to compare the partition values in fit() and predict() are same.
331
- """
332
- if len(fit_values) != len(trans_values):
333
- return False
334
-
335
- for val in fit_values:
336
- if not all([val in trans_values]):
337
- return False
338
-
339
- return True
340
-
341
- def _validate_unique_partition_values(self, data, partition_columns):
342
- """
343
- Internal function to validate if the partition values in partition_columns used in fit()
344
- and predict() are same.
345
- """
346
- data._index_label = None
347
- unique_values = data.drop_duplicate(partition_columns).get_values()
348
-
349
- trans_unique_values = sorted(unique_values.tolist(), key=lambda x: tuple(x))
350
- fit_unique_values = sorted(self._fit_partition_unique_values.tolist() \
351
- if not isinstance(self._fit_partition_unique_values, list) \
352
- else self._fit_partition_unique_values, key=lambda x: tuple(x))
353
- default_unique_values = [[self._default_data_partition_value]]
354
-
355
- if fit_unique_values == default_unique_values and \
356
- trans_unique_values != default_unique_values:
357
- error_msg = Messages.get_message(MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT,
358
- "without", "with")
359
- msg_code = MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT
360
- raise TeradataMlException(error_msg, msg_code)
361
-
362
- if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
363
- raise TeradataMlException(
364
- Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING),
365
- MessageCodes.PARTITION_VALUES_NOT_MATCHING
366
- )
367
-
368
- def fit(self, **kwargs):
369
- pass
370
-
371
- def __get_obj_attributes_multi_model(self, name):
372
- """
373
- Internal function to get attributes of all sklearn model objects when multiple models are
374
- generated by fit.
375
- """
376
-
377
- def __generate_model_object(model_obj_value):
378
- """
379
- Internal function to generate _SkLearnWrapperObject model object from model_obj_value.
380
- """
381
- # Create _SkLearnObjectWrapper object from opensource model object.
382
- model_obj = self.__class__(model=first_atrribute_instance)
383
- model_obj.modelObj = model_obj_value
384
- model_obj._is_model_installed = True
385
-
386
- # Setting other model attributes.
387
- model_obj._is_default_partition_value_fit = self._is_default_partition_value_fit
388
- model_obj._is_default_partition_value_predict = self._is_default_partition_value_predict
389
- model_obj._fit_partition_colums_non_default = self._fit_partition_colums_non_default
390
- model_obj._fit_partition_unique_values = self._fit_partition_unique_values
391
- return model_obj
392
-
393
- # Wrapper function to invoke dynamic method, using arguments
394
- # passed by user, on model in each row.
395
- def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
396
- multi_models = self.modelObj.copy()
397
- for i in range(multi_models.shape[0]):
398
- curr_model = multi_models.iloc[i]["model"]
399
- multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
400
-
401
- first_function_instance = multi_models.at[0, "model"]
402
- if self.__class__._validate_model_supportability(first_function_instance):
403
- return __generate_model_object(multi_models)
404
-
405
- return multi_models.rename(columns={"model": name})
406
-
407
- # Assuming that self.modelObj will have at least 1 row.
408
-
409
- # Get attribute instance from first model object.
410
- first_atrribute_instance = getattr(self.modelObj.iloc[0]["model"], name)
411
-
412
- # If first_atrribute_instance is callable, it should be applied on model in each row
413
- # using passed arguments.
414
- if callable(first_atrribute_instance):
415
- return __sklearn_method_invoker_for_multimodel
416
-
417
- output_attributes = self.modelObj.copy()
418
- for i in range(output_attributes.shape[0]):
419
- model = output_attributes.iloc[i]["model"]
420
- output_attributes.at[i, "model"] = getattr(model, name)
421
-
422
- if self.__class__._validate_model_supportability(first_atrribute_instance):
423
- return __generate_model_object(output_attributes)
424
-
425
- return output_attributes.rename(columns={"model": name})
426
-
427
- def __getattr__(self, name):
428
- # This just run attributes (functions and properties) from sklearn object.
429
- def __sklearn_method_invoker(*c, **kwargs):
430
- # sklearn model is returned from the function call. Create _SkLearnObjectWrapper object.
431
- model_obj = attribute_instance(*c, **kwargs)
432
- if self.__class__._validate_model_supportability(model_obj):
433
- model_obj = self.__class__(model=model_obj)
434
- model_obj._is_model_installed = True # Trained model is returned by function call.
435
- return model_obj
436
-
437
- if isinstance(self.modelObj, pd.DataFrame):
438
- return self.__get_obj_attributes_multi_model(name)
439
-
440
- attribute_instance = getattr(self.modelObj, name)
441
-
442
- if callable(attribute_instance):
443
- return __sklearn_method_invoker
444
-
445
- if self.__class__._validate_model_supportability(attribute_instance):
446
- # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
447
- model_obj = self.__class__(model=attribute_instance)
448
- model_obj._is_model_installed = True # Trained model is returned as attribute.
449
- return model_obj
450
-
451
- return attribute_instance
452
-
453
- @classmethod
454
- def _validate_model_supportability(cls, model):
455
- """
456
- Internal function to validate if the model provided for deployment is supported by
457
- teradataml's opensourceML.
458
- """
459
- error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
460
- "The given model is not a supported opensource model.")
461
- msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
462
- package_name = None
463
- class_name = None
464
- try:
465
- # For scikit-learn, model.__module__ is similar to 'sklearn.linear_model._base'.
466
- # TODO: check for other supported packages.
467
- if hasattr(model, "__module__"):
468
- package_name = model.__module__.split(".")[0]
469
- if package_name not in OpenSourcePackage.values():
470
- return False
471
- if hasattr(model, "__class__"):
472
- class_name = model.__class__.__name__
473
- except Exception as ex:
474
- # If in case, model.__module__ fails.
475
- raise TeradataMlException(error_msg, msg_code) from ex
476
-
477
- # True only if package name is opensource package name and class name is not internal class.
478
- return True if package_name and class_name and \
479
- package_name == cls.OPENSOURCE_PACKAGE_NAME.value and not class_name.startswith("_") else False
480
-
481
- def _save_model(self, model_name, replace_if_exists=False):
482
- """
483
- Internal function to save the model stored in file at location mentioned by class variable
484
- "model_file_path_local" to Vantage using BYOM methods save_byom() and delete_byom() based
485
- on the value of "replace_if_exists" argument.
486
- """
487
- # Creating a table, if doesn't exist, in Vantage to store the model info.
488
- conn = get_connection()
489
- osml_models_table_exists = conn.dialect.has_table(conn,
490
- table_name=_OSML_MODELS_TABLE_NAME,
491
- schema=self._db_name,
492
- table_only=True)
493
- if not osml_models_table_exists:
494
- all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
495
- all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
496
- _create_table(table_name=_OSML_MODELS_TABLE_NAME, columns=all_columns,
497
- primary_index=_OSML_MODELS_PRIMARY_INDEX, schema_name=self._db_name)
498
-
499
- model_obj = OpensourceModels(is_default_partition_value=self._is_default_partition_value_fit,
500
- partition_file_prefix=self._model_file_name_prefix,
501
- fit_partition_columns_non_default=self._fit_partition_colums_non_default,
502
- model=self.modelObj,
503
- pos_args=self.pos_args,
504
- key_args=self.kwargs)
505
-
506
- # Saved the model object to a file to be used in save_byom() for writing to Vantage table.
507
- file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
508
- with open(file_name, "wb+") as fp:
509
- fp.write(pickle.dumps(model_obj))
510
-
511
- try:
512
- save_byom(model_id=model_name,
513
- model_file=file_name,
514
- table_name=_OSML_MODELS_TABLE_NAME,
515
- additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
516
- additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
517
- except TeradataMlException as ex:
518
- model_exists_msg = Messages.get_message(MessageCodes.MODEL_ALREADY_EXISTS, model_name)
519
- if not replace_if_exists and model_exists_msg == str(ex):
520
- raise
521
- elif replace_if_exists and model_exists_msg == str(ex):
522
- # Delete the model from Model table and save again.
523
- delete_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME)
524
- save_byom(model_id=model_name,
525
- model_file=file_name,
526
- table_name=_OSML_MODELS_TABLE_NAME,
527
- additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
528
- additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
529
- else:
530
- raise
531
- finally:
532
- os.remove(file_name)
533
-
534
- @classmethod
535
- def _deploy(cls, model_name, model, replace_if_exists=False):
536
- """
537
- Internal function to create an instance of the class using the model and deploy
538
- the model to Vantage.
539
- """
540
- is_model_supportable = cls._validate_model_supportability(model=model)
541
- if not is_model_supportable:
542
- raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED,
543
- "deploy", "The given model is not a supported opensource model."),
544
- MessageCodes.MODEL_CATALOGING_OPERATION_FAILED)
545
-
546
- cls = cls(model=model)
547
- # Load the model file into Vantage node as file can be used in
548
- # predict or other operations.
549
- cls._install_initial_model_file()
550
-
551
- cls._save_model(model_name, replace_if_exists)
552
-
553
- return cls
554
-
555
- @classmethod
556
- def _load(cls, model_name):
557
- """
558
- Internal function to load model corresponding to the package (like sklearn etc)
559
- from Vantage to client using retrieve_byom() and create an instance of the class if
560
- the model is from the same package.
561
- """
562
- try:
563
- model = retrieve_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME,
564
- return_addition_columns=True)
565
- except TeradataMlException as ex:
566
- # Not showing table name in error message as it is an internal table.
567
- part_msg = f"Model '{model_name}' not found in the table "
568
- if part_msg in str(ex):
569
- raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name, ""),
570
- MessageCodes.MODEL_NOT_FOUND)
571
- raise
572
-
573
- model_vals_list = model.get_values()[0]
574
- # List of 3 elements -
575
- # - model name as index column,
576
- # - 1st contains model object with fields: is_default_partition_value, partition_file_prefix, model. etc
577
- # - 2nd contains package name.
578
- model_obj = pickle.loads(model_vals_list[0])
579
- model = model_obj.model
580
- package = model_vals_list[1]
581
-
582
- if package != cls.OPENSOURCE_PACKAGE_NAME.value:
583
- # Raise error if trying to access model of different package.
584
- raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
585
- f". Requested model is from '{package}' package"),
586
- MessageCodes.MODEL_NOT_FOUND)
587
-
588
- if isinstance(model, pd.DataFrame):
589
- # Create a new instance of the class and set the model object to the instance.
590
- # Instantiation can take only model, not model object. Hence, passing one of the model
591
- # from pandas df. Updating modelObj and other fields later
592
- cls = cls(model=model.iloc[1,2])
593
- cls.modelObj = model
594
- cls._fit_partition_unique_values = [lst[:len(lst)-1] for lst in model.values.tolist()]
595
- else:
596
- cls = cls(model=model)
597
-
598
- cls._model_file_name_prefix = model_obj.partition_file_prefix
599
- cls._is_default_partition_value_fit = model_obj.is_default_partition_value
600
- cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
601
- cls.pos_args = model_obj.pos_args
602
- cls.kwargs = model_obj.key_args
603
-
604
- # Load the model file into Vantage node as file can be used in
605
- # predict or other operations.
606
- cls._install_initial_model_file()
607
-
608
- return cls
609
-
610
- def deploy(self, model_name, replace_if_exists=False):
611
- """
612
- DESCRIPTION:
613
- Deploys the model held by interface object to Vantage.
614
-
615
- PARAMETERS:
616
- model_name:
617
- Required Argument.
618
- Specifies the unique name of the model to be deployed.
619
- Types: str
620
-
621
- replace_if_exists:
622
- Optional Argument.
623
- Specifies whether to replace the model if a model with the same name already
624
- exists in Vantage. If this argument is set to False and a model with the same
625
- name already exists, then the function raises an exception.
626
- Default Value: False
627
- Types: bool
628
-
629
- RETURNS:
630
- The opensource object wrapper.
631
-
632
- RAISES:
633
- TeradataMLException if model with "model_name" already exists and the argument
634
- "replace_if_exists" is set to False.
635
-
636
- EXAMPLES:
637
- >>> from teradataml import td_sklearn
638
- >>> model = td_sklearn.LinearRegression(normalize=True)
639
- >>> model
640
- LinearRegression(normalize=True)
641
-
642
- # Example 1: Deploy the model held by interface object to Vantage.
643
- >>> lin_reg = model.deploy("linreg_model_ver_2")
644
- Model is saved.
645
- >>> lin_reg
646
- LinearRegression(normalize=True)
647
-
648
- # Example 2: Deploy the model held by interface object to Vantage with the name same
649
- # as that of model that already existed in Vantage.
650
- >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
651
- Model is deleted.
652
- Model is saved.
653
- >>> lin_reg
654
- LinearRegression(normalize=True)
655
- """
656
-
657
- # Install model file into Vantage, if not installed.
658
- self._install_initial_model_file()
659
-
660
- self._save_model(model_name, replace_if_exists)
661
- return self
662
-
663
-
664
- class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
665
-
666
- OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
667
-
668
- def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
669
- super().__init__(model=model, module_name=module_name, class_name=class_name,
670
- pos_args=pos_args, kwargs=kwargs)
671
-
672
- self._initialize_variables()
673
- if model:
674
- self.modelObj = model
675
- self.module_name = model.__module__.split("._")[0]
676
- self.class_name = model.__class__.__name__
677
- # __dict__ gets all the arguments as dictionary including default ones and positional
678
- # args.
679
- self.kwargs = model.__dict__
680
- self.pos_args = tuple() # Kept empty as all are moved to kwargs.
681
- else:
682
- self._initialize_object()
683
-
684
- def __repr__(self):
685
- if self._is_default_partition_value_fit:
686
- # Single model use case.
687
- return self.modelObj.__repr__()
688
-
689
- pd.set_option("display.expand_frame_repr", None)
690
- pd.set_option("display.max_colwidth", None)
691
- opt = self.modelObj.__repr__()
692
- pd.reset_option("display.expand_frame_repr")
693
- pd.reset_option("display.max_colwidth")
694
- return opt
695
-
696
- def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
697
- skip_either_or_that=False):
698
- """
699
- Internal function to validate arguments passed to exposed opensource APIs and return
700
- parent DataFrame, feature columns, label columns, group columns, data partition columns.
701
- """
702
- _validate_opensource_func_args(X=X, y=y, groups=groups,
703
- fit_partition_cols=self._fit_partition_colums_non_default,
704
- kwargs=kwargs,
705
- skip_either_or_that=skip_either_or_that)
706
- return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
707
- fit_partition_cols=self._fit_partition_colums_non_default)
708
-
709
- def _initialize_object(self):
710
- """
711
- Internal function to initialize sklearn object from module name and class name.
712
- """
713
- # Needed when writing imported modules to generated file. TODO: Remove later.
714
- imported_args = {}
715
- # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
716
- # corresponding sklearn object.
717
- new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
718
- new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
719
-
720
- # Create model object from new positional and keyword arguments.
721
- class_obj = getattr(import_module(self.module_name), self.class_name)
722
- if new_sklearn_pos_args:
723
- self.modelObj = class_obj(*new_sklearn_pos_args, **new_sklearn_kwargs)
724
- else:
725
- self.modelObj = class_obj(**new_sklearn_kwargs)
726
-
727
- # All arguments are moved to kwargs and kept pos_args empty.
728
- # Might help in set_params() bug fix.
729
- self.pos_args = tuple()
730
- _arguments = self.modelObj.__dict__
731
-
732
- if hasattr(self.modelObj, "get_params"):
733
- # Update kwargs that are both in modelObj and get_params() as there are
734
- # some classes which return other internals variables also.
735
- # Hence, filtering them using get_params().
736
- for k, v in _arguments.items():
737
- if type(v).__name__ in ["function", "generator"]:
738
- # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
739
- # are not supported yet due to pickling issue.
740
- continue
741
- if k in self.get_params():
742
- self.kwargs[k] = v
743
- else:
744
- # Model selection classes will not have `get_params`, in which case modelObj's __dict__
745
- # is saved as kwargs.
746
- self.kwargs = _arguments
747
-
748
- def _initialize_variables(self):
749
- """
750
- Internal function to initialize variables used in this class.
751
- """
752
- self.feature_names_in_ = None
753
- self._table_name_prefix = "td_sklearn_"
754
- self._model_file_name_prefix = _generate_new_name(type="file")
755
- self.model_file_paths_local = set()
756
-
757
- self._fit_execution_time = None
758
- self._fit_predict_execution_time = None
759
- self._partial_fit_execution_time = None
760
- self._predict_execution_time = None
761
- self._transform_execution_time = None
762
- self._score_execution_time = None
763
-
764
- # Set to partition columns when training is done with partition columns.
765
- self._fit_partition_colums_non_default = None
766
-
767
- self._is_model_installed = False
768
- self._fit_partition_unique_values = [[self._default_data_partition_value]]
769
-
770
- def modify_args(self, fp1, arg, imported_args):
771
- """
772
- Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
773
- of opensourceML is present in the argument "arg" and modify it to corresponding sklearn
774
- object.
775
- This function can also be used to write import statements to file (if "fp1" is not
776
- None). Update "imported_args" dictionary with imported module and class name to avoid
777
- importing same module and class again when writing to file. This is useful when we want to
778
- generate script from template file.
779
- Pass None to "fp1" if we don't want to write to file and just modify opensourceML sklearn
780
- object to corresponding sklearn object.
781
- """
782
- if isinstance(arg, type(self)):
783
- imported_tuple = (arg.module_name, arg.class_name)
784
- already_imported = imported_args.get(imported_tuple, False)
785
- if not already_imported:
786
- imported_args[imported_tuple] = True
787
- if fp1:
788
- fp1.write(f"from {arg.module_name} import {arg.class_name}\n")
789
- self.modify_args(fp1, arg.pos_args, imported_args)
790
- self.modify_args(fp1, arg.kwargs, imported_args)
791
- return arg.modelObj
792
- elif isinstance(arg, list):
793
- return [self.modify_args(fp1, val, imported_args) for val in arg]
794
- elif isinstance(arg, tuple):
795
- return tuple([self.modify_args(fp1, val, imported_args) for val in arg])
796
- elif type(arg).__name__ == "generator":
797
- # Raising exception as generator object can't be pickled.
798
- # TODO: ELE-6351 - Find ways to pickle generator object later.
799
- raise ValueError("Generator type/iterator is not supported for any argument. "\
800
- "Support will be added later.")
801
- elif type(arg).__name__ == "function":
802
- # Raising exception as functions/lambda functions can't be pickled.
803
- # TODO: ELE-6351 - Find ways to pickle functions later.
804
- raise ValueError("Functions are not supported for any argument. "\
805
- "Support will be added later.")
806
- elif isinstance(arg, dict):
807
- return dict(
808
- (
809
- self.modify_args(fp1, k, imported_args),
810
- self.modify_args(fp1, v, imported_args),
811
- )
812
- for k, v in arg.items()
813
- )
814
- else:
815
- return arg
816
-
817
- def _install_initial_model_file(self):
818
- """
819
- If model file(s) is/are not installed in Vantage, then install it/them.
820
- """
821
- if isinstance(self.modelObj, pd.DataFrame):
822
- # Get list of unique partition values and corresponding model object as dict.
823
- partition_values_model_dict = {}
824
- obj_list = self.modelObj.values.tolist()
825
- for lst in obj_list:
826
- partition_values_model_dict[tuple(lst[:len(lst)-1])] = lst[len(lst)-1]
827
-
828
- for partition in self._fit_partition_unique_values:
829
- # Create a new file with file name with partition values and
830
- # dump sklearn object into it. Finally install the file to Vantage.
831
- partition_join = "_".join([str(x) for x in partition])
832
- file_name = f"{self._model_file_name_prefix}_{partition_join}"
833
- # Replace '-' with '_' as '-' can't be present in file identifier.
834
- # Needed this replace because partition_columns can be negative.
835
- file_name = file_name.replace("-", "_")
836
- full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
837
- with open(full_file_name, "wb+") as fp:
838
- # Write sklearn object to file.
839
- if isinstance(self.modelObj, pd.DataFrame):
840
- # If multiple models, then write the model corresponding to the partition value.
841
- fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
842
- else:
843
- fp.write(pickle.dumps(self.modelObj))
844
- self.model_file_paths_local.add(file_name)
845
-
846
- self._install_script_file(file_identifier=file_name,
847
- file_name=file_name,
848
- is_binary=True,
849
- file_location=self._tdml_tmp_dir)
850
-
851
- if self._is_lake_system:
852
- # Need to pass env_name along with file_name for cleaning up the files in env.
853
- obj = f"{self._env.env_name}::{file_name}"
854
- if installed_model_files[obj] == 0:
855
- # Add to GC for the first time the model file (along with env name) is encountered.
856
- installed_model_files[obj] = 1
857
- GarbageCollector._add_to_garbagecollector(object_name=obj,
858
- object_type=TeradataConstants.TERADATA_APPLY)
859
- else:
860
- if installed_model_files[file_name] == 0:
861
- # Add to GC for the first time the model file is encountered.
862
- installed_model_files[file_name] = 1
863
- GarbageCollector._add_to_garbagecollector(object_name=file_name,
864
- object_type=TeradataConstants.TERADATA_SCRIPT)
865
-
866
- self._is_model_installed = True
867
-
868
- def _run_fit_related_functions(self,
869
- data,
870
- feature_columns,
871
- label_columns,
872
- partition_columns,
873
- func,
874
- classes=None):
875
- """
876
- Internal function to run fit() and partial_fit() functions.
877
- """
878
- label_columns = self._get_columns_as_list(label_columns)
879
-
880
- data, new_partition_columns = self._get_data_and_data_partition_columns(data,
881
- feature_columns,
882
- label_columns,
883
- partition_columns)
884
-
885
- model_type = BLOB() if self._is_lake_system else CLOB()
886
- return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
887
- for col in new_partition_columns] + [("model", model_type)]
888
-
889
- file_name = "sklearn_fit.py"
890
-
891
- if classes:
892
- class_type = type(classes[0]).__name__
893
- classes = "--".join([str(x) for x in classes])
894
- else:
895
- classes = str(None)
896
- class_type = str(None)
897
-
898
- data_column_types_str, partition_indices_str, _, new_partition_columns = \
899
- self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
900
-
901
- # db_name is applicable for enterprise system.
902
- db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
903
- py_exc = UtilFuncs._get_python_execution_path()
904
- script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
905
- f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
906
- f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
907
-
908
- # Get unique values in partitioning columns.
909
- self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
910
-
911
- self._install_initial_model_file()
912
-
913
- self._model_data = self._run_script(data, script_command, new_partition_columns,
914
- return_types)
915
-
916
- # Extract sklearn object(s) from the depending on the number of unique partitioning values.
917
- self.extract_sklearn_obj(n_unique_partitions=len(self._fit_partition_unique_values),
918
- n_partition_cols=len(new_partition_columns))
919
-
920
- # Need this label columns types in prediction.
921
- self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
922
- for l_c in label_columns]
923
-
924
- # If the model is trained a second time after the object creation,
925
- # or if set_params() is called after the first model training,
926
- # this flag will reset to False. So that for subsequent predict/score
927
- # operations, the newly trained model will be installed.
928
- if self._is_trained_model_installed:
929
- self._is_trained_model_installed = False
930
-
931
- def partial_fit(self, X=None, y=None, classes=None, **kwargs):
932
- """
933
- Please check the description in Docs/OpensourceML/sklearn.py.
934
- """
935
- st_time = time.time()
936
-
937
- # "classes" argument validation.
938
- arg_info_matrix = []
939
- arg_info_matrix.append(["classes", classes, True, (list)])
940
- _Validators._validate_function_arguments(arg_info_matrix)
941
-
942
- self._is_default_partition_value_fit = True # False when the user provides partition columns.
943
-
944
- data, feature_columns, label_columns, _, partition_columns = \
945
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
946
-
947
- if partition_columns:
948
- self._is_default_partition_value_fit = False
949
- self._fit_partition_colums_non_default = partition_columns
950
-
951
- self._run_fit_related_functions(data,
952
- feature_columns,
953
- label_columns,
954
- partition_columns,
955
- inspect.stack()[0][3],
956
- classes)
957
-
958
- self._partial_fit_execution_time = time.time() - st_time
959
-
960
- return self
961
-
962
- def fit(self, X=None, y=None, **kwargs):
963
- """
964
- Please check the description in Docs/OpensourceML/sklearn.py.
965
- """
966
- st_time = time.time()
967
-
968
- self._is_default_partition_value_fit = True # False when the user provides partition columns.
969
-
970
- data, feature_columns, label_columns, _, partition_columns = \
971
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
972
-
973
- if partition_columns:
974
- self._is_default_partition_value_fit = False
975
- self._fit_partition_colums_non_default = partition_columns
976
-
977
- self._run_fit_related_functions(data,
978
- feature_columns,
979
- label_columns,
980
- partition_columns,
981
- inspect.stack()[0][3])
982
-
983
- self._fit_execution_time = time.time() - st_time
984
-
985
- return self
986
-
987
- def set_params(self, **params):
988
- """
989
- Please check the description in Docs/OpensourceML/sklearn.py.
990
- """
991
- for key, val in params.items():
992
- self.kwargs[key] = val
993
-
994
- # Initialize with new arguments and return the class/model object.
995
- # set_params takes all keyword arguments and no positional arguments.
996
- self.__init__(None, self.module_name, self.class_name, tuple(), self.kwargs)
997
- return self
998
-
999
- # get_params() will be executed through __getattr__().
1000
-
1001
- # @_validate_fit_run
1002
- def __getattr__(self, name):
1003
- def __run_transform(*c, **kwargs):
1004
- kwargs["name"] = name
1005
- return self._transform(*c, **kwargs)
1006
-
1007
- def __run_function_needing_all_rows(*c, **kwargs):
1008
- kwargs["name"] = name
1009
- return self._run_function_needing_all_rows(*c, **kwargs)
1010
-
1011
- def __run_kneighbors(*c, **kwargs):
1012
- kwargs["name"] = name
1013
- return self._run_neighbors(*c, **kwargs)
1014
-
1015
- if name in ["score", "aic", "bic", "perplexity"]:
1016
- # TODO: ELE-6352 - Implement error_norm() function later.
1017
- return __run_function_needing_all_rows
1018
-
1019
- if name in ["kneighbors",
1020
- "radius_neighbors",
1021
- "kneighbors_graph",
1022
- "radius_neighbors_graph"]:
1023
- return __run_kneighbors
1024
-
1025
- if name in ["predict",
1026
- "transform",
1027
- "inverse_transform",
1028
- "predict_proba",
1029
- "predict_log_proba",
1030
- "decision_function",
1031
- "score_samples",
1032
- "decision_path",
1033
- "apply",
1034
- "cost_complexity_pruning_path",
1035
- "gibbs",
1036
- "kneighbors_graph",
1037
- "radius_neighbors_graph",
1038
- "mahalanobis",
1039
- "correct_covariance",
1040
- "reweight_covariance",
1041
- "path"]:
1042
- return __run_transform
1043
-
1044
- return super().__getattr__(name)
1045
-
1046
- def _get_return_columns_for_function_(self,
1047
- data,
1048
- feature_columns,
1049
- label_columns,
1050
- func_name,
1051
- kwargs):
1052
- """
1053
- Internal function to return list of column names and their sqlalchemy types
1054
- which should be used in return_types of Script.
1055
- """
1056
- if func_name == "fit_predict":
1057
- """
1058
- Get return columns using label_columns.
1059
- """
1060
- return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
1061
- data._td_column_names_and_sqlalchemy_types[col.lower()])
1062
- for i, col in enumerate(label_columns)]
1063
- if func_name == "predict":
1064
- """
1065
- Return predict columns using either label_columns (if provided) or
1066
- self._fit_label_columns_types (if the function is trained using label columns).
1067
- Otherwise run predict on ten rows of data to get the number of columns and their types
1068
- after this if condition.
1069
- """
1070
- if label_columns:
1071
- return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
1072
- data._td_column_names_and_sqlalchemy_types[col.lower()])
1073
- for i, col in enumerate(label_columns)]
1074
- if self._fit_label_columns_types:
1075
- return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
1076
- for i, col_type in enumerate(self._fit_label_columns_types)]
1077
-
1078
- data = data.select(feature_columns + label_columns)
1079
-
1080
- ## If function is not `fit_predict`:
1081
- # then take one row of transform/other functions to execute in client
1082
- # to get number of columns in return clause and their Vantage types.
1083
- n_f = len(feature_columns)
1084
- n_c = len(label_columns)
1085
-
1086
- # For paritioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
1087
- # Just for getting the number of columns and their types, using only one model of all.
1088
- if len(self._fit_partition_unique_values) == 1:
1089
- # Single model case.
1090
- skl_obj = self.modelObj
1091
- else:
1092
- # Multi model case.
1093
- skl_obj = self.modelObj.iloc[0]["model"]
1094
-
1095
- ten_row_data = data.head(10).get_values()
1096
- X = numpy.array(ten_row_data)
1097
- if label_columns:
1098
- y = X[:,n_f : n_f + n_c]
1099
- X = X[:,:n_f]
1100
- # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
1101
- # in local run if passed. Generally, 'y' is passed to return y along with actual output.
1102
- try:
1103
- trans_opt = getattr(skl_obj, func_name)(X, y, **kwargs)
1104
- except TypeError as ex:
1105
- # Function which does not accept 'y' like predict_proba() raises error like
1106
- # "predict_proba() takes 2 positional arguments but 3 were given".
1107
- trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
1108
- else:
1109
- trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
1110
-
1111
- if func_name == "path":
1112
- raise NotImplementedError(
1113
- "path() returns tuple of ndarrays of different shapes. Not Implemented yet."
1114
- )
1115
-
1116
- if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
1117
- trans_opt = trans_opt.reshape(X.shape[0], 1)
1118
-
1119
- if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
1120
- no_of_columns = trans_opt.get_shape()[1]
1121
- trans_opt = trans_opt.toarray()
1122
- elif isinstance(trans_opt, dict):
1123
- raise NotImplementedError(f"Output returns dictionary {trans_opt}. NOT implemented yet.")
1124
- elif isinstance(trans_opt[0], numpy.ndarray) \
1125
- or isinstance(trans_opt[0], list) \
1126
- or isinstance(trans_opt[0], tuple):
1127
- no_of_columns = len(trans_opt[0])
1128
- else:
1129
- no_of_columns = 1
1130
-
1131
- # Special handling when inverse_transform of no_of_columns returns no of rows
1132
- # less than the no of classes. Such columns are filled with NaN values.
1133
- # Updating number of columns here (new columns with NaN values will be added).
1134
- if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
1135
- no_of_columns = len(self.classes_)
1136
- for i in range(len(ten_row_data)):
1137
- trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
1138
-
1139
- # Special handling required for cross_decomposition classes's transform function, which
1140
- # takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
1141
- # y_scores. If label columns are not provided, only x_scores are returned.
1142
- if self.module_name == "sklearn.cross_decomposition" and func_name == "transform":
1143
- # For cross_decomposition, output is a tuple of arrays when label columns are provided
1144
- # along with feature columns for transform function. In this case, concatenate the
1145
- # arrays and return the column names accordingly.
1146
- if isinstance(trans_opt, tuple): # tuple when label_columns is provided.
1147
- assert trans_opt[0].shape == trans_opt[1].shape,\
1148
- "Output arrays should be of same shape when transform/fit_transform is run "\
1149
- "with label columns for cross_decomposition classes.."
1150
- first_cols = [f"x_scores_{(i + 1)}" for i in range(trans_opt[0].shape[1])]
1151
- second_cols = [f"y_scores_{(i + 1)}" for i in range(trans_opt[1].shape[1])]
1152
- no_of_columns = trans_opt[0].shape[1] + trans_opt[1].shape[1]
1153
- col_names = first_cols + second_cols
1154
-
1155
- trans_opt = numpy.concatenate(trans_opt, axis=1)
1156
- else:
1157
- assert isinstance(trans_opt, numpy.ndarray), "When transform/fit_transform is run "\
1158
- "without label columns for cross_decomposition classes, "\
1159
- "output should be a numpy array."
1160
- no_of_columns = trans_opt.shape[1]
1161
- col_names =[f"x_scores_{(i + 1)}" for i in range(trans_opt.shape[1])]
1162
- else:
1163
- # Generate list of new column names.
1164
- col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
1165
-
1166
- # Get new column sqlalchemy types for pandas df columns of transform output.
1167
- opt_pd = pd.DataFrame(trans_opt)
1168
-
1169
- # Get output column types for each column in pandas df from the output of transform
1170
- # type functions.
1171
- types = {}
1172
- for idx, col in enumerate(list(opt_pd.columns)):
1173
- # Get type of column using data from all rows, in case if the column has None values.
1174
- # 'and' of types of all values in the column with type(None) gives the type of the column.
1175
- type_ = type(None)
1176
- for i in range(len(trans_opt)):
1177
- type_ = type_ and type(trans_opt[i][idx])
1178
-
1179
- # If all the values of the output (trans_opt) is None, thelen use `str` as type since
1180
- # pandas astype() does not accept None type.
1181
- if type_ is type(None):
1182
- type_ = str
1183
-
1184
- # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
1185
- # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
1186
- # Error while type casting for column '2'"
1187
- # Hence, using pd.Int64Dtype() for integer columns with nan values.
1188
- types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
1189
-
1190
- # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
1191
- opt_pd = opt_pd.astype(types)
1192
-
1193
- # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
1194
- # TIMESTAMP(timezone=True) else map it according to default value.
1195
- col_types = [TIMESTAMP(timezone=True)
1196
- if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
1197
- else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
1198
- for key, col_name in enumerate(list(opt_pd.columns))]
1199
-
1200
- return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
1201
-
1202
- @_validate_fit_run
1203
- def _run_function_needing_all_rows(self, X=None, y=None, **kwargs):
1204
- """
1205
- Internal function to run functions like score, aic, bic which needs all rows and return
1206
- one floating number as result.
1207
- """
1208
- st_time = time.time()
1209
-
1210
- assert kwargs["name"], "function name should be passed."
1211
- func_name = kwargs["name"]
1212
-
1213
- # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
1214
- kwargs.pop("name")
1215
-
1216
- data, feature_columns, label_columns, _, partition_columns = \
1217
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
1218
-
1219
- label_columns = self._get_columns_as_list(label_columns)
1220
-
1221
- data, new_partition_columns = self._get_data_and_data_partition_columns(data,
1222
- feature_columns,
1223
- label_columns,
1224
- partition_columns)
1225
-
1226
- file_name = "sklearn_score.py"
1227
-
1228
- script_file_path = f"{file_name}" if self._is_lake_system \
1229
- else f"./{self._db_name}/{file_name}"
1230
-
1231
- data_column_types_str, partition_indices_str, _, new_partition_columns = \
1232
- self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
1233
-
1234
- self._validate_unique_partition_values(data, new_partition_columns)
1235
-
1236
- py_exc = UtilFuncs._get_python_execution_path()
1237
- script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
1238
- f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
1239
- f"{self._model_file_name_prefix} {self._is_lake_system}"
1240
-
1241
- # score, aic, bic returns float values.
1242
- return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
1243
- for col in new_partition_columns] + [(func_name, FLOAT())]
1244
-
1245
- # Checking the trained model installation. If not installed,
1246
- # install it and set flag to True.
1247
- if not self._is_trained_model_installed:
1248
- self._install_initial_model_file()
1249
- self._is_trained_model_installed = True
1250
-
1251
- opt = self._run_script(data, script_command, new_partition_columns, return_types)
1252
-
1253
- self._score_execution_time = time.time() - st_time
1254
-
1255
- if self._is_default_partition_value_fit:
1256
- # For single model case, partition column is internally generated and
1257
- # no point in returning it to the user.
1258
- return opt.select(func_name)
1259
-
1260
- return opt
1261
-
1262
- @_validate_fit_run
1263
- def _transform(self, X=None, y=None, **kwargs):
1264
- """
1265
- Internal function to run predict/transform and similar functions, which returns
1266
- multiple columns. This function will return data row along with the generated
1267
- columns' row data, unlike sklearn's functions which returns just output data.
1268
- """
1269
- st_time = time.time()
1270
-
1271
- assert kwargs["name"], "function name should be passed."
1272
- func_name = kwargs["name"]
1273
-
1274
- # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
1275
- kwargs.pop("name")
1276
-
1277
- data, feature_columns, label_columns, _, partition_columns = \
1278
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
1279
-
1280
- data, new_partition_columns = self._get_data_and_data_partition_columns(data,
1281
- feature_columns,
1282
- label_columns,
1283
- partition_columns)
1284
-
1285
- # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
1286
- if "data" in kwargs:
1287
- kwargs.pop("data")
1288
- if "feature_columns" in kwargs:
1289
- kwargs.pop("feature_columns")
1290
- if "group_columns" in kwargs:
1291
- kwargs.pop("group_columns")
1292
- if "partition_columns" in kwargs:
1293
- kwargs.pop("partition_columns")
1294
- if "label_columns" in kwargs:
1295
- kwargs.pop("label_columns")
1296
-
1297
- file_name = "sklearn_transform.py"
1298
-
1299
- script_file_path = f"{file_name}" if self._is_lake_system \
1300
- else f"./{self._db_name}/{file_name}"
1301
-
1302
- data_column_types_str, partition_indices_str, _, new_partition_columns = \
1303
- self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
1304
-
1305
- self._validate_unique_partition_values(data, new_partition_columns)
1306
-
1307
- py_exc = UtilFuncs._get_python_execution_path()
1308
- script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
1309
- f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
1310
- f"{self._model_file_name_prefix} {self._is_lake_system}"
1311
-
1312
- # Returning feature columns also along with transformed columns because we don't know the
1313
- # mapping of feature columns to the transformed columns.
1314
- return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
1315
- for col in (new_partition_columns + feature_columns)]
1316
- if func_name in ["predict", "decision_function"] and label_columns:
1317
- return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
1318
- for col in label_columns]
1319
-
1320
- return_types += self._get_return_columns_for_function_(data,
1321
- feature_columns,
1322
- label_columns,
1323
- func_name,
1324
- kwargs)
1325
-
1326
- # Checking the trained model installation. If not installed,
1327
- # install it and set flag to True.
1328
- if not self._is_trained_model_installed:
1329
- self._install_initial_model_file()
1330
- self._is_trained_model_installed = True
1331
-
1332
- opt = self._run_script(data, script_command, new_partition_columns, return_types)
1333
-
1334
- self._transform_execution_time = time.time() - st_time
1335
-
1336
- return self._get_returning_df(opt, new_partition_columns, return_types)
1337
-
1338
def fit_predict(self, X=None, y=None, **kwargs):
    """
    Please check the description in Docs/OpensourceML/sklearn.py.
    """
    st_time = time.time()

    # Name used to build the output column names. It is a constant, so there is
    # no need to walk the interpreter stack to discover it.
    func_name = "fit_predict"

    self._is_default_partition_value_fit = True  # False when the user provides partition columns.

    data, feature_columns, label_columns, _, partition_columns = \
        self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)

    if partition_columns:
        self._is_default_partition_value_fit = False

    data, new_partition_columns = self._get_data_and_data_partition_columns(data,
                                                                            feature_columns,
                                                                            label_columns,
                                                                            partition_columns)

    # Return label_columns also if user provides in the function call.
    return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                    for col in (new_partition_columns + feature_columns + label_columns)]

    if label_columns:
        return_types += self._get_return_columns_for_function_(data,
                                                               feature_columns,
                                                               label_columns,
                                                               func_name,
                                                               {})
    else:
        # If there are no label_columns, we will have only one
        # predicted column.
        return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]

    file_name = "sklearn_fit_predict.py"

    data_column_types_str, partition_indices_str, _, new_partition_columns = \
        self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)

    script_file_name = f"{file_name}" if self._is_lake_system \
        else f"./{self._db_name}/{file_name}"
    py_exc = UtilFuncs._get_python_execution_path()
    script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
                     f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
                     f"{self._model_file_name_prefix} {self._is_lake_system}"

    # Get unique values in partitioning columns.
    self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()

    # Checking the trained model installation. If not installed,
    # install it and set flag to True.
    if not self._is_trained_model_installed:
        self._install_initial_model_file()
        self._is_trained_model_installed = True

    opt = self._run_script(data, script_command, new_partition_columns, return_types)

    self._fit_predict_execution_time = time.time() - st_time

    # For the single model case the partition column is internally generated;
    # the shared helper drops it from the result, same as the other methods do.
    return self._get_returning_df(opt, new_partition_columns, return_types)
1407
-
1408
def fit_transform(self, X=None, y=None, **kwargs):
    """
    Please check the description in Docs/OpensourceML/sklearn.py.
    """
    fitted = self.fit(X, y, **kwargs)

    # transform() does not need 'y', so the label columns are blanked out
    # before the same keyword arguments are forwarded.
    transform_kwargs = {**kwargs, "label_columns": None}
    return fitted.transform(X, None, **transform_kwargs)
1416
-
1417
@_validate_fit_run
def _run_neighbors(self, X=None, **kwargs):
    """
    Internal function to run functions like kneighbors, radius_neighbors, kneighbors_graph,
    radius_neighbors_graph which returns multiple columns. This function will return data row
    along with the generated columns' row data, unlike sklearn's functions which returns just
    output data.

    The name of the function to run is passed in kwargs under the key "name"; the remaining
    kwargs (after the data related ones are removed) are forwarded to that function.
    """
    assert kwargs["name"], "function name should be passed."
    func_name = kwargs.pop("name")

    if self.module_name != "sklearn.neighbors":
        raise AttributeError(f"{self.module_name+'.'+self.class_name} does not have {func_name}() method.")

    data = kwargs.get("data", None)
    partition_columns = kwargs.get("partition_columns", None)

    if not X and not partition_columns and not data:
        # If data is not passed, then run from client only.
        # TODO: decide whether to run from client or from Vantage.
        opt = super().__getattr__(func_name)(**kwargs)
        # Import from the public namespace; "scipy.sparse.csr" is a deprecated
        # private module.
        from scipy.sparse import csr_matrix
        if isinstance(opt, csr_matrix):
            return opt.toarray()
        return opt

    self._is_default_partition_value_fit = True  # False when the user provides partition columns.

    data, feature_columns, _, _, new_partition_columns = \
        self._validate_args_and_get_data(X=X, y=None, groups=None, kwargs=kwargs,
                                         skip_either_or_that=True)

    # kwargs are passed on to the neighbors function, so remove every data
    # related argument exactly once. Popping with a default keeps this safe
    # whether or not the caller supplied the argument.
    for data_arg in ("data", "partition_columns", "feature_columns", "label_columns"):
        kwargs.pop(data_arg, None)

    if partition_columns:
        self._is_default_partition_value_fit = False

    # Generating new partition column name.
    data, new_partition_columns = self._get_data_and_data_partition_columns(data,
                                                                            feature_columns,
                                                                            [],
                                                                            partition_columns)

    args_str = self._get_kwargs_str(kwargs)

    file_name = "sklearn_neighbors.py"

    script_file_path = f"{file_name}" if self._is_lake_system \
        else f"./{self._db_name}/{file_name}"

    # Returning feature columns also along with new columns.
    return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                    for col in (new_partition_columns + feature_columns)]

    # `return_distance` is needed as the result is a tuple of two arrays when it is True.
    return_distance = kwargs.get("return_distance", True)  # Default value is True.

    # Though new columns return numpy arrays, we are returning them as strings.
    # TODO: Will update to columns later, if requested later.
    if func_name in ['kneighbors', 'radius_neighbors']:
        if return_distance:
            return_types += [("neigh_dist", VARCHAR())]
        return_types += [("neigh_ind", VARCHAR())]
    elif func_name in ['kneighbors_graph', 'radius_neighbors_graph']:
        return_types += [("A", VARCHAR())]
    else:
        return_types += [("output", VARCHAR())]

    data_column_types_str, partition_indices_str, _, new_partition_columns = \
        self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)

    py_exc = UtilFuncs._get_python_execution_path()
    script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
                     f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
                     f"{args_str}"

    # Get unique values in partitioning columns.
    self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()

    # Checking the trained model installation. If not installed,
    # install it and set flag to True.
    if not self._is_trained_model_installed:
        self._install_initial_model_file()
        self._is_trained_model_installed = True

    opt = self._run_script(data, script_command, new_partition_columns, return_types)

    return self._get_returning_df(opt, new_partition_columns, return_types)
1512
-
1513
def split(self, X=None, y=None, groups=None, **kwargs):
    """
    Please check the description in Docs/OpensourceML/sklearn.py.

    Generator: yields one (train_df, test_df) pair per split id found in the
    result of the "split" model selection run.
    """
    opt = self._run_model_selection("split", X=X, y=y, groups=groups,
                                    skip_either_or_that=True, kwargs=kwargs)

    # Get number of splits in the result DataFrame.
    n_splits = opt.drop_duplicate("split_id").shape[0]

    data = kwargs.get("data", None)

    # Normalise every column argument the same way, so that a single column
    # name (or None) behaves like a list in the concatenation and membership
    # tests below.
    feature_columns = self._get_columns_as_list(kwargs.get("feature_columns", []))
    label_columns = self._get_columns_as_list(kwargs.get("label_columns", []))
    partition_columns = self._get_columns_as_list(kwargs.get("partition_columns", []))

    # If "data" is not used, derive feature_columns and label_columns from X and y.
    if X and not data and not feature_columns:
        feature_columns = [col for col in X.columns if col not in partition_columns]
    if y and not data and not label_columns:
        label_columns = y.columns

    selected_columns = partition_columns + feature_columns + label_columns

    # Return iterator of the train and test dataframes for each split.
    for i in range(1, n_splits + 1):
        train_df = opt[(opt.split_id == i) & (opt.data_type == "train")]\
            .select(selected_columns)
        train_df._index_label = None
        test_df = opt[(opt.split_id == i) & (opt.data_type == "test")]\
            .select(selected_columns)
        test_df._index_label = None

        yield train_df, test_df
1543
-
1544
def get_n_splits(self, X=None, y=None, groups=None, **kwargs):
    """
    Please check the description in Docs/OpensourceML/sklearn.py.
    """
    # Delegate to the shared model selection runner; the extra keyword
    # arguments are forwarded as a single dictionary.
    selection_args = {
        "X": X,
        "y": y,
        "groups": groups,
        "skip_either_or_that": True,
        "kwargs": kwargs,
    }
    return self._run_model_selection("get_n_splits", **selection_args)
1550
-
1551
def _run_model_selection(self,
                         func_name,
                         X=None,
                         y=None,
                         groups=None,
                         skip_either_or_that=False,
                         kwargs=None):
    """
    Internal function to run functions like split, get_n_splits of model selection module.
    - get_n_splits() returns number of splits as value, not as teradataml DataFrame.
    - split() returns teradataml DataFrame containing train and test data for each split
      (add partition information if the argument "partition_cols" is provided).

    "kwargs" is the dictionary of keyword arguments of the public caller. It defaults to
    None (treated as an empty dictionary) so that no dictionary is shared between calls.

    Raises AttributeError when the wrapped class is not from "sklearn.model_selection".
    """
    if self.module_name != "sklearn.model_selection":
        raise AttributeError(f"{self.module_name+'.'+self.class_name} does not "
                             f"have {func_name}() method.")

    if kwargs is None:
        kwargs = {}

    data = kwargs.get("data", None)

    if not X and not y and not groups and not data:
        # If data is not passed, then run from client only.
        # TODO: decide whether to run from client or from Vantage.
        return super().__getattr__(func_name)()

    self._is_default_partition_value_fit = True  # False when the user provides partition columns.

    data, feature_columns, label_columns, group_columns, partition_columns = \
        self._validate_args_and_get_data(X=X, y=y, groups=groups, kwargs=kwargs,
                                         skip_either_or_that=skip_either_or_that)

    if partition_columns:
        self._is_default_partition_value_fit = False

    data, new_partition_columns = self._get_data_and_data_partition_columns(data,
                                                                            feature_columns,
                                                                            label_columns,
                                                                            partition_columns,
                                                                            group_columns)

    file_name = "sklearn_model_selection_split.py"

    script_file_path = f"{file_name}" if self._is_lake_system \
        else f"./{self._db_name}/{file_name}"

    if func_name == "split":
        # Need to generate data into splits of train and test.
        #   split_id - the column which will be used to identify the split.
        #   data_type - the column which will be used to identify whether the row is
        #               train or test row.
        return_types = [("split_id", INTEGER()), ("data_type", VARCHAR())]
        # Returning feature columns and label columns as well.
        return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in (feature_columns + label_columns)]
    else:
        # Return Varchar by default.
        # Returns Varchar even for functions like `get_n_splits` which returns large integer
        # numbers like `4998813702034726525205100` for `LeavePOut` class (when the argument
        # `p` is 28 and no of data rows is 100) as Vantage cannot scope it to INTEGER.
        return_types = [(func_name, VARCHAR())]

    # Partition columns always come first in the returned columns.
    return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                    for col in new_partition_columns] + return_types

    data_column_types_str, partition_indices_str, _, new_partition_columns = \
        self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)

    py_exc = UtilFuncs._get_python_execution_path()
    script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
                     f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
                     f"{self._model_file_name_prefix} {self._is_lake_system}"

    # Get unique values in partitioning columns.
    self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()

    # Checking the trained model installation. If not installed,
    # install it and set flag to True.
    if not self._is_trained_model_installed:
        self._install_initial_model_file()
        self._is_trained_model_installed = True

    opt = self._run_script(data, script_command, new_partition_columns, return_types)

    if func_name == "get_n_splits" and not partition_columns:
        # Return number of splits as value, not as dataframe.
        vals = execute_sql("select {} from {}".format(func_name, opt._table_name))
        opt = vals.fetchall()[0][0]

        # Varchar is returned by the script. Convert it to int.
        return int(opt)

    return opt
1642
-
1643
def _get_returning_df(self, script_df, partition_column, returns):
    """
    Internal function to return the teradataml DataFrame without the
    partition column(s) when the partition column was generated internally.

    "returns" is the list of (column name, type) pairs describing "script_df";
    its first len(partition_column) entries are the partition columns.
    """
    if not self._is_default_partition_value_fit:
        # User supplied partition columns are part of the result.
        return script_df

    # Single model case: the partition column is internally generated, so
    # there is no point in returning it to the user.
    skip = len(partition_column)
    visible_columns = [spec[0] for spec in returns[skip:]]
    return script_df.select(visible_columns)
1656
-
1657
-
1658
class _SKLearnFunctionWrapper(_GenericObjectWrapper):
    """
    Wrapper which runs a function of an sklearn module through a script that is
    generated from a template, installed, executed and removed again.
    """

    def __init__(self, module_name, func_name):
        super().__init__()
        self.__module_name = module_name
        self.__func_name = func_name
        # Non-DataFrame keyword arguments of the latest call.
        self.__params = None
        # DataFrame keyword arguments of the latest call, keyed by argument name.
        self.__data_args = OrderedDict()
        self._model_file_name = _generate_new_name(type="file_function", extension="py")

    def __call__(self, **kwargs):
        """
        Run the function with all the arguments passed from `td_sklearn.<function_name>` function.
        """
        data_columns = []

        # Always remove "partition_columns" from kwargs, even when it is None or
        # empty, so that it is never forwarded as a parameter of the function.
        partition_cols = self._get_columns_as_list(kwargs.pop("partition_columns", None))

        # Forget DataFrame arguments of any earlier call on this object; only
        # the arguments of this call may be popped from kwargs later on.
        self.__data_args.clear()

        # Separate dataframe related arguments and their column names from actual kwargs.
        for k, v in kwargs.items():
            if isinstance(v, DataFrame):
                # All dataframes should be select of parent dataframe.
                _validate_df_query_type(v, "select", k)

                # Save all columns in dataframe related arguments.
                data_columns.extend(v.columns)

                self.__data_args[k] = v

        # Get common parent dataframe from all dataframes.
        self.__tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self.__data_args.values()))

        self._validate_existence_of_partition_columns(partition_cols, self.__tdml_df.columns)

        self.__tdml_df = self.__tdml_df.select(data_columns + partition_cols)

        self.__tdml_df, partition_cols = self._get_data_and_data_partition_columns(self.__tdml_df,
                                                                                   data_columns,
                                                                                   [],
                                                                                   partition_cols
                                                                                   )

        # Prepare string of data arguments with name, indices where columns of that argument resides
        # and types of each of the column. This also removes the DataFrame arguments from kwargs.
        data_args_str = self._prepare_data_args_string(kwargs)

        self.__params = kwargs

        # Get indices of partition_columns and types of all columns.
        data_column_types_str, partition_indices_str, _, partition_cols = \
            self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)

        script_file_path = f"{self._model_file_name}" if self._is_lake_system \
            else f"./{self._db_name}/{self._model_file_name}"

        model_file_prefix = None
        if self._is_lake_system:
            model_file_prefix = self._model_file_name.replace(".py", "")

        py_exc = UtilFuncs._get_python_execution_path()
        script_command = (f"{py_exc} {script_file_path} {partition_indices_str} "
                          f"{data_column_types_str} {data_args_str} {self._is_lake_system}"
                          f" {model_file_prefix}")

        model_type = BLOB() if self._is_lake_system else CLOB()
        return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols] + [(self.__func_name, model_type)]

        # Generate new file in .teradataml directory and install it to Vantage.
        self._prepare_and_install_file()

        self._model_data = self._run_script(self.__tdml_df, script_command, partition_cols, return_types)
        self._model_data._index_label = None

        fit_partition_unique_values = self.__tdml_df.drop_duplicate(partition_cols).get_values()

        self.extract_sklearn_obj(n_unique_partitions=len(fit_partition_unique_values),
                                 n_partition_cols=len(partition_cols))

        # File cleanup after processing.
        os.remove(self._model_file_local)
        self._remove_script_file(self._model_file_name)

        return self.modelObj

    def _prepare_data_args_string(self, kwargs):
        """
        Get column indices and types of each data related arguments in the format:
        "{<arg_name>-<comma separated indices>-<comma separated types>}--
         {<arg_name>-<comma separated indices>-<comma separated types>}"

        The DataFrame arguments are removed from "kwargs" as a side effect.
        """
        data_args_str = []
        for arg_name, arg_df in list(self.__data_args.items()):
            # Remove DataFrame arguments from kwargs, which will be passed to Script.
            kwargs.pop(arg_name)

            # Get column indices and their types for each dataframe from parent dataframe.
            _, indices_str, types_str, _ = \
                self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
                                                                             arg_df.columns,
                                                                             idx_delim=",",
                                                                             types_delim=",")

            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
            data_args_str.append(f"{arg_name}-{indices_str}-{types_str}")

        # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
        #         {<arg_name>-<comma separated indices>-<comma separated types>}"
        return "--".join(data_args_str)

    def _validate_existence_of_partition_columns(self, partition_columns, all_columns):
        """
        Validate if columns in "partition_columns" argument are present in any of the given
        dataframes. Raises ValueError naming the missing columns otherwise.
        """
        invalid_part_cols = [c for c in partition_columns if c not in all_columns]

        if invalid_part_cols:
            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
                                                  ", ".join(invalid_part_cols),
                                                  "', '".join(list(self.__data_args.keys())))
                             )

    def _prepare_and_install_file(self):
        """
        Prepare function script file from template file and install it in Vantage.
        """
        with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
            script_data = fp.read()

        # Fill the placeholders of the template with the module, the function
        # and the JSON encoded parameters of this call.
        script_data = script_data.replace("<module_name>", self.__module_name).\
            replace("<func_name>", self.__func_name).replace("<params>", json.dumps(self.__params))

        self._model_file_local = os.path.join(self._tdml_tmp_dir, self._model_file_name)

        with open(self._model_file_local, "w") as fp:
            fp.write(script_data)

        self._install_script_file(file_identifier=self._model_file_name.split(".")[0],
                                  file_name=self._model_file_name,
                                  file_location=self._tdml_tmp_dir)
1800
-