teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
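The headline change in this release is the rework of the teradataml.opensource module: the old opensource/sklearn/ subpackage (_class.py, _sklearn_wrapper.py) is removed and replaced by a package-agnostic layer (_base.py, _class.py, _sklearn.py, _constants.py) plus new LightGBM support (_lightgbm.py and the lightgbm/*.template scripts). A minimal sketch of how these interfaces are typically exercised follows; it is hedged, since the td_sklearn entry point, table name, and column names here are assumptions based on the module layout and the fit() keyword arguments visible in the diff below, not something this diff confirms end to end.

# Hypothetical usage sketch of the reworked OpenSourceML layer; the
# td_sklearn entry point, table and column names are assumptions.
from teradataml import create_context, DataFrame, td_sklearn

create_context(host="<host>", username="<user>", password="<pass>")

train = DataFrame("housing_train")  # assumed pre-loaded table

# The interface object mirrors the sklearn class; fit() pushes execution
# into Vantage via the Script/Apply table operators wrapped in _base.py.
lr = td_sklearn.LinearRegression(fit_intercept=True)
lr.fit(data=train, feature_columns=["area", "rooms"], label_columns="price")

# Passing partition_columns trains one model per partition value
# (the multi-model path handled by _OpenSourceObjectWrapper below).
lr_multi = td_sklearn.LinearRegression()
lr_multi.fit(data=train, feature_columns=["area", "rooms"],
             label_columns="price", partition_columns="city_id")

The diff below reproduces the new teradataml/opensource/_base.py.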
|
@@ -0,0 +1,1466 @@
|
|
|
1
|
+
# ##################################################################
|
|
2
|
+
#
|
|
3
|
+
# Copyright 2023 Teradata. All rights reserved.
|
|
4
|
+
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
|
+
#
|
|
6
|
+
# Primary Owner: Adithya Avvaru (adithya.avvaru@teradata.com)
|
|
7
|
+
# Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
|
|
8
|
+
#
|
|
9
|
+
# Version: 1.0
|
|
10
|
+
# Function Version: 1.0
|
|
11
|
+
#
|
|
12
|
+
# This file contains object wrapper class for opensource packages and child object
|
|
13
|
+
# wrapper classes for each opensource package. Currently, we have child object
|
|
14
|
+
# wrapper class for scikit-learn.
|
|
15
|
+
#
|
|
16
|
+
# ##################################################################
|
|
17
|
+
|
|
18
|
+
import base64
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import pickle
|
|
22
|
+
import warnings
|
|
23
|
+
from collections import OrderedDict, defaultdict
|
|
24
|
+
from importlib import import_module
|
|
25
|
+
|
|
26
|
+
import pandas as pd
|
|
27
|
+
from teradataml.scriptmgmt.lls_utils import list_user_envs
|
|
28
|
+
from teradatasqlalchemy import BLOB, CLOB
|
|
29
|
+
|
|
30
|
+
from teradataml import _TDML_DIRECTORY, Apply, Script, TeradataMlException
|
|
31
|
+
from teradataml.catalog.byom import delete_byom, retrieve_byom, save_byom
|
|
32
|
+
from teradataml.common import pylogger
|
|
33
|
+
from teradataml.common.constants import TeradataConstants
|
|
34
|
+
from teradataml.common.garbagecollector import GarbageCollector
|
|
35
|
+
from teradataml.common.messagecodes import MessageCodes
|
|
36
|
+
from teradataml.common.messages import Messages
|
|
37
|
+
from teradataml.common.utils import UtilFuncs
|
|
38
|
+
from teradataml.common.warnings import OneTimeUserWarning
|
|
39
|
+
from teradataml.context.context import (_get_current_databasename,
|
|
40
|
+
get_connection)
|
|
41
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
42
|
+
from teradataml.dataframe.dataframe_utils import DataFrameUtils
|
|
43
|
+
from teradataml.dbutils.dbutils import (_create_table,
|
|
44
|
+
execute_sql, set_session_param)
|
|
45
|
+
from teradataml.dbutils.filemgr import install_file, remove_file
|
|
46
|
+
from teradataml.opensource._constants import (
|
|
47
|
+
_OSML_ADDITIONAL_COLUMN_TYPES, _OSML_MODELS_PRIMARY_INDEX,
|
|
48
|
+
_OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, _OSML_MODELS_TABLE_NAME,
|
|
49
|
+
OpensourceModels, OpenSourcePackage, _packages_verified_in_vantage)
|
|
50
|
+
from teradataml.opensource._wrapper_utils import (_generate_new_name,
|
|
51
|
+
_validate_df_query_type)
|
|
52
|
+
from teradataml.options.configure import configure
|
|
53
|
+
from teradataml.utils.validators import _Validators
|
|
54
|
+
|
|
55
|
+
logger = pylogger.getLogger()
|
|
56
|
+
|
|
57
|
+
validator = _Validators()
|
|
58
|
+
|
|
59
|
+
installed_model_files = defaultdict(int)
|
|
60
|
+
|
|
61
|
+
## Flag to ensure the sklearn script
|
|
62
|
+
## installation occurs only once.
|
|
63
|
+
_file_installed = False
|
|
64
|
+
|
|
65
|
+
class _GenericObjectWrapper:
|
|
66
|
+
def __init__(self) -> None:
|
|
67
|
+
if not get_connection():
|
|
68
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.INVALID_CONTEXT_CONNECTION),
|
|
69
|
+
MessageCodes.INVALID_CONTEXT_CONNECTION)
|
|
70
|
+
self._db_name = _get_current_databasename()
|
|
71
|
+
|
|
72
|
+
self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "sklearn")
|
|
73
|
+
|
|
74
|
+
# Some random number to be used as partition value if partition_columns is None for fit().
|
|
75
|
+
self._default_data_partition_value = -1001
|
|
76
|
+
|
|
77
|
+
self.modelObj = None
|
|
78
|
+
self._model_data = None
|
|
79
|
+
|
|
80
|
+
self._tdml_tmp_dir = GarbageCollector._get_temp_dir_name()
|
|
81
|
+
|
|
82
|
+
self._env = None
|
|
83
|
+
|
|
84
|
+
self._is_lake_system = UtilFuncs._is_lake()
|
|
85
|
+
|
|
86
|
+
if self._is_lake_system:
|
|
87
|
+
if configure.openml_user_env is not None:
|
|
88
|
+
self._env = configure.openml_user_env
|
|
89
|
+
else:
|
|
90
|
+
self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
|
|
91
|
+
|
|
92
|
+
# Raise exception when python versions don't match between Lake user environment and local.
|
|
93
|
+
self._process_python_version_diff_lake()
|
|
94
|
+
|
|
95
|
+
else:
|
|
96
|
+
set_session_param("searchuifdbpath",self._db_name)
|
|
97
|
+
from teradataml.dbutils.dbutils import db_python_version_diff
|
|
98
|
+
if len(db_python_version_diff()) > 0:
|
|
99
|
+
# Raise exception when python versions don't match between Vantage and local.
|
|
100
|
+
from teradataml.context import context as tdmlctx
|
|
101
|
+
py_major_vantage_version = tdmlctx.python_version_vantage.rsplit(".", 1)[0]
|
|
102
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.PYTHON_VERSION_MISMATCH,
|
|
103
|
+
tdmlctx.python_version_vantage, py_major_vantage_version),
|
|
104
|
+
MessageCodes.PYTHON_VERSION_MISMATCH)
|
|
105
|
+
|
|
106
|
+
# Raise warning when python package versions don't match between Vantage and local.
|
|
107
|
+
self._process_package_differences()
|
|
108
|
+
|
|
109
|
+
global _file_installed
|
|
110
|
+
## Flag to check whether trained model is installed or not.
|
|
111
|
+
self._is_trained_model_installed = False
|
|
112
|
+
|
|
113
|
+
## Install all sklearn script files on Vantage.
|
|
114
|
+
if not _file_installed:
|
|
115
|
+
sklearn_script_files = ["sklearn_fit.py", "sklearn_score.py",
|
|
116
|
+
"sklearn_transform.py", "sklearn_fit_predict.py",
|
|
117
|
+
"sklearn_neighbors.py", "sklearn_model_selection_split.py"]
|
|
118
|
+
for script_file in sklearn_script_files:
|
|
119
|
+
self._install_script_file(file_identifier=script_file.split(".")[0],
|
|
120
|
+
file_name=script_file)
|
|
121
|
+
|
|
122
|
+
_file_installed = True
|
|
123
|
+
|
|
124
|
+
def _process_python_version_diff_lake(self):
|
|
125
|
+
"""
|
|
126
|
+
DESCRIPTION:
|
|
127
|
+
Internal function to process Python interpreter version differences between Lake user environment and local.
|
|
128
|
+
Note:
|
|
129
|
+
* Raises an exception if the Python interpreter major versions are not consistent between
|
|
130
|
+
Lake user environment and local.
|
|
131
|
+
|
|
132
|
+
PARAMETERS:
|
|
133
|
+
None
|
|
134
|
+
|
|
135
|
+
RETURNS:
|
|
136
|
+
None
|
|
137
|
+
|
|
138
|
+
RAISES:
|
|
139
|
+
TeradataMlException
|
|
140
|
+
|
|
141
|
+
EXAMPLES:
|
|
142
|
+
self._process_python_version_diff_lake()
|
|
143
|
+
"""
|
|
144
|
+
# Get the Python interpreter version of the user environment.
|
|
145
|
+
env_list = list_user_envs()
|
|
146
|
+
user_env_name = self._env.env_name
|
|
147
|
+
env_base_version = env_list[env_list['env_name'] == user_env_name].base_env_name.values[0]
|
|
148
|
+
python_env = env_base_version.split("_")[1]
|
|
149
|
+
|
|
150
|
+
# Get the Python interpreter version of the local environment.
|
|
151
|
+
from teradataml.context import context as tdmlctx
|
|
152
|
+
python_local = tdmlctx.python_version_local.rsplit(".", 1)[0]
|
|
153
|
+
# Check if the Python interpreter major versions are consistent between Lake user environment and local.
|
|
154
|
+
# If not, raise an exception.
|
|
155
|
+
if python_env != python_local:
|
|
156
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.PYTHON_VERSION_MISMATCH_OAF,
|
|
157
|
+
python_env, python_local),
|
|
158
|
+
MessageCodes.PYTHON_VERSION_MISMATCH_OAF)
|
|
159
|
+
|
|
160
|
+
def _process_package_differences(self):
|
|
161
|
+
"""
|
|
162
|
+
DESCRIPTION:
|
|
163
|
+
Internal function to process package differences between Vantage and local.
|
|
164
|
+
Note:
|
|
165
|
+
* Raises a warning if the versions of certain Python packages are not consistent between Vantage and local.
|
|
166
|
+
|
|
167
|
+
PARAMETERS:
|
|
168
|
+
None
|
|
169
|
+
|
|
170
|
+
RETURNS:
|
|
171
|
+
None
|
|
172
|
+
|
|
173
|
+
RAISES:
|
|
174
|
+
PackageNotFoundError: If the package is not found in the local environment for OAF.
|
|
175
|
+
|
|
176
|
+
EXAMPLES:
|
|
177
|
+
self._process_package_differences()
|
|
178
|
+
"""
|
|
179
|
+
# OPENSOURCE_PACKAGE_NAME is set for each opensource package, but not for the base class.
|
|
180
|
+
# Add a check to avoid running this function for the base class.
|
|
181
|
+
if self.OPENSOURCE_PACKAGE_NAME is None:
|
|
182
|
+
return
|
|
183
|
+
_is_packages_verfied_in_vantage = _packages_verified_in_vantage.get(
|
|
184
|
+
self.OPENSOURCE_PACKAGE_NAME.value, None)
|
|
185
|
+
if _is_packages_verfied_in_vantage:
|
|
186
|
+
return
|
|
187
|
+
|
|
188
|
+
if self._is_lake_system:
|
|
189
|
+
env_pkg_df = self._env.libs
|
|
190
|
+
pkgs_dict = dict(zip(env_pkg_df['name'], env_pkg_df['version']))
|
|
191
|
+
|
|
192
|
+
i = 0
|
|
193
|
+
from importlib.metadata import version
|
|
194
|
+
warning_raised = False
|
|
195
|
+
for pkg in self._pkgs:
|
|
196
|
+
env_version = pkgs_dict.get(pkg)
|
|
197
|
+
try:
|
|
198
|
+
local_version = version(pkg)
|
|
199
|
+
except Exception as e:
|
|
200
|
+
raise
|
|
201
|
+
if env_version != local_version:
|
|
202
|
+
warning_raised = True
|
|
203
|
+
if i == 0:
|
|
204
|
+
strr = f"{pkg}=={local_version}\n"
|
|
205
|
+
i += 1
|
|
206
|
+
else:
|
|
207
|
+
strr += f"{pkg}=={local_version}\n"
|
|
208
|
+
|
|
209
|
+
# If there are differences in package versions, display a warning message to the user.
|
|
210
|
+
# about the package differences and the requirements file created for the user to install the packages
|
|
211
|
+
if warning_raised:
|
|
212
|
+
file_name = f"requirements_{self.OPENSOURCE_PACKAGE_NAME.value}.txt"
|
|
213
|
+
req_file = os.path.join(self._tdml_tmp_dir, file_name)
|
|
214
|
+
with open(req_file, "w") as f:
|
|
215
|
+
f.write(strr)
|
|
216
|
+
_pkgs = "', '".join(self._pkgs[:-1]) + "' and '" + self._pkgs[-1]
|
|
217
|
+
warning_msg = "The versions of certain Python packages are not consistent between "\
|
|
218
|
+
"Lake user environment and local. OpenSourceML compares the versions of '{}' "\
|
|
219
|
+
f"(and also matches the patterns of these packages) used by 'td_{self.OPENSOURCE_PACKAGE_NAME.value}'. "\
|
|
220
|
+
"Teradata recommends same versions for all the Python packages between Lake "\
|
|
221
|
+
"user environment and local."
|
|
222
|
+
req = f"\nA requirements file listing all '{self.OPENSOURCE_PACKAGE_NAME.value}' " + \
|
|
223
|
+
f"related packages and their versions has been written to '{req_file}'. "+ \
|
|
224
|
+
"Update the Lake user environment with the required packages.\n"
|
|
225
|
+
|
|
226
|
+
warning_msg += req
|
|
227
|
+
warnings.warn(warning_msg.format(_pkgs), category=OneTimeUserWarning)
|
|
228
|
+
|
|
229
|
+
else:
|
|
230
|
+
# Check if the versions of Python packages are consistent between Vantage and local.
|
|
231
|
+
from teradataml.dbutils.dbutils import _db_python_package_version_diff
|
|
232
|
+
all_package_versions = _db_python_package_version_diff(self._pkgs, only_diff=False)
|
|
233
|
+
package_difference = \
|
|
234
|
+
all_package_versions[all_package_versions.vantage != all_package_versions.local]
|
|
235
|
+
# If there are differences in package versions, raise a warning.
|
|
236
|
+
if package_difference.shape[0] > 0:
|
|
237
|
+
_pkgs = "', '".join(self._pkgs[:-1]) + "' and '" + self._pkgs[-1]
|
|
238
|
+
warning_msg = "The versions of certain Python packages are not consistent between "\
|
|
239
|
+
"Vantage and local. User can identify them using db_python_package_version_diff() "\
|
|
240
|
+
"function. OpenSourceML compares the versions of '{}' (and also matches the "\
|
|
241
|
+
f"patterns of these packages) used by 'td_{self.OPENSOURCE_PACKAGE_NAME.value}'. Teradata "\
|
|
242
|
+
"recommends to maintain same versions for all the Python packages between Vantage "\
|
|
243
|
+
"and local."
|
|
244
|
+
i = 0
|
|
245
|
+
# Write the requirements file listing all the related packages and their versions.
|
|
246
|
+
for rec in all_package_versions.to_records():
|
|
247
|
+
if i == 0:
|
|
248
|
+
strr = f"{rec[1]}=={rec[2]}\n"
|
|
249
|
+
i += 1
|
|
250
|
+
else:
|
|
251
|
+
strr += f"{rec[1]}=={rec[2]}\n"
|
|
252
|
+
file_name = f"requirements_{self.OPENSOURCE_PACKAGE_NAME.value}.txt"
|
|
253
|
+
req_file = os.path.join(self._tdml_tmp_dir, file_name)
|
|
254
|
+
with open(req_file, "w") as f:
|
|
255
|
+
f.write(strr)
|
|
256
|
+
|
|
257
|
+
# Display a warning message to the user about the package differences
|
|
258
|
+
# and the requirements file created for the user to install the packages.
|
|
259
|
+
req = f"\nA requirements file listing all '{self.OPENSOURCE_PACKAGE_NAME.value}' " + \
|
|
260
|
+
f"related packages and their versions has been written to '{req_file}'.\n"
|
|
261
|
+
|
|
262
|
+
warning_msg += req
|
|
263
|
+
warnings.warn(warning_msg.format(_pkgs), category=OneTimeUserWarning)
|
|
264
|
+
|
|
265
|
+
_packages_verified_in_vantage[self.OPENSOURCE_PACKAGE_NAME.value] = True
|
|
266
|
+
|
|
267
|
+
def _get_columns_as_list(self, cols):
|
|
268
|
+
"""
|
|
269
|
+
Internal function to get columns as list of strings.
|
|
270
|
+
Empty list is returned if cols is None.
|
|
271
|
+
"""
|
|
272
|
+
if cols is None:
|
|
273
|
+
return []
|
|
274
|
+
if not isinstance(cols, list) and not isinstance(cols, tuple):
|
|
275
|
+
return [cols]
|
|
276
|
+
return cols
|
|
277
|
+
|
|
278
|
+
def _get_data_and_data_partition_columns(self, data, feature_columns, label_columns,
|
|
279
|
+
partition_columns=None, group_columns=[]):
|
|
280
|
+
"""
|
|
281
|
+
Internal function to generate one new partition column (if not provided) and return
|
|
282
|
+
data and partition columns (either generated or passed one).
|
|
283
|
+
"""
|
|
284
|
+
new_partition_columns = self._get_columns_as_list(partition_columns)
|
|
285
|
+
|
|
286
|
+
if not partition_columns:
|
|
287
|
+
# If partition column is not specified, create a partition column and run Script.
|
|
288
|
+
# This runs the Script in one AMP as we are partitioning data using this column
|
|
289
|
+
# which contains only one value.
|
|
290
|
+
new_partition_columns = [_generate_new_name(type="column")]
|
|
291
|
+
data = data.assign(**{new_partition_columns[0]: self._default_data_partition_value})
|
|
292
|
+
|
|
293
|
+
# Filter out partition columns from feature columns and label columns.
|
|
294
|
+
new_partition_columns_filtered = [col for col in new_partition_columns
|
|
295
|
+
if col not in (feature_columns + label_columns + group_columns)]
|
|
296
|
+
|
|
297
|
+
all_columns = feature_columns + label_columns + group_columns + new_partition_columns_filtered
|
|
298
|
+
return data.select(all_columns), new_partition_columns
|
|
299
|
+
|
|
300
|
+
def _run_script(self, data, command, partition_columns, return_types):
|
|
301
|
+
"""
|
|
302
|
+
Internal function to run Script(), given the argument needed by STO's or
|
|
303
|
+
Apply's Script.
|
|
304
|
+
"""
|
|
305
|
+
if isinstance(partition_columns, list) and len(partition_columns) == 0:
|
|
306
|
+
partition_columns = None
|
|
307
|
+
|
|
308
|
+
if self._is_lake_system:
|
|
309
|
+
obj = Apply(data=data,
|
|
310
|
+
returns=OrderedDict(return_types),
|
|
311
|
+
apply_command=command,
|
|
312
|
+
data_partition_column=partition_columns,
|
|
313
|
+
env_name=self._env,
|
|
314
|
+
delimiter="\t")
|
|
315
|
+
else:
|
|
316
|
+
obj = Script(data=data,
|
|
317
|
+
returns=OrderedDict(return_types),
|
|
318
|
+
script_command=command,
|
|
319
|
+
data_partition_column=partition_columns)
|
|
320
|
+
obj.check_reserved_keyword = False
|
|
321
|
+
|
|
322
|
+
obj.skip_argument_validation = True
|
|
323
|
+
return obj.execute_script(output_style="TABLE")
|
|
324
|
+
|
|
325
|
+
def _install_script_file(self,
|
|
326
|
+
file_identifier=None,
|
|
327
|
+
file_name=None,
|
|
328
|
+
is_binary=False,
|
|
329
|
+
file_location=None):
|
|
330
|
+
"""
|
|
331
|
+
Internal function to install script file in Vantage.
|
|
332
|
+
"""
|
|
333
|
+
if file_location is None:
|
|
334
|
+
file_location = self._scripts_path
|
|
335
|
+
new_script = os.path.join(file_location, file_name)
|
|
336
|
+
|
|
337
|
+
# _env is set while object creation
|
|
338
|
+
# If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
|
|
339
|
+
|
|
340
|
+
if not self._is_lake_system:
|
|
341
|
+
status = install_file(file_identifier=file_identifier,
|
|
342
|
+
file_path=new_script,
|
|
343
|
+
replace=True,
|
|
344
|
+
suppress_output=True,
|
|
345
|
+
is_binary=is_binary)
|
|
346
|
+
else:
|
|
347
|
+
status = self._env.install_file(file_path=new_script,
|
|
348
|
+
replace=True,
|
|
349
|
+
suppress_output=True)
|
|
350
|
+
if not status:
|
|
351
|
+
raise TeradataMlException(
|
|
352
|
+
f"Script file '{file_name}' failed to get installed/replaced in Vantage."
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
def _remove_script_file(self, file_name):
|
|
356
|
+
"""
|
|
357
|
+
Internal function to remove script file in Vantage.
|
|
358
|
+
"""
|
|
359
|
+
# _env is set while object creation
|
|
360
|
+
# If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
|
|
361
|
+
|
|
362
|
+
if not self._is_lake_system:
|
|
363
|
+
status = remove_file(file_identifier=file_name.split(".")[0],
|
|
364
|
+
force_remove=True,
|
|
365
|
+
suppress_output=True)
|
|
366
|
+
else:
|
|
367
|
+
status = self._env.remove_file(file_name=file_name,
|
|
368
|
+
suppress_output=True)
|
|
369
|
+
if not status:
|
|
370
|
+
raise TeradataMlException(
|
|
371
|
+
f"Script file '{file_name}' failed to remove in Vantage."
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
|
|
375
|
+
idx_delim=",",
|
|
376
|
+
types_delim="--"):
|
|
377
|
+
"""
|
|
378
|
+
Internal function to get the data column types and partition column names, indices and types.
|
|
379
|
+
Function returns delimiter separated string of types and indices if idx_delim and
|
|
380
|
+
types_delim are provided. Otherwise, it returns list of types and indices. Partition names
|
|
381
|
+
are returned as list always.
|
|
382
|
+
"""
|
|
383
|
+
data_column_types = "" if types_delim else []
|
|
384
|
+
partition_indices = "" if idx_delim else []
|
|
385
|
+
partition_types = "" if types_delim else []
|
|
386
|
+
new_partition_columns = []
|
|
387
|
+
j = 0
|
|
388
|
+
for i, col in enumerate(data.columns):
|
|
389
|
+
_type = data._td_column_names_and_sqlalchemy_types[col.lower()].python_type.__name__
|
|
390
|
+
if types_delim:
|
|
391
|
+
data_column_types += (_type if i == 0 else f"{types_delim}{_type}")
|
|
392
|
+
else:
|
|
393
|
+
data_column_types.append(_type)
|
|
394
|
+
if col in partition_columns:
|
|
395
|
+
new_partition_columns.append(col)
|
|
396
|
+
if idx_delim:
|
|
397
|
+
partition_indices += (str(i) if j == 0 else f"{idx_delim}{str(i)}")
|
|
398
|
+
else:
|
|
399
|
+
partition_indices.append(i)
|
|
400
|
+
if types_delim:
|
|
401
|
+
partition_types += (_type if j == 0 else f"{types_delim}{_type}")
|
|
402
|
+
else:
|
|
403
|
+
partition_types.append(_type)
|
|
404
|
+
j += 1
|
|
405
|
+
# Return types of all columns (as list or str), partition column indices (as list or str)
|
|
406
|
+
# and partition column types (as list or str).
|
|
407
|
+
return data_column_types, partition_indices, partition_types, new_partition_columns
|
|
408
|
+
|
|
409
|
+
def _get_kwargs_str(self, kwargs):
|
|
410
|
+
"""
|
|
411
|
+
Returns string of kwargs in the format:
|
|
412
|
+
key1 val1-type1 key2 val2-type2 ...
|
|
413
|
+
"""
|
|
414
|
+
args_str = ""
|
|
415
|
+
for key, val in kwargs.items():
|
|
416
|
+
strr = f"{key} {str(val)}-{type(val).__name__}"
|
|
417
|
+
if args_str == "":
|
|
418
|
+
args_str += strr
|
|
419
|
+
else:
|
|
420
|
+
args_str += f" {strr}"
|
|
421
|
+
return args_str
|
|
422
|
+
|
|
423
|
+
def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1):
|
|
424
|
+
"""
|
|
425
|
+
Internal function to extract sklearn object from the model(s) depending on the number of
|
|
426
|
+
partitions. When it is only one model, it is directly used as sklearn object (modelObj).
|
|
427
|
+
When it is multiple models, it is converted to pandas DataFrame and stored in sklearn
|
|
428
|
+
object.
|
|
429
|
+
"""
|
|
430
|
+
vals = execute_sql("select * from {}".format(self._model_data._table_name)).fetchall()
|
|
431
|
+
|
|
432
|
+
# pickle will issue a caution warning, if model pickling was done with
|
|
433
|
+
# different library version than used here. The following disables any warnings
|
|
434
|
+
# that might otherwise show in the scriptlog files on the Advanced SQL Engine
|
|
435
|
+
# nodes in this case. Yet, do keep an eye for incompatible pickle versions.
|
|
436
|
+
warnings.filterwarnings("ignore")
|
|
437
|
+
|
|
438
|
+
model_obj = None
|
|
439
|
+
# Extract and unpickle last column which is the model object.
|
|
440
|
+
for i, row in enumerate(vals):
|
|
441
|
+
if self._is_lake_system:
|
|
442
|
+
model_obj = pickle.loads(row[n_partition_cols])
|
|
443
|
+
else:
|
|
444
|
+
model_obj = pickle.loads(base64.b64decode(row[n_partition_cols].partition("'")[2]))
|
|
445
|
+
row[n_partition_cols] = model_obj
|
|
446
|
+
vals[i] = row
|
|
447
|
+
if n_unique_partitions == 1:
|
|
448
|
+
self.modelObj = model_obj
|
|
449
|
+
elif n_unique_partitions > 1:
|
|
450
|
+
self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
|
|
451
|
+
else:
|
|
452
|
+
raise ValueError("Number of partitions should be greater than 0.")
|
|
453
|
+
|
|
454
|
+
warnings.filterwarnings("default")
|
|
455
|
+
|
|
456
|
+
def _validate_existence_of_partition_columns(self, partition_columns, all_columns, arg_names_for_dfs):
|
|
457
|
+
"""
|
|
458
|
+
Validate if columns in "partition_columns" argument are present in any of the given
|
|
459
|
+
dataframes.
|
|
460
|
+
"""
|
|
461
|
+
invalid_part_cols = [c for c in partition_columns if c not in all_columns]
|
|
462
|
+
|
|
463
|
+
if invalid_part_cols:
|
|
464
|
+
raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
|
|
465
|
+
", ".join(invalid_part_cols),
|
|
466
|
+
"', '".join(arg_names_for_dfs))
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
def _prepare_data_args_string(self, kwargs):
|
|
470
|
+
"""
|
|
471
|
+
Get column indices and types of each data related arguments in the format:
|
|
472
|
+
"{<arg_name>-<comma separated indices>-<comma separated types>}--
|
|
473
|
+
{<arg_name>-<comma separated indices>-<comma separated types>}"
|
|
474
|
+
"""
|
|
475
|
+
data_args_str = []
|
|
476
|
+
for arg_name in list(self._data_args.keys()):
|
|
477
|
+
# Remove DataFrame arguments from kwargs, which will be passed to Script.
|
|
478
|
+
kwargs.pop(arg_name)
|
|
479
|
+
|
|
480
|
+
# Get column indices and their types for each dataframe from parent dataframe.
|
|
481
|
+
_, partition_indices_str, partition_types_str, _ = \
|
|
482
|
+
self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
|
|
483
|
+
self._data_args[arg_name].columns,
|
|
484
|
+
idx_delim=",",
|
|
485
|
+
types_delim=",")
|
|
486
|
+
|
|
487
|
+
# Format "<arg_name>-<comma separated indices>-<comma separated types>"
|
|
488
|
+
data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
|
|
489
|
+
|
|
490
|
+
# Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
|
|
491
|
+
# {<arg_name>-<comma separated indices>-<comma separated types>}"
|
|
492
|
+
return "--".join(data_args_str)
|
|
493
|
+
|
|
494
|
+
def _prepare_and_install_file(self, replace_dict):
|
|
495
|
+
"""
|
|
496
|
+
Prepare function script file from template file and install it in Vantage.
|
|
497
|
+
Takes the dictionary with keys as strings to be replaced in script and values as
|
|
498
|
+
strings which should be added in place of keys.
|
|
499
|
+
"""
|
|
500
|
+
|
|
501
|
+
with open(os.path.join(self._scripts_path, self._template_file)) as fp:
|
|
502
|
+
script_data = fp.read()
|
|
503
|
+
|
|
504
|
+
for old, new in replace_dict.items():
|
|
505
|
+
script_data = script_data.replace(old, new)
|
|
506
|
+
|
|
507
|
+
self._script_file_local = os.path.join(self._tdml_tmp_dir, self._script_file_name)
|
|
508
|
+
|
|
509
|
+
with open(self._script_file_local, "w") as fp:
|
|
510
|
+
fp.write(script_data)
|
|
511
|
+
|
|
512
|
+
self._install_script_file(file_identifier=self._script_file_name.split(".")[0],
|
|
513
|
+
file_name=self._script_file_name,
|
|
514
|
+
file_location=self._tdml_tmp_dir)
|
|
515
|
+
|
|
516
|
+
def _get_dataframe_related_args_and_their_columns(self, kwargs):
|
|
517
|
+
"""
|
|
518
|
+
Get dataframe related arguments and return all their column names from kwargs.
|
|
519
|
+
"""
|
|
520
|
+
__data_columns = []
|
|
521
|
+
__data_args_dict = OrderedDict()
|
|
522
|
+
|
|
523
|
+
# Separate dataframe related arguments and their column names from actual kwargs.
|
|
524
|
+
for k, v in kwargs.items():
|
|
525
|
+
if isinstance(v, DataFrame):
|
|
526
|
+
# All dataframes should be select of parent dataframe.
|
|
527
|
+
_validate_df_query_type(v, "select", k)
|
|
528
|
+
|
|
529
|
+
# Save all columns in dataframe related arguments.
|
|
530
|
+
__data_columns.extend(v.columns)
|
|
531
|
+
|
|
532
|
+
__data_args_dict[k] = v
|
|
533
|
+
|
|
534
|
+
return __data_args_dict, __data_columns
|
|
535
|
+
|
|
536
|
+
def _process_data_for_funcs_returning_objects(self, kwargs):
|
|
537
|
+
"""
|
|
538
|
+
Internal function to process all arguments and assign self._data_args, self._tdml_df
|
|
539
|
+
and return
|
|
540
|
+
1. dictionary of elements (needed to replace in the script template file)
|
|
541
|
+
2. partition columns list.
|
|
542
|
+
"""
|
|
543
|
+
partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
|
|
544
|
+
if partition_cols:
|
|
545
|
+
kwargs.pop("partition_columns")
|
|
546
|
+
|
|
547
|
+
self._data_args, __data_columns = self._get_dataframe_related_args_and_their_columns(kwargs)
|
|
548
|
+
|
|
549
|
+
arg_names_for_dfs = list(self._data_args.keys())
|
|
550
|
+
|
|
551
|
+
# Get common parent dataframe from all dataframes.
|
|
552
|
+
self._tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self._data_args.values()))
|
|
553
|
+
|
|
554
|
+
self._tdml_df = self._tdml_df.select(__data_columns + partition_cols)
|
|
555
|
+
|
|
556
|
+
self._validate_existence_of_partition_columns(partition_cols, self._tdml_df.columns, arg_names_for_dfs)
|
|
557
|
+
|
|
558
|
+
self._tdml_df, partition_cols = self._get_data_and_data_partition_columns(self._tdml_df,
|
|
559
|
+
__data_columns,
|
|
560
|
+
[],
|
|
561
|
+
partition_cols
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
# Prepare string of data arguments with name, indices where columns of that argument resides
|
|
565
|
+
# and types of each of the column.
|
|
566
|
+
data_args_str = self._prepare_data_args_string(kwargs)
|
|
567
|
+
|
|
568
|
+
# Get indices of partition_columns and types of all columns.
|
|
569
|
+
data_column_types_str, partition_indices_str, _, partition_cols = \
|
|
570
|
+
self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
|
|
571
|
+
partition_cols,
|
|
572
|
+
types_delim=None,
|
|
573
|
+
idx_delim=None)
|
|
574
|
+
|
|
575
|
+
replace_dict = {"<partition_cols_indices>": str(partition_indices_str),
|
|
576
|
+
"<types_of_data_cols>": str(data_column_types_str),
|
|
577
|
+
"<data_args_info_str>": f"'{data_args_str}'"}
|
|
578
|
+
|
|
579
|
+
return replace_dict, partition_cols
|
|
580
|
+
|
|
581
|
+
def _validate_equality_of_partition_values(self, fit_values, trans_values):
|
|
582
|
+
"""
|
|
583
|
+
Internal function to compare the partition values in fit() and predict() are same.
|
|
584
|
+
"""
|
|
585
|
+
if len(fit_values) != len(trans_values):
|
|
586
|
+
return False
|
|
587
|
+
|
|
588
|
+
for val in fit_values:
|
|
589
|
+
if not all([val in trans_values]):
|
|
590
|
+
return False
|
|
591
|
+
|
|
592
|
+
return True
|
|
593
|
+
|
|
594
|
+
def _get_non_data_related_args_from_kwargs(self, kwargs):
|
|
595
|
+
"""
|
|
596
|
+
Get all non-data related arguments from kwargs.
|
|
597
|
+
"""
|
|
598
|
+
non_data_related_args = {}
|
|
599
|
+
for k, v in kwargs.items():
|
|
600
|
+
if not isinstance(v, DataFrame):
|
|
601
|
+
non_data_related_args[k] = v
|
|
602
|
+
non_data_related_args.pop("partition_columns", None)
|
|
603
|
+
return non_data_related_args
|
|
604
|
+
|
|
605
|
+
def _read_from_template_and_write_dict_to_file(self, template_file, replace_dict,
|
|
606
|
+
output_script_file_name=None):
|
|
607
|
+
"""
|
|
608
|
+
Read template file, replace the keys with values and write to new file.
|
|
609
|
+
"""
|
|
610
|
+
with open(os.path.join(self._scripts_path, template_file)) as fp:
|
|
611
|
+
script_data = fp.read()
|
|
612
|
+
|
|
613
|
+
for old, new in replace_dict.items():
|
|
614
|
+
script_data = script_data.replace(old, new)
|
|
615
|
+
|
|
616
|
+
if output_script_file_name is None:
|
|
617
|
+
output_script_file_name = self._script_file_name
|
|
618
|
+
file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
|
|
619
|
+
with open(file_path, "w") as fp:
|
|
620
|
+
fp.write(script_data)
|
|
621
|
+
|
|
622
|
+
def _generate_script_file_from_template_file(self, kwargs, template_file, func_name,
|
|
623
|
+
output_script_file_name=None):
|
|
624
|
+
"""
|
|
625
|
+
Internal function to generate script file from template file. It just adds the non-data
|
|
626
|
+
related arguments to the template file and writes the contents to new file, so that these
|
|
627
|
+
arguments are available in the script file for running this function "func_name".
|
|
628
|
+
"""
|
|
629
|
+
# Take out all non-data related arguments to write to template file.
|
|
630
|
+
non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)
|
|
631
|
+
|
|
632
|
+
# Read template file and write the contents to new file with non-data related arguments.
|
|
633
|
+
template_f = os.path.join(self._scripts_path, template_file)
|
|
634
|
+
with open(template_f, "r") as f:
|
|
635
|
+
template = f.read()
|
|
636
|
+
|
|
637
|
+
if output_script_file_name is None:
|
|
638
|
+
output_script_file_name = self._script_file_name
|
|
639
|
+
file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
|
|
640
|
+
with open(file_path, "w") as f:
|
|
641
|
+
f.write("import json\n")
|
|
642
|
+
f.write(f"params = json.loads('{json.dumps(non_data_related_args)}')\n")
|
|
643
|
+
f.write(template)
|
|
644
|
+
|
|
645
|
+
kwargs["file_name"] = output_script_file_name
|
|
646
|
+
kwargs["name"] = func_name
|
|
647
|
+
|
|
648
|
+
def _remove_data_related_args_from_kwargs(self, kwargs):
|
|
649
|
+
"""
|
|
650
|
+
Internal function to remove data related arguments from kwargs.
|
|
651
|
+
"""
|
|
652
|
+
kwargs.pop("data", None)
|
|
653
|
+
kwargs.pop("feature_columns", None)
|
|
654
|
+
kwargs.pop("group_columns", None)
|
|
655
|
+
kwargs.pop("partition_columns", None)
|
|
656
|
+
kwargs.pop("label_columns", None)
|
|
657
|
+
|
|
658
|
+
def _convert_pos_args_to_kwargs_for_function(self, pos_args, kwargs, func_name):
|
|
659
|
+
"""
|
|
660
|
+
Internal function to convert positional arguments to keyword arguments.
|
|
661
|
+
"""
|
|
662
|
+
fn = getattr(getattr(import_module(self.module_name), self.class_name), func_name)
|
|
663
|
+
kwargs.update(zip(fn.__code__.co_varnames[1:], pos_args))
|
|
664
|
+
|
|
665
|
+
def _install_model_and_script_files(self, file_name, file_location):
|
|
666
|
+
"""
|
|
667
|
+
Internal function to install model and script files to Vantage.
|
|
668
|
+
"""
|
|
669
|
+
self._install_initial_model_file()
|
|
670
|
+
self._install_script_file(file_identifier=file_name.split(".")[0],
|
|
671
|
+
file_name=file_name,
|
|
672
|
+
is_binary=False,
|
|
673
|
+
file_location=file_location)
|
|
674
|
+
|
|
675
|
+
def _assign_fit_variables_after_execution(self, data, partition_columns, label_columns):
|
|
676
|
+
"""
|
|
677
|
+
Internal function to assign fit related variables.
|
|
678
|
+
"""
|
|
679
|
+
# Extract sklearn object(s) from the depending on the number of unique partitioning values.
|
|
680
|
+
self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
|
|
681
|
+
n_partition_cols=len(partition_columns))
|
|
682
|
+
|
|
683
|
+
# Need this label columns types in prediction.
|
|
684
|
+
self._fit_label_columns_types = []
|
|
685
|
+
self._fit_label_columns_python_types = []
|
|
686
|
+
|
|
687
|
+
for l_c in label_columns:
|
|
688
|
+
column_data = data._td_column_names_and_sqlalchemy_types[l_c.lower()]
|
|
689
|
+
self._fit_label_columns_types.append(column_data)
|
|
690
|
+
self._fit_label_columns_python_types.append(column_data.python_type.__name__)
|
|
691
|
+
|
|
692
|
+
# If the model is trained a second time after the object creation,
|
|
693
|
+
# or if set_params() is called after the first model training,
|
|
694
|
+
# this flag will reset to False. So that for subsequent predict/score
|
|
695
|
+
# operations, the newly trained model will be installed.
|
|
696
|
+
if self._is_trained_model_installed:
|
|
697
|
+
self._is_trained_model_installed = False
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
class _OpenSourceObjectWrapper(_GenericObjectWrapper):
|
|
701
|
+
# This has to be set for every package which subclasses this class.
|
|
702
|
+
OPENSOURCE_PACKAGE_NAME = None
|
|
703
|
+
|
|
704
|
+
def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
|
|
705
|
+
if model is None and not module_name and not class_name:
|
|
706
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
|
|
707
|
+
"module_name and class_name"),
|
|
708
|
+
MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)
|
|
709
|
+
|
|
710
|
+
validator._validate_mutually_inclusive_arguments(module_name, "module_name",
|
|
711
|
+
class_name, "class_name")
|
|
712
|
+
|
|
713
|
+
super().__init__()
|
|
714
|
+
|
|
715
|
+
self.module_name = module_name
|
|
716
|
+
self.class_name = class_name
|
|
717
|
+
self.kwargs = kwargs if kwargs is not None else {}
|
|
718
|
+
self.pos_args = pos_args if pos_args is not None else tuple()
|
|
719
|
+
|
|
720
|
+
self._fit_label_columns_types = None
|
|
721
|
+
self._fit_label_columns_python_types = None
|
|
722
|
+
self._table_name_prefix = None
|
|
723
|
+
|
|
724
|
+
self._is_default_partition_value_fit = True # False when the user provides partition columns.
|
|
725
|
+
self._fit_partition_colums_non_default = None
|
|
726
|
+
self._is_default_partition_value_predict = True # False when the user provides partition columns.
|
|
727
|
+
|
|
728
|
+
def __repr__(self):
|
|
729
|
+
if self._is_default_partition_value_fit:
|
|
730
|
+
# Single model use case.
|
|
731
|
+
return self.modelObj.__repr__()
|
|
732
|
+
|
|
733
|
+
pd.set_option("display.expand_frame_repr", None)
|
|
734
|
+
pd.set_option("display.max_colwidth", None)
|
|
735
|
+
opt = self.modelObj.__repr__()
|
|
736
|
+
pd.reset_option("display.expand_frame_repr")
|
|
737
|
+
pd.reset_option("display.max_colwidth")
|
|
738
|
+
return opt
|
|
739
|
+
|
|
740
|
+
def _initialize_object(self):
|
|
741
|
+
"""
|
|
742
|
+
Internal function to initialize sklearn object from module name and class name.
|
|
743
|
+
"""
|
|
744
|
+
# Needed when writing imported modules to generated file. TODO: Remove later.
|
|
745
|
+
imported_args = {}
|
|
746
|
+
# If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
|
|
747
|
+
# corresponding sklearn object.
|
|
748
|
+
_partition_column_names = None
|
|
749
|
+
if "partition_columns" in self.kwargs:
|
|
750
|
+
self._fit_partition_colums_non_default = self.kwargs["partition_columns"]
|
|
751
|
+
self._is_default_partition_value_fit = False
|
|
752
|
+
_partition_column_names = self._fit_partition_colums_non_default
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
|
|
756
|
+
new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
|
|
757
|
+
|
|
758
|
+
# Create model object from new positional and keyword arguments.
|
|
759
|
+
class_obj = getattr(import_module(self.module_name), self.class_name)
|
|
760
|
+
if new_sklearn_pos_args:
|
|
761
|
+
self.modelObj = class_obj(*new_sklearn_pos_args, **new_sklearn_kwargs)
|
|
762
|
+
else:
|
|
763
|
+
self.modelObj = class_obj(**new_sklearn_kwargs)
|
|
764
|
+
|
|
765
|
+
# All arguments are moved to kwargs and kept pos_args empty.
|
|
766
|
+
# Might help in set_params() bug fix.
|
|
767
|
+
self.pos_args = tuple()
|
|
768
|
+
_arguments = self.modelObj.__dict__
|
|
769
|
+
|
|
770
|
+
if hasattr(self.modelObj, "get_params"):
|
|
771
|
+
# Update kwargs that are both in modelObj and get_params() as there are
|
|
772
|
+
# some classes which return other internals variables also.
|
|
773
|
+
# Hence, filtering them using get_params().
|
|
774
|
+
for k, v in _arguments.items():
|
|
775
|
+
if type(v).__name__ in ["function", "generator"]:
|
|
776
|
+
# TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
|
|
777
|
+
# are not supported yet due to pickling issue.
|
|
778
|
+
continue
|
|
779
|
+
if self.get_params():
|
|
780
|
+
if k in self.get_params():
|
|
781
|
+
self.kwargs[k] = v
|
|
782
|
+
else:
|
|
783
|
+
_model_init_arguments = None
|
|
784
|
+
try:
|
|
785
|
+
_model_init_arguments = self.modelObj.__init__.__code__.co_varnames
|
|
786
|
+
except AttributeError:
|
|
787
|
+
pass
|
|
788
|
+
if _model_init_arguments:
|
|
789
|
+
self.kwargs = dict((k, v) for k, v in _arguments.items() if k in _model_init_arguments)
|
|
790
|
+
else:
|
|
791
|
+
self.kwargs = _arguments
|
|
792
|
+
else:
|
|
793
|
+
# Model selection classes will not have `get_params`, in which case modelObj's __dict__
|
|
794
|
+
# is saved as kwargs.
|
|
795
|
+
self.kwargs = _arguments
|
|
796
|
+
|
|
797
|
+
if _partition_column_names:
|
|
798
|
+
self.kwargs["partition_columns"] = _partition_column_names
|
|
799
|
+
|
|
800
|
+
def _initialize_variables(self, table_name_prefix):
|
|
801
|
+
"""
|
|
802
|
+
Internal function to initialize variables used in this class.
|
|
803
|
+
"""
|
|
804
|
+
self.feature_names_in_ = None
|
|
805
|
+
self._table_name_prefix = table_name_prefix
|
|
806
|
+
self._model_file_name_prefix = _generate_new_name(type="file")
|
|
807
|
+
self.model_file_paths_local = set()
|
|
808
|
+
|
|
809
|
+
self._fit_execution_time = None
|
|
810
|
+
self._fit_predict_execution_time = None
|
|
811
|
+
self._partial_fit_execution_time = None
|
|
812
|
+
self._predict_execution_time = None
|
|
813
|
+
self._transform_execution_time = None
|
|
814
|
+
self._score_execution_time = None
|
|
815
|
+
|
|
816
|
+
# Set to partition columns when training is done with partition columns.
|
|
817
|
+
self._fit_partition_colums_non_default = None
|
|
818
|
+
|
|
819
|
+
self._is_model_installed = False
|
|
820
|
+
self._fit_partition_unique_values = [[self._default_data_partition_value]]
|
|
821
|
+
|
|
822
|
+
def _get_returning_df(self, script_df, partition_column, returns):
|
|
823
|
+
"""
|
|
824
|
+
Internal function to return the teradataml Dataframe except
|
|
825
|
+
partition_column.
|
|
826
|
+
"""
|
|
827
|
+
if self._is_default_partition_value_fit:
|
|
828
|
+
# For single model case, partition column is internally generated
|
|
829
|
+
# and no point in returning it to the user.
|
|
830
|
+
|
|
831
|
+
# Extract columns from return types.
|
|
832
|
+
returning_cols = [col[0] for col in returns[len(partition_column):]]
|
|
833
|
+
return script_df.select(returning_cols)
|
|
834
|
+
return script_df
|
|
835
|
+
|
|
836
|
+
def modify_args(self, fp1, arg, imported_args):
|
|
837
|
+
"""
|
|
838
|
+
Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
|
|
839
|
+
of opensourceML is present in the argument "arg" and modify it to corresponding sklearn
|
|
840
|
+
object.
|
|
841
|
+
This function can also be used to write import statements to file (if "fp1" is not
|
|
842
|
+
None). Update "imported_args" dictionary with imported module and class name to avoid
|
|
843
|
+
importing same module and class again when writing to file. This is useful when we want to
|
|
844
|
+
generate script from template file.
|
|
845
|
+
Pass None to "fp1" if we don't want to write to file and just modify opensourceML sklearn
|
|
846
|
+
object to corresponding sklearn object.
|
|
847
|
+
"""
|
|
848
|
+
if isinstance(arg, type(self)):
|
|
849
|
+
imported_tuple = (arg.module_name, arg.class_name)
|
|
850
|
+
already_imported = imported_args.get(imported_tuple, False)
|
|
851
|
+
if not already_imported:
|
|
852
|
+
imported_args[imported_tuple] = True
|
|
853
|
+
if fp1:
|
|
854
|
+
fp1.write(f"from {arg.module_name} import {arg.class_name}\n")
|
|
855
|
+
self.modify_args(fp1, arg.pos_args, imported_args)
|
|
856
|
+
self.modify_args(fp1, arg.kwargs, imported_args)
|
|
857
|
+
return arg.modelObj
|
|
858
|
+
elif isinstance(arg, list):
|
|
859
|
+
return [self.modify_args(fp1, val, imported_args) for val in arg]
|
|
860
|
+
elif isinstance(arg, tuple):
|
|
861
|
+
return tuple([self.modify_args(fp1, val, imported_args) for val in arg])
|
|
862
|
+
elif type(arg).__name__ == "generator":
|
|
863
|
+
# Raising exception as generator object can't be pickled.
|
|
864
|
+
# TODO: ELE-6351 - Find ways to pickle generator object later.
|
|
865
|
+
raise ValueError("Generator type/iterator is not supported for any argument. "\
|
|
866
|
+
"Support will be added later.")
|
|
867
|
+
elif type(arg).__name__ == "function":
|
|
868
|
+
# Raising exception as functions/lambda functions can't be pickled.
|
|
869
|
+
# TODO: ELE-6351 - Find ways to pickle functions later.
|
|
870
|
+
raise ValueError("Functions are not supported for any argument. "\
|
|
871
|
+
"Support will be added later.")
|
|
872
|
+
elif isinstance(arg, dict):
|
|
873
|
+
return dict(
|
|
874
|
+
(
|
|
875
|
+
self.modify_args(fp1, k, imported_args),
|
|
876
|
+
self.modify_args(fp1, v, imported_args),
|
|
877
|
+
)
|
|
878
|
+
for k, v in arg.items() if k != "partition_columns"
|
|
879
|
+
)
|
|
880
|
+
# elif arg == "partition_columns":
|
|
881
|
+
|
|
882
|
+
else:
|
|
883
|
+
return arg
|
|
884
|
+
|
|
885
|
+
def _install_initial_model_file(self, use_dummy_initial_file=False):
|
|
886
|
+
"""
|
|
887
|
+
If model file(s) is/are not installed in Vantage, then install it/them.
|
|
888
|
+
"""
|
|
889
|
+
if isinstance(self.modelObj, pd.DataFrame):
|
|
890
|
+
# Get list of unique partition values and corresponding model object as dict.
|
|
891
|
+
partition_values_model_dict = {}
|
|
892
|
+
obj_list = self.modelObj.values.tolist()
|
|
893
|
+
for lst in obj_list:
|
|
894
|
+
partition_values_model_dict[tuple(lst[:len(self._fit_partition_colums_non_default)])] = \
|
|
895
|
+
lst[len(self._fit_partition_colums_non_default)]
|
|
896
|
+
|
|
897
|
+
for partition in self._fit_partition_unique_values:
|
|
898
|
+
# Create a new file with file name with partition values and
|
|
899
|
+
# dump sklearn object into it. Finally install the file to Vantage.
|
|
900
|
+
partition_join = "_".join([str(x) for x in partition])
|
|
901
|
+
file_name = f"{self._model_file_name_prefix}_{partition_join}"
|
|
902
|
+
# Replace '-' with '_' as '-' can't be present in file identifier.
|
|
903
|
+
# Needed this replace because partition_columns can be negative.
|
|
904
|
+
file_name = file_name.replace("-", "_")
|
|
905
|
+
full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
|
|
906
|
+
with open(full_file_name, "wb+") as fp:
|
|
907
|
+
# Write sklearn object to file.
|
|
908
|
+
if isinstance(self.modelObj, pd.DataFrame):
|
|
909
|
+
# If multiple models, then write the model corresponding to the partition value.
|
|
910
|
+
fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
|
|
911
|
+
else:
|
|
912
|
+
if use_dummy_initial_file:
|
|
913
|
+
fp.write(pickle.dumps("abc"))
|
|
914
|
+
else:
|
|
915
|
+
fp.write(pickle.dumps(self.modelObj))
|
|
916
|
+
self.model_file_paths_local.add(file_name)
|
|
917
|
+
|
|
918
|
+
self._install_script_file(file_identifier=file_name,
|
|
919
|
+
file_name=file_name,
|
|
920
|
+
is_binary=True,
|
|
921
|
+
file_location=self._tdml_tmp_dir)
|
|
922
|
+
|
|
923
|
+
if self._is_lake_system:
|
|
924
|
+
# Need to pass env_name along with file_name for cleaning up the files in env.
|
|
925
|
+
obj = f"{self._env.env_name}::{file_name}"
|
|
926
|
+
if installed_model_files[obj] == 0:
|
|
927
|
+
# Add to GC for the first time the model file (along with env name) is encountered.
|
|
928
|
+
installed_model_files[obj] = 1
|
|
929
|
+
GarbageCollector._add_to_garbagecollector(object_name=obj,
|
|
930
|
+
object_type=TeradataConstants.TERADATA_APPLY)
|
|
931
|
+
else:
|
|
932
|
+
if installed_model_files[file_name] == 0:
|
|
933
|
+
# Add to GC for the first time the model file is encountered.
|
|
934
|
+
installed_model_files[file_name] = 1
|
|
935
|
+
GarbageCollector._add_to_garbagecollector(object_name=file_name,
|
|
936
|
+
object_type=TeradataConstants.TERADATA_SCRIPT)
|
|
937
|
+
|
|
938
|
+
self._is_model_installed = True
|
|
939
|
+
|
|
940
|
+
def _validate_unique_partition_values(self, data, partition_columns):
|
|
941
|
+
"""
|
|
942
|
+
Internal function to validate if the partition values in partition_columns used in fit()
|
|
943
|
+
and predict() are same.
|
|
944
|
+
"""
|
|
945
|
+
data._index_label = None
|
|
946
|
+
unique_values = data.drop_duplicate(partition_columns).get_values()
|
|
947
|
+
|
|
948
|
+
trans_unique_values = sorted(unique_values.tolist(), key=lambda x: tuple(x))
|
|
949
|
+
fit_unique_values = sorted(self._fit_partition_unique_values.tolist() \
|
|
950
|
+
if not isinstance(self._fit_partition_unique_values, list) \
|
|
951
|
+
else self._fit_partition_unique_values, key=lambda x: tuple(x))
|
|
952
|
+
default_unique_values = [[self._default_data_partition_value]]
|
|
953
|
+
|
|
954
|
+
if fit_unique_values == default_unique_values and \
|
|
955
|
+
trans_unique_values != default_unique_values:
|
|
956
|
+
error_msg = Messages.get_message(MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT,
|
|
957
|
+
"without", "with")
|
|
958
|
+
msg_code = MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT
|
|
959
|
+
raise TeradataMlException(error_msg, msg_code)
|
|
960
|
+
|
|
961
|
+
if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
|
|
962
|
+
raise TeradataMlException(
|
|
963
|
+
Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING, "training", "test"),
|
|
964
|
+
MessageCodes.PARTITION_VALUES_NOT_MATCHING
|
|
965
|
+
)
|
|
966
|
+
|
|
967
|
+
def fit(self, **kwargs):
|
|
968
|
+
pass
|
|
969
|
+
|
|
970
|
+
def _convert_arguments_to_modelObj(self, args, partition_col_values=None):
|
|
971
|
+
"""
|
|
972
|
+
Internal function to get appropriate model from <argument>.modelObj when multiple models are
|
|
973
|
+
generated by fit, based on partition_col_values. If partition_col_values is None, then it is
|
|
974
|
+
single model case.
|
|
975
|
+
"""
|
|
976
|
+
if isinstance(args, dict):
|
|
977
|
+
new_args = args.copy() # To avoid updating
|
|
978
|
+
for k, v in new_args.items():
|
|
979
|
+
if isinstance(v, _OpenSourceObjectWrapper):
|
|
980
|
+
arg_model_obj = v.modelObj
|
|
981
|
+
if isinstance(arg_model_obj, pd.DataFrame):
|
|
982
|
+
# multi-model. Get appropriate model from modelObj.
|
|
983
|
+
arg_partition_values_model_dict = v._get_partition_columns_to_model_dict()
|
|
984
|
+
new_args[k] = arg_partition_values_model_dict[partition_col_values]
|
|
985
|
+
else:
|
|
986
|
+
# single model.
|
|
987
|
+
new_args[k] = arg_model_obj
|
|
988
|
+
return new_args
|
|
989
|
+
|
|
990
|
+
if isinstance(args, tuple):
|
|
991
|
+
new_args = tuple()
|
|
992
|
+
for arg in args:
|
|
993
|
+
if isinstance(arg, type(self)):
|
|
994
|
+
arg_model_obj = arg.modelObj
|
|
995
|
+
if isinstance(arg_model_obj, pd.DataFrame):
|
|
996
|
+
# multi-model. Get appropriate model from modelObj.
|
|
997
|
+
arg_partition_values_model_dict = arg._get_partition_columns_to_model_dict()
|
|
998
|
+
new_args += (arg_partition_values_model_dict[partition_col_values],)
|
|
999
|
+
else:
|
|
1000
|
+
# single model.
|
|
1001
|
+
new_args += (arg_model_obj,)
|
|
1002
|
+
else:
|
|
1003
|
+
new_args += (arg,)
|
|
1004
|
+
return new_args
|
|
1005
|
+
return args
|
|
1006
|
+
|
|
1007
|
+
def _get_partition_columns_to_model_dict(self):
|
|
1008
|
+
"""
|
|
1009
|
+
Internal function to get partition columns to model dictionary.
|
|
1010
|
+
"""
|
|
1011
|
+
partition_values_model_dict = {}
|
|
1012
|
+
no_of_unique_partitions = len(self._fit_partition_unique_values)
|
|
1013
|
+
no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
|
|
1014
|
+
|
|
1015
|
+
for i in range(no_of_unique_partitions):
|
|
1016
|
+
partition_values_model_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
|
|
1017
|
+
|
|
1018
|
+
return partition_values_model_dict
|
|
1019
|
+
|
|
1020
|
+
    def __get_obj_attributes_multi_model(self, name):
        """
        Internal function to get attributes of all sklearn model objects when multiple models
        are generated by fit.
        """

        def __generate_model_object(model_obj_value, init_model_obj):
            """
            Internal function to generate an _SkLearnObjectWrapper model object from
            model_obj_value.
            """
            # Create _SkLearnObjectWrapper object from opensource model object.
            model_obj = self.__class__(model=init_model_obj)

            model_obj.modelObj = model_obj_value
            model_obj._is_model_installed = True

            # Set other model attributes.
            model_obj._is_default_partition_value_fit = self._is_default_partition_value_fit
            model_obj._is_default_partition_value_predict = self._is_default_partition_value_predict
            model_obj._fit_partition_colums_non_default = self._fit_partition_colums_non_default
            model_obj._fit_partition_unique_values = self._fit_partition_unique_values
            return model_obj

        # Wrapper function to invoke the dynamic method, using the arguments
        # passed by the user, on the model in each row.
        def __opensource_method_invoker_for_multimodel(*c, **kwargs):
            """
            Internal function to run functions that do not take data-related arguments but
            take arguments which might contain other model objects.
            """
            multi_models = self.modelObj.copy()
            for i in range(multi_models.shape[0]):
                curr_model = multi_models.iloc[i]["model"]
                partition_values = tuple(multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list())

                partition_values_joined = "_".join([str(x) for x in partition_values])
                if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
                    # filename is the first argument.
                    kwargs1 = kwargs.copy()
                    c1 = c

                    if len(c) > 0:
                        c1 = list(c1)
                        c1[0] = f"{c1[0]}_{partition_values_joined}"
                        c1 = tuple(c1)
                    if len(kwargs) > 0 and kwargs.get("filename", None):
                        kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values_joined}"

                    pos_args = self._convert_arguments_to_modelObj(c1, partition_values)
                    key_args = self._convert_arguments_to_modelObj(kwargs1, partition_values)
                else:
                    pos_args = self._convert_arguments_to_modelObj(c, partition_values)
                    key_args = self._convert_arguments_to_modelObj(kwargs, partition_values)

                multi_models.at[i, "model"] = getattr(curr_model, name)(*pos_args, **key_args)

            first_function_value = multi_models.at[0, "model"]
            if self.__class__._validate_model_supportability(first_function_value):
                return __generate_model_object(multi_models, init_model_obj=first_function_value)

            multi_models = multi_models.rename(columns={"model": name})

            # Select only the partition columns and the attribute column.
            return multi_models[self._fit_partition_colums_non_default + [name]]

        # Assuming that self.modelObj will have at least 1 row.

        # Get the attribute instance from the first model object.
        first_attribute_instance = getattr(self.modelObj.iloc[0]["model"], name)

        # If first_attribute_instance is callable, it should be applied on the model in
        # each row using the passed arguments.
        if callable(first_attribute_instance):
            return __opensource_method_invoker_for_multimodel

        output_attributes = self.modelObj.copy()
        for i in range(output_attributes.shape[0]):
            model = output_attributes.iloc[i]["model"]
            output_attributes.at[i, "model"] = getattr(model, name)

        if self.__class__._validate_model_supportability(first_attribute_instance):
            return __generate_model_object(output_attributes, init_model_obj=first_attribute_instance)

        return output_attributes.rename(columns={"model": name})

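    # Illustrative sketch (hypothetical toy objects, not OSML models): the per-row
    # dispatch used by __opensource_method_invoker_for_multimodel above - look up a
    # method by name on the model stored in each row, call it, and collect the results
    # back into the frame under the attribute's name.
    #   >>> import pandas as pd
    #   >>> class Toy:
    #   ...     def __init__(self, v): self.v = v
    #   ...     def score(self, x): return self.v * x
    #   >>> df = pd.DataFrame({"pc": [1, 2], "model": [Toy(10), Toy(20)]})
    #   >>> for i in range(df.shape[0]):
    #   ...     df.at[i, "model"] = getattr(df.iloc[i]["model"], "score")(3)
    #   >>> df.rename(columns={"model": "score"})
    #      pc  score
    #   0   1     30
    #   1   2     60
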
    def __getattr__(self, name):
        # This just runs attributes (functions and properties) of the underlying
        # opensource (sklearn/lightgbm) objects.
        def __opensource_method_invoker(*c, **kwargs):
            # An opensource model is returned from the function call. Create an
            # _OpenSourceObjectWrapper object.
            model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c),
                                           **self._convert_arguments_to_modelObj(kwargs))
            if self.__class__._validate_model_supportability(model_obj):
                model_obj = self.__class__(model=model_obj)
                model_obj._is_model_installed = True  # Trained model is returned by function call.
            return model_obj

        if isinstance(self.modelObj, pd.DataFrame):
            return self.__get_obj_attributes_multi_model(name)

        attribute_instance = getattr(self.modelObj, name)

        if callable(attribute_instance):
            return __opensource_method_invoker

        if self.__class__._validate_model_supportability(attribute_instance):
            # An sklearn model is returned from the attribute. Create an
            # _SkLearnObjectWrapper object.
            model_obj = self.__class__(model=attribute_instance)
            model_obj._is_model_installed = True  # Trained model is returned as attribute.
            return model_obj

        return attribute_instance

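    # Illustrative sketch (standalone, hypothetical class): the delegation pattern
    # used by __getattr__ above. __getattr__ fires only when normal attribute lookup
    # fails, so unknown names are resolved on the wrapped object, and callables are
    # intercepted so their return values could be post-processed (re-wrapped).
    #   >>> class Wrapper:
    #   ...     def __init__(self, obj): self._obj = obj
    #   ...     def __getattr__(self, name):
    #   ...         attr = getattr(self._obj, name)
    #   ...         if callable(attr):
    #   ...             def invoker(*args, **kwargs):
    #   ...                 return attr(*args, **kwargs)  # Hook point for re-wrapping.
    #   ...             return invoker
    #   ...         return attr
    #   >>> Wrapper([3, 1, 2]).count(1)
    #   1
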
    @classmethod
    def _validate_model_supportability(cls, model):
        """
        Internal function to validate whether the model provided for deployment is supported
        by teradataml's opensourceML.
        """
        error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
                                         "The given model is not a supported opensource model.")
        msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
        package_name = None
        class_name = None
        try:
            # For scikit-learn, model.__module__ is similar to 'sklearn.linear_model._base'.
            # TODO: check for other supported packages.
            if hasattr(model, "__module__"):
                package_name = model.__module__.split(".")[0]
                if package_name not in OpenSourcePackage.values():
                    return False
            if hasattr(model, "__class__"):
                class_name = model.__class__.__name__
        except Exception as ex:
            # In case access to model.__module__ fails.
            raise TeradataMlException(error_msg, msg_code) from ex

        # True only if the package name is the opensource package name and the class name
        # is not an internal class.
        return bool(package_name and class_name and
                    package_name == cls.OPENSOURCE_PACKAGE_NAME.value and
                    not class_name.startswith("_"))

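    # Illustrative sketch: the package check above keys off model.__module__, whose
    # first dotted component names the defining package, and off the class name's
    # leading underscore to filter out internal classes. A real sklearn example:
    #   >>> from sklearn.linear_model import LinearRegression
    #   >>> LinearRegression().__module__.split(".")[0]
    #   'sklearn'
    #   >>> LinearRegression().__class__.__name__.startswith("_")
    #   False
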
    def _save_model(self, model_name, replace_if_exists=False):
        """
        Internal function to save the model stored in the file at the location mentioned by
        class variable "model_file_path_local" to Vantage, using the BYOM methods save_byom()
        and delete_byom() based on the value of the "replace_if_exists" argument.
        """
        # Create a table in Vantage, if it doesn't exist, to store the model info.
        conn = get_connection()
        osml_models_table_exists = conn.dialect.has_table(conn,
                                                          table_name=_OSML_MODELS_TABLE_NAME,
                                                          schema=self._db_name,
                                                          table_only=True)
        if not osml_models_table_exists:
            all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
            all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
            _create_table(table_name=_OSML_MODELS_TABLE_NAME, columns=all_columns,
                          primary_index=_OSML_MODELS_PRIMARY_INDEX, schema_name=self._db_name)

        model_obj = OpensourceModels(is_default_partition_value=self._is_default_partition_value_fit,
                                     partition_file_prefix=self._model_file_name_prefix,
                                     fit_partition_columns_non_default=self._fit_partition_colums_non_default,
                                     model=self.modelObj,
                                     pos_args=self.pos_args,
                                     key_args=self.kwargs,
                                     osml_class=self.__class__.__name__,
                                     osml_module=self.__module__)

        # Save the model object to a file to be used in save_byom() for writing to the
        # Vantage table.
        file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
        with open(file_name, "wb+") as fp:
            fp.write(pickle.dumps(model_obj))

        try:
            save_byom(model_id=model_name,
                      model_file=file_name,
                      table_name=_OSML_MODELS_TABLE_NAME,
                      additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
                      additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
        except TeradataMlException as ex:
            model_exists_msg = Messages.get_message(MessageCodes.MODEL_ALREADY_EXISTS, model_name)
            if not replace_if_exists and model_exists_msg == str(ex):
                raise
            elif replace_if_exists and model_exists_msg == str(ex):
                # Delete the model from the models table and save it again.
                delete_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME)
                save_byom(model_id=model_name,
                          model_file=file_name,
                          table_name=_OSML_MODELS_TABLE_NAME,
                          additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
                          additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
            else:
                raise
        finally:
            os.remove(file_name)

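    # Illustrative sketch (generic, not the BYOM API): the save/replace flow above -
    # pickle the state to a temporary file, attempt the save, and on an "already
    # exists" failure either re-raise or delete-then-save, always removing the
    # temporary file. `store`, a plain dict, is a hypothetical stand-in for the
    # Vantage models table.
    #   >>> import os, pickle, tempfile
    #   >>> store = {}
    #   >>> def save(name, payload, replace=False):
    #   ...     fd, path = tempfile.mkstemp()
    #   ...     try:
    #   ...         with os.fdopen(fd, "wb") as fp:
    #   ...             fp.write(pickle.dumps(payload))
    #   ...         if name in store and not replace:
    #   ...             raise ValueError(f"Model '{name}' already exists.")
    #   ...         store.pop(name, None)              # delete_byom() equivalent.
    #   ...         with open(path, "rb") as fp:       # save_byom() equivalent.
    #   ...             store[name] = fp.read()
    #   ...     finally:
    #   ...         os.remove(path)
    #   >>> save("m1", {"coef": [1, 2]})
    #   >>> save("m1", {"coef": [3]}, replace=True)
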
    @classmethod
    def _deploy(cls, model_name, model, replace_if_exists=False):
        """
        Internal function to create an instance of the class using the model and deploy
        the model to Vantage.
        """
        is_model_supportable = cls._validate_model_supportability(model=model)
        if not is_model_supportable:
            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED,
                                                           "deploy", "The given model is not a supported opensource model."),
                                      MessageCodes.MODEL_CATALOGING_OPERATION_FAILED)

        obj = cls(model=model)
        # Load the model file into the Vantage node as the file can be used in
        # predict or other operations.
        obj._install_initial_model_file(False)

        obj._save_model(model_name, replace_if_exists)

        return obj

    @classmethod
    def _load(cls, model_name):
        """
        Internal function to load the model corresponding to the package (like sklearn etc.)
        from Vantage to the client using retrieve_byom(), and create an instance of the class
        if the model is from the same package.
        """
        try:
            model = retrieve_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME,
                                  return_addition_columns=True)
        except TeradataMlException as ex:
            # Not showing the table name in the error message as it is an internal table.
            part_msg = f"Model '{model_name}' not found in the table "
            if part_msg in str(ex):
                raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name, ""),
                                          MessageCodes.MODEL_NOT_FOUND)
            raise

        model_vals_list = model.get_values()[0]
        # List of 3 elements:
        # - model name as the index column,
        # - 1st element contains the model object with fields such as
        #   is_default_partition_value, partition_file_prefix and model,
        # - 2nd element contains the package name.
        model_obj = pickle.loads(model_vals_list[0])
        model = model_obj.model
        osml_module = model_obj.osml_module if hasattr(model_obj, "osml_module") else None
        osml_class = model_obj.osml_class if hasattr(model_obj, "osml_class") else None

        new_cls = cls
        if osml_module is not None and osml_class is not None:
            new_cls = getattr(import_module(osml_module), osml_class)

        package = model_vals_list[1]

        if package != new_cls.OPENSOURCE_PACKAGE_NAME.value:
            # Raise an error when trying to access a model of a different package.
            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
                                                           f". Requested model is from '{package}' package"),
                                      MessageCodes.MODEL_NOT_FOUND)

        if isinstance(model, pd.DataFrame):
            # Create a new instance of the class and set the model object on the instance.
            # Instantiation can take only a model, not the model object. Hence, passing one
            # of the models from the pandas DataFrame and updating modelObj and other
            # fields later.
            new_cls = new_cls(model=model.iloc[1, 2])
            new_cls.modelObj = model
            new_cls._fit_partition_unique_values = [lst[:len(model_obj.fit_partition_columns_non_default)]
                                                    for lst in model.values.tolist()]
        else:
            new_cls = new_cls(model=model)

        new_cls._model_file_name_prefix = model_obj.partition_file_prefix
        new_cls._is_default_partition_value_fit = model_obj.is_default_partition_value
        new_cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
        new_cls.pos_args = model_obj.pos_args
        new_cls.kwargs = model_obj.key_args

        # Load the model file into the Vantage node as the file can be used in
        # predict or other operations.
        new_cls._install_initial_model_file(False)

        return new_cls

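    # Illustrative sketch: the dynamic class resolution used by _load() above - the
    # stored module and class names are turned back into a class object at load time.
    # The module/class names here are real stdlib examples, not OSML classes.
    #   >>> from importlib import import_module
    #   >>> klass = getattr(import_module("collections"), "OrderedDict")
    #   >>> klass()
    #   OrderedDict()
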
    def deploy(self, model_name, replace_if_exists=False):
        """
        DESCRIPTION:
            Deploys the model held by the interface object to Vantage.

        PARAMETERS:
            model_name:
                Required Argument.
                Specifies the unique name of the model to be deployed.
                Types: str

            replace_if_exists:
                Optional Argument.
                Specifies whether to replace the model if a model with the same name already
                exists in Vantage. If this argument is set to False and a model with the same
                name already exists, then the function raises an exception.
                Default Value: False
                Types: bool

        RETURNS:
            The opensource object wrapper.

        RAISES:
            TeradataMlException if a model with "model_name" already exists and the argument
            "replace_if_exists" is set to False.

        EXAMPLES:
            ## sklearn examples.

            # Import the required libraries and create a LinearRegression opensource object wrapper.
            >>> from teradataml import td_sklearn
            >>> model = td_sklearn.LinearRegression(normalize=True)
            >>> model
            LinearRegression(normalize=True)

            # Example 1: Deploy the model held by the LinearRegression opensource object to Vantage.
            >>> lin_reg = model.deploy("linreg_model_ver_2")
            Model is saved.
            >>> lin_reg
            LinearRegression(normalize=True)

            # Example 2: Deploy the model held by the LinearRegression opensource object to Vantage
            # with the same name as a model that already exists in Vantage.
            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
            Model is deleted.
            Model is saved.
            >>> lin_reg
            LinearRegression(normalize=True)

            ## lightgbm examples.

            # For LightGBM, there are two types of models created by the `td_lightgbm` interface object:
            # - the model object created using LGBMClassifier or another class of the lightgbm.sklearn module,
            # - the model object created using the train() method (an object of the lightgbm.Booster class)
            #   or a standalone object of the lightgbm.Booster class.

            # Import the required libraries and create an LGBMClassifier opensource object wrapper.
            >>> from teradataml import td_lightgbm
            >>> model = td_lightgbm.LGBMClassifier()
            >>> model
            LGBMClassifier()

            # Example 1: Deploy the model held by the LGBMClassifier opensource object to Vantage.
            >>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2")
            Model is saved.
            >>> lgbm_cls
            LGBMClassifier()

            # Example 2: Deploy the model held by the LGBMClassifier opensource object to Vantage
            # with the same name as a model that already exists in Vantage.
            >>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2", replace_if_exists=True)
            Model is deleted.
            Model is saved.
            >>> lgbm_cls
            LGBMClassifier()

            # Example 3: Deploy a model trained using the td_lightgbm.train() function to Vantage.
            # Create a Dataset object, assuming df_x and df_y are the feature and label teradataml
            # DataFrames.
            >>> lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)
            >>> lgbm_data
            <lightgbm.basic.Dataset object at ....>

            # Train the model using the `td_lightgbm` interface object.
            >>> model = td_lightgbm.train(params={}, train_set=lgbm_data, num_boost_round=30, valid_sets=[lgbm_data])
            [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
            You can set `force_row_wise=true` to remove the overhead.
            And if memory is not enough, you can set `force_col_wise=true`.
            [LightGBM] [Info] Total Bins 532
            [LightGBM] [Info] Number of data points in the train set: 400, number of used features: 4
            [1] valid_0's l2: 0.215811
            [2] valid_0's l2: 0.188138
            [3] valid_0's l2: 0.166146
            ...
            ...
            [29] valid_0's l2: 0.042255
            [30] valid_0's l2: 0.0416953

            # Deploy the model to Vantage.
            >>> lgb_model = model.deploy("lgbm_train_model_ver_2")
            >>> lgb_model
            <lightgbm.basic.Booster object at ...>
        """

        # Install the model file into Vantage, if not installed.
        self._install_initial_model_file()

        self._save_model(model_name, replace_if_exists)
        return self


class _FunctionWrapper(_GenericObjectWrapper):
    def __init__(self, module_name, func_name, file_type, template_file):
        super().__init__()
        self._module_name = module_name
        self._func_name = func_name
        self._params = None
        self._data_args = OrderedDict()
        self._template_file = template_file
        self._script_file_name = _generate_new_name(type=file_type, extension="py")

    def __call__(self, **kwargs):
        """
        Run the function with all the arguments passed from the `td_sklearn.<function_name>`
        function.
        """
        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
            else f"./{self._db_name}/{self._script_file_name}"

        model_file_prefix = None
        if self._is_lake_system:
            model_file_prefix = self._script_file_name.replace(".py", "")

        py_exc = UtilFuncs._get_python_execution_path()
        script_command = f"{py_exc} {script_file_path} {model_file_prefix} {self._is_lake_system}"

        model_type = BLOB() if self._is_lake_system else CLOB()

        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols] + [(self._func_name, model_type)]

        replace_dict.update({"<module_name>": self._module_name,
                             "<func_name>": self._func_name,
                             "<params>": json.dumps(kwargs)})

        # Generate a new file in the .teradataml directory and install it to Vantage.
        self._prepare_and_install_file(replace_dict=replace_dict)

        try:
            self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
            self._model_data._index_label = None

            fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

            self._extract_model_objs(n_unique_partitions=len(fit_partition_unique_values),
                                     n_partition_cols=len(partition_cols))

        except Exception:
            # File cleanup if script execution fails or the modelObj cannot be fetched.
            os.remove(self._script_file_local)
            self._remove_script_file(self._script_file_name)
            raise

        # File cleanup after processing.
        os.remove(self._script_file_local)
        self._remove_script_file(self._script_file_name)

        return self.modelObj
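
# Illustrative sketch (simplified): how a script template with placeholders such as
# <module_name>, <func_name> and <params> can be materialized via a replace_dict,
# in the spirit of the _prepare_and_install_file(replace_dict=...) call above.
# The template text here is hypothetical, not the library's actual template file.
#   >>> import json
#   >>> template = "import <module_name>\nresult = <module_name>.<func_name>(**json.loads('<params>'))"
#   >>> replace_dict = {"<module_name>": "numpy", "<func_name>": "zeros",
#   ...                 "<params>": json.dumps({"shape": 3})}
#   >>> script = template
#   >>> for k, v in replace_dict.items():
#   ...     script = script.replace(k, v)
#   >>> print(script)
#   import numpy
#   result = numpy.zeros(**json.loads('{"shape": 3}'))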