teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/opensource/_sklearn.py (new file)
@@ -0,0 +1,1008 @@
+# ##################################################################
+#
+# Copyright 2024 Teradata. All rights reserved.
+# TERADATA CONFIDENTIAL AND TRADE SECRET
+#
+# Primary Owner: Adithya Avvaru (adithya.avvaru@teradata.com)
+# Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
+#
+# Version: 1.0
+# Function Version: 1.0
+#
+# This file contains object wrapper class for scikit-learn opensource package.
+#
+# ##################################################################
+
+import inspect
+import math
+import time
+
+import numpy
+import pandas as pd
+import pandas.api.types as pt
+from teradatasqlalchemy.types import (BLOB, CLOB, FLOAT, INTEGER, TIMESTAMP,
+                                      VARCHAR)
+
+from teradataml.common.utils import UtilFuncs
+from teradataml.dataframe.copy_to import _get_sqlalchemy_mapping
+from teradataml.opensource._base import (_FunctionWrapper,
+                                         _OpenSourceObjectWrapper)
+from teradataml.opensource._constants import OpenSourcePackage
+from teradataml.opensource._wrapper_utils import (
+    _derive_df_and_required_columns, _validate_fit_run,
+    _validate_opensource_func_args)
+from teradataml.utils.utils import execute_sql
+from teradataml.utils.validators import _Validators
+
+
+class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
+
+    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+    _pkgs = ["scikit-learn", "numpy", "scipy"]
+
+    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+
+        super().__init__(model=model, module_name=module_name, class_name=class_name,
+                         pos_args=pos_args, kwargs=kwargs)
+
+        self._initialize_variables(table_name_prefix="td_sklearn_")
+        if model is not None:
+            self.modelObj = model
+            self.module_name = model.__module__.split("._")[0]
+            self.class_name = model.__class__.__name__
+            # __dict__ gets all the arguments as dictionary including default ones and positional
+            # args.
+            self.kwargs = model.__dict__
+            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
+        else:
+            self._initialize_object()
+
+    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
+                                    skip_either_or_that=False):
+        """
+        Internal function to validate arguments passed to exposed opensource APIs and return
+        parent DataFrame, feature columns, label columns, group columns, data partition columns.
+        """
+        _validate_opensource_func_args(X=X, y=y, groups=groups,
+                                       fit_partition_cols=self._fit_partition_colums_non_default,
+                                       kwargs=kwargs,
+                                       skip_either_or_that=skip_either_or_that)
+        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
+                                               fit_partition_cols=self._fit_partition_colums_non_default)
+
+    def _run_fit_related_functions(self,
+                                   data,
+                                   feature_columns,
+                                   label_columns,
+                                   partition_columns,
+                                   func,
+                                   classes=None,
+                                   file_name="sklearn_fit.py"):
+        """
+        Internal function to run fit() and partial_fit() functions.
+        """
+        label_columns = self._get_columns_as_list(label_columns)
+
+        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
+                                                                                feature_columns,
+                                                                                label_columns,
+                                                                                partition_columns)
+
+        model_type = BLOB() if self._is_lake_system else CLOB()
+        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in new_partition_columns] + [("model", model_type)]
+
+        if classes:
+            class_type = type(classes[0]).__name__
+            classes = "--".join([str(x) for x in classes])
+        else:
+            classes = str(None)
+            class_type = str(None)
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
+
+        # db_name is applicable for enterprise system.
+        db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
+            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
+
+        # Get unique values in partitioning columns.
+        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
+
+        self._install_initial_model_file()
+
+        self._model_data = self._run_script(data, script_command, new_partition_columns,
+                                            return_types)
+
+        self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
+
+    def partial_fit(self, X=None, y=None, classes=None, **kwargs):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        st_time = time.time()
+
+        # "classes" argument validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["classes", classes, True, (list)])
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
+
+        data, feature_columns, label_columns, _, partition_columns = \
+            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
+
+        if partition_columns:
+            self._is_default_partition_value_fit = False
+            self._fit_partition_colums_non_default = partition_columns
+
+        self._run_fit_related_functions(data,
+                                        feature_columns,
+                                        label_columns,
+                                        partition_columns,
+                                        inspect.stack()[0][3],
+                                        classes)
+
+        self._partial_fit_execution_time = time.time() - st_time
+
+        return self
+
+    def fit(self, X=None, y=None, **kwargs):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        st_time = time.time()
+
+        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
+
+        data, feature_columns, label_columns, _, partition_columns = \
+            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
+
+        if partition_columns:
+            self._is_default_partition_value_fit = False
+            self._fit_partition_colums_non_default = partition_columns
+
+        file_name = kwargs.pop("file_name", None)
+        func_name = kwargs.pop("name", "fit")
+
+        args = {"data": data,
+                "feature_columns": feature_columns,
+                "label_columns": label_columns,
+                "partition_columns": partition_columns,
+                "func": func_name}
+
+        if file_name is not None:
+            args["file_name"] = file_name
+
+        self._run_fit_related_functions(**args)
+
+        self._fit_execution_time = time.time() - st_time
+
+        return self
+
+    def set_params(self, **params):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        for key, val in params.items():
+            self.kwargs[key] = val
+
+        # Initialize with new arguments and return the class/model object.
+        # set_params takes all keyword arguments and no positional arguments.
+        self.__init__(None, self.module_name, self.class_name, tuple(), self.kwargs)
+        return self
+
+    # get_params() will be executed through __getattr__().
+
+    # @_validate_fit_run
+    def __getattr__(self, name):
+        def __run_transform(*c, **kwargs):
+            kwargs["name"] = name
+            return self._transform(*c, **kwargs)
+
+        def __run_function_needing_all_rows(*c, **kwargs):
+            kwargs["name"] = name
+            return self._run_function_needing_all_rows(*c, **kwargs)
+
+        def __run_kneighbors(*c, **kwargs):
+            kwargs["name"] = name
+            return self._run_neighbors(*c, **kwargs)
+
+        if name in ["score", "aic", "bic", "perplexity"]:
+            # TODO: ELE-6352 - Implement error_norm() function later.
+            return __run_function_needing_all_rows
+
+        if name in ["kneighbors",
+                    "radius_neighbors",
+                    "kneighbors_graph",
+                    "radius_neighbors_graph"]:
+            return __run_kneighbors
+
+        if name in ["predict",
+                    "transform",
+                    "inverse_transform",
+                    "predict_proba",
+                    "predict_log_proba",
+                    "decision_function",
+                    "score_samples",
+                    "decision_path",
+                    "apply",
+                    "cost_complexity_pruning_path",
+                    "gibbs",
+                    "kneighbors_graph",
+                    "radius_neighbors_graph",
+                    "mahalanobis",
+                    "correct_covariance",
+                    "reweight_covariance",
+                    "path"]:
+            return __run_transform
+
+        return super().__getattr__(name)
+
+    def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
+                                      func_name, **kwargs):
+        """
+        Internal function to handle multi model case for transform function for functions
+        ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of feature_selection module
+        and "Birch" of cluster module.
+        These functions generate multiple models and when transform is applied to each model, it generates
+        output with different number of columns.
+        """
+        skl_objs_dict = {}
+        no_of_unique_partitions = len(self._fit_partition_unique_values)
+        no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
+
+        # Run on 10 rows of data individually using corresponding scikit-learn objects based on partition value
+        # and get the maximum number of columns and their types.
+        for i in range(no_of_unique_partitions):
+            skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
+
+
+        data = data.select(feature_columns + label_columns + partition_columns)
+        ten_row_data = data.head(10).get_values()
+        X = numpy.array(ten_row_data)
+
+        # For multi-model case, model in one AMP can give more number of columns than other AMPs.
+        # Returns clause can't contain different number of columns in different AMPs. Hence, taking
+        # maximum number of columns and their types from all models.
+        max_no_of_columns = 0
+        max_col_names = []
+        max_col_types = []
+
+        def _get_input_row_without_nans(row):
+            """
+            `inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
+            """
+            X1 = []
+            for _, v in enumerate(row):
+                if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
+                    # Add to list when:
+                    # - v is None or
+                    # - v is string or
+                    # - v is not nan or
+                    # - if module is impute (which transforms nan values) even though v is nan.
+                    X1.append(v)
+                else:
+                    # skip nan values.
+                    pass
+            return X1
+
+        for i in range(X.shape[0]):
+            # Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
+            partition_values = tuple(X[i, -no_of_partitioning_cols:])
+            skl_obj = skl_objs_dict[partition_values]
+
+            X1 = X[i, :-no_of_partitioning_cols]
+            # Since Nans/NULLs are added in transform for last columns where some models generated
+            # less number of columns, removing Nans/NULLs from the input row for inverse_transform
+            # using function _get_input_row_without_nans().
+            X1 = numpy.array([_get_input_row_without_nans(X1)])
+
+            trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
+
+            no_of_columns = 1
+
+            if trans_opt.shape == (X1.shape[0],):
+                trans_opt = trans_opt.reshape(X1.shape[0], 1)
+
+            if isinstance(trans_opt[0], numpy.ndarray) \
+                    or isinstance(trans_opt[0], list) \
+                    or isinstance(trans_opt[0], tuple):
+                no_of_columns = len(trans_opt[0])
+
+            col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
+
+            # Get new column sqlalchemy types for pandas df columns of transform output.
+            opt_pd = pd.DataFrame(trans_opt)
+
+            # Get output column types for each column in pandas df from the output of transform
+            # type functions.
+            types = {}
+            for idx in range(no_of_columns):
+                col = list(opt_pd.columns)[idx]
+
+                # Only one row in trans_opt.
+                if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
+                    type_ = type(trans_opt[0][idx])
+                else:
+                    # only one value in the output.
+                    type_ = type(trans_opt[0])
+
+                # If type of the output value (trans_opt) is None, then use `str` as type since
+                # pandas astype() does not accept None type.
+                if type_ is type(None):
+                    type_ = str
+
+                # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
+                # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
+                # Error while type casting for column '2'"
+                # Hence, using pd.Int64Dtype() for integer columns with nan values.
+                types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
+
+            # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
+            opt_pd = opt_pd.astype(types)
+
+            # If the datatype is not specified, then check if the datatype is datetime64 and timezone is present; if so, map it to
+            # TIMESTAMP(timezone=True), else map it according to default value.
+            col_types = [TIMESTAMP(timezone=True)
+                         if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
+                         else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
+                         for key, col_name in enumerate(list(opt_pd.columns))]
+
+            # Different models in multi model case can generate different number of output columns, for example in
+            # SelectFpr. Hence, taking the model which generates maximum number of columns.
+            if no_of_columns > max_no_of_columns:
+                max_no_of_columns = no_of_columns
+                max_col_names = col_names
+                max_col_types = col_types
+
+        return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
+
+    def _execute_function_locally(self, ten_row_data, feature_columns, label_columns, openml_obj,
+                                  func_name, **kwargs):
+        """
+        Executes an opensourceml function of the class object "openml_obj" on the provided data locally.
+        Parameters:
+            ten_row_data (list or array-like): The input data containing rows to be processed.
+            feature_columns (list): List of feature column names.
+            label_columns (list): List of label column names.
+            openml_obj (object): The opensourceml object on which the function is to be executed.
+            func_name (str): The name of the function to be executed on the opensourceml object.
+            **kwargs: Additional keyword arguments to be passed to the opensourceml function.
+        Returns:
+            numpy.ndarray: The transformed output from the opensource function.
+        Raises:
+            NotImplementedError: If the function name is "path", which is not implemented.
+        """
+
+        X = numpy.array(ten_row_data)
+
+        if label_columns:
+            n_f = len(feature_columns)
+            n_c = len(label_columns)
+            y = X[:, n_f:n_f + n_c]
+            X = X[:, :n_f]
+            # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
+            # in local run if passed. Generally, 'y' is passed to return y along with actual output.
+            try:
+                trans_opt = getattr(openml_obj, func_name)(X, y, **kwargs)
+            except TypeError as ex:
+                # Function which does not accept 'y' like predict_proba() raises error like
+                # "predict_proba() takes 2 positional arguments but 3 were given".
+                trans_opt = getattr(openml_obj, func_name)(X, **kwargs)
+        else:
+            trans_opt = getattr(openml_obj, func_name)(X, **kwargs)
+
+        if func_name == "path":
+            raise NotImplementedError(
+                "path() returns tuple of ndarrays of different shapes. Not Implemented yet."
+            )
+
+        if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
+            trans_opt = trans_opt.reshape(X.shape[0], 1)
+
+        return trans_opt
+
+    def _get_return_columns_for_function_(self,
+                                          data,
+                                          feature_columns,
+                                          label_columns,
+                                          partition_columns,
+                                          func_name,
+                                          kwargs):
+        """
+        Internal function to return list of column names and their sqlalchemy types
+        which should be used in return_types of Script.
+        """
+        if func_name == "fit_predict":
+            """
+            Get return columns using label_columns.
+            """
+            return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
+                     data._td_column_names_and_sqlalchemy_types[col.lower()])
+                    for i, col in enumerate(label_columns)]
+
+        if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
+            """
+            Return predict columns using either label_columns (if provided) or
+            self._fit_label_columns_types (if the function is trained using label columns).
+            Otherwise run predict on ten rows of data to get the number of columns and their types
+            after this if condition.
+            """
+            if label_columns:
+                return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
+                         data._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for i, col in enumerate(label_columns)]
+            if self._fit_label_columns_types:
+                return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
+                        for i, col_type in enumerate(self._fit_label_columns_types)]
+
+        ## If function is not `fit_predict`:
+        # then take one row of transform/other functions to execute in client
+        # to get number of columns in return clause and their Vantage types.
+
+        # For partitioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
+        # Just for getting the number of columns and their types, using only one model of all.
+        if len(self._fit_partition_unique_values) == 1:
+            # Single model case.
+            skl_obj = self.modelObj
+        else:
+            # Multi model case.
+            if (func_name in ["transform", "inverse_transform"] and \
+                self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
+               (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
+                # Special handling for multi model case for transform function as these classes
+                # generate transform output with different number of columns for each model.
+                # Hence, need to add Nulls/Nans to columns which are not present in the transform output of
+                # some models.
+                return self._special_handling_multimodel_(data, feature_columns, label_columns,
+                                                          partition_columns, func_name, **kwargs)
+
+            skl_obj = self.modelObj.iloc[0]["model"]
+
+        data = data.select(feature_columns + label_columns)
+
+        ten_row_data = data.head(10).get_values()
+
+        trans_opt = self._execute_function_locally(ten_row_data, feature_columns, label_columns,
+                                                   skl_obj, func_name, **kwargs)
+
+        if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
+            no_of_columns = trans_opt.get_shape()[1]
+            trans_opt = trans_opt.toarray()
+        elif isinstance(trans_opt, dict):
+            raise NotImplementedError(f"Output returns dictionary {trans_opt}. NOT implemented yet.")
+        elif isinstance(trans_opt[0], numpy.ndarray) \
+                or isinstance(trans_opt[0], list) \
+                or isinstance(trans_opt[0], tuple):
+            no_of_columns = len(trans_opt[0])
+        else:
+            no_of_columns = 1
+
+        # Special handling when inverse_transform returns fewer columns than the
+        # number of classes. Such columns are filled with NaN values.
+        # Updating number of columns here (new columns with NaN values will be added).
+        if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
+            no_of_columns = len(self.classes_)
+            for i in range(len(ten_row_data)):
+                trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
+
+        # Special handling required for cross_decomposition classes's transform function, which
+        # takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
+        # y_scores. If label columns are not provided, only x_scores are returned.
+        if self.module_name == "sklearn.cross_decomposition" and func_name == "transform":
+            # For cross_decomposition, output is a tuple of arrays when label columns are provided
+            # along with feature columns for transform function. In this case, concatenate the
+            # arrays and return the column names accordingly.
+            if isinstance(trans_opt, tuple):  # tuple when label_columns is provided.
+                assert trans_opt[0].shape == trans_opt[1].shape,\
+                    "Output arrays should be of same shape when transform/fit_transform is run "\
+                    "with label columns for cross_decomposition classes."
+                first_cols = [f"x_scores_{(i + 1)}" for i in range(trans_opt[0].shape[1])]
+                second_cols = [f"y_scores_{(i + 1)}" for i in range(trans_opt[1].shape[1])]
+                no_of_columns = trans_opt[0].shape[1] + trans_opt[1].shape[1]
+                col_names = first_cols + second_cols
+
+                trans_opt = numpy.concatenate(trans_opt, axis=1)
+            else:
+                assert isinstance(trans_opt, numpy.ndarray), "When transform/fit_transform is run "\
+                    "without label columns for cross_decomposition classes, "\
+                    "output should be a numpy array."
+                no_of_columns = trans_opt.shape[1]
+                col_names = [f"x_scores_{(i + 1)}" for i in range(trans_opt.shape[1])]
+        else:
+            # Generate list of new column names.
+            col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
+
+        # Get new column sqlalchemy types for pandas df columns of transform output.
+        opt_pd = pd.DataFrame(trans_opt)
+
+        # Get output column types for each column in pandas df from the output of transform
+        # type functions.
+        types = {}
+        for idx, col in enumerate(list(opt_pd.columns)):
+            types_ = []
+            # Get type of column using data from all rows, in case if the column has None values.
+            # 'and' of types of all values in the column with type(None) gives the type of the column.
+            type_ = type(None)
+            for i in range(len(trans_opt)):
+                type_ = type_ and type(trans_opt[i][idx])
+                types_.append(type_)
+
+            # If all the values of the output (trans_opt) are None, then use `str` as type since
+            # pandas astype() does not accept None type.
+            if type_ is type(None):
+                type_ = str
+
+            # MultiLabelBinarizer with string (non-numeric) labels can produce a column having both
+            # string and float values. Handling this case separately here.
+            if str in types_ and float in types_:
+                types[col] = str
+            # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
+            # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
+            # Error while type casting for column '2'"
+            # Hence, using pd.Int64Dtype() for integer columns with nan values.
+            else:
+                types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
+
+
+        # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
+        opt_pd = opt_pd.astype(types)
+
+        # If the datatype is not specified, then check if the datatype is datetime64 and timezone is present; if so, map it to
+        # TIMESTAMP(timezone=True), else map it according to default value.
+        col_types = [TIMESTAMP(timezone=True)
+                     if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
+                     else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
+                     for key, col_name in enumerate(list(opt_pd.columns))]
+
+        return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
+
+    @_validate_fit_run
+    def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
+        """
+        Internal function to run functions like score, aic, bic which need all rows and return
+        one floating point number as result.
+        """
+        st_time = time.time()
+
+        assert kwargs["name"], "function name should be passed."
+        func_name = kwargs["name"]
+
+        # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
+        kwargs.pop("name")
+
+        data, feature_columns, label_columns, _, partition_columns = \
+            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
+
+        label_columns = self._get_columns_as_list(label_columns)
+
+        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
+                                                                                feature_columns,
+                                                                                label_columns,
+                                                                                partition_columns)
+
+        script_file_path = f"{file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{file_name}"
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
+
+        self._validate_unique_partition_values(data, new_partition_columns)
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {self._is_lake_system}"
+
+        # score, aic, bic returns float values.
+        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in new_partition_columns] + [(func_name, FLOAT())]
+
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
+
+        opt = self._run_script(data, script_command, new_partition_columns, return_types)
+
+        self._score_execution_time = time.time() - st_time
+
+        if self._is_default_partition_value_fit:
+            # For single model case, partition column is internally generated and
+            # no point in returning it to the user.
+            return opt.select(func_name)
+
+        return opt
+
+    @_validate_fit_run
+    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
+        """
+        Internal function to run predict/transform and similar functions, which return
+        multiple columns. This function will return data row along with the generated
+        columns' row data, unlike sklearn's functions which return just output data.
+        """
+        st_time = time.time()
+
+        assert kwargs["name"], "function name should be passed."
+        func_name = kwargs["name"]
+
+        # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
+        kwargs.pop("name")
+
+        data, feature_columns, label_columns, _, partition_columns = \
+            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
+
+        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
+                                                                                feature_columns,
+                                                                                label_columns,
+                                                                                partition_columns)
+
+        # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
+        self._remove_data_related_args_from_kwargs(kwargs)
+
+        script_file_path = f"{file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{file_name}"
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
+
+        self._validate_unique_partition_values(data, new_partition_columns)
+
+        return_columns_python_types = None
+        if self._fit_label_columns_python_types:
+            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
+
+        # Returning feature columns also along with transformed columns because we don't know the
+        # mapping of feature columns to the transformed columns.
+        ## 'correct_covariance()' returns the (n_features, n_features)
+        if func_name == "correct_covariance":
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in new_partition_columns]
+        else:
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in (new_partition_columns + feature_columns)]
+        if func_name in ["predict", "decision_function"] and label_columns:
+            return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                             for col in label_columns]
+
+        output_cols_types = self._get_return_columns_for_function_(data,
+                                                                   feature_columns,
+                                                                   label_columns,
+                                                                   new_partition_columns,
+                                                                   func_name,
+                                                                   kwargs)
+        return_types += output_cols_types
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
+            f"{return_columns_python_types}"
+
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
+
+        opt = self._run_script(data, script_command, new_partition_columns, return_types)
+
+        self._transform_execution_time = time.time() - st_time
+
+        return self._get_returning_df(opt, new_partition_columns, return_types)
+
+    def fit_predict(self, X=None, y=None, **kwargs):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        st_time = time.time()
+
+        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
+
+        data, feature_columns, label_columns, _, partition_columns = \
+            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
+
+        if partition_columns:
+            self._is_default_partition_value_fit = False
+
+        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
+                                                                                feature_columns,
+                                                                                label_columns,
+                                                                                partition_columns)
+
+        # Return label_columns also if user provides in the function call.
+        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in (new_partition_columns + feature_columns + label_columns)]
+
+        func_name = inspect.stack()[0][3]
+        if label_columns:
+            return_types += self._get_return_columns_for_function_(data,
+                                                                   feature_columns,
+                                                                   label_columns,
+                                                                   new_partition_columns,
+                                                                   func_name,
+                                                                   {})
+        else:
+            # If there are no label_columns, we will have only one
+            # predicted column.
+            return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
+
+        file_name = "sklearn_fit_predict.py"
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
+
+        script_file_name = f"{file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{file_name}"
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
+            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {self._is_lake_system}"
+
+        # Get unique values in partitioning columns.
+        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
+
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
+
+        opt = self._run_script(data, script_command, new_partition_columns, return_types)
+
+        self._fit_predict_execution_time = time.time() - st_time
+
+        if self._is_default_partition_value_fit:
+            # For single model case, partition column is internally generated and no point in
+            # returning it to the user.
+
+            # Extract columns from return types.
+            returning_cols = [col[0] for col in return_types[len(new_partition_columns):]]
+            return opt.select(returning_cols)
+
+        return opt
+
+    def fit_transform(self, X=None, y=None, **kwargs):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        # 'y' is not needed for transform().
+        fit_obj = self.fit(X, y, **kwargs)
+        kwargs["label_columns"] = None
+        return fit_obj.transform(X, None, **kwargs)
+
+    @_validate_fit_run
+    def _run_neighbors(self, X=None, **kwargs):
+        """
+        Internal function to run functions like kneighbors, radius_neighbors, kneighbors_graph,
+        radius_neighbors_graph which return multiple columns. This function will return data row
+        along with the generated columns' row data, unlike sklearn's functions which return just
+        output data.
+        """
+        assert kwargs["name"], "function name should be passed."
+        func_name = kwargs["name"]
+        kwargs.pop("name")
+
+        if self.module_name != "sklearn.neighbors":
+            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not have {func_name}() method.")
+
+        data = kwargs.get("data", None)
+        partition_columns = kwargs.get("partition_columns", None)
+
+        if not X and not partition_columns and not data:
+            # If data is not passed, then run from client only.
+            # TODO: decide whether to run from client or from Vantage.
+            opt = super().__getattr__(func_name)(**kwargs)
+            from scipy.sparse.csr import csr_matrix
+            if isinstance(opt, csr_matrix):
+                return opt.toarray()
+            return opt
+
+        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
+
+        data, feature_columns, _, _, new_partition_columns = \
+            self._validate_args_and_get_data(X=X, y=None, groups=None, kwargs=kwargs,
+                                             skip_either_or_that=True)
+
+        # Remove the kwargs data.
+        self._remove_data_related_args_from_kwargs(kwargs)
+
+        if partition_columns:
+            # kwargs are passed to kneighbors function. So, removing them from kwargs.
+            self._is_default_partition_value_fit = False
+
+        # Generating new partition column name.
+        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
+                                                                                feature_columns,
+                                                                                [],
+                                                                                partition_columns)
+
+        args_str = self._get_kwargs_str(kwargs)
+
+        file_name = "sklearn_neighbors.py"
+
+        script_file_path = f"{file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{file_name}"
+
+        # Returning feature columns also along with new columns.
+        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in (new_partition_columns + feature_columns)]
+
+        # `return_distance` is needed as the result is a tuple of two arrays when it is True.
+        return_distance = kwargs.get("return_distance", True)  # Default value is True.
+
+        # Though new columns return numpy arrays, we are returning them as strings.
+        # TODO: Will update to columns later, if requested later.
+        if func_name in ['kneighbors', 'radius_neighbors']:
+            if return_distance:
+                return_types += [("neigh_dist", VARCHAR())]
+            return_types += [("neigh_ind", VARCHAR())]
+        elif func_name in ['kneighbors_graph', 'radius_neighbors_graph']:
+            return_types += [("A", VARCHAR())]
+        else:
+            return_types += [("output", VARCHAR())]
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+            f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
+            f"{args_str}"
+
+        # Get unique values in partitioning columns.
+        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
+
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
+
+        opt = self._run_script(data, script_command, new_partition_columns, return_types)
+
+        return self._get_returning_df(opt, new_partition_columns, return_types)
+
+    def split(self, X=None, y=None, groups=None, **kwargs):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        opt = self._run_model_selection("split", X=X, y=y, groups=groups,
+                                        skip_either_or_that=True, kwargs=kwargs)
+
+        # Get number of splits in the result DataFrame.
+        n_splits = opt.drop_duplicate("split_id").shape[0]
+
+        data = kwargs.get("data", None)
+        feature_columns = kwargs.get("feature_columns", [])
+        label_columns = self._get_columns_as_list(kwargs.get("label_columns", []))
+
+        # If there is no X and y, get feature_columns and label_columns from "data".
+        partition_columns = kwargs.get("partition_columns", [])
+        feature_columns = [col for col in X.columns if col not in partition_columns] \
+            if X and not data and not feature_columns else feature_columns
+        label_columns = y.columns if y and not data and not label_columns else label_columns
+
+        # Return iterator of the train and test dataframes for each split.
+        for i in range(1, n_splits+1):
+            train_df = opt[(opt.split_id == i) & (opt.data_type == "train")]\
+                .select(partition_columns + feature_columns + label_columns)
+            train_df._index_label = None
+            test_df = opt[(opt.split_id == i) & (opt.data_type == "test")]\
+                .select(partition_columns + feature_columns + label_columns)
+            test_df._index_label = None
+
+            yield train_df, test_df
+
+    def get_n_splits(self, X=None, y=None, groups=None, **kwargs):
+        """
+        Please check the description in Docs/OpensourceML/sklearn.py.
+        """
+        return self._run_model_selection("get_n_splits", X=X, y=y, groups=groups,
+                                         skip_either_or_that=True, kwargs=kwargs)
+
+    def _run_model_selection(self,
+                             func_name,
+                             X=None,
+                             y=None,
+                             groups=None,
+                             skip_either_or_that=False,
+                             kwargs={}):
+        """
+        Internal function to run functions like split, get_n_splits of model selection module.
+        - get_n_splits() returns number of splits as value, not as teradataml DataFrame.
+        - split() returns teradataml DataFrame containing train and test data for each split
+          (add partition information if the argument "partition_cols" is provided).
+        """
+        if self.module_name != "sklearn.model_selection":
+            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not "
+                                 f"have {func_name}() method.")
+
+        data = kwargs.get("data", None)
+
+        if not X and not y and not groups and not data:
+            # If data is not passed, then run from client only.
+            # TODO: decide whether to run from client or from Vantage.
+            return super().__getattr__(func_name)()
+
+        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
+
+        data, feature_columns, label_columns, group_columns, partition_columns = \
+            self._validate_args_and_get_data(X=X, y=y, groups=groups, kwargs=kwargs,
+                                             skip_either_or_that=skip_either_or_that)
+
+        if partition_columns:
+            self._is_default_partition_value_fit = False
+
+        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
+                                                                                feature_columns,
+                                                                                label_columns,
+                                                                                partition_columns,
+                                                                                group_columns)
+
+        file_name = "sklearn_model_selection_split.py"
+
+        script_file_path = f"{file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{file_name}"
+
+        if func_name == "split":
+            # Need to generate data into splits of train and test.
+            # split_id - the column which will be used to identify the split.
+            # data_type - the column which will be used to identify whether the row is
+            #             train or test row.
+            return_types = [("split_id", INTEGER()), ("data_type", VARCHAR())]
+            # Returning feature columns and label columns as well.
+            return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                             for col in (feature_columns + label_columns)]
+        else:
+            # Return Varchar by default.
+            # Returns Varchar even for functions like `get_n_splits` which returns large integer
+            # numbers like `4998813702034726525205100` for `LeavePOut` class (when the argument
+            # `p` is 28 and no of data rows is 100) as Vantage cannot scope it to INTEGER.
+            return_types = [(func_name, VARCHAR())]
+
+        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in new_partition_columns] + return_types
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+            f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {self._is_lake_system}"
+
+        # Get unique values in partitioning columns.
+        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
+
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
+
+        opt = self._run_script(data, script_command, new_partition_columns, return_types)
+
+        if func_name == "get_n_splits" and not partition_columns:
+            # Return number of splits as value, not as dataframe.
+            vals = execute_sql("select {} from {}".format(func_name, opt._table_name))
+            opt = vals.fetchall()[0][0]
+
+            # Varchar is returned by the script. Convert it to int.
+            return int(opt)
+
+        return opt
+
+
+class _SKLearnFunctionWrapper(_FunctionWrapper):
+    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+    _pkgs = ["scikit-learn", "numpy", "scipy"]
+    def __init__(self, module_name, func_name):
+        file_type = "file_fn_sklearn"
+        template_file = "sklearn_function.template"
+        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)