teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (151)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +193 -1
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +25 -18
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  8. teradataml/analytics/sqle/__init__.py +20 -2
  9. teradataml/analytics/utils.py +15 -1
  10. teradataml/analytics/valib.py +18 -4
  11. teradataml/automl/__init__.py +341 -112
  12. teradataml/automl/autodataprep/__init__.py +471 -0
  13. teradataml/automl/data_preparation.py +84 -42
  14. teradataml/automl/data_transformation.py +69 -33
  15. teradataml/automl/feature_engineering.py +76 -9
  16. teradataml/automl/feature_exploration.py +639 -25
  17. teradataml/automl/model_training.py +35 -14
  18. teradataml/clients/auth_client.py +2 -2
  19. teradataml/common/__init__.py +1 -2
  20. teradataml/common/constants.py +122 -63
  21. teradataml/common/messagecodes.py +14 -3
  22. teradataml/common/messages.py +8 -4
  23. teradataml/common/sqlbundle.py +40 -10
  24. teradataml/common/utils.py +366 -74
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +348 -86
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/apriori_example.json +22 -0
  29. teradataml/data/byom_example.json +11 -0
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  37. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  38. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  40. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  41. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  42. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  43. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  45. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  47. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  48. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  49. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  51. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  52. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  53. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  54. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  55. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  56. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  57. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  58. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  59. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  60. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  61. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  62. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  63. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  64. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  65. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  66. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  67. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  68. teradataml/data/hnsw_alter_data.csv +5 -0
  69. teradataml/data/hnsw_data.csv +10 -0
  70. teradataml/data/jsons/byom/h2opredict.json +1 -1
  71. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  72. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  73. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  74. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  75. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  76. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  77. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  78. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  79. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  80. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  81. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  82. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  83. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  84. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  85. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  86. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  87. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  88. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  89. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  90. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  91. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  92. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  93. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  94. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
  95. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
  96. teradataml/data/ner_dict.csv +8 -0
  97. teradataml/data/ner_input_eng.csv +7 -0
  98. teradataml/data/ner_rule.csv +5 -0
  99. teradataml/data/pos_input.csv +40 -0
  100. teradataml/data/tdnerextractor_example.json +14 -0
  101. teradataml/data/teradataml_example.json +21 -0
  102. teradataml/data/textmorph_example.json +5 -0
  103. teradataml/data/to_num_data.csv +4 -0
  104. teradataml/data/tochar_data.csv +5 -0
  105. teradataml/data/trans_dense.csv +16 -0
  106. teradataml/data/trans_sparse.csv +55 -0
  107. teradataml/data/vectordistance_example.json +1 -1
  108. teradataml/dataframe/copy_to.py +45 -29
  109. teradataml/dataframe/data_transfer.py +72 -46
  110. teradataml/dataframe/dataframe.py +642 -166
  111. teradataml/dataframe/dataframe_utils.py +167 -22
  112. teradataml/dataframe/functions.py +135 -20
  113. teradataml/dataframe/setop.py +11 -6
  114. teradataml/dataframe/sql.py +330 -78
  115. teradataml/dbutils/dbutils.py +556 -140
  116. teradataml/dbutils/filemgr.py +14 -10
  117. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  118. teradataml/lib/aed_0_1.dll +0 -0
  119. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
  120. teradataml/opensource/_class.py +141 -17
  121. teradataml/opensource/{constants.py → _constants.py} +7 -3
  122. teradataml/opensource/_lightgbm.py +52 -53
  123. teradataml/opensource/_sklearn.py +1008 -0
  124. teradataml/opensource/_wrapper_utils.py +5 -5
  125. teradataml/options/__init__.py +47 -15
  126. teradataml/options/configure.py +103 -26
  127. teradataml/options/display.py +13 -2
  128. teradataml/plot/axis.py +47 -8
  129. teradataml/plot/figure.py +33 -0
  130. teradataml/plot/plot.py +63 -13
  131. teradataml/scriptmgmt/UserEnv.py +307 -40
  132. teradataml/scriptmgmt/lls_utils.py +428 -145
  133. teradataml/store/__init__.py +2 -3
  134. teradataml/store/feature_store/feature_store.py +102 -7
  135. teradataml/table_operators/Apply.py +48 -19
  136. teradataml/table_operators/Script.py +23 -2
  137. teradataml/table_operators/TableOperator.py +3 -1
  138. teradataml/table_operators/table_operator_util.py +58 -9
  139. teradataml/utils/dtypes.py +49 -1
  140. teradataml/utils/internal_buffer.py +38 -0
  141. teradataml/utils/validators.py +377 -62
  142. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
  143. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
  144. teradataml/data/SQL_Fundamentals.pdf +0 -0
  145. teradataml/libaed_0_1.dylib +0 -0
  146. teradataml/libaed_0_1.so +0 -0
  147. teradataml/opensource/sklearn/__init__.py +0 -0
  148. teradataml/store/vector_store/__init__.py +0 -1586
  149. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  150. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  151. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
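
The largest structural change in this release is the reorganization of the opensource module: the scikit-learn wrapper moves from teradataml/opensource/sklearn/_sklearn_wrapper.py into a shared _base.py, and a new teradataml/opensource/_sklearn.py (shown in full below) carries the scikit-learn-specific object and function wrappers. The sketch below is illustrative only and is not part of the diff; it assumes a working Vantage connection, the documented td_sklearn entry point, and hypothetical table and column names.

from teradataml import create_context, DataFrame, td_sklearn as osml

create_context(host="<host>", username="<user>", password="<password>")
train = DataFrame("housing_train")                 # hypothetical table

# Objects like this are backed by the _SkLearnObjectWrapper class diffed below.
lin_reg = osml.LinearRegression()
lin_reg.fit(data=train,
            feature_columns=["area", "rooms"],     # hypothetical columns
            label_columns="price")
predictions = lin_reg.predict(data=train, feature_columns=["area", "rooms"])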
@@ -0,0 +1,1008 @@
1
+ # ##################################################################
2
+ #
3
+ # Copyright 2024 Teradata. All rights reserved.
4
+ # TERADATA CONFIDENTIAL AND TRADE SECRET
5
+ #
6
+ # Primary Owner: Adithya Avvaru (adithya.avvaru@teradata.com)
7
+ # Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
8
+ #
9
+ # Version: 1.0
10
+ # Function Version: 1.0
11
+ #
12
+ # This file contains the object wrapper class for the scikit-learn open-source package.
13
+ #
14
+ # ##################################################################
15
+
16
+ import inspect
17
+ import math
18
+ import time
19
+
20
+ import numpy
21
+ import pandas as pd
22
+ import pandas.api.types as pt
23
+ from teradatasqlalchemy.types import (BLOB, CLOB, FLOAT, INTEGER, TIMESTAMP,
24
+ VARCHAR)
25
+
26
+ from teradataml.common.utils import UtilFuncs
27
+ from teradataml.dataframe.copy_to import _get_sqlalchemy_mapping
28
+ from teradataml.opensource._base import (_FunctionWrapper,
29
+ _OpenSourceObjectWrapper)
30
+ from teradataml.opensource._constants import OpenSourcePackage
31
+ from teradataml.opensource._wrapper_utils import (
32
+ _derive_df_and_required_columns, _validate_fit_run,
33
+ _validate_opensource_func_args)
34
+ from teradataml.utils.utils import execute_sql
35
+ from teradataml.utils.validators import _Validators
36
+
37
+
38
+ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
39
+
40
+ OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
41
+ _pkgs = ["scikit-learn", "numpy", "scipy"]
42
+
43
+ def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
44
+
45
+ super().__init__(model=model, module_name=module_name, class_name=class_name,
46
+ pos_args=pos_args, kwargs=kwargs)
47
+
48
+ self._initialize_variables(table_name_prefix="td_sklearn_")
49
+ if model is not None:
50
+ self.modelObj = model
51
+ self.module_name = model.__module__.split("._")[0]
52
+ self.class_name = model.__class__.__name__
53
+ # __dict__ gets all the arguments as a dictionary, including default ones and positional
54
+ # args.
55
+ self.kwargs = model.__dict__
56
+ self.pos_args = tuple() # Kept empty as all are moved to kwargs.
57
+ else:
58
+ self._initialize_object()
59
+
60
+ def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
61
+ skip_either_or_that=False):
62
+ """
63
+ Internal function to validate arguments passed to exposed opensource APIs and return
64
+ parent DataFrame, feature columns, label columns, group columns, data partition columns.
65
+ """
66
+ _validate_opensource_func_args(X=X, y=y, groups=groups,
67
+ fit_partition_cols=self._fit_partition_colums_non_default,
68
+ kwargs=kwargs,
69
+ skip_either_or_that=skip_either_or_that)
70
+ return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
71
+ fit_partition_cols=self._fit_partition_colums_non_default)
72
+
73
+ def _run_fit_related_functions(self,
74
+ data,
75
+ feature_columns,
76
+ label_columns,
77
+ partition_columns,
78
+ func,
79
+ classes=None,
80
+ file_name="sklearn_fit.py"):
81
+ """
82
+ Internal function to run fit() and partial_fit() functions.
83
+ """
84
+ label_columns = self._get_columns_as_list(label_columns)
85
+
86
+ data, new_partition_columns = self._get_data_and_data_partition_columns(data,
87
+ feature_columns,
88
+ label_columns,
89
+ partition_columns)
90
+
91
+ model_type = BLOB() if self._is_lake_system else CLOB()
92
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
93
+ for col in new_partition_columns] + [("model", model_type)]
94
+
95
+ if classes:
96
+ class_type = type(classes[0]).__name__
97
+ classes = "--".join([str(x) for x in classes])
98
+ else:
99
+ classes = str(None)
100
+ class_type = str(None)
101
+
102
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
103
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
104
+
105
+ # db_name is applicable for enterprise system.
106
+ db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
107
+ py_exc = UtilFuncs._get_python_execution_path()
108
+ script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
109
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
110
+ f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
111
+
112
+ # Get unique values in partitioning columns.
113
+ self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
114
+
115
+ self._install_initial_model_file()
116
+
117
+ self._model_data = self._run_script(data, script_command, new_partition_columns,
118
+ return_types)
119
+
120
+ self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
121
+
122
+ def partial_fit(self, X=None, y=None, classes=None, **kwargs):
123
+ """
124
+ Please check the description in Docs/OpensourceML/sklearn.py.
125
+ """
126
+ st_time = time.time()
127
+
128
+ # "classes" argument validation.
129
+ arg_info_matrix = []
130
+ arg_info_matrix.append(["classes", classes, True, (list)])
131
+ _Validators._validate_function_arguments(arg_info_matrix)
132
+
133
+ self._is_default_partition_value_fit = True # False when the user provides partition columns.
134
+
135
+ data, feature_columns, label_columns, _, partition_columns = \
136
+ self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
137
+
138
+ if partition_columns:
139
+ self._is_default_partition_value_fit = False
140
+ self._fit_partition_colums_non_default = partition_columns
141
+
142
+ self._run_fit_related_functions(data,
143
+ feature_columns,
144
+ label_columns,
145
+ partition_columns,
146
+ inspect.stack()[0][3],
147
+ classes)
148
+
149
+ self._partial_fit_execution_time = time.time() - st_time
150
+
151
+ return self
152
+
153
+ def fit(self, X=None, y=None, **kwargs):
154
+ """
155
+ Please check the description in Docs/OpensourceML/sklearn.py.
156
+ """
157
+ st_time = time.time()
158
+
159
+ self._is_default_partition_value_fit = True # False when the user provides partition columns.
160
+
161
+ data, feature_columns, label_columns, _, partition_columns = \
162
+ self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
163
+
164
+ if partition_columns:
165
+ self._is_default_partition_value_fit = False
166
+ self._fit_partition_colums_non_default = partition_columns
167
+
168
+ file_name = kwargs.pop("file_name", None)
169
+ func_name = kwargs.pop("name", "fit")
170
+
171
+ args = {"data": data,
172
+ "feature_columns": feature_columns,
173
+ "label_columns": label_columns,
174
+ "partition_columns": partition_columns,
175
+ "func": func_name}
176
+
177
+ if file_name is not None:
178
+ args["file_name"] = file_name
179
+
180
+ self._run_fit_related_functions(**args)
181
+
182
+ self._fit_execution_time = time.time() - st_time
183
+
184
+ return self
185
+
186
+ def set_params(self, **params):
187
+ """
188
+ Please check the description in Docs/OpensourceML/sklearn.py.
189
+ """
190
+ for key, val in params.items():
191
+ self.kwargs[key] = val
192
+
193
+ # Initialize with new arguments and return the class/model object.
194
+ # set_params takes all keyword arguments and no positional arguments.
195
+ self.__init__(None, self.module_name, self.class_name, tuple(), self.kwargs)
196
+ return self
197
+
198
+ # get_params() will be executed through __getattr__().
199
+
200
+ # @_validate_fit_run
201
+ def __getattr__(self, name):
202
+ def __run_transform(*c, **kwargs):
203
+ kwargs["name"] = name
204
+ return self._transform(*c, **kwargs)
205
+
206
+ def __run_function_needing_all_rows(*c, **kwargs):
207
+ kwargs["name"] = name
208
+ return self._run_function_needing_all_rows(*c, **kwargs)
209
+
210
+ def __run_kneighbors(*c, **kwargs):
211
+ kwargs["name"] = name
212
+ return self._run_neighbors(*c, **kwargs)
213
+
214
+ if name in ["score", "aic", "bic", "perplexity"]:
215
+ # TODO: ELE-6352 - Implement error_norm() function later.
216
+ return __run_function_needing_all_rows
217
+
218
+ if name in ["kneighbors",
219
+ "radius_neighbors",
220
+ "kneighbors_graph",
221
+ "radius_neighbors_graph"]:
222
+ return __run_kneighbors
223
+
224
+ if name in ["predict",
225
+ "transform",
226
+ "inverse_transform",
227
+ "predict_proba",
228
+ "predict_log_proba",
229
+ "decision_function",
230
+ "score_samples",
231
+ "decision_path",
232
+ "apply",
233
+ "cost_complexity_pruning_path",
234
+ "gibbs",
235
+ "kneighbors_graph",
236
+ "radius_neighbors_graph",
237
+ "mahalanobis",
238
+ "correct_covariance",
239
+ "reweight_covariance",
240
+ "path"]:
241
+ return __run_transform
242
+
243
+ return super().__getattr__(name)
244
+
245
+ def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
246
+ func_name, **kwargs):
247
+ """
248
+ Internal function to handle the multi-model case of the transform function for the
249
+ feature_selection classes ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"]
250
+ and the cluster class "Birch".
251
+ These functions generate multiple models, and when transform is applied to each model,
252
+ it produces output with a different number of columns.
253
+ """
254
+ skl_objs_dict = {}
255
+ no_of_unique_partitions = len(self._fit_partition_unique_values)
256
+ no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
257
+
258
+ # Run on 10 rows of data individually using the corresponding scikit-learn objects based on partition value
259
+ # and get the maximum number of columns and their types.
260
+ for i in range(no_of_unique_partitions):
261
+ skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
262
+
263
+
264
+ data = data.select(feature_columns + label_columns + partition_columns)
265
+ ten_row_data = data.head(10).get_values()
266
+ X = numpy.array(ten_row_data)
267
+
268
+ # In the multi-model case, the model on one AMP can produce more columns than the models on other AMPs.
269
+ # The RETURNS clause can't contain a different number of columns on different AMPs. Hence, taking the
270
+ # maximum number of columns and their types across all models.
271
+ max_no_of_columns = 0
272
+ max_col_names = []
273
+ max_col_types = []
274
+
275
+ def _get_input_row_without_nans(row):
276
+ """
277
+ Input to `inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
278
+ """
279
+ X1 = []
280
+ for _, v in enumerate(row):
281
+ if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
282
+ # Add to list when:
283
+ # - v is None or
284
+ # - v is string or
285
+ # - v is not nan or
286
+ # - if module is impute (which transforms nan values) even though v is nan.
287
+ X1.append(v)
288
+ else:
289
+ # skip nan values.
290
+ pass
291
+ return X1
292
+
293
+ for i in range(X.shape[0]):
294
+ # Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
295
+ partition_values = tuple(X[i, -no_of_partitioning_cols:])
296
+ skl_obj = skl_objs_dict[partition_values]
297
+
298
+ X1 = X[i, :-no_of_partitioning_cols]
299
+ # Since NaNs/NULLs are added by transform to the trailing columns when some models generate
300
+ # fewer columns, remove NaNs/NULLs from the input row for inverse_transform
301
+ # using the function _get_input_row_without_nans().
302
+ X1 = numpy.array([_get_input_row_without_nans(X1)])
303
+
304
+ trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
305
+
306
+ no_of_columns = 1
307
+
308
+ if trans_opt.shape == (X1.shape[0],):
309
+ trans_opt = trans_opt.reshape(X1.shape[0], 1)
310
+
311
+ if isinstance(trans_opt[0], numpy.ndarray) \
312
+ or isinstance(trans_opt[0], list) \
313
+ or isinstance(trans_opt[0], tuple):
314
+ no_of_columns = len(trans_opt[0])
315
+
316
+ col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
317
+
318
+ # Get new column sqlalchemy types for pandas df columns of transform output.
319
+ opt_pd = pd.DataFrame(trans_opt)
320
+
321
+ # Get output column types for each column in pandas df from the output of transform
322
+ # type functions.
323
+ types = {}
324
+ for idx in range(no_of_columns):
325
+ col = list(opt_pd.columns)[idx]
326
+
327
+ # Only one row in trans_opt.
328
+ if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
329
+ type_ = type(trans_opt[0][idx])
330
+ else:
331
+ # only one value in the output.
332
+ type_ = type(trans_opt[0])
333
+
334
+ # If type of the output value (trans_opt) is None, then use `str` as type since
335
+ # pandas astype() does not accept None type.
336
+ if type_ is type(None):
337
+ type_ = str
338
+
339
+ # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
340
+ # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
341
+ # Error while type casting for column '2'"
342
+ # Hence, using pd.Int64Dtype() for integer columns with nan values.
343
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
344
+
345
+ # Without this, all columns will be of object type and get converted to VARCHAR in Vantage.
346
+ opt_pd = opt_pd.astype(types)
347
+
348
+ # If the datatype is not specified, check whether the dtype is datetime64 with a timezone; if so, map it to
349
+ # TIMESTAMP(timezone=True), else map it according to the default value.
350
+ col_types = [TIMESTAMP(timezone=True)
351
+ if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
352
+ else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
353
+ for key, col_name in enumerate(list(opt_pd.columns))]
354
+
355
+ # In the multi-model case, different models can generate a different number of output columns, for example in
356
+ # SelectFpr. Hence, taking the model which generates the maximum number of columns.
357
+ if no_of_columns > max_no_of_columns:
358
+ max_no_of_columns = no_of_columns
359
+ max_col_names = col_names
360
+ max_col_types = col_types
361
+
362
+ return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
363
+
364
+ def _execute_function_locally(self, ten_row_data, feature_columns, label_columns, openml_obj,
365
+ func_name, **kwargs):
366
+ """
367
+ Executes an opensourceml function of the class object "openml_obj" on the provided data locally.
368
+ Parameters:
369
+ ten_row_data (list or array-like): The input data containing rows to be processed.
370
+ feature_columns (list): List of feature column names.
371
+ label_columns (list): List of label column names.
372
+ openml_obj (object): The opensourceml object on which the function is to be executed.
373
+ func_name (str): The name of the function to be executed on the opensourceml object.
374
+ **kwargs: Additional keyword arguments to be passed to the opensourceml function.
375
+ Returns:
376
+ numpy.ndarray: The transformed output from the opensource function.
377
+ Raises:
378
+ NotImplementedError: If the function name is "path", which is not implemented.
379
+ """
380
+
381
+ X = numpy.array(ten_row_data)
382
+
383
+ if label_columns:
384
+ n_f = len(feature_columns)
385
+ n_c = len(label_columns)
386
+ y = X[:,n_f : n_f + n_c]
387
+ X = X[:,:n_f]
388
+ # predict() now also takes 'y' so that the script can return the labels. Skipping 'y'
389
+ # in the local run if passed. Generally, 'y' is passed so that it is returned along with the actual output.
390
+ try:
391
+ trans_opt = getattr(openml_obj, func_name)(X, y, **kwargs)
392
+ except TypeError as ex:
393
+ # Function which does not accept 'y' like predict_proba() raises error like
394
+ # "predict_proba() takes 2 positional arguments but 3 were given".
395
+ trans_opt = getattr(openml_obj, func_name)(X, **kwargs)
396
+ else:
397
+ trans_opt = getattr(openml_obj, func_name)(X, **kwargs)
398
+
399
+ if func_name == "path":
400
+ raise NotImplementedError(
401
+ "path() returns tuple of ndarrays of different shapes. Not Implemented yet."
402
+ )
403
+
404
+ if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
405
+ trans_opt = trans_opt.reshape(X.shape[0], 1)
406
+
407
+ return trans_opt
408
+
409
+ def _get_return_columns_for_function_(self,
410
+ data,
411
+ feature_columns,
412
+ label_columns,
413
+ partition_columns,
414
+ func_name,
415
+ kwargs):
416
+ """
417
+ Internal function to return list of column names and their sqlalchemy types
418
+ which should be used in return_types of Script.
419
+ """
420
+ if func_name == "fit_predict":
421
+ """
422
+ Get return columns using label_columns.
423
+ """
424
+ return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
425
+ data._td_column_names_and_sqlalchemy_types[col.lower()])
426
+ for i, col in enumerate(label_columns)]
427
+
428
+ if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
429
+ """
430
+ Return predict columns using either label_columns (if provided) or
431
+ self._fit_label_columns_types (if the function is trained using label columns).
432
+ Otherwise run predict on ten rows of data to get the number of columns and their types
433
+ after this if condition.
434
+ """
435
+ if label_columns:
436
+ return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
437
+ data._td_column_names_and_sqlalchemy_types[col.lower()])
438
+ for i, col in enumerate(label_columns)]
439
+ if self._fit_label_columns_types:
440
+ return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
441
+ for i, col_type in enumerate(self._fit_label_columns_types)]
442
+
443
+ ## If function is not `fit_predict`:
444
+ # then take one row of transform/other functions to execute in client
445
+ # to get number of columns in return clause and their Vantage types.
446
+
447
+ # For partitioning columns, modelObj is a dataframe of models and getattr(modelObj, func_name) fails.
448
+ # Just for getting the number of columns and their types, use only one of the models.
449
+ if len(self._fit_partition_unique_values) == 1:
450
+ # Single model case.
451
+ skl_obj = self.modelObj
452
+ else:
453
+ # Multi model case.
454
+ if (func_name in ["transform", "inverse_transform"] and \
455
+ self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
456
+ (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
457
+ # Special handling for multi model case for transform function as these classes
458
+ # generate transform output with different number of columns for each model.
459
+ # Hence, need to add Nulls/Nans to columns which are not present in the transform output of
460
+ # some models.
461
+ return self._special_handling_multimodel_(data, feature_columns, label_columns,
462
+ partition_columns, func_name, **kwargs)
463
+
464
+ skl_obj = self.modelObj.iloc[0]["model"]
465
+
466
+ data = data.select(feature_columns + label_columns)
467
+
468
+ ten_row_data = data.head(10).get_values()
469
+
470
+ trans_opt = self._execute_function_locally(ten_row_data, feature_columns, label_columns,
471
+ skl_obj, func_name, **kwargs)
472
+
473
+ if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
474
+ no_of_columns = trans_opt.get_shape()[1]
475
+ trans_opt = trans_opt.toarray()
476
+ elif isinstance(trans_opt, dict):
477
+ raise NotImplementedError(f"Output returns dictionary {trans_opt}. NOT implemented yet.")
478
+ elif isinstance(trans_opt[0], numpy.ndarray) \
479
+ or isinstance(trans_opt[0], list) \
480
+ or isinstance(trans_opt[0], tuple):
481
+ no_of_columns = len(trans_opt[0])
482
+ else:
483
+ no_of_columns = 1
484
+
485
+ # Special handling when inverse_transform returns rows with fewer values
486
+ # than the number of classes. Such columns are filled with NaN values.
487
+ # Updating the number of columns here (new columns with NaN values will be added).
488
+ if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
489
+ no_of_columns = len(self.classes_)
490
+ for i in range(len(ten_row_data)):
491
+ trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
492
+
493
+ # Special handling required for cross_decomposition classes' transform function, which
494
+ # takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
495
+ # y_scores. If label columns are not provided, only x_scores are returned.
496
+ if self.module_name == "sklearn.cross_decomposition" and func_name == "transform":
497
+ # For cross_decomposition, output is a tuple of arrays when label columns are provided
498
+ # along with feature columns for transform function. In this case, concatenate the
499
+ # arrays and return the column names accordingly.
500
+ if isinstance(trans_opt, tuple): # tuple when label_columns is provided.
501
+ assert trans_opt[0].shape == trans_opt[1].shape,\
502
+ "Output arrays should be of same shape when transform/fit_transform is run "\
503
+ "with label columns for cross_decomposition classes.."
504
+ first_cols = [f"x_scores_{(i + 1)}" for i in range(trans_opt[0].shape[1])]
505
+ second_cols = [f"y_scores_{(i + 1)}" for i in range(trans_opt[1].shape[1])]
506
+ no_of_columns = trans_opt[0].shape[1] + trans_opt[1].shape[1]
507
+ col_names = first_cols + second_cols
508
+
509
+ trans_opt = numpy.concatenate(trans_opt, axis=1)
510
+ else:
511
+ assert isinstance(trans_opt, numpy.ndarray), "When transform/fit_transform is run "\
512
+ "without label columns for cross_decomposition classes, "\
513
+ "output should be a numpy array."
514
+ no_of_columns = trans_opt.shape[1]
515
+ col_names =[f"x_scores_{(i + 1)}" for i in range(trans_opt.shape[1])]
516
+ else:
517
+ # Generate list of new column names.
518
+ col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
519
+
520
+ # Get new column sqlalchemy types for pandas df columns of transform output.
521
+ opt_pd = pd.DataFrame(trans_opt)
522
+
523
+ # Get output column types for each column in pandas df from the output of transform
524
+ # type functions.
525
+ types = {}
526
+ for idx, col in enumerate(list(opt_pd.columns)):
527
+ types_ = []
528
+ # Get type of column using data from all rows, in case the column has None values.
529
+ # 'and' of types of all values in the column with type(None) gives the type of the column.
530
+ type_ = type(None)
531
+ for i in range(len(trans_opt)):
532
+ type_ = type_ and type(trans_opt[i][idx])
533
+ types_.append(type_)
534
+
535
+ # If all the values of the output (trans_opt) are None, then use `str` as type since
536
+ # pandas astype() does not accept None type.
537
+ if type_ is type(None):
538
+ type_ = str
539
+
540
+ # MultiLabelBinarizer with string (non-numeric) labels can produce a column containing both string and
541
+ # float values. Handling this case separately here.
542
+ if str in types_ and float in types_:
543
+ types[col] = str
544
+ # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
545
+ # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
546
+ # Error while type casting for column '2'"
547
+ # Hence, using pd.Int64Dtype() for integer columns with nan values.
548
+ else:
549
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
550
+
551
+
552
+ # Without this, all columns will be of object type and get converted to VARCHAR in Vantage.
553
+ opt_pd = opt_pd.astype(types)
554
+
555
+ # If the datatype is not specified, check whether the dtype is datetime64 with a timezone; if so, map it to
556
+ # TIMESTAMP(timezone=True), else map it according to the default value.
557
+ col_types = [TIMESTAMP(timezone=True)
558
+ if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
559
+ else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
560
+ for key, col_name in enumerate(list(opt_pd.columns))]
561
+
562
+ return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
563
+
564
+ @_validate_fit_run
565
+ def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
566
+ """
567
+ Internal function to run functions like score, aic, bic which needs all rows and return
568
+ one floating number as result.
569
+ """
570
+ st_time = time.time()
571
+
572
+ assert kwargs["name"], "function name should be passed."
573
+ func_name = kwargs["name"]
574
+
575
+ # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
576
+ kwargs.pop("name")
577
+
578
+ data, feature_columns, label_columns, _, partition_columns = \
579
+ self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
580
+
581
+ label_columns = self._get_columns_as_list(label_columns)
582
+
583
+ data, new_partition_columns = self._get_data_and_data_partition_columns(data,
584
+ feature_columns,
585
+ label_columns,
586
+ partition_columns)
587
+
588
+ script_file_path = f"{file_name}" if self._is_lake_system \
589
+ else f"./{self._db_name}/{file_name}"
590
+
591
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
592
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
593
+
594
+ self._validate_unique_partition_values(data, new_partition_columns)
595
+
596
+ py_exc = UtilFuncs._get_python_execution_path()
597
+ script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
598
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
599
+ f"{self._model_file_name_prefix} {self._is_lake_system}"
600
+
601
+ # score, aic, bic returns float values.
602
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
603
+ for col in new_partition_columns] + [(func_name, FLOAT())]
604
+
605
+ # Checking the trained model installation. If not installed,
606
+ # install it and set flag to True.
607
+ if not self._is_trained_model_installed:
608
+ self._install_initial_model_file()
609
+ self._is_trained_model_installed = True
610
+
611
+ opt = self._run_script(data, script_command, new_partition_columns, return_types)
612
+
613
+ self._score_execution_time = time.time() - st_time
614
+
615
+ if self._is_default_partition_value_fit:
616
+ # For single model case, partition column is internally generated and
617
+ # no point in returning it to the user.
618
+ return opt.select(func_name)
619
+
620
+ return opt
621
+
622
+ @_validate_fit_run
623
+ def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
624
+ """
625
+ Internal function to run predict/transform and similar functions, which return
626
+ multiple columns. This function will return the data row along with the generated
627
+ columns' row data, unlike sklearn's functions which return just the output data.
628
+ """
629
+ st_time = time.time()
630
+
631
+ assert kwargs["name"], "function name should be passed."
632
+ func_name = kwargs["name"]
633
+
634
+ # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
635
+ kwargs.pop("name")
636
+
637
+ data, feature_columns, label_columns, _, partition_columns = \
638
+ self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
639
+
640
+ data, new_partition_columns = self._get_data_and_data_partition_columns(data,
641
+ feature_columns,
642
+ label_columns,
643
+ partition_columns)
644
+
645
+ # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
646
+ self._remove_data_related_args_from_kwargs(kwargs)
647
+
648
+ script_file_path = f"{file_name}" if self._is_lake_system \
649
+ else f"./{self._db_name}/{file_name}"
650
+
651
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
652
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
653
+
654
+ self._validate_unique_partition_values(data, new_partition_columns)
655
+
656
+ return_columns_python_types = None
657
+ if self._fit_label_columns_python_types:
658
+ return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
659
+
660
+ # Returning feature columns also along with transformed columns because we don't know the
661
+ # mapping of feature columns to the transformed columns.
662
+ ## 'correct_covariance()' returns a matrix of shape (n_features, n_features).
663
+ if func_name == "correct_covariance":
664
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
665
+ for col in new_partition_columns]
666
+ else:
667
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
668
+ for col in (new_partition_columns + feature_columns)]
669
+ if func_name in ["predict", "decision_function"] and label_columns:
670
+ return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
671
+ for col in label_columns]
672
+
673
+ output_cols_types = self._get_return_columns_for_function_(data,
674
+ feature_columns,
675
+ label_columns,
676
+ new_partition_columns,
677
+ func_name,
678
+ kwargs)
679
+ return_types += output_cols_types
680
+
681
+ py_exc = UtilFuncs._get_python_execution_path()
682
+ script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
683
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
684
+ f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
685
+ f"{return_columns_python_types}"
686
+
687
+ # Checking the trained model installation. If not installed,
688
+ # install it and set flag to True.
689
+ if not self._is_trained_model_installed:
690
+ self._install_initial_model_file()
691
+ self._is_trained_model_installed = True
692
+
693
+ opt = self._run_script(data, script_command, new_partition_columns, return_types)
694
+
695
+ self._transform_execution_time = time.time() - st_time
696
+
697
+ return self._get_returning_df(opt, new_partition_columns, return_types)
698
+
699
+ def fit_predict(self, X=None, y=None, **kwargs):
700
+ """
701
+ Please check the description in Docs/OpensourceML/sklearn.py.
702
+ """
703
+ st_time = time.time()
704
+
705
+ self._is_default_partition_value_fit = True # False when the user provides partition columns.
706
+
707
+ data, feature_columns, label_columns, _, partition_columns = \
708
+ self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
709
+
710
+ if partition_columns:
711
+ self._is_default_partition_value_fit = False
712
+
713
+ data, new_partition_columns = self._get_data_and_data_partition_columns(data,
714
+ feature_columns,
715
+ label_columns,
716
+ partition_columns)
717
+
718
+ # Return label_columns also if the user provides them in the function call.
719
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
720
+ for col in (new_partition_columns + feature_columns + label_columns)]
721
+
722
+ func_name = inspect.stack()[0][3]
723
+ if label_columns:
724
+ return_types += self._get_return_columns_for_function_(data,
725
+ feature_columns,
726
+ label_columns,
727
+ new_partition_columns,
728
+ func_name,
729
+ {})
730
+ else:
731
+ # If there are no label_columns, we will have only one
732
+ # predicted column.
733
+ return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
734
+
735
+ file_name = "sklearn_fit_predict.py"
736
+
737
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
738
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
739
+
740
+ script_file_name = f"{file_name}" if self._is_lake_system \
741
+ else f"./{self._db_name}/{file_name}"
742
+ py_exc = UtilFuncs._get_python_execution_path()
743
+ script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
744
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
745
+ f"{self._model_file_name_prefix} {self._is_lake_system}"
746
+
747
+ # Get unique values in partitioning columns.
748
+ self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
749
+
750
+ # Checking the trained model installation. If not installed,
751
+ # install it and set flag to True.
752
+ if not self._is_trained_model_installed:
753
+ self._install_initial_model_file()
754
+ self._is_trained_model_installed = True
755
+
756
+ opt = self._run_script(data, script_command, new_partition_columns, return_types)
757
+
758
+ self._fit_predict_execution_time = time.time() - st_time
759
+
760
+ if self._is_default_partition_value_fit:
761
+ # For single model case, partition column is internally generated and no point in
762
+ # returning it to the user.
763
+
764
+ # Extract columns from return types.
765
+ returning_cols = [col[0] for col in return_types[len(new_partition_columns):]]
766
+ return opt.select(returning_cols)
767
+
768
+ return opt
769
+
770
+ def fit_transform(self, X=None, y=None, **kwargs):
771
+ """
772
+ Please check the description in Docs/OpensourceML/sklearn.py.
773
+ """
774
+ # 'y' is not needed for transform().
775
+ fit_obj = self.fit(X, y, **kwargs)
776
+ kwargs["label_columns"] = None
777
+ return fit_obj.transform(X, None, **kwargs)
778
+
779
+ @_validate_fit_run
780
+ def _run_neighbors(self, X=None, **kwargs):
781
+ """
782
+ Internal function to run functions like kneighbors, radius_neighbors, kneighbors_graph and
783
+ radius_neighbors_graph, which return multiple columns. This function will return the data row
784
+ along with the generated columns' row data, unlike sklearn's functions which return just the
785
+ output data.
786
+ """
787
+ assert kwargs["name"], "function name should be passed."
788
+ func_name = kwargs["name"]
789
+ kwargs.pop("name")
790
+
791
+ if self.module_name != "sklearn.neighbors":
792
+ raise AttributeError(f"{self.module_name+'.'+self.class_name} does not have {func_name}() method.")
793
+
794
+ data = kwargs.get("data", None)
795
+ partition_columns = kwargs.get("partition_columns", None)
796
+
797
+ if not X and not partition_columns and not data:
798
+ # If data is not passed, then run from client only.
799
+ # TODO: decide whether to run from client or from Vantage.
800
+ opt = super().__getattr__(func_name)(**kwargs)
801
+ from scipy.sparse.csr import csr_matrix
802
+ if isinstance(opt, csr_matrix):
803
+ return opt.toarray()
804
+ return opt
805
+
806
+ self._is_default_partition_value_fit = True # False when the user provides partition columns.
807
+
808
+ data, feature_columns, _, _, new_partition_columns = \
809
+ self._validate_args_and_get_data(X=X, y=None, groups=None, kwargs=kwargs,
810
+ skip_either_or_that=True)
811
+
812
+ # Remove the kwargs data.
813
+ self._remove_data_related_args_from_kwargs(kwargs)
814
+
815
+ if partition_columns:
816
+ # kwargs are passed to kneighbors function. So, removing them from kwargs.
817
+ self._is_default_partition_value_fit = False
818
+
819
+ # Generating new partition column name.
820
+ data, new_partition_columns = self._get_data_and_data_partition_columns(data,
821
+ feature_columns,
822
+ [],
823
+ partition_columns)
824
+
825
+ args_str = self._get_kwargs_str(kwargs)
826
+
827
+ file_name = "sklearn_neighbors.py"
828
+
829
+ script_file_path = f"{file_name}" if self._is_lake_system \
830
+ else f"./{self._db_name}/{file_name}"
831
+
832
+ # Returning feature columns also along with new columns.
833
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
834
+ for col in (new_partition_columns + feature_columns)]
835
+
836
+ # `return_distance` is needed as the result is a tuple of two arrays when it is True.
837
+ return_distance = kwargs.get("return_distance", True) # Default value is True.
838
+
839
+ # Though the new columns hold numpy arrays, we are returning them as strings.
840
+ # TODO: Will expand them into separate columns later, if requested.
841
+ if func_name in ['kneighbors', 'radius_neighbors']:
842
+ if return_distance:
843
+ return_types += [("neigh_dist", VARCHAR())]
844
+ return_types += [("neigh_ind", VARCHAR())]
845
+ elif func_name in ['kneighbors_graph', 'radius_neighbors_graph']:
846
+ return_types += [("A", VARCHAR())]
847
+ else:
848
+ return_types += [("output", VARCHAR())]
849
+
850
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
851
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
852
+
853
+ py_exc = UtilFuncs._get_python_execution_path()
854
+ script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
855
+ f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
856
+ f"{args_str}"
857
+
858
+ # Get unique values in partitioning columns.
859
+ self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
860
+
861
+ # Checking the trained model installation. If not installed,
862
+ # install it and set flag to True.
863
+ if not self._is_trained_model_installed:
864
+ self._install_initial_model_file()
865
+ self._is_trained_model_installed = True
866
+
867
+ opt = self._run_script(data, script_command, new_partition_columns, return_types)
868
+
869
+ return self._get_returning_df(opt, new_partition_columns, return_types)
870
+
871
+ def split(self, X=None, y=None, groups=None, **kwargs):
872
+ """
873
+ Please check the description in Docs/OpensourceML/sklearn.py.
874
+ """
875
+ opt = self._run_model_selection("split", X=X, y=y, groups=groups,
876
+ skip_either_or_that=True, kwargs=kwargs)
877
+
878
+ # Get number of splits in the result DataFrame.
879
+ n_splits = opt.drop_duplicate("split_id").shape[0]
880
+
881
+ data = kwargs.get("data", None)
882
+ feature_columns = kwargs.get("feature_columns", [])
883
+ label_columns = self._get_columns_as_list(kwargs.get("label_columns", []))
884
+
885
+ # If X and y are not provided, get feature_columns and label_columns from "data".
886
+ partition_columns = kwargs.get("partition_columns", [])
887
+ feature_columns = [col for col in X.columns if col not in partition_columns] \
888
+ if X and not data and not feature_columns else feature_columns
889
+ label_columns = y.columns if y and not data and not label_columns else label_columns
890
+
891
+ # Return iterator of the train and test dataframes for each split.
892
+ for i in range(1, n_splits+1):
893
+ train_df = opt[(opt.split_id == i) & (opt.data_type == "train")]\
894
+ .select(partition_columns + feature_columns + label_columns)
895
+ train_df._index_label = None
896
+ test_df = opt[(opt.split_id == i) & (opt.data_type == "test")]\
897
+ .select(partition_columns + feature_columns + label_columns)
898
+ test_df._index_label = None
899
+
900
+ yield train_df, test_df
901
+
902
+ def get_n_splits(self, X=None, y=None, groups=None, **kwargs):
903
+ """
904
+ Please check the description in Docs/OpensourceML/sklearn.py.
905
+ """
906
+ return self._run_model_selection("get_n_splits", X=X, y=y, groups=groups,
907
+ skip_either_or_that=True, kwargs=kwargs)
908
+
909
+ def _run_model_selection(self,
910
+ func_name,
911
+ X=None,
912
+ y=None,
913
+ groups=None,
914
+ skip_either_or_that=False,
915
+ kwargs={}):
916
+ """
917
+ Internal function to run functions like split and get_n_splits of the model_selection module.
918
+ - get_n_splits() returns the number of splits as a value, not as a teradataml DataFrame.
919
+ - split() returns a teradataml DataFrame containing train and test data for each split
920
+ (adds partition information if the argument "partition_cols" is provided).
921
+ """
922
+ if self.module_name != "sklearn.model_selection":
923
+ raise AttributeError(f"{self.module_name+'.'+self.class_name} does not "
924
+ f"have {func_name}() method.")
925
+
926
+ data = kwargs.get("data", None)
927
+
928
+ if not X and not y and not groups and not data:
929
+ # If data is not passed, then run from client only.
930
+ # TODO: decide whether to run from client or from Vantage.
931
+ return super().__getattr__(func_name)()
932
+
933
+ self._is_default_partition_value_fit = True # False when the user provides partition columns.
934
+
935
+ data, feature_columns, label_columns, group_columns, partition_columns = \
936
+ self._validate_args_and_get_data(X=X, y=y, groups=groups, kwargs=kwargs,
937
+ skip_either_or_that=skip_either_or_that)
938
+
939
+ if partition_columns:
940
+ self._is_default_partition_value_fit = False
941
+
942
+ data, new_partition_columns = self._get_data_and_data_partition_columns(data,
943
+ feature_columns,
944
+ label_columns,
945
+ partition_columns,
946
+ group_columns)
947
+
948
+ file_name = "sklearn_model_selection_split.py"
949
+
950
+ script_file_path = f"{file_name}" if self._is_lake_system \
951
+ else f"./{self._db_name}/{file_name}"
952
+
953
+ if func_name == "split":
954
+ # Need to generate data into splits of train and test.
955
+ # split_id - the column which will be used to identify the split.
956
+ # data_type - the column which will be used to identify whether the row is
957
+ # train or test row.
958
+ return_types = [("split_id", INTEGER()), ("data_type", VARCHAR())]
959
+ # Returning feature columns and label columns as well.
960
+ return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
961
+ for col in (feature_columns + label_columns)]
962
+ else:
963
+ # Return Varchar by default.
964
+ # Returns Varchar even for functions like `get_n_splits` which return large integer
965
+ # numbers like `4998813702034726525205100` for the `LeavePOut` class (when the argument
966
+ # `p` is 28 and the number of data rows is 100) as Vantage cannot fit it in an INTEGER.
967
+ return_types = [(func_name, VARCHAR())]
968
+
969
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
970
+ for col in new_partition_columns] + return_types
971
+
972
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
973
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
974
+
975
+ py_exc = UtilFuncs._get_python_execution_path()
976
+ script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
977
+ f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
978
+ f"{self._model_file_name_prefix} {self._is_lake_system}"
979
+
980
+ # Get unique values in partitioning columns.
981
+ self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
982
+
983
+ # Checking the trained model installation. If not installed,
984
+ # install it and set flag to True.
985
+ if not self._is_trained_model_installed:
986
+ self._install_initial_model_file()
987
+ self._is_trained_model_installed = True
988
+
989
+ opt = self._run_script(data, script_command, new_partition_columns, return_types)
990
+
991
+ if func_name == "get_n_splits" and not partition_columns:
992
+ # Return number of splits as value, not as dataframe.
993
+ vals = execute_sql("select {} from {}".format(func_name, opt._table_name))
994
+ opt = vals.fetchall()[0][0]
995
+
996
+ # Varchar is returned by the script. Convert it to int.
997
+ return int(opt)
998
+
999
+ return opt
1000
+
1001
+
1002
+ class _SKLearnFunctionWrapper(_FunctionWrapper):
1003
+ OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
1004
+ _pkgs = ["scikit-learn", "numpy", "scipy"]
1005
+ def __init__(self, module_name, func_name):
1006
+ file_type = "file_fn_sklearn"
1007
+ template_file = "sklearn_function.template"
1008
+ super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)
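
The model_selection support at the end of this file gives split() generator semantics over Vantage data: each iteration yields a (train, test) pair of teradataml DataFrames, while get_n_splits() comes back as a plain Python int (cast from the VARCHAR the script returns). Below is a minimal sketch of how these wrappers might be driven; it is not part of the package and assumes the td_sklearn entry point plus hypothetical table and column names.

from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("iris_train")                       # hypothetical table

kf = osml.KFold(n_splits=3, shuffle=False)

# When data is supplied, get_n_splits() runs sklearn_model_selection_split.py on
# Vantage and converts the VARCHAR result to int before returning it.
print(kf.get_n_splits(data=df, feature_columns=["sepal_length", "petal_length"]))

# split() yields one (train_df, test_df) teradataml DataFrame pair per split_id.
for train_df, test_df in kf.split(data=df,
                                  feature_columns=["sepal_length", "petal_length"],
                                  label_columns="species"):
    print(train_df.shape, test_df.shape)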