teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of teradataml might be problematic; consult the advisory on the package registry for more details.

Files changed (240):
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +306 -0
  4. teradataml/__init__.py +10 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +299 -16
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +13 -3
  11. teradataml/analytics/json_parser/utils.py +13 -6
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +11 -2
  15. teradataml/analytics/table_operator/__init__.py +4 -3
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +66 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1502 -323
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +247 -307
  22. teradataml/automl/data_transformation.py +32 -12
  23. teradataml/automl/feature_engineering.py +325 -86
  24. teradataml/automl/model_evaluation.py +44 -35
  25. teradataml/automl/model_training.py +122 -153
  26. teradataml/catalog/byom.py +8 -8
  27. teradataml/clients/pkce_client.py +1 -1
  28. teradataml/common/__init__.py +2 -1
  29. teradataml/common/constants.py +72 -0
  30. teradataml/common/deprecations.py +13 -7
  31. teradataml/common/garbagecollector.py +152 -120
  32. teradataml/common/messagecodes.py +11 -2
  33. teradataml/common/messages.py +4 -1
  34. teradataml/common/sqlbundle.py +26 -4
  35. teradataml/common/utils.py +225 -14
  36. teradataml/common/wrapper_utils.py +1 -1
  37. teradataml/context/context.py +82 -2
  38. teradataml/data/SQL_Fundamentals.pdf +0 -0
  39. teradataml/data/complaints_test_tokenized.csv +353 -0
  40. teradataml/data/complaints_tokens_model.csv +348 -0
  41. teradataml/data/covid_confirm_sd.csv +83 -0
  42. teradataml/data/dataframe_example.json +27 -1
  43. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  44. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  45. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  46. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  47. teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
  48. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  49. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  50. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  51. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  52. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  53. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  54. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  55. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  56. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  57. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  58. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  59. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  60. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  61. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  62. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  63. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  64. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  65. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  66. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  67. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  68. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  69. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  70. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  71. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  72. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  74. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  75. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  76. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  77. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  78. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  79. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  80. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  81. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  82. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  83. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  84. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  85. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  86. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  87. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  88. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  89. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  90. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  91. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  92. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  93. teradataml/data/dwt2d_dataTable.csv +65 -0
  94. teradataml/data/dwt_dataTable.csv +8 -0
  95. teradataml/data/dwt_filterTable.csv +3 -0
  96. teradataml/data/finance_data4.csv +13 -0
  97. teradataml/data/grocery_transaction.csv +19 -0
  98. teradataml/data/idwt2d_dataTable.csv +5 -0
  99. teradataml/data/idwt_dataTable.csv +8 -0
  100. teradataml/data/idwt_filterTable.csv +3 -0
  101. teradataml/data/interval_data.csv +5 -0
  102. teradataml/data/jsons/paired_functions.json +14 -0
  103. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  104. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  105. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  106. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  107. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  108. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  109. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  110. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  111. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  112. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  113. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  114. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  115. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  116. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  117. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  118. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  119. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  120. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  121. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  122. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  123. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  124. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  125. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  126. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  127. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  128. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  129. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  130. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  131. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  132. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  133. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  134. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  135. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  136. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  137. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  138. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  139. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  140. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  141. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  142. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  143. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  144. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  145. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  146. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  147. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  148. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  149. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  150. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  151. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  152. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  153. teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
  154. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  155. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  156. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  157. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  158. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  159. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
  160. teradataml/data/load_example_data.py +8 -2
  161. teradataml/data/medical_readings.csv +101 -0
  162. teradataml/data/naivebayestextclassifier_example.json +1 -1
  163. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  164. teradataml/data/patient_profile.csv +101 -0
  165. teradataml/data/peppers.png +0 -0
  166. teradataml/data/real_values.csv +14 -0
  167. teradataml/data/sax_example.json +8 -0
  168. teradataml/data/scripts/deploy_script.py +1 -1
  169. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  170. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  171. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  172. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  173. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
  174. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  175. teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
  176. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  177. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  178. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  179. teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
  180. teradataml/data/star_pivot.csv +8 -0
  181. teradataml/data/target_udt_data.csv +8 -0
  182. teradataml/data/templates/open_source_ml.json +3 -1
  183. teradataml/data/teradataml_example.json +20 -1
  184. teradataml/data/timestamp_data.csv +4 -0
  185. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  186. teradataml/data/uaf_example.json +55 -1
  187. teradataml/data/unpivot_example.json +15 -0
  188. teradataml/data/url_data.csv +9 -0
  189. teradataml/data/vectordistance_example.json +4 -0
  190. teradataml/data/windowdfft.csv +16 -0
  191. teradataml/dataframe/copy_to.py +1 -1
  192. teradataml/dataframe/data_transfer.py +5 -3
  193. teradataml/dataframe/dataframe.py +1002 -201
  194. teradataml/dataframe/fastload.py +3 -3
  195. teradataml/dataframe/functions.py +867 -0
  196. teradataml/dataframe/row.py +160 -0
  197. teradataml/dataframe/setop.py +2 -2
  198. teradataml/dataframe/sql.py +840 -33
  199. teradataml/dataframe/window.py +1 -1
  200. teradataml/dbutils/dbutils.py +878 -34
  201. teradataml/dbutils/filemgr.py +48 -1
  202. teradataml/geospatial/geodataframe.py +1 -1
  203. teradataml/geospatial/geodataframecolumn.py +1 -1
  204. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  205. teradataml/lib/aed_0_1.dll +0 -0
  206. teradataml/opensource/__init__.py +1 -1
  207. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  208. teradataml/opensource/_lightgbm.py +950 -0
  209. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  210. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  211. teradataml/opensource/sklearn/__init__.py +0 -1
  212. teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
  213. teradataml/options/__init__.py +9 -23
  214. teradataml/options/configure.py +42 -4
  215. teradataml/options/display.py +2 -2
  216. teradataml/plot/axis.py +4 -4
  217. teradataml/scriptmgmt/UserEnv.py +13 -9
  218. teradataml/scriptmgmt/lls_utils.py +77 -23
  219. teradataml/store/__init__.py +13 -0
  220. teradataml/store/feature_store/__init__.py +0 -0
  221. teradataml/store/feature_store/constants.py +291 -0
  222. teradataml/store/feature_store/feature_store.py +2223 -0
  223. teradataml/store/feature_store/models.py +1505 -0
  224. teradataml/store/vector_store/__init__.py +1586 -0
  225. teradataml/table_operators/Script.py +2 -2
  226. teradataml/table_operators/TableOperator.py +106 -20
  227. teradataml/table_operators/query_generator.py +3 -0
  228. teradataml/table_operators/table_operator_query_generator.py +3 -1
  229. teradataml/table_operators/table_operator_util.py +102 -56
  230. teradataml/table_operators/templates/dataframe_register.template +69 -0
  231. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  232. teradataml/telemetry_utils/__init__.py +0 -0
  233. teradataml/telemetry_utils/queryband.py +52 -0
  234. teradataml/utils/dtypes.py +4 -2
  235. teradataml/utils/validators.py +34 -2
  236. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
  237. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
  238. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  239. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  240. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -0,0 +1,950 @@
1
+ # ##################################################################
2
+ #
3
+ # Copyright 2024 Teradata. All rights reserved.
4
+ # TERADATA CONFIDENTIAL AND TRADE SECRET
5
+ #
6
+ # Primary Owner: Adithya Avvaru (adithya.avvaru@teradata.com)
7
+ # Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
8
+ #
9
+ # Version: 1.0
10
+ # Function Version: 1.0
11
+ #
12
+ # This file contains object wrapper class for lightgbm opensource package.
13
+ #
14
+ # ##################################################################
15
+
16
+
17
+ import base64
18
+ import json
19
+ import os
20
+ import pickle
21
+ import warnings
22
+
23
+ from collections import OrderedDict
24
+ from importlib import import_module
25
+
26
+
27
+ import pandas as pd
28
+ from teradatasqlalchemy import BLOB, CLOB, FLOAT
29
+
30
+ from teradataml import _TDML_DIRECTORY, UtilFuncs, execute_sql, TeradataMlException, Messages, MessageCodes, DataFrame
31
+ from teradataml.opensource._wrapper_utils import _generate_new_name
32
+ from teradataml.opensource.constants import OpenSourcePackage
33
+ from teradataml.opensource.sklearn._sklearn_wrapper import (
34
+ _FunctionWrapper, _OpenSourceObjectWrapper, _SkLearnObjectWrapper)
35
+
36
+
37
class _LightgbmDatasetWrapper(_OpenSourceObjectWrapper):
    """
    Internal object wrapper class for the lightgbm Dataset object.

    A script generated from "dataset.template" is installed in Vantage and
    executed there to construct the Dataset object(s) — one object per
    partition in the multi-model case, in which case "modelObj" holds a
    pandas DataFrame of (partition values, model) rows instead of a single
    Dataset object.
    """
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, model=None, module_name=None, class_name=None, kwargs=None):
        """
        DESCRIPTION:
            Initialize the wrapper either from an existing lightgbm Dataset
            object ("model") or from the module/class/arguments needed to
            construct one remotely.

        PARAMETERS:
            model:
                Optional Argument.
                Specifies an already constructed lightgbm Dataset object to wrap.
                Type: lightgbm.Dataset

            module_name:
                Optional Argument.
                Specifies the module containing the class to instantiate.
                Type: str

            class_name:
                Optional Argument.
                Specifies the name of the class to instantiate.
                Type: str

            kwargs:
                Optional Argument.
                Specifies the keyword arguments for the class constructor.
                Type: dict

        RAISES:
            TeradataMlException

        RETURNS:
            None
        """
        file_type = "file_fn_lightgbm"
        self._template_file = "dataset.template"
        super().__init__(model=model, module_name=module_name, class_name=class_name, kwargs=kwargs)

        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

        self._script_file_name = _generate_new_name(type=file_type, extension="py")
        self._data_args = OrderedDict()

        self._initialize_variables(table_name_prefix="td_lightgbm_")
        if model:
            # Wrapping an existing client-side Dataset object: mirror its
            # module/class and extract only the constructor arguments from
            # the object's __dict__.
            self.modelObj = model
            self.module_name = model.__module__.split("._")[0]
            self.class_name = model.__class__.__name__
            _model_init_arguments = model.__init__.__code__.co_varnames
            self.kwargs = dict((k, v) for k, v in model.__dict__.items() if k in _model_init_arguments)

            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
        else:
            # Construct the Dataset object(s) remotely in Vantage.
            self.initial_args = kwargs
            self._initialize_object()
            self.__run_func_returning_objects(all_kwargs=self.kwargs, use_dummy_initial_file=True)

    def __getattr__(self, name):
        """
        DESCRIPTION:
            Intercept attribute access to block Dataset methods that are not
            supported by the remote-execution model before delegating to the
            parent implementation.

        RAISES:
            ValueError, NotImplementedError
        """
        if name in ["construct"]:
            wt = self.initial_args.get("weight", None) if hasattr(self, "initial_args") else None
            # Fix: the original condition "(isinstance(wt, pd.DataFrame) and
            # wt.iloc[0]['get_weight'] is not None) or wt is not None" raised
            # for ANY non-None value, including a DataFrame whose stored
            # weight is None. Only raise when a weight is actually provided.
            if isinstance(wt, pd.DataFrame):
                weight_provided = wt.iloc[0]["get_weight"] is not None
            else:
                weight_provided = wt is not None
            if weight_provided:
                raise ValueError(f"The method '{name}' is not implemented when \"weight\" argument is provided.")

        if name in ["set_weight", "set_label"]:
            raise NotImplementedError(f"'{name}' is not implemented for Lightgbm Dataset object.\n")

        if name == "set_group" and isinstance(self.modelObj, pd.DataFrame):
            raise NotImplementedError("'set_group' is not implemented for Lightgbm Dataset object "\
                                      "in multi-model case as different models have different number "\
                                      "of rows and grouping them in one set of group is not possible.")

        return super().__getattr__(name)

    def save_binary(self, file_name, save_in_vantage=False):
        """
        DESCRIPTION:
            Save the model(s) to a binary file(s). Additionally the files are saved
            to Vantage if "save_in_vantage" argument is set to True.

        PARAMETERS:
            file_name:
                Required Argument.
                Specifies the absolute path of the file name to which lightgbm Dataset
                object is to be saved to.
                Note:
                    * File name is prefixed with underscore delimitted partition column
                      values in multi-model case.
                    * File name excluding extension and file name with extension should
                      not already be present in Vantage.
                Type: str

            save_in_vantage:
                Optional Argument.
                Specifies whether to save the file in VantageCloud Enterprise or user environment
                of VantageCloud Lake.
                Default Value: False
                Type: bool

        RETURNS:
            None

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> # Save the lightgbm Dataset object to a binary file in client.
            >>> lightgbm_dataset.save_binary("lightgbm_dataset.pickle")

            >>> # Save the lightgbm Dataset object to a binary file in client and Vantage.
            >>> lightgbm_dataset.save_binary("lightgbm_dataset.pickle", save_in_vantage=True)

        """
        _file_name = os.path.basename(file_name)
        _file_dir = os.path.dirname(file_name)
        if not isinstance(self.modelObj, pd.DataFrame):
            # Single-model case: delegate directly to lightgbm's save_binary.
            self.modelObj.save_binary(file_name)
            file_prefix = _file_name.split(".")[0]
            print("Model saved in client as ", file_name)
            if save_in_vantage:
                self._install_script_file(file_identifier=file_prefix,
                                          file_name=_file_name,
                                          is_binary=True,
                                          file_location=_file_dir)
                print(f"Model file {_file_name} saved in Vantage.")
        else:
            # Multi-model case: one file per partition, with partition column
            # values appended (underscore delimited) before the extension.
            no_of_unique_partitions = len(self._fit_partition_unique_values)
            no_of_partitioning_cols = len(self._fit_partition_unique_values[0])

            print("Multiple model files in multi-model case are saved with different names"\
                  " with partition column values information delimited by underscore.")

            for i in range(no_of_unique_partitions):
                partition_join = "_".join(list(map(str, self.modelObj.iloc[i, :no_of_partitioning_cols])))
                # Split extension from file name to add partition column values before extension.
                __file_name, __file_ext = os.path.splitext(_file_name)
                __file_name = f"{__file_name}_{partition_join}{__file_ext}"
                __file_prefix = os.path.splitext(__file_name)[0]  # File identifier.

                __joined_file = os.path.join(_file_dir, __file_name)
                self.modelObj.iloc[i]["model"].save_binary(__joined_file)

                if save_in_vantage:
                    self._install_script_file(file_identifier=__file_prefix,
                                              file_name=__file_name,
                                              is_binary=True,
                                              file_location=_file_dir)
                    print(f"Model file {__file_name} saved in Vantage.")

    def create_valid(self, **kwargs):
        """
        DESCRIPTION:
            Create a validation Dataset from the current Dataset. Not supported
            in the multi-model case.

        RAISES:
            NotImplementedError
        """
        if isinstance(self.modelObj, pd.DataFrame):
            raise NotImplementedError("'create_valid' is not implemented for Lightgbm Dataset object"\
                                      " in multi-model case.")
        return self.__run_func_returning_objects(all_kwargs=kwargs, func_name="create_valid")

    def __run_func_returning_objects(self, all_kwargs, func_name=None, use_dummy_initial_file=False):
        """
        Run the function with all the arguments passed from `td_sklearn.<function_name>` function.
        """
        kwargs = all_kwargs.copy()

        if kwargs.get("label", None) is not None:
            # Record SQLAlchemy and Python types of the label columns; they are
            # needed later when results are read back.
            label_df = kwargs["label"]
            self._fit_label_columns_types = []
            self._fit_label_columns_python_types = []
            for l_c in label_df.columns:
                column_data = label_df._td_column_names_and_sqlalchemy_types[l_c.lower()]
                self._fit_label_columns_types.append(column_data)
                self._fit_label_columns_python_types.append(column_data.python_type.__name__)

        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
            else f"./{self._db_name}/{self._script_file_name}"

        py_exc = UtilFuncs._get_python_execution_path()
        script_command = f"{py_exc} {script_file_path} {self._model_file_name_prefix} {self._is_lake_system}"

        # Lake systems transport the pickled model as BLOB; enterprise as CLOB.
        model_type = BLOB() if self._is_lake_system else CLOB()
        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols] + [("model", model_type)]

        if "reference" in kwargs.keys() and kwargs["reference"] is not None:
            # "reference" is another Dataset object which is passed as an argument.
            # It should be accessed through model file name prefix as it raises an exception
            # if we try to dump it as json -`TypeError: Object of type Dataset is not JSON serializable`.
            self.initial_args["reference"]._install_initial_model_file()
            kwargs["reference"] = self.initial_args["reference"]._model_file_name_prefix

        replace_dict.update({"<all_col_names>": str(list(self._tdml_df.columns)),
                             "<params>": json.dumps(kwargs),
                             "<module_name>": f"'{self.module_name}'",
                             "<class_name>": f"'{self.class_name}'",
                             "<func_name>": f"'{func_name}'" if func_name else "None"})

        # Generate new file in .teradataml directory and install it to Vantage.
        self._prepare_and_install_file(replace_dict=replace_dict)

        if partition_cols:
            self._fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

        self._install_initial_model_file(use_dummy_initial_file=use_dummy_initial_file)

        # Consistency fix: clean up the generated/installed script files even
        # when the remote run or model extraction fails, mirroring the
        # error-path cleanup done by _LightgbmFunctionWrapper.__call__.
        try:
            self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
            self._model_data._index_label = None

            self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
                                     n_partition_cols=len(partition_cols))
        finally:
            # File cleanup after processing (success or failure).
            os.remove(self._script_file_local)
            self._remove_script_file(self._script_file_name)

        return self
223
+
224
+
225
+ class _LightgbmFunctionWrapper(_FunctionWrapper):
226
+ OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM
227
+ def __init__(self, module_name=None, func_name=None):
228
+ file_type = "file_fn_lightgbm"
229
+ template_file = "lightgbm_function.template"
230
+ self._script_file_name = _generate_new_name(type=file_type, extension="py")
231
+ super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)
232
+ self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")
233
+
234
+ def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1, record_eval_exists=False):
235
+ """
236
+ Internal function to extract lightgbm object from the model(s) depending on the number of
237
+ partitions. When it is only one model, it is directly used as modelObj.
238
+ When it is multiple models, it is converted to pandas DataFrame and stored in modelObj.
239
+
240
+ PARAMETERS:
241
+ n_unique_partitions:
242
+ Optional Argument.
243
+ Specifies the number of unique partitions. If this argument is greater than 1,
244
+ then pandas DataFame is created for modelObj. Otherwise, model object is directly
245
+ stored in modelObj.
246
+ Type: int
247
+
248
+ n_partition_cols:
249
+ Optional Argument.
250
+ Specifies the number of partition columns. Since partition columns are stored in
251
+ the first columns of the self.model_data, this argument is used to extract model
252
+ object and other columns (console_output) from self.model_data.
253
+ Type: int
254
+
255
+ record_eval_exists:
256
+ Optional Argument.
257
+ Specifies whether record_evaluation callback exists in the function call.
258
+ If yes, then record_evaluation_result is also extracted from the model data.
259
+ Type: bool
260
+
261
+ RETURNS:
262
+ None
263
+
264
+ RAISES:
265
+ ValueError
266
+
267
+ EXAMPLES:
268
+ >>> # Extract model object, console output and record_evaluation results from the model
269
+ >>> # data and assign them to self.modelObj.
270
+ >>> self._extract_model_objs(n_unique_partitions=4, n_partition_cols=2, record_eval_exists=True)
271
+
272
+ """
273
+ vals = execute_sql("select * from {}".format(self._model_data._table_name)).fetchall()
274
+
275
+ # pickle will issue a caution warning, if model pickling was done with
276
+ # different library version than used here. The following disables any warnings
277
+ # that might otherwise show in the scriptlog files on the Advanced SQL Engine
278
+ # nodes in this case. Yet, do keep an eye for incompatible pickle versions.
279
+ warnings.filterwarnings("ignore")
280
+
281
+ model_obj = None
282
+ console_opt = None
283
+ record_eval_result = None
284
+ # Extract and unpickle the following:
285
+ # - column next to partition columns - model object.
286
+ # - column next to model object - console output.
287
+ # - column next to console output - record_evaluation_result (if record_evaluation callback
288
+ # is there in input).
289
+ for i, row in enumerate(vals):
290
+ if self._is_lake_system:
291
+ model_obj = pickle.loads(row[n_partition_cols])
292
+ # console_output is stored in the column next to model object.
293
+ console_opt = row[n_partition_cols+1].decode()
294
+ if record_eval_exists:
295
+ # record_evaluation_result is stored in the column next to console_output.
296
+ record_eval_result = pickle.loads(row[n_partition_cols+2])
297
+ else:
298
+ model_obj = pickle.loads(base64.b64decode(row[n_partition_cols].partition("'")[2]))
299
+ # console_output is stored in the column next to model object.
300
+ console_opt = base64.b64decode(row[n_partition_cols+1].partition("'")[2]).decode()
301
+ if record_eval_exists:
302
+ # record_evaluation_result is stored in the column next to console_output.
303
+ record_eval_result = pickle.loads(
304
+ base64.b64decode(row[n_partition_cols+2].partition("'")[2]))
305
+ row[n_partition_cols] = model_obj
306
+ row[n_partition_cols+1] = console_opt
307
+ if record_eval_exists:
308
+ row[n_partition_cols+2] = record_eval_result
309
+ vals[i] = row
310
+ if n_unique_partitions == 1:
311
+ # Return both model object and console output for single model case.
312
+ pdf_data = [model_obj, console_opt]
313
+ if record_eval_exists:
314
+ # Add record_evaluation_result to the pandas df if exists.
315
+ pdf_data.append(record_eval_result)
316
+ self.modelObj = pd.DataFrame([pdf_data],
317
+ # First column is partition column. Hence, removed.
318
+ columns=self._model_data.columns[1:])
319
+ elif n_unique_partitions > 1:
320
+ self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
321
+ else:
322
+ ValueError("Number of partitions should be greater than 0.")
323
+
324
+ warnings.filterwarnings("default")
325
+
326
+ def __call__(self, **kwargs):
327
+
328
+ if self._func_name == "cv" and kwargs.get("return_cvbooster", None):
329
+ raise NotImplementedError("return_cvbooster argument is not supported yet.")
330
+
331
+ train_set = kwargs.pop("train_set")
332
+
333
+ train_set._install_initial_model_file()
334
+
335
+ # Data with only partition columns to run training on correct Dataset object in
336
+ # appropriate AMP/Node.
337
+ data = train_set._model_data.drop(columns="model")
338
+
339
+ kwargs["train_set"] = train_set._model_file_name_prefix
340
+ train_part_unique_vals = train_set._fit_partition_unique_values
341
+
342
+ partition_cols = data.columns # Because all the columns are parition columns.
343
+
344
+ valid_sets = kwargs.pop("valid_sets", None)
345
+ if valid_sets:
346
+ kwargs["valid_sets"] = []
347
+ for _, val in enumerate(valid_sets):
348
+ val._install_initial_model_file()
349
+ kwargs["valid_sets"].append(val._model_file_name_prefix)
350
+ val_part_unique_vals = val._fit_partition_unique_values
351
+
352
+ # Make sure all datasets are partitioned on same column values.
353
+ if not self._validate_equality_of_partition_values(train_part_unique_vals,
354
+ val_part_unique_vals):
355
+ raise TeradataMlException(
356
+ Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING,
357
+ "training", "validation"),
358
+ MessageCodes.PARTITION_VALUES_NOT_MATCHING
359
+ )
360
+
361
+ # Handle callbacks. Check if record_evaluation callback is present.
362
+ rec_eval_exists = False # Flag to check if record_evaluation callback exists.
363
+ if "callbacks" in kwargs and kwargs["callbacks"] is not None:
364
+ callbacks = kwargs["callbacks"]
365
+ callbacks = [callbacks] if not isinstance(callbacks, list) else callbacks
366
+ for callback in callbacks:
367
+ if callback["func_name"] == "record_evaluation":
368
+ rec_eval_exists = True
369
+ break
370
+
371
+ script_file_path = f"{self._script_file_name}" if self._is_lake_system \
372
+ else f"./{self._db_name}/{self._script_file_name}"
373
+
374
+ py_exc = UtilFuncs._get_python_execution_path()
375
+ script_command = f"{py_exc} {script_file_path}"
376
+
377
+ _, partition_indices, partition_types, partition_cols = \
378
+ self._get_data_col_types_and_partition_col_indices_and_types(data,
379
+ partition_cols,
380
+ idx_delim=None,
381
+ types_delim=None)
382
+
383
+ model_file_prefix = None
384
+ if self._is_lake_system:
385
+ model_file_prefix = self._script_file_name.replace(".py", "")
386
+
387
+ replace_dict = {"<module_name>": self._module_name,
388
+ "<func_name>": self._func_name,
389
+ "<is_lake_system>": str(self._is_lake_system),
390
+ "<params>": json.dumps(kwargs),
391
+ "<partition_cols_indices>": str(partition_indices),
392
+ "<partition_cols_types>": str(partition_types),
393
+ "<model_file_prefix>": str(model_file_prefix)}
394
+
395
+ self._prepare_and_install_file(replace_dict=replace_dict)
396
+
397
+ # One additional column "console_output" containing captured console output which contain
398
+ # training and validation logs.
399
+ model_type = BLOB() if self._is_lake_system else CLOB()
400
+ return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
401
+ for col in partition_cols] + \
402
+ [("model", model_type), ("console_output", model_type)]
403
+
404
+ rec_eval_col_name = "record_evaluation_result"
405
+ if rec_eval_exists:
406
+ # If record_evaluation result exists in callback, add it to return types and corresponding
407
+ # output in script.
408
+ return_types.append((rec_eval_col_name, model_type))
409
+
410
+ _no_of_unique_partitions = len(train_set._fit_partition_unique_values)
411
+
412
+ try:
413
+ self._model_data = self._run_script(data, script_command, partition_cols, return_types)
414
+
415
+ self._extract_model_objs(n_unique_partitions=_no_of_unique_partitions,
416
+ n_partition_cols=len(partition_cols),
417
+ record_eval_exists=rec_eval_exists)
418
+
419
+ except Exception as ex:
420
+ # File cleanup if script execution fails or unable to fetch modelObj.
421
+ os.remove(self._script_file_local)
422
+ self._remove_script_file(self._script_file_name)
423
+ raise
424
+
425
+ # File cleanup after processing.
426
+ os.remove(self._script_file_local)
427
+ self._remove_script_file(self._script_file_name)
428
+
429
+ if _no_of_unique_partitions == 1:
430
+ # If only one partition, print the console output and return the model object.
431
+ print(self.modelObj.iloc[0]["console_output"])
432
+ if self._func_name == "cv":
433
+ return self.modelObj.iloc[0]["model"]
434
+ if not rec_eval_exists:
435
+ booster_obj = _LightgbmBoosterWrapper(model=self.modelObj.iloc[0]["model"])
436
+ else:
437
+ # If record_evaluation results are there, return dictionary of model object and
438
+ # record_evaluation results.
439
+ model_dict = {"model" : self.modelObj.iloc[0]["model"],
440
+ rec_eval_col_name : self.modelObj.iloc[0][rec_eval_col_name]}
441
+ booster_obj = _LightgbmBoosterWrapper(model=model_dict, model_column_name="model")
442
+ booster_obj._is_default_partition_value_fit = True
443
+ booster_obj._fit_partition_unique_values = train_part_unique_vals
444
+ booster_obj._is_model_installed = False # As model is trained and returned but not saved to Vantage.
445
+
446
+ else:
447
+ if self._func_name == "cv":
448
+ return self.modelObj
449
+ booster_obj = _LightgbmBoosterWrapper(model=self.modelObj, model_column_name="model")
450
+ booster_obj._fit_partition_colums_non_default = partition_cols
451
+ booster_obj._is_default_partition_value_fit = train_set._is_default_partition_value_fit
452
+
453
+ booster_obj._fit_partition_unique_values = train_part_unique_vals
454
+ booster_obj._is_model_installed = False # As model is trained and returned but not saved to Vantage.
455
+
456
+ return booster_obj
457
+
458
+
459
# Using _SkLearnObjectWrapper as the base class for _LightgbmBoosterWrapper because the
# _transform() method is not present in the _OpenSourceObjectWrapper class.
class _LightgbmBoosterWrapper(_SkLearnObjectWrapper):
    """
    Wrapper for a lightgbm Booster object - a single Booster in the single-model
    case, or a pandas DataFrame of Boosters keyed by partition columns in the
    multi-model case. Supported Booster functions (e.g. predict) are executed
    against data in Vantage via dynamically generated scripts.
    """
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, model=None, module_name=None, class_name=None, kwargs=None,
                 model_column_name=None):
        """
        DESCRIPTION:
            Initialize the Booster wrapper either from an existing model - a
            Booster object, a dict holding the model (and optionally
            record_evaluation results), or a pandas DataFrame of models in the
            multi-model case - or, when "model" is None, construct a new model
            object from "module_name", "class_name" and "kwargs".

        PARAMETERS:
            model:
                Optional Argument.
                Existing model object, dict, or pandas DataFrame of models.

            module_name:
                Optional Argument.
                Module of the class to instantiate when "model" is None.

            class_name:
                Optional Argument.
                Name of the class to instantiate when "model" is None.

            kwargs:
                Optional Argument.
                Keyword arguments for constructing a new model object.

            model_column_name:
                Optional Argument.
                Column name (multi-model DataFrame) or dict key under which the
                model object is stored.
        """
        file_type = "file_fn_lightgbm_booster"

        self._model_column_name = model_column_name

        self.record_evaluation_result = None

        if isinstance(model, dict) and self._model_column_name in model:
            # Model (and, optionally, record_evaluation results) are stored in a
            # dictionary keyed by "self._model_column_name". Use get() so that a
            # dict without "record_evaluation_result" does not raise KeyError.
            self.record_evaluation_result = model.get("record_evaluation_result")
            model = model[self._model_column_name]

        _OpenSourceObjectWrapper.__init__(self, model=model, module_name=module_name,
                                          class_name=class_name, kwargs=kwargs)

        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

        self._script_file_name = _generate_new_name(type=file_type, extension="py")

        self._initialize_variables(table_name_prefix="td_lightgbm_")
        if model is not None:
            first_model = model
            if isinstance(model, pd.DataFrame):
                # Multi-model case - derive module/class information from the
                # first partition's model.
                first_model = model.iloc[0][self._model_column_name]
            self.modelObj = model
            self.module_name = first_model.__module__.split("._")[0]
            self.class_name = first_model.__class__.__name__
            # Keep only those instance attributes that the model's __init__ accepts,
            # so the wrapper can re-create an equivalent object later.
            _model_init_arguments = first_model.__init__.__code__.co_varnames
            self.kwargs = {k: v for k, v in first_model.__dict__.items()
                           if k in _model_init_arguments}

            self.pos_args = tuple()

        else:
            # Create model object from new positional and keyword arguments.
            # Unwrap a teradataml Dataset wrapper to the underlying lightgbm Dataset.
            if isinstance(self.kwargs.get("train_set"), _LightgbmDatasetWrapper):
                self.kwargs["train_set"] = self.kwargs["train_set"].modelObj

            from importlib import import_module
            class_obj = getattr(import_module(self.module_name), self.class_name)
            self.modelObj = class_obj(**self.kwargs)

    def deploy(self, model_name, replace_if_exists=False):
        # NOTE: implicit string concatenation (not a backslash continuation) so the
        # message does not contain a run of embedded indentation spaces.
        raise NotImplementedError("The deploy() function is not yet supported for lightgbm "
                                  "OpensourceML objects. Support will be added in future releases.")

    @property
    def model_info(self):
        """
        DESCRIPTION:
            Get the model information along with console output for multi-model case. Only model
            object is returned for single model case.
            Note:
                This is particularly useful in multi-model case when the user wants to see the
                console output of each partition.

        PARAMETERS:
            None

        RAISES:
            None

        RETURNS:
            Pandas DataFrame

        EXAMPLES:
            # Load example data.
            >>> load_example_data("openml", ["multi_model_classification"])
            >>> df = DataFrame("multi_model_classification")
            >>> df.head(3)
                       col2      col3      col4     label  group_column  partition_column_1  partition_column_2
            col1
            -2.560430  0.402232 -1.100742 -2.959588      0             9                   0                  10
            -3.587546  0.291819 -1.850169 -4.331055      0            10                   0                  10
            -3.697436  1.576888 -0.461220 -3.598652      0            10                   0                  11

            # Get the feature and label data.
            >>> df_x = df.select(["col1", "col2", "col3", "col4"])
            >>> df_y = df.select("label")

            # Partition columns for multi model case.
            >>> part_cols = ["partition_column_1", "partition_column_2"]

            ## Single model case.
            # Create lightgbm Dataset object.
            >>> lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)

            # Train the model.
            >>> model = td_lightgbm.train(params={}, train_set=lgbm_data,
            ...                           num_boost_round=30,
            ...                           early_stopping_rounds=50)
            >>> model  # This is object of _LightgbmBoosterWrapper class.
            <lightgbm.basic.Booster object at 0x0000025BD2459160>

            ## Multi model case.
            # Create lightgbm Dataset objects for training and validation.
            >>> obj_m = td_lightgbm.Dataset(df_x, df_y, free_raw_data=False,
                                            partition_columns=part_cols)

            >>> obj_m_v = td_lightgbm.Dataset(df_x, df_y, free_raw_data=False,
                                              partition_columns=part_cols)

            # Train the models in multi model case.
            >>> model = td_lightgbm.train(params={}, train_set=obj_m,
            ...                           num_boost_round=30,
            ...                           early_stopping_rounds=50,
            ...                           valid_sets=[obj_m_v, obj_m_v])
            >>> model
               partition_column_1  partition_column_2  \
            0                   1                  11
            1                   0                  11
            2                   1                  10
            3                   0                  10

                                                           model  \
            0  <lightgbm.basic.Booster object at 0x7f2e95ffc0a0>
            1  <lightgbm.basic.Booster object at 0x7f2e95ffc880>
            2  <lightgbm.basic.Booster object at 0x7f2e95f852e0>
            3  <lightgbm.basic.Booster object at 0x7f2e95f853a0>

                                                  console_output
            0  [LightGBM] [Warning] Auto-choosing col-wise mu...
            1  [LightGBM] [Warning] Auto-choosing row-wise mu...
            2  [LightGBM] [Warning] Auto-choosing col-wise mu...
            3  [LightGBM] [Warning] Auto-choosing row-wise mu...

            # Get the model information which returns the printed output as pandas
            # DataFrame containing the model information along with console output.
            >>> model_info = model.model_info

            # Print console output of first partition.
            >>> print(model_info.iloc[0]["console_output"])
            [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
            You can set `force_col_wise=true` to remove the overhead.
            [LightGBM] [Info] Total Bins 136
            [LightGBM] [Info] Number of data points in the train set: 97, number of used features: 4
            [LightGBM] [Info] Start training from score 0.556701
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [1]	valid_0's l2: 0.219637	valid_1's l2: 0.219637
            Training until validation scores don't improve for 50 rounds
            ...
            [30]	valid_0's l2: 0.0652665	valid_1's l2: 0.0652665
            Did not meet early stopping. Best iteration is:
            [30]	valid_0's l2: 0.0652665	valid_1's l2: 0.0652665

        """
        return self.modelObj

    def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
        """
        Internal function to convert all OpensourceML related objects in arguments to
        underlying model objects.
        """
        # Types whose instances carry an underlying lightgbm object in "modelObj".
        _wrapper_types = (type(self), _LightgbmDatasetWrapper)

        if isinstance(args, dict):
            new_args = args.copy()  # To avoid updating the caller's dict in place.
            for k, v in new_args.items():
                if isinstance(v, _wrapper_types):
                    if idx_multi_model is None:
                        # Single model. "idx_multi_model" is set only when modelObj
                        # is multi model.
                        new_args[k] = v.modelObj
                    else:
                        # Multi-model. Get appropriate model from modelObj.
                        new_args[k] = v.modelObj.iloc[idx_multi_model][self._model_column_name]
                # Non-wrapper values are already present via the copy() above.
            return new_args

        # If args is tuple, convert all elements to underlying model object.
        if isinstance(args, tuple):
            new_args = tuple()
            for arg in args:
                if isinstance(arg, _wrapper_types):
                    if idx_multi_model is None:
                        # Single model. This argument is set only when modelObj is single model.
                        new_args += (arg.modelObj,)
                    else:
                        # Multi-model. Get appropriate model from modelObj.
                        new_args += (arg.modelObj.iloc[idx_multi_model][self._model_column_name],)
                else:
                    new_args += (arg,)
            return new_args

        return args

    def __getattr__(self, name):
        def __run_transform(*c, **kwargs):
            # Lightgbm predict method takes other keyword arguments along with data
            # related arguments. Hence need to generate the script dynamically instead
            # of using standard scikit-learn's sklearn_transform.py file.
            self._convert_pos_args_to_kwargs_for_function(c, kwargs, name)
            self._generate_script_file_from_template_file(kwargs=kwargs,
                                                          template_file="lightgbm_class_functions.template",
                                                          func_name=name)

            return self._transform(**kwargs)

        # TODO: Will be added as part of ELE-7150
        if name in ("add_valid", "eval", "eval_train", "eval_valid", "refit", "set_attr", "update"):
            raise NotImplementedError(f"{name}() function is not supported yet. Will be added in future releases.")

        # TODO: Will be added as part of ELE-7150
        if name == "model_from_string" and not self._is_default_partition_value_fit:
            # For multi model case of model_from_string() function.
            raise NotImplementedError(
                "model_from_string() function is not supported for multi model case. Will be added in future releases.")

        # TODO: Will be added as part of ELE-7150
        if name == "set_network":
            raise NotImplementedError(
                "set_network() function is not applicable for Teradata Vantage.")

        if name == "predict":
            return __run_transform
        return super().__getattr__(name)

    def _transform(self, **kwargs):
        # Overrides the parent _transform method to handle data related arguments and
        # other keyword arguments.

        # Extract data and label columns.
        data_df = kwargs.pop("data")  # "data" is mandatory argument for predict method.
        current_dfs = [data_df]
        feature_columns = data_df.columns

        label_columns = None
        # Pop "label" only when present and not None (a None value, if passed,
        # flows through to the parent unchanged - same as the original behavior).
        if kwargs.get("label") is not None:
            label_df = kwargs.pop("label")
            current_dfs.append(label_df)
            label_columns = label_df.columns

        file_name = kwargs.pop("file_name")

        from teradataml.dataframe.dataframe_utils import DataFrameUtils
        data = DataFrameUtils()._get_common_parent_df_from_dataframes(current_dfs)

        try:
            # Install initial model file and script file to Vantage.
            self._install_model_and_script_files(file_name=file_name,
                                                 file_location=self._tdml_tmp_dir)

            trans_opt = super()._transform(data=data, feature_columns=feature_columns,
                                           label_columns=label_columns, file_name=file_name,
                                           **kwargs)
        finally:
            # File cleanup - runs on both the success and the failure path
            # (previously duplicated in an except block and after the try).
            os.remove(os.path.join(self._tdml_tmp_dir, file_name))
            self._remove_script_file(file_name)

        return trans_opt

    def __repr__(self):
        # Delegate to the underlying model object's representation.
        return self.modelObj.__repr__()
774
+
775
+
776
class _LighgbmSklearnWrapper(_SkLearnObjectWrapper):
    """
    Wrapper for lightgbm's scikit-learn style estimators (LGBMClassifier,
    LGBMRegressor, etc.), executing fit/score/predict against data in Vantage.
    NOTE: the class name keeps its original (misspelled) form for backward
    compatibility with existing callers.
    """
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, model=None, module_name=None, class_name=None, kwargs=None):
        super().__init__(model=model, module_name=module_name, class_name=class_name, kwargs=kwargs)
        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

    def deploy(self, model_name, replace_if_exists=False):
        # NOTE: implicit string concatenation (not a backslash continuation) so the
        # message does not contain a run of embedded indentation spaces.
        raise NotImplementedError("The deploy() function is not yet supported for lightgbm "
                                  "OpensourceML objects. Support will be added in future releases.")

    def set_params(self, **params):
        """
        Please check the description in Docs/OpensourceML/sklearn.py.
        """
        # Merge the new parameters into the existing ones, then re-create the
        # underlying model object so the new values take effect.
        self.kwargs.update(params)

        self.__init__(None, self.module_name, self.class_name, self.kwargs)
        return self

    def _process_and_run_fit_and_score_run(self, pos_args, kwargs, func_name):
        """
        Internal function to process data related arguments and other keyword arguments
        for fit and score methods.
        """
        self._convert_pos_args_to_kwargs_for_function(pos_args, kwargs, func_name)

        # Use "is not None" instead of truthiness - the truth value of a
        # DataFrame-like object is ambiguous/undefined.
        _y = kwargs.get("y")
        label_columns = _y.columns if _y is not None else kwargs.get("label_columns")

        if func_name == "score":
            # Get partition columns from the trained model object.
            if self._fit_partition_colums_non_default is not None and "partition_columns" not in kwargs:
                kwargs["partition_columns"] = self._fit_partition_colums_non_default
        if func_name == "fit":
            earlier_partition_cols = kwargs.get("partition_columns")
            if earlier_partition_cols:
                self._is_default_partition_value_fit = False
                self._fit_partition_colums_non_default = earlier_partition_cols
            else:
                self._is_default_partition_value_fit = True
                self._fit_partition_colums_non_default = None

        generated_script_file = _generate_new_name(type=f"file_fn_lightgbm_sklearn_{func_name}", extension="py")

        non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)

        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

        # Update non data related arguments in replace_dict containing data related
        # argument information.
        replace_dict.update({"<params>": json.dumps(non_data_related_args),
                             "<func_name>": f"'{func_name}'",
                             "<model_file_prefix>": f"'{self._model_file_name_prefix}'",
                             "<is_lake_system>": str(self._is_lake_system)})

        # Replace placeholders in template file with actual values and write to new file.
        self._read_from_template_and_write_dict_to_file(template_file="lightgbm_sklearn.template",
                                                        replace_dict=replace_dict,
                                                        output_script_file_name=generated_script_file)

        if func_name == "fit":
            # Get unique values in partitioning columns.
            self._fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

        # Install initial model file and script file to Vantage.
        self._install_model_and_script_files(file_name=generated_script_file,
                                             file_location=self._tdml_tmp_dir)

        # db_name is applicable for enterprise system.
        db_file_name = generated_script_file if self._is_lake_system else f"./{self._db_name}/{generated_script_file}"
        py_exc = UtilFuncs._get_python_execution_path()
        script_command = f"{py_exc} {db_file_name}"

        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols]
        if func_name == "fit":
            model_type = BLOB() if self._is_lake_system else CLOB()
            return_types += [("model", model_type)]
        if func_name == "score":
            return_types += [("score", FLOAT())]
            # Checking the trained model installation. If not installed,
            # set flag to True (as it is already installed in
            # `self._install_model_and_script_files()` call).
            if not self._is_trained_model_installed:
                self._is_trained_model_installed = True

        try:
            opt = self._run_script(data=self._tdml_df, command=script_command,
                                   partition_columns=partition_cols,
                                   return_types=return_types)
        finally:
            # File cleanup - runs on both the success and the failure path
            # (previously duplicated in an except block and after the try).
            os.remove(os.path.join(self._tdml_tmp_dir, generated_script_file))
            self._remove_script_file(generated_script_file)

        if func_name == "fit":
            self._model_data = opt
            self._assign_fit_variables_after_execution(self._tdml_df, partition_cols, label_columns)
            return self

        if func_name == "score":
            if self._is_default_partition_value_fit:
                # For single model case, partition column is internally generated and
                # no point in returning it to the user.
                opt = opt.select(func_name)
            return opt

    def fit(self, *c, **kwargs):
        """Fit the underlying lightgbm sklearn estimator against data in Vantage."""
        return self._process_and_run_fit_and_score_run(c, kwargs, "fit")

    def score(self, *c, **kwargs):
        """Score the fitted estimator against data in Vantage."""
        return self._process_and_run_fit_and_score_run(c, kwargs, "score")

    def _transform(self, **kwargs):
        # Overrides the parent _transform method to handle data related arguments and
        # other keyword arguments.

        # Extract data and label columns.
        data_df = kwargs.pop("X")  # "X" is mandatory argument for predict method.
        current_dfs = [data_df]
        feature_columns = data_df.columns

        label_columns = None
        # Pop "y" only when present and not None (a None value, if passed,
        # flows through to the parent unchanged - same as the original behavior).
        if kwargs.get("y") is not None:
            label_df = kwargs.pop("y")
            current_dfs.append(label_df)
            label_columns = label_df.columns

        file_name = kwargs.pop("file_name")

        from teradataml.dataframe.dataframe_utils import DataFrameUtils
        data = DataFrameUtils()._get_common_parent_df_from_dataframes(current_dfs)

        try:
            # Install initial model file and script file to Vantage.
            self._install_model_and_script_files(file_name=file_name,
                                                 file_location=self._tdml_tmp_dir)

            trans_opt = super()._transform(data=data, feature_columns=feature_columns,
                                           label_columns=label_columns, file_name=file_name,
                                           **kwargs)
        finally:
            # File cleanup - runs on both the success and the failure path
            # (previously duplicated in an except block and after the try).
            os.remove(os.path.join(self._tdml_tmp_dir, file_name))
            self._remove_script_file(file_name)

        return trans_opt

    def __getattr__(self, name):
        def __run_transform(*c, **kwargs):
            # Lightgbm predict method takes other keyword arguments along with data
            # related arguments. Hence need to generate the script dynamically instead
            # of using standard scikit-learn's sklearn_transform.py file.
            generated_script_file = _generate_new_name(type=f"file_fn_lightgbm_sklearn_{name}", extension="py")

            self._convert_pos_args_to_kwargs_for_function(c, kwargs, name)
            self._generate_script_file_from_template_file(kwargs=kwargs,
                                                          template_file="lightgbm_class_functions.template",
                                                          func_name=name,
                                                          output_script_file_name=generated_script_file)

            return self._transform(**kwargs)

        if name in ("predict", "predict_proba"):
            return __run_transform
        return super().__getattr__(name)