teradataml-20.0.0.1-py3-none-any.whl → teradataml-20.0.0.3-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/opensource/_lightgbm.py (new file, +950 -0)

@@ -0,0 +1,950 @@
# ##################################################################
#
# Copyright 2024 Teradata. All rights reserved.
# TERADATA CONFIDENTIAL AND TRADE SECRET
#
# Primary Owner: Adithya Avvaru (adithya.avvaru@teradata.com)
# Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
#
# Version: 1.0
# Function Version: 1.0
#
# This file contains the object wrapper classes for the lightgbm opensource package.
#
# ##################################################################


import base64
import json
import os
import pickle
import warnings

from collections import OrderedDict
from importlib import import_module


import pandas as pd
from teradatasqlalchemy import BLOB, CLOB, FLOAT

from teradataml import _TDML_DIRECTORY, UtilFuncs, execute_sql, TeradataMlException, Messages, MessageCodes, DataFrame
from teradataml.opensource._wrapper_utils import _generate_new_name
from teradataml.opensource.constants import OpenSourcePackage
from teradataml.opensource.sklearn._sklearn_wrapper import (
    _FunctionWrapper, _OpenSourceObjectWrapper, _SkLearnObjectWrapper)


class _LightgbmDatasetWrapper(_OpenSourceObjectWrapper):
    """
    Internal class for the lightgbm Dataset object.
    """
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, model=None, module_name=None, class_name=None, kwargs=None):
        file_type = "file_fn_lightgbm"
        self._template_file = "dataset.template"
        super().__init__(model=model, module_name=module_name, class_name=class_name, kwargs=kwargs)

        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

        self._script_file_name = _generate_new_name(type=file_type, extension="py")
        self._data_args = OrderedDict()

        self._initialize_variables(table_name_prefix="td_lightgbm_")
        if model:
            self.modelObj = model
            self.module_name = model.__module__.split("._")[0]
            self.class_name = model.__class__.__name__
            _model_init_arguments = model.__init__.__code__.co_varnames
            self.kwargs = dict((k, v) for k, v in model.__dict__.items() if k in _model_init_arguments)

            self.pos_args = tuple()  # Kept empty as all arguments are moved to kwargs.
        else:
            self.initial_args = kwargs
            self._initialize_object()
            self.__run_func_returning_objects(all_kwargs=self.kwargs, use_dummy_initial_file=True)

    def __getattr__(self, name):
        if name in ["construct"]:
            wt = self.initial_args.get("weight", None) if hasattr(self, "initial_args") else None
            if (isinstance(wt, pd.DataFrame) and wt.iloc[0]["get_weight"] is not None) or wt is not None:
                raise ValueError(f"The method '{name}' is not implemented when \"weight\" argument is provided.")

        if name in ["set_weight", "set_label"]:
            raise NotImplementedError(f"'{name}' is not implemented for Lightgbm Dataset object.\n")

        if name == "set_group" and isinstance(self.modelObj, pd.DataFrame):
            raise NotImplementedError("'set_group' is not implemented for Lightgbm Dataset object "
                                      "in multi-model case as different models have different numbers "
                                      "of rows and grouping them in one set of group is not possible.")

        return super().__getattr__(name)

    def save_binary(self, file_name, save_in_vantage=False):
        """
        DESCRIPTION:
            Save the model(s) to binary file(s). Additionally, the files are saved
            to Vantage if the "save_in_vantage" argument is set to True.

        PARAMETERS:
            file_name:
                Required Argument.
                Specifies the absolute path of the file to which the lightgbm Dataset
                object is to be saved.
                Note:
                    * The file name is prefixed with underscore-delimited partition column
                      values in the multi-model case.
                    * Neither the file name excluding the extension nor the file name with
                      the extension should already be present in Vantage.
                Type: str

            save_in_vantage:
                Optional Argument.
                Specifies whether to save the file in VantageCloud Enterprise or the user
                environment of VantageCloud Lake.
                Default Value: False
                Type: bool

        RETURNS:
            None

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> # Save the lightgbm Dataset object to a binary file on the client.
            >>> lightgbm_dataset.save_binary("lightgbm_dataset.pickle")

            >>> # Save the lightgbm Dataset object to a binary file on the client and in Vantage.
            >>> lightgbm_dataset.save_binary("lightgbm_dataset.pickle", save_in_vantage=True)

        """
        _file_name = os.path.basename(file_name)
        _file_dir = os.path.dirname(file_name)
        if not isinstance(self.modelObj, pd.DataFrame):
            self.modelObj.save_binary(file_name)
            file_prefix = _file_name.split(".")[0]
            print("Model saved in client as ", file_name)
            if save_in_vantage:
                self._install_script_file(file_identifier=file_prefix,
                                          file_name=_file_name,
                                          is_binary=True,
                                          file_location=_file_dir)
                print(f"Model file {_file_name} saved in Vantage.")
        else:
            no_of_unique_partitions = len(self._fit_partition_unique_values)
            no_of_partitioning_cols = len(self._fit_partition_unique_values[0])

            print("Multiple model files in multi-model case are saved with different names"
                  " with partition column values information delimited by underscore.")

            for i in range(no_of_unique_partitions):
                partition_join = "_".join(list(map(str, self.modelObj.iloc[i, :no_of_partitioning_cols])))
                # Split the extension from the file name to add partition column values before the extension.
                __file_name, __file_ext = os.path.splitext(_file_name)
                __file_name = f"{__file_name}_{partition_join}{__file_ext}"
                __file_prefix = os.path.splitext(__file_name)[0]  # File identifier.

                __joined_file = os.path.join(_file_dir, __file_name)
                self.modelObj.iloc[i]["model"].save_binary(__joined_file)

                if save_in_vantage:
                    self._install_script_file(file_identifier=__file_prefix,
                                              file_name=__file_name,
                                              is_binary=True,
                                              file_location=_file_dir)
                    print(f"Model file {__file_name} saved in Vantage.")

    def create_valid(self, **kwargs):
        if isinstance(self.modelObj, pd.DataFrame):
            raise NotImplementedError("'create_valid' is not implemented for Lightgbm Dataset object"
                                      " in multi-model case.")
        return self.__run_func_returning_objects(all_kwargs=kwargs, func_name="create_valid")

    def __run_func_returning_objects(self, all_kwargs, func_name=None, use_dummy_initial_file=False):
        """
        Run the function with all the arguments passed from the `td_lightgbm.<function_name>` function.
        """
        kwargs = all_kwargs.copy()

        if kwargs.get("label", None) is not None:
            label_df = kwargs["label"]
            self._fit_label_columns_types = []
            self._fit_label_columns_python_types = []
            for l_c in label_df.columns:
                column_data = label_df._td_column_names_and_sqlalchemy_types[l_c.lower()]
                self._fit_label_columns_types.append(column_data)
                self._fit_label_columns_python_types.append(column_data.python_type.__name__)

        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
            else f"./{self._db_name}/{self._script_file_name}"

        py_exc = UtilFuncs._get_python_execution_path()
        script_command = f"{py_exc} {script_file_path} {self._model_file_name_prefix} {self._is_lake_system}"

        model_type = BLOB() if self._is_lake_system else CLOB()
        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols] + [("model", model_type)]

        if "reference" in kwargs.keys() and kwargs["reference"] is not None:
            # "reference" is another Dataset object which is passed as an argument.
            # It should be accessed through the model file name prefix because dumping it as
            # json raises `TypeError: Object of type Dataset is not JSON serializable`.
            self.initial_args["reference"]._install_initial_model_file()
            kwargs["reference"] = self.initial_args["reference"]._model_file_name_prefix

        replace_dict.update({"<all_col_names>": str(list(self._tdml_df.columns)),
                             "<params>": json.dumps(kwargs),
                             "<module_name>": f"'{self.module_name}'",
                             "<class_name>": f"'{self.class_name}'",
                             "<func_name>": f"'{func_name}'" if func_name else "None"})

        # Generate a new file in the .teradataml directory and install it to Vantage.
        self._prepare_and_install_file(replace_dict=replace_dict)

        if partition_cols:
            self._fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

        self._install_initial_model_file(use_dummy_initial_file=use_dummy_initial_file)

        self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
        self._model_data._index_label = None

        self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
                                 n_partition_cols=len(partition_cols))

        # File cleanup after processing.
        os.remove(self._script_file_local)
        self._remove_script_file(self._script_file_name)

        return self

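For orientation, a minimal usage sketch of the Dataset wrapper above, reached through the td_lightgbm interface this release introduces (not part of the diff; the calls mirror the docstring examples later in this file, and the connection parameters are placeholders):

    from teradataml import create_context, load_example_data, DataFrame, td_lightgbm

    # Placeholder credentials; assumes a reachable Vantage system.
    create_context(host="<host>", username="<user>", password="<password>")

    # Example data referenced by the docstrings in this file.
    load_example_data("openml", ["multi_model_classification"])
    df = DataFrame("multi_model_classification")
    df_x = df.select(["col1", "col2", "col3", "col4"])
    df_y = df.select("label")

    # Wraps lightgbm.Dataset; construction runs in-database via a generated script.
    lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)

    # Persist the Dataset locally and, optionally, install it in Vantage.
    lgbm_data.save_binary("lightgbm_dataset.pickle", save_in_vantage=True)
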
class _LightgbmFunctionWrapper(_FunctionWrapper):
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, module_name=None, func_name=None):
        file_type = "file_fn_lightgbm"
        template_file = "lightgbm_function.template"
        self._script_file_name = _generate_new_name(type=file_type, extension="py")
        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)
        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

    def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1, record_eval_exists=False):
        """
        Internal function to extract the lightgbm object from the model(s) depending on the number
        of partitions. When there is only one model, it is directly used as modelObj.
        When there are multiple models, they are converted to a pandas DataFrame and stored in modelObj.

        PARAMETERS:
            n_unique_partitions:
                Optional Argument.
                Specifies the number of unique partitions. If this argument is greater than 1,
                then a pandas DataFrame is created for modelObj. Otherwise, the model object is
                directly stored in modelObj.
                Type: int

            n_partition_cols:
                Optional Argument.
                Specifies the number of partition columns. Since partition columns are stored in
                the first columns of self._model_data, this argument is used to extract the model
                object and other columns (console_output) from self._model_data.
                Type: int

            record_eval_exists:
                Optional Argument.
                Specifies whether the record_evaluation callback exists in the function call.
                If yes, then record_evaluation_result is also extracted from the model data.
                Type: bool

        RETURNS:
            None

        RAISES:
            ValueError

        EXAMPLES:
            >>> # Extract model object, console output and record_evaluation results from the model
            >>> # data and assign them to self.modelObj.
            >>> self._extract_model_objs(n_unique_partitions=4, n_partition_cols=2, record_eval_exists=True)

        """
        vals = execute_sql("select * from {}".format(self._model_data._table_name)).fetchall()

        # pickle will issue a caution warning if model pickling was done with a
        # different library version than the one used here. The following disables any warnings
        # that might otherwise show in the scriptlog files on the Advanced SQL Engine
        # nodes in this case. Yet, do keep an eye out for incompatible pickle versions.
        warnings.filterwarnings("ignore")

        model_obj = None
        console_opt = None
        record_eval_result = None
        # Extract and unpickle the following:
        # - column next to partition columns - model object.
        # - column next to model object - console output.
        # - column next to console output - record_evaluation_result (if the record_evaluation
        #   callback is there in the input).
        for i, row in enumerate(vals):
            if self._is_lake_system:
                model_obj = pickle.loads(row[n_partition_cols])
                # console_output is stored in the column next to the model object.
                console_opt = row[n_partition_cols+1].decode()
                if record_eval_exists:
                    # record_evaluation_result is stored in the column next to console_output.
                    record_eval_result = pickle.loads(row[n_partition_cols+2])
            else:
                model_obj = pickle.loads(base64.b64decode(row[n_partition_cols].partition("'")[2]))
                # console_output is stored in the column next to the model object.
                console_opt = base64.b64decode(row[n_partition_cols+1].partition("'")[2]).decode()
                if record_eval_exists:
                    # record_evaluation_result is stored in the column next to console_output.
                    record_eval_result = pickle.loads(
                        base64.b64decode(row[n_partition_cols+2].partition("'")[2]))
            row[n_partition_cols] = model_obj
            row[n_partition_cols+1] = console_opt
            if record_eval_exists:
                row[n_partition_cols+2] = record_eval_result
            vals[i] = row
        if n_unique_partitions == 1:
            # Return both model object and console output for the single model case.
            pdf_data = [model_obj, console_opt]
            if record_eval_exists:
                # Add record_evaluation_result to the pandas df if it exists.
                pdf_data.append(record_eval_result)
            self.modelObj = pd.DataFrame([pdf_data],
                                         # First column is the partition column. Hence, removed.
                                         columns=self._model_data.columns[1:])
        elif n_unique_partitions > 1:
            self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
        else:
            raise ValueError("Number of partitions should be greater than 0.")

        warnings.filterwarnings("default")

    def __call__(self, **kwargs):
        if self._func_name == "cv" and kwargs.get("return_cvbooster", None):
            raise NotImplementedError("return_cvbooster argument is not supported yet.")

        train_set = kwargs.pop("train_set")

        train_set._install_initial_model_file()

        # Data with only partition columns to run training on the correct Dataset object on the
        # appropriate AMP/Node.
        data = train_set._model_data.drop(columns="model")

        kwargs["train_set"] = train_set._model_file_name_prefix
        train_part_unique_vals = train_set._fit_partition_unique_values

        partition_cols = data.columns  # Because all the columns are partition columns.

        valid_sets = kwargs.pop("valid_sets", None)
        if valid_sets:
            kwargs["valid_sets"] = []
            for _, val in enumerate(valid_sets):
                val._install_initial_model_file()
                kwargs["valid_sets"].append(val._model_file_name_prefix)
                val_part_unique_vals = val._fit_partition_unique_values

                # Make sure all datasets are partitioned on the same column values.
                if not self._validate_equality_of_partition_values(train_part_unique_vals,
                                                                   val_part_unique_vals):
                    raise TeradataMlException(
                        Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING,
                                             "training", "validation"),
                        MessageCodes.PARTITION_VALUES_NOT_MATCHING
                    )

        # Handle callbacks. Check if the record_evaluation callback is present.
        rec_eval_exists = False  # Flag to check if the record_evaluation callback exists.
        if "callbacks" in kwargs and kwargs["callbacks"] is not None:
            callbacks = kwargs["callbacks"]
            callbacks = [callbacks] if not isinstance(callbacks, list) else callbacks
            for callback in callbacks:
                if callback["func_name"] == "record_evaluation":
                    rec_eval_exists = True
                    break

        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
            else f"./{self._db_name}/{self._script_file_name}"

        py_exc = UtilFuncs._get_python_execution_path()
        script_command = f"{py_exc} {script_file_path}"

        _, partition_indices, partition_types, partition_cols = \
            self._get_data_col_types_and_partition_col_indices_and_types(data,
                                                                         partition_cols,
                                                                         idx_delim=None,
                                                                         types_delim=None)

        model_file_prefix = None
        if self._is_lake_system:
            model_file_prefix = self._script_file_name.replace(".py", "")

        replace_dict = {"<module_name>": self._module_name,
                        "<func_name>": self._func_name,
                        "<is_lake_system>": str(self._is_lake_system),
                        "<params>": json.dumps(kwargs),
                        "<partition_cols_indices>": str(partition_indices),
                        "<partition_cols_types>": str(partition_types),
                        "<model_file_prefix>": str(model_file_prefix)}

        self._prepare_and_install_file(replace_dict=replace_dict)

        # One additional column "console_output" contains the captured console output, which
        # holds the training and validation logs.
        model_type = BLOB() if self._is_lake_system else CLOB()
        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols] + \
                       [("model", model_type), ("console_output", model_type)]

        rec_eval_col_name = "record_evaluation_result"
        if rec_eval_exists:
            # If the record_evaluation result exists in a callback, add it to the return types
            # and the corresponding output in the script.
            return_types.append((rec_eval_col_name, model_type))

        _no_of_unique_partitions = len(train_set._fit_partition_unique_values)

        try:
            self._model_data = self._run_script(data, script_command, partition_cols, return_types)

            self._extract_model_objs(n_unique_partitions=_no_of_unique_partitions,
                                     n_partition_cols=len(partition_cols),
                                     record_eval_exists=rec_eval_exists)

        except Exception as ex:
            # File cleanup if script execution fails or modelObj cannot be fetched.
            os.remove(self._script_file_local)
            self._remove_script_file(self._script_file_name)
            raise

        # File cleanup after processing.
        os.remove(self._script_file_local)
        self._remove_script_file(self._script_file_name)

        if _no_of_unique_partitions == 1:
            # If only one partition, print the console output and return the model object.
            print(self.modelObj.iloc[0]["console_output"])
            if self._func_name == "cv":
                return self.modelObj.iloc[0]["model"]
            if not rec_eval_exists:
                booster_obj = _LightgbmBoosterWrapper(model=self.modelObj.iloc[0]["model"])
            else:
                # If record_evaluation results are there, return a dictionary of the model object
                # and the record_evaluation results.
                model_dict = {"model": self.modelObj.iloc[0]["model"],
                              rec_eval_col_name: self.modelObj.iloc[0][rec_eval_col_name]}
                booster_obj = _LightgbmBoosterWrapper(model=model_dict, model_column_name="model")
            booster_obj._is_default_partition_value_fit = True
            booster_obj._fit_partition_unique_values = train_part_unique_vals
            booster_obj._is_model_installed = False  # The model is trained and returned but not saved to Vantage.

        else:
            if self._func_name == "cv":
                return self.modelObj
            booster_obj = _LightgbmBoosterWrapper(model=self.modelObj, model_column_name="model")
            booster_obj._fit_partition_colums_non_default = partition_cols
            booster_obj._is_default_partition_value_fit = train_set._is_default_partition_value_fit

            booster_obj._fit_partition_unique_values = train_part_unique_vals
            booster_obj._is_model_installed = False  # The model is trained and returned but not saved to Vantage.

        return booster_obj

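The function wrapper above backs td_lightgbm.train and td_lightgbm.cv. A sketch of the multi-model training path, lifted from the model_info docstring later in this file (not part of the diff; df_x and df_y come from the earlier sketch):

    part_cols = ["partition_column_1", "partition_column_2"]

    # One Dataset per unique combination of partition-column values.
    obj_m = td_lightgbm.Dataset(df_x, df_y, free_raw_data=False, partition_columns=part_cols)
    obj_m_v = td_lightgbm.Dataset(df_x, df_y, free_raw_data=False, partition_columns=part_cols)

    # Trains one Booster per partition; validation sets must carry the same partition
    # values as the training set, otherwise __call__ raises
    # TeradataMlException(PARTITION_VALUES_NOT_MATCHING).
    model = td_lightgbm.train(params={}, train_set=obj_m,
                              num_boost_round=30,
                              early_stopping_rounds=50,
                              valid_sets=[obj_m_v, obj_m_v])
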
# Using _SkLearnObjectWrapper as the base class for _LightgbmBoosterWrapper, as the _transform
# method is not present in the _OpenSourceObjectWrapper class.
class _LightgbmBoosterWrapper(_SkLearnObjectWrapper):
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, model=None, module_name=None, class_name=None, kwargs=None, model_column_name=None):
        file_type = "file_fn_lightgbm_booster"

        self._model_column_name = model_column_name

        self.record_evaluation_result = None

        if model is not None and isinstance(model, dict) and self._model_column_name in model.keys():
            self.record_evaluation_result = model["record_evaluation_result"]
            model = model[self._model_column_name]  # The model is stored in the dictionary under the model column name key.

        _OpenSourceObjectWrapper.__init__(self, model=model, module_name=module_name, class_name=class_name, kwargs=kwargs)

        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

        self._script_file_name = _generate_new_name(type=file_type, extension="py")

        self._initialize_variables(table_name_prefix="td_lightgbm_")
        if model is not None:
            first_model = model
            if isinstance(model, pd.DataFrame):
                first_model = model.iloc[0][self._model_column_name]
            self.modelObj = model
            self.module_name = first_model.__module__.split("._")[0]
            self.class_name = first_model.__class__.__name__
            _model_init_arguments = first_model.__init__.__code__.co_varnames
            self.kwargs = dict((k, v) for k, v in first_model.__dict__.items() if k in _model_init_arguments)

            self.pos_args = tuple()

        else:
            # Create the model object from new positional and keyword arguments.
            if "train_set" in self.kwargs and self.kwargs["train_set"] is not None and \
                    isinstance(self.kwargs["train_set"], _LightgbmDatasetWrapper):
                self.kwargs["train_set"] = self.kwargs["train_set"].modelObj

            from importlib import import_module
            class_obj = getattr(import_module(self.module_name), self.class_name)
            self.modelObj = class_obj(**self.kwargs)

    def deploy(self, model_name, replace_if_exists=False):
        raise NotImplementedError("The deploy() function is not yet supported for lightgbm "
                                  "OpensourceML objects. Support will be added in future releases.")

    @property
    def model_info(self):
        """
        DESCRIPTION:
            Get the model information along with the console output for the multi-model case.
            Only the model object is returned for the single model case.
            Note:
                This is particularly useful in the multi-model case when the user wants to see
                the console output of each partition.

        PARAMETERS:
            None

        RAISES:
            None

        RETURNS:
            Pandas DataFrame

        EXAMPLES:
            # Load example data.
            >>> load_example_data("openml", ["multi_model_classification"])
            >>> df = DataFrame("multi_model_classification")
            >>> df.head(3)
                           col2      col3      col4  label  group_column  partition_column_1  partition_column_2
            col1
            -2.560430  0.402232 -1.100742 -2.959588      0             9                   0                  10
            -3.587546  0.291819 -1.850169 -4.331055      0            10                   0                  10
            -3.697436  1.576888 -0.461220 -3.598652      0            10                   0                  11

            # Get the feature and label data.
            >>> df_x = df.select(["col1", "col2", "col3", "col4"])
            >>> df_y = df.select("label")

            # Partition columns for multi model case.
            >>> part_cols = ["partition_column_1", "partition_column_2"]

            ## Single model case.
            # Create lightgbm Dataset object.
            >>> lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)

            # Train the model.
            >>> model = td_lightgbm.train(params={}, train_set=lgbm_data,
            ...                           num_boost_round=30,
            ...                           early_stopping_rounds=50)
            >>> model # This is an object of the _LightgbmBoosterWrapper class.
            <lightgbm.basic.Booster object at 0x0000025BD2459160>

            ## Multi model case.
            # Create lightgbm Dataset objects for training and validation.
            >>> obj_m = td_lightgbm.Dataset(df_x, df_y, free_raw_data=False,
            ...                             partition_columns=part_cols)

            >>> obj_m_v = td_lightgbm.Dataset(df_x, df_y, free_raw_data=False,
            ...                               partition_columns=part_cols)

            # Train the models in multi model case.
            >>> model = td_lightgbm.train(params={}, train_set=obj_m,
            ...                           num_boost_round=30,
            ...                           early_stopping_rounds=50,
            ...                           valid_sets=[obj_m_v, obj_m_v])
            >>> model
               partition_column_1  partition_column_2  \
            0                   1                  11
            1                   0                  11
            2                   1                  10
            3                   0                  10

                                                            model  \
            0  <lightgbm.basic.Booster object at 0x7f2e95ffc0a0>
            1  <lightgbm.basic.Booster object at 0x7f2e95ffc880>
            2  <lightgbm.basic.Booster object at 0x7f2e95f852e0>
            3  <lightgbm.basic.Booster object at 0x7f2e95f853a0>

                                                   console_output
            0  [LightGBM] [Warning] Auto-choosing col-wise mu...
            1  [LightGBM] [Warning] Auto-choosing row-wise mu...
            2  [LightGBM] [Warning] Auto-choosing col-wise mu...
            3  [LightGBM] [Warning] Auto-choosing row-wise mu...

            # Get the model information, which returns the printed output as a pandas
            # DataFrame containing the model information along with the console output.
            >>> model_info = lightgbm_booster.model_info

            # Print the console output of the first partition.
            >>> print(model_info.iloc[0]["console_output"])
            [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
            You can set `force_col_wise=true` to remove the overhead.
            [LightGBM] [Info] Total Bins 136
            [LightGBM] [Info] Number of data points in the train set: 97, number of used features: 4
            [LightGBM] [Info] Start training from score 0.556701
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [1] valid_0's l2: 0.219637 valid_1's l2: 0.219637
            Training until validation scores don't improve for 50 rounds
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [2] valid_0's l2: 0.196525 valid_1's l2: 0.196525
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [3] valid_0's l2: 0.178462 valid_1's l2: 0.178462
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [4] valid_0's l2: 0.162887 valid_1's l2: 0.162887
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [5] valid_0's l2: 0.150271 valid_1's l2: 0.150271
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [6] valid_0's l2: 0.140219 valid_1's l2: 0.140219
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [7] valid_0's l2: 0.131697 valid_1's l2: 0.131697
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [8] valid_0's l2: 0.124056 valid_1's l2: 0.124056
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [9] valid_0's l2: 0.117944 valid_1's l2: 0.117944
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [10] valid_0's l2: 0.11263 valid_1's l2: 0.11263
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [11] valid_0's l2: 0.105228 valid_1's l2: 0.105228
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [12] valid_0's l2: 0.0981571 valid_1's l2: 0.0981571
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [13] valid_0's l2: 0.0924294 valid_1's l2: 0.0924294
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [14] valid_0's l2: 0.0877899 valid_1's l2: 0.0877899
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [15] valid_0's l2: 0.084032 valid_1's l2: 0.084032
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [16] valid_0's l2: 0.080988 valid_1's l2: 0.080988
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [17] valid_0's l2: 0.0785224 valid_1's l2: 0.0785224
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [18] valid_0's l2: 0.0765253 valid_1's l2: 0.0765253
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [19] valid_0's l2: 0.0750803 valid_1's l2: 0.0750803
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [20] valid_0's l2: 0.0738915 valid_1's l2: 0.0738915
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [21] valid_0's l2: 0.07288 valid_1's l2: 0.07288
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [22] valid_0's l2: 0.0718676 valid_1's l2: 0.0718676
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [23] valid_0's l2: 0.0706037 valid_1's l2: 0.0706037
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [24] valid_0's l2: 0.0695799 valid_1's l2: 0.0695799
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [25] valid_0's l2: 0.0687507 valid_1's l2: 0.0687507
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [26] valid_0's l2: 0.0680819 valid_1's l2: 0.0680819
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [27] valid_0's l2: 0.0674077 valid_1's l2: 0.0674077
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [28] valid_0's l2: 0.0665111 valid_1's l2: 0.0665111
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [29] valid_0's l2: 0.0659656 valid_1's l2: 0.0659656
            [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
            [30] valid_0's l2: 0.0652665 valid_1's l2: 0.0652665
            Did not meet early stopping. Best iteration is:
            [30] valid_0's l2: 0.0652665 valid_1's l2: 0.0652665

        """
        return self.modelObj

    def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
        """
        Internal function to convert all OpensourceML related objects in arguments to
        underlying model objects.
        """
        if isinstance(args, dict):
            new_args = args.copy()  # To avoid updating the caller's dictionary.
            for k, v in new_args.items():
                if isinstance(v, type(self)) or isinstance(v, _LightgbmDatasetWrapper):
                    if idx_multi_model is None:
                        # Single model. The "idx_multi_model" argument is set only when modelObj
                        # is multi model.
                        new_args[k] = v.modelObj
                    else:
                        # Multi-model. Get the appropriate model from modelObj.
                        new_args[k] = v.modelObj.iloc[idx_multi_model][self._model_column_name]
                else:
                    new_args[k] = v
            return new_args

        # If args is a tuple, convert all elements to underlying model objects.
        elif isinstance(args, tuple):
            new_args = tuple()
            for arg in args:
                if isinstance(arg, type(self)) or isinstance(arg, _LightgbmDatasetWrapper):
                    if idx_multi_model is None:
                        # Single model. The "idx_multi_model" argument is set only when modelObj
                        # is multi model.
                        new_args += (arg.modelObj,)
                    else:
                        # Multi-model. Get the appropriate model from modelObj.
                        new_args += (arg.modelObj.iloc[idx_multi_model][self._model_column_name],)
                else:
                    new_args += (arg,)
            return new_args
        return args

    def __getattr__(self, name):
        def __run_transform(*c, **kwargs):
            # The lightgbm predict method takes other keyword arguments along with data related
            # arguments. Hence the script is generated dynamically instead of using the standard
            # scikit-learn sklearn_transform.py file.
            self._convert_pos_args_to_kwargs_for_function(c, kwargs, name)
            self._generate_script_file_from_template_file(kwargs=kwargs,
                                                          template_file="lightgbm_class_functions.template",
                                                          func_name=name)

            return self._transform(**kwargs)

        # TODO: Will be added as part of ELE-7150
        if name in ["add_valid", "eval", "eval_train", "eval_valid", "refit", "set_attr", "update"]:
            raise NotImplementedError(f"{name}() function is not supported yet. Will be added in future releases.")

        # TODO: Will be added as part of ELE-7150
        if name == "model_from_string" and not self._is_default_partition_value_fit:
            # For multi model case of the model_from_string() function.
            raise NotImplementedError(
                "model_from_string() function is not supported for multi model case. Will be added in future releases.")

        # TODO: Will be added as part of ELE-7150
        if name == "set_network":
            raise NotImplementedError(
                "set_network() function is not applicable for Teradata Vantage.")

        if name in ["predict"]:
            return __run_transform
        return super().__getattr__(name)

    def _transform(self, **kwargs):
        # Overrides the inherited _transform method to handle data related arguments and other
        # keyword arguments.

        # Extract data and label columns.
        data_df = kwargs.pop("data")  # "data" is a mandatory argument for the predict method.
        current_dfs = [data_df]
        feature_columns = data_df.columns

        label_columns = None
        if "label" in kwargs.keys() and kwargs["label"] is not None:
            label_df = kwargs.pop("label")
            current_dfs.append(label_df)
            label_columns = label_df.columns

        file_name = kwargs.pop("file_name")

        from teradataml.dataframe.dataframe_utils import DataFrameUtils
        data = DataFrameUtils()._get_common_parent_df_from_dataframes(current_dfs)

        try:
            # Install initial model file and script file to Vantage.
            self._install_model_and_script_files(file_name=file_name,
                                                 file_location=self._tdml_tmp_dir)

            trans_opt = super()._transform(data=data, feature_columns=feature_columns,
                                           label_columns=label_columns, file_name=file_name,
                                           **kwargs)
        except Exception as ex:
            # File cleanup if script execution fails or modelObj cannot be fetched.
            os.remove(os.path.join(self._tdml_tmp_dir, file_name))
            self._remove_script_file(file_name)
            raise

        # File cleanup after processing.
        os.remove(os.path.join(self._tdml_tmp_dir, file_name))
        self._remove_script_file(file_name)

        return trans_opt

    def __repr__(self):
        return self.modelObj.__repr__()

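A short sketch of working with the Booster wrapper above (continuing the previous sketch; predict routes through __run_transform and a dynamically generated script, while eval/refit/update and friends raise NotImplementedError in this version):

    # Multi-model case: model_info is a pandas DataFrame with one row per partition,
    # holding the Booster object and its captured console output.
    info = model.model_info
    print(info.iloc[0]["console_output"])

    # Score in-database; "data" is the mandatory argument the wrapper pops in _transform().
    predictions = model.predict(data=df_x)
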
class _LighgbmSklearnWrapper(_SkLearnObjectWrapper):
    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.LIGHTGBM

    def __init__(self, model=None, module_name=None, class_name=None, kwargs=None):
        super().__init__(model=model, module_name=module_name, class_name=class_name, kwargs=kwargs)
        self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "lightgbm")

    def deploy(self, model_name, replace_if_exists=False):
        raise NotImplementedError("The deploy() function is not yet supported for lightgbm "
                                  "OpensourceML objects. Support will be added in future releases.")

    def set_params(self, **params):
        """
        Please check the description in Docs/OpensourceML/sklearn.py.
        """
        for key, val in params.items():
            self.kwargs[key] = val

        self.__init__(None, self.module_name, self.class_name, self.kwargs)
        return self

    def _process_and_run_fit_and_score_run(self, pos_args, kwargs, func_name):
        """
        Internal function to process data related arguments and other keyword arguments
        for the fit and score methods.
        """
        self._convert_pos_args_to_kwargs_for_function(pos_args, kwargs, func_name)

        label_columns = kwargs["y"].columns if kwargs.get("y", None) else kwargs.get("label_columns", None)

        if func_name == "score":
            # Get partition columns from the trained model object.
            if self._fit_partition_colums_non_default is not None and "partition_columns" not in kwargs.keys():
                kwargs["partition_columns"] = self._fit_partition_colums_non_default
        if func_name == "fit":
            earlier_partition_cols = kwargs.get("partition_columns", None)
            if earlier_partition_cols:
                self._is_default_partition_value_fit = False
                self._fit_partition_colums_non_default = earlier_partition_cols
            else:
                self._is_default_partition_value_fit = True
                self._fit_partition_colums_non_default = None

        generated_script_file = _generate_new_name(type=f"file_fn_lightgbm_sklearn_{func_name}", extension="py")

        non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)

        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

        # Update non data related arguments in replace_dict containing data related argument information.
        replace_dict.update({"<params>": json.dumps(non_data_related_args),
                             "<func_name>": f"'{func_name}'",
                             "<model_file_prefix>": f"'{self._model_file_name_prefix}'",
                             "<is_lake_system>": str(self._is_lake_system)})

        # Replace placeholders in the template file with actual values and write to a new file.
        self._read_from_template_and_write_dict_to_file(template_file="lightgbm_sklearn.template",
                                                        replace_dict=replace_dict,
                                                        output_script_file_name=generated_script_file)

        if func_name == "fit":
            # Get unique values in the partitioning columns.
            self._fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

        # Install initial model file and script file to Vantage.
        self._install_model_and_script_files(file_name=generated_script_file,
                                             file_location=self._tdml_tmp_dir)

        # db_name is applicable for the enterprise system.
        db_file_name = generated_script_file if self._is_lake_system else f"./{self._db_name}/{generated_script_file}"
        py_exc = UtilFuncs._get_python_execution_path()
        script_command = f"{py_exc} {db_file_name}"

        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
                        for col in partition_cols]
        if func_name == "fit":
            model_type = BLOB() if self._is_lake_system else CLOB()
            return_types += [("model", model_type)]
        if func_name == "score":
            return_types += [("score", FLOAT())]
        # Check the trained model installation. If not installed, set the flag to True
        # (as it is already installed in the `self._install_model_and_script_files()` call).
        if not self._is_trained_model_installed:
            self._is_trained_model_installed = True

        try:
            opt = self._run_script(data=self._tdml_df, command=script_command,
                                   partition_columns=partition_cols,
                                   return_types=return_types)
        except Exception as ex:
            # File cleanup if script execution fails or modelObj cannot be fetched.
            os.remove(os.path.join(self._tdml_tmp_dir, generated_script_file))
            self._remove_script_file(generated_script_file)
            raise

        # File cleanup after processing.
        os.remove(os.path.join(self._tdml_tmp_dir, generated_script_file))
        self._remove_script_file(generated_script_file)

        if func_name == "fit":
            self._model_data = opt
            self._assign_fit_variables_after_execution(self._tdml_df, partition_cols, label_columns)
            return self

        if func_name == "score":
            if self._is_default_partition_value_fit:
                # For the single model case, the partition column is internally generated and
                # there is no point in returning it to the user.
                opt = opt.select(func_name)
            return opt

    def fit(self, *c, **kwargs):
        return self._process_and_run_fit_and_score_run(c, kwargs, "fit")

    def score(self, *c, **kwargs):
        return self._process_and_run_fit_and_score_run(c, kwargs, "score")

    def _transform(self, **kwargs):
        # Overrides the inherited _transform method to handle data related arguments and other
        # keyword arguments.

        # Extract data and label columns.
        data_df = kwargs.pop("X")  # "X" is a mandatory argument for the predict method.
        current_dfs = [data_df]
        feature_columns = data_df.columns

        label_columns = None
        if "y" in kwargs.keys() and kwargs["y"] is not None:
            label_df = kwargs.pop("y")
            current_dfs.append(label_df)
            label_columns = label_df.columns

        file_name = kwargs.pop("file_name")

        from teradataml.dataframe.dataframe_utils import DataFrameUtils
        data = DataFrameUtils()._get_common_parent_df_from_dataframes(current_dfs)

        try:
            # Install initial model file and script file to Vantage.
            self._install_model_and_script_files(file_name=file_name,
                                                 file_location=self._tdml_tmp_dir)

            trans_opt = super()._transform(data=data, feature_columns=feature_columns,
                                           label_columns=label_columns, file_name=file_name,
                                           **kwargs)
        except Exception as ex:
            # File cleanup if script execution fails or modelObj cannot be fetched.
            os.remove(os.path.join(self._tdml_tmp_dir, file_name))
            self._remove_script_file(file_name)
            raise

        # File cleanup after processing.
        os.remove(os.path.join(self._tdml_tmp_dir, file_name))
        self._remove_script_file(file_name)

        return trans_opt

    def __getattr__(self, name):
        def __run_transform(*c, **kwargs):
            # The lightgbm predict method takes other keyword arguments along with data related
            # arguments. Hence the script is generated dynamically instead of using the standard
            # scikit-learn sklearn_transform.py file.
            generated_script_file = _generate_new_name(type=f"file_fn_lightgbm_sklearn_{name}", extension="py")

            self._convert_pos_args_to_kwargs_for_function(c, kwargs, name)
            self._generate_script_file_from_template_file(kwargs=kwargs,
                                                          template_file="lightgbm_class_functions.template",
                                                          func_name=name,
                                                          output_script_file_name=generated_script_file)

            return self._transform(**kwargs)

        if name in ["predict", "predict_proba"]:
            return __run_transform
        return super().__getattr__(name)