teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Note: this release of teradataml has been flagged as potentially problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
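The headline additions in 20.0.0.3, judging from this list, are a second opensourceML backend (teradataml/opensource/_lightgbm.py plus the lightgbm script templates), UDF support in teradataml/dataframe/functions.py, and the new teradataml/store feature-store and vector-store modules. As rough orientation only, and assuming the lightgbm interface mirrors the td_sklearn calling convention shown later in this diff, usage would look something like this (table and column names are placeholders; check the 20.0.0.3 docs for exact argument names):

    from teradataml import td_lightgbm, DataFrame

    df = DataFrame("titanic")
    # Sketch only: mirrors the td_sklearn convention of passing
    # teradataml DataFrames as X and y.
    model = td_lightgbm.LGBMClassifier()
    model.fit(X=df.select(["age", "fare"]), y=df.select(["survived"]))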
teradataml/opensource/sklearn/_sklearn_wrapper.py:

@@ -19,7 +19,6 @@ from collections import OrderedDict, defaultdict
 from importlib import import_module

 import base64
-import functools
 import json
 import numpy
 import os
@@ -28,7 +27,7 @@ import time
 import inspect
 import warnings
 import json
-import
+import math
 import pandas as pd
 from teradatasqlalchemy import BLOB, CLOB, FLOAT, TIMESTAMP, VARCHAR, INTEGER
 import pandas.api.types as pt
@@ -41,19 +40,18 @@ from teradataml.context.context import _get_current_databasename, get_connection
 from teradataml.dbutils.filemgr import install_file, remove_file
 from teradataml.utils.utils import execute_sql
 from teradataml.options.configure import configure
-from teradataml.opensource.
+from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
     _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
-from teradataml.opensource.
+from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
     _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
     _OSML_ADDITIONAL_COLUMN_TYPES
 from teradataml.common.messagecodes import MessageCodes
 from teradataml.common.messages import Messages
 from teradataml.catalog.byom import save_byom, retrieve_byom, delete_byom
-from teradataml.dbutils.dbutils import _create_table
+from teradataml.dbutils.dbutils import _create_table, set_session_param
 from teradataml.utils.validators import _Validators
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
-from teradataml.scriptmgmt.lls_utils import create_env, get_env
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.constants import TeradataConstants

@@ -64,8 +62,15 @@ validator = _Validators()

 installed_model_files = defaultdict(int)

+## Flag to ensure the sklearn script
+## installation occurs only once.
+_file_installed = False
+
 class _GenericObjectWrapper:
     def __init__(self) -> None:
+        if not get_connection():
+            raise TeradataMlException(Messages.get_message(MessageCodes.INVALID_CONTEXT_CONNECTION),
+                                      MessageCodes.INVALID_CONTEXT_CONNECTION)
         self._db_name = _get_current_databasename()

         self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "sklearn")
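The new guard at the top of _GenericObjectWrapper.__init__ means an active Vantage connection must exist before any opensourceML wrapper is constructed. A minimal session-setup sketch (host and credentials are placeholders, not from this diff):

    from teradataml import create_context, td_sklearn

    # Establish the context first; constructing a wrapper without it
    # now raises INVALID_CONTEXT_CONNECTION.
    create_context(host="<host>", username="<user>", password="<password>")
    model = td_sklearn.LinearRegression()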
@@ -86,43 +91,24 @@ class _GenericObjectWrapper:
             if configure.openml_user_env is not None:
                 self._env = configure.openml_user_env
             else:
-                self._create_or_get_env()
+                self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
         else:
-
-
-    def _create_or_get_env(self):
-        """
-        Internal function to return the env if already exists else
-        creates the environment using template file and return the env.
-        """
-        # Get the template file path.
-        template_dir_path = os.path.join(_TDML_DIRECTORY, "data", "templates",
-                                         "open_source_ml.json")
+            set_session_param("searchuifdbpath",self._db_name)

-
-
-
+        global _file_installed
+        ## Flag to check whether trained model is installed or not.
+        self._is_trained_model_installed = False

-
-
+        ## Install all sklearn script files on Vantage.
+        if not _file_installed:
+            sklearn_script_files = ["sklearn_fit.py", "sklearn_score.py",
+                                    "sklearn_transform.py", "sklearn_fit_predict.py",
+                                    "sklearn_neighbors.py", "sklearn_model_selection_split.py"]
+            for script_file in sklearn_script_files:
+                self._install_script_file(file_identifier=script_file.split(".")[0],
+                                          file_name=script_file)

-
-            # Call function to 'openml_env' get env.
-            self._env = get_env(_env_name)
-        except TeradataMlException as tdml_e:
-            # We will get here when error says, env does not exist otherwise raise the exception as is.
-            # Env does not exist so create one.
-
-            exc_msg = "Failed to execute get_env(). User environment '{}' not " \
-                      "found.".format(_env_name)
-            if exc_msg in tdml_e.args[0]:
-                print(f"No OpenAF environment with name '{_env_name}' found. Creating one with "\
-                      "latest supported python and required packages.")
-                _env = create_env(template=template_dir_path)
-            else:
-                raise tdml_e
-        except Exception as exc:
-            raise exc
+            _file_installed = True

     def _get_columns_as_list(self, cols):
         """
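The module-level _file_installed flag makes the sklearn script installation run once per Python process rather than once per wrapper instance. The same idiom in isolation (a standalone sketch, names hypothetical):

    _assets_installed = False  # module-level, shared by all instances

    class Runner:
        def __init__(self):
            global _assets_installed
            if not _assets_installed:
                for asset in ("fit.py", "score.py"):
                    self._install(asset)  # runs only for the first instance
                _assets_installed = True

        def _install(self, name):
            print(f"installing {name}")

    Runner(); Runner()  # the second construction skips installation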
@@ -205,13 +191,32 @@ class _GenericObjectWrapper:
                                   is_binary=is_binary)
         else:
             status = self._env.install_file(file_path=new_script,
-
-
+                                            replace=True,
+                                            suppress_output=True)
         if not status:
             raise TeradataMlException(
                 f"Script file '{file_name}' failed to get installed/replaced in Vantage."
             )

+    def _remove_script_file(self, file_name):
+        """
+        Internal function to remove script file in Vantage.
+        """
+        # _env is set while object creation
+        # If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
+
+        if not self._is_lake_system:
+            status = remove_file(file_identifier=file_name.split(".")[0],
+                                 force_remove=True,
+                                 suppress_output=True)
+        else:
+            status = self._env.remove_file(file_name=file_name,
+                                           suppress_output=True)
+        if not status:
+            raise TeradataMlException(
+                f"Script file '{file_name}' failed to remove in Vantage."
+            )
+
     def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
                                                                 idx_delim=",",
                                                                 types_delim="--"):
@@ -261,7 +266,7 @@ class _GenericObjectWrapper:
             args_str += f" {strr}"
         return args_str

-    def
+    def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1):
         """
         Internal function to extract sklearn object from the model(s) depending on the number of
         partitions. When it is only one model, it is directly used as sklearn object (modelObj).
@@ -294,33 +299,130 @@ class _GenericObjectWrapper:

         warnings.filterwarnings("default")

+    def _validate_existence_of_partition_columns(self, partition_columns, all_columns, arg_names_for_dfs):
+        """
+        Validate if columns in "partition_columns" argument are present in any of the given
+        dataframes.
+        """
+        invalid_part_cols = [c for c in partition_columns if c not in all_columns]

-
-
-
+        if invalid_part_cols:
+            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
+                                                  ", ".join(invalid_part_cols),
+                                                  "', '".join(arg_names_for_dfs))
+                             )

-    def
-
-
-
-
+    def _prepare_data_args_string(self, kwargs):
+        """
+        Get column indices and types of each data related arguments in the format:
+        "{<arg_name>-<comma separated indices>-<comma separated types>}--
+         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        """
+        data_args_str = []
+        for arg_name in list(self._data_args.keys()):
+            # Remove DataFrame arguments from kwargs, which will be passed to Script.
+            kwargs.pop(arg_name)

-
-
+            # Get column indices and their types for each dataframe from parent dataframe.
+            _, partition_indices_str, partition_types_str, _ = \
+                self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                             self._data_args[arg_name].columns,
+                                                                             idx_delim=",",
+                                                                             types_delim=",")
+
+            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
+            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
+
+        # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
+        # {<arg_name>-<comma separated indices>-<comma separated types>}"
+        return "--".join(data_args_str)

-
+    def _prepare_and_install_file(self, replace_dict):
+        """
+        Prepare function script file from template file and install it in Vantage.
+        Takes the dictionary with keys as strings to be replaced in script and values as
+        strings which should be added in place of keys.
+        """

-        self.
-
-
-
+        with open(os.path.join(self._scripts_path, self._template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)

-        self.
-        self._table_name_prefix = None
+        self._script_file_local = os.path.join(self._tdml_tmp_dir, self._script_file_name)

-        self.
-
-
+        with open(self._script_file_local, "w") as fp:
+            fp.write(script_data)
+
+        self._install_script_file(file_identifier=self._script_file_name.split(".")[0],
+                                  file_name=self._script_file_name,
+                                  file_location=self._tdml_tmp_dir)
+
+    def _get_dataframe_related_args_and_their_columns(self, kwargs):
+        """
+        Get dataframe related arguments and return all their column names from kwargs.
+        """
+        __data_columns = []
+        __data_args_dict = OrderedDict()
+
+        # Separate dataframe related arguments and their column names from actual kwargs.
+        for k, v in kwargs.items():
+            if isinstance(v, DataFrame):
+                # All dataframes should be select of parent dataframe.
+                _validate_df_query_type(v, "select", k)
+
+                # Save all columns in dataframe related arguments.
+                __data_columns.extend(v.columns)
+
+                __data_args_dict[k] = v
+
+        return __data_args_dict, __data_columns
+
+    def _process_data_for_funcs_returning_objects(self, kwargs):
+        """
+        Internal function to process all arguments and assign self._data_args, self._tdml_df
+        and return
+        1. dictionary of elements (needed to replace in the script template file)
+        2. partition columns list.
+        """
+        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
+        if partition_cols:
+            kwargs.pop("partition_columns")
+
+        self._data_args, __data_columns = self._get_dataframe_related_args_and_their_columns(kwargs)
+
+        arg_names_for_dfs = list(self._data_args.keys())
+
+        # Get common parent dataframe from all dataframes.
+        self._tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self._data_args.values()))
+
+        self._tdml_df = self._tdml_df.select(__data_columns + partition_cols)
+
+        self._validate_existence_of_partition_columns(partition_cols, self._tdml_df.columns, arg_names_for_dfs)
+
+        self._tdml_df, partition_cols = self._get_data_and_data_partition_columns(self._tdml_df,
+                                                                                  __data_columns,
+                                                                                  [],
+                                                                                  partition_cols
+                                                                                  )
+
+        # Prepare string of data arguments with name, indices where columns of that argument resides
+        # and types of each of the column.
+        data_args_str = self._prepare_data_args_string(kwargs)
+
+        # Get indices of partition_columns and types of all columns.
+        data_column_types_str, partition_indices_str, _, partition_cols = \
+            self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                         partition_cols,
+                                                                         types_delim=None,
+                                                                         idx_delim=None)
+
+        replace_dict = {"<partition_cols_indices>": str(partition_indices_str),
+                        "<types_of_data_cols>": str(data_column_types_str),
+                        "<data_args_info_str>": f"'{data_args_str}'"}
+
+        return replace_dict, partition_cols

     def _validate_equality_of_partition_values(self, fit_values, trans_values):
         """
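_prepare_data_args_string packs, for every DataFrame argument, its column indices and column types into one delimited string that the generated script can later unpack. A standalone sketch of encoding and decoding that format (argument names and types are illustrative):

    def encode(data_args):
        # data_args: {arg_name: (indices, types)}
        return "--".join(
            f"{name}-{','.join(map(str, idx))}-{','.join(types)}"
            for name, (idx, types) in data_args.items()
        )

    def decode(s):
        out = {}
        for part in s.split("--"):
            name, idx, types = part.split("-")
            out[name] = ([int(i) for i in idx.split(",")], types.split(","))
        return out

    encoded = encode({"X": ([0, 1], ["float", "float"]), "y": ([2], ["int"])})
    # 'X-0,1-float,float--y-2-int'
    assert decode(encoded)["y"] == ([2], ["int"])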
@@ -335,294 +437,139 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):

         return True

-    def
+    def _get_non_data_related_args_from_kwargs(self, kwargs):
         """
-
-        and predict() are same.
+        Get all non-data related arguments from kwargs.
         """
-
-
-
-
-
-
-                                        else self._fit_partition_unique_values, key=lambda x: tuple(x))
-        default_unique_values = [[self._default_data_partition_value]]
-
-        if fit_unique_values == default_unique_values and \
-                trans_unique_values != default_unique_values:
-            error_msg = Messages.get_message(MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT,
-                                             "without", "with")
-            msg_code = MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT
-            raise TeradataMlException(error_msg, msg_code)
+        non_data_related_args = {}
+        for k, v in kwargs.items():
+            if not isinstance(v, DataFrame):
+                non_data_related_args[k] = v
+        non_data_related_args.pop("partition_columns", None)
+        return non_data_related_args

-
-
-
-
-
+    def _read_from_template_and_write_dict_to_file(self, template_file, replace_dict,
+                                                   output_script_file_name=None):
+        """
+        Read template file, replace the keys with values and write to new file.
+        """
+        with open(os.path.join(self._scripts_path, template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)

-
-
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as fp:
+            fp.write(script_data)

-    def
+    def _generate_script_file_from_template_file(self, kwargs, template_file, func_name,
+                                                 output_script_file_name=None):
         """
-        Internal function to
-
+        Internal function to generate script file from template file. It just adds the non-data
+        related arguments to the template file and writes the contents to new file, so that these
+        arguments are available in the script file for running this function "func_name".
         """
-        #
-
-        def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
-            multi_models = self.modelObj.copy()
-            for i in range(multi_models.shape[0]):
-                curr_model = multi_models.iloc[i]["model"]
-                multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
-            return multi_models.rename(columns={"model": name})
+        # Take out all non-data related arguments to write to template file.
+        non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)

-        #
-
-
-
-        is_attr_callable = callable(getattr(self.modelObj.iloc[0]["model"], name))
+        # Read template file and write the contents to new file with non-data related arguments.
+        template_f = os.path.join(self._scripts_path, template_file)
+        with open(template_f, "r") as f:
+            template = f.read()

-
-
-
-
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as f:
+            f.write("import json\n")
+            f.write(f"params = json.loads('{json.dumps(non_data_related_args)}')\n")
+            f.write(template)

-
-
-            model = output_attributes.iloc[i]["model"]
-            output_attributes.at[i, "model"] = getattr(model, name)
-        return output_attributes.rename(columns={"model": name})
+        kwargs["file_name"] = output_script_file_name
+        kwargs["name"] = func_name

-    def
-
-
-
-
-
+    def _remove_data_related_args_from_kwargs(self, kwargs):
+        """
+        Internal function to remove data related arguments from kwargs.
+        """
+        kwargs.pop("data", None)
+        kwargs.pop("feature_columns", None)
+        kwargs.pop("group_columns", None)
+        kwargs.pop("partition_columns", None)
+        kwargs.pop("label_columns", None)

-
-
-
-
+    def _convert_pos_args_to_kwargs_for_function(self, pos_args, kwargs, func_name):
+        """
+        Internal function to convert positional arguments to keyword arguments.
+        """
+        fn = getattr(getattr(import_module(self.module_name), self.class_name), func_name)
+        kwargs.update(zip(fn.__code__.co_varnames[1:], pos_args))

-
-    def _validate_model_supportability(cls, model):
+    def _install_model_and_script_files(self, file_name, file_location):
         """
-        Internal function to
-        teradataml's opensourceML.
+        Internal function to install model and script files to Vantage.
         """
-
-
-
-
-
-            # TODO: check for other supported packages.
-            if model.__module__.split(".")[0] not in OpenSourcePackage.values():
-                raise TeradataMlException(error_msg, msg_code)
-        except Exception as ex:
-            # If in case, model.__module__ fails.
-            raise TeradataMlException(error_msg, msg_code) from ex
+        self._install_initial_model_file()
+        self._install_script_file(file_identifier=file_name.split(".")[0],
+                                  file_name=file_name,
+                                  is_binary=False,
+                                  file_location=file_location)

-    def
+    def _assign_fit_variables_after_execution(self, data, partition_columns, label_columns):
         """
-        Internal function to
-        "model_file_path_local" to Vantage using BYOM methods save_byom() and delete_byom() based
-        on the value of "replace_if_exists" argument.
+        Internal function to assign fit related variables.
         """
-        #
-
-
-                                                    table_name=_OSML_MODELS_TABLE_NAME,
-                                                    schema=self._db_name)
-        if not osml_models_table_exists:
-            all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
-            all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
-            _create_table(table_name=_OSML_MODELS_TABLE_NAME, columns=all_columns,
-                          primary_index=_OSML_MODELS_PRIMARY_INDEX, schema_name=self._db_name)
+        # Extract sklearn object(s) from the depending on the number of unique partitioning values.
+        self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
+                                 n_partition_cols=len(partition_columns))

-
-
-
-                  model=self.modelObj,
-                  pos_args=self.pos_args,
-                  key_args=self.kwargs)
+        # Need this label columns types in prediction.
+        self._fit_label_columns_types = []
+        self._fit_label_columns_python_types = []

-
-
-
+        for l_c in label_columns:
+            column_data = data._td_column_names_and_sqlalchemy_types[l_c.lower()]
+            self._fit_label_columns_types.append(column_data)
+            self._fit_label_columns_python_types.append(column_data.python_type.__name__)

-
-
-
-
-
-
-        except TeradataMlException as ex:
-            model_exists_msg = Messages.get_message(MessageCodes.MODEL_ALREADY_EXISTS, model_name)
-            if not replace_if_exists and model_exists_msg == str(ex):
-                raise
-            elif replace_if_exists and model_exists_msg == str(ex):
-                # Delete the model from Model table and save again.
-                delete_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME)
-                save_byom(model_id=model_name,
-                          model_file=file_name,
-                          table_name=_OSML_MODELS_TABLE_NAME,
-                          additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
-                          additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
-            else:
-                raise
-        finally:
-            os.remove(file_name)
+        # If the model is trained a second time after the object creation,
+        # or if set_params() is called after the first model training,
+        # this flag will reset to False. So that for subsequent predict/score
+        # operations, the newly trained model will be installed.
+        if self._is_trained_model_installed:
+            self._is_trained_model_installed = False

-    @classmethod
-    def _deploy(cls, model_name, model, replace_if_exists=False):
-        """
-        Internal function to create an instance of the class using the model and deploy
-        the model to Vantage.
-        """
-        cls._validate_model_supportability(model=model)

-
-
-
-        cls._install_initial_model_file()
+class _OpenSourceObjectWrapper(_GenericObjectWrapper):
+    # This has to be set for every package which subclasses this class.
+    OPENSOURCE_PACKAGE_NAME = None

-
-
-
-
-
-    def _load(cls, model_name):
-        """
-        Internal function to load model corresponding to the package (like sklearn etc)
-        from Vantage to client using retrieve_byom() and create an instance of the class if
-        the model is from the same package.
-        """
-        try:
-            model = retrieve_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME,
-                                  return_addition_columns=True)
-        except TeradataMlException as ex:
-            # Not showing table name in error message as it is an internal table.
-            part_msg = f"Model '{model_name}' not found in the table "
-            if part_msg in str(ex):
-                raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name, ""),
-                                          MessageCodes.MODEL_NOT_FOUND)
-            raise
-
-        model_vals_list = model.get_values()[0]
-        # List of 3 elements -
-        # - model name as index column,
-        # - 1st contains model object with fields: is_default_partition_value, partition_file_prefix, model. etc
-        # - 2nd contains package name.
-        model_obj = pickle.loads(model_vals_list[0])
-        model = model_obj.model
-        package = model_vals_list[1]
-
-        if package != cls.OPENSOURCE_PACKAGE_NAME.value:
-            # Raise error if trying to access model of different package.
-            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
-                                                           f". Requested model is from '{package}' package"),
-                                      MessageCodes.MODEL_NOT_FOUND)
-
-        if isinstance(model, pd.DataFrame):
-            # Create a new instance of the class and set the model object to the instance.
-            # Instantiation can take only model, not model object. Hence, passing one of the model
-            # from pandas df. Updating modelObj and other fields later
-            cls = cls(model=model.iloc[1,2])
-            cls.modelObj = model
-            cls._fit_partition_unique_values = [lst[:len(lst)-1] for lst in model.values.tolist()]
-        else:
-            cls = cls(model=model)
-
-        cls._model_file_name_prefix = model_obj.partition_file_prefix
-        cls._is_default_partition_value_fit = model_obj.is_default_partition_value
-        cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
-        cls.pos_args = model_obj.pos_args
-        cls.kwargs = model_obj.key_args
-
-        # Load the model file into Vantage node as file can be used in
-        # predict or other operations.
-        cls._install_initial_model_file()
-
-        return cls
-
-    def deploy(self, model_name, replace_if_exists=False):
-        """
-        DESCRIPTION:
-            Deploys the model held by interface object to Vantage.
-
-        PARAMETERS:
-            model_name:
-                Required Argument.
-                Specifies the unique name of the model to be deployed.
-                Types: str
-
-            replace_if_exists:
-                Optional Argument.
-                Specifies whether to replace the model if a model with the same name already
-                exists in Vantage. If this argument is set to False and a model with the same
-                name already exists, then the function raises an exception.
-                Default Value: False
-                Types: bool
-
-        RETURNS:
-            The opensource object wrapper.
-
-        RAISES:
-            TeradataMLException if model with "model_name" already exists and the argument
-            "replace_if_exists" is set to False.
-
-        EXAMPLES:
-            >>> from teradataml import td_sklearn
-            >>> model = td_sklearn.LinearRegression(normalize=True)
-            >>> model
-            LinearRegression(normalize=True)
-
-            # Example 1: Deploy the model held by interface object to Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2")
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-
-            # Example 2: Deploy the model held by interface object to Vantage with the name same
-            # as that of model that already existed in Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
-            Model is deleted.
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-        """
-
-        # Install model file into Vantage, if not installed.
-        self._install_initial_model_file()
-
-        self._save_model(model_name, replace_if_exists)
-        return self
+    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+        if model is None and not module_name and not class_name:
+            raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
+                                                           "module_name and class_name"),
+                                      MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)

+        validator._validate_mutually_inclusive_arguments(module_name, "module_name",
+                                                         class_name, "class_name")

-
+        super().__init__()

-
+        self.module_name = module_name
+        self.class_name = class_name
+        self.kwargs = kwargs if kwargs is not None else {}
+        self.pos_args = pos_args if pos_args is not None else tuple()

-
-
-
+        self._fit_label_columns_types = None
+        self._fit_label_columns_python_types = None
+        self._table_name_prefix = None

-        self.
-
-
-            self.module_name = model.__module__.split("._")[0]
-            self.class_name = model.__class__.__name__
-            # __dict__ gets all the arguments as dictionary including default ones and positional
-            # args.
-            self.kwargs = model.__dict__
-            self.pos_args = tuple() # Kept empty as all are moved to kwargs.
-        else:
-            self._initialize_object()
+        self._is_default_partition_value_fit = True # False when the user provides partition columns.
+        self._fit_partition_colums_non_default = None
+        self._is_default_partition_value_predict = True # False when the user provides partition columns.

     def __repr__(self):
         if self._is_default_partition_value_fit:
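The new _convert_pos_args_to_kwargs_for_function above resolves the wrapped class from its dotted module path and maps positional arguments onto parameter names via __code__.co_varnames. The same mechanism in isolation (a sketch assuming scikit-learn is installed locally):

    from importlib import import_module

    def pos_to_kwargs(module_name, class_name, func_name, pos_args, kwargs):
        # Resolve the function, then zip positional args onto its parameter
        # names; co_varnames[0] is "self", so it is skipped.
        fn = getattr(getattr(import_module(module_name), class_name), func_name)
        kwargs.update(zip(fn.__code__.co_varnames[1:], pos_args))
        return kwargs

    print(pos_to_kwargs("sklearn.linear_model", "LinearRegression",
                        "fit", ("features", "labels"), {}))
    # {'X': 'features', 'y': 'labels'}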
@@ -636,19 +583,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         pd.reset_option("display.max_colwidth")
         return opt

-    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
-                                    skip_either_or_that=False):
-        """
-        Internal function to validate arguments passed to exposed opensource APIs and return
-        parent DataFrame, feature columns, label columns, group columns, data partition columns.
-        """
-        _validate_opensource_func_args(X=X, y=y, groups=groups,
-                                       fit_partition_cols=self._fit_partition_colums_non_default,
-                                       kwargs=kwargs,
-                                       skip_either_or_that=skip_either_or_that)
-        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
-                                               fit_partition_cols=self._fit_partition_colums_non_default)
-
     def _initialize_object(self):
         """
         Internal function to initialize sklearn object from module name and class name.
@@ -657,6 +591,13 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         imported_args = {}
         # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
         # corresponding sklearn object.
+        _partition_column_names = None
+        if "partition_columns" in self.kwargs:
+            self._fit_partition_colums_non_default = self.kwargs["partition_columns"]
+            self._is_default_partition_value_fit = False
+            _partition_column_names = self._fit_partition_colums_non_default
+
+
         new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
         new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)

@@ -681,19 +622,33 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                 # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
                 # are not supported yet due to pickling issue.
                 continue
-            if
-
+                if self.get_params():
+                    if k in self.get_params():
+                        self.kwargs[k] = v
+                else:
+                    _model_init_arguments = None
+                    try:
+                        _model_init_arguments = self.modelObj.__init__.__code__.co_varnames
+                    except AttributeError:
+                        pass
+                    if _model_init_arguments:
+                        self.kwargs = dict((k, v) for k, v in _arguments.items() if k in _model_init_arguments)
+                    else:
+                        self.kwargs = _arguments
         else:
             # Model selection classes will not have `get_params`, in which case modelObj's __dict__
             # is saved as kwargs.
             self.kwargs = _arguments

-
+        if _partition_column_names:
+            self.kwargs["partition_columns"] = _partition_column_names
+
+    def _initialize_variables(self, table_name_prefix):
         """
         Internal function to initialize variables used in this class.
         """
         self.feature_names_in_ = None
-        self._table_name_prefix =
+        self._table_name_prefix = table_name_prefix
         self._model_file_name_prefix = _generate_new_name(type="file")
         self.model_file_paths_local = set()

|
|
|
710
665
|
self._is_model_installed = False
|
|
711
666
|
self._fit_partition_unique_values = [[self._default_data_partition_value]]
|
|
712
667
|
|
|
668
|
+
def _get_returning_df(self, script_df, partition_column, returns):
|
|
669
|
+
"""
|
|
670
|
+
Internal function to return the teradataml Dataframe except
|
|
671
|
+
partition_column.
|
|
672
|
+
"""
|
|
673
|
+
if self._is_default_partition_value_fit:
|
|
674
|
+
# For single model case, partition column is internally generated
|
|
675
|
+
# and no point in returning it to the user.
|
|
676
|
+
|
|
677
|
+
# Extract columns from return types.
|
|
678
|
+
returning_cols = [col[0] for col in returns[len(partition_column):]]
|
|
679
|
+
return script_df.select(returning_cols)
|
|
680
|
+
return script_df
|
|
681
|
+
|
|
713
682
|
def modify_args(self, fp1, arg, imported_args):
|
|
714
683
|
"""
|
|
715
684
|
Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
|
|
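_get_returning_df hides the internally generated partition column in the single-model case by selecting only the declared return columns. The slicing step in isolation (plain Python, names illustrative):

    returns = [("part_id", "INTEGER"), ("prediction", "FLOAT"), ("prob", "FLOAT")]
    partition_column = ["part_id"]

    # Skip the leading partition column(s); keep only user-facing columns.
    returning_cols = [col[0] for col in returns[len(partition_column):]]
    print(returning_cols)  # ['prediction', 'prob']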
@@ -752,61 +721,480 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
|
|
|
752
721
|
self.modify_args(fp1, k, imported_args),
|
|
753
722
|
self.modify_args(fp1, v, imported_args),
|
|
754
723
|
)
|
|
755
|
-
for k, v in arg.items()
|
|
724
|
+
for k, v in arg.items() if k != "partition_columns"
|
|
756
725
|
)
|
|
726
|
+
# elif arg == "partition_columns":
|
|
727
|
+
|
|
757
728
|
else:
|
|
758
729
|
return arg
|
|
759
730
|
|
|
760
|
-
def _install_initial_model_file(self):
|
|
761
|
-
"""
|
|
762
|
-
If model file(s) is/are not installed in Vantage, then install it/them.
|
|
731
|
+
def _install_initial_model_file(self, use_dummy_initial_file=False):
|
|
732
|
+
"""
|
|
733
|
+
If model file(s) is/are not installed in Vantage, then install it/them.
|
|
734
|
+
"""
|
|
735
|
+
if isinstance(self.modelObj, pd.DataFrame):
|
|
736
|
+
# Get list of unique partition values and corresponding model object as dict.
|
|
737
|
+
partition_values_model_dict = {}
|
|
738
|
+
obj_list = self.modelObj.values.tolist()
|
|
739
|
+
for lst in obj_list:
|
|
740
|
+
partition_values_model_dict[tuple(lst[:len(self._fit_partition_colums_non_default)])] = \
|
|
741
|
+
lst[len(self._fit_partition_colums_non_default)]
|
|
742
|
+
|
|
743
|
+
for partition in self._fit_partition_unique_values:
|
|
744
|
+
# Create a new file with file name with partition values and
|
|
745
|
+
# dump sklearn object into it. Finally install the file to Vantage.
|
|
746
|
+
partition_join = "_".join([str(x) for x in partition])
|
|
747
|
+
file_name = f"{self._model_file_name_prefix}_{partition_join}"
|
|
748
|
+
# Replace '-' with '_' as '-' can't be present in file identifier.
|
|
749
|
+
# Needed this replace because partition_columns can be negative.
|
|
750
|
+
file_name = file_name.replace("-", "_")
|
|
751
|
+
full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
|
|
752
|
+
with open(full_file_name, "wb+") as fp:
|
|
753
|
+
# Write sklearn object to file.
|
|
754
|
+
if isinstance(self.modelObj, pd.DataFrame):
|
|
755
|
+
# If multiple models, then write the model corresponding to the partition value.
|
|
756
|
+
fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
|
|
757
|
+
else:
|
|
758
|
+
if use_dummy_initial_file:
|
|
759
|
+
fp.write(pickle.dumps("abc"))
|
|
760
|
+
else:
|
|
761
|
+
fp.write(pickle.dumps(self.modelObj))
|
|
762
|
+
self.model_file_paths_local.add(file_name)
|
|
763
|
+
|
|
764
|
+
self._install_script_file(file_identifier=file_name,
|
|
765
|
+
file_name=file_name,
|
|
766
|
+
is_binary=True,
|
|
767
|
+
file_location=self._tdml_tmp_dir)
|
|
768
|
+
|
|
769
|
+
if self._is_lake_system:
|
|
770
|
+
# Need to pass env_name along with file_name for cleaning up the files in env.
|
|
771
|
+
obj = f"{self._env.env_name}::{file_name}"
|
|
772
|
+
if installed_model_files[obj] == 0:
|
|
773
|
+
# Add to GC for the first time the model file (along with env name) is encountered.
|
|
774
|
+
installed_model_files[obj] = 1
|
|
775
|
+
GarbageCollector._add_to_garbagecollector(object_name=obj,
|
|
776
|
+
object_type=TeradataConstants.TERADATA_APPLY)
|
|
777
|
+
else:
|
|
778
|
+
if installed_model_files[file_name] == 0:
|
|
779
|
+
# Add to GC for the first time the model file is encountered.
|
|
780
|
+
installed_model_files[file_name] = 1
|
|
781
|
+
GarbageCollector._add_to_garbagecollector(object_name=file_name,
|
|
782
|
+
object_type=TeradataConstants.TERADATA_SCRIPT)
|
|
783
|
+
|
|
784
|
+
self._is_model_installed = True
|
|
785
|
+
|
|
786
|
+
def _validate_unique_partition_values(self, data, partition_columns):
|
|
787
|
+
"""
|
|
788
|
+
Internal function to validate if the partition values in partition_columns used in fit()
|
|
789
|
+
and predict() are same.
|
|
790
|
+
"""
|
|
791
|
+
data._index_label = None
|
|
792
|
+
unique_values = data.drop_duplicate(partition_columns).get_values()
|
|
793
|
+
|
|
794
|
+
trans_unique_values = sorted(unique_values.tolist(), key=lambda x: tuple(x))
|
|
795
|
+
fit_unique_values = sorted(self._fit_partition_unique_values.tolist() \
|
|
796
|
+
if not isinstance(self._fit_partition_unique_values, list) \
|
|
797
|
+
else self._fit_partition_unique_values, key=lambda x: tuple(x))
|
|
798
|
+
default_unique_values = [[self._default_data_partition_value]]
|
|
799
|
+
|
|
800
|
+
if fit_unique_values == default_unique_values and \
|
|
801
|
+
trans_unique_values != default_unique_values:
|
|
802
|
+
error_msg = Messages.get_message(MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT,
|
|
803
|
+
"without", "with")
|
|
804
|
+
msg_code = MessageCodes.PARTITION_IN_BOTH_FIT_AND_PREDICT
|
|
805
|
+
raise TeradataMlException(error_msg, msg_code)
|
|
806
|
+
|
|
807
|
+
if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
|
|
808
|
+
raise TeradataMlException(
|
|
809
|
+
Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING, "training", "test"),
|
|
810
|
+
MessageCodes.PARTITION_VALUES_NOT_MATCHING
|
|
811
|
+
)
|
|
812
|
+
|
|
813
|
+
def fit(self, **kwargs):
|
|
814
|
+
pass
|
|
815
|
+
|
|
816
|
+
def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
|
|
817
|
+
"""
|
|
818
|
+
Internal function to convert all OpensourceML related objects in arguments to
|
|
819
|
+
underlying model objects.
|
|
820
|
+
"""
|
|
821
|
+
if isinstance(args, dict):
|
|
822
|
+
new_args = args.copy() # To avoid updating
|
|
823
|
+
for k, v in new_args.items():
|
|
824
|
+
if isinstance(v, type(self)):
|
|
825
|
+
if idx_multi_model is not None:
|
|
826
|
+
# single model. This argument is set only when modelObj is single model.
|
|
827
|
+
new_args[k] = v.modelObj
|
|
828
|
+
else:
|
|
829
|
+
# multi-model. Get appropriate model from modelObj.
|
|
830
|
+
new_args[k] = v.modelObj.iloc[idx_multi_model]["model"]
|
|
831
|
+
else:
|
|
832
|
+
new_args[k] = v
|
|
833
|
+
return new_args
|
|
834
|
+
|
|
835
|
+
# If args is tuple, convert all elements to underlying model object.
|
|
836
|
+
elif isinstance(args, tuple):
|
|
837
|
+
new_args = tuple()
|
|
838
|
+
for arg in args:
|
|
839
|
+
if isinstance(arg, type(self)):
|
|
840
|
+
if idx_multi_model is None:
|
|
841
|
+
# single model. This argument is set only when modelObj is single model.
|
|
842
|
+
new_args += (arg.modelObj,)
|
|
843
|
+
else:
|
|
844
|
+
# multi-model. Get appropriate model from modelObj.
|
|
845
|
+
new_args += (arg.modelObj.iloc[idx_multi_model]["model"],)
|
|
846
|
+
else:
|
|
847
|
+
new_args += (arg,)
|
|
848
|
+
return new_args
|
|
849
|
+
return args
|
|
850
|
+
|
|
851
|
+
+    def __get_obj_attributes_multi_model(self, name):
+        """
+        Internal function to get attributes of all sklearn model objects when multiple
+        models are generated by fit.
+        """
+
+        def __generate_model_object(model_obj_value, init_model_obj):
+            """
+            Internal function to generate a _SkLearnObjectWrapper model object from
+            model_obj_value.
+            """
+            # Create _SkLearnObjectWrapper object from opensource model object.
+            model_obj = self.__class__(model=init_model_obj)
+
+            model_obj.modelObj = model_obj_value
+            model_obj._is_model_installed = True
+
+            # Setting other model attributes.
+            model_obj._is_default_partition_value_fit = self._is_default_partition_value_fit
+            model_obj._is_default_partition_value_predict = self._is_default_partition_value_predict
+            model_obj._fit_partition_colums_non_default = self._fit_partition_colums_non_default
+            model_obj._fit_partition_unique_values = self._fit_partition_unique_values
+            return model_obj
+
+        # Wrapper function to invoke the dynamic method, using arguments
+        # passed by the user, on the model in each row.
+        def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
+            multi_models = self.modelObj.copy()
+            for i in range(multi_models.shape[0]):
+                curr_model = multi_models.iloc[i]["model"]
+                partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
+                partition_values = "_".join([str(x) for x in partition_values])
+                if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
+                    # filename is the first argument.
+                    kwargs1 = kwargs.copy()
+                    c1 = c
+
+                    if len(c) > 0:
+                        c1 = list(c1)
+                        c1[0] = f"{c1[0]}_{partition_values}"
+                        c1 = tuple(c1)
+                    if len(kwargs) > 0 and kwargs.get("filename", None):
+                        kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values}"
+
+                    multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c1, i),
+                                                                            **self._convert_arguments_to_modelObj(kwargs1, i))
+                else:
+                    multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c, i),
+                                                                            **self._convert_arguments_to_modelObj(kwargs, i))
+
+            first_function_value = multi_models.at[0, "model"]
+            if self.__class__._validate_model_supportability(first_function_value):
+                return __generate_model_object(multi_models, init_model_obj=first_function_value)
+
+            multi_models = multi_models.rename(columns={"model": name})
+
+            # Select only partition columns and the attribute column.
+            return multi_models[self._fit_partition_colums_non_default + [name]]
+
+        # Assuming that self.modelObj will have at least 1 row.
+
+        # Get the attribute instance from the first model object.
+        first_attribute_instance = getattr(self.modelObj.iloc[0]["model"], name)
+
+        # If first_attribute_instance is callable, it should be applied on the model in
+        # each row using the passed arguments.
+        if callable(first_attribute_instance):
+            return __sklearn_method_invoker_for_multimodel
+
+        output_attributes = self.modelObj.copy()
+        for i in range(output_attributes.shape[0]):
+            model = output_attributes.iloc[i]["model"]
+            output_attributes.at[i, "model"] = getattr(model, name)
+
+        if self.__class__._validate_model_supportability(first_attribute_instance):
+            return __generate_model_object(output_attributes, init_model_obj=first_attribute_instance)
+
+        return output_attributes.rename(columns={"model": name})
+
+    def __getattr__(self, name):
+        # This just runs attributes (functions and properties) from opensource
+        # (sklearn/lightgbm) objects.
+        def __sklearn_method_invoker(*c, **kwargs):
+            # Opensource model is returned from the function call. Create an
+            # _OpensourceObjectWrapper object.
+            model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c),
+                                           **self._convert_arguments_to_modelObj(kwargs))
+            if self.__class__._validate_model_supportability(model_obj):
+                model_obj = self.__class__(model=model_obj)
+                model_obj._is_model_installed = True  # Trained model is returned by the function call.
+            return model_obj
+
+        if isinstance(self.modelObj, pd.DataFrame):
+            return self.__get_obj_attributes_multi_model(name)
+
+        attribute_instance = getattr(self.modelObj, name)
+
+        if callable(attribute_instance):
+            return __sklearn_method_invoker
+
+        if self.__class__._validate_model_supportability(attribute_instance):
+            # sklearn model is returned from the attribute. Create an _SkLearnObjectWrapper object.
+            model_obj = self.__class__(model=attribute_instance)
+            model_obj._is_model_installed = True  # Trained model is returned as an attribute.
+            return model_obj
+
+        return attribute_instance
+
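The `__getattr__` above relies on a standard Python delegation pattern: the hook fires only for names not found on the wrapper itself, so every unresolved attribute is fetched from the wrapped object, and callables are intercepted so their results can be re-wrapped for chaining. A generic sketch of the pattern (`Delegate` is illustrative, not the actual wrapper class):

    class Delegate:
        def __init__(self, inner):
            self._inner = inner

        def __getattr__(self, name):
            # Only called when normal attribute lookup fails, so names defined on
            # Delegate itself (like _inner) never recurse into this hook.
            attr = getattr(self._inner, name)
            if callable(attr):
                def invoker(*args, **kwargs):
                    result = attr(*args, **kwargs)
                    # Re-wrap results of the delegated type so calls keep chaining.
                    return Delegate(result) if isinstance(result, list) else result
                return invoker
            return attr

    d = Delegate([3, 1, 2])
    print(d.count(1))  # 1 -- the method call is forwarded to the wrapped list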
+    @classmethod
+    def _validate_model_supportability(cls, model):
+        """
+        Internal function to validate whether the model provided for deployment is
+        supported by teradataml's OpensourceML.
+        """
+        error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
+                                         "The given model is not a supported opensource model.")
+        msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
+        package_name = None
+        class_name = None
+        try:
+            # For scikit-learn, model.__module__ is similar to 'sklearn.linear_model._base'.
+            # TODO: check for other supported packages.
+            if hasattr(model, "__module__"):
+                package_name = model.__module__.split(".")[0]
+                if package_name not in OpenSourcePackage.values():
+                    return False
+            if hasattr(model, "__class__"):
+                class_name = model.__class__.__name__
+        except Exception as ex:
+            # In case accessing model.__module__ fails.
+            raise TeradataMlException(error_msg, msg_code) from ex
+
+        # True only if the package name is an opensource package name and the class is
+        # not an internal class.
+        return bool(package_name and class_name and
+                    package_name == cls.OPENSOURCE_PACKAGE_NAME.value and
+                    not class_name.startswith("_"))
+
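The check above hinges on `__module__` and `__class__` introspection. A quick illustration of what it sees for a scikit-learn estimator (requires scikit-learn installed):

    from sklearn.linear_model import LinearRegression

    model = LinearRegression()
    print(model.__module__)                # 'sklearn.linear_model._base'
    print(model.__module__.split(".")[0])  # 'sklearn' -- the package-name test above
    print(model.__class__.__name__)        # 'LinearRegression'
    # Internal helper classes start with "_" and would be rejected:
    print(model.__class__.__name__.startswith("_"))  # False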
+    def _save_model(self, model_name, replace_if_exists=False):
+        """
+        Internal function to save the model stored in the file at the location mentioned
+        by class variable "model_file_path_local" to Vantage using BYOM methods save_byom()
+        and delete_byom(), based on the value of the "replace_if_exists" argument.
+        """
+        # Create a table, if it doesn't exist, in Vantage to store the model info.
+        conn = get_connection()
+        osml_models_table_exists = conn.dialect.has_table(conn,
+                                                          table_name=_OSML_MODELS_TABLE_NAME,
+                                                          schema=self._db_name,
+                                                          table_only=True)
+        if not osml_models_table_exists:
+            all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
+            all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
+            _create_table(table_name=_OSML_MODELS_TABLE_NAME, columns=all_columns,
+                          primary_index=_OSML_MODELS_PRIMARY_INDEX, schema_name=self._db_name)
+
+        model_obj = OpensourceModels(is_default_partition_value=self._is_default_partition_value_fit,
+                                     partition_file_prefix=self._model_file_name_prefix,
+                                     fit_partition_columns_non_default=self._fit_partition_colums_non_default,
+                                     model=self.modelObj,
+                                     pos_args=self.pos_args,
+                                     key_args=self.kwargs)
+
+        # Save the model object to a file to be used in save_byom() for writing to the
+        # Vantage table.
+        file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
+        with open(file_name, "wb+") as fp:
+            fp.write(pickle.dumps(model_obj))
+
+        try:
+            save_byom(model_id=model_name,
+                      model_file=file_name,
+                      table_name=_OSML_MODELS_TABLE_NAME,
+                      additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
+                      additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
+        except TeradataMlException as ex:
+            model_exists_msg = Messages.get_message(MessageCodes.MODEL_ALREADY_EXISTS, model_name)
+            if not replace_if_exists and model_exists_msg == str(ex):
+                raise
+            elif replace_if_exists and model_exists_msg == str(ex):
+                # Delete the model from the models table and save it again.
+                delete_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME)
+                save_byom(model_id=model_name,
+                          model_file=file_name,
+                          table_name=_OSML_MODELS_TABLE_NAME,
+                          additional_columns_types=_OSML_ADDITIONAL_COLUMN_TYPES,
+                          additional_columns={"package": self.OPENSOURCE_PACKAGE_NAME.value})
+            else:
+                raise
+        finally:
+            os.remove(file_name)
+
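The client-side half of `_save_model` is a plain pickle-to-temp-file step; only the `save_byom()` call needs a live Vantage session. A minimal sketch of that serialization flow (the `deployed_file.pickle` name mirrors the method above; the `save_byom` line is left as a comment since it requires a connection):

    import os
    import pickle
    import tempfile

    from sklearn.linear_model import LinearRegression

    model = LinearRegression()
    file_name = os.path.join(tempfile.gettempdir(), "deployed_file.pickle")
    try:
        with open(file_name, "wb+") as fp:
            fp.write(pickle.dumps(model))  # same serialization step as above
        # save_byom(model_id="my_model", model_file=file_name, table_name=...)  # needs Vantage
    finally:
        os.remove(file_name)               # the temp file is always cleaned up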
+    @classmethod
+    def _deploy(cls, model_name, model, replace_if_exists=False):
+        """
+        Internal function to create an instance of the class using the model and deploy
+        the model to Vantage.
+        """
+        is_model_supportable = cls._validate_model_supportability(model=model)
+        if not is_model_supportable:
+            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED,
+                                                           "deploy", "The given model is not a supported opensource model."),
+                                      MessageCodes.MODEL_CATALOGING_OPERATION_FAILED)
+
+        obj = cls(model=model)
+        # Load the model file into the Vantage node as the file can be used in
+        # predict or other operations.
+        obj._install_initial_model_file()
+
+        obj._save_model(model_name, replace_if_exists)
+
+        return obj
+
+    @classmethod
+    def _load(cls, model_name):
+        """
+        Internal function to load the model corresponding to the package (like sklearn etc.)
+        from Vantage to the client using retrieve_byom(), and create an instance of the
+        class if the model is from the same package.
+        """
+        try:
+            model = retrieve_byom(model_id=model_name, table_name=_OSML_MODELS_TABLE_NAME,
+                                  return_addition_columns=True)
+        except TeradataMlException as ex:
+            # Not showing the table name in the error message as it is an internal table.
+            part_msg = f"Model '{model_name}' not found in the table "
+            if part_msg in str(ex):
+                raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name, ""),
+                                          MessageCodes.MODEL_NOT_FOUND)
+            raise
+
+        model_vals_list = model.get_values()[0]
+        # List of elements (the model name is the index column):
+        # - 0th contains the model object with fields: is_default_partition_value,
+        #   partition_file_prefix, model, etc.
+        # - 1st contains the package name.
+        model_obj = pickle.loads(model_vals_list[0])
+        model = model_obj.model
+        package = model_vals_list[1]
+
+        if package != cls.OPENSOURCE_PACKAGE_NAME.value:
+            # Raise an error if trying to access a model of a different package.
+            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
+                                                           f". Requested model is from '{package}' package"),
+                                      MessageCodes.MODEL_NOT_FOUND)
+
+        if isinstance(model, pd.DataFrame):
+            # Create a new instance of the class and set the model object on the instance.
+            # Instantiation can take only a model, not a model object. Hence, passing one of
+            # the models from the pandas df and updating modelObj and other fields later.
+            obj = cls(model=model.iloc[1, 2])
+            obj.modelObj = model
+            obj._fit_partition_unique_values = [lst[:len(lst)-1] for lst in model.values.tolist()]
+        else:
+            obj = cls(model=model)
+
+        obj._model_file_name_prefix = model_obj.partition_file_prefix
+        obj._is_default_partition_value_fit = model_obj.is_default_partition_value
+        obj._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
+        obj.pos_args = model_obj.pos_args
+        obj.kwargs = model_obj.key_args
+
+        # Load the model file into the Vantage node as the file can be used in
+        # predict or other operations.
+        obj._install_initial_model_file()
+
+        return obj
+
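`_deploy` and `_load` back the user-facing deploy/load round trip. A hedged usage sketch, assuming a connected Vantage session and that the public `td_sklearn.deploy`/`td_sklearn.load` entry points mirror these internal signatures:

    from sklearn.linear_model import LinearRegression
    from teradataml import td_sklearn

    # Deploy a locally created estimator under a unique name, then retrieve it.
    lin_reg = td_sklearn.deploy("linreg_model_ver_2", LinearRegression())
    restored = td_sklearn.load("linreg_model_ver_2")
    print(restored)  # LinearRegression()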
+    def deploy(self, model_name, replace_if_exists=False):
+        """
+        DESCRIPTION:
+            Deploys the model held by the interface object to Vantage.
+
+        PARAMETERS:
+            model_name:
+                Required Argument.
+                Specifies the unique name of the model to be deployed.
+                Types: str
+
+            replace_if_exists:
+                Optional Argument.
+                Specifies whether to replace the model if a model with the same name already
+                exists in Vantage. If this argument is set to False and a model with the same
+                name already exists, then the function raises an exception.
+                Default Value: False
+                Types: bool
+
+        RETURNS:
+            The opensource object wrapper.
+
+        RAISES:
+            TeradataMlException if a model with "model_name" already exists and the argument
+            "replace_if_exists" is set to False.
+
+        EXAMPLES:
+            >>> from teradataml import td_sklearn
+            >>> model = td_sklearn.LinearRegression(normalize=True)
+            >>> model
+            LinearRegression(normalize=True)
+
+            # Example 1: Deploy the model held by the interface object to Vantage.
+            >>> lin_reg = model.deploy("linreg_model_ver_2")
+            Model is saved.
+            >>> lin_reg
+            LinearRegression(normalize=True)
+
+            # Example 2: Deploy the model held by the interface object to Vantage with the
+            # same name as a model that already exists in Vantage.
+            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
+            Model is deleted.
+            Model is saved.
+            >>> lin_reg
+            LinearRegression(normalize=True)
         """
-        if isinstance(self.modelObj, pd.DataFrame):
-            # Get list of unique partition values and corresponding model object as dict.
-            partition_values_model_dict = {}
-            obj_list = self.modelObj.values.tolist()
-            for lst in obj_list:
-                partition_values_model_dict[tuple(lst[:len(lst)-1])] = lst[len(lst)-1]
 
-
-
-            # dump sklearn object into it. Finally install the file to Vantage.
-            partition_join = "_".join([str(x) for x in partition])
-            file_name = f"{self._model_file_name_prefix}_{partition_join}"
-            # Replace '-' with '_' as '-' can't be present in file identifier.
-            # Needed this replace because partition_columns can be negative.
-            file_name = file_name.replace("-", "_")
-            full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
-            with open(full_file_name, "wb+") as fp:
-                # Write sklearn object to file.
-                if isinstance(self.modelObj, pd.DataFrame):
-                    # If multiple models, then write the model corresponding to the partition value.
-                    fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
-                else:
-                    fp.write(pickle.dumps(self.modelObj))
-            self.model_file_paths_local.add(file_name)
+        # Install the model file into Vantage, if not installed.
+        self._install_initial_model_file()
 
-
-            is_binary=True,
-            file_location=self._tdml_tmp_dir)
+        self._save_model(model_name, replace_if_exists)
+        return self
 
-        if self._is_lake_system:
-            # Need to pass env_name along with file_name for cleaning up the files in env.
-            obj = f"{self._env.env_name}::{file_name}"
-            if installed_model_files[obj] == 0:
-                # Add to GC for the first time the model file (along with env name) is encountered.
-                installed_model_files[obj] = 1
-                GarbageCollector._add_to_garbagecollector(object_name=obj,
-                                                          object_type=TeradataConstants.TERADATA_APPLY)
-        else:
-            if installed_model_files[file_name] == 0:
-                # Add to GC for the first time the model file is encountered.
-                installed_model_files[file_name] = 1
-                GarbageCollector._add_to_garbagecollector(object_name=file_name,
-                                                          object_type=TeradataConstants.TERADATA_SCRIPT)
 
-
+class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
+
+    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+
+    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+        super().__init__(model=model, module_name=module_name, class_name=class_name,
+                         pos_args=pos_args, kwargs=kwargs)
+
+        self._initialize_variables(table_name_prefix="td_sklearn_")
+        if model is not None:
+            self.modelObj = model
+            self.module_name = model.__module__.split("._")[0]
+            self.class_name = model.__class__.__name__
+            # __dict__ gets all the arguments as a dictionary, including default ones and
+            # positional args.
+            self.kwargs = model.__dict__
+            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
+        else:
+            self._initialize_object()
+
+    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
+                                    skip_either_or_that=False):
+        """
+        Internal function to validate arguments passed to exposed opensource APIs and return
+        the parent DataFrame, feature columns, label columns, group columns, and data
+        partition columns.
+        """
+        _validate_opensource_func_args(X=X, y=y, groups=groups,
+                                       fit_partition_cols=self._fit_partition_colums_non_default,
+                                       kwargs=kwargs,
+                                       skip_either_or_that=skip_either_or_that)
+        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
+                                               fit_partition_cols=self._fit_partition_colums_non_default)
 
     def _run_fit_related_functions(self,
                                    data,
@@ -814,7 +1202,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                    label_columns,
                                    partition_columns,
                                    func,
-                                   classes=None
+                                   classes=None,
+                                   file_name="sklearn_fit.py"):
         """
         Internal function to run fit() and partial_fit() functions.
         """
@@ -829,9 +1218,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in new_partition_columns] + [("model", model_type)]
 
-        file_name = "sklearn_fit.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
-
         if classes:
             class_type = type(classes[0]).__name__
             classes = "--".join([str(x) for x in classes])
@@ -857,13 +1243,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._model_data = self._run_script(data, script_command, new_partition_columns,
                                             return_types)
 
-
-        self.extract_sklearn_obj(n_unique_partitions=len(self._fit_partition_unique_values),
-                                 n_partition_cols=len(new_partition_columns))
-
-        # Need this label columns types in prediction.
-        self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
-                                         for l_c in label_columns]
+        self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
 
     def partial_fit(self, X=None, y=None, classes=None, **kwargs):
         """
@@ -911,11 +1291,19 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._is_default_partition_value_fit = False
         self._fit_partition_colums_non_default = partition_columns
 
-
-
-
-
-
+        file_name = kwargs.pop("file_name", None)
+        func_name = kwargs.pop("name", "fit")
+
+        args = {"data": data,
+                "feature_columns": feature_columns,
+                "label_columns": label_columns,
+                "partition_columns": partition_columns,
+                "func": func_name}
+
+        if file_name is not None:
+            args["file_name"] = file_name
+
+        self._run_fit_related_functions(**args)
 
         self._fit_execution_time = time.time() - st_time
 
@@ -980,10 +1368,130 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
         return super().__getattr__(name)
 
+    def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
+                                      func_name, **kwargs):
+        """
+        Internal function to handle the multi-model case of the transform function for the
+        functions ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of the
+        feature_selection module and "Birch" of the cluster module.
+        These functions generate multiple models, and when transform is applied to each
+        model, it generates output with a different number of columns.
+        """
+        skl_objs_dict = {}
+        no_of_unique_partitions = len(self._fit_partition_unique_values)
+        no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
+
+        # Run on 10 rows of data individually using the corresponding scikit-learn objects,
+        # based on partition value, and get the maximum number of columns and their types.
+        for i in range(no_of_unique_partitions):
+            skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
+
+        data = data.select(feature_columns + label_columns + partition_columns)
+        ten_row_data = data.head(10).get_values()
+        X = numpy.array(ten_row_data)
+
+        # For the multi-model case, the model on one AMP can give more columns than models
+        # on other AMPs. The RETURNS clause can't contain a different number of columns on
+        # different AMPs. Hence, taking the maximum number of columns and their types
+        # across all models.
+        max_no_of_columns = 0
+        max_col_names = []
+        max_col_types = []
+
+        def _get_input_row_without_nans(row):
+            """
+            `inverse_transform` input should not contain NaNs. Hence, removing NaNs from the row.
+            """
+            X1 = []
+            for _, v in enumerate(row):
+                if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
+                    # Add to the list when:
+                    # - v is None, or
+                    # - v is a string, or
+                    # - v is not NaN, or
+                    # - the module is impute (which transforms NaN values), even though v is NaN.
+                    X1.append(v)
+                else:
+                    # Skip NaN values.
+                    pass
+            return X1
+
+        for i in range(X.shape[0]):
+            # Run `transform` or `inverse_transform` on each row with the corresponding
+            # scikit-learn model object.
+            partition_values = tuple(X[i, -no_of_partitioning_cols:])
+            skl_obj = skl_objs_dict[partition_values]
+
+            X1 = X[i, :-no_of_partitioning_cols]
+            # Since NaNs/NULLs are added in transform for the last columns where some models
+            # generated fewer columns, removing NaNs/NULLs from the input row for
+            # inverse_transform using the function _get_input_row_without_nans().
+            X1 = numpy.array([_get_input_row_without_nans(X1)])
+
+            trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
+
+            no_of_columns = 1
+
+            if trans_opt.shape == (X1.shape[0],):
+                trans_opt = trans_opt.reshape(X1.shape[0], 1)
+
+            if isinstance(trans_opt[0], numpy.ndarray) \
+                    or isinstance(trans_opt[0], list) \
+                    or isinstance(trans_opt[0], tuple):
+                no_of_columns = len(trans_opt[0])
+
+            col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
+
+            # Get new column sqlalchemy types for the pandas df columns of the transform output.
+            opt_pd = pd.DataFrame(trans_opt)
+
+            # Get output column types for each column in the pandas df from the output of
+            # the transform-type functions.
+            types = {}
+            for idx in range(no_of_columns):
+                col = list(opt_pd.columns)[idx]
+
+                # Only one row in trans_opt.
+                if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
+                    type_ = type(trans_opt[0][idx])
+                else:
+                    # Only one value in the output.
+                    type_ = type(trans_opt[0])
+
+                # If the type of the output value (trans_opt) is None, then use `str` as
+                # the type, since pandas astype() does not accept None type.
+                if type_ is type(None):
+                    type_ = str
+
+                # numpy integer columns with NaN values can't be typecast using pd.astype()
+                # to int64. It raises an error like "Cannot convert non-finite values
+                # (NA or inf) to integer: Error while type casting for column '2'".
+                # Hence, using pd.Int64Dtype() for integer columns with NaN values.
+                types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
+
+            # Without this, all columns will be of object type and get converted to VARCHAR
+            # in Vantage.
+            opt_pd = opt_pd.astype(types)
+
+            # If the datatype is not specified, then check whether the datatype is
+            # datetime64 and a timezone is present; if so, map it to
+            # TIMESTAMP(timezone=True), else map it according to the default value.
+            col_types = [TIMESTAMP(timezone=True)
+                         if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
+                         else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
+                         for key, col_name in enumerate(list(opt_pd.columns))]
+
+            # Different models in the multi-model case can generate different numbers of
+            # output columns, for example in SelectFpr. Hence, taking the model which
+            # generates the maximum number of columns.
+            if no_of_columns > max_no_of_columns:
+                max_no_of_columns = no_of_columns
+                max_col_names = col_names
+                max_col_types = col_types
+
+        return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
+
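The essence of `_special_handling_multimodel_` is that per-partition models can emit different column counts while the script's RETURNS clause needs one fixed schema, so the widest output wins and narrower outputs are padded with NaNs/NULLs. A toy, self-contained illustration of that width computation (not the actual script plumbing):

    import numpy as np
    import pandas as pd

    # Pretend transform outputs from three per-partition models (ragged widths).
    outputs = {("p1",): np.array([[1.0, 2.0, 3.0]]),
               ("p2",): np.array([[4.0, 5.0]]),
               ("p3",): np.array([[6.0]])}

    # The widest model decides the schema, as in the max_no_of_columns logic above.
    max_cols = max(o.shape[1] for o in outputs.values())  # -> 3

    # Pad narrower outputs with NaN so every partition fits one fixed schema.
    padded = {k: np.pad(o, ((0, 0), (0, max_cols - o.shape[1])),
                        constant_values=np.nan)
              for k, o in outputs.items()}
    print(pd.DataFrame(np.vstack(list(padded.values()))))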
     def _get_return_columns_for_function_(self,
                                           data,
                                           feature_columns,
                                           label_columns,
+                                          partition_columns,
                                           func_name,
                                           kwargs):
         """
@@ -997,7 +1505,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
                  data._td_column_names_and_sqlalchemy_types[col.lower()])
                 for i, col in enumerate(label_columns)]
-
+
+        if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
         """
         Return predict columns using either label_columns (if provided) or
         self._fit_label_columns_types (if the function is trained using label columns).
@@ -1012,8 +1521,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
                 for i, col_type in enumerate(self._fit_label_columns_types)]
 
-        data = data.select(feature_columns + label_columns)
-
         ## If the function is not `fit_predict`:
         # then take one row of transform/other functions to execute on the client
         # to get the number of columns in the return clause and their Vantage types.
@@ -1027,8 +1534,20 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             skl_obj = self.modelObj
         else:
             # Multi-model case.
+            if (func_name in ["transform", "inverse_transform"] and \
+                    self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
+                    (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
+                # Special handling of the multi-model case for the transform function, as
+                # these classes generate transform output with a different number of columns
+                # for each model. Hence, NULLs/NaNs need to be added to columns which are not
+                # present in the transform output of some models.
+                return self._special_handling_multimodel_(data, feature_columns, label_columns,
+                                                          partition_columns, func_name, **kwargs)
+
             skl_obj = self.modelObj.iloc[0]["model"]
 
+        data = data.select(feature_columns + label_columns)
+
         ten_row_data = data.head(10).get_values()
         X = numpy.array(ten_row_data)
         if label_columns:
@@ -1122,7 +1641,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         # It raises an error like "Cannot convert non-finite values (NA or inf) to integer:
         # Error while type casting for column '2'"
         # Hence, using pd.Int64Dtype() for integer columns with NaN values.
-        types[col] = type_ if type_
+        types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
 
         # Without this, all columns will be of object type and get converted to VARCHAR in Vantage.
         opt_pd = opt_pd.astype(types)
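The `pd.Int64Dtype()` change above works around a real pandas restriction that is easy to reproduce in isolation: integer data that picked up NaN cannot be cast to plain int64, but the nullable Int64 extension dtype accepts it.

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, np.nan])        # NaN forces the column to float64
    try:
        s.astype(np.int64)               # ValueError: cannot convert non-finite values
    except ValueError as err:
        print(err)
    print(s.astype(pd.Int64Dtype()))     # 1, 2, <NA> -- nullable integer column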
@@ -1137,7 +1656,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
 
     @_validate_fit_run
-    def _run_function_needing_all_rows(self, X=None, y=None, **kwargs):
+    def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
         """
         Internal function to run functions like score, aic, and bic, which need all rows and
         return one floating-point number as the result.
@@ -1160,9 +1679,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                          label_columns,
                                          partition_columns)
 
-        file_name = "sklearn_score.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
-
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
 
@@ -1180,7 +1696,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in new_partition_columns] + [(func_name, FLOAT())]
 
-
+        # Check the trained model installation. If not installed,
+        # install it and set the flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1194,7 +1714,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return opt
 
     @_validate_fit_run
-    def _transform(self, X=None, y=None, **kwargs):
+    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
         """
         Internal function to run predict/transform and similar functions, which return
         multiple columns. This function will return the data row along with the generated
@@ -1217,19 +1737,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                          partition_columns)
 
         # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
-
-        kwargs.pop("data")
-        if "feature_columns" in kwargs:
-            kwargs.pop("feature_columns")
-        if "group_columns" in kwargs:
-            kwargs.pop("group_columns")
-        if "partition_columns" in kwargs:
-            kwargs.pop("partition_columns")
-        if "label_columns" in kwargs:
-            kwargs.pop("label_columns")
-
-        file_name = "sklearn_transform.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
+        self._remove_data_related_args_from_kwargs(kwargs)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1239,26 +1747,42 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
         self._validate_unique_partition_values(data, new_partition_columns)
 
-
-
-
-            f"{self._model_file_name_prefix} {self._is_lake_system}"
+        return_columns_python_types = None
+        if self._fit_label_columns_python_types:
+            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
 
         # Returning feature columns also along with transformed columns, because we don't
         # know the mapping of feature columns to the transformed columns.
-
-
+        ## 'correct_covariance()' returns the (n_features, n_features) matrix.
+        if func_name == "correct_covariance":
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in new_partition_columns]
+        else:
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in (new_partition_columns + feature_columns)]
         if func_name in ["predict", "decision_function"] and label_columns:
             return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                              for col in label_columns]
-        return_types += self._get_return_columns_for_function_(data,
-                                                               feature_columns,
-                                                               label_columns,
-                                                               func_name,
-                                                               kwargs)
 
-
+        output_cols_types = self._get_return_columns_for_function_(data,
+                                                                   feature_columns,
+                                                                   label_columns,
+                                                                   new_partition_columns,
+                                                                   func_name,
+                                                                   kwargs)
+        return_types += output_cols_types
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+                         f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
+                         f"{return_columns_python_types}"
+
+        # Check the trained model installation. If not installed,
+        # install it and set the flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1294,6 +1818,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return_types += self._get_return_columns_for_function_(data,
                                                                feature_columns,
                                                                label_columns,
+                                                               new_partition_columns,
                                                                func_name,
                                                                {})
         else:
@@ -1302,7 +1827,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
 
         file_name = "sklearn_fit_predict.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         data_column_types_str, partition_indices_str, _, new_partition_columns = \
             self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
@@ -1317,7 +1841,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         # Get unique values in partitioning columns.
         self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
-
+        # Check the trained model installation. If not installed,
+        # install it and set the flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1376,14 +1904,10 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                          skip_either_or_that=True)
 
         # Remove the kwargs data.
-
-        partition_cols = kwargs.pop("partition_columns", None)
-        feature_cols = kwargs.pop("feature_columns", None)
-        label_cols = kwargs.pop("label_columns", None)
+        self._remove_data_related_args_from_kwargs(kwargs)
 
         if partition_columns:
             # kwargs are passed to the kneighbors function. So, removing them from kwargs.
-            kwargs.pop("partition_columns")
             self._is_default_partition_value_fit = False
 
         # Generating new partition column name.
@@ -1395,7 +1919,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         args_str = self._get_kwargs_str(kwargs)
 
         file_name = "sklearn_neighbors.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1429,7 +1952,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         # Get unique values in partitioning columns.
         self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
-
+        # Check the trained model installation. If not installed,
+        # install it and set the flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1513,7 +2040,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                          group_columns)
 
         file_name = "sklearn_model_selection_split.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1548,7 +2074,11 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         # Get unique values in partitioning columns.
         self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
-
+        # Check the trained model installation. If not installed,
+        # install it and set the flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1562,154 +2092,69 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
         return opt
 
-    def _get_returning_df(self, script_df, partition_column, returns):
-        """
-        Internal function to return the teradataml DataFrame except
-        partition_column.
-        """
-        if self._is_default_partition_value_fit:
-            # For the single-model case, the partition column is internally generated
-            # and there is no point in returning it to the user.
-
-            # Extract columns from return types.
-            returning_cols = [col[0] for col in returns[len(partition_column):]]
-            return script_df.select(returning_cols)
-        return script_df
 
-class
-    def __init__(self, module_name, func_name):
+class _FunctionWrapper(_GenericObjectWrapper):
+    def __init__(self, module_name, func_name, file_type, template_file):
         super().__init__()
-        self.
-        self.
-        self.
-        self.
-        self.
+        self._module_name = module_name
+        self._func_name = func_name
+        self._params = None
+        self._data_args = OrderedDict()
+        self._template_file = template_file
+        self._script_file_name = _generate_new_name(type=file_type, extension="py")
 
     def __call__(self, **kwargs):
         """
        Run the function with all the arguments passed from the `td_sklearn.<function_name>` function.
         """
-
-        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
-        if partition_cols:
-            kwargs.pop("partition_columns")
-
-        # Separate dataframe related arguments and their column names from actual kwargs.
-        for k, v in kwargs.items():
-            if isinstance(v, DataFrame):
-                # All dataframes should be a select of the parent dataframe.
-                _validate_df_query_type(v, "select", k)
-
-                # Save all columns in dataframe related arguments.
-                __data_columns.extend(v.columns)
-
-                self.__data_args[k] = v
+        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)
 
+        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{self._script_file_name}"
 
-
-        self._validate_existence_of_partition_columns(partition_cols, self.__tdml_df.columns)
-
-        self.__tdml_df = self.__tdml_df.select(__data_columns + partition_cols)
-
-        self.__tdml_df, partition_cols = self._get_data_and_data_partition_columns(self.__tdml_df,
-                                                                                   __data_columns,
-                                                                                   [],
-                                                                                   partition_cols
-                                                                                   )
-
-        # Prepare a string of data arguments with the name, the indices where columns of
-        # that argument reside, and the types of each of the columns.
-        data_args_str = self._prepare_data_args_string(kwargs)
+        model_file_prefix = None
+        if self._is_lake_system:
+            model_file_prefix = self._script_file_name.replace(".py", "")
 
-
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {model_file_prefix} {self._is_lake_system}"
 
-        data_column_types_str, partition_indices_str, _, partition_cols = \
-            self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)
+        model_type = BLOB() if self._is_lake_system else CLOB()
 
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {partition_indices_str} {data_column_types_str} {data_args_str}"
+        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in partition_cols] + [(self._func_name, model_type)]
 
-
+        replace_dict.update({"<module_name>": self._module_name,
+                             "<func_name>": self._func_name,
+                             "<params>": json.dumps(kwargs)})
 
         # Generate a new file in the .teradataml directory and install it to Vantage.
-        self._prepare_and_install_file()
+        self._prepare_and_install_file(replace_dict=replace_dict)
+
+        try:
+            self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
+            self._model_data._index_label = None
 
-        self._model_data._index_label = None
+            fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()
 
-
+            self._extract_model_objs(n_unique_partitions=len(fit_partition_unique_values),
+                                     n_partition_cols=len(partition_cols))
 
+        except Exception as ex:
+            # File cleanup if script execution fails or we are unable to fetch modelObj.
+            os.remove(self._script_file_local)
+            self._remove_script_file(self._script_file_name)
+            raise
 
         # File cleanup after processing.
-        os.remove(self.
-
-                  force_remove=True)
+        os.remove(self._script_file_local)
+        self._remove_script_file(self._script_file_name)
 
         return self.modelObj
 
-    def _prepare_data_args_string(self, kwargs):
-        """
-        Get column indices and types of each data related argument in the format:
-        "{<arg_name>-<comma separated indices>-<comma separated types>}--
-         {<arg_name>-<comma separated indices>-<comma separated types>}"
-        """
-        data_args_str = []
-        for arg_name in list(self.__data_args.keys()):
-            # Remove DataFrame arguments from kwargs, which will be passed to Script.
-            kwargs.pop(arg_name)
-
-            # Get column indices and their types for each dataframe from the parent dataframe.
-            _, partition_indices_str, partition_types_str, _ = \
-                self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
-                                                                             self.__data_args[arg_name].columns,
-                                                                             idx_delim=",",
-                                                                             types_delim=",")
-
-            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
-            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
-
-        # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
-        # {<arg_name>-<comma separated indices>-<comma separated types>}"
-        return "--".join(data_args_str)
-
-    def _validate_existence_of_partition_columns(self, partition_columns, all_columns):
-        """
-        Validate whether columns in the "partition_columns" argument are present in any of
-        the given dataframes.
-        """
-        invalid_part_cols = [c for c in partition_columns if c not in all_columns]
-
-        if invalid_part_cols:
-            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
-                                                  ", ".join(invalid_part_cols),
-                                                  "', '".join(list(self.__data_args.keys())))
-                             )
-
-    def _prepare_and_install_file(self):
-        """
-        Prepare the function script file from the template file and install it in Vantage.
-        """
-        with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
-            script_data = fp.read()
-        script_data = script_data.replace("<module_name>", self.__module_name).\
-            replace("<func_name>", self.__func_name).replace("<params>", json.dumps(self.__params))
-
-        self._model_file_local = os.path.join(self._tdml_tmp_dir, self._model_file_name)
-
-        with open(self._model_file_local, "w") as fp:
-            fp.write(script_data)
-
-        self._install_script_file(file_identifier=self._model_file_name.split(".")[0],
-                                  file_name=self._model_file_name,
-                                  file_location=self._tdml_tmp_dir)
 
+class _SKLearnFunctionWrapper(_FunctionWrapper):
+    def __init__(self, module_name, func_name):
+        file_type = "file_fn_sklearn"
+        template_file = "sklearn_function.template"
+        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)