teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/opensource/sklearn/_sklearn_wrapper.py

@@ -49,7 +49,7 @@ from teradataml.opensource.sklearn.constants import OpenSourcePackage, _OSML_MOD
 from teradataml.common.messagecodes import MessageCodes
 from teradataml.common.messages import Messages
 from teradataml.catalog.byom import save_byom, retrieve_byom, delete_byom
-from teradataml.dbutils.dbutils import _create_table
+from teradataml.dbutils.dbutils import _create_table, set_session_param
 from teradataml.utils.validators import _Validators
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
@@ -64,6 +64,10 @@ validator = _Validators()
 
 installed_model_files = defaultdict(int)
 
+## Flag to ensure the sklearn script
+## installation occurs only once.
+_file_installed = False
+
 class _GenericObjectWrapper:
     def __init__(self) -> None:
         self._db_name = _get_current_databasename()
@@ -76,7 +80,7 @@ class _GenericObjectWrapper:
         self.modelObj = None
         self._model_data = None
 
-        self._tdml_tmp_dir =
+        self._tdml_tmp_dir = GarbageCollector._get_temp_dir_name()
 
         self._env = None
 
@@ -86,43 +90,24 @@ class _GenericObjectWrapper:
             if configure.openml_user_env is not None:
                 self._env = configure.openml_user_env
             else:
-                self._create_or_get_env()
+                self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
         else:
-
+            set_session_param("searchuifdbpath",self._db_name)
 
-
-
-
-        creates the environment using template file and return the env.
-        """
-        # Get the template file path.
-        template_dir_path = os.path.join(_TDML_DIRECTORY, "data", "templates",
-                                         "open_source_ml.json")
+        global _file_installed
+        ## Flag to check whether trained model is installed or not.
+        self._is_trained_model_installed = False
 
-
-
-
+        ## Install all sklearn script files on Vantage.
+        if not _file_installed:
+            sklearn_script_files = ["sklearn_fit.py", "sklearn_score.py",
+                                    "sklearn_transform.py", "sklearn_fit_predict.py",
+                                    "sklearn_neighbors.py", "sklearn_model_selection_split.py"]
+            for script_file in sklearn_script_files:
+                self._install_script_file(file_identifier=script_file.split(".")[0],
+                                          file_name=script_file)
 
-
-        _env_name = data["env_specs"][0]["env_name"]
-
-        try:
-            # Call function to 'openml_env' get env.
-            self._env = get_env(_env_name)
-        except TeradataMlException as tdml_e:
-            # We will get here when error says, env does not exist otherwise raise the exception as is.
-            # Env does not exist so create one.
-
-            exc_msg = "Failed to execute get_env(). User environment '{}' not " \
-                      "found.".format(_env_name)
-            if exc_msg in tdml_e.args[0]:
-                print(f"No OpenAF environment with name '{_env_name}' found. Creating one with "\
-                      "latest supported python and required packages.")
-                _env = create_env(template=template_dir_path)
-            else:
-                raise tdml_e
-        except Exception as exc:
-            raise exc
+        _file_installed = True
 
     def _get_columns_as_list(self, cols):
         """
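The new module-level `_file_installed` flag turns script installation into a once-per-process operation: the first `_GenericObjectWrapper` pushes all six sklearn driver scripts to Vantage, and later instances skip the upload. A minimal sketch of the same guard pattern, with `install_script` as a hypothetical stand-in for the internal `_install_script_file`:

```python
# Sketch of the install-once guard above; install_script is illustrative,
# not the teradataml API.
_scripts_installed = False

def ensure_scripts_installed(install_script, scripts):
    """Upload helper scripts at most once per Python process."""
    global _scripts_installed
    if _scripts_installed:
        return
    for name in scripts:
        # The file identifier is the file name without its extension.
        install_script(file_identifier=name.split(".")[0], file_name=name)
    _scripts_installed = True
```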
@@ -205,34 +190,65 @@ class _GenericObjectWrapper:
                                             is_binary=is_binary)
         else:
             status = self._env.install_file(file_path=new_script,
-
-
+                                            replace=True,
+                                            suppress_output=True)
         if not status:
             raise TeradataMlException(
                 f"Script file '{file_name}' failed to get installed/replaced in Vantage."
             )
 
-    def
+    def _remove_script_file(self, file_name):
         """
-
-        So, get the indices and types of these columns from the data columns.
+        Internal function to remove script file in Vantage.
         """
-
-
+        # _env is set while object creation
+        # If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
+
+        if not self._is_lake_system:
+            status = remove_file(file_identifier=file_name.split(".")[0],
+                                 force_remove=True,
+                                 suppress_output=True)
+        else:
+            status = self._env.remove_file(file_name=file_name,
+                                           suppress_output=True)
+        if not status:
+            raise TeradataMlException(
+                f"Script file '{file_name}' failed to remove in Vantage."
+            )
+    def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
+                                                                idx_delim=",",
+                                                                types_delim="--"):
+        """
+        Internal function to get the data column types and partition column names, indices and types.
+        Function returns delimiter separated string of types and indices if idx_delim and
+        types_delim are provided. Otherwise, it returns list of types and indices. Partition names
+        are returned as list always.
+        """
+        data_column_types = "" if types_delim else []
+        partition_indices = "" if idx_delim else []
+        partition_types = "" if types_delim else []
         new_partition_columns = []
+        j = 0
         for i, col in enumerate(data.columns):
+            _type = data._td_column_names_and_sqlalchemy_types[col.lower()].python_type.__name__
+            if types_delim:
+                data_column_types += (_type if i == 0 else f"{types_delim}{_type}")
+            else:
+                data_column_types.append(_type)
             if col in partition_columns:
                 new_partition_columns.append(col)
-
-
-
-
-
-
-
-
-
-
+                if idx_delim:
+                    partition_indices += (str(i) if j == 0 else f"{idx_delim}{str(i)}")
+                else:
+                    partition_indices.append(i)
+                if types_delim:
+                    partition_types += (_type if j == 0 else f"{types_delim}{_type}")
+                else:
+                    partition_types.append(_type)
+                j += 1
+        # Return types of all columns (as list or str), partition column indices (as list or str)
+        # and partition column types (as list or str).
+        return data_column_types, partition_indices, partition_types, new_partition_columns
 
     def _get_kwargs_str(self, kwargs):
         """
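To make the new helper's contract concrete, here is a standalone re-implementation of its string-building logic with the column types supplied directly (the real method reads them from the DataFrame's SQLAlchemy metadata):

```python
# Standalone sketch of _get_data_col_types_and_partition_col_indices_and_types;
# types are hardcoded here instead of coming from SQLAlchemy metadata.
def partition_info(columns, types, partition_columns, idx_delim=",", types_delim="--"):
    data_column_types, partition_indices, partition_types = "", "", ""
    j = 0
    for i, (col, _type) in enumerate(zip(columns, types)):
        data_column_types += _type if i == 0 else f"{types_delim}{_type}"
        if col in partition_columns:
            partition_indices += str(i) if j == 0 else f"{idx_delim}{i}"
            partition_types += _type if j == 0 else f"{types_delim}{_type}"
            j += 1
    return data_column_types, partition_indices, partition_types

print(partition_info(["id", "x", "y"], ["int", "float", "float"], ["id"]))
# ('int--float--float', '0', 'int')
```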
@@ -357,6 +373,23 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         Internal function to get attributes of all sklearn model objects when multiple models are
         generated by fit.
         """
+
+        def __generate_model_object(model_obj_value):
+            """
+            Internal function to generate _SkLearnWrapperObject model object from model_obj_value.
+            """
+            # Create _SkLearnObjectWrapper object from opensource model object.
+            model_obj = self.__class__(model=first_atrribute_instance)
+            model_obj.modelObj = model_obj_value
+            model_obj._is_model_installed = True
+
+            # Setting other model attributes.
+            model_obj._is_default_partition_value_fit = self._is_default_partition_value_fit
+            model_obj._is_default_partition_value_predict = self._is_default_partition_value_predict
+            model_obj._fit_partition_colums_non_default = self._fit_partition_colums_non_default
+            model_obj._fit_partition_unique_values = self._fit_partition_unique_values
+            return model_obj
+
         # Wrapper function to invoke dynamic method, using arguments
         # passed by user, on model in each row.
         def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
@@ -364,36 +397,58 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             for i in range(multi_models.shape[0]):
                 curr_model = multi_models.iloc[i]["model"]
                 multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
+
+            first_function_instance = multi_models.at[0, "model"]
+            if self.__class__._validate_model_supportability(first_function_instance):
+                return __generate_model_object(multi_models)
+
             return multi_models.rename(columns={"model": name})
 
-        # Identify if attribute is callable or not to avoid
-        # this check in loop for every model.
-        is_attr_callable = False
         # Assuming that self.modelObj will have at least 1 row.
-        is_attr_callable = callable(getattr(self.modelObj.iloc[0]["model"], name))
 
-        #
+        # Get attribute instance from first model object.
+        first_atrribute_instance = getattr(self.modelObj.iloc[0]["model"], name)
+
+        # If first_atrribute_instance is callable, it should be applied on model in each row
         # using passed arguments.
-        if
+        if callable(first_atrribute_instance):
             return __sklearn_method_invoker_for_multimodel
 
         output_attributes = self.modelObj.copy()
         for i in range(output_attributes.shape[0]):
             model = output_attributes.iloc[i]["model"]
             output_attributes.at[i, "model"] = getattr(model, name)
+
+        if self.__class__._validate_model_supportability(first_atrribute_instance):
+            return __generate_model_object(output_attributes)
+
         return output_attributes.rename(columns={"model": name})
 
     def __getattr__(self, name):
        # This just run attributes (functions and properties) from sklearn object.
         def __sklearn_method_invoker(*c, **kwargs):
-
+            # sklearn model is returned from the function call. Create _SkLearnObjectWrapper object.
+            model_obj = attribute_instance(*c, **kwargs)
+            if self.__class__._validate_model_supportability(model_obj):
+                model_obj = self.__class__(model=model_obj)
+                model_obj._is_model_installed = True  # Trained model is returned by function call.
+            return model_obj
+
         if isinstance(self.modelObj, pd.DataFrame):
             return self.__get_obj_attributes_multi_model(name)
 
-
-
+        attribute_instance = getattr(self.modelObj, name)
+
+        if callable(attribute_instance):
             return __sklearn_method_invoker
-
+
+        if self.__class__._validate_model_supportability(attribute_instance):
+            # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
+            model_obj = self.__class__(model=attribute_instance)
+            model_obj._is_model_installed = True  # Trained model is returned as attribute.
+            return model_obj
+
+        return attribute_instance
 
     @classmethod
     def _validate_model_supportability(cls, model):
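The `__getattr__` changes follow the standard delegation pattern: unknown attributes are resolved on the wrapped sklearn object, callables are intercepted so that results which are themselves estimators can be re-wrapped, and plain values pass through. A stripped-down sketch (the `get_params` check stands in for `_validate_model_supportability`):

```python
class EstimatorProxy:
    """Minimal delegation sketch, not the teradataml implementation."""
    def __init__(self, model):
        self._model = model

    def __getattr__(self, name):
        attr = getattr(self._model, name)  # AttributeError propagates naturally
        if callable(attr):
            def invoker(*args, **kwargs):
                result = attr(*args, **kwargs)
                # Re-wrap results that look like estimators, e.g. fit() returning self.
                return EstimatorProxy(result) if hasattr(result, "get_params") else result
            return invoker
        return attr
```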
@@ -404,15 +459,25 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
                                          "The given model is not a supported opensource model.")
         msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
+        package_name = None
+        class_name = None
         try:
             # For scikit-learn, model.__module__ is similar to 'sklearn.linear_model._base'.
             # TODO: check for other supported packages.
-            if model
-
+            if hasattr(model, "__module__"):
+                package_name = model.__module__.split(".")[0]
+                if package_name not in OpenSourcePackage.values():
+                    return False
+            if hasattr(model, "__class__"):
+                class_name = model.__class__.__name__
         except Exception as ex:
             # If in case, model.__module__ fails.
             raise TeradataMlException(error_msg, msg_code) from ex
 
+        # True only if package name is opensource package name and class name is not internal class.
+        return True if package_name and class_name and \
+            package_name == cls.OPENSOURCE_PACKAGE_NAME.value and not class_name.startswith("_") else False
+
     def _save_model(self, model_name, replace_if_exists=False):
         """
         Internal function to save the model stored in file at location mentioned by class variable
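The rewritten check relies on the convention that `__module__` begins with the defining package, e.g. `'sklearn.linear_model._base'`, and that sklearn's public estimators have non-underscore class names. A quick illustration (requires scikit-learn):

```python
from sklearn.linear_model import LinearRegression

model = LinearRegression()
package = model.__module__.split(".")[0]  # 'sklearn'
cls_name = model.__class__.__name__       # 'LinearRegression'

# Mirrors the logic above: known package and a public (non-underscore) class.
print(package == "sklearn" and not cls_name.startswith("_"))  # True
```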
@@ -423,7 +488,8 @@
         conn = get_connection()
         osml_models_table_exists = conn.dialect.has_table(conn,
                                                           table_name=_OSML_MODELS_TABLE_NAME,
-                                                          schema=self._db_name)
+                                                          schema=self._db_name,
+                                                          table_only=True)
         if not osml_models_table_exists:
             all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
             all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
@@ -471,7 +537,11 @@
         Internal function to create an instance of the class using the model and deploy
         the model to Vantage.
         """
-        cls._validate_model_supportability(model=model)
+        is_model_supportable = cls._validate_model_supportability(model=model)
+        if not is_model_supportable:
+            raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED,
+                                      "deploy", "The given model is not a supported opensource model."),
+                                      MessageCodes.MODEL_CATALOGING_OPERATION_FAILED)
 
         cls = cls(model=model)
         # Load the model file into Vantage node as file can be used in
@@ -817,7 +887,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                         for col in new_partition_columns] + [("model", model_type)]
 
         file_name = "sklearn_fit.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         if classes:
             class_type = type(classes[0]).__name__
@@ -825,15 +894,15 @@
         else:
             classes = str(None)
             class_type = str(None)
-
-
-        self.
+
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
         # db_name is applicable for enterprise system.
         db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
         py_exc = UtilFuncs._get_python_execution_path()
         script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
-                         f"{len(label_columns)} {
+                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
                          f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
 
         # Get unique values in partitioning columns.
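With the helper's outputs spliced in, the fit command line now carries the partition indices and the `--`-separated column types. For a toy configuration (all values below are illustrative) it expands to:

```python
# Illustrative expansion of script_command with made-up values.
py_exc, db_file_name, func = "python3", "./mydb/sklearn_fit.py", "fit"
feature_columns, label_columns = ["x1", "x2"], ["y"]
partition_indices_str, data_column_types_str = "0", "int--float--float"
prefix, classes, class_type, is_lake = "model_abc", "None", "None", False

script_command = (f"{py_exc} {db_file_name} {func} {len(feature_columns)} "
                  f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "
                  f"{prefix} {classes} {class_type} {is_lake}")
print(script_command)
# python3 ./mydb/sklearn_fit.py fit 2 1 0 int--float--float model_abc None None False
```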
@@ -852,6 +921,13 @@
         self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
                                          for l_c in label_columns]
 
+        # If the model is trained a second time after the object creation,
+        # or if set_params() is called after the first model training,
+        # this flag will reset to False. So that for subsequent predict/score
+        # operations, the newly trained model will be installed.
+        if self._is_trained_model_installed:
+            self._is_trained_model_installed = False
+
     def partial_fit(self, X=None, y=None, classes=None, **kwargs):
         """
         Please check the description in Docs/OpensourceML/sklearn.py.
@@ -972,7 +1048,6 @@
                                                  feature_columns,
                                                  label_columns,
                                                  func_name,
-                                                 n_partitions,
                                                  kwargs):
         """
         Internal function to return list of column names and their sqlalchemy types
@@ -1010,7 +1085,7 @@
 
         # For paritioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
         # Just for getting the number of columns and their types, using only one model of all.
-        if
+        if len(self._fit_partition_unique_values) == 1:
             # Single model case.
             skl_obj = self.modelObj
         else:
@@ -1038,11 +1113,10 @@
                 "path() returns tuple of ndarrays of different shapes. Not Implemented yet."
             )
 
-
-
-
-
-        if isinstance(trans_opt, csr_matrix):
+        if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
+            trans_opt = trans_opt.reshape(X.shape[0], 1)
+
+        if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
             no_of_columns = trans_opt.get_shape()[1]
             trans_opt = trans_opt.toarray()
         elif isinstance(trans_opt, dict):
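This branch normalizes the output shapes sklearn can hand back: a 1-D prediction array becomes a single-column 2-D array, and both CSR and CSC sparse matrices (the old code handled only CSR) are densified. For example:

```python
import numpy as np
from scipy.sparse import csc_matrix

pred = np.array([1.0, 2.0, 3.0])        # 1-D output, shape (3,)
pred = pred.reshape(pred.shape[0], 1)   # -> shape (3, 1)

sparse = csc_matrix(np.eye(3))          # e.g. a sparse transform output
if type(sparse).__name__ in ["csr_matrix", "csc_matrix"]:
    no_of_columns = sparse.get_shape()[1]  # 3
    dense = sparse.toarray()
```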
@@ -1054,6 +1128,14 @@
         else:
             no_of_columns = 1
 
+        # Special handling when inverse_transform of no_of_columns returns no of rows
+        # less than the no of classes. Such columns are filled with NaN values.
+        # Updating number of columns here (new columns with NaN values will be added).
+        if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
+            no_of_columns = len(self.classes_)
+            for i in range(len(ten_row_data)):
+                trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
+
         # Special handling required for cross_decomposition classes's transform function, which
         # takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
         # y_scores. If label columns are not provided, only x_scores are returned.
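`MultiLabelBinarizer.inverse_transform` returns one tuple per row whose length depends on how many labels that row carries, so rows are padded with NaN up to the total class count before the results are tabulated. Roughly:

```python
import numpy as np

classes = ["a", "b", "c"]
rows = [("a",), ("a", "c"), ()]  # variable-length inverse_transform output
no_of_columns = len(classes)
padded = [r + tuple([np.nan] * (no_of_columns - len(r))) for r in rows]
# [('a', nan, nan), ('a', 'c', nan), (nan, nan, nan)]
```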
@@ -1084,6 +1166,30 @@
         # Get new column sqlalchemy types for pandas df columns of transform output.
         opt_pd = pd.DataFrame(trans_opt)
 
+        # Get output column types for each column in pandas df from the output of transform
+        # type functions.
+        types = {}
+        for idx, col in enumerate(list(opt_pd.columns)):
+            # Get type of column using data from all rows, in case if the column has None values.
+            # 'and' of types of all values in the column with type(None) gives the type of the column.
+            type_ = type(None)
+            for i in range(len(trans_opt)):
+                type_ = type_ and type(trans_opt[i][idx])
+
+            # If all the values of the output (trans_opt) is None, then use `str` as type since
+            # pandas astype() does not accept None type.
+            if type_ is type(None):
+                type_ = str
+
+            # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
+            # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
+            # Error while type casting for column '2'"
+            # Hence, using pd.Int64Dtype() for integer columns with nan values.
+            types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
+
+        # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
+        opt_pd = opt_pd.astype(types)
+
         # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
         # TIMESTAMP(timezone=True) else map it according to default value.
         col_types = [TIMESTAMP(timezone=True)
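The key detail in the new typing loop is the last line: integer columns that contain NaN cannot be cast to plain `int64`, so pandas' nullable `Int64` extension dtype is used instead. A compact demonstration:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0]})

# df.astype({"a": np.int64}) would raise
# "Cannot convert non-finite values (NA or inf) to integer".
df = df.astype({"a": pd.Int64Dtype()})  # nullable integer column
print(df["a"].tolist())                 # [1, <NA>, 3]
```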
@@ -1118,26 +1224,29 @@
                                              partition_columns)
 
         file_name = "sklearn_score.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
 
-
-        self.
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
         self._validate_unique_partition_values(data, new_partition_columns)
 
         py_exc = UtilFuncs._get_python_execution_path()
         script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {
+                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
                          f"{self._model_file_name_prefix} {self._is_lake_system}"
 
         # score, aic, bic returns float values.
         return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in new_partition_columns] + [(func_name, FLOAT())]
 
-
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
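Paired with the reset in the fit path above, `_is_trained_model_installed` implements a lazy-upload protocol: fitting (or `set_params()`) invalidates the flag, and the first subsequent score/predict/transform uploads the freshly trained model exactly once. In outline (a sketch, with `upload` as an illustrative callable):

```python
class LazyModelUpload:
    """Sketch of the upload-once-after-fit protocol used above."""
    def __init__(self, upload):
        self._upload = upload          # illustrative stand-in for
        self._installed = False        # _install_initial_model_file()

    def fit(self, data):
        ...                            # train, producing a new model file
        self._installed = False        # new model -> previous upload is stale

    def score(self, data):
        if not self._installed:        # first use after (re)fit uploads once
            self._upload()
            self._installed = True
        ...                            # run scoring against the uploaded model
```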
@@ -1186,19 +1295,18 @@
             kwargs.pop("label_columns")
 
         file_name = "sklearn_transform.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
 
-
-        self.
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
         self._validate_unique_partition_values(data, new_partition_columns)
 
         py_exc = UtilFuncs._get_python_execution_path()
         script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {
+                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
                          f"{self._model_file_name_prefix} {self._is_lake_system}"
 
         # Returning feature columns also along with transformed columns because we don't know the
@@ -1208,15 +1316,18 @@
         if func_name in ["predict", "decision_function"] and label_columns:
             return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                              for col in label_columns]
+
         return_types += self._get_return_columns_for_function_(data,
                                                                feature_columns,
                                                                label_columns,
                                                                func_name,
-                                                               len(new_partition_columns),
                                                                kwargs)
 
-        #
-
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1253,7 +1364,6 @@
                                                                feature_columns,
                                                                label_columns,
                                                                func_name,
-                                                               len(new_partition_columns),
                                                                {})
         else:
             # If there are no label_columns, we will have only one
@@ -1261,22 +1371,25 @@
             return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
 
         file_name = "sklearn_fit_predict.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
-
-        self.
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
         script_file_name = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
         py_exc = UtilFuncs._get_python_execution_path()
         script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {
+                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
                          f"{self._model_file_name_prefix} {self._is_lake_system}"
 
         # Get unique values in partitioning columns.
         self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
-
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1354,7 +1467,6 @@
         args_str = self._get_kwargs_str(kwargs)
 
         file_name = "sklearn_neighbors.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1377,18 +1489,22 @@
         else:
             return_types += [("output", VARCHAR())]
 
-
-        self.
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
         py_exc = UtilFuncs._get_python_execution_path()
         script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{
+                         f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
                          f"{args_str}"
 
         # Get unique values in partitioning columns.
         self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
-
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1472,7 +1588,6 @@
                                              group_columns)
 
         file_name = "sklearn_model_selection_split.py"
-        self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1496,18 +1611,22 @@
         return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in new_partition_columns] + return_types
 
-
-        self.
+        data_column_types_str, partition_indices_str, _, new_partition_columns = \
+            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
         py_exc = UtilFuncs._get_python_execution_path()
         script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {len(group_columns)} {
+                         f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
                          f"{self._model_file_name_prefix} {self._is_lake_system}"
 
         # Get unique values in partitioning columns.
         self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
-
+        # Checking the trained model installation. If not installed,
+        # install it and set flag to True.
+        if not self._is_trained_model_installed:
+            self._install_initial_model_file()
+            self._is_trained_model_installed = True
 
         opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1586,19 +1705,25 @@ class _SKLearnFunctionWrapper(_GenericObjectWrapper):
 
         self.__params = kwargs
 
-        # Get indices and types of
-
-
+        # Get indices of partition_columns and types of all columns.
+        data_column_types_str, partition_indices_str, _, partition_cols = \
+            self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)
 
         script_file_path = f"{self._model_file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{self._model_file_name}"
+
+        model_file_prefix = None
+        if self._is_lake_system:
+            model_file_prefix = self._model_file_name.replace(".py", "")
+
         py_exc = UtilFuncs._get_python_execution_path()
-        script_command = (f"{py_exc} {script_file_path} {
-                          f" "
-
+        script_command = (f"{py_exc} {script_file_path} {partition_indices_str} "\
+                          f"{data_column_types_str} {data_args_str} {self._is_lake_system}"\
+                          f" {model_file_prefix}")
 
-
-
+        model_type = BLOB() if self._is_lake_system else CLOB()
+        return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in partition_cols] + [(self.__func_name, model_type)]
 
         # Generate new file in .teradataml directory and install it to Vantage.
         self._prepare_and_install_file()
@@ -1613,23 +1738,30 @@
 
         # File cleanup after processing.
         os.remove(self._model_file_local)
-
-                    force_remove=True)
+        self._remove_script_file(self._model_file_name)
 
         return self.modelObj
 
     def _prepare_data_args_string(self, kwargs):
+        """
+        Get column indices and types of each data related arguments in the format:
+        "{<arg_name>-<comma separated indices>-<comma separated types>}--
+         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        """
         data_args_str = []
         for arg_name in list(self.__data_args.keys()):
             # Remove DataFrame arguments from kwargs, which will be passed to Script.
             kwargs.pop(arg_name)
 
             # Get column indices and their types for each dataframe from parent dataframe.
-
-
-
-
-
+            _, partition_indices_str, partition_types_str, _ = \
+                self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
+                                                                             self.__data_args[arg_name].columns,
+                                                                             idx_delim=",",
+                                                                             types_delim=",")
+
+            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
+            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
 
         # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
         # {<arg_name>-<comma separated indices>-<comma separated types>}"
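The docstring added to `_prepare_data_args_string` pins down the wire format used to describe DataFrame arguments to the generated script: per-argument `<arg_name>-<indices>-<types>` blocks joined by `--`, with commas inside each block. Assuming two hypothetical arguments it would look like:

```python
# Hypothetical illustration of the documented argument-string format.
args = [("X", "0,1", "float,float"), ("y", "2", "int")]
data_args_str = "--".join(f"{name}-{idx}-{types}" for name, idx, types in args)
print(data_args_str)  # X-0,1-float,float--y-2-int
```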
@@ -1650,7 +1782,7 @@ class _SKLearnFunctionWrapper(_GenericObjectWrapper):
 
     def _prepare_and_install_file(self):
         """
-        Prepare function script file from template file and install it in
+        Prepare function script file from template file and install it in Vantage.
         """
         with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
             script_data = fp.read()