teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +193 -1
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +25 -18
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +20 -2
- teradataml/analytics/utils.py +15 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +341 -112
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +84 -42
- teradataml/automl/data_transformation.py +69 -33
- teradataml/automl/feature_engineering.py +76 -9
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +35 -14
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +122 -63
- teradataml/common/messagecodes.py +14 -3
- teradataml/common/messages.py +8 -4
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +366 -74
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +348 -86
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +45 -29
- teradataml/dataframe/data_transfer.py +72 -46
- teradataml/dataframe/dataframe.py +642 -166
- teradataml/dataframe/dataframe_utils.py +167 -22
- teradataml/dataframe/functions.py +135 -20
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +330 -78
- teradataml/dbutils/dbutils.py +556 -140
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -26
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +307 -40
- teradataml/scriptmgmt/lls_utils.py +428 -145
- teradataml/store/__init__.py +2 -3
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +48 -19
- teradataml/table_operators/Script.py +23 -2
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +49 -1
- teradataml/utils/internal_buffer.py +38 -0
- teradataml/utils/validators.py +377 -62
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
@@ -15,46 +15,42 @@
 #
 # ##################################################################
 
-from collections import OrderedDict, defaultdict
-from importlib import import_module
-
 import base64
 import json
-import numpy
 import os
 import pickle
-import time
-import inspect
 import warnings
-import
-import
+from collections import OrderedDict, defaultdict
+from importlib import import_module
+
 import pandas as pd
-from
-import
+from teradataml.scriptmgmt.lls_utils import list_user_envs
+from teradatasqlalchemy import BLOB, CLOB
 
-from teradataml import _TDML_DIRECTORY, Script, TeradataMlException
-from teradataml.
+from teradataml import _TDML_DIRECTORY, Apply, Script, TeradataMlException
+from teradataml.catalog.byom import delete_byom, retrieve_byom, save_byom
 from teradataml.common import pylogger
-from teradataml.common.
-from teradataml.
-from teradataml.dbutils.filemgr import install_file, remove_file
-from teradataml.utils.utils import execute_sql
-from teradataml.options.configure import configure
-from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
-    _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
-from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
-    _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
-    _OSML_ADDITIONAL_COLUMN_TYPES
+from teradataml.common.constants import TeradataConstants
+from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messagecodes import MessageCodes
 from teradataml.common.messages import Messages
-from teradataml.
-from teradataml.
-from teradataml.
+from teradataml.common.utils import UtilFuncs
+from teradataml.common.warnings import OneTimeUserWarning
+from teradataml.context.context import (_get_current_databasename,
+                                        get_connection)
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
-from teradataml.
-
-
+from teradataml.dbutils.dbutils import (_create_table,
+                                        execute_sql, set_session_param)
+from teradataml.dbutils.filemgr import install_file, remove_file
+from teradataml.opensource._constants import (
+    _OSML_ADDITIONAL_COLUMN_TYPES, _OSML_MODELS_PRIMARY_INDEX,
+    _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, _OSML_MODELS_TABLE_NAME,
+    OpensourceModels, OpenSourcePackage, _packages_verified_in_vantage)
+from teradataml.opensource._wrapper_utils import (_generate_new_name,
+                                                  _validate_df_query_type)
+from teradataml.options.configure import configure
+from teradataml.utils.validators import _Validators
 
 logger = pylogger.getLogger()
 
@@ -92,8 +88,15 @@ class _GenericObjectWrapper:
             self._env = configure.openml_user_env
         else:
             self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
-
-
+
+        # Check if the Python interpreter major versions are consistent between Vantage and local.
+        UtilFuncs._check_python_version_diff(self._env)
+
+        # Raise warning when python package versions don't match between Vantage and local.
+        # OPENSOURCE_PACKAGE_NAME is set for each opensource package, but not for the base class.
+        # Add a check to avoid running this function for the base class.
+        if self.OPENSOURCE_PACKAGE_NAME is not None:
+            UtilFuncs._check_package_version_diff(self.OPENSOURCE_PACKAGE_NAME.value, self._pkgs, self._env)
 
         global _file_installed
         ## Flag to check whether trained model is installed or not.
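The added checks above compare the Python interpreter and package versions between the local client and the Vantage user environment before running anything remotely. As a rough illustration only (not teradataml's implementation; the function and argument names below are hypothetical), a major-version consistency check can look like this:

    # Hedged sketch: warn when the remote environment's Python major version
    # differs from the local interpreter. "remote_version" is an assumed input.
    import sys
    import warnings

    def check_python_major_version(remote_version: str) -> None:
        """Warn if local and remote Python major versions differ."""
        local_major = sys.version_info.major
        remote_major = int(remote_version.split(".")[0])
        if local_major != remote_major:
            warnings.warn(
                f"Local Python {sys.version.split()[0]} and remote Python {remote_version} "
                "have different major versions; pickled models may not load correctly."
            )

    # Example: check_python_major_version("3.8.18")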
@@ -295,7 +298,7 @@ class _GenericObjectWrapper:
         elif n_unique_partitions > 1:
             self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
         else:
-            ValueError("Number of partitions should be greater than 0.")
+            raise ValueError("Number of partitions should be greater than 0.")
 
         warnings.filterwarnings("default")
 
@@ -813,41 +816,56 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
     def fit(self, **kwargs):
         pass
 
-    def _convert_arguments_to_modelObj(self, args,
+    def _convert_arguments_to_modelObj(self, args, partition_col_values=None):
         """
-        Internal function to
-
+        Internal function to get appropriate model from <argument>.modelObj when multiple models are
+        generated by fit, based on partition_col_values. If partition_col_values is None, then it is
+        single model case.
         """
         if isinstance(args, dict):
             new_args = args.copy() # To avoid updating
             for k, v in new_args.items():
-                if isinstance(v,
-
-
-                    new_args[k] = v.modelObj
-                else:
+                if isinstance(v, _OpenSourceObjectWrapper):
+                    arg_model_obj = v.modelObj
+                    if isinstance(arg_model_obj, pd.DataFrame):
                         # multi-model. Get appropriate model from modelObj.
-
-
-
+                        arg_partition_values_model_dict = v._get_partition_columns_to_model_dict()
+                        new_args[k] = arg_partition_values_model_dict[partition_col_values]
+                    else:
+                        # single model.
+                        new_args[k] = arg_model_obj
             return new_args
 
-
-        elif isinstance(args, tuple):
+        if isinstance(args, tuple):
             new_args = tuple()
             for arg in args:
                 if isinstance(arg, type(self)):
-
-
-                    new_args += (arg.modelObj,)
-                else:
+                    arg_model_obj = arg.modelObj
+                    if isinstance(arg_model_obj, pd.DataFrame):
                         # multi-model. Get appropriate model from modelObj.
-
+                        arg_partition_values_model_dict = arg._get_partition_columns_to_model_dict()
+                        new_args += (arg_partition_values_model_dict[partition_col_values],)
+                    else:
+                        # single model.
+                        new_args += (arg_model_obj,)
                 else:
                     new_args += (arg,)
             return new_args
         return args
 
+    def _get_partition_columns_to_model_dict(self):
+        """
+        Internal function to get partition columns to model dictionary.
+        """
+        partition_values_model_dict = {}
+        no_of_unique_partitions = len(self._fit_partition_unique_values)
+        no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
+
+        for i in range(no_of_unique_partitions):
+            partition_values_model_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
+
+        return partition_values_model_dict
+
     def __get_obj_attributes_multi_model(self, name):
         """
         Internal function to get attributes of all sklearn model objects when multiple models are
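The new `_get_partition_columns_to_model_dict` maps each unique combination of partition-column values to the fitted model stored in the matching row of `modelObj`. A standalone sketch of the same idea over a plain pandas DataFrame (illustrative column names, not the actual teradataml schema):

    # Hedged sketch: leading columns hold partition values, the "model" column
    # holds one fitted model object per partition.
    import pandas as pd

    def partition_values_to_model(models_df: pd.DataFrame, n_partition_cols: int) -> dict:
        """Return {(partition_value, ...): model} for every row of models_df."""
        mapping = {}
        for i in range(len(models_df)):
            key = tuple(models_df.iloc[i, :n_partition_cols])
            mapping[key] = models_df.iloc[i]["model"]
        return mapping

    # Example (hypothetical objects):
    # df = pd.DataFrame({"region": ["east", "west"], "model": [model_east, model_west]})
    # partition_values_to_model(df, 1)[("east",)] is model_east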
@@ -873,12 +891,17 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
         # Wrapper function to invoke dynamic method, using arguments
         # passed by user, on model in each row.
-        def
+        def __opensource_method_invoker_for_multimodel(*c, **kwargs):
+            """
+            Internal function to run functions not taking data related arguments but taking
+            arguments, which might contain other model objects.
+            """
             multi_models = self.modelObj.copy()
             for i in range(multi_models.shape[0]):
                 curr_model = multi_models.iloc[i]["model"]
-                partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
-
+                partition_values = tuple(multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list())
+
+                partition_values_joined = "_".join([str(x) for x in partition_values])
                 if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
                     # filename is first argument.
                     kwargs1 = kwargs.copy()
@@ -886,17 +909,19 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
                     if len(c) > 0:
                         c1 = list(c1)
-                        c1[0] = f"{c1[0]}_{
+                        c1[0] = f"{c1[0]}_{partition_values_joined}"
                         c1 = tuple(c1)
                     if len(kwargs) > 0 and kwargs.get("filename", None):
-                        kwargs1["filename"] = f"{kwargs1['filename']}_{
+                        kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values_joined}"
 
-
-
+                    pos_args = self._convert_arguments_to_modelObj(c1, partition_values)
+                    key_args = self._convert_arguments_to_modelObj(kwargs1, partition_values)
                 else:
-
-
-
+                    pos_args = self._convert_arguments_to_modelObj(c, partition_values)
+                    key_args = self._convert_arguments_to_modelObj(kwargs, partition_values)
+
+                multi_models.at[i, "model"] = getattr(curr_model, name)(*pos_args, **key_args)
+
             first_function_value = multi_models.at[0, "model"]
             if self.__class__._validate_model_supportability(first_function_value):
                 return __generate_model_object(multi_models, init_model_obj=first_function_value)
@@ -914,7 +939,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         # If first_atrribute_instance is callable, it should be applied on model in each row
         # using passed arguments.
         if callable(first_atrribute_instance):
-            return
+            return __opensource_method_invoker_for_multimodel
 
         output_attributes = self.modelObj.copy()
         for i in range(output_attributes.shape[0]):
@@ -928,7 +953,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
     def __getattr__(self, name):
         # This just run attributes (functions and properties) from opensource (sklearn/lightgbm) objects.
-        def
+        def __opensource_method_invoker(*c, **kwargs):
            # Opensource model is returned from the function call. Create _OpensourceObjectWrapper object.
            model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c), **self._convert_arguments_to_modelObj(kwargs))
            if self.__class__._validate_model_supportability(model_obj):
@@ -942,7 +967,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         attribute_instance = getattr(self.modelObj, name)
 
         if callable(attribute_instance):
-            return
+            return __opensource_method_invoker
 
         if self.__class__._validate_model_supportability(attribute_instance):
             # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
@@ -1003,7 +1028,9 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             fit_partition_columns_non_default=self._fit_partition_colums_non_default,
             model=self.modelObj,
             pos_args=self.pos_args,
-            key_args=self.kwargs
+            key_args=self.kwargs,
+            osml_class=self.__class__.__name__,
+            osml_module=self.__module__)
 
         # Saved the model object to a file to be used in save_byom() for writing to Vantage table.
         file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
@@ -1048,7 +1075,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         cls = cls(model=model)
         # Load the model file into Vantage node as file can be used in
         # predict or other operations.
-        cls._install_initial_model_file()
+        cls._install_initial_model_file(False)
 
         cls._save_model(model_name, replace_if_exists)
 
@@ -1079,9 +1106,16 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         # - 2nd contains package name.
         model_obj = pickle.loads(model_vals_list[0])
         model = model_obj.model
+        osml_module = model_obj.osml_module if hasattr(model_obj, "osml_module") else None
+        osml_class = model_obj.osml_class if hasattr(model_obj, "osml_class") else None
+
+        new_cls = cls
+        if osml_module is not None and osml_class is not None:
+            new_cls = getattr(import_module(osml_module), osml_class)
+
         package = model_vals_list[1]
 
-        if package !=
+        if package != new_cls.OPENSOURCE_PACKAGE_NAME.value:
             # Raise error if trying to access model of different package.
             raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
                                                            f". Requested model is from '{package}' package"),
@@ -1091,23 +1125,24 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             # Create a new instance of the class and set the model object to the instance.
             # Instantiation can take only model, not model object. Hence, passing one of the model
             # from pandas df. Updating modelObj and other fields later
-
-
-
+            new_cls = new_cls(model=model.iloc[1,2])
+            new_cls.modelObj = model
+            new_cls._fit_partition_unique_values = [lst[:len(model_obj.fit_partition_columns_non_default)]
+                                                    for lst in model.values.tolist()]
         else:
-
+            new_cls = new_cls(model=model)
 
-
-
-
-
-
+        new_cls._model_file_name_prefix = model_obj.partition_file_prefix
+        new_cls._is_default_partition_value_fit = model_obj.is_default_partition_value
+        new_cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
+        new_cls.pos_args = model_obj.pos_args
+        new_cls.kwargs = model_obj.key_args
 
         # Load the model file into Vantage node as file can be used in
         # predict or other operations.
-
+        new_cls._install_initial_model_file(False)
 
-        return
+        return new_cls
 
     def deploy(self, model_name, replace_if_exists=False):
         """
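The `_load_model` changes above store the wrapper's module and class names (`osml_module`/`osml_class`) alongside the pickled model and rebuild the class dynamically on load. A generic sketch of that importlib pattern, independent of teradataml:

    # Hedged sketch of dynamic class lookup by module and class name.
    from importlib import import_module

    def load_class(module_name: str, class_name: str):
        """Return the class object identified by its dotted module path and class name."""
        return getattr(import_module(module_name), class_name)

    # Example: LogisticRegression = load_class("sklearn.linear_model", "LogisticRegression")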
@@ -1136,962 +1171,89 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
|
|
|
1136
1171
|
"replace_if_exists" is set to False.
|
|
1137
1172
|
|
|
1138
1173
|
EXAMPLES:
|
|
1174
|
+
## sklearn examples.
|
|
1175
|
+
|
|
1176
|
+
# Import the required libraries and create LinearRegression Opensource object wrapper.
|
|
1139
1177
|
>>> from teradataml import td_sklearn
|
|
1140
1178
|
>>> model = td_sklearn.LinearRegression(normalize=True)
|
|
1141
1179
|
>>> model
|
|
1142
1180
|
LinearRegression(normalize=True)
|
|
1143
1181
|
|
|
1144
|
-
# Example 1: Deploy the model held by
|
|
1182
|
+
# Example 1: Deploy the model held by LinearRegression Opensource object to Vantage.
|
|
1145
1183
|
>>> lin_reg = model.deploy("linreg_model_ver_2")
|
|
1146
1184
|
Model is saved.
|
|
1147
1185
|
>>> lin_reg
|
|
1148
1186
|
LinearRegression(normalize=True)
|
|
1149
1187
|
|
|
1150
|
-
# Example 2: Deploy the model held by
|
|
1151
|
-
# as that of model that already existed in Vantage.
|
|
1188
|
+
# Example 2: Deploy the model held by LinearRegression Opensource object to Vantage
|
|
1189
|
+
# with the name same as that of model that already existed in Vantage.
|
|
1152
1190
|
>>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
|
|
1153
1191
|
Model is deleted.
|
|
1154
1192
|
Model is saved.
|
|
1155
1193
|
>>> lin_reg
|
|
1156
1194
|
LinearRegression(normalize=True)
|
|
1157
|
-
"""
|
|
1158
|
-
|
|
1159
|
-
# Install model file into Vantage, if not installed.
|
|
1160
|
-
self._install_initial_model_file()
|
|
1161
|
-
|
|
1162
|
-
self._save_model(model_name, replace_if_exists)
|
|
1163
|
-
return self
|
|
1164
|
-
|
|
1165
1195
|
|
|
1166
|
-
|
|
1196
|
+
## lightgbm examples.
|
|
1167
1197
|
|
|
1168
|
-
|
|
1198
|
+
# For lightGBM, there are two types of models created by `td_lightgbm` interface object.
|
|
1199
|
+
# - the model object created using LGBMClassifier or other class of lightgbm.sklearn module.
|
|
1200
|
+
# - the model object created using train() method (object of lightgbm.Booster class)
|
|
1201
|
+
# or standalone object of lightgbm.Booster class.
|
|
1169
1202
|
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
if model is not None:
|
|
1176
|
-
self.modelObj = model
|
|
1177
|
-
self.module_name = model.__module__.split("._")[0]
|
|
1178
|
-
self.class_name = model.__class__.__name__
|
|
1179
|
-
# __dict__ gets all the arguments as dictionary including default ones and positional
|
|
1180
|
-
# args.
|
|
1181
|
-
self.kwargs = model.__dict__
|
|
1182
|
-
self.pos_args = tuple() # Kept empty as all are moved to kwargs.
|
|
1183
|
-
else:
|
|
1184
|
-
self._initialize_object()
|
|
1203
|
+
# Import the required libraries and create LGBMClassifier Opensource object wrapper.
|
|
1204
|
+
>>> from teradataml import td_lightgbm
|
|
1205
|
+
>>> model = td_lightgbm.LGBMClassifier()
|
|
1206
|
+
>>> model
|
|
1207
|
+
LGBMClassifier()
|
|
1185
1208
|
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
"""
|
|
1192
|
-
_validate_opensource_func_args(X=X, y=y, groups=groups,
|
|
1193
|
-
fit_partition_cols=self._fit_partition_colums_non_default,
|
|
1194
|
-
kwargs=kwargs,
|
|
1195
|
-
skip_either_or_that=skip_either_or_that)
|
|
1196
|
-
return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
|
|
1197
|
-
fit_partition_cols=self._fit_partition_colums_non_default)
|
|
1209
|
+
# Example 1: Deploy the model held by LGBMClassifier Opensource object to Vantage.
|
|
1210
|
+
>>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2")
|
|
1211
|
+
Model is saved.
|
|
1212
|
+
>>> lgbm_cls
|
|
1213
|
+
LGBMClassifier()
|
|
1198
1214
|
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1215
|
+
# Example 2: Deploy the model held by LGBMClassifier Opensource object to Vantage with
|
|
1216
|
+
# the name same as that of model that already existed in Vantage.
|
|
1217
|
+
>>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2", replace_if_exists=True)
|
|
1218
|
+
Model is deleted.
|
|
1219
|
+
Model is saved.
|
|
1220
|
+
>>> lgbm_cls
|
|
1221
|
+
LGBMClassifier()
|
|
1222
|
+
|
|
1223
|
+
# Example 3: Deploy the model trained using td_lightgbm.train() function to Vantage.
|
|
1224
|
+
# Create Dataset object, assuming df_x and df_y are the feature and label teradataml
|
|
1225
|
+
# DataFrames.
|
|
1226
|
+
>>> lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)
|
|
1227
|
+
>>> lgbm_data
|
|
1228
|
+
<lightgbm.basic.Dataset object at ....>
|
|
1229
|
+
|
|
1230
|
+
# Train the model using `td_lightgbm` interface object.
|
|
1231
|
+
>>> model = td_lightgbm.train(params={}, train_set=lgbm_data, num_boost_round=30, valid_sets=[lgbm_data])
|
|
1232
|
+
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
|
|
1233
|
+
You can set `force_row_wise=true` to remove the overhead.
|
|
1234
|
+
And if memory is not enough, you can set `force_col_wise=true`.
|
|
1235
|
+
[LightGBM] [Info] Total Bins 532
|
|
1236
|
+
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 4
|
|
1237
|
+
[1] valid_0's l2: 0.215811
|
|
1238
|
+
[2] valid_0's l2: 0.188138
|
|
1239
|
+
[3] valid_0's l2: 0.166146
|
|
1240
|
+
...
|
|
1241
|
+
...
|
|
1242
|
+
[29] valid_0's l2: 0.042255
|
|
1243
|
+
[30] valid_0's l2: 0.0416953
|
|
1244
|
+
|
|
1245
|
+
# Deploy the model to Vantage.
|
|
1246
|
+
>>> lgb_model = model.deploy("lgbm_train_model_ver_2")
|
|
1247
|
+
>>> lgb_model
|
|
1248
|
+
<lightgbm.basic.Booster object at ...>
|
|
1209
1249
|
"""
|
|
1210
|
-
label_columns = self._get_columns_as_list(label_columns)
|
|
1211
|
-
|
|
1212
|
-
data, new_partition_columns = self._get_data_and_data_partition_columns(data,
|
|
1213
|
-
feature_columns,
|
|
1214
|
-
label_columns,
|
|
1215
|
-
partition_columns)
|
|
1216
|
-
|
|
1217
|
-
model_type = BLOB() if self._is_lake_system else CLOB()
|
|
1218
|
-
return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1219
|
-
for col in new_partition_columns] + [("model", model_type)]
|
|
1220
|
-
|
|
1221
|
-
if classes:
|
|
1222
|
-
class_type = type(classes[0]).__name__
|
|
1223
|
-
classes = "--".join([str(x) for x in classes])
|
|
1224
|
-
else:
|
|
1225
|
-
classes = str(None)
|
|
1226
|
-
class_type = str(None)
|
|
1227
|
-
|
|
1228
|
-
data_column_types_str, partition_indices_str, _, new_partition_columns = \
|
|
1229
|
-
self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
|
|
1230
|
-
|
|
1231
|
-
# db_name is applicable for enterprise system.
|
|
1232
|
-
db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
|
|
1233
|
-
py_exc = UtilFuncs._get_python_execution_path()
|
|
1234
|
-
script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
|
|
1235
|
-
f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
|
|
1236
|
-
f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
|
|
1237
|
-
|
|
1238
|
-
# Get unique values in partitioning columns.
|
|
1239
|
-
self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
|
|
1240
1250
|
|
|
1251
|
+
# Install model file into Vantage, if not installed.
|
|
1241
1252
|
self._install_initial_model_file()
|
|
1242
1253
|
|
|
1243
|
-
self.
|
|
1244
|
-
return_types)
|
|
1245
|
-
|
|
1246
|
-
self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
|
|
1247
|
-
|
|
1248
|
-
def partial_fit(self, X=None, y=None, classes=None, **kwargs):
|
|
1249
|
-
"""
|
|
1250
|
-
Please check the description in Docs/OpensourceML/sklearn.py.
|
|
1251
|
-
"""
|
|
1252
|
-
st_time = time.time()
|
|
1253
|
-
|
|
1254
|
-
# "classes" argument validation.
|
|
1255
|
-
arg_info_matrix = []
|
|
1256
|
-
arg_info_matrix.append(["classes", classes, True, (list)])
|
|
1257
|
-
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1258
|
-
|
|
1259
|
-
self._is_default_partition_value_fit = True # False when the user provides partition columns.
|
|
1260
|
-
|
|
1261
|
-
data, feature_columns, label_columns, _, partition_columns = \
|
|
1262
|
-
self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
|
|
1263
|
-
|
|
1264
|
-
if partition_columns:
|
|
1265
|
-
self._is_default_partition_value_fit = False
|
|
1266
|
-
self._fit_partition_colums_non_default = partition_columns
|
|
1267
|
-
|
|
1268
|
-
self._run_fit_related_functions(data,
|
|
1269
|
-
feature_columns,
|
|
1270
|
-
label_columns,
|
|
1271
|
-
partition_columns,
|
|
1272
|
-
inspect.stack()[0][3],
|
|
1273
|
-
classes)
|
|
1274
|
-
|
|
1275
|
-
self._partial_fit_execution_time = time.time() - st_time
|
|
1276
|
-
|
|
1277
|
-
return self
|
|
1278
|
-
|
|
1279
|
-
def fit(self, X=None, y=None, **kwargs):
|
|
1280
|
-
"""
|
|
1281
|
-
Please check the description in Docs/OpensourceML/sklearn.py.
|
|
1282
|
-
"""
|
|
1283
|
-
st_time = time.time()
|
|
1284
|
-
|
|
1285
|
-
self._is_default_partition_value_fit = True # False when the user provides partition columns.
|
|
1286
|
-
|
|
1287
|
-
data, feature_columns, label_columns, _, partition_columns = \
|
|
1288
|
-
self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
|
|
1289
|
-
|
|
1290
|
-
if partition_columns:
|
|
1291
|
-
self._is_default_partition_value_fit = False
|
|
1292
|
-
self._fit_partition_colums_non_default = partition_columns
|
|
1293
|
-
|
|
1294
|
-
file_name = kwargs.pop("file_name", None)
|
|
1295
|
-
func_name = kwargs.pop("name", "fit")
|
|
1296
|
-
|
|
1297
|
-
args = {"data": data,
|
|
1298
|
-
"feature_columns": feature_columns,
|
|
1299
|
-
"label_columns": label_columns,
|
|
1300
|
-
"partition_columns": partition_columns,
|
|
1301
|
-
"func": func_name}
|
|
1302
|
-
|
|
1303
|
-
if file_name is not None:
|
|
1304
|
-
args["file_name"] = file_name
|
|
1305
|
-
|
|
1306
|
-
self._run_fit_related_functions(**args)
|
|
1307
|
-
|
|
1308
|
-
self._fit_execution_time = time.time() - st_time
|
|
1309
|
-
|
|
1310
|
-
return self
|
|
1311
|
-
|
|
1312
|
-
def set_params(self, **params):
|
|
1313
|
-
"""
|
|
1314
|
-
Please check the description in Docs/OpensourceML/sklearn.py.
|
|
1315
|
-
"""
|
|
1316
|
-
for key, val in params.items():
|
|
1317
|
-
self.kwargs[key] = val
|
|
1318
|
-
|
|
1319
|
-
# Initialize with new arguments and return the class/model object.
|
|
1320
|
-
# set_params takes all keyword arguments and no positional arguments.
|
|
1321
|
-
self.__init__(None, self.module_name, self.class_name, tuple(), self.kwargs)
|
|
1254
|
+
self._save_model(model_name, replace_if_exists)
|
|
1322
1255
|
return self
|
|
1323
1256
|
|
|
1324
|
-
# get_params() will be executed through __getattr__().
|
|
1325
|
-
|
|
1326
|
-
# @_validate_fit_run
|
|
1327
|
-
def __getattr__(self, name):
|
|
1328
|
-
def __run_transform(*c, **kwargs):
|
|
1329
|
-
kwargs["name"] = name
|
|
1330
|
-
return self._transform(*c, **kwargs)
|
|
1331
|
-
|
|
1332
|
-
def __run_function_needing_all_rows(*c, **kwargs):
|
|
1333
|
-
kwargs["name"] = name
|
|
1334
|
-
return self._run_function_needing_all_rows(*c, **kwargs)
|
|
1335
|
-
|
|
1336
|
-
def __run_kneighbors(*c, **kwargs):
|
|
1337
|
-
kwargs["name"] = name
|
|
1338
|
-
return self._run_neighbors(*c, **kwargs)
|
|
1339
|
-
|
|
1340
|
-
if name in ["score", "aic", "bic", "perplexity"]:
|
|
1341
|
-
# TODO: ELE-6352 - Implement error_norm() function later.
|
|
1342
|
-
return __run_function_needing_all_rows
|
|
1343
|
-
|
|
1344
|
-
if name in ["kneighbors",
|
|
1345
|
-
"radius_neighbors",
|
|
1346
|
-
"kneighbors_graph",
|
|
1347
|
-
"radius_neighbors_graph"]:
|
|
1348
|
-
return __run_kneighbors
|
|
1349
|
-
|
|
1350
|
-
if name in ["predict",
|
|
1351
|
-
"transform",
|
|
1352
|
-
"inverse_transform",
|
|
1353
|
-
"predict_proba",
|
|
1354
|
-
"predict_log_proba",
|
|
1355
|
-
"decision_function",
|
|
1356
|
-
"score_samples",
|
|
1357
|
-
"decision_path",
|
|
1358
|
-
"apply",
|
|
1359
|
-
"cost_complexity_pruning_path",
|
|
1360
|
-
"gibbs",
|
|
1361
|
-
"kneighbors_graph",
|
|
1362
|
-
"radius_neighbors_graph",
|
|
1363
|
-
"mahalanobis",
|
|
1364
|
-
"correct_covariance",
|
|
1365
|
-
"reweight_covariance",
|
|
1366
|
-
"path"]:
|
|
1367
|
-
return __run_transform
|
|
1368
|
-
|
|
1369
|
-
return super().__getattr__(name)
|
|
1370
|
-
|
|
1371
|
-
def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
|
|
1372
|
-
func_name, **kwargs):
|
|
1373
|
-
"""
|
|
1374
|
-
Internal function to handle multi model case for transform function for functions
|
|
1375
|
-
["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of feature_selection module
|
|
1376
|
-
and "Birch" of cluster module.
|
|
1377
|
-
These functions generate multiple models and when transform is applied to each model, it generates
|
|
1378
|
-
output with different number of columns.
|
|
1379
|
-
"""
|
|
1380
|
-
skl_objs_dict = {}
|
|
1381
|
-
no_of_unique_partitions = len(self._fit_partition_unique_values)
|
|
1382
|
-
no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
|
|
1383
|
-
|
|
1384
|
-
# Run on 10 rows of data individually using corresponding scikit-learn objects based on paritition value
|
|
1385
|
-
# and get the maximum number of columns and their types.
|
|
1386
|
-
for i in range(no_of_unique_partitions):
|
|
1387
|
-
skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
data = data.select(feature_columns + label_columns + partition_columns)
|
|
1391
|
-
ten_row_data = data.head(10).get_values()
|
|
1392
|
-
X = numpy.array(ten_row_data)
|
|
1393
|
-
|
|
1394
|
-
# For multi-model case, model in one AMP can give more number of columns than other AMPs.
|
|
1395
|
-
# Returns clause can't contain different number of columns in different AMPs. Hence, taking
|
|
1396
|
-
# maximum number of columns and their types from all models.
|
|
1397
|
-
max_no_of_columns = 0
|
|
1398
|
-
max_col_names = []
|
|
1399
|
-
max_col_types = []
|
|
1400
|
-
|
|
1401
|
-
def _get_input_row_without_nans(row):
|
|
1402
|
-
"""
|
|
1403
|
-
`inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
|
|
1404
|
-
"""
|
|
1405
|
-
X1 = []
|
|
1406
|
-
for _, v in enumerate(row):
|
|
1407
|
-
if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
|
|
1408
|
-
# Add to list when:
|
|
1409
|
-
# - v is None or
|
|
1410
|
-
# - v is string or
|
|
1411
|
-
# - v is not nan or
|
|
1412
|
-
# - if module is impute (which transforms nan values) even though v is nan.
|
|
1413
|
-
X1.append(v)
|
|
1414
|
-
else:
|
|
1415
|
-
# skip nan values.
|
|
1416
|
-
pass
|
|
1417
|
-
return X1
|
|
1418
|
-
|
|
1419
|
-
for i in range(X.shape[0]):
|
|
1420
|
-
# Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
|
|
1421
|
-
partition_values = tuple(X[i, -no_of_partitioning_cols:])
|
|
1422
|
-
skl_obj = skl_objs_dict[partition_values]
|
|
1423
|
-
|
|
1424
|
-
X1 = X[i, :-no_of_partitioning_cols]
|
|
1425
|
-
# Since Nans/NULLs are added in transform for last columns where some models generated
|
|
1426
|
-
# less number of columns, removing Nans/NULLs from the input row for inverse_transform
|
|
1427
|
-
# using function _get_input_row_without_nans().
|
|
1428
|
-
X1 = numpy.array([_get_input_row_without_nans(X1)])
|
|
1429
|
-
|
|
1430
|
-
trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
|
|
1431
|
-
|
|
1432
|
-
no_of_columns = 1
|
|
1433
|
-
|
|
1434
|
-
if trans_opt.shape == (X1.shape[0],):
|
|
1435
|
-
trans_opt = trans_opt.reshape(X1.shape[0], 1)
|
|
1436
|
-
|
|
1437
|
-
if isinstance(trans_opt[0], numpy.ndarray) \
|
|
1438
|
-
or isinstance(trans_opt[0], list) \
|
|
1439
|
-
or isinstance(trans_opt[0], tuple):
|
|
1440
|
-
no_of_columns = len(trans_opt[0])
|
|
1441
|
-
|
|
1442
|
-
col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
|
|
1443
|
-
|
|
1444
|
-
# Get new column sqlalchemy types for pandas df columns of transform output.
|
|
1445
|
-
opt_pd = pd.DataFrame(trans_opt)
|
|
1446
|
-
|
|
1447
|
-
# Get output column types for each column in pandas df from the output of transform
|
|
1448
|
-
# type functions.
|
|
1449
|
-
types = {}
|
|
1450
|
-
for idx in range(no_of_columns):
|
|
1451
|
-
col = list(opt_pd.columns)[idx]
|
|
1452
|
-
|
|
1453
|
-
# Only one row in trans_opt.
|
|
1454
|
-
if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
|
|
1455
|
-
type_ = type(trans_opt[0][idx])
|
|
1456
|
-
else:
|
|
1457
|
-
# only one value in the output.
|
|
1458
|
-
type_ = type(trans_opt[0])
|
|
1459
|
-
|
|
1460
|
-
# If type of the output value (trans_opt) is None, then use `str` as type since
|
|
1461
|
-
# pandas astype() does not accept None type.
|
|
1462
|
-
if type_ is type(None):
|
|
1463
|
-
type_ = str
|
|
1464
|
-
|
|
1465
|
-
# numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
|
|
1466
|
-
# It raises error like "Cannot convert non-finite values (NA or inf) to integer:
|
|
1467
|
-
# Error while type casting for column '2'"
|
|
1468
|
-
# Hence, using pd.Int64Dtype() for integer columns with nan values.
|
|
1469
|
-
types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
|
|
1470
|
-
|
|
1471
|
-
# Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
|
|
1472
|
-
opt_pd = opt_pd.astype(types)
|
|
1473
|
-
|
|
1474
|
-
# If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
|
|
1475
|
-
# TIMESTAMP(timezone=True) else map it according to default value.
|
|
1476
|
-
col_types = [TIMESTAMP(timezone=True)
|
|
1477
|
-
if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
|
|
1478
|
-
else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
|
|
1479
|
-
for key, col_name in enumerate(list(opt_pd.columns))]
|
|
1480
|
-
|
|
1481
|
-
# Different models in multi model case can generate different number of output columns for example in
|
|
1482
|
-
# SelectFpr. Hence, taking the model which generates maximum number of columns.
|
|
1483
|
-
if no_of_columns > max_no_of_columns:
|
|
1484
|
-
max_no_of_columns = no_of_columns
|
|
1485
|
-
max_col_names = col_names
|
|
1486
|
-
max_col_types = col_types
|
|
1487
|
-
|
|
1488
|
-
return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
|
|
1489
|
-
|
|
1490
|
-
def _get_return_columns_for_function_(self,
|
|
1491
|
-
data,
|
|
1492
|
-
feature_columns,
|
|
1493
|
-
label_columns,
|
|
1494
|
-
partition_columns,
|
|
1495
|
-
func_name,
|
|
1496
|
-
kwargs):
|
|
1497
|
-
"""
|
|
1498
|
-
Internal function to return list of column names and their sqlalchemy types
|
|
1499
|
-
which should be used in return_types of Script.
|
|
1500
|
-
"""
|
|
1501
|
-
if func_name == "fit_predict":
|
|
1502
|
-
"""
|
|
1503
|
-
Get return columns using label_columns.
|
|
1504
|
-
"""
|
|
1505
|
-
return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
|
|
1506
|
-
data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1507
|
-
for i, col in enumerate(label_columns)]
|
|
1508
|
-
|
|
1509
|
-
if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
|
|
1510
|
-
"""
|
|
1511
|
-
Return predict columns using either label_columns (if provided) or
|
|
1512
|
-
self._fit_label_columns_types (if the function is trained using label columns).
|
|
1513
|
-
Otherwise run predict on ten rows of data to get the number of columns and their types
|
|
1514
|
-
after this if condition.
|
|
1515
|
-
"""
|
|
1516
|
-
if label_columns:
|
|
1517
|
-
return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
|
|
1518
|
-
data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1519
|
-
for i, col in enumerate(label_columns)]
|
|
1520
|
-
if self._fit_label_columns_types:
|
|
1521
|
-
return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
|
|
1522
|
-
for i, col_type in enumerate(self._fit_label_columns_types)]
|
|
1523
|
-
|
|
1524
|
-
## If function is not `fit_predict`:
|
|
1525
|
-
# then take one row of transform/other functions to execute in client
|
|
1526
|
-
# to get number of columns in return clause and their Vantage types.
|
|
1527
|
-
n_f = len(feature_columns)
|
|
1528
|
-
n_c = len(label_columns)
|
|
1529
|
-
|
|
1530
|
-
# For paritioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
|
|
1531
|
-
# Just for getting the number of columns and their types, using only one model of all.
|
|
1532
|
-
if len(self._fit_partition_unique_values) == 1:
|
|
1533
|
-
# Single model case.
|
|
1534
|
-
skl_obj = self.modelObj
|
|
1535
|
-
else:
|
|
1536
|
-
# Multi model case.
|
|
1537
|
-
if (func_name in ["transform", "inverse_transform"] and \
|
|
1538
|
-
self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
|
|
1539
|
-
(self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
|
|
1540
|
-
# Special handling for multi model case for transform function as these classes
|
|
1541
|
-
# generate transform output with different number of columns for each model.
|
|
1542
|
-
# Hence, need to add Nulls/Nans to columns which are not present in the transform output of
|
|
1543
|
-
# some models.
|
|
1544
|
-
return self._special_handling_multimodel_(data, feature_columns, label_columns,
|
|
1545
|
-
partition_columns, func_name, **kwargs)
|
|
1546
|
-
|
|
1547
|
-
skl_obj = self.modelObj.iloc[0]["model"]
|
|
1548
|
-
|
|
1549
|
-
data = data.select(feature_columns + label_columns)
|
|
1550
|
-
|
|
1551
|
-
ten_row_data = data.head(10).get_values()
|
|
1552
|
-
X = numpy.array(ten_row_data)
|
|
1553
|
-
if label_columns:
|
|
1554
|
-
y = X[:,n_f : n_f + n_c]
|
|
1555
|
-
X = X[:,:n_f]
|
|
1556
|
-
# predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
|
|
1557
|
-
# in local run if passed. Generally, 'y' is passed to return y along with actual output.
|
|
1558
|
-
try:
|
|
1559
|
-
trans_opt = getattr(skl_obj, func_name)(X, y, **kwargs)
|
|
1560
|
-
except TypeError as ex:
|
|
1561
|
-
# Function which does not accept 'y' like predict_proba() raises error like
|
|
1562
|
-
# "predict_proba() takes 2 positional arguments but 3 were given".
|
|
1563
|
-
trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
|
|
1564
|
-
else:
|
|
1565
|
-
trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
|
|
1566
|
-
|
|
1567
|
-
if func_name == "path":
|
|
1568
|
-
raise NotImplementedError(
|
|
1569
|
-
"path() returns tuple of ndarrays of different shapes. Not Implemented yet."
|
|
1570
|
-
)
|
|
1571
|
-
|
|
1572
|
-
if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
|
|
1573
|
-
trans_opt = trans_opt.reshape(X.shape[0], 1)
|
|
1574
|
-
|
|
1575
|
-
if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
|
|
1576
|
-
no_of_columns = trans_opt.get_shape()[1]
|
|
1577
|
-
trans_opt = trans_opt.toarray()
|
|
1578
|
-
elif isinstance(trans_opt, dict):
|
|
1579
|
-
raise NotImplementedError(f"Output returns dictionary {trans_opt}. NOT implemented yet.")
|
|
1580
|
-
elif isinstance(trans_opt[0], numpy.ndarray) \
|
|
1581
|
-
or isinstance(trans_opt[0], list) \
|
|
1582
|
-
or isinstance(trans_opt[0], tuple):
|
|
1583
|
-
no_of_columns = len(trans_opt[0])
|
|
1584
|
-
else:
|
|
1585
|
-
no_of_columns = 1
|
|
1586
|
-
|
|
1587
|
-
# Special handling when inverse_transform of no_of_columns returns no of rows
|
|
1588
|
-
# less than the no of classes. Such columns are filled with NaN values.
|
|
1589
|
-
# Updating number of columns here (new columns with NaN values will be added).
|
|
1590
|
-
if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
|
|
1591
|
-
no_of_columns = len(self.classes_)
|
|
1592
|
-
for i in range(len(ten_row_data)):
|
|
1593
|
-
trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
|
|
1594
|
-
|
|
1595
|
-
# Special handling required for cross_decomposition classes's transform function, which
|
|
1596
|
-
# takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
|
|
1597
|
-
# y_scores. If label columns are not provided, only x_scores are returned.
|
|
1598
|
-
if self.module_name == "sklearn.cross_decomposition" and func_name == "transform":
|
|
1599
|
-
# For cross_decomposition, output is a tuple of arrays when label columns are provided
|
|
1600
|
-
# along with feature columns for transform function. In this case, concatenate the
|
|
1601
|
-
# arrays and return the column names accordingly.
|
|
1602
|
-
if isinstance(trans_opt, tuple): # tuple when label_columns is provided.
|
|
1603
|
-
assert trans_opt[0].shape == trans_opt[1].shape,\
|
|
1604
|
-
"Output arrays should be of same shape when transform/fit_transform is run "\
|
|
1605
|
-
"with label columns for cross_decomposition classes.."
|
|
1606
|
-
first_cols = [f"x_scores_{(i + 1)}" for i in range(trans_opt[0].shape[1])]
|
|
1607
|
-
second_cols = [f"y_scores_{(i + 1)}" for i in range(trans_opt[1].shape[1])]
|
|
1608
|
-
no_of_columns = trans_opt[0].shape[1] + trans_opt[1].shape[1]
|
|
1609
|
-
col_names = first_cols + second_cols
|
|
1610
|
-
|
|
1611
|
-
trans_opt = numpy.concatenate(trans_opt, axis=1)
|
|
1612
|
-
else:
|
|
1613
|
-
assert isinstance(trans_opt, numpy.ndarray), "When transform/fit_transform is run "\
|
|
1614
|
-
"without label columns for cross_decomposition classes, "\
|
|
1615
|
-
"output should be a numpy array."
|
|
1616
|
-
no_of_columns = trans_opt.shape[1]
|
|
1617
|
-
col_names =[f"x_scores_{(i + 1)}" for i in range(trans_opt.shape[1])]
|
|
1618
|
-
else:
|
|
1619
|
-
# Generate list of new column names.
|
|
1620
|
-
col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
|
|
1621
|
-
|
|
1622
|
-
# Get new column sqlalchemy types for pandas df columns of transform output.
|
|
1623
|
-
opt_pd = pd.DataFrame(trans_opt)
|
|
1624
|
-
|
|
1625
|
-
# Get output column types for each column in pandas df from the output of transform
|
|
1626
|
-
# type functions.
|
|
1627
|
-
types = {}
|
|
1628
|
-
for idx, col in enumerate(list(opt_pd.columns)):
|
|
1629
|
-
# Get type of column using data from all rows, in case if the column has None values.
|
|
1630
|
-
# 'and' of types of all values in the column with type(None) gives the type of the column.
|
|
1631
|
-
type_ = type(None)
|
|
1632
|
-
for i in range(len(trans_opt)):
|
|
1633
|
-
type_ = type_ and type(trans_opt[i][idx])
|
|
1634
|
-
|
|
1635
|
-
# If all the values of the output (trans_opt) is None, thelen use `str` as type since
|
|
1636
|
-
# pandas astype() does not accept None type.
|
|
1637
|
-
if type_ is type(None):
|
|
1638
|
-
type_ = str
|
|
1639
|
-
|
|
1640
|
-
# numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
|
|
1641
|
-
# It raises error like "Cannot convert non-finite values (NA or inf) to integer:
|
|
1642
|
-
# Error while type casting for column '2'"
|
|
1643
|
-
# Hence, using pd.Int64Dtype() for integer columns with nan values.
|
|
1644
|
-
types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
|
|
1645
|
-
|
|
1646
|
-
# Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
|
|
1647
|
-
opt_pd = opt_pd.astype(types)
|
|
1648
|
-
|
|
1649
|
-
# If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
|
|
1650
|
-
# TIMESTAMP(timezone=True) else map it according to default value.
|
|
1651
|
-
col_types = [TIMESTAMP(timezone=True)
|
|
1652
|
-
if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
|
|
1653
|
-
else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
|
|
1654
|
-
for key, col_name in enumerate(list(opt_pd.columns))]
|
|
1655
|
-
|
|
1656
|
-
return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
|
|
1657
|
-
|
|
1658
|
-
@_validate_fit_run
|
|
1659
|
-
def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
|
|
1660
|
-
"""
|
|
1661
|
-
Internal function to run functions like score, aic, bic which needs all rows and return
|
|
1662
|
-
one floating number as result.
|
|
1663
|
-
"""
|
|
1664
|
-
st_time = time.time()
|
|
1665
|
-
|
|
1666
|
-
assert kwargs["name"], "function name should be passed."
|
|
1667
|
-
func_name = kwargs["name"]
|
|
1668
|
-
|
|
1669
|
-
# Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
|
|
1670
|
-
kwargs.pop("name")
|
|
1671
|
-
|
|
1672
|
-
data, feature_columns, label_columns, _, partition_columns = \
|
|
1673
|
-
self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
|
|
1674
|
-
|
|
1675
|
-
label_columns = self._get_columns_as_list(label_columns)
|
|
1676
|
-
|
|
1677
|
-
data, new_partition_columns = self._get_data_and_data_partition_columns(data,
|
|
1678
|
-
feature_columns,
|
|
1679
|
-
label_columns,
|
|
1680
|
-
partition_columns)
|
|
1681
|
-
|
|
1682
|
-
script_file_path = f"{file_name}" if self._is_lake_system \
|
|
1683
|
-
else f"./{self._db_name}/{file_name}"
|
|
1684
|
-
|
|
1685
|
-
data_column_types_str, partition_indices_str, _, new_partition_columns = \
|
|
1686
|
-
self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
|
|
1687
|
-
|
|
1688
|
-
self._validate_unique_partition_values(data, new_partition_columns)
|
|
1689
|
-
|
|
1690
|
-
py_exc = UtilFuncs._get_python_execution_path()
|
|
1691
|
-
script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
|
|
1692
|
-
f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
|
|
1693
|
-
f"{self._model_file_name_prefix} {self._is_lake_system}"
|
|
1694
|
-
|
|
1695
|
-
# score, aic, bic returns float values.
|
|
1696
|
-
return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1697
|
-
for col in new_partition_columns] + [(func_name, FLOAT())]
|
|
1698
|
-
|
|
1699
|
-
# Checking the trained model installation. If not installed,
|
|
1700
|
-
# install it and set flag to True.
|
|
1701
|
-
if not self._is_trained_model_installed:
|
|
1702
|
-
self._install_initial_model_file()
|
|
1703
|
-
self._is_trained_model_installed = True
|
|
1704
|
-
|
|
1705
|
-
opt = self._run_script(data, script_command, new_partition_columns, return_types)
|
|
1706
|
-
|
|
1707
|
-
self._score_execution_time = time.time() - st_time
|
|
1708
|
-
|
|
1709
|
-
if self._is_default_partition_value_fit:
|
|
1710
|
-
# For single model case, partition column is internally generated and
|
|
1711
|
-
# no point in returning it to the user.
|
|
1712
|
-
return opt.select(func_name)
|
|
1713
|
-
|
|
1714
|
-
return opt
|
|
1715
|
-
|
|
1716
|
-
-    @_validate_fit_run
-    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
-        """
-        Internal function to run predict/transform and similar functions, which returns
-        multiple columns. This function will return data row along with the generated
-        columns' row data, unlike sklearn's functions which returns just output data.
-        """
-        st_time = time.time()
-
-        assert kwargs["name"], "function name should be passed."
-        func_name = kwargs["name"]
-
-        # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
-        kwargs.pop("name")
-
-        data, feature_columns, label_columns, _, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns)
-
-        # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
-        self._remove_data_related_args_from_kwargs(kwargs)
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        self._validate_unique_partition_values(data, new_partition_columns)
-
-        return_columns_python_types = None
-        if self._fit_label_columns_python_types:
-            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
-
-        # Returning feature columns also along with transformed columns because we don't know the
-        # mapping of feature columns to the transformed columns.
-        ## 'correct_covariance()' returns the (n_features, n_features)
-        if func_name == "correct_covariance":
-            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                            for col in new_partition_columns]
-        else:
-            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                            for col in (new_partition_columns + feature_columns)]
-            if func_name in ["predict", "decision_function"] and label_columns:
-                return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                                 for col in label_columns]
-
-        output_cols_types = self._get_return_columns_for_function_(data,
-                                                                   feature_columns,
-                                                                   label_columns,
-                                                                   new_partition_columns,
-                                                                   func_name,
-                                                                   kwargs)
-        return_types += output_cols_types
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
-                         f"{return_columns_python_types}"
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        self._transform_execution_time = time.time() - st_time
-
-        return self._get_returning_df(opt, new_partition_columns, return_types)
-
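The removed _transform() path above returns the partition and feature columns alongside the generated output columns, since the row-to-output mapping is not otherwise recoverable. Continuing the hypothetical sketch above (column names are assumptions):

# Sketch only, reusing `model` and `train` from the earlier sketch.
predictions = model.predict(data=train,
                            feature_columns=["sepal_length", "sepal_width"])
# Expect the feature columns plus a generated prediction column,
# e.g. ['sepal_length', 'sepal_width', 'linearregression_predict_1'] (assumed name).
print(predictions.columns)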
-    def fit_predict(self, X=None, y=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        st_time = time.time()
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, label_columns, _, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
-
-        if partition_columns:
-            self._is_default_partition_value_fit = False
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns)
-
-        # Return label_columns also if user provides in the function call.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns + label_columns)]
-
-        func_name = inspect.stack()[0][3]
-        if label_columns:
-            return_types += self._get_return_columns_for_function_(data,
-                                                                   feature_columns,
-                                                                   label_columns,
-                                                                   new_partition_columns,
-                                                                   func_name,
-                                                                   {})
-        else:
-            # If there are no label_columns, we will have only one
-            # predicted column.
-            return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
-
-        file_name = "sklearn_fit_predict.py"
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        script_file_name = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {self._is_lake_system}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        self._fit_predict_execution_time = time.time() - st_time
-
-        if self._is_default_partition_value_fit:
-            # For single model case, partition column is internally generated and no point in
-            # returning it to the user.
-
-            # Extract columns from return types.
-            returning_cols = [col[0] for col in return_types[len(new_partition_columns):]]
-            return opt.select(returning_cols)
-
-        return opt
-
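The removed fit_predict() above trains and predicts in one server-side run; when partition_columns is supplied, one model is fitted per partition and those columns are kept in the output. A hedged sketch, reusing the assumed `train` DataFrame and column names from earlier:

# Sketch only: per-partition fit and predict in a single pass (column names assumed).
clusters = osml.KMeans(n_clusters=3).fit_predict(
    data=train,
    feature_columns=["sepal_length", "sepal_width"],
    partition_columns="species")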
-    def fit_transform(self, X=None, y=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        # 'y' is not needed for transform().
-        fit_obj = self.fit(X, y, **kwargs)
-        kwargs["label_columns"] = None
-        return fit_obj.transform(X, None, **kwargs)
-
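As the removed implementation shows, fit_transform() is simply fit() followed by transform() with the label columns cleared for the transform step. A hedged one-liner under the same assumed names:

# Sketch only: equivalent to fit() then transform() without label columns.
scaled = osml.StandardScaler().fit_transform(
    data=train, feature_columns=["sepal_length", "sepal_width"])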
-    @_validate_fit_run
-    def _run_neighbors(self, X=None, **kwargs):
-        """
-        Internal function to run functions like kneighbors, radius_neighbors, kneighbors_graph,
-        radius_neighbors_graph which returns multiple columns. This function will return data row
-        along with the generated columns' row data, unlike sklearn's functions which returns just
-        output data.
-        """
-        assert kwargs["name"], "function name should be passed."
-        func_name = kwargs["name"]
-        kwargs.pop("name")
-
-        if self.module_name != "sklearn.neighbors":
-            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not have {func_name}() method.")
-
-        data = kwargs.get("data", None)
-        partition_columns = kwargs.get("partition_columns", None)
-
-        if not X and not partition_columns and not data:
-            # If data is not passed, then run from client only.
-            # TODO: decide whether to run from client or from Vantage.
-            opt = super().__getattr__(func_name)(**kwargs)
-            from scipy.sparse.csr import csr_matrix
-            if isinstance(opt, csr_matrix):
-                return opt.toarray()
-            return opt
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, _, _, new_partition_columns = \
-            self._validate_args_and_get_data(X=X, y=None, groups=None, kwargs=kwargs,
-                                             skip_either_or_that=True)
-
-        # Remove the kwargs data.
-        self._remove_data_related_args_from_kwargs(kwargs)
-
-        if partition_columns:
-            # kwargs are passed to kneighbors function. So, removing them from kwargs.
-            self._is_default_partition_value_fit = False
-
-        # Generating new partition column name.
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                [],
-                                                                                partition_columns)
-
-        args_str = self._get_kwargs_str(kwargs)
-
-        file_name = "sklearn_neighbors.py"
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        # Returning feature columns also along with new columns.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns)]
-
-        # `return_distance` is needed as the result is a tuple of two arrays when it is True.
-        return_distance = kwargs.get("return_distance", True)  # Default value is True.
-
-        # Though new columns return numpy arrays, we are returning them as strings.
-        # TODO: Will update to columns later, if requested later.
-        if func_name in ['kneighbors', 'radius_neighbors']:
-            if return_distance:
-                return_types += [("neigh_dist", VARCHAR())]
-            return_types += [("neigh_ind", VARCHAR())]
-        elif func_name in ['kneighbors_graph', 'radius_neighbors_graph']:
-            return_types += [("A", VARCHAR())]
-        else:
-            return_types += [("output", VARCHAR())]
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
-                         f"{args_str}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        return self._get_returning_df(opt, new_partition_columns, return_types)
-
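The removed _run_neighbors() above backs kneighbors()/radius_neighbors() and their *_graph variants for sklearn.neighbors estimators; per its return_types, distances and indices come back as stringified arrays in VARCHAR columns ("neigh_dist", "neigh_ind"). A hedged sketch under the same assumed names:

# Sketch only: neighbor outputs arrive as VARCHAR columns, not numpy arrays.
nn = osml.NearestNeighbors(n_neighbors=3)
nn.fit(data=train, feature_columns=["sepal_length", "sepal_width"])
neighbors = nn.kneighbors(data=train,
                          feature_columns=["sepal_length", "sepal_width"])
# Expect the feature columns plus "neigh_dist" and "neigh_ind".
print(neighbors.columns)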
-    def split(self, X=None, y=None, groups=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        opt = self._run_model_selection("split", X=X, y=y, groups=groups,
-                                        skip_either_or_that=True, kwargs=kwargs)
-
-        # Get number of splits in the result DataFrame.
-        n_splits = opt.drop_duplicate("split_id").shape[0]
-
-        data = kwargs.get("data", None)
-        feature_columns = kwargs.get("feature_columns", [])
-        label_columns = self._get_columns_as_list(kwargs.get("label_columns", []))
-
-        # If there is not X and y, get feature_columns and label_columns for "data".
-        partition_columns = kwargs.get("partition_columns", [])
-        feature_columns = [col for col in X.columns if col not in partition_columns] \
-            if X and not data and not feature_columns else feature_columns
-        label_columns = y.columns if y and not data and not label_columns else label_columns
-
-        # Return iterator of the train and test dataframes for each split.
-        for i in range(1, n_splits+1):
-            train_df = opt[(opt.split_id == i) & (opt.data_type == "train")]\
-                .select(partition_columns + feature_columns + label_columns)
-            train_df._index_label = None
-            test_df = opt[(opt.split_id == i) & (opt.data_type == "test")]\
-                .select(partition_columns + feature_columns + label_columns)
-            test_df._index_label = None
-
-            yield train_df, test_df
-
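The removed split() above is a generator: each iteration yields a (train, test) pair of teradataml DataFrames, selected from the server-side result via its "split_id" and "data_type" columns. A hedged sketch under the same assumed names:

# Sketch only: iterate folds produced server-side.
kf = osml.KFold(n_splits=3)
for train_df, test_df in kf.split(data=train,
                                  feature_columns=["sepal_length", "sepal_width"],
                                  label_columns="petal_length"):
    print(train_df.shape, test_df.shape)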
-    def get_n_splits(self, X=None, y=None, groups=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        return self._run_model_selection("get_n_splits", X=X, y=y, groups=groups,
-                                         skip_either_or_that=True, kwargs=kwargs)
-
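Per the removed implementation below, get_n_splits() ultimately returns a plain Python int: the script emits the count as VARCHAR (very large counts such as LeavePOut's cannot be scoped to INTEGER) and the wrapper casts it back. A hedged sketch:

# Sketch only, same assumed names as above.
n = osml.KFold(n_splits=5).get_n_splits(data=train,
                                        feature_columns=["sepal_length", "sepal_width"])
assert isinstance(n, int)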
-    def _run_model_selection(self,
-                             func_name,
-                             X=None,
-                             y=None,
-                             groups=None,
-                             skip_either_or_that=False,
-                             kwargs={}):
-        """
-        Internal function to run functions like split, get_n_splits of model selection module.
-        - get_n_splits() returns number of splits as value, not as teradataml DataFrame.
-        - split() returns teradataml DataFrame containing train and test data for each split
-          (add partition information if the argument "partition_cols" is provided).
-        """
-        if self.module_name != "sklearn.model_selection":
-            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not "
-                                 f"have {func_name}() method.")
-
-        data = kwargs.get("data", None)
-
-        if not X and not y and not groups and not data:
-            # If data is not passed, then run from client only.
-            # TODO: decide whether to run from client or from Vantage.
-            return super().__getattr__(func_name)()
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, label_columns, group_columns, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=groups, kwargs=kwargs,
-                                             skip_either_or_that=skip_either_or_that)
-
-        if partition_columns:
-            self._is_default_partition_value_fit = False
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns,
-                                                                                group_columns)
-
-        file_name = "sklearn_model_selection_split.py"
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        if func_name == "split":
-            # Need to generate data into splits of train and test.
-            # split_id - the column which will be used to identify the split.
-            # data_type - the column which will be used to identify whether the row is
-            #             train or test row.
-            return_types = [("split_id", INTEGER()), ("data_type", VARCHAR())]
-            # Returning feature columns and label columns as well.
-            return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                             for col in (feature_columns + label_columns)]
-        else:
-            # Return Varchar by default.
-            # Returns Varchar even for functions like `get_n_splits` which returns large integer
-            # numbers like `4998813702034726525205100` for `LeavePOut` class (when the argument
-            # `p` is 28 and no of data rows is 100) as Vantage cannot scope it to INTEGER.
-            return_types = [(func_name, VARCHAR())]
-
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in new_partition_columns] + return_types
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {self._is_lake_system}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        if func_name == "get_n_splits" and not partition_columns:
-            # Return number of splits as value, not as dataframe.
-            vals = execute_sql("select {} from {}".format(func_name, opt._table_name))
-            opt = vals.fetchall()[0][0]
-
-            # Varchar is returned by the script. Convert it to int.
-            return int(opt)
-
-        return opt
-

class _FunctionWrapper(_GenericObjectWrapper):
    def __init__(self, module_name, func_name, file_type, template_file):
@@ -2151,10 +1313,3 @@ class _FunctionWrapper(_GenericObjectWrapper):
        self._remove_script_file(self._script_file_name)

        return self.modelObj
-
-
-class _SKLearnFunctionWrapper(_FunctionWrapper):
-    def __init__(self, module_name, func_name):
-        file_type = "file_fn_sklearn"
-        template_file = "sklearn_function.template"
-        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)