teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +193 -1
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +25 -18
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +20 -2
- teradataml/analytics/utils.py +15 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +341 -112
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +84 -42
- teradataml/automl/data_transformation.py +69 -33
- teradataml/automl/feature_engineering.py +76 -9
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +35 -14
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +122 -63
- teradataml/common/messagecodes.py +14 -3
- teradataml/common/messages.py +8 -4
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +366 -74
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +348 -86
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +45 -29
- teradataml/dataframe/data_transfer.py +72 -46
- teradataml/dataframe/dataframe.py +642 -166
- teradataml/dataframe/dataframe_utils.py +167 -22
- teradataml/dataframe/functions.py +135 -20
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +330 -78
- teradataml/dbutils/dbutils.py +556 -140
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -26
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +307 -40
- teradataml/scriptmgmt/lls_utils.py +428 -145
- teradataml/store/__init__.py +2 -3
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +48 -19
- teradataml/table_operators/Script.py +23 -2
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +49 -1
- teradataml/utils/internal_buffer.py +38 -0
- teradataml/utils/validators.py +377 -62
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
@@ -15,46 +15,42 @@
 #
 # ##################################################################
 
-from collections import OrderedDict, defaultdict
-from importlib import import_module
-
 import base64
 import json
-import numpy
 import os
 import pickle
-import time
-import inspect
 import warnings
-import
-import
+from collections import OrderedDict, defaultdict
+from importlib import import_module
+
 import pandas as pd
-from
-import
+from teradataml.scriptmgmt.lls_utils import list_user_envs
+from teradatasqlalchemy import BLOB, CLOB
 
-from teradataml import _TDML_DIRECTORY, Script, TeradataMlException
-from teradataml.
+from teradataml import _TDML_DIRECTORY, Apply, Script, TeradataMlException
+from teradataml.catalog.byom import delete_byom, retrieve_byom, save_byom
 from teradataml.common import pylogger
-from teradataml.common.
-from teradataml.
-from teradataml.dbutils.filemgr import install_file, remove_file
-from teradataml.utils.utils import execute_sql
-from teradataml.options.configure import configure
-from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
-    _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
-from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
-    _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
-    _OSML_ADDITIONAL_COLUMN_TYPES
+from teradataml.common.constants import TeradataConstants
+from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messagecodes import MessageCodes
 from teradataml.common.messages import Messages
-from teradataml.
-from teradataml.
-from teradataml.
+from teradataml.common.utils import UtilFuncs
+from teradataml.common.warnings import OneTimeUserWarning
+from teradataml.context.context import (_get_current_databasename,
+                                        get_connection)
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
-from teradataml.
-
-
+from teradataml.dbutils.dbutils import (_create_table,
+                                        execute_sql, set_session_param)
+from teradataml.dbutils.filemgr import install_file, remove_file
+from teradataml.opensource._constants import (
+    _OSML_ADDITIONAL_COLUMN_TYPES, _OSML_MODELS_PRIMARY_INDEX,
+    _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, _OSML_MODELS_TABLE_NAME,
+    OpensourceModels, OpenSourcePackage, _packages_verified_in_vantage)
+from teradataml.opensource._wrapper_utils import (_generate_new_name,
+                                                  _validate_df_query_type)
+from teradataml.options.configure import configure
+from teradataml.utils.validators import _Validators
 
 logger = pylogger.getLogger()
 
@@ -92,8 +88,15 @@ class _GenericObjectWrapper:
             self._env = configure.openml_user_env
         else:
             self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
-
-
+
+        # Check if the Python interpreter major versions are consistent between Vantage and local.
+        UtilFuncs._check_python_version_diff(self._env)
+
+        # Raise warning when python package versions don't match between Vantage and local.
+        # OPENSOURCE_PACKAGE_NAME is set for each opensource package, but not for the base class.
+        # Add a check to avoid running this function for the base class.
+        if self.OPENSOURCE_PACKAGE_NAME is not None:
+            UtilFuncs._check_package_version_diff(self.OPENSOURCE_PACKAGE_NAME.value, self._pkgs, self._env)
 
         global _file_installed
         ## Flag to check whether trained model is installed or not.
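The added checks above compare the Python interpreter and package versions between the local client and the Vantage user environment before running anything remotely. As a rough illustration only (not teradataml's implementation; the function and argument names below are hypothetical), a major-version consistency check can look like this:

    # Hedged sketch: warn when the remote environment's Python major version
    # differs from the local interpreter. "remote_version" is an assumed input.
    import sys
    import warnings

    def check_python_major_version(remote_version: str) -> None:
        """Warn if local and remote Python major versions differ."""
        local_major = sys.version_info.major
        remote_major = int(remote_version.split(".")[0])
        if local_major != remote_major:
            warnings.warn(
                f"Local Python {sys.version.split()[0]} and remote Python {remote_version} "
                "have different major versions; pickled models may not load correctly."
            )

    # Example: check_python_major_version("3.8.18")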
@@ -295,7 +298,7 @@ class _GenericObjectWrapper:
         elif n_unique_partitions > 1:
             self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
         else:
-            ValueError("Number of partitions should be greater than 0.")
+            raise ValueError("Number of partitions should be greater than 0.")
 
         warnings.filterwarnings("default")
 
@@ -813,41 +816,56 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
     def fit(self, **kwargs):
         pass
 
-    def _convert_arguments_to_modelObj(self, args,
+    def _convert_arguments_to_modelObj(self, args, partition_col_values=None):
         """
-        Internal function to
-
+        Internal function to get appropriate model from <argument>.modelObj when multiple models are
+        generated by fit, based on partition_col_values. If partition_col_values is None, then it is
+        single model case.
         """
         if isinstance(args, dict):
             new_args = args.copy() # To avoid updating
             for k, v in new_args.items():
-                if isinstance(v,
-
-
-                    new_args[k] = v.modelObj
-                else:
+                if isinstance(v, _OpenSourceObjectWrapper):
+                    arg_model_obj = v.modelObj
+                    if isinstance(arg_model_obj, pd.DataFrame):
                         # multi-model. Get appropriate model from modelObj.
-
-
-
+                        arg_partition_values_model_dict = v._get_partition_columns_to_model_dict()
+                        new_args[k] = arg_partition_values_model_dict[partition_col_values]
+                    else:
+                        # single model.
+                        new_args[k] = arg_model_obj
             return new_args
 
-
-        elif isinstance(args, tuple):
+        if isinstance(args, tuple):
             new_args = tuple()
             for arg in args:
                 if isinstance(arg, type(self)):
-
-
-                    new_args += (arg.modelObj,)
-                else:
+                    arg_model_obj = arg.modelObj
+                    if isinstance(arg_model_obj, pd.DataFrame):
                         # multi-model. Get appropriate model from modelObj.
-
+                        arg_partition_values_model_dict = arg._get_partition_columns_to_model_dict()
+                        new_args += (arg_partition_values_model_dict[partition_col_values],)
+                    else:
+                        # single model.
+                        new_args += (arg_model_obj,)
                 else:
                     new_args += (arg,)
             return new_args
         return args
 
+    def _get_partition_columns_to_model_dict(self):
+        """
+        Internal function to get partition columns to model dictionary.
+        """
+        partition_values_model_dict = {}
+        no_of_unique_partitions = len(self._fit_partition_unique_values)
+        no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
+
+        for i in range(no_of_unique_partitions):
+            partition_values_model_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
+
+        return partition_values_model_dict
+
     def __get_obj_attributes_multi_model(self, name):
         """
         Internal function to get attributes of all sklearn model objects when multiple models are
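The new `_get_partition_columns_to_model_dict` maps each unique combination of partition-column values to the fitted model stored in the matching row of `modelObj`. A standalone sketch of the same idea over a plain pandas DataFrame (illustrative column names, not the actual teradataml schema):

    # Hedged sketch: leading columns hold partition values, the "model" column
    # holds one fitted model object per partition.
    import pandas as pd

    def partition_values_to_model(models_df: pd.DataFrame, n_partition_cols: int) -> dict:
        """Return {(partition_value, ...): model} for every row of models_df."""
        mapping = {}
        for i in range(len(models_df)):
            key = tuple(models_df.iloc[i, :n_partition_cols])
            mapping[key] = models_df.iloc[i]["model"]
        return mapping

    # Example (hypothetical objects):
    # df = pd.DataFrame({"region": ["east", "west"], "model": [model_east, model_west]})
    # partition_values_to_model(df, 1)[("east",)] is model_east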
@@ -873,12 +891,17 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
         # Wrapper function to invoke dynamic method, using arguments
         # passed by user, on model in each row.
-        def
+        def __opensource_method_invoker_for_multimodel(*c, **kwargs):
+            """
+            Internal function to run functions not taking data related arguments but taking
+            arguments, which might contain other model objects.
+            """
             multi_models = self.modelObj.copy()
             for i in range(multi_models.shape[0]):
                 curr_model = multi_models.iloc[i]["model"]
-                partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
-
+                partition_values = tuple(multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list())
+
+                partition_values_joined = "_".join([str(x) for x in partition_values])
                 if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
                     # filename is first argument.
                     kwargs1 = kwargs.copy()
@@ -886,17 +909,19 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
                     if len(c) > 0:
                         c1 = list(c1)
-                        c1[0] = f"{c1[0]}_{
+                        c1[0] = f"{c1[0]}_{partition_values_joined}"
                         c1 = tuple(c1)
                     if len(kwargs) > 0 and kwargs.get("filename", None):
-                        kwargs1["filename"] = f"{kwargs1['filename']}_{
+                        kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values_joined}"
 
-
-
+                    pos_args = self._convert_arguments_to_modelObj(c1, partition_values)
+                    key_args = self._convert_arguments_to_modelObj(kwargs1, partition_values)
                 else:
-
-
-
+                    pos_args = self._convert_arguments_to_modelObj(c, partition_values)
+                    key_args = self._convert_arguments_to_modelObj(kwargs, partition_values)
+
+                multi_models.at[i, "model"] = getattr(curr_model, name)(*pos_args, **key_args)
+
             first_function_value = multi_models.at[0, "model"]
             if self.__class__._validate_model_supportability(first_function_value):
                 return __generate_model_object(multi_models, init_model_obj=first_function_value)
@@ -914,7 +939,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         # If first_atrribute_instance is callable, it should be applied on model in each row
         # using passed arguments.
         if callable(first_atrribute_instance):
-            return
+            return __opensource_method_invoker_for_multimodel
 
         output_attributes = self.modelObj.copy()
         for i in range(output_attributes.shape[0]):
@@ -928,7 +953,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
 
     def __getattr__(self, name):
         # This just run attributes (functions and properties) from opensource (sklearn/lightgbm) objects.
-        def
+        def __opensource_method_invoker(*c, **kwargs):
            # Opensource model is returned from the function call. Create _OpensourceObjectWrapper object.
            model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c), **self._convert_arguments_to_modelObj(kwargs))
            if self.__class__._validate_model_supportability(model_obj):
@@ -942,7 +967,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         attribute_instance = getattr(self.modelObj, name)
 
         if callable(attribute_instance):
-            return
+            return __opensource_method_invoker
 
         if self.__class__._validate_model_supportability(attribute_instance):
             # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
@@ -1003,7 +1028,9 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             fit_partition_columns_non_default=self._fit_partition_colums_non_default,
             model=self.modelObj,
             pos_args=self.pos_args,
-            key_args=self.kwargs
+            key_args=self.kwargs,
+            osml_class=self.__class__.__name__,
+            osml_module=self.__module__)
 
         # Saved the model object to a file to be used in save_byom() for writing to Vantage table.
         file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
@@ -1048,7 +1075,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         cls = cls(model=model)
         # Load the model file into Vantage node as file can be used in
         # predict or other operations.
-        cls._install_initial_model_file()
+        cls._install_initial_model_file(False)
 
         cls._save_model(model_name, replace_if_exists)
 
@@ -1079,9 +1106,16 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         # - 2nd contains package name.
         model_obj = pickle.loads(model_vals_list[0])
         model = model_obj.model
+        osml_module = model_obj.osml_module if hasattr(model_obj, "osml_module") else None
+        osml_class = model_obj.osml_class if hasattr(model_obj, "osml_class") else None
+
+        new_cls = cls
+        if osml_module is not None and osml_class is not None:
+            new_cls = getattr(import_module(osml_module), osml_class)
+
         package = model_vals_list[1]
 
-        if package !=
+        if package != new_cls.OPENSOURCE_PACKAGE_NAME.value:
             # Raise error if trying to access model of different package.
             raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
                                                            f". Requested model is from '{package}' package"),
@@ -1091,23 +1125,24 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             # Create a new instance of the class and set the model object to the instance.
             # Instantiation can take only model, not model object. Hence, passing one of the model
             # from pandas df. Updating modelObj and other fields later
-
-
-
+            new_cls = new_cls(model=model.iloc[1,2])
+            new_cls.modelObj = model
+            new_cls._fit_partition_unique_values = [lst[:len(model_obj.fit_partition_columns_non_default)]
+                                                    for lst in model.values.tolist()]
         else:
-
+            new_cls = new_cls(model=model)
 
-
-
-
-
-
+        new_cls._model_file_name_prefix = model_obj.partition_file_prefix
+        new_cls._is_default_partition_value_fit = model_obj.is_default_partition_value
+        new_cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
+        new_cls.pos_args = model_obj.pos_args
+        new_cls.kwargs = model_obj.key_args
 
         # Load the model file into Vantage node as file can be used in
         # predict or other operations.
-
+        new_cls._install_initial_model_file(False)
 
-        return
+        return new_cls
 
     def deploy(self, model_name, replace_if_exists=False):
         """
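The `_load_model` changes above store the wrapper's module and class names (`osml_module`/`osml_class`) alongside the pickled model and rebuild the class dynamically on load. A generic sketch of that importlib pattern, independent of teradataml:

    # Hedged sketch of dynamic class lookup by module and class name.
    from importlib import import_module

    def load_class(module_name: str, class_name: str):
        """Return the class object identified by its dotted module path and class name."""
        return getattr(import_module(module_name), class_name)

    # Example: LogisticRegression = load_class("sklearn.linear_model", "LogisticRegression")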
@@ -1136,962 +1171,89 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
|
|
|
1136
1171
|
"replace_if_exists" is set to False.
|
|
1137
1172
|
|
|
1138
1173
|
EXAMPLES:
|
|
1174
|
+
## sklearn examples.
|
|
1175
|
+
|
|
1176
|
+
# Import the required libraries and create LinearRegression Opensource object wrapper.
|
|
1139
1177
|
>>> from teradataml import td_sklearn
|
|
1140
1178
|
>>> model = td_sklearn.LinearRegression(normalize=True)
|
|
1141
1179
|
>>> model
|
|
1142
1180
|
LinearRegression(normalize=True)
|
|
1143
1181
|
|
|
1144
|
-
# Example 1: Deploy the model held by
|
|
1182
|
+
# Example 1: Deploy the model held by LinearRegression Opensource object to Vantage.
|
|
1145
1183
|
>>> lin_reg = model.deploy("linreg_model_ver_2")
|
|
1146
1184
|
Model is saved.
|
|
1147
1185
|
>>> lin_reg
|
|
1148
1186
|
LinearRegression(normalize=True)
|
|
1149
1187
|
|
|
1150
|
-
# Example 2: Deploy the model held by
|
|
1151
|
-
# as that of model that already existed in Vantage.
|
|
1188
|
+
# Example 2: Deploy the model held by LinearRegression Opensource object to Vantage
|
|
1189
|
+
# with the name same as that of model that already existed in Vantage.
|
|
1152
1190
|
>>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
|
|
1153
1191
|
Model is deleted.
|
|
1154
1192
|
Model is saved.
|
|
1155
1193
|
>>> lin_reg
|
|
1156
1194
|
LinearRegression(normalize=True)
|
|
1157
|
-
"""
|
|
1158
|
-
|
|
1159
|
-
# Install model file into Vantage, if not installed.
|
|
1160
|
-
self._install_initial_model_file()
|
|
1161
|
-
|
|
1162
|
-
self._save_model(model_name, replace_if_exists)
|
|
1163
|
-
return self
|
|
1164
|
-
|
|
1165
1195
|
|
|
1166
|
-
|
|
1196
|
+
## lightgbm examples.
|
|
1167
1197
|
|
|
1168
|
-
|
|
1198
|
+
# For lightGBM, there are two types of models created by `td_lightgbm` interface object.
|
|
1199
|
+
# - the model object created using LGBMClassifier or other class of lightgbm.sklearn module.
|
|
1200
|
+
# - the model object created using train() method (object of lightgbm.Booster class)
|
|
1201
|
+
# or standalone object of lightgbm.Booster class.
|
|
1169
1202
|
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
if model is not None:
|
|
1176
|
-
self.modelObj = model
|
|
1177
|
-
self.module_name = model.__module__.split("._")[0]
|
|
1178
|
-
self.class_name = model.__class__.__name__
|
|
1179
|
-
# __dict__ gets all the arguments as dictionary including default ones and positional
|
|
1180
|
-
# args.
|
|
1181
|
-
self.kwargs = model.__dict__
|
|
1182
|
-
self.pos_args = tuple() # Kept empty as all are moved to kwargs.
|
|
1183
|
-
else:
|
|
1184
|
-
self._initialize_object()
|
|
1203
|
+
# Import the required libraries and create LGBMClassifier Opensource object wrapper.
|
|
1204
|
+
>>> from teradataml import td_lightgbm
|
|
1205
|
+
>>> model = td_lightgbm.LGBMClassifier()
|
|
1206
|
+
>>> model
|
|
1207
|
+
LGBMClassifier()
|
|
1185
1208
|
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
"""
|
|
1192
|
-
_validate_opensource_func_args(X=X, y=y, groups=groups,
|
|
1193
|
-
fit_partition_cols=self._fit_partition_colums_non_default,
|
|
1194
|
-
kwargs=kwargs,
|
|
1195
|
-
skip_either_or_that=skip_either_or_that)
|
|
1196
|
-
return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
|
|
1197
|
-
fit_partition_cols=self._fit_partition_colums_non_default)
|
|
1209
|
+
# Example 1: Deploy the model held by LGBMClassifier Opensource object to Vantage.
|
|
1210
|
+
>>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2")
|
|
1211
|
+
Model is saved.
|
|
1212
|
+
>>> lgbm_cls
|
|
1213
|
+
LGBMClassifier()
|
|
1198
1214
|
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1215
|
+
# Example 2: Deploy the model held by LGBMClassifier Opensource object to Vantage with
|
|
1216
|
+
# the name same as that of model that already existed in Vantage.
|
|
1217
|
+
>>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2", replace_if_exists=True)
|
|
1218
|
+
Model is deleted.
|
|
1219
|
+
Model is saved.
|
|
1220
|
+
>>> lgbm_cls
|
|
1221
|
+
LGBMClassifier()
|
|
1222
|
+
|
|
1223
|
+
# Example 3: Deploy the model trained using td_lightgbm.train() function to Vantage.
|
|
1224
|
+
# Create Dataset object, assuming df_x and df_y are the feature and label teradataml
|
|
1225
|
+
# DataFrames.
|
|
1226
|
+
>>> lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)
|
|
1227
|
+
>>> lgbm_data
|
|
1228
|
+
<lightgbm.basic.Dataset object at ....>
|
|
1229
|
+
|
|
1230
|
+
# Train the model using `td_lightgbm` interface object.
|
|
1231
|
+
>>> model = td_lightgbm.train(params={}, train_set=lgbm_data, num_boost_round=30, valid_sets=[lgbm_data])
|
|
1232
|
+
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
|
|
1233
|
+
You can set `force_row_wise=true` to remove the overhead.
|
|
1234
|
+
And if memory is not enough, you can set `force_col_wise=true`.
|
|
1235
|
+
[LightGBM] [Info] Total Bins 532
|
|
1236
|
+
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 4
|
|
1237
|
+
[1] valid_0's l2: 0.215811
|
|
1238
|
+
[2] valid_0's l2: 0.188138
|
|
1239
|
+
[3] valid_0's l2: 0.166146
|
|
1240
|
+
...
|
|
1241
|
+
...
|
|
1242
|
+
[29] valid_0's l2: 0.042255
|
|
1243
|
+
[30] valid_0's l2: 0.0416953
|
|
1244
|
+
|
|
1245
|
+
# Deploy the model to Vantage.
|
|
1246
|
+
>>> lgb_model = model.deploy("lgbm_train_model_ver_2")
|
|
1247
|
+
>>> lgb_model
|
|
1248
|
+
<lightgbm.basic.Booster object at ...>
|
|
1209
1249
|
"""
|
|
1210
|
-
label_columns = self._get_columns_as_list(label_columns)
|
|
1211
|
-
|
|
1212
|
-
data, new_partition_columns = self._get_data_and_data_partition_columns(data,
|
|
1213
|
-
feature_columns,
|
|
1214
|
-
label_columns,
|
|
1215
|
-
partition_columns)
|
|
1216
|
-
|
|
1217
|
-
model_type = BLOB() if self._is_lake_system else CLOB()
|
|
1218
|
-
return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1219
|
-
for col in new_partition_columns] + [("model", model_type)]
|
|
1220
|
-
|
|
1221
|
-
if classes:
|
|
1222
|
-
class_type = type(classes[0]).__name__
|
|
1223
|
-
classes = "--".join([str(x) for x in classes])
|
|
1224
|
-
else:
|
|
1225
|
-
classes = str(None)
|
|
1226
|
-
class_type = str(None)
|
|
1227
|
-
|
|
1228
|
-
data_column_types_str, partition_indices_str, _, new_partition_columns = \
|
|
1229
|
-
self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
|
|
1230
|
-
|
|
1231
|
-
# db_name is applicable for enterprise system.
|
|
1232
|
-
db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
|
|
1233
|
-
py_exc = UtilFuncs._get_python_execution_path()
|
|
1234
|
-
script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
|
|
1235
|
-
f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
|
|
1236
|
-
f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
|
|
1237
|
-
|
|
1238
|
-
# Get unique values in partitioning columns.
|
|
1239
|
-
self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
|
|
1240
1250
|
|
|
1251
|
+
# Install model file into Vantage, if not installed.
|
|
1241
1252
|
self._install_initial_model_file()
|
|
1242
1253
|
|
|
1243
|
-
self.
|
|
1244
|
-
return_types)
|
|
1245
|
-
|
|
1246
|
-
self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
|
|
1247
|
-
|
|
1248
|
-
def partial_fit(self, X=None, y=None, classes=None, **kwargs):
|
|
1249
|
-
"""
|
|
1250
|
-
Please check the description in Docs/OpensourceML/sklearn.py.
|
|
1251
|
-
"""
|
|
1252
|
-
st_time = time.time()
|
|
1253
|
-
|
|
1254
|
-
# "classes" argument validation.
|
|
1255
|
-
arg_info_matrix = []
|
|
1256
|
-
arg_info_matrix.append(["classes", classes, True, (list)])
|
|
1257
|
-
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1258
|
-
|
|
1259
|
-
self._is_default_partition_value_fit = True # False when the user provides partition columns.
|
|
1260
|
-
|
|
1261
|
-
data, feature_columns, label_columns, _, partition_columns = \
|
|
1262
|
-
self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
|
|
1263
|
-
|
|
1264
|
-
if partition_columns:
|
|
1265
|
-
self._is_default_partition_value_fit = False
|
|
1266
|
-
self._fit_partition_colums_non_default = partition_columns
|
|
1267
|
-
|
|
1268
|
-
self._run_fit_related_functions(data,
|
|
1269
|
-
feature_columns,
|
|
1270
|
-
label_columns,
|
|
1271
|
-
partition_columns,
|
|
1272
|
-
inspect.stack()[0][3],
|
|
1273
|
-
classes)
|
|
1274
|
-
|
|
1275
|
-
self._partial_fit_execution_time = time.time() - st_time
|
|
1276
|
-
|
|
1277
|
-
return self
|
|
1278
|
-
|
|
1279
|
-
def fit(self, X=None, y=None, **kwargs):
|
|
1280
|
-
"""
|
|
1281
|
-
Please check the description in Docs/OpensourceML/sklearn.py.
|
|
1282
|
-
"""
|
|
1283
|
-
st_time = time.time()
|
|
1284
|
-
|
|
1285
|
-
self._is_default_partition_value_fit = True # False when the user provides partition columns.
|
|
1286
|
-
|
|
1287
|
-
data, feature_columns, label_columns, _, partition_columns = \
|
|
1288
|
-
self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
|
|
1289
|
-
|
|
1290
|
-
if partition_columns:
|
|
1291
|
-
self._is_default_partition_value_fit = False
|
|
1292
|
-
self._fit_partition_colums_non_default = partition_columns
|
|
1293
|
-
|
|
1294
|
-
file_name = kwargs.pop("file_name", None)
|
|
1295
|
-
func_name = kwargs.pop("name", "fit")
|
|
1296
|
-
|
|
1297
|
-
args = {"data": data,
|
|
1298
|
-
"feature_columns": feature_columns,
|
|
1299
|
-
"label_columns": label_columns,
|
|
1300
|
-
"partition_columns": partition_columns,
|
|
1301
|
-
"func": func_name}
|
|
1302
|
-
|
|
1303
|
-
if file_name is not None:
|
|
1304
|
-
args["file_name"] = file_name
|
|
1305
|
-
|
|
1306
|
-
self._run_fit_related_functions(**args)
|
|
1307
|
-
|
|
1308
|
-
self._fit_execution_time = time.time() - st_time
|
|
1309
|
-
|
|
1310
|
-
return self
|
|
1311
|
-
|
|
1312
|
-
def set_params(self, **params):
|
|
1313
|
-
"""
|
|
1314
|
-
Please check the description in Docs/OpensourceML/sklearn.py.
|
|
1315
|
-
"""
|
|
1316
|
-
for key, val in params.items():
|
|
1317
|
-
self.kwargs[key] = val
|
|
1318
|
-
|
|
1319
|
-
# Initialize with new arguments and return the class/model object.
|
|
1320
|
-
# set_params takes all keyword arguments and no positional arguments.
|
|
1321
|
-
self.__init__(None, self.module_name, self.class_name, tuple(), self.kwargs)
|
|
1254
|
+
self._save_model(model_name, replace_if_exists)
|
|
1322
1255
|
return self
|
|
1323
1256
|
|
|
1324
|
-
# get_params() will be executed through __getattr__().
|
|
1325
|
-
|
|
1326
|
-
# @_validate_fit_run
|
|
1327
|
-
def __getattr__(self, name):
|
|
1328
|
-
def __run_transform(*c, **kwargs):
|
|
1329
|
-
kwargs["name"] = name
|
|
1330
|
-
return self._transform(*c, **kwargs)
|
|
1331
|
-
|
|
1332
|
-
def __run_function_needing_all_rows(*c, **kwargs):
|
|
1333
|
-
kwargs["name"] = name
|
|
1334
|
-
return self._run_function_needing_all_rows(*c, **kwargs)
|
|
1335
|
-
|
|
1336
|
-
def __run_kneighbors(*c, **kwargs):
|
|
1337
|
-
kwargs["name"] = name
|
|
1338
|
-
return self._run_neighbors(*c, **kwargs)
|
|
1339
|
-
|
|
1340
|
-
if name in ["score", "aic", "bic", "perplexity"]:
|
|
1341
|
-
# TODO: ELE-6352 - Implement error_norm() function later.
|
|
1342
|
-
return __run_function_needing_all_rows
|
|
1343
|
-
|
|
1344
|
-
if name in ["kneighbors",
|
|
1345
|
-
"radius_neighbors",
|
|
1346
|
-
"kneighbors_graph",
|
|
1347
|
-
"radius_neighbors_graph"]:
|
|
1348
|
-
return __run_kneighbors
|
|
1349
|
-
|
|
1350
|
-
if name in ["predict",
|
|
1351
|
-
"transform",
|
|
1352
|
-
"inverse_transform",
|
|
1353
|
-
"predict_proba",
|
|
1354
|
-
"predict_log_proba",
|
|
1355
|
-
"decision_function",
|
|
1356
|
-
"score_samples",
|
|
1357
|
-
"decision_path",
|
|
1358
|
-
"apply",
|
|
1359
|
-
"cost_complexity_pruning_path",
|
|
1360
|
-
"gibbs",
|
|
1361
|
-
"kneighbors_graph",
|
|
1362
|
-
"radius_neighbors_graph",
|
|
1363
|
-
"mahalanobis",
|
|
1364
|
-
"correct_covariance",
|
|
1365
|
-
"reweight_covariance",
|
|
1366
|
-
"path"]:
|
|
1367
|
-
return __run_transform
|
|
1368
|
-
|
|
1369
|
-
return super().__getattr__(name)
|
|
1370
|
-
|
|
1371
|
-
def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
|
|
1372
|
-
func_name, **kwargs):
|
|
1373
|
-
"""
|
|
1374
|
-
Internal function to handle multi model case for transform function for functions
|
|
1375
|
-
["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of feature_selection module
|
|
1376
|
-
and "Birch" of cluster module.
|
|
1377
|
-
These functions generate multiple models and when transform is applied to each model, it generates
|
|
1378
|
-
output with different number of columns.
|
|
1379
|
-
"""
|
|
1380
|
-
skl_objs_dict = {}
|
|
1381
|
-
no_of_unique_partitions = len(self._fit_partition_unique_values)
|
|
1382
|
-
no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
|
|
1383
|
-
|
|
1384
|
-
# Run on 10 rows of data individually using corresponding scikit-learn objects based on paritition value
|
|
1385
|
-
# and get the maximum number of columns and their types.
|
|
1386
|
-
for i in range(no_of_unique_partitions):
|
|
1387
|
-
skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
data = data.select(feature_columns + label_columns + partition_columns)
|
|
1391
|
-
ten_row_data = data.head(10).get_values()
|
|
1392
|
-
X = numpy.array(ten_row_data)
|
|
1393
|
-
|
|
1394
|
-
# For multi-model case, model in one AMP can give more number of columns than other AMPs.
|
|
1395
|
-
# Returns clause can't contain different number of columns in different AMPs. Hence, taking
|
|
1396
|
-
# maximum number of columns and their types from all models.
|
|
1397
|
-
max_no_of_columns = 0
|
|
1398
|
-
max_col_names = []
|
|
1399
|
-
max_col_types = []
|
|
1400
|
-
|
|
1401
|
-
def _get_input_row_without_nans(row):
|
|
1402
|
-
"""
|
|
1403
|
-
`inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
|
|
1404
|
-
"""
|
|
1405
|
-
X1 = []
|
|
1406
|
-
for _, v in enumerate(row):
|
|
1407
|
-
if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
|
|
1408
|
-
# Add to list when:
|
|
1409
|
-
# - v is None or
|
|
1410
|
-
# - v is string or
|
|
1411
|
-
# - v is not nan or
|
|
1412
|
-
# - if module is impute (which transforms nan values) even though v is nan.
|
|
1413
|
-
X1.append(v)
|
|
1414
|
-
else:
|
|
1415
|
-
# skip nan values.
|
|
1416
|
-
pass
|
|
1417
|
-
return X1
|
|
1418
|
-
|
|
1419
|
-
for i in range(X.shape[0]):
|
|
1420
|
-
# Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
|
|
1421
|
-
partition_values = tuple(X[i, -no_of_partitioning_cols:])
|
|
1422
|
-
skl_obj = skl_objs_dict[partition_values]
|
|
1423
|
-
|
|
1424
|
-
X1 = X[i, :-no_of_partitioning_cols]
|
|
1425
|
-
# Since Nans/NULLs are added in transform for last columns where some models generated
|
|
1426
|
-
# less number of columns, removing Nans/NULLs from the input row for inverse_transform
|
|
1427
|
-
# using function _get_input_row_without_nans().
|
|
1428
|
-
X1 = numpy.array([_get_input_row_without_nans(X1)])
|
|
1429
|
-
|
|
1430
|
-
trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
|
|
1431
|
-
|
|
1432
|
-
no_of_columns = 1
|
|
1433
|
-
|
|
1434
|
-
if trans_opt.shape == (X1.shape[0],):
|
|
1435
|
-
trans_opt = trans_opt.reshape(X1.shape[0], 1)
|
|
1436
|
-
|
|
1437
|
-
if isinstance(trans_opt[0], numpy.ndarray) \
|
|
1438
|
-
or isinstance(trans_opt[0], list) \
|
|
1439
|
-
or isinstance(trans_opt[0], tuple):
|
|
1440
|
-
no_of_columns = len(trans_opt[0])
|
|
1441
|
-
|
|
1442
|
-
col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
|
|
1443
|
-
|
|
1444
|
-
# Get new column sqlalchemy types for pandas df columns of transform output.
|
|
1445
|
-
opt_pd = pd.DataFrame(trans_opt)
|
|
1446
|
-
|
|
1447
|
-
# Get output column types for each column in pandas df from the output of transform
|
|
1448
|
-
# type functions.
|
|
1449
|
-
types = {}
|
|
1450
|
-
for idx in range(no_of_columns):
|
|
1451
|
-
col = list(opt_pd.columns)[idx]
|
|
1452
|
-
|
|
1453
|
-
# Only one row in trans_opt.
|
|
1454
|
-
if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
|
|
1455
|
-
type_ = type(trans_opt[0][idx])
|
|
1456
|
-
else:
|
|
1457
|
-
# only one value in the output.
|
|
1458
|
-
type_ = type(trans_opt[0])
|
|
1459
|
-
|
|
1460
|
-
# If type of the output value (trans_opt) is None, then use `str` as type since
|
|
1461
|
-
# pandas astype() does not accept None type.
|
|
1462
|
-
if type_ is type(None):
|
|
1463
|
-
type_ = str
|
|
1464
|
-
|
|
1465
|
-
# numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
|
|
1466
|
-
# It raises error like "Cannot convert non-finite values (NA or inf) to integer:
|
|
1467
|
-
# Error while type casting for column '2'"
|
|
1468
|
-
# Hence, using pd.Int64Dtype() for integer columns with nan values.
|
|
1469
|
-
types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
|
|
1470
|
-
|
|
1471
|
-
# Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
|
|
1472
|
-
opt_pd = opt_pd.astype(types)
|
|
1473
|
-
|
|
1474
|
-
# If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
|
|
1475
|
-
# TIMESTAMP(timezone=True) else map it according to default value.
|
|
1476
|
-
col_types = [TIMESTAMP(timezone=True)
|
|
1477
|
-
if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
|
|
1478
|
-
else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
|
|
1479
|
-
for key, col_name in enumerate(list(opt_pd.columns))]
|
|
1480
|
-
|
|
1481
|
-
# Different models in multi model case can generate different number of output columns for example in
|
|
1482
|
-
# SelectFpr. Hence, taking the model which generates maximum number of columns.
|
|
1483
|
-
if no_of_columns > max_no_of_columns:
|
|
1484
|
-
max_no_of_columns = no_of_columns
|
|
1485
|
-
max_col_names = col_names
|
|
1486
|
-
max_col_types = col_types
|
|
1487
|
-
|
|
1488
|
-
return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
|
|
1489
|
-
|
|
1490
|
-
def _get_return_columns_for_function_(self,
|
|
1491
|
-
data,
|
|
1492
|
-
feature_columns,
|
|
1493
|
-
label_columns,
|
|
1494
|
-
partition_columns,
|
|
1495
|
-
func_name,
|
|
1496
|
-
kwargs):
|
|
1497
|
-
"""
|
|
1498
|
-
Internal function to return list of column names and their sqlalchemy types
|
|
1499
|
-
which should be used in return_types of Script.
|
|
1500
|
-
"""
|
|
1501
|
-
if func_name == "fit_predict":
|
|
1502
|
-
"""
|
|
1503
|
-
Get return columns using label_columns.
|
|
1504
|
-
"""
|
|
1505
|
-
return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
|
|
1506
|
-
data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1507
|
-
for i, col in enumerate(label_columns)]
|
|
1508
|
-
|
|
1509
|
-
if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
|
|
1510
|
-
"""
|
|
1511
|
-
Return predict columns using either label_columns (if provided) or
|
|
1512
|
-
self._fit_label_columns_types (if the function is trained using label columns).
|
|
1513
|
-
Otherwise run predict on ten rows of data to get the number of columns and their types
|
|
1514
|
-
after this if condition.
|
|
1515
|
-
"""
|
|
1516
|
-
if label_columns:
|
|
1517
|
-
return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
|
|
1518
|
-
data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1519
|
-
for i, col in enumerate(label_columns)]
|
|
1520
|
-
if self._fit_label_columns_types:
|
|
1521
|
-
return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
|
|
1522
|
-
for i, col_type in enumerate(self._fit_label_columns_types)]
|
|
1523
|
-
|
|
1524
|
-
## If function is not `fit_predict`:
|
|
1525
|
-
# then take one row of transform/other functions to execute in client
|
|
1526
|
-
# to get number of columns in return clause and their Vantage types.
|
|
1527
|
-
n_f = len(feature_columns)
|
|
1528
|
-
n_c = len(label_columns)
|
|
1529
|
-
|
|
1530
|
-
# For paritioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
|
|
1531
|
-
# Just for getting the number of columns and their types, using only one model of all.
|
|
1532
|
-
if len(self._fit_partition_unique_values) == 1:
|
|
1533
|
-
# Single model case.
|
|
1534
|
-
skl_obj = self.modelObj
|
|
1535
|
-
else:
|
|
1536
|
-
# Multi model case.
|
|
1537
|
-
if (func_name in ["transform", "inverse_transform"] and \
|
|
1538
|
-
self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
|
|
1539
|
-
(self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
|
|
1540
|
-
# Special handling for multi model case for transform function as these classes
|
|
1541
|
-
# generate transform output with different number of columns for each model.
|
|
1542
|
-
# Hence, need to add Nulls/Nans to columns which are not present in the transform output of
|
|
1543
|
-
# some models.
|
|
1544
|
-
return self._special_handling_multimodel_(data, feature_columns, label_columns,
|
|
1545
|
-
partition_columns, func_name, **kwargs)
|
|
1546
|
-
|
|
1547
|
-
skl_obj = self.modelObj.iloc[0]["model"]
|
|
1548
|
-
|
|
1549
|
-
data = data.select(feature_columns + label_columns)
|
|
1550
|
-
|
|
1551
|
-
ten_row_data = data.head(10).get_values()
|
|
1552
|
-
X = numpy.array(ten_row_data)
|
|
1553
|
-
if label_columns:
|
|
1554
|
-
y = X[:,n_f : n_f + n_c]
|
|
1555
|
-
X = X[:,:n_f]
|
|
1556
|
-
# predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
|
|
1557
|
-
# in local run if passed. Generally, 'y' is passed to return y along with actual output.
|
|
1558
|
-
try:
|
|
1559
|
-
trans_opt = getattr(skl_obj, func_name)(X, y, **kwargs)
|
|
1560
|
-
except TypeError as ex:
|
|
1561
|
-
# Function which does not accept 'y' like predict_proba() raises error like
|
|
1562
|
-
# "predict_proba() takes 2 positional arguments but 3 were given".
|
|
1563
|
-
trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
|
|
1564
|
-
else:
|
|
1565
|
-
trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
|
|
1566
|
-
|
|
1567
|
-
if func_name == "path":
|
|
1568
|
-
raise NotImplementedError(
|
|
1569
|
-
"path() returns tuple of ndarrays of different shapes. Not Implemented yet."
|
|
1570
|
-
)
|
|
1571
|
-
|
|
1572
|
-
if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
|
|
1573
|
-
trans_opt = trans_opt.reshape(X.shape[0], 1)
|
|
1574
|
-
|
|
1575
|
-
if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
|
|
1576
|
-
no_of_columns = trans_opt.get_shape()[1]
|
|
1577
|
-
trans_opt = trans_opt.toarray()
|
|
1578
|
-
elif isinstance(trans_opt, dict):
|
|
1579
|
-
raise NotImplementedError(f"Output returns dictionary {trans_opt}. NOT implemented yet.")
|
|
1580
|
-
elif isinstance(trans_opt[0], numpy.ndarray) \
|
|
1581
|
-
or isinstance(trans_opt[0], list) \
|
|
1582
|
-
or isinstance(trans_opt[0], tuple):
|
|
1583
|
-
no_of_columns = len(trans_opt[0])
|
|
1584
|
-
else:
|
|
1585
|
-
no_of_columns = 1
|
|
1586
|
-
|
|
1587
|
-
# Special handling when inverse_transform of no_of_columns returns no of rows
|
|
1588
|
-
# less than the no of classes. Such columns are filled with NaN values.
|
|
1589
|
-
# Updating number of columns here (new columns with NaN values will be added).
|
|
1590
|
-
if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
|
|
1591
|
-
no_of_columns = len(self.classes_)
|
|
1592
|
-
for i in range(len(ten_row_data)):
|
|
1593
|
-
trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
|
|
1594
|
-
|
|
1595
|
-
# Special handling required for cross_decomposition classes's transform function, which
|
|
1596
|
-
# takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
|
|
1597
|
-
# y_scores. If label columns are not provided, only x_scores are returned.
|
|
1598
|
-
if self.module_name == "sklearn.cross_decomposition" and func_name == "transform":
|
|
1599
|
-
# For cross_decomposition, output is a tuple of arrays when label columns are provided
|
|
1600
|
-
# along with feature columns for transform function. In this case, concatenate the
|
|
1601
|
-
# arrays and return the column names accordingly.
|
|
1602
|
-
if isinstance(trans_opt, tuple): # tuple when label_columns is provided.
|
|
1603
|
-
assert trans_opt[0].shape == trans_opt[1].shape,\
|
|
1604
|
-
"Output arrays should be of same shape when transform/fit_transform is run "\
|
|
1605
|
-
"with label columns for cross_decomposition classes.."
|
|
1606
|
-
first_cols = [f"x_scores_{(i + 1)}" for i in range(trans_opt[0].shape[1])]
|
|
1607
|
-
second_cols = [f"y_scores_{(i + 1)}" for i in range(trans_opt[1].shape[1])]
|
|
1608
|
-
no_of_columns = trans_opt[0].shape[1] + trans_opt[1].shape[1]
|
|
1609
|
-
col_names = first_cols + second_cols
|
|
1610
|
-
|
|
1611
|
-
trans_opt = numpy.concatenate(trans_opt, axis=1)
|
|
1612
|
-
else:
|
|
1613
|
-
assert isinstance(trans_opt, numpy.ndarray), "When transform/fit_transform is run "\
|
|
1614
|
-
"without label columns for cross_decomposition classes, "\
|
|
1615
|
-
"output should be a numpy array."
|
|
1616
|
-
no_of_columns = trans_opt.shape[1]
|
|
1617
|
-
col_names =[f"x_scores_{(i + 1)}" for i in range(trans_opt.shape[1])]
|
|
1618
|
-
else:
|
|
1619
|
-
# Generate list of new column names.
|
|
1620
|
-
col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
|
|
1621
|
-
|
|
1622
|
-
# Get new column sqlalchemy types for pandas df columns of transform output.
|
|
1623
|
-
opt_pd = pd.DataFrame(trans_opt)
|
|
1624
|
-
|
|
1625
|
-
# Get output column types for each column in pandas df from the output of transform
|
|
1626
|
-
# type functions.
|
|
1627
|
-
types = {}
|
|
1628
|
-
for idx, col in enumerate(list(opt_pd.columns)):
|
|
1629
|
-
# Get type of column using data from all rows, in case if the column has None values.
|
|
1630
|
-
# 'and' of types of all values in the column with type(None) gives the type of the column.
|
|
1631
|
-
type_ = type(None)
|
|
1632
|
-
for i in range(len(trans_opt)):
|
|
1633
|
-
type_ = type_ and type(trans_opt[i][idx])
|
|
1634
|
-
|
|
1635
|
-
# If all the values of the output (trans_opt) is None, thelen use `str` as type since
|
|
1636
|
-
# pandas astype() does not accept None type.
|
|
1637
|
-
if type_ is type(None):
|
|
1638
|
-
type_ = str
|
|
1639
|
-
|
|
1640
|
-
# numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
|
|
1641
|
-
# It raises error like "Cannot convert non-finite values (NA or inf) to integer:
|
|
1642
|
-
# Error while type casting for column '2'"
|
|
1643
|
-
# Hence, using pd.Int64Dtype() for integer columns with nan values.
|
|
1644
|
-
types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
|
|
1645
|
-
|
|
1646
|
-
# Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
|
|
1647
|
-
opt_pd = opt_pd.astype(types)
|
|
1648
|
-
|
|
1649
|
-
# If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
|
|
1650
|
-
# TIMESTAMP(timezone=True) else map it according to default value.
|
|
1651
|
-
col_types = [TIMESTAMP(timezone=True)
|
|
1652
|
-
if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
|
|
1653
|
-
else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
|
|
1654
|
-
for key, col_name in enumerate(list(opt_pd.columns))]
|
|
1655
|
-
|
|
1656
|
-
return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
|
|
1657
|
-
|
|
1658
|
-
@_validate_fit_run
|
|
1659
|
-
def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
|
|
1660
|
-
"""
|
|
1661
|
-
Internal function to run functions like score, aic, bic which needs all rows and return
|
|
1662
|
-
one floating number as result.
|
|
1663
|
-
"""
|
|
1664
|
-
st_time = time.time()
|
|
1665
|
-
|
|
1666
|
-
assert kwargs["name"], "function name should be passed."
|
|
1667
|
-
func_name = kwargs["name"]
|
|
1668
|
-
|
|
1669
|
-
# Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
|
|
1670
|
-
kwargs.pop("name")
|
|
1671
|
-
|
|
1672
|
-
data, feature_columns, label_columns, _, partition_columns = \
|
|
1673
|
-
self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
|
|
1674
|
-
|
|
1675
|
-
label_columns = self._get_columns_as_list(label_columns)
|
|
1676
|
-
|
|
1677
|
-
data, new_partition_columns = self._get_data_and_data_partition_columns(data,
|
|
1678
|
-
feature_columns,
|
|
1679
|
-
label_columns,
|
|
1680
|
-
partition_columns)
|
|
1681
|
-
|
|
1682
|
-
script_file_path = f"{file_name}" if self._is_lake_system \
|
|
1683
|
-
else f"./{self._db_name}/{file_name}"
|
|
1684
|
-
|
|
1685
|
-
data_column_types_str, partition_indices_str, _, new_partition_columns = \
|
|
1686
|
-
self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
|
|
1687
|
-
|
|
1688
|
-
self._validate_unique_partition_values(data, new_partition_columns)
|
|
1689
|
-
|
|
1690
|
-
py_exc = UtilFuncs._get_python_execution_path()
|
|
1691
|
-
script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
|
|
1692
|
-
f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
|
|
1693
|
-
f"{self._model_file_name_prefix} {self._is_lake_system}"
|
|
1694
|
-
|
|
1695
|
-
# score, aic, bic returns float values.
|
|
1696
|
-
return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
|
|
1697
|
-
for col in new_partition_columns] + [(func_name, FLOAT())]
|
|
1698
|
-
|
|
1699
|
-
# Checking the trained model installation. If not installed,
|
|
1700
|
-
# install it and set flag to True.
|
|
1701
|
-
if not self._is_trained_model_installed:
|
|
1702
|
-
self._install_initial_model_file()
|
|
1703
|
-
self._is_trained_model_installed = True
|
|
1704
|
-
|
|
1705
|
-
opt = self._run_script(data, script_command, new_partition_columns, return_types)
|
|
1706
|
-
|
|
1707
|
-
self._score_execution_time = time.time() - st_time
|
|
1708
|
-
|
|
1709
|
-
if self._is_default_partition_value_fit:
|
|
1710
|
-
# For single model case, partition column is internally generated and
|
|
1711
|
-
# no point in returning it to the user.
|
|
1712
|
-
return opt.select(func_name)
|
|
1713
|
-
|
|
1714
|
-
return opt
|
|
1715
|
-
|
|
1716
|
-
-    @_validate_fit_run
-    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
-        """
-        Internal function to run predict/transform and similar functions, which returns
-        multiple columns. This function will return data row along with the generated
-        columns' row data, unlike sklearn's functions which returns just output data.
-        """
-        st_time = time.time()
-
-        assert kwargs["name"], "function name should be passed."
-        func_name = kwargs["name"]
-
-        # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
-        kwargs.pop("name")
-
-        data, feature_columns, label_columns, _, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns)
-
-        # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
-        self._remove_data_related_args_from_kwargs(kwargs)
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        self._validate_unique_partition_values(data, new_partition_columns)
-
-        return_columns_python_types = None
-        if self._fit_label_columns_python_types:
-            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
-
-        # Returning feature columns also along with transformed columns because we don't know the
-        # mapping of feature columns to the transformed columns.
-        ## 'correct_covariance()' returns the (n_features, n_features)
-        if func_name == "correct_covariance":
-            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                            for col in new_partition_columns]
-        else:
-            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                            for col in (new_partition_columns + feature_columns)]
-            if func_name in ["predict", "decision_function"] and label_columns:
-                return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                                 for col in label_columns]
-
-        output_cols_types = self._get_return_columns_for_function_(data,
-                                                                   feature_columns,
-                                                                   label_columns,
-                                                                   new_partition_columns,
-                                                                   func_name,
-                                                                   kwargs)
-        return_types += output_cols_types
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
-                         f"{return_columns_python_types}"
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        self._transform_execution_time = time.time() - st_time
-
-        return self._get_returning_df(opt, new_partition_columns, return_types)
-
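The removed _transform() path above returns the partition and feature columns alongside the generated output columns, since the row-to-output mapping is not otherwise recoverable. Continuing the hypothetical sketch above (column names are assumptions):

# Sketch only, reusing `model` and `train` from the earlier sketch.
predictions = model.predict(data=train,
                            feature_columns=["sepal_length", "sepal_width"])
# Expect the feature columns plus a generated prediction column,
# e.g. ['sepal_length', 'sepal_width', 'linearregression_predict_1'] (assumed name).
print(predictions.columns)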
-    def fit_predict(self, X=None, y=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        st_time = time.time()
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, label_columns, _, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
-
-        if partition_columns:
-            self._is_default_partition_value_fit = False
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns)
-
-        # Return label_columns also if user provides in the function call.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns + label_columns)]
-
-        func_name = inspect.stack()[0][3]
-        if label_columns:
-            return_types += self._get_return_columns_for_function_(data,
-                                                                   feature_columns,
-                                                                   label_columns,
-                                                                   new_partition_columns,
-                                                                   func_name,
-                                                                   {})
-        else:
-            # If there are no label_columns, we will have only one
-            # predicted column.
-            return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
-
-        file_name = "sklearn_fit_predict.py"
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        script_file_name = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {self._is_lake_system}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        self._fit_predict_execution_time = time.time() - st_time
-
-        if self._is_default_partition_value_fit:
-            # For single model case, partition column is internally generated and no point in
-            # returning it to the user.
-
-            # Extract columns from return types.
-            returning_cols = [col[0] for col in return_types[len(new_partition_columns):]]
-            return opt.select(returning_cols)
-
-        return opt
-
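The removed fit_predict() above trains and predicts in one server-side run; when partition_columns is supplied, one model is fitted per partition and those columns are kept in the output. A hedged sketch, reusing the assumed `train` DataFrame and column names from earlier:

# Sketch only: per-partition fit and predict in a single pass (column names assumed).
clusters = osml.KMeans(n_clusters=3).fit_predict(
    data=train,
    feature_columns=["sepal_length", "sepal_width"],
    partition_columns="species")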
-    def fit_transform(self, X=None, y=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        # 'y' is not needed for transform().
-        fit_obj = self.fit(X, y, **kwargs)
-        kwargs["label_columns"] = None
-        return fit_obj.transform(X, None, **kwargs)
-
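As the removed implementation shows, fit_transform() is simply fit() followed by transform() with the label columns cleared for the transform step. A hedged one-liner under the same assumed names:

# Sketch only: equivalent to fit() then transform() without label columns.
scaled = osml.StandardScaler().fit_transform(
    data=train, feature_columns=["sepal_length", "sepal_width"])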
-    @_validate_fit_run
-    def _run_neighbors(self, X=None, **kwargs):
-        """
-        Internal function to run functions like kneighbors, radius_neighbors, kneighbors_graph,
-        radius_neighbors_graph which returns multiple columns. This function will return data row
-        along with the generated columns' row data, unlike sklearn's functions which returns just
-        output data.
-        """
-        assert kwargs["name"], "function name should be passed."
-        func_name = kwargs["name"]
-        kwargs.pop("name")
-
-        if self.module_name != "sklearn.neighbors":
-            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not have {func_name}() method.")
-
-        data = kwargs.get("data", None)
-        partition_columns = kwargs.get("partition_columns", None)
-
-        if not X and not partition_columns and not data:
-            # If data is not passed, then run from client only.
-            # TODO: decide whether to run from client or from Vantage.
-            opt = super().__getattr__(func_name)(**kwargs)
-            from scipy.sparse.csr import csr_matrix
-            if isinstance(opt, csr_matrix):
-                return opt.toarray()
-            return opt
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, _, _, new_partition_columns = \
-            self._validate_args_and_get_data(X=X, y=None, groups=None, kwargs=kwargs,
-                                             skip_either_or_that=True)
-
-        # Remove the kwargs data.
-        self._remove_data_related_args_from_kwargs(kwargs)
-
-        if partition_columns:
-            # kwargs are passed to kneighbors function. So, removing them from kwargs.
-            self._is_default_partition_value_fit = False
-
-        # Generating new partition column name.
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                [],
-                                                                                partition_columns)
-
-        args_str = self._get_kwargs_str(kwargs)
-
-        file_name = "sklearn_neighbors.py"
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        # Returning feature columns also along with new columns.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns)]
-
-        # `return_distance` is needed as the result is a tuple of two arrays when it is True.
-        return_distance = kwargs.get("return_distance", True)  # Default value is True.
-
-        # Though new columns return numpy arrays, we are returning them as strings.
-        # TODO: Will update to columns later, if requested later.
-        if func_name in ['kneighbors', 'radius_neighbors']:
-            if return_distance:
-                return_types += [("neigh_dist", VARCHAR())]
-            return_types += [("neigh_ind", VARCHAR())]
-        elif func_name in ['kneighbors_graph', 'radius_neighbors_graph']:
-            return_types += [("A", VARCHAR())]
-        else:
-            return_types += [("output", VARCHAR())]
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
-                         f"{args_str}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        return self._get_returning_df(opt, new_partition_columns, return_types)
-
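The removed _run_neighbors() above backs kneighbors()/radius_neighbors() and their *_graph variants for sklearn.neighbors estimators; per its return_types, distances and indices come back as stringified arrays in VARCHAR columns ("neigh_dist", "neigh_ind"). A hedged sketch under the same assumed names:

# Sketch only: neighbor outputs arrive as VARCHAR columns, not numpy arrays.
nn = osml.NearestNeighbors(n_neighbors=3)
nn.fit(data=train, feature_columns=["sepal_length", "sepal_width"])
neighbors = nn.kneighbors(data=train,
                          feature_columns=["sepal_length", "sepal_width"])
# Expect the feature columns plus "neigh_dist" and "neigh_ind".
print(neighbors.columns)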
-    def split(self, X=None, y=None, groups=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        opt = self._run_model_selection("split", X=X, y=y, groups=groups,
-                                        skip_either_or_that=True, kwargs=kwargs)
-
-        # Get number of splits in the result DataFrame.
-        n_splits = opt.drop_duplicate("split_id").shape[0]
-
-        data = kwargs.get("data", None)
-        feature_columns = kwargs.get("feature_columns", [])
-        label_columns = self._get_columns_as_list(kwargs.get("label_columns", []))
-
-        # If there is not X and y, get feature_columns and label_columns for "data".
-        partition_columns = kwargs.get("partition_columns", [])
-        feature_columns = [col for col in X.columns if col not in partition_columns] \
-            if X and not data and not feature_columns else feature_columns
-        label_columns = y.columns if y and not data and not label_columns else label_columns
-
-        # Return iterator of the train and test dataframes for each split.
-        for i in range(1, n_splits+1):
-            train_df = opt[(opt.split_id == i) & (opt.data_type == "train")]\
-                .select(partition_columns + feature_columns + label_columns)
-            train_df._index_label = None
-            test_df = opt[(opt.split_id == i) & (opt.data_type == "test")]\
-                .select(partition_columns + feature_columns + label_columns)
-            test_df._index_label = None
-
-            yield train_df, test_df
-
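The removed split() above is a generator: each iteration yields a (train, test) pair of teradataml DataFrames, selected from the server-side result via its "split_id" and "data_type" columns. A hedged sketch under the same assumed names:

# Sketch only: iterate folds produced server-side.
kf = osml.KFold(n_splits=3)
for train_df, test_df in kf.split(data=train,
                                  feature_columns=["sepal_length", "sepal_width"],
                                  label_columns="petal_length"):
    print(train_df.shape, test_df.shape)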
-    def get_n_splits(self, X=None, y=None, groups=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        return self._run_model_selection("get_n_splits", X=X, y=y, groups=groups,
-                                         skip_either_or_that=True, kwargs=kwargs)
-
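Per the removed implementation below, get_n_splits() ultimately returns a plain Python int: the script emits the count as VARCHAR (very large counts such as LeavePOut's cannot be scoped to INTEGER) and the wrapper casts it back. A hedged sketch:

# Sketch only, same assumed names as above.
n = osml.KFold(n_splits=5).get_n_splits(data=train,
                                        feature_columns=["sepal_length", "sepal_width"])
assert isinstance(n, int)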
-    def _run_model_selection(self,
-                             func_name,
-                             X=None,
-                             y=None,
-                             groups=None,
-                             skip_either_or_that=False,
-                             kwargs={}):
-        """
-        Internal function to run functions like split, get_n_splits of model selection module.
-        - get_n_splits() returns number of splits as value, not as teradataml DataFrame.
-        - split() returns teradataml DataFrame containing train and test data for each split
-          (add partition information if the argument "partition_cols" is provided).
-        """
-        if self.module_name != "sklearn.model_selection":
-            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not "
-                                 f"have {func_name}() method.")
-
-        data = kwargs.get("data", None)
-
-        if not X and not y and not groups and not data:
-            # If data is not passed, then run from client only.
-            # TODO: decide whether to run from client or from Vantage.
-            return super().__getattr__(func_name)()
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, label_columns, group_columns, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=groups, kwargs=kwargs,
-                                             skip_either_or_that=skip_either_or_that)
-
-        if partition_columns:
-            self._is_default_partition_value_fit = False
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns,
-                                                                                group_columns)
-
-        file_name = "sklearn_model_selection_split.py"
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        if func_name == "split":
-            # Need to generate data into splits of train and test.
-            # split_id - the column which will be used to identify the split.
-            # data_type - the column which will be used to identify whether the row is
-            #             train or test row.
-            return_types = [("split_id", INTEGER()), ("data_type", VARCHAR())]
-            # Returning feature columns and label columns as well.
-            return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                             for col in (feature_columns + label_columns)]
-        else:
-            # Return Varchar by default.
-            # Returns Varchar even for functions like `get_n_splits` which returns large integer
-            # numbers like `4998813702034726525205100` for `LeavePOut` class (when the argument
-            # `p` is 28 and no of data rows is 100) as Vantage cannot scope it to INTEGER.
-            return_types = [(func_name, VARCHAR())]
-
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in new_partition_columns] + return_types
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {self._is_lake_system}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        if func_name == "get_n_splits" and not partition_columns:
-            # Return number of splits as value, not as dataframe.
-            vals = execute_sql("select {} from {}".format(func_name, opt._table_name))
-            opt = vals.fetchall()[0][0]
-
-            # Varchar is returned by the script. Convert it to int.
-            return int(opt)
-
-        return opt
-

class _FunctionWrapper(_GenericObjectWrapper):
    def __init__(self, module_name, func_name, file_type, template_file):
@@ -2151,10 +1313,3 @@ class _FunctionWrapper(_GenericObjectWrapper):
        self._remove_script_file(self._script_file_name)

        return self.modelObj
-
-
-class _SKLearnFunctionWrapper(_FunctionWrapper):
-    def __init__(self, module_name, func_name):
-        file_type = "file_fn_sklearn"
-        template_file = "sklearn_function.template"
-        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)