teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@ from collections import OrderedDict, defaultdict
 from importlib import import_module

 import base64
-import functools
 import json
 import numpy
 import os
@@ -28,7 +27,7 @@ import time
 import inspect
 import warnings
 import json
-import random
+import math
 import pandas as pd
 from teradatasqlalchemy import BLOB, CLOB, FLOAT, TIMESTAMP, VARCHAR, INTEGER
 import pandas.api.types as pt
@@ -41,9 +40,9 @@ from teradataml.context.context import _get_current_databasename, get_connection
 from teradataml.dbutils.filemgr import install_file, remove_file
 from teradataml.utils.utils import execute_sql
 from teradataml.options.configure import configure
-from teradataml.opensource.sklearn._wrapper_utils import _validate_fit_run, _generate_new_name,\
+from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
     _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
-from teradataml.opensource.sklearn.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
+from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
     _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
     _OSML_ADDITIONAL_COLUMN_TYPES
 from teradataml.common.messagecodes import MessageCodes
@@ -53,7 +52,6 @@ from teradataml.dbutils.dbutils import _create_table, set_session_param
 from teradataml.utils.validators import _Validators
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
-from teradataml.scriptmgmt.lls_utils import create_env, get_env
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.constants import TeradataConstants

@@ -70,6 +68,9 @@ _file_installed = False

 class _GenericObjectWrapper:
     def __init__(self) -> None:
+        if not get_connection():
+            raise TeradataMlException(Messages.get_message(MessageCodes.INVALID_CONTEXT_CONNECTION),
+                                      MessageCodes.INVALID_CONTEXT_CONNECTION)
         self._db_name = _get_current_databasename()

         self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "sklearn")
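
With the guard added above, every opensourceML wrapper now fails fast instead of erroring later in Vantage when no connection exists. A minimal usage sketch (host and credentials are placeholders, not taken from this diff):

    # Hypothetical connection values. create_context() must succeed before any
    # td_sklearn object is created; otherwise the new check raises
    # TeradataMlException with INVALID_CONTEXT_CONNECTION.
    from teradataml import create_context, td_sklearn

    create_context(host="<host>", username="<user>", password="<password>")
    model = td_sklearn.LinearRegression()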
@@ -215,6 +216,7 @@ class _GenericObjectWrapper:
             raise TeradataMlException(
                 f"Script file '{file_name}' failed to remove in Vantage."
             )
+
    def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
                                                                idx_delim=",",
                                                                types_delim="--"):
@@ -264,7 +266,7 @@ class _GenericObjectWrapper:
             args_str += f" {strr}"
         return args_str

-    def extract_sklearn_obj(self, n_unique_partitions = 1, n_partition_cols = 1):
+    def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1):
        """
        Internal function to extract sklearn object from the model(s) depending on the number of
        partitions. When it is only one model, it is directly used as sklearn object (modelObj).
@@ -297,13 +299,256 @@ class _GenericObjectWrapper:

         warnings.filterwarnings("default")

+    def _validate_existence_of_partition_columns(self, partition_columns, all_columns, arg_names_for_dfs):
+        """
+        Validate if columns in "partition_columns" argument are present in any of the given
+        dataframes.
+        """
+        invalid_part_cols = [c for c in partition_columns if c not in all_columns]
+
+        if invalid_part_cols:
+            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
+                                                  ", ".join(invalid_part_cols),
+                                                  "', '".join(arg_names_for_dfs))
+                             )
+
+    def _prepare_data_args_string(self, kwargs):
+        """
+        Get column indices and types of each data related argument in the format:
+        "{<arg_name>-<comma separated indices>-<comma separated types>}--
+         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        """
+        data_args_str = []
+        for arg_name in list(self._data_args.keys()):
+            # Remove DataFrame arguments from kwargs, which will be passed to Script.
+            kwargs.pop(arg_name)
+
+            # Get column indices and their types for each dataframe from parent dataframe.
+            _, partition_indices_str, partition_types_str, _ = \
+                self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                             self._data_args[arg_name].columns,
+                                                                             idx_delim=",",
+                                                                             types_delim=",")
+
+            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
+            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
+
+        # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
+        #         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        return "--".join(data_args_str)
+
+    def _prepare_and_install_file(self, replace_dict):
+        """
+        Prepare function script file from template file and install it in Vantage.
+        Takes a dictionary with keys as strings to be replaced in the script and values as
+        strings which should be added in place of the keys.
+        """
+
+        with open(os.path.join(self._scripts_path, self._template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)
+
+        self._script_file_local = os.path.join(self._tdml_tmp_dir, self._script_file_name)
+
+        with open(self._script_file_local, "w") as fp:
+            fp.write(script_data)
+
+        self._install_script_file(file_identifier=self._script_file_name.split(".")[0],
+                                  file_name=self._script_file_name,
+                                  file_location=self._tdml_tmp_dir)
+
+    def _get_dataframe_related_args_and_their_columns(self, kwargs):
+        """
+        Get dataframe related arguments and return all their column names from kwargs.
+        """
+        __data_columns = []
+        __data_args_dict = OrderedDict()
+
+        # Separate dataframe related arguments and their column names from actual kwargs.
+        for k, v in kwargs.items():
+            if isinstance(v, DataFrame):
+                # All dataframes should be select of parent dataframe.
+                _validate_df_query_type(v, "select", k)
+
+                # Save all columns in dataframe related arguments.
+                __data_columns.extend(v.columns)
+
+                __data_args_dict[k] = v
+
+        return __data_args_dict, __data_columns
+
+    def _process_data_for_funcs_returning_objects(self, kwargs):
+        """
+        Internal function to process all arguments, assign self._data_args and self._tdml_df,
+        and return
+        1. dictionary of elements (needed to replace in the script template file)
+        2. partition columns list.
+        """
+        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
+        if partition_cols:
+            kwargs.pop("partition_columns")
+
+        self._data_args, __data_columns = self._get_dataframe_related_args_and_their_columns(kwargs)
+
+        arg_names_for_dfs = list(self._data_args.keys())
+
+        # Get common parent dataframe from all dataframes.
+        self._tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self._data_args.values()))
+
+        self._tdml_df = self._tdml_df.select(__data_columns + partition_cols)
+
+        self._validate_existence_of_partition_columns(partition_cols, self._tdml_df.columns, arg_names_for_dfs)
+
+        self._tdml_df, partition_cols = self._get_data_and_data_partition_columns(self._tdml_df,
+                                                                                  __data_columns,
+                                                                                  [],
+                                                                                  partition_cols
+                                                                                  )
+
+        # Prepare string of data arguments with name, indices where the columns of that argument
+        # reside and types of each of the columns.
+        data_args_str = self._prepare_data_args_string(kwargs)
+
+        # Get indices of partition_columns and types of all columns.
+        data_column_types_str, partition_indices_str, _, partition_cols = \
+            self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                         partition_cols,
+                                                                         types_delim=None,
+                                                                         idx_delim=None)
+
+        replace_dict = {"<partition_cols_indices>": str(partition_indices_str),
+                        "<types_of_data_cols>": str(data_column_types_str),
+                        "<data_args_info_str>": f"'{data_args_str}'"}
+
+        return replace_dict, partition_cols
+
+    def _validate_equality_of_partition_values(self, fit_values, trans_values):
+        """
+        Internal function to check that the partition values in fit() and predict() are the same.
+        """
+        if len(fit_values) != len(trans_values):
+            return False
+
+        for val in fit_values:
+            if not all([val in trans_values]):
+                return False
+
+        return True
+
+    def _get_non_data_related_args_from_kwargs(self, kwargs):
+        """
+        Get all non-data related arguments from kwargs.
+        """
+        non_data_related_args = {}
+        for k, v in kwargs.items():
+            if not isinstance(v, DataFrame):
+                non_data_related_args[k] = v
+        non_data_related_args.pop("partition_columns", None)
+        return non_data_related_args
+
+    def _read_from_template_and_write_dict_to_file(self, template_file, replace_dict,
+                                                   output_script_file_name=None):
+        """
+        Read the template file, replace the keys with values and write to a new file.
+        """
+        with open(os.path.join(self._scripts_path, template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)
+
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as fp:
+            fp.write(script_data)
+
+    def _generate_script_file_from_template_file(self, kwargs, template_file, func_name,
+                                                 output_script_file_name=None):
+        """
+        Internal function to generate a script file from a template file. It just adds the
+        non-data related arguments to the template file and writes the contents to a new file,
+        so that these arguments are available in the script file for running the function
+        "func_name".
+        """
+        # Take out all non-data related arguments to write to template file.
+        non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)
+
+        # Read template file and write the contents to new file with non-data related arguments.
+        template_f = os.path.join(self._scripts_path, template_file)
+        with open(template_f, "r") as f:
+            template = f.read()
+
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as f:
+            f.write("import json\n")
+            f.write(f"params = json.loads('{json.dumps(non_data_related_args)}')\n")
+            f.write(template)
+
+        kwargs["file_name"] = output_script_file_name
+        kwargs["name"] = func_name
+
+    def _remove_data_related_args_from_kwargs(self, kwargs):
+        """
+        Internal function to remove data related arguments from kwargs.
+        """
+        kwargs.pop("data", None)
+        kwargs.pop("feature_columns", None)
+        kwargs.pop("group_columns", None)
+        kwargs.pop("partition_columns", None)
+        kwargs.pop("label_columns", None)
+
+    def _convert_pos_args_to_kwargs_for_function(self, pos_args, kwargs, func_name):
+        """
+        Internal function to convert positional arguments to keyword arguments.
+        """
+        fn = getattr(getattr(import_module(self.module_name), self.class_name), func_name)
+        kwargs.update(zip(fn.__code__.co_varnames[1:], pos_args))
+
+    def _install_model_and_script_files(self, file_name, file_location):
+        """
+        Internal function to install model and script files to Vantage.
+        """
+        self._install_initial_model_file()
+        self._install_script_file(file_identifier=file_name.split(".")[0],
+                                  file_name=file_name,
+                                  is_binary=False,
+                                  file_location=file_location)
+
+    def _assign_fit_variables_after_execution(self, data, partition_columns, label_columns):
+        """
+        Internal function to assign fit related variables.
+        """
+        # Extract sklearn object(s) from the model(s) depending on the number of unique
+        # partitioning values.
+        self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
+                                 n_partition_cols=len(partition_columns))
+
+        # Need these label column types in prediction.
+        self._fit_label_columns_types = []
+        self._fit_label_columns_python_types = []
+
+        for l_c in label_columns:
+            column_data = data._td_column_names_and_sqlalchemy_types[l_c.lower()]
+            self._fit_label_columns_types.append(column_data)
+            self._fit_label_columns_python_types.append(column_data.python_type.__name__)
+
+        # If the model is trained a second time after the object creation,
+        # or if set_params() is called after the first model training,
+        # this flag will reset to False. So that for subsequent predict/score
+        # operations, the newly trained model will be installed.
+        if self._is_trained_model_installed:
+            self._is_trained_model_installed = False

 class _OpenSourceObjectWrapper(_GenericObjectWrapper):
     # This has to be set for every package which subclasses this class.
     OPENSOURCE_PACKAGE_NAME = None

     def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
-        if not model and not module_name and not class_name:
+        if model is None and not module_name and not class_name:
             raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
                                                            "module_name and class_name"),
                                       MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)
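
To make the string format assembled by _prepare_data_args_string() concrete, here is a hypothetical illustration (the argument names, column indices and type names are invented for this sketch, not taken from the diff):

    # Suppose self._data_args holds two DataFrame arguments: "data" spanning
    # parent columns 0 and 1 (both FLOAT) and "object" spanning column 2
    # (INTEGER). The helper would then return:
    data_args_str = "data-0,1-FLOAT,FLOAT--object-2-INTEGER"
    # i.e. "<arg_name>-<indices>-<types>" entries joined by "--".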
@@ -319,24 +564,224 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         self.pos_args = pos_args if pos_args is not None else tuple()

         self._fit_label_columns_types = None
+        self._fit_label_columns_python_types = None
         self._table_name_prefix = None

         self._is_default_partition_value_fit = True  # False when the user provides partition columns.
         self._fit_partition_colums_non_default = None
         self._is_default_partition_value_predict = True  # False when the user provides partition columns.

-    def _validate_equality_of_partition_values(self, fit_values, trans_values):
+    def __repr__(self):
+        if self._is_default_partition_value_fit:
+            # Single model use case.
+            return self.modelObj.__repr__()
+
+        pd.set_option("display.expand_frame_repr", None)
+        pd.set_option("display.max_colwidth", None)
+        opt = self.modelObj.__repr__()
+        pd.reset_option("display.expand_frame_repr")
+        pd.reset_option("display.max_colwidth")
+        return opt
+
+    def _initialize_object(self):
         """
-        Internal function to compare the partition values in fit() and predict() are same.
+        Internal function to initialize sklearn object from module name and class name.
         """
-        if len(fit_values) != len(trans_values):
-            return False
+        # Needed when writing imported modules to generated file. TODO: Remove later.
+        imported_args = {}
+        # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
+        # corresponding sklearn object.
+        _partition_column_names = None
+        if "partition_columns" in self.kwargs:
+            self._fit_partition_colums_non_default = self.kwargs["partition_columns"]
+            self._is_default_partition_value_fit = False
+            _partition_column_names = self._fit_partition_colums_non_default

-        for val in fit_values:
-            if not all([val in trans_values]):
-                return False

-        return True
+        new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
+        new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
+
+        # Create model object from new positional and keyword arguments.
+        class_obj = getattr(import_module(self.module_name), self.class_name)
+        if new_sklearn_pos_args:
+            self.modelObj = class_obj(*new_sklearn_pos_args, **new_sklearn_kwargs)
+        else:
+            self.modelObj = class_obj(**new_sklearn_kwargs)
+
+        # All arguments are moved to kwargs and pos_args is kept empty.
+        # Might help in set_params() bug fix.
+        self.pos_args = tuple()
+        _arguments = self.modelObj.__dict__
+
+        if hasattr(self.modelObj, "get_params"):
+            # Update kwargs that are both in modelObj and get_params() as there are
+            # some classes which return other internal variables also.
+            # Hence, filtering them using get_params().
+            for k, v in _arguments.items():
+                if type(v).__name__ in ["function", "generator"]:
+                    # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
+                    # are not supported yet due to pickling issue.
+                    continue
+                if self.get_params():
+                    if k in self.get_params():
+                        self.kwargs[k] = v
+                else:
+                    _model_init_arguments = None
+                    try:
+                        _model_init_arguments = self.modelObj.__init__.__code__.co_varnames
+                    except AttributeError:
+                        pass
+                    if _model_init_arguments:
+                        self.kwargs = dict((k, v) for k, v in _arguments.items() if k in _model_init_arguments)
+                    else:
+                        self.kwargs = _arguments
+        else:
+            # Model selection classes will not have `get_params`, in which case modelObj's __dict__
+            # is saved as kwargs.
+            self.kwargs = _arguments
+
+        if _partition_column_names:
+            self.kwargs["partition_columns"] = _partition_column_names
+
+    def _initialize_variables(self, table_name_prefix):
+        """
+        Internal function to initialize variables used in this class.
+        """
+        self.feature_names_in_ = None
+        self._table_name_prefix = table_name_prefix
+        self._model_file_name_prefix = _generate_new_name(type="file")
+        self.model_file_paths_local = set()
+
+        self._fit_execution_time = None
+        self._fit_predict_execution_time = None
+        self._partial_fit_execution_time = None
+        self._predict_execution_time = None
+        self._transform_execution_time = None
+        self._score_execution_time = None
+
+        # Set to partition columns when training is done with partition columns.
+        self._fit_partition_colums_non_default = None
+
+        self._is_model_installed = False
+        self._fit_partition_unique_values = [[self._default_data_partition_value]]
+
+    def _get_returning_df(self, script_df, partition_column, returns):
+        """
+        Internal function to return the teradataml DataFrame without the
+        partition_column.
+        """
+        if self._is_default_partition_value_fit:
+            # For single model case, partition column is internally generated
+            # and no point in returning it to the user.
+
+            # Extract columns from return types.
+            returning_cols = [col[0] for col in returns[len(partition_column):]]
+            return script_df.select(returning_cols)
+        return script_df
+
+    def modify_args(self, fp1, arg, imported_args):
+        """
+        Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
+        of opensourceML is present in the argument "arg" and modify it to the corresponding
+        sklearn object.
+        This function can also be used to write import statements to file (if "fp1" is not
+        None). Update "imported_args" dictionary with imported module and class name to avoid
+        importing the same module and class again when writing to file. This is useful when we
+        want to generate a script from a template file.
+        Pass None to "fp1" if we don't want to write to file and just modify the opensourceML
+        sklearn object to the corresponding sklearn object.
+        """
+        if isinstance(arg, type(self)):
+            imported_tuple = (arg.module_name, arg.class_name)
+            already_imported = imported_args.get(imported_tuple, False)
+            if not already_imported:
+                imported_args[imported_tuple] = True
+                if fp1:
+                    fp1.write(f"from {arg.module_name} import {arg.class_name}\n")
+            self.modify_args(fp1, arg.pos_args, imported_args)
+            self.modify_args(fp1, arg.kwargs, imported_args)
+            return arg.modelObj
+        elif isinstance(arg, list):
+            return [self.modify_args(fp1, val, imported_args) for val in arg]
+        elif isinstance(arg, tuple):
+            return tuple([self.modify_args(fp1, val, imported_args) for val in arg])
+        elif type(arg).__name__ == "generator":
+            # Raising exception as generator object can't be pickled.
+            # TODO: ELE-6351 - Find ways to pickle generator object later.
+            raise ValueError("Generator type/iterator is not supported for any argument. "\
+                             "Support will be added later.")
+        elif type(arg).__name__ == "function":
+            # Raising exception as functions/lambda functions can't be pickled.
+            # TODO: ELE-6351 - Find ways to pickle functions later.
+            raise ValueError("Functions are not supported for any argument. "\
+                             "Support will be added later.")
+        elif isinstance(arg, dict):
+            return dict(
+                (
+                    self.modify_args(fp1, k, imported_args),
+                    self.modify_args(fp1, v, imported_args),
+                )
+                for k, v in arg.items() if k != "partition_columns"
+            )
+        # elif arg == "partition_columns":
+
+        else:
+            return arg
+
+    def _install_initial_model_file(self, use_dummy_initial_file=False):
+        """
+        If model file(s) is/are not installed in Vantage, then install it/them.
+        """
+        if isinstance(self.modelObj, pd.DataFrame):
+            # Get list of unique partition values and corresponding model object as dict.
+            partition_values_model_dict = {}
+            obj_list = self.modelObj.values.tolist()
+            for lst in obj_list:
+                partition_values_model_dict[tuple(lst[:len(self._fit_partition_colums_non_default)])] = \
+                    lst[len(self._fit_partition_colums_non_default)]
+
+        for partition in self._fit_partition_unique_values:
+            # Create a new file with file name with partition values and
+            # dump sklearn object into it. Finally install the file to Vantage.
+            partition_join = "_".join([str(x) for x in partition])
+            file_name = f"{self._model_file_name_prefix}_{partition_join}"
+            # Replace '-' with '_' as '-' can't be present in file identifier.
+            # Needed this replace because partition_columns can be negative.
+            file_name = file_name.replace("-", "_")
+            full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
+            with open(full_file_name, "wb+") as fp:
+                # Write sklearn object to file.
+                if isinstance(self.modelObj, pd.DataFrame):
+                    # If multiple models, then write the model corresponding to the partition value.
+                    fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
+                else:
+                    if use_dummy_initial_file:
+                        fp.write(pickle.dumps("abc"))
+                    else:
+                        fp.write(pickle.dumps(self.modelObj))
+            self.model_file_paths_local.add(file_name)
+
+            self._install_script_file(file_identifier=file_name,
+                                      file_name=file_name,
+                                      is_binary=True,
+                                      file_location=self._tdml_tmp_dir)
+
+            if self._is_lake_system:
+                # Need to pass env_name along with file_name for cleaning up the files in env.
+                obj = f"{self._env.env_name}::{file_name}"
+                if installed_model_files[obj] == 0:
+                    # Add to GC for the first time the model file (along with env name) is encountered.
+                    installed_model_files[obj] = 1
+                    GarbageCollector._add_to_garbagecollector(object_name=obj,
+                                                              object_type=TeradataConstants.TERADATA_APPLY)
+            else:
+                if installed_model_files[file_name] == 0:
+                    # Add to GC for the first time the model file is encountered.
+                    installed_model_files[file_name] = 1
+                    GarbageCollector._add_to_garbagecollector(object_name=file_name,
+                                                              object_type=TeradataConstants.TERADATA_SCRIPT)
+
+        self._is_model_installed = True

     def _validate_unique_partition_values(self, data, partition_columns):
         """
@@ -361,25 +806,61 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):

         if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
             raise TeradataMlException(
-                Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING),
+                Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING, "training", "test"),
                 MessageCodes.PARTITION_VALUES_NOT_MATCHING
             )

     def fit(self, **kwargs):
         pass

+    def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
+        """
+        Internal function to convert all OpensourceML related objects in arguments to the
+        underlying model objects.
+        """
+        if isinstance(args, dict):
+            new_args = args.copy()  # To avoid updating
+            for k, v in new_args.items():
+                if isinstance(v, type(self)):
+                    if idx_multi_model is not None:
+                        # single model. This argument is set only when modelObj is single model.
+                        new_args[k] = v.modelObj
+                    else:
+                        # multi-model. Get appropriate model from modelObj.
+                        new_args[k] = v.modelObj.iloc[idx_multi_model]["model"]
+                else:
+                    new_args[k] = v
+            return new_args
+
+        # If args is tuple, convert all elements to underlying model object.
+        elif isinstance(args, tuple):
+            new_args = tuple()
+            for arg in args:
+                if isinstance(arg, type(self)):
+                    if idx_multi_model is None:
+                        # single model. This argument is set only when modelObj is single model.
+                        new_args += (arg.modelObj,)
+                    else:
+                        # multi-model. Get appropriate model from modelObj.
+                        new_args += (arg.modelObj.iloc[idx_multi_model]["model"],)
+                else:
+                    new_args += (arg,)
+            return new_args
+        return args
+
     def __get_obj_attributes_multi_model(self, name):
         """
         Internal function to get attributes of all sklearn model objects when multiple models are
         generated by fit.
         """

-        def __generate_model_object(model_obj_value):
+        def __generate_model_object(model_obj_value, init_model_obj):
             """
             Internal function to generate _SkLearnWrapperObject model object from model_obj_value.
             """
             # Create _SkLearnObjectWrapper object from opensource model object.
-            model_obj = self.__class__(model=first_atrribute_instance)
+            model_obj = self.__class__(model=init_model_obj)
+
             model_obj.modelObj = model_obj_value
             model_obj._is_model_installed = True

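The PARTITION_VALUES_NOT_MATCHING message now names the two sides being compared ("training" and "test"). A hedged sketch of the partitioned multi-model flow this guards, with invented table and column names (exact argument shapes may differ from this sketch):

    # One model is trained per distinct value of the partition column, so
    # predict() must see the same partition values that fit() saw.
    from teradataml import DataFrame, td_sklearn

    train = DataFrame("sales_train")    # hypothetical table
    model = td_sklearn.LinearRegression()
    model.fit(X=train.select(["f1", "f2"]), y=train.select(["y1"]),
              partition_columns="region")
    model.predict(X=DataFrame("sales_test").select(["f1", "f2"]),
                  partition_columns="region")   # must contain the same regions
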
@@ -396,13 +877,34 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         multi_models = self.modelObj.copy()
         for i in range(multi_models.shape[0]):
             curr_model = multi_models.iloc[i]["model"]
-            multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
+            partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
+            partition_values = "_".join([str(x) for x in partition_values])
+            if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
+                # filename is first argument.
+                kwargs1 = kwargs.copy()
+                c1 = c
+
+                if len(c) > 0:
+                    c1 = list(c1)
+                    c1[0] = f"{c1[0]}_{partition_values}"
+                    c1 = tuple(c1)
+                if len(kwargs) > 0 and kwargs.get("filename", None):
+                    kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values}"
+
+                multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c1, i),
+                                                                        **self._convert_arguments_to_modelObj(kwargs1, i))
+            else:
+                multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c, i),
+                                                                        **self._convert_arguments_to_modelObj(kwargs, i))

-        first_function_instance = multi_models.at[0, "model"]
-        if self.__class__._validate_model_supportability(first_function_instance):
-            return __generate_model_object(multi_models)
+        first_function_value = multi_models.at[0, "model"]
+        if self.__class__._validate_model_supportability(first_function_value):
+            return __generate_model_object(multi_models, init_model_obj=first_function_value)

-        return multi_models.rename(columns={"model": name})
+        multi_models = multi_models.rename(columns={"model": name})
+
+        # Select only partition columns and the attribute column.
+        return multi_models[self._fit_partition_colums_non_default + [name]]

         # Assuming that self.modelObj will have at least 1 row.

@@ -420,15 +922,15 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             output_attributes.at[i, "model"] = getattr(model, name)

         if self.__class__._validate_model_supportability(first_atrribute_instance):
-            return __generate_model_object(output_attributes)
+            return __generate_model_object(output_attributes, init_model_obj=first_atrribute_instance)

         return output_attributes.rename(columns={"model": name})

     def __getattr__(self, name):
-        # This just runs attributes (functions and properties) from sklearn object.
+        # This just runs attributes (functions and properties) from opensource (sklearn/lightgbm) objects.
         def __sklearn_method_invoker(*c, **kwargs):
-            # sklearn model is returned from the function call. Create _SkLearnObjectWrapper object.
-            model_obj = attribute_instance(*c, **kwargs)
+            # An opensource model is returned from the function call. Create an _OpenSourceObjectWrapper object.
+            model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c), **self._convert_arguments_to_modelObj(kwargs))
             if self.__class__._validate_model_supportability(model_obj):
                 model_obj = self.__class__(model=model_obj)
                 model_obj._is_model_installed = True  # Trained model is returned by function call.
@@ -636,234 +1138,63 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         EXAMPLES:
             >>> from teradataml import td_sklearn
             >>> model = td_sklearn.LinearRegression(normalize=True)
-            >>> model
-            LinearRegression(normalize=True)
-
-            # Example 1: Deploy the model held by interface object to Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2")
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-
-            # Example 2: Deploy the model held by interface object to Vantage with the name same
-            # as that of model that already existed in Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
-            Model is deleted.
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-        """
-
-        # Install model file into Vantage, if not installed.
-        self._install_initial_model_file()
-
-        self._save_model(model_name, replace_if_exists)
-        return self
-
-
-class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
-
-    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
-
-    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
-        super().__init__(model=model, module_name=module_name, class_name=class_name,
-                         pos_args=pos_args, kwargs=kwargs)
-
-        self._initialize_variables()
-        if model:
-            self.modelObj = model
-            self.module_name = model.__module__.split("._")[0]
-            self.class_name = model.__class__.__name__
-            # __dict__ gets all the arguments as dictionary including default ones and positional
-            # args.
-            self.kwargs = model.__dict__
-            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
-        else:
-            self._initialize_object()
-
-    def __repr__(self):
-        if self._is_default_partition_value_fit:
-            # Single model use case.
-            return self.modelObj.__repr__()
-
-        pd.set_option("display.expand_frame_repr", None)
-        pd.set_option("display.max_colwidth", None)
-        opt = self.modelObj.__repr__()
-        pd.reset_option("display.expand_frame_repr")
-        pd.reset_option("display.max_colwidth")
-        return opt
-
-    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
-                                    skip_either_or_that=False):
-        """
-        Internal function to validate arguments passed to exposed opensource APIs and return
-        parent DataFrame, feature columns, label columns, group columns, data partition columns.
-        """
-        _validate_opensource_func_args(X=X, y=y, groups=groups,
-                                       fit_partition_cols=self._fit_partition_colums_non_default,
-                                       kwargs=kwargs,
-                                       skip_either_or_that=skip_either_or_that)
-        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
-                                               fit_partition_cols=self._fit_partition_colums_non_default)
-
-    def _initialize_object(self):
-        """
-        Internal function to initialize sklearn object from module name and class name.
-        """
-        # Needed when writing imported modules to generated file. TODO: Remove later.
-        imported_args = {}
-        # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
-        # corresponding sklearn object.
-        new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
-        new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
-
-        # Create model object from new positional and keyword arguments.
-        class_obj = getattr(import_module(self.module_name), self.class_name)
-        if new_sklearn_pos_args:
-            self.modelObj = class_obj(*new_sklearn_pos_args, **new_sklearn_kwargs)
-        else:
-            self.modelObj = class_obj(**new_sklearn_kwargs)
-
-        # All arguments are moved to kwargs and pos_args is kept empty.
-        # Might help in set_params() bug fix.
-        self.pos_args = tuple()
-        _arguments = self.modelObj.__dict__
-
-        if hasattr(self.modelObj, "get_params"):
-            # Update kwargs that are both in modelObj and get_params() as there are
-            # some classes which return other internal variables also.
-            # Hence, filtering them using get_params().
-            for k, v in _arguments.items():
-                if type(v).__name__ in ["function", "generator"]:
-                    # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
-                    # are not supported yet due to pickling issue.
-                    continue
-                if k in self.get_params():
-                    self.kwargs[k] = v
-        else:
-            # Model selection classes will not have `get_params`, in which case modelObj's __dict__
-            # is saved as kwargs.
-            self.kwargs = _arguments
-
-    def _initialize_variables(self):
-        """
-        Internal function to initialize variables used in this class.
-        """
-        self.feature_names_in_ = None
-        self._table_name_prefix = "td_sklearn_"
-        self._model_file_name_prefix = _generate_new_name(type="file")
-        self.model_file_paths_local = set()
-
-        self._fit_execution_time = None
-        self._fit_predict_execution_time = None
-        self._partial_fit_execution_time = None
-        self._predict_execution_time = None
-        self._transform_execution_time = None
-        self._score_execution_time = None
-
-        # Set to partition columns when training is done with partition columns.
-        self._fit_partition_colums_non_default = None
-
-        self._is_model_installed = False
-        self._fit_partition_unique_values = [[self._default_data_partition_value]]
-
-    def modify_args(self, fp1, arg, imported_args):
-        """
-        Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
-        of opensourceML is present in the argument "arg" and modify it to corresponding sklearn
-        object.
-        This function can also be used to write import statements to file (if "fp1" is not
-        None). Update "imported_args" dictionary with imported module and class name to avoid
-        importing same module and class again when writing to file. This is useful when we want to
-        generate script from template file.
-        Pass None to "fp1" if we don't want to write to file and just modify opensourceML sklearn
-        object to corresponding sklearn object.
-        """
-        if isinstance(arg, type(self)):
-            imported_tuple = (arg.module_name, arg.class_name)
-            already_imported = imported_args.get(imported_tuple, False)
-            if not already_imported:
-                imported_args[imported_tuple] = True
-                if fp1:
-                    fp1.write(f"from {arg.module_name} import {arg.class_name}\n")
-            self.modify_args(fp1, arg.pos_args, imported_args)
-            self.modify_args(fp1, arg.kwargs, imported_args)
-            return arg.modelObj
-        elif isinstance(arg, list):
-            return [self.modify_args(fp1, val, imported_args) for val in arg]
-        elif isinstance(arg, tuple):
-            return tuple([self.modify_args(fp1, val, imported_args) for val in arg])
-        elif type(arg).__name__ == "generator":
-            # Raising exception as generator object can't be pickled.
-            # TODO: ELE-6351 - Find ways to pickle generator object later.
-            raise ValueError("Generator type/iterator is not supported for any argument. "\
-                             "Support will be added later.")
-        elif type(arg).__name__ == "function":
-            # Raising exception as functions/lambda functions can't be pickled.
-            # TODO: ELE-6351 - Find ways to pickle functions later.
-            raise ValueError("Functions are not supported for any argument. "\
-                             "Support will be added later.")
-        elif isinstance(arg, dict):
-            return dict(
-                (
-                    self.modify_args(fp1, k, imported_args),
-                    self.modify_args(fp1, v, imported_args),
-                )
-                for k, v in arg.items()
-            )
-        else:
-            return arg
+            >>> model
+            LinearRegression(normalize=True)

-    def _install_initial_model_file(self):
-        """
-        If model file(s) is/are not installed in Vantage, then install it/them.
+            # Example 1: Deploy the model held by interface object to Vantage.
+            >>> lin_reg = model.deploy("linreg_model_ver_2")
+            Model is saved.
+            >>> lin_reg
+            LinearRegression(normalize=True)
+
+            # Example 2: Deploy the model held by interface object to Vantage with the name same
+            # as that of model that already existed in Vantage.
+            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
+            Model is deleted.
+            Model is saved.
+            >>> lin_reg
+            LinearRegression(normalize=True)
         """
-        if isinstance(self.modelObj, pd.DataFrame):
-            # Get list of unique partition values and corresponding model object as dict.
-            partition_values_model_dict = {}
-            obj_list = self.modelObj.values.tolist()
-            for lst in obj_list:
-                partition_values_model_dict[tuple(lst[:len(lst)-1])] = lst[len(lst)-1]

-        for partition in self._fit_partition_unique_values:
-            # Create a new file with file name with partition values and
-            # dump sklearn object into it. Finally install the file to Vantage.
-            partition_join = "_".join([str(x) for x in partition])
-            file_name = f"{self._model_file_name_prefix}_{partition_join}"
-            # Replace '-' with '_' as '-' can't be present in file identifier.
-            # Needed this replace because partition_columns can be negative.
-            file_name = file_name.replace("-", "_")
-            full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
-            with open(full_file_name, "wb+") as fp:
-                # Write sklearn object to file.
-                if isinstance(self.modelObj, pd.DataFrame):
-                    # If multiple models, then write the model corresponding to the partition value.
-                    fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
-                else:
-                    fp.write(pickle.dumps(self.modelObj))
-            self.model_file_paths_local.add(file_name)
+        # Install model file into Vantage, if not installed.
+        self._install_initial_model_file()

-            self._install_script_file(file_identifier=file_name,
-                                      file_name=file_name,
-                                      is_binary=True,
-                                      file_location=self._tdml_tmp_dir)
+        self._save_model(model_name, replace_if_exists)
+        return self

-            if self._is_lake_system:
-                # Need to pass env_name along with file_name for cleaning up the files in env.
-                obj = f"{self._env.env_name}::{file_name}"
-                if installed_model_files[obj] == 0:
-                    # Add to GC for the first time the model file (along with env name) is encountered.
-                    installed_model_files[obj] = 1
-                    GarbageCollector._add_to_garbagecollector(object_name=obj,
-                                                              object_type=TeradataConstants.TERADATA_APPLY)
-            else:
-                if installed_model_files[file_name] == 0:
-                    # Add to GC for the first time the model file is encountered.
-                    installed_model_files[file_name] = 1
-                    GarbageCollector._add_to_garbagecollector(object_name=file_name,
-                                                              object_type=TeradataConstants.TERADATA_SCRIPT)

-        self._is_model_installed = True
+class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
+
+    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+
+    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+        super().__init__(model=model, module_name=module_name, class_name=class_name,
+                         pos_args=pos_args, kwargs=kwargs)
+
+        self._initialize_variables(table_name_prefix="td_sklearn_")
+        if model is not None:
+            self.modelObj = model
+            self.module_name = model.__module__.split("._")[0]
+            self.class_name = model.__class__.__name__
+            # __dict__ gets all the arguments as dictionary including default ones and positional
+            # args.
+            self.kwargs = model.__dict__
+            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
+        else:
+            self._initialize_object()
+
+    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
+                                    skip_either_or_that=False):
+        """
+        Internal function to validate arguments passed to exposed opensource APIs and return
+        parent DataFrame, feature columns, label columns, group columns, data partition columns.
+        """
+        _validate_opensource_func_args(X=X, y=y, groups=groups,
+                                       fit_partition_cols=self._fit_partition_colums_non_default,
+                                       kwargs=kwargs,
+                                       skip_either_or_that=skip_either_or_that)
+        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
+                                               fit_partition_cols=self._fit_partition_colums_non_default)

     def _run_fit_related_functions(self,
                                    data,
@@ -871,7 +1202,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                    label_columns,
                                    partition_columns,
                                    func,
-                                   classes=None):
+                                   classes=None,
+                                   file_name="sklearn_fit.py"):
         """
         Internal function to run fit() and partial_fit() functions.
         """
@@ -886,8 +1218,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in new_partition_columns] + [("model", model_type)]

-        file_name = "sklearn_fit.py"
-
         if classes:
             class_type = type(classes[0]).__name__
             classes = "--".join([str(x) for x in classes])
@@ -913,20 +1243,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._model_data = self._run_script(data, script_command, new_partition_columns,
                                             return_types)

-        # Extract sklearn object(s) from the model(s) depending on the number of unique partitioning values.
-        self.extract_sklearn_obj(n_unique_partitions=len(self._fit_partition_unique_values),
-                                 n_partition_cols=len(new_partition_columns))
-
-        # Need this label columns types in prediction.
-        self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
-                                         for l_c in label_columns]
-
-        # If the model is trained a second time after the object creation,
-        # or if set_params() is called after the first model training,
-        # this flag will reset to False. So that for subsequent predict/score
-        # operations, the newly trained model will be installed.
-        if self._is_trained_model_installed:
-            self._is_trained_model_installed = False
+        self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)

     def partial_fit(self, X=None, y=None, classes=None, **kwargs):
         """
@@ -974,11 +1291,19 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._is_default_partition_value_fit = False
         self._fit_partition_colums_non_default = partition_columns

-        self._run_fit_related_functions(data,
-                                        feature_columns,
-                                        label_columns,
-                                        partition_columns,
-                                        inspect.stack()[0][3])
+        file_name = kwargs.pop("file_name", None)
+        func_name = kwargs.pop("name", "fit")
+
+        args = {"data": data,
+                "feature_columns": feature_columns,
+                "label_columns": label_columns,
+                "partition_columns": partition_columns,
+                "func": func_name}
+
+        if file_name is not None:
+            args["file_name"] = file_name
+
+        self._run_fit_related_functions(**args)

         self._fit_execution_time = time.time() - st_time

@@ -1043,10 +1368,130 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
1043
1368
 
1044
1369
  return super().__getattr__(name)
1045
1370
 
1371
+ def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
1372
+ func_name, **kwargs):
1373
+ """
1374
+ Internal function to handle multi model case for transform function for functions
1375
+ ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of feature_selection module
1376
+ and "Birch" of cluster module.
1377
+ These functions generate multiple models and when transform is applied to each model, it generates
1378
+ output with different number of columns.
1379
+ """
1380
+ skl_objs_dict = {}
1381
+ no_of_unique_partitions = len(self._fit_partition_unique_values)
1382
+ no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
1383
+
1384
+ # Run on 10 rows of data individually using corresponding scikit-learn objects based on paritition value
1385
+ # and get the maximum number of columns and their types.
1386
+ for i in range(no_of_unique_partitions):
1387
+ skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
1388
+
1389
+
1390
+ data = data.select(feature_columns + label_columns + partition_columns)
1391
+ ten_row_data = data.head(10).get_values()
1392
+ X = numpy.array(ten_row_data)
1393
+
1394
+ # For multi-model case, model in one AMP can give more number of columns than other AMPs.
1395
+ # Returns clause can't contain different number of columns in different AMPs. Hence, taking
1396
+ # maximum number of columns and their types from all models.
1397
+ max_no_of_columns = 0
1398
+ max_col_names = []
1399
+ max_col_types = []
1400
+
1401
+ def _get_input_row_without_nans(row):
1402
+ """
1403
+ `inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
1404
+ """
1405
+ X1 = []
1406
+ for _, v in enumerate(row):
1407
+ if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
1408
+ # Add to list when:
1409
+ # - v is None or
1410
+ # - v is string or
1411
+ # - v is not nan or
1412
+ # - if module is impute (which transforms nan values) even though v is nan.
1413
+ X1.append(v)
1414
+ else:
1415
+ # skip nan values.
1416
+ pass
1417
+ return X1
1418
+
1419
+ for i in range(X.shape[0]):
1420
+ # Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
1421
+ partition_values = tuple(X[i, -no_of_partitioning_cols:])
1422
+ skl_obj = skl_objs_dict[partition_values]
1423
+
1424
+ X1 = X[i, :-no_of_partitioning_cols]
1425
+ # Since Nans/NULLs are added in transform for last columns where some models generated
1426
+ # less number of columns, removing Nans/NULLs from the input row for inverse_transform
1427
+ # using function _get_input_row_without_nans().
1428
+ X1 = numpy.array([_get_input_row_without_nans(X1)])
1429
+
1430
+ trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
1431
+
1432
+ no_of_columns = 1
1433
+
1434
+ if trans_opt.shape == (X1.shape[0],):
1435
+ trans_opt = trans_opt.reshape(X1.shape[0], 1)
1436
+
1437
+ if isinstance(trans_opt[0], numpy.ndarray) \
1438
+ or isinstance(trans_opt[0], list) \
1439
+ or isinstance(trans_opt[0], tuple):
1440
+ no_of_columns = len(trans_opt[0])
1441
+
1442
+ col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
1443
+
1444
+ # Get new column sqlalchemy types for pandas df columns of transform output.
1445
+ opt_pd = pd.DataFrame(trans_opt)
1446
+
1447
+ # Get output column types for each column in pandas df from the output of transform
1448
+ # type functions.
1449
+ types = {}
1450
+ for idx in range(no_of_columns):
1451
+ col = list(opt_pd.columns)[idx]
1452
+
1453
+ # Only one row in trans_opt.
1454
+ if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
1455
+ type_ = type(trans_opt[0][idx])
1456
+ else:
1457
+ # only one value in the output.
1458
+ type_ = type(trans_opt[0])
1459
+
1460
+ # If type of the output value (trans_opt) is None, then use `str` as type since
1461
+ # pandas astype() does not accept None type.
1462
+ if type_ is type(None):
1463
+ type_ = str
1464
+
1465
+ # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
1466
+ # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
1467
+ # Error while type casting for column '2'"
1468
+ # Hence, using pd.Int64Dtype() for integer columns with nan values.
1469
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
1470
+
1471
+ # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
1472
+ opt_pd = opt_pd.astype(types)
1473
+
1474
+ # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
1475
+ # TIMESTAMP(timezone=True) else map it according to default value.
1476
+ col_types = [TIMESTAMP(timezone=True)
1477
+ if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
1478
+ else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
1479
+ for key, col_name in enumerate(list(opt_pd.columns))]
+
+            # Different models in the multi-model case can generate different numbers of
+            # output columns (for example, SelectFpr), so the model that generates the
+            # maximum number of columns is taken.
+            if no_of_columns > max_no_of_columns:
+                max_no_of_columns = no_of_columns
+                max_col_names = col_names
+                max_col_types = col_types
+
+        return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
+
     def _get_return_columns_for_function_(self,
                                           data,
                                           feature_columns,
                                           label_columns,
+                                          partition_columns,
                                           func_name,
                                           kwargs):
         """
@@ -1060,7 +1505,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
                      data._td_column_names_and_sqlalchemy_types[col.lower()])
                     for i, col in enumerate(label_columns)]
-        if func_name == "predict":
+
+        if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
             """
             Return predict columns using either label_columns (if provided) or
             self._fit_label_columns_types (if the function is trained using label columns).
@@ -1075,8 +1521,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
                     for i, col_type in enumerate(self._fit_label_columns_types)]

-        data = data.select(feature_columns + label_columns)
-
         ## If the function is not `fit_predict`:
         # take one row of input for transform/other functions and execute it on the client
         # to get the number of columns for the RETURNS clause and their Vantage types.
@@ -1090,8 +1534,20 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             skl_obj = self.modelObj
         else:
             # Multi-model case.
+            if (func_name in ["transform", "inverse_transform"] and
+                self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
+                    (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
+                # Special handling for the multi-model case of the transform function, as these
+                # classes generate transform output with a different number of columns for each
+                # model. NULLs/NaNs therefore have to be added to the columns that are missing
+                # from the transform output of some models.
+                return self._special_handling_multimodel_(data, feature_columns, label_columns,
+                                                          partition_columns, func_name, **kwargs)
+
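
Why the special handling is needed can be seen with SelectFpr alone: models fitted on different partitions may keep different numbers of features (standalone sketch, not wrapper code):

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectFpr

    for seed in (0, 1):
        X, y = make_classification(n_samples=200, n_features=10, random_state=seed)
        sel = SelectFpr(alpha=0.05).fit(X, y)
        # The selected-column count differs per model, so the combined
        # multi-model output must be padded with NULLs/NaNs to a common width.
        print(sel.transform(X).shape)   # e.g. (200, 2) for one seed, (200, 4) for another
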
             skl_obj = self.modelObj.iloc[0]["model"]

+        data = data.select(feature_columns + label_columns)
+
         ten_row_data = data.head(10).get_values()
         X = numpy.array(ten_row_data)
         if label_columns:
@@ -1200,7 +1656,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]

     @_validate_fit_run
-    def _run_function_needing_all_rows(self, X=None, y=None, **kwargs):
+    def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
         """
         Internal function to run functions like score, aic and bic, which need all rows and
         return a single floating-point number as the result.
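
For contrast with the row-wise transform functions, something like `score` consumes the whole dataset and yields one float (illustrative sketch):

    import numpy as np
    from sklearn.linear_model import LinearRegression

    X = np.arange(10, dtype=float).reshape(-1, 1)
    y = 2 * X.ravel() + 1
    est = LinearRegression().fit(X, y)
    print(est.score(X, y))   # a single number (R^2 = 1.0) computed over all rows
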
@@ -1223,8 +1679,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                                           label_columns,
                                                           partition_columns)

-        file_name = "sklearn_score.py"
-
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1260,7 +1714,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return opt

     @_validate_fit_run
-    def _transform(self, X=None, y=None, **kwargs):
+    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
         """
         Internal function to run predict/transform and similar functions, which return
         multiple columns. This function returns the data row along with the generated
@@ -1283,18 +1737,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                                           partition_columns)

         # Since kwargs are passed to transform, remove additional unrelated arguments from kwargs.
-        if "data" in kwargs:
-            kwargs.pop("data")
-        if "feature_columns" in kwargs:
-            kwargs.pop("feature_columns")
-        if "group_columns" in kwargs:
-            kwargs.pop("group_columns")
-        if "partition_columns" in kwargs:
-            kwargs.pop("partition_columns")
-        if "label_columns" in kwargs:
-            kwargs.pop("label_columns")
-
-        file_name = "sklearn_transform.py"
+        self._remove_data_related_args_from_kwargs(kwargs)
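
The body of `_remove_data_related_args_from_kwargs` is not part of this diff; judging from the inline checks it replaces, it presumably looks something like this sketch:

    def _remove_data_related_args_from_kwargs(self, kwargs):
        # Drop teradataml data arguments so that only scikit-learn keyword
        # arguments are forwarded to the underlying function.
        for arg in ("data", "feature_columns", "group_columns",
                    "partition_columns", "label_columns"):
            kwargs.pop(arg, None)
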

         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1304,24 +1747,36 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):

         self._validate_unique_partition_values(data, new_partition_columns)

-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-            f"{self._model_file_name_prefix} {self._is_lake_system}"
+        return_columns_python_types = None
+        if self._fit_label_columns_python_types:
+            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)

         # Return the feature columns along with the transformed columns, because the mapping
         # of feature columns to the transformed columns is not known.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns)]
+        ## 'correct_covariance()' returns a matrix of shape (n_features, n_features).
+        if func_name == "correct_covariance":
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in new_partition_columns]
+        else:
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in (new_partition_columns + feature_columns)]
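
The `correct_covariance` branch exists because that estimator method returns a feature-by-feature matrix rather than one output row per input row (standalone sketch):

    from sklearn.covariance import MinCovDet
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=50, n_features=3, random_state=0)
    mcd = MinCovDet(random_state=0).fit(X)
    print(mcd.correct_covariance(X).shape)   # (3, 3): n_features x n_features
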
         if func_name in ["predict", "decision_function"] and label_columns:
             return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                              for col in label_columns]

-        return_types += self._get_return_columns_for_function_(data,
-                                                               feature_columns,
-                                                               label_columns,
-                                                               func_name,
-                                                               kwargs)
+        output_cols_types = self._get_return_columns_for_function_(data,
+                                                                   feature_columns,
+                                                                   label_columns,
+                                                                   new_partition_columns,
+                                                                   func_name,
+                                                                   kwargs)
+        return_types += output_cols_types
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
+            f"{return_columns_python_types}"

         # Check the trained model installation. If it is not installed,
         # install it and set the flag to True.
@@ -1363,6 +1818,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             return_types += self._get_return_columns_for_function_(data,
                                                                    feature_columns,
                                                                    label_columns,
+                                                                   new_partition_columns,
                                                                    func_name,
                                                                    {})
         else:
@@ -1448,14 +1904,10 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                      skip_either_or_that=True)

         # Remove the data-related kwargs.
-        input_data = kwargs.pop("data", None)
-        partition_cols = kwargs.pop("partition_columns", None)
-        feature_cols = kwargs.pop("feature_columns", None)
-        label_cols = kwargs.pop("label_columns", None)
+        self._remove_data_related_args_from_kwargs(kwargs)

         if partition_columns:
             # kwargs are passed to the kneighbors function, so remove them from kwargs.
-            kwargs.pop("partition_columns")
             self._is_default_partition_value_fit = False

         # Generate a new partition column name.
@@ -1640,161 +2092,69 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):

         return opt

-    def _get_returning_df(self, script_df, partition_column, returns):
-        """
-        Internal function to return the teradataml DataFrame without the
-        partition_column.
-        """
-        if self._is_default_partition_value_fit:
-            # For the single-model case, the partition column is internally generated,
-            # so there is no point in returning it to the user.
-
-            # Extract columns from the return types.
-            returning_cols = [col[0] for col in returns[len(partition_column):]]
-            return script_df.select(returning_cols)
-        return script_df

-
-class _SKLearnFunctionWrapper(_GenericObjectWrapper):
-    def __init__(self, module_name, func_name):
+class _FunctionWrapper(_GenericObjectWrapper):
+    def __init__(self, module_name, func_name, file_type, template_file):
         super().__init__()
-        self.__module_name = module_name
-        self.__func_name = func_name
-        self.__params = None
-        self.__data_args = OrderedDict()
-        self._model_file_name = _generate_new_name(type="file_function", extension="py")
+        self._module_name = module_name
+        self._func_name = func_name
+        self._params = None
+        self._data_args = OrderedDict()
+        self._template_file = template_file
+        self._script_file_name = _generate_new_name(type=file_type, extension="py")

     def __call__(self, **kwargs):
         """
         Run the function with all the arguments passed from the `td_sklearn.<function_name>` function.
         """
-        __data_columns = []
-
-        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
-        if partition_cols:
-            kwargs.pop("partition_columns")
-
-        # Separate dataframe-related arguments and their column names from the actual kwargs.
-        for k, v in kwargs.items():
-            if isinstance(v, DataFrame):
-                # All dataframes should be a select of the parent dataframe.
-                _validate_df_query_type(v, "select", k)
-
-                # Save all columns of the dataframe-related arguments.
-                __data_columns.extend(v.columns)
-
-                self.__data_args[k] = v
-
-        # Get the common parent dataframe from all dataframes.
-        self.__tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self.__data_args.values()))
+        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

-        self._validate_existence_of_partition_columns(partition_cols, self.__tdml_df.columns)
-
-        self.__tdml_df = self.__tdml_df.select(__data_columns + partition_cols)
-
-        self.__tdml_df, partition_cols = self._get_data_and_data_partition_columns(self.__tdml_df,
-                                                                                   __data_columns,
-                                                                                   [],
-                                                                                   partition_cols)
-
-        # Prepare a string of data arguments with the name, the indices where the columns of
-        # that argument reside, and the types of each of the columns.
-        data_args_str = self._prepare_data_args_string(kwargs)
-
-        self.__params = kwargs
-
-        # Get the indices of partition_columns and the types of all columns.
-        data_column_types_str, partition_indices_str, _, partition_cols = \
-            self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)
-
-        script_file_path = f"{self._model_file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{self._model_file_name}"
+        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{self._script_file_name}"

         model_file_prefix = None
         if self._is_lake_system:
-            model_file_prefix = self._model_file_name.replace(".py", "")
+            model_file_prefix = self._script_file_name.replace(".py", "")

         py_exc = UtilFuncs._get_python_execution_path()
-        script_command = (f"{py_exc} {script_file_path} {partition_indices_str} "\
-                          f"{data_column_types_str} {data_args_str} {self._is_lake_system}"\
-                          f" {model_file_prefix}")
+        script_command = f"{py_exc} {script_file_path} {model_file_prefix} {self._is_lake_system}"

         model_type = BLOB() if self._is_lake_system else CLOB()
-        return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in partition_cols] + [(self.__func_name, model_type)]
-
-        # Generate a new file in the .teradataml directory and install it in Vantage.
-        self._prepare_and_install_file()
-
-        self._model_data = self._run_script(self.__tdml_df, script_command, partition_cols, return_types)
-        self._model_data._index_label = None

-        fit_partition_unique_values = self.__tdml_df.drop_duplicate(partition_cols).get_values()
+        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in partition_cols] + [(self._func_name, model_type)]

-        self.extract_sklearn_obj(n_unique_partitions=len(fit_partition_unique_values),
-                                 n_partition_cols=len(partition_cols))
-
-        # File cleanup after processing.
-        os.remove(self._model_file_local)
-        self._remove_script_file(self._model_file_name)
+        replace_dict.update({"<module_name>": self._module_name,
+                             "<func_name>": self._func_name,
+                             "<params>": json.dumps(kwargs)})

-        return self.modelObj
+        # Generate a new file in the .teradataml directory and install it in Vantage.
+        self._prepare_and_install_file(replace_dict=replace_dict)

-    def _prepare_data_args_string(self, kwargs):
-        """
-        Get the column indices and types of each data-related argument in the format:
-        "{<arg_name>-<comma separated indices>-<comma separated types>}--
-         {<arg_name>-<comma separated indices>-<comma separated types>}"
-        """
-        data_args_str = []
-        for arg_name in list(self.__data_args.keys()):
-            # Remove DataFrame arguments from kwargs, which will be passed to Script.
-            kwargs.pop(arg_name)
+        try:
+            self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
+            self._model_data._index_label = None

-            # Get the column indices and their types for each dataframe from the parent dataframe.
-            _, partition_indices_str, partition_types_str, _ = \
-                self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
-                                                                             self.__data_args[arg_name].columns,
-                                                                             idx_delim=",",
-                                                                             types_delim=",")
-
-            # Format: "<arg_name>-<comma separated indices>-<comma separated types>"
-            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
-
-        # Format: "{<arg_name>-<comma separated indices>-<comma separated types>}--
-        #          {<arg_name>-<comma separated indices>-<comma separated types>}"
-        return "--".join(data_args_str)
+            fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

-    def _validate_existence_of_partition_columns(self, partition_columns, all_columns):
-        """
-        Validate that the columns in the "partition_columns" argument are present in one of
-        the given dataframes.
-        """
-        invalid_part_cols = [c for c in partition_columns if c not in all_columns]
+            self._extract_model_objs(n_unique_partitions=len(fit_partition_unique_values),
+                                     n_partition_cols=len(partition_cols))

-        if invalid_part_cols:
-            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
-                                                  ", ".join(invalid_part_cols),
-                                                  "', '".join(list(self.__data_args.keys()))))
+        except Exception:
+            # File cleanup if the script execution fails or the model object cannot be fetched.
+            os.remove(self._script_file_local)
+            self._remove_script_file(self._script_file_name)
+            raise

-    def _prepare_and_install_file(self):
-        """
-        Prepare the function script file from the template file and install it in Vantage.
-        """
-        with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
-            script_data = fp.read()
-        script_data = script_data.replace("<module_name>", self.__module_name).\
-            replace("<func_name>", self.__func_name).replace("<params>", json.dumps(self.__params))
+        # File cleanup after processing.
+        os.remove(self._script_file_local)
+        self._remove_script_file(self._script_file_name)

-        self._model_file_local = os.path.join(self._tdml_tmp_dir, self._model_file_name)
+        return self.modelObj
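
The cleanup flow above boils down to a remove-on-error, remove-on-success pattern; a generic, self-contained sketch:

    import os
    import tempfile

    def run_with_cleanup(action):
        fd, path = tempfile.mkstemp(suffix=".py")
        os.close(fd)
        try:
            result = action(path)   # e.g. run the installed script
        except Exception:
            os.remove(path)         # clean up, then surface the error
            raise
        os.remove(path)             # normal-path cleanup
        return result

    print(run_with_cleanup(os.path.basename))
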

-        with open(self._model_file_local, "w") as fp:
-            fp.write(script_data)
-
-        self._install_script_file(file_identifier=self._model_file_name.split(".")[0],
-                                  file_name=self._model_file_name,
-                                  file_location=self._tdml_tmp_dir)


+class _SKLearnFunctionWrapper(_FunctionWrapper):
+    def __init__(self, module_name, func_name):
+        file_type = "file_fn_sklearn"
+        template_file = "sklearn_function.template"
+        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)