teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +119 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +18 -6
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/sqle/__init__.py +4 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +56 -33
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +12 -5
- teradataml/automl/model_training.py +34 -13
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +64 -40
- teradataml/common/messagecodes.py +13 -3
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +113 -39
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +141 -17
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +5 -5
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +517 -121
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +26 -11
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +2 -2
- teradataml/dbutils/dbutils.py +525 -129
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +317 -1011
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -25
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +2 -2
- teradataml/scriptmgmt/lls_utils.py +63 -26
- teradataml/store/__init__.py +1 -2
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/utils/dtypes.py +47 -0
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +68 -9
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +123 -2
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +79 -75
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/LICENSE-3RD-PARTY.pdf CHANGED
Binary file
teradataml/README.md CHANGED

@@ -17,6 +17,125 @@ Copyright 2024, Teradata. All Rights Reserved.

  ## Release Notes:

+ #### teradataml 20.00.00.04
+ * ##### New Features/Functionality
+   * ###### teradataml OTF Support:
+     * This release enables support for accessing OTF data from teradataml.
+     * Users can now create a teradataml DataFrame on an OTF table, allowing them to use teradataml functions on it.
+     * Example usage below:
+       * Creation of views on OTF/datalake tables is not supported. Hence, users must set `configure.temp_object_type` to `VT` using the statement below.
+         ```configure.temp_object_type = "VT"```
+       * Users need to provide additional information about the datalake while creating the DataFrame. There are two approaches to provide the datalake information:
+         * Approach 1: Using `in_schema()`
+           ```
+           >>> from teradataml.dataframe.dataframe import in_schema
+           # Create an in_schema object to provide additional information about the datalake.
+           >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+           ...                           table_name="datalake_table_name",
+           ...                           datalake_name="datalake")
+           >>> otf_df = DataFrame(in_schema_tbl)
+           ```
+         * Approach 2: Using `DataFrame.from_table()`
+           ```
+           >>> otf_df = DataFrame.from_table(table_name="datalake_table_name",
+           ...                               schema_name="datalake_db",
+           ...                               datalake_name="datalake")
+           ```
+     * Once this DataFrame is created, users can use any DataFrame method or analytics feature/functionality from teradataml with it. Visit the Limitations and Considerations section in the _Teradata Python Package User Guide_ to check supportability.
+       * Note: All further operations create volatile tables in the local database.
+         ```
+         >>> new_df = otf_df.assign(new_col=otf_df.existing_col*2)
+         ```
+   * ###### teradataml: DataFrame
+     * Introduced a new feature, 'Exploratory Data Analysis UI' (EDA-UI), which enhances the teradataml user experience in Jupyter notebooks. The EDA-UI is displayed by default when a teradataml DataFrame is printed in a Jupyter notebook.
+     * Users can control the EDA-UI using the new configuration option `display.enable_ui`; it can be disabled by setting `display.enable_ui` to False (see the sketch below).
+     * New Function
+       * `get_output()` is added to get the result of an Analytic function when executed from the EDA UI.
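A minimal sketch of toggling the new option; it assumes an active teradataml session and an existing teradataml DataFrame `df` (both placeholders here):

```
>>> from teradataml import display
>>> display.enable_ui = False   # suppress the EDA UI; DataFrames print as plain tables
>>> df
>>> display.enable_ui = True    # restore the default interactive view
```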
+
+   * ###### OpensourceML
+     * `td_lightgbm` - A teradataml OpenSourceML module
+       * `deploy()` - Users can now deploy the models created by the lightgbm `Booster` and `sklearn` modules. Deploying a model stores it in Vantage for future use with `td_lightgbm`.
+         * `td_lightgbm.deploy()` - Deploys a lightgbm `Booster` or any `scikit-learn` model trained outside Vantage.
+         * `td_lightgbm.train().deploy()` - Deploys a lightgbm `Booster` object trained within Vantage.
+         * `td_lightgbm.<sklearn_class>().deploy()` - Deploys a lightgbm sklearn class object created/trained within Vantage.
+       * `load()` - Users can load the deployed models back in the current session. This allows users to use the lightgbm functions with the `td_lightgbm` module (see the sketch below).
+         * `td_lightgbm.load()` - Loads a deployed model in the current session.
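A rough sketch of the deploy/load round trip. The parameter names and the model name are assumptions for illustration, not taken from this diff; a connected Vantage session and a previously trained lightgbm model object `lgbm_model` are presumed:

```
>>> from teradataml import td_lightgbm
>>> # Persist a model trained outside Vantage (parameter names are assumed).
>>> td_lightgbm.deploy(model_name="my_lgbm_model", model=lgbm_model)
>>> # Later, or in a new session, bring the deployed model back.
>>> restored = td_lightgbm.load(model_name="my_lgbm_model")
```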
+
+   * ###### FeatureStore
+     * New function `FeatureStore.delete()` is added to drop the Feature Store and the corresponding repo from Vantage.
+
+   * ###### Database Utility
+     * `db_python_version_diff()` - Identifies the difference in Python interpreter major version between the interpreter installed on Vantage and the one in the local user environment.
+     * `db_python_package_version_diff()` - Identifies Python package version differences between the packages installed on Vantage and those in the local user environment (see the sketch below).
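A minimal sketch of the two new utilities; it assumes an active Vantage connection and that they are exported from the top-level teradataml namespace like the other database utilities. The format of the reported differences is not covered by this diff:

```
>>> from teradataml import db_python_version_diff, db_python_package_version_diff
>>> db_python_version_diff()            # compare local vs. in-database Python major version
>>> db_python_package_version_diff()    # compare local vs. in-database package versions
```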
+
+   * ###### BYOM Function
+     * `ONNXEmbeddings()` - Calculates embedding values in Vantage using an embeddings model that was created outside Vantage and stored in ONNX format.
+
+   * ###### teradataml Options
+     * Configuration Options
+       * `configure.temp_object_type` - Allows users to choose between creating volatile tables or views for teradataml internal use. By default, teradataml internally creates views for some of the operations. With the new configuration option, users can opt to create volatile tables instead of views. This provides greater flexibility for users who lack the permission to create views or who need to create views on tables without WITH GRANT permission.
+     * Display Options
+       * `display.enable_ui` - Specifies whether to display the exploratory data analysis UI when a DataFrame is printed. By default, this option is enabled (True), allowing the exploratory data analysis UI to be displayed. When set to False, the exploratory data analysis UI is hidden.
+
+ * ##### Updates
+   * ###### teradataml: DataFrame function
+     * `describe()`
+       * New argument added: `pivot`.
+       * When argument `pivot` is set to False, non-numeric columns are no longer supported for generating statistics. Use `CategoricalSummary` and `ColumnSummary` instead (see the sketch below).
+     * `fillna()` - Accepts a new argument `partition_column` to partition the data and impute null values accordingly.
+     * Optimised performance for `DataFrame.plot()`.
+       * `DataFrame.plot()` will not regenerate the image when run more than once with the same arguments.
+     * `DataFrame.from_table()`: New argument `datalake_name` added to accept a datalake name while creating a DataFrame on a datalake table.
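A short sketch of the updated `describe()`; `df` stands for an existing teradataml DataFrame on a connected system, and the exact shape of the pivoted versus non-pivoted output is not shown in this diff:

```
>>> stats_default = df.describe()          # default behaviour
>>> stats_long = df.describe(pivot=False)  # new argument; numeric columns only in this mode
>>> print(stats_long)
```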
+
+   * ###### teradataml: DataFrame Utilities
+     * `in_schema()`: New argument `datalake_name` added to accept a datalake name.
+
+   * ###### Table Operator
+     * `Apply()` no longer looks for an authentication token by default. An authentication token is now required only if the user wants to update the backend Open Analytics Framework service.
+
+   * ###### Hyper Parameter Tuner
+     * `GridSearch()` and `RandomSearch()` now display a message referring to the `get_error_log()` API when model training fails in HPT.
+
+   * ###### teradataml Options
+     * Configuration Options
+       * `configure.indb_install_location` - Determines the installation location of the In-DB Python package based on the installed RPM version.
+
+   * ###### teradataml Context Creation
+     * `create_context()` - Enables users to create a connection using parameters set in the environment or in a config file, in addition to the previous method. The newly added options help users hide sensitive data from the script (see the sketch below).
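A tentative sketch of the environment-driven connection; the specific environment variable or config file names that `create_context()` reads are not spelled out in this diff, so treat the no-argument call as an assumption:

```
>>> from teradataml import create_context
>>> # With the connection parameters already provided through the environment or a
>>> # config file (names not listed here), no credentials appear in the script itself.
>>> con = create_context()
```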
+
+   * ###### Open Analytics Framework
+     * Enhanced `create_env()` to display a message when an invalid base_env is passed, informing users that the default base_env is being used.
+
+   * ###### OpensourceML
+     * Raises a TeradataMlException if the Python interpreter major version differs between the Vantage Python environment and the local user environment.
+     * Displays a warning if specific Python package versions differ between the Vantage Python environment and the local user environment.
+
+   * ###### Database Utility
+     * `db_list_tables()`: New argument `datalake_name` added to accept the datalake name to list tables from.
+     * `db_drop_table()`:
+       * New argument `datalake_name` added to accept the datalake name to drop tables from.
+       * New argument `purge` added to specify whether to use the `PURGE ALL` or `NO PURGE` clause while dropping a table (see the sketch below).
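A minimal sketch of the datalake-aware utilities; the schema, table, and datalake names are placeholders, and passing `purge` as a boolean is an assumption based on the PURGE ALL / NO PURGE wording above:

```
>>> from teradataml import db_list_tables, db_drop_table
>>> db_list_tables(schema_name="datalake_db", datalake_name="datalake")
>>> db_drop_table(table_name="datalake_table_name", schema_name="datalake_db",
...               datalake_name="datalake", purge=True)
```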
+
+ * ##### Bug Fixes
+   * `td_lightgbm` OpensourceML module: In the multi-model case, `td_lightgbm.Dataset().add_features_from()` should add the features of one partition in the first Dataset to the features of the same partition in the second Dataset. Previously this was not the case and the function failed; this is now fixed.
+   * Fixed a minor bug in `Shap()` and converted the argument `training_method` to a required argument.
+   * Fixed PCA-related warnings in `AutoML`.
+   * `AutoML` no longer fails when data with all categorical columns is provided.
+   * Fixed an `AutoML` issue with the upsampling method.
+   * Excluded the identifier column from outlier processing in `AutoML`.
+   * `DataFrame.set_index()` no longer modifies the original DataFrame's index when the argument `append` is used.
+   * The `concat()` function now supports DataFrames with column names that start with a digit, contain special characters, or contain reserved keywords.
+   * `create_env()` proceeds to install the remaining files even if the current file installation fails.
+   * Corrected the error message raised in `create_env()` when authentication is not set.
+   * Added the missing argument `charset` for Vantage Analytic Library functions.
+   * New argument `seed` is added to `AutoML`, `AutoRegressor` and `AutoClassifier` to ensure consistent results (see the sketch below).
+   * Analytic functions now work even if the column names of the underlying tables contain non-ASCII characters.
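A brief sketch of the new reproducibility argument; `train_df` and the target column name are placeholders and the remaining constructor arguments keep their defaults:

```
>>> from teradataml import AutoClassifier
>>> aml = AutoClassifier(verbose=0, seed=42)   # same seed -> repeatable pipeline decisions
>>> aml.fit(train_df, "target_col")
```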
+
  #### teradataml 20.00.00.03

  * teradataml no longer supports setting the `auth_token` using `set_config_params()`. Users should use `set_auth_token()` to set the token.
teradataml/_version.py CHANGED

teradataml/analytics/analytic_function_executor.py CHANGED

@@ -482,17 +482,20 @@ class _AnlyticFunctionExecutor:

          # Validate column is existed or not in the table.
          _Validators._validate_dataframe_has_argument_columns(
-             arg_value, arg_name, dataframe, target_table_argument_name)
+             arg_value, arg_name, dataframe, target_table_argument_name, case_insensitive=True)

          # Append square brackets for column range when function
          # does not require special case handler.
          arg_value = self._spl_func_obj._add_square_bracket(arg_value)

+         # Check if there are columns with non-ASCII characters.
+         if UtilFuncs._is_ascii(arg_value):
+             arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
          # Handling special case for Teradata reserved keywords or column names with spaces.
          # If argument is a string or list of strings, then add quotes to the string.
-
+         elif arg_name not in ["partition_columns"] and (\
              UtilFuncs._contains_space(arg_value) or list_td_reserved_keywords(arg_value)):
-             arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
+             arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)

          # SequenceInputBy arguments require special processing.
          if 500 <= argument.get_r_order_number() <= 510:

@@ -717,10 +720,17 @@

          kwargs.update(kwargs.pop("generic_arguments", {}))

          # Add all arguments to dynamic class as data members.
+         global_volatile = False
+         if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+             global_volatile = True

          start_time = time.time()
          persist = kwargs.get("persist", False)
-         volatile
+         # Use global volatile only when persist argument is False. If persist argument
+         # is True, then volatile can't be used whether it is global volatile or normal
+         # volatile. If it is normal volatile, then it will raise
+         # `CANNOT_USE_TOGETHER_WITH` error below.
+         volatile = kwargs.get("volatile", global_volatile if not persist else False)
          display_table_name = kwargs.get("display_table_name", True)

          # Validate local_order_column argument type and values.

@@ -1039,7 +1049,8 @@ class _SQLEFunctionExecutor(_AnlyticFunctionExecutor):

          _Validators._validate_dataframe_has_argument_columns(arg_value,
                                                               arg,
                                                               input_table_arg_value,
-                                                              input_table_arg
+                                                              input_table_arg,
+                                                              case_insensitive=True
                                                               )

          order_column_arg_value = UtilFuncs._teradata_collapse_arglist(order_column_arg_value, "\"")

@@ -1491,7 +1502,8 @@ class _TableOperatorExecutor(_SQLEFunctionExecutor):

          _Validators._validate_dataframe_has_argument_columns(hash_column_value,
                                                               hash_column_arg,
                                                               input_table_arg_value,
-                                                              input_table_arg
+                                                              input_table_arg,
+                                                              case_insensitive=True
                                                               )

          # Hash and order by can be used together as long as is_local_order = True.
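The hunk above wires `configure.temp_object_type` into analytic-function execution: with the `VT` option, function results default to volatile output tables unless the caller asks for persistence. A rough illustration, assuming a connected session, an existing teradataml DataFrame `df`, and `Antiselect` as a stand-in for any SQLE function:

```
>>> from teradataml import configure, Antiselect
>>> configure.temp_object_type = "VT"
>>> out_vt = Antiselect(data=df, exclude=["row_id"])                   # volatile output table
>>> out_kept = Antiselect(data=df, exclude=["row_id"], persist=True)   # persist=True wins; output is kept
```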
teradataml/analytics/byom/__init__.py CHANGED

@@ -4,7 +4,7 @@ from teradataml.analytics.byom.PMMLPredict import PMMLPredict

  from teradataml.analytics.meta_class import _AnalyticFunction
  from teradataml.analytics.meta_class import _common_init, _common_dir

- _byom_functions = ['H2OPredict', 'PMMLPredict', 'ONNXPredict', 'DataikuPredict', 'DataRobotPredict']
+ _byom_functions = ['H2OPredict', 'PMMLPredict', 'ONNXPredict', 'DataikuPredict', 'DataRobotPredict', 'ONNXEmbeddings']

  for func in _byom_functions:
      globals()[func] = type("{}".format(func), (_AnalyticFunction,),
teradataml/analytics/valib.py CHANGED

@@ -26,6 +26,8 @@ from teradataml.dataframe.dataframe import DataFrame, in_schema

  from teradataml.utils.validators import _Validators
  from teradataml.analytics.Transformations import Binning, Derive, OneHotEncoder, FillNa, \
      LabelEncoder, MinMaxScalar, Retain, Sigmoid, ZScore
+ from teradataml.common.constants import TeradataReservedKeywords, TeradataConstants
+

  class _VALIB():
      """ An internal class for executing VALIB analytic functions. """

@@ -370,9 +372,16 @@

              self.__get_temp_table_name()
          """
          prefix = "valib_{}".format(self.__tdml_valib_name.lower())
-
-
-
+         tbl_name = UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
+                                                        gc_on_quit=True, quote=False,
+                                                        table_type=TeradataConstants.TERADATA_TABLE)
+         # With VT option, table name is getting generated with 'vt_'.
+         # But it's not getting created as a Volatile table. Hence
+         # explicitly garbage collecting.
+         if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+             GarbageCollector._add_to_garbagecollector(tbl_name,
+                                                       TeradataConstants.TERADATA_TABLE)
+         return tbl_name

      def __process_dyn_cls_output_member(self, arg_name, out_tablename, out_var=None):
          """

@@ -447,6 +456,7 @@

          # Add extension to the table name.
          generated_table_name = "{}{}".format(table_name, extension)

+
          # Register new output table to the GC.
          gc_tabname = "\"{}\".\"{}\"".format(self.__db_name, generated_table_name)
          GarbageCollector._add_to_garbagecollector(gc_tabname, TeradataConstants.TERADATA_TABLE)

@@ -1463,7 +1473,7 @@

          if gen_sql_only:
              valib_inst.__generate_valib_sql_argument_syntax(arg=str(gen_sql_only),
                                                              arg_name="gensqlonly")
-
+         charset = kwargs.pop("charset", None)
          # Raise error if there are additional arguments.
          if len(kwargs) != 0:
              err_ = "The keyword arguments for Overlap() should have data1, data2, ..., dataN " \

@@ -1478,6 +1488,10 @@

                                                              arg_name="tablename")
          valib_inst.__generate_valib_sql_argument_syntax(arg=",".join(column_names_df),
                                                          arg_name="columns")
+         # Generate clause of charset.
+         if charset:
+             valib_inst.__generate_valib_sql_argument_syntax(arg=charset,
+                                                             arg_name="charset")

          return valib_inst._execute_valib_function(skip_data_arg_processing=True,
                                                    skip_other_arg_processing=True)
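The Overlap path above is where the new `charset` keyword is consumed. A sketch of how it might be passed through the VALIB interface; `df1` and `df2` are placeholder DataFrames sharing a key column, and the keyword name is the only part confirmed by this diff:

```
>>> from teradataml import valib
>>> res = valib.Overlap(data1=df1, data2=df2, columns="cust_id", charset="UNICODE")
>>> res.result
```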
teradataml/automl/__init__.py CHANGED

@@ -30,7 +30,7 @@ from teradataml import ColumnExpression

  from teradataml.dataframe.dataframe import DataFrame
  from teradataml.utils.utils import execute_sql
  from teradataml.utils.validators import _Validators
- from teradataml import ROC, BLOB
+ from teradataml import ROC, BLOB, VARCHAR
  from teradataml.utils.dtypes import _Dtypes
  from teradataml.common.utils import UtilFuncs
  from teradataml import TeradataMlException

@@ -94,6 +94,9 @@ class AutoML:

          the processes by passing the JSON file path in case of custom run. It also
          supports early stopping of model training based on stopping metrics,
          maximum running time and maximum models to be trained.
+         Note:
+             * configure.temp_object_type="VT" follows sequential execution.
+

      PARAMETERS:
          task_type:

@@ -187,6 +190,12 @@

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          Instance of AutoML.

@@ -417,9 +426,11 @@

          volatile = kwargs.get('volatile', False)
          persist = kwargs.get('persist', False)
+         seed = kwargs.get('seed', 42)

          arg_info_matrix.append(["volatile", volatile, True, (bool)])
          arg_info_matrix.append(["persist", persist, True, (bool)])
+         arg_info_matrix.append(["seed", seed, True, (int)])

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_matrix)

@@ -517,7 +528,7 @@

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_fit_matrix)
-
+
          # Initializing class variables
          self.data = data
          self.target_column = target_column

@@ -758,11 +769,12 @@

          if self.target_column_ind:
              prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
              probability_column = 'prob_1'
+             pred_target_count = pred.result.drop_duplicate(self.target_column).size
              # Displaying confusion matrix and ROC-AUC for classification problem
              if self.is_classification_type():
                  print_data = lambda data: print(data) if _is_terminal() else display(data)
                  # Displaying ROC-AUC for binary classification
-                 if self.target_count == 2:
+                 if self.target_count == 2 and pred_target_count == 2:
                      fit_params = {
                          "probability_column" : probability_column,
                          "observation_column" : self.target_column,

@@ -886,8 +898,8 @@

          # as it is required for evaluation.
          if self.target_column not in data.columns:
              raise TeradataMlException(
-
-
+                 Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
+                 MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)

          # Checking if data is already transformed before or not
          data_node_id = data._nodeid

@@ -1234,6 +1246,8 @@

          pca.n_components_ = load_pca_info['n_components']
          pca.noise_variance_ = load_pca_info['noise_variance']
          pca.singular_values_ = np.array(load_pca_info['singular_values'])
+         pca.feature_names_in_ = data_params['pca_fit_columns']
+         pca.n_features_in_ = len(data_params['pca_fit_columns'])

          data_params['pca_fit_instance'] = pca

@@ -1442,7 +1456,8 @@

          # Saving data transformation parameters to the specified table
          sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)

-         copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB
+         copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
+                                                                                        'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})

          print('Model Deployment Completed Successfully.')

@@ -1945,6 +1960,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model

              Default Value: False
              Types: bool

+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int
+
      RETURNS:
          a tuple containing, model information and leaderboard.
      """

@@ -2103,6 +2124,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          a tuple containing, model information and leaderboard.

@@ -2324,6 +2351,9 @@ class AutoRegressor(AutoML):

      """
      DESCRIPTION:
          AutoRegressor is a special purpose AutoML feature to run regression specific tasks.
+         Note:
+             * configure.temp_object_type="VT" follows sequential execution.
+

      PARAMETERS:
          include:

@@ -2407,6 +2437,12 @@ class AutoRegressor(AutoML):

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          Instance of AutoRegressor.

@@ -2555,6 +2591,9 @@ class AutoClassifier(AutoML):

      """
      DESCRIPTION:
          AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
+         Note:
+             * configure.temp_object_type="VT" follows sequential execution.
+

      PARAMETERS:
          include:

@@ -2638,6 +2677,12 @@ class AutoClassifier(AutoML):

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          Instance of AutoClassifier.
teradataml/automl/data_preparation.py CHANGED

@@ -16,7 +16,6 @@

  # Python libraries
  import numpy as np
  import pandas as pd
- import random
  import time
  import warnings

@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants

  from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.utils.validators import _Validators
- from teradataml import INTEGER
+ from teradataml import configure, INTEGER
+ from teradataml.common.constants import TeradataConstants

- # Control Randomnes
- random.seed(42)
- np.random.seed(42)

  class _DataPreparation:

@@ -117,6 +114,12 @@

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int
          """
          self.data = data
          self.target_column = target_column

@@ -135,7 +138,13 @@

          self.table_name_mapping = {}

          self.data_types = {key: value for key, value in self.data._column_names_and_types}
-
+         self.seed = kwargs.get("seed", 42)
+         # np.random.seed() affects the random number generation in numpy and sklearn
+         # setting this changes the global state of the random number generator
+         # hence, setting the seed only if it is not None
+         if kwargs.get("seed") is not None:
+             np.random.seed(self.seed)
+

      def data_preparation(self,
                           auto = True):

@@ -262,25 +271,24 @@

          outlier_method = "Tukey"

          # List of columns for outlier processing.
-
+         # Excluding target column and excluded columns from outlier processing
+         outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]

-
-
-
-
-
-
-
-
-
-
-         if value
+         if len(outlier_columns) != 0:
+             # Detecting outlier percentage in each columns
+             outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+             # Outlier Handling techniques
+             for i in outlier_percentage_df.itertuples():
+                 # Column Name
+                 col = i[0]
+                 # Outlier value
+                 value = i[1]
+                 # Dropping rows
+                 if value > 0.0 and value <= 8.0 :
                      columns_to_drop_rows.append(col)
-
-
-                 elif value> 8.0 and value <= 25.0:
-                     columns_to_impute.append(col)
+                 elif value > 8.0 and value <= 25.0:
+                     columns_to_impute.append(col)

          return columns_to_drop_rows, columns_to_impute

@@ -489,7 +497,7 @@

          train_data = pca_train.drop(columns=['id', self.target_column], axis=1)

          # Initialize and fit PCA
-         pca = PCA()
+         pca = PCA(random_state=self.seed)
          pca.fit(train_data)

@@ -497,7 +505,7 @@

          n = np.argmax(np.cumsum(variance) >= 0.95) + 1

          # Create a new instance of PCA with the optimal number of components
-         pca = PCA(n_components=n, random_state=
+         pca = PCA(n_components=n, random_state=self.seed)

          # Apply PCA on dataset
          X_train_pca = pca.fit_transform(train_data)

@@ -571,7 +579,7 @@

          # Random forest for RFE model
          RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-         rf = RFModel(n_estimators=100, random_state=
+         rf = RFModel(n_estimators=100, random_state=self.seed)

          # Determine the scoring metric based on the number of unique classes
          score = 'r2' if not self.is_classification_type() \

@@ -665,10 +673,10 @@

                  scoring_metric = 'roc_auc'
              else:
                  scoring_metric = 'f1_macro'
-             estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=
+             estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
              parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
          else:
-             estimator = Lasso(random_state=
+             estimator = Lasso(random_state=self.seed)
              parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
              scoring_metric = "r2"

@@ -679,7 +687,7 @@

          # Applying hyperparameter tuning and optimizing score
          hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-
+                                              scoring=scoring_metric, verbose=0)

          # Fitting the best result from hyperparameter
          hyperparameter_search.fit(train_features, train_target)

@@ -746,14 +754,20 @@

          train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                                 table_type = TeradataConstants.TERADATA_TABLE,
                                                                 gc_on_quit=not persist)
+         # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+         # table name in fully qualified format.
+         train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
          # Storing the table names in the table name mapping dictionary
          self.table_name_mapping['{}_train'.format(prefix)] = train_table_name

+         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
          # Pushing data into database
          if self.is_classification_type():
-             copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+             copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
          else:
-             copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+             copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)

      def _scaling_features_helper(self,
                                   data=None,

@@ -856,6 +870,7 @@

          # List of columns to copy to the output generated by scale transform
          accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+

          # Scaling dataset
          transform_obj = ScaleTransform(data=data_to_scale,

@@ -867,6 +882,8 @@

                            data=scaled_df,
                            progress_bar=self.progress_bar)
          else:
+             # No columns to scale, Original data will be used
+             scaled_df = data_to_scale
              self._display_msg(msg="No columns to scale.",
                                progress_bar=self.progress_bar)

@@ -915,10 +932,16 @@

          # Assigning data to target dataframe
          target_df = self.data
          # Detecting list of float columns on target dataset
-         float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+         float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]

          if len(float_columns) == 0:
-
+             cols = target_df.columns
+             # Doing reset index to get index column
+             df = target_df.to_pandas().reset_index()
+
+             # Returning the dataframe with cols
+             # to avoid extra columns generated by reset_index()
+             return df[cols]

          # storing the column details for round up in data transformation dictionary
          self.data_transform_dict["round_columns"] = float_columns