teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +119 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +18 -6
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/sqle/__init__.py +4 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +56 -33
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +12 -5
- teradataml/automl/model_training.py +34 -13
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +64 -40
- teradataml/common/messagecodes.py +13 -3
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +113 -39
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +141 -17
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +5 -5
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +517 -121
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +26 -11
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +2 -2
- teradataml/dbutils/dbutils.py +525 -129
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +317 -1011
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -25
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +2 -2
- teradataml/scriptmgmt/lls_utils.py +63 -26
- teradataml/store/__init__.py +1 -2
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/utils/dtypes.py +47 -0
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +68 -9
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +123 -2
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +79 -75
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe_utils.py
CHANGED

@@ -12,13 +12,14 @@ This file implements util functions of data frame.
 """
 
 import numbers
+import re
 import pandas as pd
 from collections import OrderedDict
 
 from teradataml.common.utils import UtilFuncs
 from teradataml.common.aed_utils import AedUtils
 from teradataml.common.constants import AEDConstants, PTITableConstants, \
-    SQLPattern, PythonTypes
+    SQLPattern, PythonTypes, TeradataConstants, SQLConstants
 from teradataml.common.sqlbundle import SQLBundle
 from teradataml.common.exceptions import TeradataMlException
 from teradataml.common.messages import Messages

@@ -30,6 +31,7 @@ from teradataml.dbutils.dbutils import _execute_query_and_generate_pandas_df
 
 from teradataml.options.display import display
 from teradataml.options.configure import configure
+from teradataml.utils.dtypes import _DtypesMappers
 from teradataml.utils.utils import execute_sql
 
 from teradatasqlalchemy.types import FLOAT, NUMBER, DECIMAL, PERIOD_TIMESTAMP

@@ -77,7 +79,10 @@ class DataFrameUtils():
                 is_persist = True
 
             try:
-                if node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or\
+                if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                    UtilFuncs._create_table(view_names[index], queries[index], volatile=True)
+
+                elif node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or\
                    ("OUT TABLE " in queries[index] and SQLPattern.SQLMR.value.match(queries[index])) or \
                    is_persist:
                     # TODO:: OR condition in above needs to be removed once AED support is added.
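The new first branch routes intermediate objects into volatile tables whenever `configure.temp_object_type` selects them, instead of creating views. A minimal sketch of opting in from user code, assuming an established Vantage connection; the string value "VT" is an assumption here, mirroring `TeradataConstants.TERADATA_VOLATILE_TABLE`:

```python
# Sketch: route teradataml's intermediate objects to volatile tables.
# Assumes an established Vantage connection; credentials are hypothetical.
from teradataml import create_context, configure, DataFrame

create_context(host="mycluster.example.com", username="guest", password="please")

configure.temp_object_type = "VT"  # intermediate results become volatile tables

df = DataFrame("sales")
# The grouped aggregate below now materializes through a volatile table.
print(df.groupby("accounts").sum().head())
```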
@@ -1291,43 +1296,32 @@ class DataFrameUtils():
 
         aggregate_expr = ", ".join(select_columns)
         return aggregate_expr, new_column_names, new_column_types
-
+
     @staticmethod
-    def _validate_describe_columns(df, columns, metaexpr, groupby_column_list):
+    def _validate_describe_columns(columns, metaexpr, groupby_column_list):
         """
-        Internal function to validate columns provided to describe()
+        Internal function to validate columns provided to describe() are correct or not,
         when DataFrame is output of groupby and groupby_time.
-
         PARAMETERS:
-            df:
-                Required Argument.
-                Specifies teradataml DataFrame we are collecting statistics for.
-                Types: str
-
             columns:
                 Optional Argument.
                 Specifies the name(s) of columns we are collecting statistics for.
                 Types: str ot List of strings (str)
-
             metaexpr:
                 Required Argument.
                 Specifies the meta expression for the dataframe.
                 Types: _MetaExpression
-
             groupby_column_list:
                 Optional Argument.
                 Specifies the group by columns for the dataframe.
                 Default Values: None.
                 Types: str ot List of strings (str)
-
         Returns:
             None
-
         Raises:
             TeradataMLException
         """
-        invalid_columns = [_column for _column in groupby_column_list if
-                           and _column in columns]
+        invalid_columns = [_column for _column in groupby_column_list if _column in columns]
         if len(invalid_columns) > 0:
             all_columns = [col.name for col in metaexpr.c]
             valid_columns = [item for item in all_columns if item not in groupby_column_list]
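The rewritten comprehension drops the dangling condition and simply flags any requested column that is also a grouping column. The same check in isolation (plain Python, no Vantage connection needed; names are illustrative):

```python
# Sketch of the validation: describe() on a grouped DataFrame must not be
# asked for statistics on the grouping columns themselves.
def validate_describe_columns(columns, all_columns, groupby_column_list):
    invalid_columns = [c for c in groupby_column_list if c in columns]
    if invalid_columns:
        valid_columns = [c for c in all_columns if c not in groupby_column_list]
        raise ValueError("Invalid column(s) {}; valid choices: {}".format(
            invalid_columns, valid_columns))

validate_describe_columns(["gpa"], ["id", "gpa", "admitted"], ["id"])    # passes
# validate_describe_columns(["id"], ["id", "gpa", "admitted"], ["id"])   # raises ValueError
```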
@@ -1849,7 +1843,10 @@ class DataFrameUtils():
             db_schema = UtilFuncs._extract_db_name(tab_name_first)
             db_table_name = UtilFuncs._extract_table_name(tab_name_first)
 
-
+            if db_schema:
+                return DataFrame(in_schema(db_schema, db_table_name))
+
+            return DataFrame(db_table_name)
 
         pids_first = None
         parent_df = None
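This hunk and the next apply the same fix: when the parent table name carries a database prefix, the DataFrame is rebuilt through in_schema() rather than from the bare table name. The shape of that dispatch, assuming a connected teradataml session:

```python
# Sketch: rebuild a DataFrame from a possibly schema-qualified name.
# Assumes a connected teradataml context; names are illustrative.
from teradataml import DataFrame, in_schema

def frame_from_parts(db_schema, db_table_name):
    if db_schema:
        # e.g. DataFrame(in_schema("mydb", "sales"))
        return DataFrame(in_schema(db_schema, db_table_name))
    return DataFrame(db_table_name)
```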
@@ -1865,11 +1862,146 @@ class DataFrameUtils():
                 db_schema = UtilFuncs._extract_db_name(tab_name_first)
                 db_table_name = UtilFuncs._extract_table_name(tab_name_first)
 
-
+                if db_schema:
+                    parent_df = DataFrame(in_schema(db_schema, db_table_name))
+                else:
+                    parent_df = DataFrame(db_table_name)
                 pids_first = pids
             else:
                 if pids_first != pids:
                     raise TeradataMlException(Messages.get_message(MessageCodes.DFS_NO_COMMON_PARENT),
                                               MessageCodes.DFS_NO_COMMON_PARENT)
 
-        return parent_df
+        return parent_df
+
+    @staticmethod
+    def _get_sqlalchemy_type_from_str(td_type):
+        """
+        Function to get teradatasqlalchemy type from string representation of that type.
+
+        PARAMETERS:
+            td_type:
+                Required Argument.
+                Specifies string representation of teradatasqlalchemy type.
+                Types: str
+
+        RAISES:
+            ValueError
+
+        EXAMPLES:
+            >>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("DECIMAL(4,4)")
+            >>> dt
+            DECIMAL(precision=4, scale=4)
+            >>> type(dt)
+            teradatasqlalchemy.types.DECIMAL
+
+            >>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("VARCHAR(32000) CHARACTER SET UNICODE")
+            >>> dt
+            VARCHAR(length=32000, charset='UNICODE')
+            >>> type(dt)
+            teradatasqlalchemy.types.VARCHAR
+        """
+        # 4 groups of pattern:
+        # 1. Type name
+        # 2. Comma separated parameters enclosed in parentheses
+        # 3. Comma separated parameters without parenthesis
+        # 4. Remaining string
+        pattern = "([A-Z0-9_]+)(\((.*)\))?(.*)"
+
+        m = re.match(pattern, td_type)
+        td_str_type = m.group(1)
+        td_str_params = m.group(3)
+        td_str_remain = m.group(4)
+
+        if m is None or td_str_type not in _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER.keys():
+            raise ValueError("Invalid Teradata type: {} from datalake".format(td_type))
+
+        if td_str_type in ["VARCHAR", "CHAR"]:
+            # If VARCHAR or CHAR, extract, length and charset from string.
+            length = int(td_str_params.split(",")[0])
+            charset = td_str_remain.strip().split(" ")[2]
+            return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
+                (length=length, charset=charset)
+
+        if td_str_type in ["BLOB"]:
+            # Ignoring the charset as BLOB does not have it.
+            # If BLOB, extract length from string.
+            length = int(td_str_params.split(",")[0])
+            return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
+                (length=length)
+
+        if td_str_type in ["DECIMAL"]:
+            # If DECIMAL, extract precision and scale from string.
+            args = td_str_params.split(",")
+            return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
+                (precision=int(args[0]), scale=int(args[1]))
+
+        # TODO: Test for other data types once OTF team finalize all data types.
+        return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]()
+
+    @staticmethod
+    def _get_datalake_table_columns_info(schema, table_name, datalake):
+        """
+        Function to get column names and corresponding teradatasqlalchemy types
+        of a datalake table using results of 'help table <datalake>.<db_name>.<table_name>'
+        SQL query.
+
+        PARAMETERS:
+            schema:
+                Required Argument.
+                Specifies name of schema.
+                Types: str
+
+            table_name:
+                Required Argument.
+                Specifies name of table.
+                Types: str
+
+            datalake:
+                Required Argument.
+                Specifies name of datalake.
+                Types: str
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            >>> DataFrameUtils._get_datalake_table_columns_info(table_name = 'sales',
+            ...                                                 schema='otftestdb',
+            ...                                                 datalake='datalake_iceberg_glue')
+            (['id', 'masters', 'gpa', 'stats', 'programming', 'admitted'],
+             [INTEGER(),
+              VARCHAR(length=2000, charset='UNICODE'),
+              FLOAT(),
+              VARCHAR(length=2000, charset='UNICODE'),
+              VARCHAR(length=2000, charset='UNICODE'),
+              INTEGER()])
+        """
+        # Get the column information from the strings type.
+        prepared = preparer(td_dialect())
+        sqlbundle = SQLBundle()
+        full_tbl_name = '{}.{}.{}'.format(prepared.quote(datalake),
+                                          prepared.quote(schema),
+                                          prepared.quote(table_name))
+        help_table_sql = sqlbundle._get_sql_query(SQLConstants.SQL_HELP_TABLE).format(full_tbl_name)
+
+        cur = execute_sql(help_table_sql)
+        td_types_col_index = -1
+        for i, col_metadata in enumerate(cur.description):
+            # Help Table returns column names and
+            # corresponding IcebergType, TeradataInternalType,
+            # TeradataType. We need to extract column index for
+            # 'TeradataType' column.
+            if col_metadata[0].lower() == 'teradatatype':
+                td_types_col_index = i
+
+        col_names = []
+        col_types = []
+        if td_types_col_index > -1:
+            for col_info in cur.fetchall():
+                col_names.append(col_info[0])
+                col_types.append(DataFrameUtils._get_sqlalchemy_type_from_str(col_info[td_types_col_index]))
+        else:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
+                                      MessageCodes.TDMLDF_CREATE_FAIL)
+        return col_names, col_types
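The new `_get_sqlalchemy_type_from_str` helper recovers a teradatasqlalchemy type object from the TeradataType string returned by HELP TABLE, driving everything off one four-group regex. A self-contained sketch of the same parsing approach, without the `_DtypesMappers` lookup:

```python
import re

# Sketch of the regex-driven parse used above: split a Teradata type string
# into a type name, optional parenthesized parameters, and a trailing clause.
TYPE_PATTERN = re.compile(r"([A-Z0-9_]+)(\((.*)\))?(.*)")

def parse_td_type(td_type):
    m = TYPE_PATTERN.match(td_type)
    if m is None:
        raise ValueError("Invalid Teradata type: {}".format(td_type))
    name, _, params, remain = m.groups()
    if name in ("VARCHAR", "CHAR"):
        # "VARCHAR(32000) CHARACTER SET UNICODE" -> ('VARCHAR', 32000, 'UNICODE')
        return name, int(params.split(",")[0]), remain.strip().split(" ")[2]
    if name == "DECIMAL":
        precision, scale = (int(p) for p in params.split(","))
        return name, precision, scale
    return (name,)

print(parse_td_type("DECIMAL(4,4)"))                          # ('DECIMAL', 4, 4)
print(parse_td_type("VARCHAR(32000) CHARACTER SET UNICODE"))  # ('VARCHAR', 32000, 'UNICODE')
```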
teradataml/dataframe/functions.py
CHANGED

@@ -23,6 +23,15 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=','
     """
     DESCRIPTION:
         Creates a user defined function (UDF).
+
+        Notes:
+            1. Date and time data types must be formatted to supported formats.
+               (See Prerequisite Input and Output Structures in Open Analytics Framework for more details.)
+            2. Packages required to run the user defined function must be installed in remote user
+               environment using install_lib method of UserEnv class. Import statements of these
+               packages should be inside the user defined function itself.
+            3. Do not call a regular function defined outside the udf() from the user defined function.
+               The function definition and call must be inside the udf(). Look at Example 9 to understand more.
 
     PARAMETERS:
         user_function:

@@ -31,7 +40,7 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=','
             teradataml DataFrame.
             Types: function
             Note:
-
+                Lambda functions are not supported. Re-write the lambda function as regular Python function to use with UDF.
 
         returns:
             Optional Argument.

@@ -82,15 +91,6 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=','
     RAISES:
         TeradataMLException
 
-    NOTES:
-        1. While working on date and time data types one must format these to supported formats.
-           (See Requisite Input and Output Structures in Open Analytics Framework for more details.)
-        2. Required packages to run the user defined function must be installed in remote user
-           environment using install_lib function Of UserEnv class. Import statements of these
-           packages should be inside the user defined function itself.
-        3. One can't call a regular function defined outside the udf from the user defined function.
-           The function definition and call must be inside the udf. Look at Example 9 to understand more.
-
     EXAMPLES:
         # Load the data to run the example.
         >>> load_example_data("dataframe", "sales")

@@ -340,6 +340,12 @@ def register(name, user_function, returns=VARCHAR(1024)):
     DESCRIPTION:
         Registers a user defined function (UDF).
 
+        Notes:
+            1. Date and time data types must be formatted to supported formats.
+               (See Requisite Input and Output Structures in Open Analytics Framework for more details.)
+            2. On VantageCloud Lake, user defined function is registered by default in the 'openml_env' environment.
+               User can register it in their own user environment, using the 'openml_user_env' configuration option.
+
     PARAMETERS:
         name:
             Required Argument.

@@ -351,6 +357,8 @@ def register(name, user_function, returns=VARCHAR(1024)):
             Specifies the user defined function to create a column for
             teradataml DataFrame.
             Types: function, udf
+            Note:
+                Lambda functions are not supported. Re-write the lambda function as regular Python function to use with UDF.
 
         returns:
             Optional Argument.

@@ -459,10 +467,17 @@ def call_udf(udf_name, func_args = () , **kwargs):
     DESCRIPTION:
         Call a registered user defined function (UDF).
 
+        Notes:
+            1. Packages required to run the registered user defined function must be installed in remote user
+               environment using install_lib method of UserEnv class. Import statements of these
+               packages should be inside the user defined function itself.
+            2. On VantageCloud Lake, user defined function runs by default in the 'openml_env' environment.
+               User can use their own user environment, using the 'openml_user_env' configuration option.
+
     PARAMETERS:
         udf_name:
             Required Argument.
-            Specifies the name of the registered user defined.
+            Specifies the name of the registered user defined function.
             Types: str
 
         func_args:
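The Notes added to udf(), register() and call_udf() pin down the usage contract: imports live inside the function body, helpers defined outside the UDF are off-limits, and lambdas are not supported. A short sketch of that contract, assuming a connected session; the table and column names come from the load_example_data examples in these docstrings:

```python
# Sketch of the documented contract: imports inside the UDF body, no lambdas,
# registration under a name and later invocation via call_udf.
from teradataml import DataFrame, load_example_data
from teradataml.dataframe.functions import udf, register, call_udf
from teradatasqlalchemy.types import VARCHAR

@udf(returns=VARCHAR(1024))
def upper_accounts(s):
    # Any required packages would be imported here, inside the body,
    # after installing them with UserEnv's install_lib on Lake.
    return s.upper()

load_example_data("dataframe", "sales")
df = DataFrame("sales")
df = df.assign(accounts_upper=upper_accounts("accounts"))   # new column via the UDF

register("upper_accounts", upper_accounts)                  # persist under a name
df = df.assign(accounts_upper2=call_udf("upper_accounts", ("accounts",)))
```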
teradataml/dataframe/setop.py
CHANGED

@@ -149,7 +149,7 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
     # Iterate on all DFs to be applied for set operation.
     for df in dfs_to_operate_on:
         # Process each column in the DF of the iteration.
-        for c in df._metaexpr.
+        for c in df._metaexpr.c:
             col_name = c.name
             # Process the column name if it is not already processed.
             # Processing of set operation is column name based so if the DF in the nth iteration had column 'xyz',

@@ -193,6 +193,8 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
                 col_dict[col_name]['col_present'] = col_present_in_dfs
                 # The type to be used for the column is the one of the first DF it is present in.
                 col_dict[col_name]['col_type'] = col_types_in_dfs[0]
+                # Column name stored with quotes if required.
+                col_dict[col_name]['name'] = c.compile()
 
                 # If the type of the column in all DFs is not the same, then the operation is not lazy.
                 if not all(ctype == col_dict[col_name]['col_type']

@@ -217,6 +219,8 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
                 col_dict[col_name]['col_present'] = col_present_in_dfs
                 # The type to be used for the column is the one of the first DF it is present in.
                 col_dict[col_name]['col_type'] = non_none_type_to_add
+                # Column name stored with quotes if required.
+                col_dict[col_name]['name'] = c.compile()
 
                 # If the type of the column in all DFs is not the same, then the operation is not lazy.
                 if not all(True if ctype is None else ctype == non_none_type_to_add

@@ -667,15 +671,16 @@ def concat(df_list, join='OUTER', allow_duplicates=True, sort=False, ignore_inde
 
     # Now create the list of columns for each DataFrame to concatenate
     type_compiler = td_type_compiler(td_dialect)
+
     for col_name, value in master_columns_dict.items():
         for i in range(len(col_list)):
+            # Quoting is already done for column names if column name starts with number or it is reserved keywords.
+            # Here checking again if it is teradata keyword or not for quotes.
+            column_name = UtilFuncs._process_for_teradata_keyword(value['name'])
             if not value['col_present'][i]:
-                col_list[i].append('CAST(NULL as {}) as {}'.format(type_compiler.process(value['col_type']),
-                                                                   UtilFuncs._teradata_quote_arg(col_name, "\"",
-                                                                                                 False)))
+                col_list[i].append('CAST(NULL as {}) as {}'.format(type_compiler.process(value['col_type']), column_name))
             else:
-
-                col_list[i].append(col_name)
+                col_list[i].append(column_name)
 
     input_table_columns = []
     for i in range(len(col_list)):
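concat() now reuses the pre-quoted name captured at compatibility-check time (`c.compile()`) instead of re-quoting ad hoc, and frames missing a column still contribute a typed NULL under that shared name. The SELECT-list construction, sketched standalone with illustrative data; per the added comment, a name starting with a digit is one that arrives pre-quoted:

```python
# Sketch of concat()'s SELECT-list padding: a frame missing a column gets a
# typed NULL aliased to the (possibly quoted) shared column name.
def build_select_lists(master_columns, frame_count):
    col_lists = [[] for _ in range(frame_count)]
    for value in master_columns.values():
        for i in range(frame_count):
            name = value["name"]  # already quoted where required
            if value["col_present"][i]:
                col_lists[i].append(name)
            else:
                col_lists[i].append("CAST(NULL as {}) as {}".format(value["col_type"], name))
    return col_lists

cols = {
    "id":         {"name": '"id"',         "col_type": "INTEGER", "col_present": [True, True]},
    "2021_sales": {"name": '"2021_sales"', "col_type": "FLOAT",   "col_present": [True, False]},
}
for select_list in build_select_lists(cols, 2):
    print(", ".join(select_list))
# "id", "2021_sales"
# "id", CAST(NULL as FLOAT) as "2021_sales"
```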
teradataml/dataframe/sql.py
CHANGED

@@ -265,7 +265,7 @@ class _PandasTableExpression(TableExpression):
 
         existing = [(c.name, c) for c in self.c]
         new = [(label, expression) for label, expression in kw.items() if label not in current]
-        new = sorted(new, key
+        new = sorted(new, key=lambda x: x[0])
 
         for alias, expression in existing + new:
             if drop_columns and alias not in kw:

@@ -10978,4 +10978,4 @@ class _SQLColumnExpression(_LogicalColumnExpression,
         >>>
 
         """
-        return _SQLColumnExpression(literal_column(f"TD_ISFINITE({self.compile()})"), type=INTEGER)
+        return _SQLColumnExpression(literal_column(f"TD_ISFINITE({self.compile()})"), type=INTEGER)
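The completed sorted() call in _PandasTableExpression gives newly assigned columns a fixed, label-ordered position rather than whatever order the keyword arguments arrived in. In isolation:

```python
# Sketch: sorting (label, expression) pairs by label makes the column order
# produced by assign() reproducible across calls.
new = [("z_score", "expr_z"), ("avg_fee", "expr_a"), ("count", "expr_c")]
new = sorted(new, key=lambda x: x[0])
print([label for label, _ in new])   # ['avg_fee', 'count', 'z_score']
```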