teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/dataframe/copy_to.py
CHANGED
@@ -14,9 +14,11 @@ import pandas.api.types as pt
 
 from sqlalchemy import MetaData, Table, Column
 from sqlalchemy.exc import OperationalError as sqlachemyOperationalError
+from teradataml.dataframe.sql import ColumnExpression
 from teradatasqlalchemy import (INTEGER, BIGINT, BYTEINT, FLOAT)
-from teradatasqlalchemy import (TIMESTAMP)
+from teradatasqlalchemy import (TIMESTAMP, DATE)
 from teradatasqlalchemy import (VARCHAR)
+from teradatasqlalchemy import (PERIOD_DATE,PERIOD_TIMESTAMP)
 from teradatasqlalchemy.dialect import TDCreateTablePost as post
 from teradataml.common.aed_utils import AedUtils
 from teradataml.context.context import *
@@ -25,13 +27,15 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
 from teradataml.dbutils.dbutils import _rename_table
 from teradataml.common.utils import UtilFuncs
 from teradataml.options.configure import configure
-from teradataml.common.constants import CopyToConstants, PTITableConstants
+from teradataml.common.constants import CopyToConstants, PTITableConstants, TeradataTypes
 from teradatasql import OperationalError
 from teradataml.common.wrapper_utils import AnalyticsWrapperUtils
 from teradataml.utils.utils import execute_sql
 from teradataml.utils.validators import _Validators
 from teradataml.telemetry_utils.queryband import collect_queryband
+from teradatasqlalchemy.dialect import dialect as td_dialect
 
+from teradataml.utils.dtypes import _TupleOf
 
 @collect_queryband(queryband="CpToSql")
 def copy_to_sql(df, table_name,
@@ -48,7 +52,12 @@ def copy_to_sql(df, table_name,
                 seq_max=None,
                 set_table=False,
                 chunksize=CopyToConstants.DBAPI_BATCHSIZE.value,
-                match_column_order=True):
+                match_column_order=True,
+                partition_by=None,
+                partition_by_case=None,
+                partition_by_range=None,
+                sub_partition=None,
+                **kwargs):
     """
     Writes records stored in a Pandas DataFrame or a teradataml DataFrame to Teradata Vantage.
 
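Taken together, the new signature adds four partitioning arguments and routes the temporal arguments through **kwargs. A minimal sketch of the two new call patterns, condensed from the examples later in this diff (table and column names are illustrative; pandas_df is assumed to be a pandas DataFrame with the referenced columns):

    from teradataml.dataframe.copy_to import copy_to_sql

    # Partitioned table: the three partition_by* arguments are mutually
    # exclusive, and "primary_index" must accompany them.
    copy_to_sql(df=pandas_df, table_name='orders_part', if_exists='replace',
                primary_index=['order_id'], partition_by='order_id')

    # Temporal table: valid_time_columns/derived_column are keyword-only,
    # read out of **kwargs inside the function.
    copy_to_sql(df=pandas_df, table_name='orders_temporal',
                valid_time_columns=('period_start', 'period_end'))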
@@ -284,6 +293,68 @@ def copy_to_sql(df, table_name,
             Default Value: True
             Types: bool
 
+        partition_by:
+            Optional Argument.
+            Specifies the columns on which partition should be created while creating the table.
+            Note:
+                1. "partition_by", "partition_by_case" and "partition_by_range" are mutually exclusive.
+                2. "primary_index" should be specified when "partition_by" is used.
+                3. Not applicable for PTI tables.
+            Types: str or ColumnExpression
+
+        partition_by_case:
+            Optional Argument.
+            Specifies different cases to partition the index while creating table.
+            Note:
+                1. "partition_by", "partition_by_case" and "partition_by_range" are mutually exclusive.
+                2. "primary_index" should be specified when "partition_by_case" is used.
+                3. Not applicable for PTI tables.
+            Types: str or ColumnExpression or tuple of ColumnExpression, str
+
+        partition_by_range:
+            Optional Argument.
+            Specifies the range of values on which partition should be created while creating a table.
+            Note:
+                1. "partition_by", "partition_by_case" and "partition_by_range" are mutually exclusive.
+                2. "primary_index" should be specified when "partition_by_range" is used.
+                3. Not applicable for PTI tables.
+            Types: str or ColumnExpression
+
+        sub_partition:
+            Optional Argument.
+            Specifies the details to subpartition the main partition according to the value provided while creating the table.
+            Note:
+                1. "sub_partition" is applicable only when "partition_by_range" is specified.
+                2. Not applicable for PTI tables.
+            Types: int or Teradata Interval datatypes
+
+        **kwargs:
+            Optional keyword arguments.
+
+            valid_time_columns:
+                Optional Argument.
+                Specifies the name(s) of the valid time columns to be referred in "df".
+                When "valid_time_columns" is specified, then function considers
+                these columns as valid time dimension columns and creates a
+                valid time dimension temporal table if table does not exist.
+                Notes:
+                    * If a string is provided, the column must be of PERIOD type.
+                Types: tuple of strings or str
+
+            derived_column:
+                Optional Argument.
+                Specifies the name of the derived column to be kept in the temporal table.
+                Notes:
+                    * Argument is ignored if "valid_time_columns" are not specified.
+                    * Argument is considered only if copy_to_sql() is creating a table.
+                    * If "valid_time_columns" is specified and "derived_column" is not specified,
+                      then copy_to_sql() automatically creates a derived column by adding "_" between
+                      the columns mentioned in "valid_time_columns". For example,
+                      if "valid_time_columns" is ('col1', 'col2') and "derived_column"
+                      is not specified, then copy_to_sql() creates table with
+                      derived column name as 'col1_col2'.
+                Types: str
+
     RETURNS:
         None
 
@@ -305,32 +376,32 @@ def copy_to_sql(df, table_name,
         >>> pandas_df = pd.DataFrame(df)
 
         a) Save a Pandas DataFrame using a dataframe & table name only:
-        >>> copy_to_sql(df
+        >>> copy_to_sql(df=pandas_df, table_name='my_table')
 
         b) Saving as a SET table
-        >>> copy_to_sql(df
+        >>> copy_to_sql(df=pandas_df, table_name='my_set_table', index=True,
                         primary_index='index_label', set_table=True)
 
         c) Save a Pandas DataFrame by specifying additional parameters:
-        >>> copy_to_sql(df
-        ... index
-        ... primary_index
-        ... types
-        ...
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_2', schema_name='alice',
+        ...             index=True, index_label='my_index_label', temporary=False,
+        ...             primary_index=['emp_id'], if_exists='append',
+        ...             types={'emp_name': VARCHAR, 'emp_sage':INTEGER,
+        ...                    'emp_id': BIGINT, 'marks': DECIMAL})
 
         d) Saving with additional parameters as a SET table
-        >>> copy_to_sql(df
-        ... index
-        ... primary_index
-        ... types
-        ...
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_3', schema_name='alice',
+        ...             index=True, index_label='my_index_label', temporary=False,
+        ...             primary_index=['emp_id'], if_exists='append',
+        ...             types={'emp_name': VARCHAR, 'emp_sage':INTEGER,
+        ...                    'emp_id': BIGINT, 'marks': DECIMAL},
         ...             set_table=True)
 
         e) Saving levels in index of type MultiIndex
         >>> pandas_df = pandas_df.set_index(['emp_id', 'emp_name'])
-        >>> copy_to_sql(df
-        ... index
-        ... primary_index
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_4', schema_name='alice',
+        ...             index=True, index_label=['index1', 'index2'], temporary=False,
+        ...             primary_index=['index1'], if_exists = 'replace')
 
         f) Save a Pandas DataFrame with VECTOR datatype:
         >>> import pandas as pd
@@ -343,6 +414,68 @@ def copy_to_sql(df, table_name,
         >>> from teradatasqlalchemy import VECTOR
         >>> copy_to_sql(df=df, table_name='my_vector_table', types={'array_col': VECTOR})
 
+        g) Saving pandas DataFrame with partition_by:
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_5', if_exists='replace',
+        ...             primary_index=['emp_id'],
+        ...             partition_by='emp_id')
+
+        h) Saving pandas DataFrame with partition_by_case:
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_6', if_exists='replace',
+        ...             primary_index=['emp_id'],
+        ...             partition_by_case='emp_id > 100, emp_id < 500')
+
+        i) Saving pandas DataFrame with partition_by_range:
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_7', if_exists='replace',
+        ...             primary_index=['emp_id'],
+        ...             partition_by_range='emp_id BETWEEN 100 AND 500')
+
+
+        j) Save a Pandas DataFrame with valid time columns of DATE type to a temporal table.
+        >>> import pandas as pd
+        >>> from teradataml.dataframe.copy_to import copy_to_sql
+        >>> df = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_date': pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01']).date,
+        ...     'end_date': pd.to_datetime(['2024-01-10', '2024-02-10', '2024-03-10']).date,
+        ...     'description': ['a', 'b', 'c']
+        ... })
+        >>> copy_to_sql(
+        ...     df=df,
+        ...     table_name='temporal_table_pandas_date',
+        ...     valid_time_columns=('start_date', 'end_date')
+        ... )
+
+        k) Save a Pandas DataFrame with valid time columns of TIMESTAMP type
+           to a temporal table. Name the derived column as `valid_time`.
+        >>> import pandas as pd
+        >>> from teradataml.dataframe.copy_to import copy_to_sql
+        >>> df = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_time': pd.to_datetime(['2024-01-01 10:00:00', '2024-02-01 11:00:00', '2024-03-01 12:00:00']),
+        ...     'end_time': pd.to_datetime(['2024-01-01 12:00:00', '2024-02-01 13:00:00', '2024-03-01 14:00:00']),
+        ...     'description': ['a', 'b', 'c']
+        ... })
+        >>> copy_to_sql(
+        ...     df=df,
+        ...     table_name='temporal_table_pandas_timestamp',
+        ...     valid_time_columns=('start_time', 'end_time'),
+        ...     derived_column='valid_time'
+        ... )
+
+        l) Save a teradataml DataFrame with valid time column of PERIOD type to a temporal table.
+        >>> from teradataml.dataframe.dataframe import DataFrame
+        >>> from teradataml.dataframe.copy_to import copy_to_sql
+        >>> from teradataml.data.load_example_data import load_example_data
+        >>> load_example_data("teradataml", "Employee_roles")
+        >>> from teradatasqlalchemy.types import PERIOD_DATE
+        >>> df = DataFrame('Employee_roles')
+        >>> copy_to_sql(
+        ...     df,
+        ...     table_name = 'employee_roles_temporal',
+        ...     valid_time_columns='role_validity_period',
+        ...     types={'role_validity_period':PERIOD_DATE}
+        ... )
+
     2. Saving a teradataml DataFrame:
 
         >>> from teradataml.dataframe.dataframe import DataFrame
@@ -368,14 +501,62 @@ def copy_to_sql(df, table_name,
         >>> copy_to_sql(df2, 'my_tdml_table_2')
 
         d) Save a teradataml DataFrame by using copy_to_sql with additional parameters:
-        >>> copy_to_sql(df
-        ... temporary
-        ... types
+        >>> copy_to_sql(df=df2, table_name='my_tdml_table_3', schema_name='alice',
+        ...             temporary=False, primary_index=None, if_exists='append',
+        ...             types={'masters': VARCHAR, 'gpa':INTEGER})
 
         e) Saving as a SET table
-        >>> copy_to_sql(df = df2, table_name = 'my_tdml_set_table', schema_name
-        ... temporary
-        ... types
+        >>> copy_to_sql(df = df2, table_name = 'my_tdml_set_table', schema_name='alice',
+        ...             temporary=False, primary_index=['gpa'], if_exists='append',
+        ...             types={'masters': VARCHAR, 'gpa':INTEGER}, set_table = True)
+
+        f) Saving a teradataml DataFrame into a table by partitioning the table with column 'gpa':
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_4', if_exists='replace',
+        ...             primary_index=['gpa'],
+        ...             partition_by=df.gpa)
+
+        g) Saving a teradataml DataFrame into a table with two partitions as below:
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_5', if_exists='replace',
+        ...             primary_index=['id'],
+        ...             partition_by_case=(df.id < 100, df.gpa < 5.0))
+
+        h) Saving a teradataml DataFrame into a table by partitioning the table with different ranges:
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_6', if_exists='replace',
+        ...             primary_index=['id'],
+        ...             partition_by_range=df.id.between(1, 100))
+
+        i) Saving a teradataml DataFrame into a table by partitioning the table with different ranges.
+           Also sub-partitioning based on INTERVAL:
+        >>> load_example_data("dataframe", "sales")
+        >>> df = DataFrame('sales')
+        >>> from teradatasqlalchemy import INTERVAL_DAY
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_7', if_exists='replace',
+        ...             primary_index="Feb",
+        ...             partition_by_range=df.datetime.between('2017-01-01', '2017-01-31'),
+        ...             sub_partition=INTERVAL_DAY(1))
+
+        j) Save a teradataml DataFrame with valid time columns of DATE type to a temporal table.
+        >>> pdf = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_date': pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01']).date,
+        ...     'end_date': pd.to_datetime(['2024-01-10', '2024-02-10', '2024-03-10']).date,
+        ...     'description': ['a', 'b', 'c']
+        ... })
+        >>> df_temporal = DataFrame(data = pdf)
+        >>> copy_to_sql(df=df_temporal, table_name='temporal_table_tdml_date',
+        ...             valid_time_columns=('start_date', 'end_date'))
+
+        k) Save a teradataml DataFrame with valid time columns of TIMESTAMP type
+           to a temporal table. Name the derived column as `validity_period`.
+        >>> df_temporal_ts = DataFrame(data = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_time': pd.to_datetime(['2024-01-01 10:00:00', '2024-02-01 11:00:00', '2024-03-01 12:00:00']),
+        ...     'end_time': pd.to_datetime(['2024-01-01 12:00:00', '2024-02-01 13:00:00', '2024-03-01 14:00:00']),
+        ...     'description': ['a', 'b', 'c']
+        ... }))
+        >>> copy_to_sql(df=df_temporal_ts, table_name='temporal_table_tdml_timestamp',
+        ...             valid_time_columns=('start_time', 'end_time'), derived_column='validity_period')
+
 
     3. Saving a teradataml DataFrame as a PTI table:
@@ -403,6 +584,10 @@ def copy_to_sql(df, table_name,
         ...             set_table=True)
 
     """
+    # Accept valid_time_columns and derived_column from kwargs
+    valid_time_columns = kwargs.get("valid_time_columns", None)
+    derived_column = kwargs.get("derived_column", None)
+
     # Deriving global connection using get_connection().
     con = get_connection()
 
@@ -460,6 +645,12 @@ def copy_to_sql(df, table_name,
 
     dt_obj._validate()
 
+    # Validate partition arguments
+    _validate_partition_arguments(partition_by=partition_by,
+                                  partition_by_case=partition_by_case,
+                                  partition_by_range=partition_by_range,
+                                  sub_partition=sub_partition)
+
     # If the table created must be a PTI table, then validate additional parameters
     # Note that if the required parameters for PTI are valid, then other parameters, though being validated,
     # will be ignored - for example, primary_index
@@ -473,6 +664,13 @@ def copy_to_sql(df, table_name,
         raise TeradataMlException(Messages.get_message(MessageCodes.SET_TABLE_NO_PI),
                                   MessageCodes.SET_TABLE_NO_PI)
 
+    # Check whether valid time columns are passed to consider it as temporal table.
+    is_temporal = False
+    if valid_time_columns is not None:
+        _validate_valid_time_columns(df, valid_time_columns, derived_column, types)
+        is_temporal = True
+
+
     # Check if destination table exists
     table_exists = dt_obj._table_exists(con)
 
@@ -503,35 +701,49 @@ def copy_to_sql(df, table_name,
     # failing with Blank name in quotation mark. Hence, extracted only the table name.
     table_name = UtilFuncs._extract_table_name(table_name)
 
+    partition_exp, partition_func = _build_partition_expression(partition_by=partition_by,
+                                                                partition_by_case=partition_by_case,
+                                                                partition_by_range=partition_by_range,
+                                                                sub_partition=sub_partition)
+
     # Let's create the SQLAlchemy table object to recreate the table
     if not table_exists or if_exists.lower() == 'replace':
-        if
-
-
-
-
-        table = _create_pti_table_object(df, con, table_name, schema_name, temporary,
-                                         primary_time_index_name, timecode_column, timezero_date,
-                                         timebucket_duration, sequence_column, seq_max,
-                                         columns_list, set_table, types,
-                                         None if not is_pandas_df else index,
-                                         None if not is_pandas_df else index_label)
-
-        if table is not None:
-            # If the table need to be replaced and there is no table name conflict,
-            # let's drop the existing table first
-            if table_exists and not is_conflict:
-                tbl_name = dt_obj._get_fully_qualified_table_name()
-                UtilFuncs._drop_table(tbl_name)
-            try:
-                table.create(bind=get_context())
-            except sqlachemyOperationalError as err:
-                raise TeradataMlException(Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED) +
-                                          '\n' + str(err),
-                                          MessageCodes.TABLE_OBJECT_CREATION_FAILED)
+        if is_temporal:
+            _create_temporal_table(df, table_name, con, primary_index,
+                                   schema_name, valid_time_columns, derived_column,
+                                   types, None if not is_pandas_df else index,
+                                   None if not is_pandas_df else index_label)
         else:
-
-
+            if is_pti:
+                table = _create_pti_table_object(df, con, table_name, schema_name, temporary,
+                                                 primary_time_index_name, timecode_column, timezero_date,
+                                                 timebucket_duration, sequence_column, seq_max,
+                                                 columns_list, set_table, types,
+                                                 None if not is_pandas_df else index,
+                                                 None if not is_pandas_df else index_label)
+            else:
+                table = _create_table_object(df, table_name, con, primary_index, temporary, schema_name, set_table,
+                                             types, None if not is_pandas_df else index,
+                                             None if not is_pandas_df else index_label,
+                                             partition_expression=partition_exp,
+                                             partition_function=partition_func
+                                             )
+
+            if table is not None:
+                # If the table need to be replaced and there is no table name conflict,
+                # let's drop the existing table first
+                if table_exists and not is_conflict:
+                    tbl_name = dt_obj._get_fully_qualified_table_name()
+                    UtilFuncs._drop_table(tbl_name)
+                try:
+                    table.create(bind=get_context())
+                except sqlachemyOperationalError as err:
+                    raise TeradataMlException(Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED) +
+                                              '\n' + str(err),
+                                              MessageCodes.TABLE_OBJECT_CREATION_FAILED)
+            else:
+                raise TeradataMlException(Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED),
+                                          MessageCodes.TABLE_OBJECT_CREATION_FAILED)
 
     # Check column compatibility for insertion when table exists and if_exists = 'append'
     if table_exists and if_exists.lower() == 'append':
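The rewritten block above gives temporal creation precedence over PTI, and only the non-temporal branches go through a SQLAlchemy Table object (the temporal path issues its CREATE TABLE DDL directly). A toy, runnable restatement of that precedence (not the library's code):

    def choose_creation_path(is_temporal, is_pti):
        # Mirrors the branch order above: temporal wins over PTI, PTI over plain.
        if is_temporal:
            return "_create_temporal_table"    # executes CREATE TABLE DDL itself
        if is_pti:
            return "_create_pti_table_object"  # returns a SQLAlchemy Table
        return "_create_table_object"          # plain table, optional PARTITION BY

    assert choose_creation_path(True, True) == "_create_temporal_table"
    assert choose_creation_path(False, False) == "_create_table_object"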
@@ -549,7 +761,7 @@ def copy_to_sql(df, table_name,
             cols, _ = df_utils._get_column_names_and_types_from_metaexpr(df._metaexpr)
         if match_column_order:
             cols_compatible = _check_columns_insertion_compatible(table.c, cols, is_pandas_df,
-                                                                  is_pti, timecode_column, sequence_column)
+                                                                  is_pti, timecode_column, sequence_column, derived_column)
 
         if not cols_compatible:
             raise TeradataMlException(Messages.get_message(MessageCodes.INSERTION_INCOMPATIBLE),
@@ -746,6 +958,143 @@ def _get_index_labels(df, index_label):
 
     return ind_names, ind_types
 
+def _validate_partition_arguments(partition_by=None,
+                                  partition_by_case=None,
+                                  partition_by_range=None,
+                                  sub_partition=None):
+    """
+    Internal function to validate the partition_by arguments.
+
+    PARAMETERS:
+        partition_by:
+            Optional argument.
+            Specifies the columns on which PARTITION BY should be created.
+            Types: str or ColumnExpression
+
+        partition_by_case:
+            Optional argument.
+            Specifies different cases to partition the index.
+            Types: str or ColumnExpression or tuple of ColumnExpression, str
+
+        partition_by_range:
+            Optional argument.
+            Specifies the range of values of Date columns on which partition to be created.
+            Types: str or ColumnExpression
+
+        sub_partition:
+            Optional argument.
+            Specifies the details to subpartition the main partition according to the value provided.
+            Types: int or Teradata Interval datatypes
+
+    RETURNS:
+        None
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        >>> _validate_partition_arguments(partition_by='col1')
+        >>> _validate_partition_arguments(partition_by_case=(df.col1 < 100, df.col1 < 1000))
+    """
+    # Validate partition_by argument
+    arg_matrix = []
+    arg_matrix.append(['partition_by', partition_by, True, (str, ColumnExpression), True])
+    arg_matrix.append(['partition_by_case', partition_by_case, True, (ColumnExpression, str, _TupleOf((str, ColumnExpression))), True])
+    arg_matrix.append(['partition_by_range', partition_by_range, True, (ColumnExpression, str), True])
+    arg_matrix.append(['sub_partition', sub_partition, True, (int, TeradataTypes.TD_RANGE_N_CLAUSE_TYPES.value), True])
+
+    # Validate argument types
+    _Validators._validate_function_arguments(arg_matrix)
+
+    # Validate mutually exclusive arguments
+    _Validators._validate_mutually_exclusive_argument_groups({"partition_by":partition_by},
+                                                             {"partition_by_case":partition_by_case},
+                                                             {"partition_by_range":partition_by_range})
+
+def _build_partition_expression(partition_by=None,
+                                partition_by_case=None,
+                                partition_by_range=None,
+                                sub_partition=None):
+    """
+    DESCRIPTION:
+        Internal function to build the partitioning expression for the table.
+
+    PARAMETERS:
+        partition_by:
+            Optional argument.
+            Specifies the columns on which PARTITION BY should be created.
+            Types: str or ColumnExpression
+
+        partition_by_case:
+            Optional argument.
+            Specifies different cases to partition the index.
+            Types: str or ColumnExpression or tuple of ColumnExpression, str
+
+        partition_by_range:
+            Optional argument.
+            Specifies the range of values of Date columns on which partition to be created.
+            Types: str or ColumnExpression
+
+        sub_partition:
+            Optional argument.
+            Specifies the details to subpartition the main partition according to the value provided.
+            Types: int or Teradata Interval datatypes
+
+    RAISES:
+        None
+
+    RETURNS:
+        Strings containing the partitioning expression and partition function.
+
+    EXAMPLES:
+        >>> _build_partition_expression(partition_by='col1')
+        >>> _build_partition_expression(partition_by_case=(df.col1 < 100, df.col1 < 1000))
+    """
+    partition_exp = None
+    partition_fn = None
+    # Check if partition_by expression is a ColumnExpression,
+    # if so, compile it to a string
+    if partition_by:
+        partition_exp = partition_by.compile() if isinstance(partition_by, ColumnExpression) \
+            else partition_by
+
+    # Check if partition_by_case is a ColumnExpression or string,
+    # if string, join to partition_by expression
+    # if ColumnExpression, compile it to a string and join to partition_by expression
+    # if tuple, compile each expression to a string and join to partition_by expression
+    if partition_by_case:
+        partition_fn = "CASE_N"
+        partition_by_case = [partition_by_case] if isinstance(partition_by_case, (str, ColumnExpression)) \
+            else partition_by_case
+        partition_exp = "{}, NO CASE, UNKNOWN".format(
+            ", ".join(str(exp.compile()) if isinstance(exp, ColumnExpression) else str(exp)
+                      for exp in partition_by_case))
+
+    # Check if partition_by_range is a ColumnExpression or string,
+    # if so, compile it to a string
+    if partition_by_range:
+        partition_fn = "RANGE_N"
+        sub_partition_clause = ""
+        if isinstance(partition_by_range, ColumnExpression):
+            partition_by_range = partition_by_range.compile()
+
+        # Check if sub_partition is provided,
+        # if so, compile the EACH clause for RANGE_N.
+        # If sub_partition is an int, then convert to string and add to the clause.
+        # If sub_partition is a TeradataTypes.TD_RANGE_N_CLAUSE_TYPES,
+        # convert to string and extract the precision and add to the clause.
+        if sub_partition:
+            sub_partition_clause = (
+                f" EACH {str(sub_partition)}"
+                if isinstance(sub_partition, int)
+                else f" EACH INTERVAL '{sub_partition.precision}' {str(sub_partition).split(maxsplit=1)[1]}")
+
+        partition_exp = "{0}{1}".format(partition_by_range, sub_partition_clause)
+    # Return partition_by expression and partition function
+    return partition_exp, partition_fn
+
 
 def _validate_pti_copy_parameters(df, timecode_column, timebucket_duration,
                                   timezero_date, primary_time_index_name, columns_list,
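Reading the helper's logic off directly, string inputs (which skip the ColumnExpression.compile() path) should yield the following pairs. These are traced by hand from the code above, not captured from a live session:

    _build_partition_expression(partition_by="emp_id")
    # -> ("emp_id", None)

    _build_partition_expression(partition_by_case=("emp_id > 100", "emp_id < 500"))
    # -> ("emp_id > 100, emp_id < 500, NO CASE, UNKNOWN", "CASE_N")

    _build_partition_expression(partition_by_range="emp_id BETWEEN 1 AND 1000",
                                sub_partition=100)
    # -> ("emp_id BETWEEN 1 AND 1000 EACH 100", "RANGE_N")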
@@ -1010,7 +1359,7 @@ def _validate_column_type(df, col, col_arg, expected_types, types = None, index
 
 
 def _create_table_object(df, table_name, con, primary_index, temporary, schema_name, set_table, types, index=None,
-                         index_label=None):
+                         index_label=None, partition_expression=None, partition_function=None):
     """
     This is an internal function used to construct a SQLAlchemy Table Object.
     This function checks appropriate flags and supports creation of Teradata
@@ -1041,6 +1390,12 @@ def _create_table_object(df, table_name, con, primary_index, temporary, schema_n
         When True, an attempt to create a SET table is made.
         When False, an attempt to create a MULTISET table is made.
 
+    partition_expression:
+        Specifies the partitioning expression to be used for partition by clause.
+
+    partition_function:
+        Specifies the partitioning function to be used with partition by clause.
+
     types:
         Specifies a python dictionary with column-name(key) to column-type(value) mapping to create DataFrames.
 
@@ -1097,6 +1452,11 @@ def _create_table_object(df, table_name, con, primary_index, temporary, schema_n
     else:
         pti = pti.no_primary_index()
 
+    # Partitioning expression and function
+    if partition_expression:
+        pti = pti.partition_by(partition_expression=partition_expression,
+                               partition_fn=partition_function)
+
     # Create default Table construct with parameter dictionary
     table = Table(table_name, meta,
                   *(Column(col_name, col_type)
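With pti.partition_by() wired in, the expression and function from _build_partition_expression end up in the CREATE TABLE postfix. The exact rendering is owned by teradatasqlalchemy's TDCreateTablePost rather than shown in this diff, but the intended DDL tail has the standard Teradata shape; an assumed example for the RANGE_N case above, not captured from a live system:

    # Assumed final clause of the generated CREATE TABLE statement:
    "PRIMARY INDEX (emp_id) PARTITION BY RANGE_N(emp_id BETWEEN 1 AND 1000 EACH 100)"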
@@ -1243,6 +1603,142 @@ def _create_pti_table_object(df, con, table_name, schema_name, temporary, primar
 
     return table
 
+def _create_temporal_table(df, table_name, con, primary_index, schema_name,
+                           valid_time_columns, derived_column, types, index=None, index_label=None):
+    """
+    This is an internal function used to construct a CREATE TABLE statement for a Teradata temporal table.
+    Supports creation of tables with a PERIOD FOR derived column using the specified valid time columns.
+
+    PARAMETERS:
+        df:
+            Required Argument.
+            The teradataml or Pandas DataFrame object to be saved.
+            Types: pandas.DataFrame or teradataml.dataframe.dataframe.DataFrame
+
+        table_name:
+            Required Argument.
+            Name of SQL table.
+            Types: String
+
+        con:
+            Optional Argument.
+            A SQLAlchemy connectable (engine/connection) object.
+            Types: SQLAlchemy Engine or Connection
+
+        primary_index:
+            Optional Argument.
+            Creates Teradata Table(s) with Primary index column if specified.
+            Types: String or list of Strings
+
+        schema_name:
+            Optional Argument.
+            Specifies the name of the SQL schema in the database to write to.
+            Types: String
+
+        valid_time_columns:
+            Required Argument.
+            Specifies a tuple of two column names representing the temporal validity period.
+            Types: tuple of Strings or str
+
+        derived_column:
+            Optional Argument.
+            Specifies the name of the derived PERIOD FOR column to be created.
+            Types: String
+
+        types:
+            Optional Argument.
+            Specifies a python dictionary with column-name(key) to column-type(value) mapping to create DataFrames.
+            Types: dict
+
+        index:
+            Optional Argument.
+            Flag specifying whether to write Pandas DataFrame index as a column(s) or not.
+            Types: Boolean
+
+        index_label:
+            Optional Argument.
+            Column label(s) for index column(s).
+            Types: String or list of Strings
+
+    RETURNS:
+        None
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        _create_temporal_table(
+            df=my_df,
+            table_name='temporal_table',
+            con=td_connection,
+            primary_index=['id'],
+            schema_name='my_schema',
+            valid_time_columns=('start_date', 'end_date'),
+            derived_column='validity_period',
+            types={'id': INTEGER, 'start_date': DATE, 'end_date': DATE},
+            index=False,
+            index_label=None
+        )
+
+    """
+
+    # Extract column names and types
+    if isinstance(df, pd.DataFrame):
+        col_names, col_types = _extract_column_info(df, types, index, index_label)
+    else:
+        col_names, col_types = df_utils._get_column_names_and_types_from_metaexpr(df._metaexpr)
+        if types is not None:
+            col_types = [types.get(col_name, col_type) for col_name, col_type in zip(col_names, col_types)]
+
+    columns_clause_ = []
+    # Ensure all col_types are instances, not classes
+    for i, col_type in enumerate(col_types):
+        if isinstance(col_type, type):
+            col_types[i] = col_type()
+    # Use col_names and col_types to build the columns clause.
+    # Compile column types to string using the dialect of the current connection.
+    # Add NOT NULL to valid_time_columns.
+    for col_name, col_type in zip(col_names, col_types):
+        col_def = '{} {}'.format(col_name, col_type.compile(dialect=td_dialect()))
+
+        if col_name in valid_time_columns:
+            col_def += ' NOT NULL'
+            if isinstance(col_type, (PERIOD_DATE, PERIOD_TIMESTAMP)):
+                col_def += ' AS VALIDTIME'
+        columns_clause_.append(col_def)
+
+    period_for_clause = []
+    if isinstance(valid_time_columns, tuple):
+        if derived_column is None:
+            derived_column = "_".join(valid_time_columns)
+        period_for_clause = ['PERIOD FOR {} ({}, {}) AS VALIDTIME'.format(
+            derived_column, valid_time_columns[0], valid_time_columns[1])
+        ]
+    columns_clause = ",\n ".join(columns_clause_ + period_for_clause)
+
+    # Prepare primary index clause.
+    if primary_index:
+        primary_index_clause = "PRIMARY INDEX ({})".format(
+            ", ".join(UtilFuncs._as_list(primary_index)))
+    else:
+        primary_index_clause = ""
+
+    # Prepare create table statement.
+    table_name = UtilFuncs._get_qualified_table_name(schema_name, table_name) if\
+        schema_name else table_name
+    sql = """
+    CREATE MULTISET TABLE {}
+    (\n{}\n)\n{}
+    """.format(table_name, columns_clause, primary_index_clause)
+    try:
+        execute_sql(sql)
+    except Exception as err:
+        raise TeradataMlException(
+            Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED) +
+            '\n' + str(err),
+            MessageCodes.TABLE_OBJECT_CREATION_FAILED
+        )
+
 
 def _rename_column(col_names, search_for, rename_to):
     """
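Tracing the format strings above for valid_time_columns=('start_date', 'end_date') with no derived_column, the statement handed to execute_sql() should look roughly like the sketch below. Whitespace is approximate, the non-period column types are illustrative, and the derived name defaults to 'start_date_end_date':

    expected_sql = """
    CREATE MULTISET TABLE temporal_table
    (
    id INTEGER,
    start_date DATE NOT NULL,
    end_date DATE NOT NULL,
    description VARCHAR(1024),
    PERIOD FOR start_date_end_date (start_date, end_date) AS VALIDTIME
    )
    PRIMARY INDEX (id)
    """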
@@ -1370,7 +1866,7 @@ def _reorder_insert_list_for_pti(df_column_list, timecode_column, sequence_colum
 
 
 def _check_columns_insertion_compatible(table1_col_object, table2_cols, is_pandas_df=False,
-                                        is_pti=False, timecode_column=None, sequence_column=None):
+                                        is_pti=False, timecode_column=None, sequence_column=None, derived_column=None):
     """
     Internal function used to extract column information from two lists of SQLAlchemy ColumnExpression objects;
     and check if the number of columns and their names are matching to determine table insertion compatibility.
@@ -1394,11 +1890,15 @@ def _check_columns_insertion_compatible(table1_col_object, table2_cols, is_panda
         timecode_column:
             timecode_column required to order the select expression for the insert.
             It should be the first column in the select expression.
-
+
         sequence_column:
             sequence_column required to order the select expression for the insert.
             It should be the second column in the select expression.
 
+        derived_column:
+            Specifies a derived column that is part of the table schema but not
+            part of insert.
+            Types: String
+
     RETURNS:
         a) True, when insertion compatible (number of columns and their names match)
@@ -1410,11 +1910,16 @@ def _check_columns_insertion_compatible(table1_col_object, table2_cols, is_panda
     EXAMPLES:
         _check_columns_insertion_compatible(table1.c, ['co1', 'col2'], False)
         _check_columns_insertion_compatible(table1.c, (['co1', 'col2'], [int, str]), True, True, 'ts', 'seq')
+        _check_columns_insertion_compatible(table1.c, (['co1', 'col2'], [int, str]), True, True, 'ts', 'seq', 'derived_col')
 
     """
     table1_col_names, _ = UtilFuncs._extract_table_object_column_info(table1_col_object)
     table2_col_names = table2_cols[0] if is_pandas_df else table2_cols
 
+    # Remove derived_column from table1_col_names if specified
+    if derived_column is not None and derived_column in table1_col_names:
+        table1_col_names.remove(derived_column)
+
     # Check for number of columns
     if len(table1_col_names) != len(table2_col_names):
         return False
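Why the removal matters: a temporal target table carries one extra PERIOD FOR column that the incoming frame does not. A runnable miniature of the check (names illustrative):

    table_cols = ['id', 'start_date', 'end_date', 'start_date_end_date']  # target table
    df_cols = ['id', 'start_date', 'end_date']                            # frame being appended
    table_cols.remove('start_date_end_date')   # derived_column, as in the hunk above
    assert len(table_cols) == len(df_cols)     # append now passes the count check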
@@ -1783,3 +2288,158 @@ def _validate_timezero_date(timezero_date):
 
     # Looks like the value is valid
     return True
+
+def _validate_valid_time_columns(df, valid_time_columns, derived_column=None, types=None):
+    """
+    Internal function to validate that the columns specified in valid_time_columns
+    exist in the DataFrame, are of type DATE or TIMESTAMP, and are of the same type.
+    Also checks that the derived_column, if specified, is not present in the DataFrame.
+
+    PARAMETERS:
+        df:
+            Required Argument.
+            Specifies the Pandas or teradataml DataFrame object to be validated.
+            Types: pandas.DataFrame or teradataml.dataframe.dataframe.DataFrame
+
+        valid_time_columns:
+            Required Argument.
+            Specifies a tuple of two column names representing the temporal validity period.
+            Types: tuple of Strings
+
+        derived_column:
+            Optional Argument.
+            Specifies the name of the derived column that should not be
+            present in the DataFrame.
+            Types: String
+
+        types:
+            Optional Argument.
+            Specifies a python dictionary with column-name(key) to column-type(value)
+            mapping to create DataFrames.
+            Types: dict
+
+    RETURNS:
+        None
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        _validate_valid_time_columns(
+            df=my_df,
+            valid_time_columns=('start_date', 'end_date'),
+            derived_column='validity_period',
+            types={'start_date': DATE, 'end_date': DATE}
+        )
+    """
+    df_columns = _get_pd_df_column_names(df) if isinstance(df, pd.DataFrame) else df.columns
+    df_dtypes = (
+        {
+            col: _get_sqlalchemy_mapping_types(str(df.dtypes[col]))
+            for col in df.dtypes.keys()
+        }
+        if isinstance(df, pd.DataFrame)
+        else df._td_column_names_and_sqlalchemy_types
+    )
+    # If types argument is provided, override the dtypes for those columns
+    if types is not None:
+        for col, typ in types.items():
+            if col in df_columns:
+                df_dtypes[col] = typ
+
+
+    if derived_column is not None and derived_column in df_columns:
+        raise TeradataMlException(
+            Messages.get_message(MessageCodes.TDMLDF_COLUMN_IN_ARG_FOUND).format(
+                derived_column, 'derived_column', 'dataframe.', 'Provide value which is not part of DataFrame columns'
+            ),
+            MessageCodes.TDMLDF_COLUMN_IN_ARG_FOUND
+        )
+    # valid_time_columns can be a tuple of two column names or a single column name
+    if isinstance(valid_time_columns, tuple):
+        if len(valid_time_columns) != 2:
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_ARG_VALUE).format(
+                    valid_time_columns, 'valid_time_columns', 'tuple of two column names'
+                ),
+                MessageCodes.INVALID_ARG_VALUE
+            )
+        # Check if both columns are present in the DataFrame
+        for col in valid_time_columns:
+            if col not in df_columns:
+                raise TeradataMlException(
+                    Messages.get_message(MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND).format(
+                        col, 'valid_time_columns', 'df', 'DataFrame'
+                    ),
+                    MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND
+                )
+
+        col1_type = df_dtypes[valid_time_columns[0]]
+        col2_type = df_dtypes[valid_time_columns[1]]
+
+        # When types are specified, ensure they are DATE or TIMESTAMP objects or classes.
+        if not (
+            isinstance(col1_type, TIMESTAMP) or isinstance(col1_type, DATE) or
+            col1_type is TIMESTAMP or col1_type is DATE
+        ):
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE).format(
+                    'valid_time_columns',
+                    col1_type.__name__ if isinstance(col1_type, type)
+                    else col1_type.__class__.__name__, 'DATE or TIMESTAMP'
+                ),
+                MessageCodes.INVALID_COLUMN_TYPE
+            )
+        # When types are specified, ensure they are DATE or TIMESTAMP objects or classes.
+        if not (
+            isinstance(col2_type, TIMESTAMP) or isinstance(col2_type, DATE) or
+            col2_type is TIMESTAMP or col2_type is DATE
+        ):
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE).format(
+                    'valid_time_columns',
+                    col2_type.__name__ if isinstance(col2_type, type)
+                    else col2_type.__class__.__name__, 'DATE or TIMESTAMP'
+                ),
+                MessageCodes.INVALID_COLUMN_TYPE
+            )
+
+        if type(col1_type) != type(col2_type):
+            raise ValueError(
+                Messages.get_message(MessageCodes.INVALID_ARG_VALUE).format(
+                    valid_time_columns, 'valid_time_columns', 'both columns of same type (DATE or TIMESTAMP)'
+                ),
+                MessageCodes.INVALID_ARG_VALUE
+            )
+    elif isinstance(valid_time_columns, str):
+        col = valid_time_columns
+        col_type = df_dtypes[col]
+
+        if col not in df_columns:
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND).format(
+                    col, 'valid_time_columns', 'df', 'DataFrame'
+                ),
+                MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND
+            )
+        # When types are specified, ensure they are PERIOD_DATE or PERIOD_TIMESTAMP objects or classes.
+        if not (
+            isinstance(col_type, PERIOD_TIMESTAMP) or isinstance(col_type, PERIOD_DATE) or
+            col_type is PERIOD_TIMESTAMP or col_type is PERIOD_DATE
+        ):
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE).format(
+                    'valid_time_columns',
+                    col_type.__name__ if isinstance(col_type, type)
+                    else col_type.__class__.__name__, 'PERIOD_DATE or PERIOD_TIMESTAMP'
+                ),
+                MessageCodes.INVALID_COLUMN_TYPE
+            )
+    else:
+        raise TeradataMlException(
+            Messages.get_message(MessageCodes.INVALID_ARG_VALUE).format(
+                valid_time_columns, 'valid_time_columns', 'tuple of two column names or a single column name'
+            ),
+            MessageCodes.INVALID_ARG_VALUE
+        )
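The validator accepts exactly two shapes for valid_time_columns; everything else raises INVALID_ARG_VALUE, and a pair mixing DATE with TIMESTAMP is rejected because both members must share a type. A toy, runnable restatement of the accepted shapes (not the library's code):

    def accepted_valid_time_shape(value):
        # Pair of names -> both must exist and both be DATE, or both TIMESTAMP.
        if isinstance(value, tuple) and len(value) == 2:
            return "pair of DATE/TIMESTAMP columns; a PERIOD FOR column is derived"
        # Single name -> must already be a PERIOD_DATE/PERIOD_TIMESTAMP column.
        if isinstance(value, str):
            return "existing PERIOD column used as-is"
        raise ValueError("tuple of two column names or a single column name")

    assert accepted_valid_time_shape(('start_date', 'end_date')).startswith("pair")
    assert accepted_valid_time_shape('role_validity_period').startswith("existing")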