teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe.py

@@ -27,8 +27,10 @@ import pandas as pd
 import sqlalchemy
 from sqlalchemy import Column
 from sqlalchemy.exc import NoSuchColumnError
+from datetime import datetime, date
 from sqlalchemy.sql import ClauseElement
 from teradatasql import OperationalError
+from teradatasqlalchemy import types as tdtypes
 from teradatasqlalchemy.dialect import dialect as td_dialect
 from teradatasqlalchemy.dialect import preparer
 from teradatasqlalchemy.types import (BIGINT, BYTEINT, DECIMAL, FLOAT, INTEGER,
@@ -38,7 +40,7 @@ import teradataml.context.context as tdmlctx
 from teradataml import GarbageCollector, execute_sql
 from teradataml.common.bulk_exposed_utils import \
     _validate_unimplemented_function
-from teradataml.common.constants import (AEDConstants, OutputStyle,
+from teradataml.common.constants import (AEDConstants, DataFrameTypes, OutputStyle,
                                          PTITableConstants, PythonTypes,
                                          SourceType, SQLConstants,
                                          SQLFunctionConstants,
@@ -239,6 +241,15 @@ class DataFrame():
                 included in the DataFrame if the dictionary contains those keys. If the dictionary does not
                 contain the specified keys, those columns will be added with NaN values.
                 Types: str OR list of str
+
+            persist:
+                Optional Argument.
+                Specifies whether to persist the DataFrame.
+                Note:
+                    * This argument is only applicable when the "data" argument is of type dict, list or
+                      pandas DataFrame.
+                Default Value: False
+                Types: bool

         EXAMPLES:
             >>> from teradataml.dataframe.dataframe import DataFrame
@@ -366,15 +377,24 @@ class DataFrame():
         # This attribute stores the root DataFrame columns.
         self._root_columns = None

+        # Internal argument, when this attribute is set to True, the teradataml DataFrame locks
+        # the corresponding row(s) in the underlying table(s) while accessing the data.
+        _lock_rows = kwargs.get("_lock_rows", False)
+
         self._datalake = None
         self._database = None
         self._table = None
         self._otf = False
+        self._df_type = None
+        self._valid_time_column = None
+        self._transaction_time_column = None
+

         table_name = kwargs.get("table_name", None)
         primary_index = kwargs.get("primary_index", None)
         columns = kwargs.get("columns", None)
         types = kwargs.get("types", None)
+        persist = kwargs.get("persist", False)

         # Check if the data is an instance of in_schema or if the data is None
         # and table_name is an instance of in_schema, then assign the table_name,
@@ -441,9 +461,11 @@ class DataFrame():
             pd_data = data.copy()
             # If the columns are not of type string, then convert them to string.
             pd_data.columns = [f"col_{i}" if isinstance(i, int) else i for i in pd_data.columns]
+
             # Set the table_name to the name of the table created in the database.
             table_name = UtilFuncs._generate_temp_table_name(prefix="from_pandas",
-                                                             table_type=TeradataConstants.TERADATA_TABLE
+                                                             table_type=TeradataConstants.TERADATA_TABLE,
+                                                             gc_on_quit=not(persist))

             copy_to_sql(pd_data, table_name, index=index, index_label=index_label, primary_index=primary_index,
                         types=types)
@@ -491,6 +513,8 @@ class DataFrame():

         if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
             __execute_params = (self._table_name, self._query, True)
+        elif configure.temp_object_type == TeradataConstants.TERADATA_VIEW:
+            __execute_params = (self._table_name, self._query, _lock_rows)

         try:
             __execute(*__execute_params)
@@ -519,6 +543,7 @@ class DataFrame():
             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                       MessageCodes.TDMLDF_CREATE_FAIL)

+        # _get_metaexpr() can be only used if self._table_name is set.
         if table_name or query:
             self._metaexpr = self._get_metaexpr()
             self._get_metadata_from_metaexpr(self._metaexpr)
@@ -717,7 +742,7 @@ class DataFrame():

     @classmethod
     @collect_queryband(queryband="DF_fromQuery")
-    def from_query(cls, query, index=True, index_label=None, materialize=False):
+    def from_query(cls, query, index=True, index_label=None, materialize=False, **kwargs):
         """
         Class method for creating a DataFrame from a query.

@@ -815,6 +840,7 @@ class DataFrame():
         df._nodeid = nodeid
         df._source_type = SourceType.TABLE.value

+
         if not reuse_metaexpr:
             # Create new _MetaExpression object using reference metaExpression
             # for newly created DataFrame.
@@ -862,7 +888,7 @@ class DataFrame():

     @classmethod
     @collect_queryband(queryband="DF_fromPandas")
-    def from_pandas(cls, pandas_df, index=True, index_label=None, primary_index=None):
+    def from_pandas(cls, pandas_df, index=True, index_label=None, primary_index=None, persist=False):
         """
         DESCRIPTION:
             Creates a teradataml DataFrame from a pandas DataFrame.
@@ -892,6 +918,12 @@ class DataFrame():
                 Specifies which column(s) to use as primary index for the teradataml DataFrame.
                 Types: str OR list of str

+            persist:
+                Optional Argument.
+                Specifies whether to persist the DataFrame.
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrame

@@ -942,14 +974,15 @@ class DataFrame():
         # Validate 'pandas_df' argument, other arguments, will be validated as part of DataFrame().
         arg_type_matrix = []
         arg_type_matrix.append(["pandas_df", pandas_df, False, (pd.DataFrame,), True])
-
+        arg_type_matrix.append(["persist", persist, True, (bool), True])
+
         _Validators._validate_function_arguments(arg_type_matrix)

-        return cls(pandas_df, index, index_label, primary_index=primary_index)
+        return cls(pandas_df, index, index_label, primary_index=primary_index, persist=persist)

     @classmethod
     @collect_queryband(queryband="DF_fromDict")
-    def from_dict(cls, data, columns=None):
+    def from_dict(cls, data, columns=None, persist=False):
         """
         DESCRIPTION:
             Creates a DataFrame from a dictionary containing values as lists or numpy arrays.
@@ -969,6 +1002,12 @@ class DataFrame():
                 Specifies the column names for the DataFrame.
                 Types: str OR list of str

+            persist:
+                Optional Argument.
+                Specifies whether to persist the DataFrame.
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrame

@@ -1002,10 +1041,11 @@ class DataFrame():
         arg_type_matrix = []
         arg_type_matrix.append(["data", data, False, (dict), True])
         arg_type_matrix.append(["columns", columns, True, (str, list), True])
+        arg_type_matrix.append(["persist", persist, True, (bool), True])

         _Validators._validate_function_arguments(arg_type_matrix)

-        return cls(data, columns=columns, index=False)
+        return cls(data, columns=columns, index=False, persist=persist)

     @classmethod
     @collect_queryband(queryband="DF_fromRecords")
@@ -1049,6 +1089,12 @@ class DataFrame():
                 Specifies the number of rows to be read from the data if the data is iterator.
                 Types: int

+            persist:
+                Optional Argument.
+                Specifies whether to persist the DataFrame.
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrame

@@ -1136,6 +1182,7 @@ class DataFrame():
         exclude = kwargs.get("exclude", None)
         coerce_float = kwargs.get("coerce_float", True)
         nrows = kwargs.get("nrows", None)
+        persist = kwargs.get("persist", False)

         arg_type_matrix = []
         dtypes = (list, tuple, dict)
@@ -1144,6 +1191,7 @@ class DataFrame():
         arg_type_matrix.append(["exclude", exclude, True, (_ListOf(str),), True])
         arg_type_matrix.append(["coerce_float", coerce_float, True, (bool, ), True])
         arg_type_matrix.append(["nrows", nrows, True, (int,), True])
+        arg_type_matrix.append(["persist", persist, True, (bool,), True])

         _Validators._validate_function_arguments(arg_type_matrix)

@@ -1152,7 +1200,7 @@ class DataFrame():

         df = pd.DataFrame.from_records(data, columns=columns, exclude=exclude,
                                        coerce_float=coerce_float, nrows=nrows)
-        return cls(df, index=False)
+        return cls(df, index=False, persist=persist)

     def create_temp_view(self, name):
         """
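Taken together, the changes above thread the new `persist` flag from `from_pandas`, `from_dict` and `from_records` down to temp-table creation, where `gc_on_quit=not(persist)` decides whether the backing table is dropped by the garbage collector when the session ends. A minimal usage sketch, assuming a connected teradataml session (the data values here are illustrative, not from the package):

>>> import pandas as pd
>>> from teradataml import DataFrame
>>> pdf = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
>>> # Default persist=False: the backing table is registered for garbage collection on quit.
>>> df_temp = DataFrame.from_pandas(pdf)
>>> # persist=True: garbage collection is skipped, so the backing table survives the session.
>>> df_kept = DataFrame.from_pandas(pdf, persist=True)
>>> df_dict = DataFrame.from_dict({"id": [1, 2], "name": ["a", "b"]}, persist=True)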
@@ -1546,6 +1594,57 @@ class DataFrame():
             self._is_art = res[0][0] == 1
         return self._is_art

+
+    def _process_columns_metadata(self):
+        """
+        DESCRIPTION:
+            Processes the metadata of columns to determine their time dimension properties
+            and to check whether database object is a view, volatile table, or ART table.
+
+        PARAMETERS:
+            None
+
+        RAISES:
+            None
+
+        RETURNS:
+            Tuple containing five boolean values:
+            - is_view: True if the database object is a view, False otherwise.
+            - is_volatile: True if the database object is a volatile table, False otherwise.
+            - is_art_table: True if the database object is an ART table, False otherwise.
+            - has_valid_time: True if any column has a valid time dimension, False otherwise.
+            - has_transaction_time: True if any column has a transaction time dimension, False otherwise.
+        EXAMPLES:
+            >>> load_example_data("teradataml", "Employee")
+            >>> df = DataFrame.from_table("Employee")
+            >>> is_view, is_volatile, is_art_table, valid_time, transaction_time = (
+                    df._process_columns_metadata()
+                )
+            >>> is_view, is_volatile, is_art_table, valid_time, transaction_time
+            (False, False, False, True, True)
+
+        """
+
+        is_view = is_volatile = is_art_table = False
+
+        for col in self._metaexpr.c:
+            metadata = col.expression.info
+            time_dimension = metadata.get('time_dimension')
+            is_view = metadata.get('is_view', is_view)
+            is_volatile = metadata.get('is_volatile', is_volatile)
+            is_art_table = metadata.get('is_art_table', is_art_table)
+
+            if time_dimension == "V":
+                self._valid_time_column = col
+
+            if time_dimension == "T":
+                self._transaction_time_column = col
+
+        has_valid_time = self._valid_time_column is not None
+        has_transaction_time = self._transaction_time_column is not None
+
+        return is_view, is_volatile, is_art_table, has_valid_time, has_transaction_time
+
     def _get_metadata_from_metaexpr(self, metaexpr):
         """
         Private method for setting _metaexpr and retrieving column names and types.
@@ -1598,6 +1697,7 @@ class DataFrame():
         meta = sqlalchemy.MetaData()
         db_schema = UtilFuncs._extract_db_name(self._table_name)
         db_table_name = UtilFuncs._extract_table_name(self._table_name)
+
         if not self._datalake:
             t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
             return _MetaExpression(t)
@@ -1626,7 +1726,7 @@ class DataFrame():
         # Create a SQLAlchemy table object representing datalake table.
         t = sqlalchemy.Table(self._table, meta, schema=self._database,
                              *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
-        return _MetaExpression(t)
+        return _MetaExpression(t, datalake=self._datalake)

     def __getattr__(self, name):
         """
@@ -2165,6 +2265,148 @@ class DataFrame():
         td_metadata = [(column.name, repr(column.type)) for column in self._metaexpr.c]
         return MetaData(td_metadata)

+    @property
+    def df_type(self):
+        """
+        DESCRIPTION:
+            Returns the type of the DataFrame based on the underlying database object.
+            Possible teradataml DataFrame types are:
+                - VALID_TIME_VIEW: DataFrame is created on Valid-Time dimension view.
+                - TRANSACTION_TIME_VIEW: DataFrame is created on Transaction-Time dimension view.
+                - BI_TEMPORAL_VIEW: DataFrame is created on Bi-temporal view.
+                - VALID_TIME: DataFrame is created on Valid-Time dimension table.
+                - TRANSACTION_TIME: DataFrame is created on Transaction-Time dimension table.
+                - BI_TEMPORAL: DataFrame is created on Bi-temporal dimension table.
+                - VIEW: DataFrame is created on a view.
+                - TABLE: DataFrame is created on a table.
+                - OTF: DataFrame is created on an OTF table.
+                - ART: DataFrame is created on an ART table.
+                - VOLATILE_TABLE: DataFrame is created on a volatile table.
+                - BI_TEMPORAL_VOLATILE_TABLE: DataFrame is created on a Bi-temporal dimension volatile table.
+                - VALID_TIME_VOLATILE_TABLE: DataFrame is created on a Valid-Time dimension volatile table.
+                - TRANSACTION_TIME_VOLATILE_TABLE: DataFrame is created on a Transaction-Time dimension volatile table.
+
+        RETURNS:
+            str
+
+        RAISES:
+            None
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_roles")    # load valid time data.
+            >>> load_example_data("teradataml", "Employee_Address")  # load transaction time data.
+            >>> load_example_data("teradataml", "Employee")          # load bitemporal data.
+            >>> load_example_data("uaf", ["ocean_buoys2"])           # load data to create art table.
+            >>> load_example_data('dataframe', ['admissions_train']) # load data to create a regular table.
+
+            # Example 1: DataFrame created on a Valid-Time dimension table.
+            >>> df = DataFrame.from_table('Employee_roles')
+            >>> df.df_type
+            'VALID_TIME'
+
+            # Example 2: DataFrame created on a Transaction-Time dimension table.
+            >>> df = DataFrame.from_table('Employee_Address')
+            >>> df.df_type
+            'TRANSACTION_TIME'
+
+            # Example 3: DataFrame created on a Bi-temporal dimension table.
+            >>> df = DataFrame.from_table('Employee')
+            >>> df.df_type
+            'BI_TEMPORAL'
+
+            # Example 4: DataFrame created on a ART table.
+            >>> data = DataFrame.from_table('ocean_buoys2')
+            >>> from teradataml import TDSeries,SInfo
+            >>> data_series_df = TDSeries(data=data,
+            ...                           id=["ocean_name","buoyid"],
+            ...                           row_index="TD_TIMECODE",
+            ...                           row_index_style="TIMECODE",
+            ...                           payload_field="jsoncol.Measure.salinity",
+            ...                           payload_content="REAL")
+            >>> uaf_out = SInfo(data=data_series_df, output_table_name='TSINFO_RESULTS')
+            >>> df = DataFrame.from_table('TSINFO_RESULTS')
+            >>> df.df_type
+            'ART'
+
+            # Example 5: DataFrame created on a regular table.
+            >>> df = DataFrame.from_table('admissions_train')
+            >>> df.df_type
+            'REGULAR_TABLE'
+
+            # Example 6: DataFrame created on a volatile table.
+            >>> df = DataFrame.from_table('admissions_train')
+            >>> df.to_sql(table_name='admissions_train_volatile', temporary=True)
+            >>> df = DataFrame.from_table('admissions_train_volatile')
+            >>> df.df_type
+            'VOLATILE_TABLE'
+
+            # Example 7: DataFrame created on a Bi-temporal dimension view.
+            >>> execute_sql('create view Employee_view AS SEQUENCED VALIDTIME AND SEQUENCED TRANSACTIONTIME select * from Employee')
+            >>> df = DataFrame.from_table('Employee_view')
+            >>> df.df_type
+            'BI_TEMPORAL_VIEW'
+
+        """
+
+        if self._df_type is not None:
+            return self._df_type
+
+        is_view, is_volatile, is_art_table, valid_time, transaction_time = (
+            self._process_columns_metadata()
+        )
+
+        # Check if the DataFrame is created from an OTF table
+        if self._otf:
+            self._df_type = DataFrameTypes.OTF_TABLE.value
+            return self._df_type
+
+        # Check if the DataFrame is created from an ART table
+        if is_art_table:
+            self._df_type = DataFrameTypes.ART_TABLE.value
+            return self._df_type
+
+        # Determine the type based on valid-time, transaction-time columns, and volatility
+        if valid_time and transaction_time:
+            if is_volatile:
+                self._df_type = DataFrameTypes.BI_TEMPORAL_VOLATILE_TABLE.value
+            else:
+                self._df_type = (
+                    DataFrameTypes.BI_TEMPORAL_VIEW.value
+                    if is_view
+                    else DataFrameTypes.BI_TEMPORAL.value
+                )
+        elif valid_time:
+            if is_volatile:
+                self._df_type = DataFrameTypes.VALID_TIME_VOLATILE_TABLE.value
+            else:
+                self._df_type = (
+                    DataFrameTypes.VALID_TIME_VIEW.value
+                    if is_view
+                    else DataFrameTypes.VALID_TIME.value
+                )
+        elif transaction_time:
+            if is_volatile:
+                self._df_type = DataFrameTypes.TRANSACTION_TIME_VOLATILE_TABLE.value
+            else:
+                self._df_type = (
+                    DataFrameTypes.TRANSACTION_TIME_VIEW.value
+                    if is_view
+                    else DataFrameTypes.TRANSACTION_TIME.value
+                )
+        else:
+            self._df_type = (
+                DataFrameTypes.VOLATILE_TABLE.value
+                if is_volatile
+                else (
+                    DataFrameTypes.VIEW.value
+                    if is_view
+                    else DataFrameTypes.REGULAR_TABLE.value
+                )
+            )
+
+        return self._df_type
+
     @collect_queryband(queryband="DF_info")
     def info(self, verbose=True, buf=None, max_cols=None, null_counts=False):
         """
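The new `df_type` property computes its classification once and caches it in `_df_type`; the temporal accessors (such as the `as_of` method added later in this diff) validate against it. A minimal sketch of branching on the property, assuming the `Employee_roles` valid-time table from the docstring examples above is loaded:

>>> from teradataml import DataFrame, load_example_data
>>> load_example_data("teradataml", "Employee_roles")
>>> df = DataFrame("Employee_roles")
>>> # Apply a time qualifier only when the underlying object is temporal.
>>> if df.df_type in ("VALID_TIME", "BI_TEMPORAL"):
...     df = df.as_of(valid_time="current")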
@@ -6408,7 +6650,8 @@ class DataFrame():

         new_metaexpr = UtilFuncs._get_metaexpr_using_columns(aggregate_node_id,
                                                              zip(new_column_names,
-                                                                 new_column_types)
+                                                                 new_column_types),
+                                                             datalake=self._metaexpr.datalake)
         agg_df = self._create_dataframe_from_node \
             (aggregate_node_id, new_metaexpr, self._index_label)

@@ -6827,7 +7070,8 @@ class DataFrame():
             sel_nodeid = self._aed_utils._aed_select(self._nodeid, column_expression)

             # Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
-            new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items()
+            new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items(),
+                                                                 datalake=self._metaexpr.datalake)
             return self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)

         except TeradataMlException:
@@ -7777,7 +8021,8 @@ class DataFrame():

         # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
         # and underlying table name.
-        new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items()
+        new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items(),
+                                                             datalake=self._metaexpr.datalake)

         # Return a new joined dataframe.
         return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
@@ -9150,7 +9395,6 @@ class DataFrame():

         return df

-
     @collect_queryband(queryband="DF_get")
     def get(self, key):
         """
@@ -9260,7 +9504,7 @@ class DataFrame():
         append:
             Optional Argument.
             Specifies whether or not to append requested columns to the existing index.
-
+            When append is False, replaces existing index.
             When append is True, retains both existing & currently appended index.
             Default Value: False
             Types: bool
@@ -9476,8 +9720,8 @@ class DataFrame():
         include_grouping_columns:
             Optional Argument.
             Specifies whether to include aggregations on the grouping column(s) or not.
-            When set to True, the resultant DataFrame will have the aggregations on the
-            columns mentioned in "columns_expr". Otherwise, resultant DataFrame will not have
+            When set to True, the resultant DataFrame will have the aggregations on the
+            columns mentioned in "columns_expr". Otherwise, resultant DataFrame will not have
             aggregations on the columns mentioned in "columns_expr".
             Default Value: False
             Types: bool
@@ -9517,7 +9761,7 @@ class DataFrame():
        26      yes  3.57    Advanced    Advanced         1
        17       no  3.83    Advanced    Advanced         1

-       # Example 1: Find the minimum value of all valid columns by
+       # Example 1: Find the minimum value of all valid columns by
        #            grouping the DataFrame with column 'masters'.
        >>> df1 = df.groupby(["masters"])
        >>> df1.min()
@@ -9538,7 +9782,7 @@ class DataFrame():

        # Example 3: Find the sum of all valid columns by grouping the DataFrame with
        #            columns 'masters' and 'admitted'. Do not include grouping columns
-       #            in aggregate function 'sum'.
+       #            in aggregate function 'sum'.
        >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=False)
        >>> df1.sum()
        masters admitted  sum_id  sum_gpa
@@ -11964,7 +12208,8 @@ class DataFrame():
        column_info = ((col_name, col_type) for col_name, col_type in
                       new_metaexpr_columns_types.items())
        # Get new metaexpr for sample_node_id
-       new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sample_node_id, column_info, is_persist=True
+       new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sample_node_id, column_info, is_persist=True,
+                                                            datalake=self._metaexpr.datalake)

        # Make this non-lazy. Added this in order to fix https://teradata-pe.atlassian.net/browse/ELE-6368
        # Cannot use __execute_node_and_set_table_name because self points to original df.
@@ -13399,9 +13644,9 @@ class DataFrame():
            3. When ColumnExpression(s) is(are) passed to "order_columns", then the
               corresponding expression takes precedence over arguments
               "sort_ascending" and "nulls_first". Say, ColumnExpression is col1, then
-
-
-
+               1. col1.asc() or col.desc() is effective irrespective of "sort_ascending".
+               2. col1.nulls_first() or col.nulls_last() is effective irrespective of "nulls_first".
+               3. Any combination of above two take precedence over "sort_ascending" and "nulls_first".
            Types: str OR list of Strings (str) OR ColumnExpression OR list of ColumnExpressions

        sort_ascending:
@@ -13682,7 +13927,9 @@ class DataFrame():

        col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
        sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
-       new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items()
+       new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items(),
+                                                            datalake=self._metaexpr.datalake)
+
        return self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)

    @collect_queryband(queryband="DF_toCsv")
@@ -15880,8 +16127,8 @@ class DataFrame():
        include_grouping_columns:
            Optional Argument.
            Specifies whether to include aggregations on the grouping column(s) or not.
-           When set to True, the resultant DataFrame will have the aggregations on the
-           columns mentioned in "columns". Otherwise, resultant DataFrame will not have
+           When set to True, the resultant DataFrame will have the aggregations on the
+           columns mentioned in "columns". Otherwise, resultant DataFrame will not have
            aggregations on the columns mentioned in "columns".
            Default Value: False
            Types: bool
@@ -15946,7 +16193,7 @@ class DataFrame():

        # Example 3: Find the avg of all valid columns by grouping the DataFrame with
        #            columns 'masters' and 'admitted'. Do not include grouping columns
-       #            in aggregate function 'avg'.
+       #            in aggregate function 'avg'.
        >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=False).avg()
        >>> df1
        masters admitted  avg_id  avg_gpa
@@ -15993,8 +16240,8 @@ class DataFrame():
        include_grouping_columns:
            Optional Argument.
            Specifies whether to include aggregations on the grouping column(s) or not.
-           When set to True, the resultant DataFrame will have the aggregations on the
-           columns mentioned in "columns". Otherwise, resultant DataFrame will not have
+           When set to True, the resultant DataFrame will have the aggregations on the
+           columns mentioned in "columns". Otherwise, resultant DataFrame will not have
            aggregations on the columns mentioned in "columns".
            Default Value: False
            Types: bool
@@ -16039,7 +16286,7 @@ class DataFrame():
        6     yes  Beginner      13   14.71           2
        7     yes  Advanced     366   49.26           7
        8      no  Advanced     189   34.95           9
-
+
        # Example 2: Find the avg of all valid columns by grouping the DataFrame
        #            with columns 'masters' and 'admitted'. Include grouping columns
        #            in aggregate function 'avg'.
@@ -16331,6 +16578,878 @@ class DataFrame():
|
|
|
16331
16578
|
"Use valid timestamp or correct snapshot id listed using 'snapshots' property.".format(as_of)),
|
|
16332
16579
|
MessageCodes.FUNC_EXECUTION_FAILED)
|
|
16333
16580
|
|
|
16581
|
+
def as_of(self, **kwargs):
|
|
16582
|
+
"""
|
|
16583
|
+
DESCRIPTION:
|
|
16584
|
+
Function to get DataFrame at specific time on temporal table.
|
|
16585
|
+
Note:
|
|
16586
|
+
Function is supported only on temporal tables or temporal views.
|
|
16587
|
+
|
|
16588
|
+
PARAMETERS:
|
|
16589
|
+
kwargs:
|
|
16590
|
+
Specifies keyword arguments.
|
|
16591
|
+
|
|
16592
|
+
valid_time:
|
|
16593
|
+
Optional Argument.
|
|
16594
|
+
Specifies the valid time to retrieve data from DataFrame created on either ValidTime
|
|
16595
|
+
or BiTemporal table/view.
|
|
16596
|
+
Notes:
|
|
16597
|
+
* Either "valid_time" or "transaction_time" must be provided.
|
|
16598
|
+
* Argument accepts below values:
|
|
16599
|
+
* "current" - to get the current valid time data.
|
|
16600
|
+
* any string other than "current" is considered as date and data will be retrieved at that of time.
|
|
16601
|
+
* date object - to get the data valid on that date.
|
|
16602
|
+
* datetime object - to get the data valid at that point of time.
|
|
16603
|
+
* tuple - to get the data which is valid between the two valid times.
|
|
16604
|
+
* tuple should have only two elements. First element considered as starting time
|
|
16605
|
+
and second element considered as end time for a period of time.
|
|
16606
|
+
Records will be retrieved which are valid between the two valid times.
|
|
16607
|
+
* Both elements can be of date or datetime or string type. If you are using
|
|
16608
|
+
string, make sure the string represents a valid date.
|
|
16609
|
+
* Any element can be None.
|
|
16610
|
+
* If first element is None and valid time dimension column is PERIOD_DATE type,
|
|
16611
|
+
then it is considered as '0001-01-01'.
|
|
16612
|
+
* If first element is None and valid time dimension column is PERIOD_TIMESTAMP type,
|
|
16613
|
+
then it is considered as '0001-01-01 00:00:00.000000+00:00'.
|
|
16614
|
+
* If second element is None and valid time dimension column is PERIOD_DATE type,
|
|
16615
|
+
then it is considered as '9999-12-31'.
|
|
16616
|
+
* If second element is None and valid time dimension column is PERIOD_TIMESTAMP type,
|
|
16617
|
+
then it is considered as '9999-12-31 23:59:59.999999+00:00'.
|
|
16618
|
+
* None - to consider the DataFrame as regular DataFrame and retrieve all the records from
|
|
16619
|
+
valid time dimension.
|
|
16620
|
+
Types: date or str or tuple or NoneType
|
|
16621
|
+
|
|
16622
|
+
include_valid_time_column:
|
|
16623
|
+
Optional Argument.
|
|
16624
|
+
Specifies whether to include the valid time dimension column in the resultant DataFrame.
|
|
16625
|
+
When set to True, valid time dimension column is included in resultant DataFrame.
|
|
16626
|
+
Otherwise, valid time dimension column is not included in resultant DataFrame.
|
|
16627
|
+
Note:
|
|
16628
|
+
Ignored when "valid_time" is either tuple or None.
|
|
16629
|
+
Default Value: False
|
|
16630
|
+
Types: bool
|
|
16631
|
+
|
|
16632
|
+
transaction_time:
|
|
16633
|
+
Optional Argument.
|
|
16634
|
+
Specifies the transaction time to retrieve data from DataFrame created on either
|
|
16635
|
+
TransactionTime or BiTemporal table/view.
|
|
16636
|
+
Notes:
|
|
16637
|
+
* Either "valid_time" or "transaction_time" must be provided.
|
|
16638
|
+
* Argument accepts below values.
|
|
16639
|
+
* "current" - to get the records which are valid at current time.
|
|
16640
|
+
* any string other than "current" is considered as timestamp and records which are
|
|
16641
|
+
valid at that of time.
|
|
16642
|
+
* datetime object - to get the records which are valid at that of time.
|
|
16643
|
+
* None - to consider the DataFrame as regular DataFrame and retrieve all the records
|
|
16644
|
+
from transaction time dimension.
|
|
16645
|
+
Types: datetime or str or NoneType
|
|
16646
|
+
|
|
16647
|
+
include_transaction_time_column:
|
|
16648
|
+
Optional Argument.
|
|
16649
|
+
Specifies whether to include the transaction time dimension column in the resultant DataFrame.
|
|
16650
|
+
When set to True, transaction time dimension column is included in resultant DataFrame.
|
|
16651
|
+
Otherwise, transaction time dimension column is not included in resultant DataFrame.
|
|
16652
|
+
Default Value: False
|
|
16653
|
+
Types: bool
|
|
16654
|
+
|
|
16655
|
+
additional_period:
|
|
16656
|
+
Optional Argument.
|
|
16657
|
+
Specifies the additional period to be kept in resultant DataFrame.
|
|
16658
|
+
Note:
|
|
16659
|
+
This is applicable only when "valid_time" is None.
|
|
16660
|
+
Types: tuple of date or str
|
|
16661
|
+
|
|
16662
|
+
RETURNS:
|
|
16663
|
+
teradataml DataFrame
|
|
16664
|
+
|
|
16665
|
+
RAISES:
|
|
16666
|
+
TeradatamlException.
|
|
16667
|
+
|
|
16668
|
+
EXAMPLES:
|
|
16669
|
+
# Load the data to run the example.
|
|
16670
|
+
>>> load_example_data("teradataml", "Employee_roles") # load valid time data.
|
|
16671
|
+
>>> load_example_data("teradataml", "Employee_Address") # load transaction time data.
|
|
16672
|
+
>>> load_example_data("teradataml", "Employee") # load bitemporal data.
|
|
16673
|
+
|
|
16674
|
+
>>> df1 = DataFrame("Employee_roles")
|
|
16675
|
+
EmployeeName Department Salary role_validity_period
|
|
16676
|
+
EmployeeID
|
|
16677
|
+
1 John Doe IT 100.0 ('20/01/01', '24/12/31')
|
|
16678
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16679
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16680
|
+
3 Bob Sales 300.0 ('24/01/01', '24/12/31')
|
|
16681
|
+
|
|
16682
|
+
# Example 1: Get the employee roles from DataFrame df1 which are valid at current time.
|
|
16683
|
+
>>> df1.as_of(valid_time="current")
|
|
16684
|
+
EmployeeName Department Salary
|
|
16685
|
+
EmployeeID
|
|
16686
|
+
2 Jane Smith DA 200.0
|
|
16687
|
+
3 Bob Marketing 330.0
|
|
16688
|
+
|
|
16689
|
+
# Example 2: Get the employee roles from DataFrame df1 which are valid at current time.
|
|
16690
|
+
# Also include valid time dimension column.
|
|
16691
|
+
>>> df1.as_of(valid_time="current", include_valid_time_column=True)
|
|
16692
|
+
EmployeeName Department Salary role_validity_period
|
|
16693
|
+
EmployeeID
|
|
16694
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16695
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16696
|
+
|
|
16697
|
+
# Example 3: Get the employee roles from DataFrame df1 which are valid at 31st Dec 2026.
|
|
16698
|
+
Include valid time dimension column.
|
|
16699
|
+
>>> df1.as_of(valid_time="2026-12-31", include_valid_time_column=True)
|
|
16700
|
+
EmployeeName Department Salary role_validity_period
|
|
16701
|
+
EmployeeID
|
|
16702
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16703
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16704
|
+
|
|
16705
|
+
# Example 4: Get the employee roles from DataFrame df1 which are valid at 31st Dec 2026.
|
|
16706
|
+
# Also include valid time dimension column. Use date object instead of string
|
|
16707
|
+
# to specify the date.
|
|
16708
|
+
>>> from datetime import date
|
|
16709
|
+
>>> d = date(2026, 12, 31)
|
|
16710
|
+
>>> df1.as_of(valid_time=d, include_valid_time_column=True)
|
|
16711
|
+
EmployeeName Department Salary role_validity_period
|
|
16712
|
+
EmployeeID
|
|
16713
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16714
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16715
|
+
|
|
16716
|
+
# Example 5: Get the employee roles which are valid between 20th Jan 2018 and 5th March 2024.
|
|
16717
|
+
# Include valid time dimension column.
|
|
16718
|
+
>>> df1.as_of(valid_time=("2018-01-20", "2024-03-05"), include_valid_time_column=True)
|
|
16719
|
+
EmployeeName Department Salary VALIDTIME
|
|
16720
|
+
EmployeeID
|
|
16721
|
+
2 Jane Smith DA 200.0 ('20/01/01', '24/03/05')
|
|
16722
|
+
1 John Doe IT 100.0 ('20/01/01', '24/03/05')
|
|
16723
|
+
3 Bob Sales 300.0 ('24/01/01', '24/03/05')
|
|
16724
|
+
|
|
16725
|
+
# Example 6: Get the employee roles which are valid between 20th Jan 2018 and 5th March 2024.
|
|
16726
|
+
# Then again get the records which are valid at 1st Jan 2023. Do not include
|
|
16727
|
+
# valid time dimension column since selecting valid time dimension column is ignored
|
|
16728
|
+
# when "valid_time" is a tuple.
|
|
16729
|
+
>>> df1.as_of(valid_time=(date(2018, 1, 20), "2024-03-05")).as_of(valid_time=date(2023, 1, 1))
|
|
16730
|
+
EmployeeName Department Salary
|
|
16731
|
+
EmployeeID
|
|
16732
|
+
2 Jane Smith DA 200.0
|
|
16733
|
+
1 John Doe IT 100.0
|
|
16734
|
+
|
|
16735
|
+
# Example 7: Get the employee roles which are valid between 1st Jan 0001 and 1st Jun 2024.
|
|
16736
|
+
>>> df1.as_of(valid_time=(None, date(2024, 3, 5)))
|
|
16737
|
+
EmployeeName Department Salary VALIDTIME
|
|
16738
|
+
EmployeeID
|
|
16739
|
+
2 Jane Smith DA 200.0 ('20/01/01', '24/03/05')
|
|
16740
|
+
1 John Doe IT 100.0 ('20/01/01', '24/03/05')
|
|
16741
|
+
3 Bob Sales 300.0 ('24/01/01', '24/03/05')
|
|
16742
|
+
|
|
16743
|
+
# Example 8: Get the employee roles which are valid between 1st Jun 2024 and 31st Dec 9999.
|
|
16744
|
+
>>> df1.as_of(valid_time=("2024-06-01", None))
|
|
16745
|
+
EmployeeName Department Salary VALIDTIME
|
|
16746
|
+
EmployeeID
|
|
16747
|
+
1 John Doe IT 100.0 ('24/06/01', '24/12/31')
|
|
16748
|
+
2 Jane Smith DA 200.0 ('24/06/01', '99/12/31')
|
|
16749
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16750
|
+
3 Bob Sales 300.0 ('24/06/01', '24/12/31')
|
|
16751
|
+
|
|
16752
|
+
# Example 9: Consider df1 as regular DataFrame and retrieve all the records irrespective
|
|
16753
|
+
# whether records are valid or not.
|
|
16754
|
+
>>> df1.as_of(valid_time=None)
|
|
16755
|
+
EmployeeName Department Salary
|
|
16756
|
+
EmployeeID
|
|
16757
|
+
1 John Doe IT 100.0
|
|
16758
|
+
2 Jane Smith DA 200.0
|
|
16759
|
+
3 Bob Marketing 330.0
|
|
16760
|
+
3 Bob Sales 300.0
|
|
16761
|
+
|
|
16762
|
+
# Example 10. Consider df1 as regular DataFrame and retrieve all the records irrespective
|
|
16763
|
+
# whether records are valid or not. Also include additional period and valid time
|
|
16764
|
+
# dimension column.
|
|
16765
|
+
>>> df1.as_of(valid_time=None, additional_period=("2024-01-01", "2024-03-05"), include_valid_time_column=True)
|
|
16766
|
+
EmployeeName Department Salary role_validity_period VALIDTIME
|
|
16767
|
+
EmployeeID
|
|
16768
|
+
1 John Doe IT 100.0 ('20/01/01', '24/12/31') ('24/01/01', '24/03/05')
|
|
16769
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31') ('24/01/01', '24/03/05')
|
|
16770
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31') ('24/01/01', '24/03/05')
|
|
16771
|
+
3 Bob Sales 300.0 ('24/01/01', '24/12/31') ('24/01/01', '24/03/05')
|
|
16772
|
+
|
|
16773
|
+
>>> df2 = DataFrame("Employee_Address")
|
|
16774
|
+
EmployeeName address validity_period
|
|
16775
|
+
EmployeeID
|
|
16776
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16777
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16778
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16779
|
+
|
|
16780
|
+
# Example 11: Consider df2 as regular DataFrame and retrieve all the records including historic
|
|
16781
|
+
# records. Also include transaction time dimension column.
|
|
16782
|
+
>>> df2.as_of(transaction_time=None, include_transaction_time_column=True)
|
|
16783
|
+
EmployeeName address validity_period
|
|
16784
|
+
EmployeeID
|
|
16785
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16786
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16787
|
+
3 Bob Johnson 789 Oak Street ('2025-03-04 15:41:44.610000+00:00', '2025-03-04 15:41:44.610001+00:00')
|
|
16788
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16789
|
+
|
|
16790
|
+
# Example 12: Get the employee address which are valid at current time from DataFrame df2.
|
|
16791
|
+
# Also include transaction time dimension column.
|
|
16792
|
+
>>> df2.as_of(transaction_time="current", include_transaction_time_column=True)
|
|
16793
|
+
EmployeeName address validity_period
|
|
16794
|
+
EmployeeID
|
|
16795
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16796
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16797
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16798
|
+
|
|
16799
|
+
# Example 13: Get the employee address which are valid at current time from DataFrame df2.
|
|
16800
|
+
# Do not include transaction time dimension column.
|
|
16801
|
+
>>> df2.as_of(transaction_time="current", include_transaction_time_column=False)
|
|
16802
|
+
EmployeeName address
|
|
16803
|
+
EmployeeID
|
|
16804
|
+
2 Jane Smith 456 Elm St
|
|
16805
|
+
1 John Doe 123 Main St
|
|
16806
|
+
3 Bob Johnson 789 Oak St
|
|
16807
|
+
|
|
16808
|
+
# Example 14: Get the employee address which are valid at 2025-03-04 15:41:44.610000+00:00 from DataFrame df2.
|
|
16809
|
+
# Include transaction time dimension column.
|
|
16810
|
+
>>> df2.as_of(transaction_time="2025-03-04 15:41:44.610000+00:00", include_transaction_time_column=True)
|
|
16811
|
+
EmployeeName address validity_period
|
|
16812
|
+
EmployeeID
|
|
16813
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16814
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16815
|
+
3 Bob Johnson 789 Oak Street ('2025-03-04 15:41:44.610000+00:00', '2025-03-04 15:41:44.610001+00:00')
|
|
16816
|
+
|
|
16817
|
+
# Example 15: Get the employee address which are valid at 2025-03-04 15:41:44.610001+00:00 from DataFrame df2.
|
|
16818
|
+
# Include transaction time dimension column.
|
|
16819
|
+
>>> from datetime import datetime, timezone, timedelta
|
|
16820
|
+
>>> dt = datetime(2025, 3, 4, 15, 41, 44, 610001)
|
|
16821
|
+
>>> dt_with_tz = dt.replace(tzinfo=timezone(timedelta(hours=0)))
|
|
16822
|
+
>>> df2.as_of(transaction_time=dt_with_tz, include_transaction_time_column=True)
|
|
16823
|
+
EmployeeName address validity_period
|
|
16824
|
+
EmployeeID
|
|
16825
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16826
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16827
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16828
|
+
|
|
16829
|
+
>>> df3 = DataFrame("Employee")
|
|
16830
|
+
EmployeeName address Department Salary role_validity validity_period
|
|
16831
|
+
EmployeeID
|
|
16832
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16833
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16834
|
+
3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16835
|
+
3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16836
|
+
|
|
16837
|
+
# Example 16: Get all the records from DataFrame df3 by considering the DataFrame as
|
|
16838
|
+
# regular DataFrame. Include both valid time and transaction time dimension columns.
|
|
16839
|
+
>>> df3.as_of(valid_time=None,
|
|
16840
|
+
... transaction_time=None,
|
|
16841
|
+
... include_valid_time_column=True,
|
|
16842
|
+
... include_transaction_time_column=True
|
|
16843
|
+
... )
|
|
16844
|
+
EmployeeName address Department Salary role_validity validity_period
|
|
16845
|
+
EmployeeID
|
|
16846
|
+
3 Bob 789 Oak Street Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
|
|
16847
|
+
3 Bob 789 Oak St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-03-04 18:09:08.830000+00:00', '2025-05-06 11:39:25.580000+00:00')
|
|
16848
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16849
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16850
|
+
3 Bob 789 Oak Street Marketing 330.0 ('25/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
|
|
16851
|
+
3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16852
|
+
3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16853
|
+
|
|
16854
|
+
# Example 17: Get the employee address from DataFrame df3 which are valid at 1st Jun 2024 from
|
|
16855
|
+
# valid time dimension and valid at '2025-03-04 18:09:08.720001+00:00' from transaction
|
|
16856
|
+
# time dimension. Include both valid time and transaction time dimension columns.
|
|
16857
|
+
>>> df3.as_of(valid_time="2024-06-01",
|
|
16858
|
+
... transaction_time="2025-03-04 18:09:08.720001+00:00",
|
|
16859
|
+
... include_valid_time_column=True,
|
|
16860
|
+
... include_transaction_time_column=True
|
|
16861
|
+
... )
|
|
16862
|
+
EmployeeName address Department Salary role_validity validity_period
|
|
16863
|
+
EmployeeID
|
|
16864
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16865
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16866
|
+
3 Bob 789 Oak Street Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
|
|
16867
|
+
|
|
16868
|
+
# Example 18: Get the employee address from DataFrame df3 which are valid at 25th Jan 2024
|
|
16869
|
+
# from valid time dimension and valid at current time from transaction time dimension.
|
|
16870
|
+
# Include only transaction time dimension column.
|
|
16871
|
+
>>> df3.as_of(valid_time=date(2024, 1, 25),
|
|
16872
|
+
... transaction_time="current",
|
|
16873
|
+
... include_transaction_time_column=True)
|
|
16874
|
+
EmployeeName address Department Salary validity_period
|
|
16875
|
+
EmployeeID
|
|
16876
|
+
2 Jane Smith 456 Elm St DA 200.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16877
|
+
1 John Doe 123 Main St IT 100.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16878
|
+
3 Bob 789 Oak St Sales 300.0 ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16879
|
+
|
|
16880
|
+
# Example 19: Get the employee address from DataFrame df3 which are valid between 1st Jan 2025
|
|
16881
|
+
# and 30th June 2025 from valid time dimension and valid at
|
|
16882
|
+
# '2025-03-04 18:08:59.720000+00:00' from transaction time dimension.
|
|
16883
|
+
# Include both valid time and transaction time dimension columns.
|
|
16884
|
+
>>> from datetime import datetime, timezone
|
|
16885
|
+
>>>df3.as_of(valid_time=("2025-01-01", date(2025, 6, 30)),
|
|
16886
|
+
... transaction_time=datetime(2025, 3, 4, 18, 8, 59, 720000).astimezone(timezone.utc),
|
|
16887
|
+
... include_transaction_time_column=True)
|
|
16888
|
+
EmployeeName address Department Salary validity_period VALIDTIME
|
|
16889
|
+
EmployeeID
|
|
16890
|
+
2 Jane Smith 456 Elm St DA 200.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('25/01/01', '25/06/30')
|
|
16891
|
+
3 Bob 789 Oak St Marketing 330.0 ('2025-03-04 18:09:08.830000+00:00', '2025-05-06 11:39:25.580000+00:00') ('25/01/01', '25/06/30')
|
|
16892
|
+
|
|
16893
|
+
# Example 20: Get the employee address from DataFrame df3 by considering the DataFrame as regular
|
|
16894
|
+
# DataFrame from valid time dimension and valid at current time from transaction time dimension.
|
|
16895
|
+
# Add additional period and include both valid time and transaction time dimension columns.
|
|
16896
|
+
>>> df3.as_of(valid_time=None,
|
|
16897
|
+
... transaction_time="current",
|
|
16898
|
+
... additional_period=("2024-01-01", "2024-03-05"),
|
|
16899
|
+
... include_valid_time_column=True,
|
|
16900
|
+
... include_transaction_time_column=True
|
|
16901
|
+
... )
|
|
16902
|
+
EmployeeName address Department Salary role_validity validity_period VALIDTIME
|
|
16903
|
+
EmployeeID
|
|
16904
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16905
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16906
|
+
3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16907
|
+
3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16908
|
+
"""
|
|
16909
|
+
|
|
16910
|
+
if "valid_time" not in kwargs and "transaction_time" not in kwargs:
|
|
16911
|
+
_Validators._validate_mutually_exclusive_arguments(
|
|
16912
|
+
None, "valid_time", None, "transaction_time")
|
|
16913
|
+
|
|
16914
|
+
# Validate argument types.
|
|
16915
|
+
_validation = []
|
|
16916
|
+
_validation.append(["valid_time", kwargs.get("valid_time"), True, (date, datetime, str, tuple, type(None))])
|
|
16917
|
+
_validation.append(["transaction_time", kwargs.get("transaction_time"), True, (datetime, str, type(None))])
|
|
16918
|
+
_validation.append(["additional_period", kwargs.get("additional_period"), True, (tuple, type(None))])
|
|
16919
|
+
_validation.append(["include_valid_time_column", kwargs.get("include_valid_time_column"), True, bool])
|
|
16920
|
+
_validation.append(["include_transaction_time_column", kwargs.get("include_transaction_time_column"), True, bool])
|
|
16921
|
+
|
|
16922
|
+
# Validate argument types
|
|
16923
|
+
_Validators._validate_function_arguments(_validation)
|
|
16924
|
+
|
|
16925
|
+
# Validate temporal table type.
|
|
16926
|
+
_Validators._validate_temporal_table_type(self.df_type)
|
|
16927
|
+
|
|
16928
|
+
# Extract valid_time and transaction_time from kwargs.
|
|
16929
|
+
valid_time = kwargs.get("valid_time")
|
|
16930
|
+
transaction_time = kwargs.get("transaction_time")
|
|
16931
|
+
additional_period = kwargs.get("additional_period")
|
|
16932
|
+
include_valid_time_column = kwargs.get("include_valid_time_column")
|
|
16933
|
+
include_transaction_time_column = kwargs.get("include_transaction_time_column")
|
|
16934
|
+
|
|
16935
|
+
# Validate if user specifies valid_time for a transaction time table.
|
|
16936
|
+
if "valid_time" in kwargs:
|
|
16937
|
+
_Validators._validate_as_of_arguments(df_type=self.df_type)
|
|
16938
|
+
|
|
16939
|
+
# Validate if user specifies transaction_time for a valid time table.
|
|
16940
|
+
if "transaction_time" in kwargs:
|
|
16941
|
+
_Validators._validate_as_of_arguments(df_type=self.df_type, argument_name='transaction_time')
|
|
16942
|
+
|
|
16943
|
+
add_vt_period = False
|
|
16944
|
+
|
|
16945
|
+
# Generate the time qualifier clause.
|
|
16946
|
+
if "valid_time" in kwargs and "transaction_time" not in kwargs:
|
|
16947
|
+
clause = self.__get_valid_time_clause(valid_time, additional_period)
|
|
16948
|
+
elif "transaction_time" in kwargs and "valid_time" not in kwargs:
|
|
16949
|
+
clause = self.__get_transaction_time_clause(transaction_time)
|
|
16950
|
+
else:
|
|
16951
|
+
# Generate both clauses.
|
|
16952
|
+
clause = "{} AND {}".format(self.__get_valid_time_clause(valid_time, additional_period),
|
|
16953
|
+
self.__get_transaction_time_clause(transaction_time)
|
|
16954
|
+
)
|
|
16955
|
+
|
|
16956
|
+
# Exclude the time dimension columns if user is not willing to see it in output DF.
|
|
16957
|
+
columns_to_exclude = []
|
|
16958
|
+
if not include_valid_time_column and self._valid_time_column:
|
|
16959
|
+
columns_to_exclude.append(self._valid_time_column.name)
|
|
16960
|
+
|
|
16961
|
+
if not include_transaction_time_column and self._transaction_time_column:
|
|
16962
|
+
columns_to_exclude.append(self._transaction_time_column.name)
|
|
16963
|
+
|
|
16964
|
+
columns = [col for col in self.columns if col not in columns_to_exclude]
|
|
16965
|
+
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
|
|
16966
|
+
|
|
16967
|
+
# Notes:
|
|
16968
|
+
# * If valid_time is tuple, i.e., for valid time qualifier SEQUENCED VALIDTIME,
|
|
16969
|
+
# add additional column VALIDTIME. This column should not be present in SELECT statement.
|
|
16970
|
+
# Also, ValidTime dimension column should not be present in SELECT statement. VALIDTIME column
|
|
16971
|
+
# acts as validTime dimension column here.
|
|
16972
|
+
# * Time qualifier NONSEQUENCED VALIDTIME PERIOD clause also produces additional column VALIDTIME.
|
|
16973
|
+
# Hence, add additional column VALIDTIME also returned in the output DataFrame. However, valid time
|
|
16974
|
+
# column can exist in SELECT statement.
|
|
16975
|
+
if isinstance(valid_time, tuple):
|
|
16976
|
+
add_vt_period = True
|
|
16977
|
+
columns = [col for col in columns if col != self._valid_time_column.name]
|
|
16978
|
+
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
|
|
16979
|
+
col_names_types["VALIDTIME"] = self._valid_time_column.type
|
|
16980
|
+
elif (isinstance(valid_time, type(None)) and additional_period is not None):
|
|
16981
|
+
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
|
|
16982
|
+
col_names_types["VALIDTIME"] = self._valid_time_column.type
|
|
16983
|
+
|
|
16984
|
+
# SELECT Node.
|
|
16985
|
+
column_expression = ", ".join(columns)
|
|
16986
|
+
sel_nodeid = self._aed_utils._aed_select(self._nodeid, column_expression, timestamp_expr=clause)
|
|
16987
|
+
|
|
16988
|
+
# Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
|
|
16989
|
+
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
|
|
16990
|
+
df = self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
|
|
16991
|
+
|
|
16992
|
+
# If time qualifier is SEQUENCED PERIOD, then add VALIDTIME column to DataFrame
|
|
16993
|
+
# since it produces temporal dataset.
|
|
16994
|
+
if add_vt_period:
|
|
16995
|
+
df._valid_time_column = df['VALIDTIME']
|
|
16996
|
+
|
|
16997
|
+
return df
|
|
16998
|
+
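When both dimensions are passed, as_of() joins the two qualifiers with AND. A minimal sketch of the combined clause, assuming both arguments are "current" (plain string assembly mirroring the else branch above; illustrative only, not the library's internal code path):

    # Illustrative only: the clause strings come from the two private helpers below.
    valid_clause = "CURRENT VALIDTIME"
    tx_clause = "CURRENT TRANSACTIONTIME"
    print("{} AND {}".format(valid_clause, tx_clause))
    # CURRENT VALIDTIME AND CURRENT TRANSACTIONTIME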
+    def __get_valid_time_clause(self, valid_time, additional_period=None):
+        """
+        DESCRIPTION:
+            Function to get the valid time clause for a temporal table.
+
+        PARAMETERS:
+            valid_time:
+                Required Argument.
+                Specifies the valid time dimension to represent temporal data when creating the DataFrame.
+                Types: date, datetime, str, tuple or None
+
+            additional_period:
+                Optional Argument.
+                Specifies the additional period to be kept in the DataFrame.
+                Note:
+                    This is applicable only when "valid_time" is None.
+                Types: tuple of date or str
+
+        RETURNS:
+            str
+
+        RAISES:
+            None.
+        """
+        is_vt_dt_type = isinstance(self._valid_time_column.type, tdtypes.PERIOD_DATE)
+        if valid_time == "current":
+            return "CURRENT VALIDTIME"
+
+        if isinstance(valid_time, (str, date, datetime)):
+            # If valid_time is a point in time, check the type of the temporal column.
+            # The ValidTime dimension allows both DATE and TIMESTAMP types for ValidTime
+            # dimension columns.
+            if is_vt_dt_type:
+                return "VALIDTIME AS OF DATE '{}'".format(valid_time)
+            return "VALIDTIME AS OF TIMESTAMP '{}'".format(valid_time)
+
+        # If valid_time is a tuple, then it is a period.
+        # User can specify start and/or end time. Derive the missing value.
+        if isinstance(valid_time, tuple):
+            start = valid_time[0]
+            end = valid_time[1]
+            start = ("0001-01-01" if is_vt_dt_type else '0001-01-01 00:00:00.000000+00:00') if start is None else str(
+                start)
+            end = ("9999-12-31" if is_vt_dt_type else '9999-12-31 23:59:59.999999+00:00') if end is None else str(end)
+            return "SEQUENCED VALIDTIME PERIOD '({}, {})'".format(start, end)
+
+        if isinstance(valid_time, type(None)) and additional_period is not None:
+            return "NONSEQUENCED VALIDTIME PERIOD '({}, {})'".format(additional_period[0], additional_period[1])
+
+        return "NONSEQUENCED VALIDTIME"
+
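Written out in isolation, the branches above reduce to a small mapping from the Python value's shape to a SQL time qualifier. A standalone, runnable sketch (the helper name is hypothetical, for illustration only; the clause strings match the method above):

    from datetime import date, datetime

    def sketch_valid_time_clause(valid_time, additional_period=None, is_date_col=True):
        # Hypothetical re-implementation of the branches above, for illustration.
        if valid_time == "current":
            return "CURRENT VALIDTIME"
        if isinstance(valid_time, (str, date, datetime)):
            return "VALIDTIME AS OF {} '{}'".format("DATE" if is_date_col else "TIMESTAMP", valid_time)
        if isinstance(valid_time, tuple):
            # Missing bounds default to the minimum/maximum representable date.
            start = "0001-01-01" if valid_time[0] is None else str(valid_time[0])
            end = "9999-12-31" if valid_time[1] is None else str(valid_time[1])
            return "SEQUENCED VALIDTIME PERIOD '({}, {})'".format(start, end)
        if valid_time is None and additional_period is not None:
            return "NONSEQUENCED VALIDTIME PERIOD '({}, {})'".format(*additional_period)
        return "NONSEQUENCED VALIDTIME"

    print(sketch_valid_time_clause("current"))             # CURRENT VALIDTIME
    print(sketch_valid_time_clause(("2025-01-01", None)))  # SEQUENCED VALIDTIME PERIOD '(2025-01-01, 9999-12-31)'
    print(sketch_valid_time_clause(None, ("2024-01-01", "2024-03-05")))
    # NONSEQUENCED VALIDTIME PERIOD '(2024-01-01, 2024-03-05)'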
+    def __get_transaction_time_clause(self, transaction_time):
+        """
+        DESCRIPTION:
+            Function to get the transaction time clause for a temporal table.
+
+        PARAMETERS:
+            transaction_time:
+                Required Argument.
+                Specifies the transaction time dimension to represent temporal data when creating the DataFrame.
+                Types: datetime, str or None
+
+        RETURNS:
+            str
+
+        RAISES:
+            None.
+        """
+        if transaction_time == "current":
+            return "CURRENT TRANSACTIONTIME"
+
+        if isinstance(transaction_time, type(None)):
+            return "NONSEQUENCED TRANSACTIONTIME"
+
+        return "TRANSACTIONTIME AS OF TIMESTAMP '{}'".format(transaction_time)
+
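The transaction-time counterpart has only three branches. The same kind of standalone sketch (hypothetical helper name, for illustration only):

    def sketch_transaction_time_clause(transaction_time):
        # Mirrors the three branches above.
        if transaction_time == "current":
            return "CURRENT TRANSACTIONTIME"
        if transaction_time is None:
            return "NONSEQUENCED TRANSACTIONTIME"
        return "TRANSACTIONTIME AS OF TIMESTAMP '{}'".format(transaction_time)

    print(sketch_transaction_time_clause("2025-03-04 18:08:59.720000+00:00"))
    # TRANSACTIONTIME AS OF TIMESTAMP '2025-03-04 18:08:59.720000+00:00'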
+    def _generate_temporal_dataframe(self, timestamp_expr, time_column):
+        """
+        DESCRIPTION:
+            Helper method to generate a temporal DataFrame based on the given timestamp expression.
+
+        PARAMETERS:
+            timestamp_expr:
+                Required Argument.
+                Specifies the timestamp expression to filter the temporal data.
+                Types: str
+
+            time_column:
+                Required Argument.
+                Specifies the temporal column (valid-time or transaction-time) to process.
+                Types: ColumnExpression
+
+        RAISES:
+            None.
+
+        RETURNS:
+            teradataml DataFrame
+        """
+        # Re-cast the temporal column to its own type so it is projected explicitly.
+        col_expr = "{} as {}".format(time_column.cast(time_column.type).compile(), time_column.name)
+        cols = [col.name if col.name != time_column.name else col_expr for col in self._metaexpr.c]
+        column_expression = ", ".join(cols)
+        sel_node_id = self._aed_utils._aed_select(self._nodeid, column_expression, timestamp_expr=timestamp_expr)
+        return self._create_dataframe_from_node(sel_node_id, self._metaexpr, self._index_label)
+
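The projection this helper assembles re-casts only the temporal column and passes every other column through by name. A sketch of the resulting column list, using hypothetical column names and CAST text (the exact expression emitted by time_column.cast(...).compile() may differ):

    # Hypothetical inputs, for illustration only.
    all_columns = ["EmployeeID", "EmployeeName", "validity_period"]
    time_col = "validity_period"
    cast_expr = "CAST(validity_period AS PERIOD(TIMESTAMP(6) WITH TIME ZONE)) as validity_period"

    cols = [c if c != time_col else cast_expr for c in all_columns]
    print(", ".join(cols))
    # EmployeeID, EmployeeName, CAST(validity_period AS PERIOD(TIMESTAMP(6) WITH TIME ZONE)) as validity_period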
+    def historic_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves historical rows from a DataFrame created on a valid-time
+            or bi-temporal table/view. Historical rows are defined as those where the
+            end of the valid-time period precedes the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_roles")
+
+            # Create a DataFrame on 'Employee_roles' table.
+            >>> df = DataFrame("Employee_roles")
+
+            # Retrieve historic rows from the DataFrame.
+            >>> df.historic_rows()
+            EmployeeID EmployeeName Department  Salary      role_validity_period
+                     1     John Doe         IT   100.0  ('20/01/01', '24/12/31')
+                     3          Bob      Sales   300.0  ('24/01/01', '24/12/31')
+        """
+
+        from teradataml.dataframe.functions import current_date, current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        valid_time_col = self._valid_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED VALIDTIME", valid_time_col)
+        # Check the type of the ValidTime dimension column.
+        if isinstance(valid_time_col.type, tdtypes.PERIOD_DATE):
+            # Filter records where the end of the ValidTime period is less than the current date.
+            return df[valid_time_col.end() < current_date()]
+        return df[valid_time_col.end() < current_timestamp()]
+
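historic_rows() is thus a NONSEQUENCED VALIDTIME projection followed by a filter on the period end. A hedged sketch of the same predicate written by hand (it uses the private _valid_time_column attribute purely for illustration, and omits the NONSEQUENCED VALIDTIME projection the method applies first):

    >>> from teradataml.dataframe.functions import current_date
    >>> vt_col = df._valid_time_column
    >>> df[vt_col.end() < current_date()]   # the filter applied for PERIOD(DATE) columns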
+    def future_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves future rows from a DataFrame created on a valid-time
+            or bi-temporal table/view. Future rows are defined as those where the
+            start of the valid-time period is greater than the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_roles")
+
+            # Create a DataFrame on 'Employee_roles' table.
+            >>> df = DataFrame("Employee_roles")
+
+            # Retrieve future rows from the DataFrame.
+            >>> df.future_rows()
+            EmployeeID EmployeeName Department  Salary      role_validity_period
+                     3          Bob  Marketing   330.0  ('29/01/01', '99/12/31')
+        """
+        from teradataml.dataframe.functions import current_date, current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        valid_time_col = self._valid_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED VALIDTIME", valid_time_col)
+        # Check the type of the ValidTime dimension column.
+        if isinstance(valid_time_col.type, tdtypes.PERIOD_DATE):
+            # Filter records where the start of the ValidTime period is greater than the current date.
+            return df[valid_time_col.begin() > current_date()]
+        return df[valid_time_col.begin() > current_timestamp()]
+
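future_rows() is the mirror image: it compares the period start against the current point in time, with the same dispatch on the column type. The hand-written predicate, with the same caveats as the historic_rows() sketch above:

    >>> from teradataml.dataframe.functions import current_timestamp
    >>> df[vt_col.begin() > current_timestamp()]   # for PERIOD(TIMESTAMP) columns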
+    def open_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves open rows from a DataFrame created on a transaction-time
+            or bi-temporal table/view. Open rows are defined as those where the
+            end of the transaction-time period is greater than or equal to the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_address")
+
+            # Create a DataFrame on 'Employee_address' table.
+            >>> df = DataFrame("Employee_address")
+
+            # Retrieve open rows from the DataFrame.
+            >>> df.open_rows()
+            EmployeeID EmployeeName      address                                                            validity_period
+                     1     John Doe  123 Main St  ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
+                     2   Jane Smith   456 Elm St  ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
+        """
+        from teradataml.dataframe.functions import current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        transaction_time_col = self._transaction_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED TRANSACTIONTIME", transaction_time_col)
+        return df[transaction_time_col.end() >= current_timestamp()]
+
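Open and closed rows partition the transaction-time dimension on a single boundary: the period end relative to now. A sketch of the pair of predicates, assuming tt_col is the transaction-time ColumnExpression (names hypothetical):

    >>> from teradataml.dataframe.functions import current_timestamp
    >>> open_df = df[tt_col.end() >= current_timestamp()]   # rows still in effect
    >>> closed_df = df[tt_col.end() < current_timestamp()]  # rows logically closed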
+    def closed_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves closed rows from a DataFrame created on a transaction-time
+            or bi-temporal table/view. Closed rows are defined as those where the
+            end of the transaction-time period is less than the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_address")
+
+            # Create a DataFrame on 'Employee_address' table.
+            >>> df = DataFrame("Employee_address")
+
+            # Retrieve closed rows from the DataFrame.
+            >>> df.closed_rows()
+            EmployeeID EmployeeName      address                                                            validity_period
+                     1     John Doe  123 Main St  ('2025-03-04 15:41:44.610000+00:00', '2025-04-01 23:59:59.999999+00:00')
+        """
+        from teradataml.dataframe.functions import current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        transaction_time_col = self._transaction_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED TRANSACTIONTIME", transaction_time_col)
+        return df[transaction_time_col.end() < current_timestamp()]
+
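Taken together, the four row filters reduce to four comparisons on the two period bounds. A small runnable summary (plain data, for reference only):

    # Which bound each filter compares against the current point in time.
    row_filters = {
        "historic_rows": "end(valid-time)        <  now",
        "future_rows":   "begin(valid-time)      >  now",
        "open_rows":     "end(transaction-time)  >= now",
        "closed_rows":   "end(transaction-time)  <  now",
    }
    for name, predicate in row_filters.items():
        print("{:<15}{}".format(name, predicate))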
+    @collect_queryband(queryband="DF_create_view")
+    def create_view(self, view_name, schema_name=None):
+        """
+        Creates a view from the DataFrame object in the specified schema.
+        Since teradataml internally creates views for its operations, and those views are
+        garbage collected during remove_context(), this function lets the user persist
+        the DataFrame as a view.
+        Note:
+            The persisted view can be used across sessions and can be accessed
+            using the view_name and schema_name.
+
+        PARAMETERS:
+            view_name:
+                Required Argument.
+                Specifies the name of the view to be persisted.
+                Types: str
+
+            schema_name:
+                Optional Argument.
+                Specifies the schema name where the view is to be persisted.
+                Note:
+                    If the schema_name is not provided, the current database is used.
+                Types: str
+
+        RETURNS:
+            Persisted teradataml DataFrame.
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("antiselect", ["antiselect_input"])
+            >>> antiselect_input = DataFrame.from_table("antiselect_input")
+            >>> antiselect_input
+                    orderid orderdate       priority  quantity       sales  discount        shipmode            custname province   region     custsegment          prodcat
+            rowids
+            49          293  12/10/01           high        49  10123.0200      0.07  delivery truck        barry french  nunavut  nunavut        consumer  office supplies
+            97          613  11/06/17           high        12     93.5400      0.03     regular air        carl jackson  nunavut  nunavut       corporate  office supplies
+            85          515  10/08/28  not specified        19    394.2700      0.08     regular air      carlos soltero  nunavut  nunavut        consumer  office supplies
+            86          515  10/08/28  not specified        21    146.6900      0.05     regular air      carlos soltero  nunavut  nunavut        consumer        furniture
+            1             3  10/10/13            low         6    261.5400      0.04     regular air  muhammed macintyre  nunavut  nunavut  small business  office supplies
+            50          293  12/10/01           high        27    244.5700      0.01     regular air        barry french  nunavut  nunavut        consumer  office supplies
+            80          483  11/07/10           high        30   4965.7595      0.08     regular air       clay rozendal  nunavut  nunavut       corporate       technology
+
+            # Filter the data based on quantity.
+            >>> anti_df = antiselect_input[antiselect_input.quantity < 30]
+            >>> anti_df
+                    orderid orderdate       priority  quantity   sales  discount     shipmode            custname province   region     custsegment          prodcat
+            rowids
+            97          613  11/06/17           high        12   93.54      0.03  regular air        carl jackson  nunavut  nunavut       corporate  office supplies
+            86          515  10/08/28  not specified        21  146.69      0.05  regular air      carlos soltero  nunavut  nunavut        consumer        furniture
+            85          515  10/08/28  not specified        19  394.27      0.08  regular air      carlos soltero  nunavut  nunavut        consumer  office supplies
+            1             3  10/10/13            low         6  261.54      0.04  regular air  muhammed macintyre  nunavut  nunavut  small business  office supplies
+            50          293  12/10/01           high        27  244.57      0.01  regular air        barry french  nunavut  nunavut        consumer  office supplies
+
+            # Run Antiselect on the filtered data. This creates a temporary view which will be garbage collected.
+            >>> obj = Antiselect(data=anti_df, exclude=['rowids', 'orderdate', 'discount', 'province', 'custsegment'])
+
+            # Get the view name that is internally created by teradataml to store the result of Antiselect.
+            >>> obj.result.db_object_name
+            '"<schema_name>"."ml__td_sqlmr_out__1752582812690000"'
+
+            # Check the output of Antiselect.
+            >>> obj.result
+               orderid       priority  quantity   sales     shipmode            custname   region          prodcat
+            0      613           high        12   93.54  regular air        carl jackson  nunavut  office supplies
+            1      515  not specified        21  146.69  regular air      carlos soltero  nunavut        furniture
+            2      515  not specified        19  394.27  regular air      carlos soltero  nunavut  office supplies
+            3      293           high        27  244.57  regular air        barry french  nunavut  office supplies
+            4        3            low         6  261.54  regular air  muhammed macintyre  nunavut  office supplies
+
+            # Describe the resultant DataFrame.
+            >>> df = obj.result.describe()  # This will create a temporary view.
+
+            # Get the view name.
+            >>> df.db_object_name
+            '"<schema_name>"."ml__td_sqlmr_out__1752585435339977"'
+
+            # Check the output of describe.
+            >>> df
+               ATTRIBUTE            StatName   StatValue
+            0    orderid             MAXIMUM  613.000000
+            1    orderid  STANDARD DEVIATION  245.016734
+            2    orderid     PERCENTILES(25)  293.000000
+            3    orderid     PERCENTILES(50)  515.000000
+            4   quantity               COUNT    5.000000
+            5   quantity             MINIMUM    6.000000
+            6   quantity             MAXIMUM   27.000000
+            7   quantity                MEAN   17.000000
+            8   quantity  STANDARD DEVIATION    8.154753
+            9   quantity     PERCENTILES(25)   12.000000
+
+            # Example 1: Persist the view which can be accessed across sessions.
+            >>> df_new = df.create_view(view_name="antiselect_describe_view")
+            >>> df_new
+               ATTRIBUTE            StatName   StatValue
+            0   quantity             MAXIMUM   27.000000
+            1   quantity  STANDARD DEVIATION    8.154753
+            2   quantity     PERCENTILES(25)   12.000000
+            3   quantity     PERCENTILES(50)   19.000000
+            4      sales               COUNT    5.000000
+            5      sales             MINIMUM   93.540000
+            6    orderid               COUNT    5.000000
+            7    orderid             MINIMUM    3.000000
+            8    orderid             MAXIMUM  613.000000
+            9    orderid                MEAN  387.800000
+
+            # Get the view name.
+            >>> df_new.db_object_name  # "<schema_name>" is the user's connected database.
+            '"<schema_name>"."antiselect_describe_view"'
+
+        """
+        # Argument validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["view_name", view_name, False, (str,), True])
+        arg_info_matrix.append(["schema_name", schema_name, True, (str,), True])
+        _Validators._validate_missing_required_arguments(arg_info_matrix)
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # TODO: Investigate and identify the issue when volatile tables replace views in future.
+
+        visited = set()
+        to_persist = []
+        is_teradataml_temp_table = lambda x: x.startswith("ml__") or x.startswith("tdml_")
+        sql_bundle = SQLBundle()
+
+        def trace_views(table_name):
+            if table_name in visited:
+                return
+            visited.add(table_name)
+            base_name = UtilFuncs._extract_table_name(full_qualified_name=table_name)
+            if is_teradataml_temp_table(base_name):
+                to_persist.append(table_name)
+            # Try to get the SQL for the view.
+            show_view_sql = sql_bundle._get_sql_query(SQLConstants.SQL_SHOW_VIEW).\
+                format(table_name)
+            try:
+                result = execute_sql(show_view_sql).fetchall()
+                if result:
+                    view_sql = result[0][0].replace("\r", "").replace("\n", " ")\
+                        .replace("\t", " ").strip()
+
+                    # Extract all table names from the view SQL.
+                    for tname in UtilFuncs.extract_table_names_from_query(view_sql):
+                        trace_views(tname)
+            except Exception as e:
+                # If the error says 'not a view', then try SHOW TABLE.
+                err_msg = str(e).lower()
+                if 'not a view' in err_msg:
+                    show_table_sql = sql_bundle._get_sql_query(SQLConstants.SQL_SHOW_TABLE).\
+                        format(table_name)
+                    try:
+                        result = execute_sql(show_table_sql).fetchall()
+                        if result:
+                            # Table found; nothing to trace further.
+                            # This table is persisted.
+                            return
+                    except Exception as e2:
+                        # If SHOW TABLE also fails, raise the exception.
+                        raise e2
+                else:
+                    # If the error is not about 'not a view', re-raise.
+                    raise e
+
+        # 1. Get the query for this DataFrame.
+        query = self.show_query()
+        # 2. Extract all table names from the query.
+        for tname in UtilFuncs.extract_table_names_from_query(query):
+            trace_views(tname)
+
+        # 3. Persist the current DataFrame as a permanent object.
+        # This issues a CREATE VIEW <view_name> AS SELECT ... statement.
+        # Use object_name, schema_name as needed.
+        from teradataml.dbutils.dbutils import _get_quoted_object_name
+        target_name = _get_quoted_object_name(schema_name=schema_name, object_name=view_name)
+
+        create_sql = sql_bundle._build_create_view(view_name=target_name,
+                                                   select_expression=query)
+
+        # No try-except here, as we want to raise any error that occurs during execution.
+        execute_sql(create_sql)
+
+        # TODO: Add a logger message that these views/tables persisted.
+        # if to_persist:
+        #     logger.info("to_persist: ", to_persist)
+
+        # Remove the tables/views from the GC file since they need to persist. Remove the
+        # entries only after the required object is created.
+        GarbageCollector._delete_object_entry(objects_to_delete=to_persist,
+                                              object_type=None,
+                                              remove_entry_from_gc_list=True)
+
+        # Return the teradataml DataFrame for the persisted object.
+        if schema_name is None:
+            schema_name = tdmlctx._get_current_databasename()
+        return DataFrame(in_schema(schema_name=schema_name, table_name=view_name))
+
+
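Because create_view() also removes the traced internal objects from the garbage-collection list, the persisted view stays usable after the session ends. A hedged usage sketch across two sessions (host, credentials, and names are placeholders):

    >>> # Session 1: persist the result under a user-chosen name.
    >>> persisted = obj.result.create_view(view_name="my_antiselect_view")

    >>> # Session 2: a fresh connection can reopen the same view by name.
    >>> from teradataml import create_context, DataFrame, in_schema
    >>> create_context(host="<host>", username="<user>", password="<password>")
    >>> df = DataFrame(in_schema("<schema_name>", "my_antiselect_view"))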
 class DataFrameGroupBy(DataFrame):
     """
     This class integrate GroupBy clause with AED.
@@ -16382,8 +17501,8 @@ class DataFrameGroupBy(DataFrame):
             include_grouping_columns:
                 Optional Argument.
                 Specifies whether to include aggregations on the grouping column(s) or not.
-                When set to True, the resultant DataFrame will have the aggregations on the
-                columns mentioned in "columns". Otherwise, resultant DataFrame will not have
+                When set to True, the resultant DataFrame will have the aggregations on the
+                columns mentioned in "columns". Otherwise, resultant DataFrame will not have
                 aggregations on the columns mentioned in "columns".
                 Default Value: False
                 Types: bool
@@ -16483,7 +17602,8 @@ class DataFrameGroupBy(DataFrame):
 
         new_meta = UtilFuncs._get_metaexpr_using_columns(new_nodeid,
                                                          zip(new_column_names,
-                                                             new_column_types)
+                                                             new_column_types),
+                                                         datalake=self._metaexpr.datalake)
 
         return (new_meta, new_nodeid)
 