teradataml 20.0.0.5__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +59 -11
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +10 -6
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +72 -2
- teradataml/common/exceptions.py +32 -0
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +73 -1
- teradataml/common/messages.py +27 -1
- teradataml/common/sqlbundle.py +25 -7
- teradataml/common/utils.py +210 -22
- teradataml/context/aed_context.py +16 -10
- teradataml/context/context.py +37 -9
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/byom/onnxembeddings.json +1 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/teradataml_example.json +75 -1
- teradataml/data/url_data.csv +10 -9
- teradataml/dataframe/copy_to.py +715 -55
- teradataml/dataframe/dataframe.py +2115 -97
- teradataml/dataframe/dataframe_utils.py +66 -28
- teradataml/dataframe/functions.py +1130 -2
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +710 -1039
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/hyperparameter_tuner/utils.py +4 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/opensource/_base.py +7 -1
- teradataml/options/configure.py +20 -4
- teradataml/scriptmgmt/UserEnv.py +247 -36
- teradataml/scriptmgmt/lls_utils.py +140 -39
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +900 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +409 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/query_generator.py +4 -21
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +95 -1
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -3
- teradataml/utils/validators.py +699 -18
- {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +312 -2
- {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +119 -87
- {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
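The largest user-visible change in this release is in teradataml/dataframe/dataframe.py (diff below): the DataFrame constructor now accepts in-memory data, and new classmethods from_pandas(), from_dict() and from_records() plus a df_type property are added. The sketch below exercises those entry points using only names and signatures that appear in the docstrings of the diff; the connection call and its credentials are placeholders, and behavior should be confirmed against the 20.0.0.7 documentation rather than taken from this note.

```python
import pandas as pd
from teradataml import create_context, DataFrame

# Placeholder credentials; assumes a reachable Vantage system.
create_context(host="<host>", username="<user>", password="<password>")

pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})

# The constructor now accepts in-memory data directly
# (pandas DataFrame, dict, list, numpy array), per the diff below.
df1 = DataFrame(pdf, index=True, index_label="id", primary_index="id")

# New classmethods documented in the diff below.
df2 = DataFrame.from_dict({"name": ["Alice", "Bob"], "age": [25, 30]})
df3 = DataFrame.from_records([("Alice", 1), ("Bob", 2)], columns=["name", "age"])

# New df_type property reports the kind of underlying database object.
print(df1.df_type)
```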
|
@@ -12,63 +12,74 @@ This file implements the teradataml dataframe.
|
|
|
12
12
|
A teradataml dataframe maps virtually to teradata tables and views.
|
|
13
13
|
"""
|
|
14
14
|
import decimal
|
|
15
|
-
import inspect
|
|
15
|
+
import inspect
|
|
16
|
+
import itertools
|
|
16
17
|
import json
|
|
17
18
|
import numbers
|
|
18
|
-
import pandas as pd
|
|
19
19
|
import re
|
|
20
|
-
import sqlalchemy
|
|
21
20
|
import sys
|
|
22
21
|
import urllib.parse
|
|
22
|
+
from collections import OrderedDict
|
|
23
|
+
from collections.abc import Iterator
|
|
23
24
|
|
|
25
|
+
import numpy as np
|
|
26
|
+
import pandas as pd
|
|
27
|
+
import sqlalchemy
|
|
24
28
|
from sqlalchemy import Column
|
|
29
|
+
from sqlalchemy.exc import NoSuchColumnError
|
|
30
|
+
from datetime import datetime, date
|
|
31
|
+
from sqlalchemy.sql import ClauseElement
|
|
32
|
+
from teradatasql import OperationalError
|
|
33
|
+
from teradatasqlalchemy import types as tdtypes
|
|
34
|
+
from teradatasqlalchemy.dialect import dialect as td_dialect
|
|
35
|
+
from teradatasqlalchemy.dialect import preparer
|
|
36
|
+
from teradatasqlalchemy.types import (BIGINT, BYTEINT, DECIMAL, FLOAT, INTEGER,
|
|
37
|
+
PERIOD_TIMESTAMP, SMALLINT, _TDType)
|
|
25
38
|
|
|
26
39
|
import teradataml.context.context as tdmlctx
|
|
27
|
-
|
|
28
|
-
from
|
|
29
|
-
|
|
30
|
-
from teradataml import
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
|
|
37
|
-
from teradataml.common.deprecations import argument_deprecation
|
|
38
|
-
from teradataml.common.utils import UtilFuncs
|
|
40
|
+
from teradataml import GarbageCollector, execute_sql
|
|
41
|
+
from teradataml.common.bulk_exposed_utils import \
|
|
42
|
+
_validate_unimplemented_function
|
|
43
|
+
from teradataml.common.constants import (AEDConstants, DataFrameTypes, OutputStyle,
|
|
44
|
+
PTITableConstants, PythonTypes,
|
|
45
|
+
SourceType, SQLConstants,
|
|
46
|
+
SQLFunctionConstants,
|
|
47
|
+
TableOperatorConstants,
|
|
48
|
+
TeradataConstants, TeradataTypes)
|
|
39
49
|
from teradataml.common.exceptions import TeradataMlException
|
|
40
|
-
from teradataml.common.messages import Messages
|
|
41
50
|
from teradataml.common.messagecodes import MessageCodes
|
|
42
|
-
from teradataml.common.
|
|
43
|
-
from teradataml.common.
|
|
44
|
-
|
|
45
|
-
from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, DataFrameUtils
|
|
46
|
-
from teradataml.dataframe.indexer import _LocationIndexer
|
|
47
|
-
from teradataml.common.aed_utils import AedUtils
|
|
48
|
-
from teradataml.options.display import display
|
|
49
|
-
from teradataml.options.configure import configure
|
|
51
|
+
from teradataml.common.messages import Messages
|
|
52
|
+
from teradataml.common.sqlbundle import SQLBundle
|
|
53
|
+
from teradataml.common.utils import UtilFuncs
|
|
50
54
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
55
|
+
from teradataml.dataframe.data_transfer import _DataTransferUtils
|
|
56
|
+
from teradataml.dataframe.dataframe_utils import DataFrameUtils
|
|
57
|
+
from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
|
|
58
|
+
from teradataml.dataframe.indexer import _LocationIndexer
|
|
51
59
|
from teradataml.dataframe.row import _Row
|
|
52
60
|
from teradataml.dataframe.setop import concat
|
|
61
|
+
from teradataml.dataframe.sql import _MetaExpression
|
|
62
|
+
from teradataml.dataframe.sql_functions import case
|
|
63
|
+
from teradataml.dataframe.sql_interfaces import ColumnExpression
|
|
64
|
+
from teradataml.dataframe.window import Window
|
|
53
65
|
from teradataml.dbutils.dbutils import list_td_reserved_keywords
|
|
66
|
+
from teradataml.options.configure import configure
|
|
67
|
+
from teradataml.options.display import display
|
|
54
68
|
from teradataml.plot.plot import _Plot
|
|
55
69
|
from teradataml.scriptmgmt.UserEnv import UserEnv
|
|
56
|
-
from teradataml.
|
|
57
|
-
from teradataml.utils.validators import _Validators
|
|
70
|
+
from teradataml.series.series import Series
|
|
58
71
|
from teradataml.table_operators.table_operator_util import _TableOperatorUtils
|
|
59
|
-
from teradatasqlalchemy.dialect import preparer, dialect as td_dialect
|
|
60
|
-
from teradatasql import OperationalError
|
|
61
|
-
from teradataml.dataframe.window import Window
|
|
62
|
-
from teradataml.dataframe.data_transfer import _DataTransferUtils
|
|
63
|
-
from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
|
|
64
72
|
from teradataml.telemetry_utils.queryband import collect_queryband
|
|
65
|
-
from teradataml.
|
|
66
|
-
from teradataml.utils.
|
|
67
|
-
|
|
73
|
+
from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
|
|
74
|
+
from teradataml.utils.validators import _Validators
|
|
75
|
+
|
|
76
|
+
# Adding imports at the end to avoid circular imports.
|
|
77
|
+
from teradataml.common.aed_utils import AedUtils
|
|
68
78
|
|
|
69
79
|
# TODO use logger when available on master branch
|
|
70
80
|
# logger = teradatapylog.getLogger()
|
|
71
81
|
|
|
82
|
+
|
|
72
83
|
class in_schema:
|
|
73
84
|
"""
|
|
74
85
|
Class takes a schema name, a table name and datalake name attributes
|
|
@@ -149,26 +160,37 @@ class DataFrame():
|
|
|
149
160
|
on tables, views, and queries on Teradata Vantage.
|
|
150
161
|
"""
|
|
151
162
|
|
|
152
|
-
def __init__(self,
|
|
163
|
+
def __init__(self, data=None, index=True, index_label=None, query=None, materialize=False, **kwargs):
|
|
153
164
|
"""
|
|
154
165
|
Constructor for teradataml DataFrame.
|
|
155
166
|
|
|
156
167
|
PARAMETERS:
|
|
157
|
-
|
|
168
|
+
data:
|
|
158
169
|
Optional Argument.
|
|
159
|
-
|
|
160
|
-
|
|
170
|
+
Specifies the input data to create a teradataml DataFrame.
|
|
171
|
+
Notes:
|
|
172
|
+
If a dictionary is provided, it must follow the below requirements:
|
|
173
|
+
* Keys must be strings (column names).
|
|
174
|
+
* Values must be lists of equal length (column data).
|
|
175
|
+
* Nested dictionaries are not supported.
|
|
176
|
+
Types: str OR pandas DataFrame OR in_schema OR numpy array OR list OR dictionary
|
|
161
177
|
|
|
162
178
|
index:
|
|
163
179
|
Optional Argument.
|
|
164
|
-
|
|
180
|
+
If "data" is a string, then the argument specifies whether to use the index column
|
|
181
|
+
for sorting or not.
|
|
182
|
+
If "data" is a pandas DataFrame, then this argument specifies whether to
|
|
183
|
+
save Pandas DataFrame index as a column or not.
|
|
165
184
|
Default Value: True
|
|
166
185
|
Types: bool
|
|
167
186
|
|
|
168
187
|
index_label:
|
|
169
188
|
Optional Argument.
|
|
170
|
-
|
|
171
|
-
|
|
189
|
+
If "data" is a string, then the argument specifies column(s) used for sorting.
|
|
190
|
+
If "data" is a pandas DataFrame, then the default behavior is applied.
|
|
191
|
+
Note:
|
|
192
|
+
* Refer to the "index_label" parameter of copy_to_sql() for details on the default behaviour.
|
|
193
|
+
Types: str OR list of str
|
|
172
194
|
|
|
173
195
|
query:
|
|
174
196
|
Optional Argument.
|
|
@@ -187,29 +209,136 @@ class DataFrame():
|
|
|
187
209
|
Default Value: False (No materialization)
|
|
188
210
|
Types: bool
|
|
189
211
|
|
|
212
|
+
kwargs:
|
|
213
|
+
table_name:
|
|
214
|
+
Optional Argument.
|
|
215
|
+
The table name or view name in Teradata Vantage referenced by this DataFrame.
|
|
216
|
+
Note:
|
|
217
|
+
* If "data" and "table_name" are both specified, then the "table_name" argument is ignored.
|
|
218
|
+
Types: str or in_schema
|
|
219
|
+
|
|
220
|
+
primary_index:
|
|
221
|
+
Optional Argument.
|
|
222
|
+
Specifies which column(s) to use as primary index for the teradataml DataFrame.
|
|
223
|
+
Note:
|
|
224
|
+
* This argument is only applicable when creating a DataFrame from a pandas DataFrame.
|
|
225
|
+
Types: str OR list of str
|
|
226
|
+
|
|
227
|
+
types:
|
|
228
|
+
Optional Argument.
|
|
229
|
+
Specifies required data types for requested columns to be saved in Teradata Vantage.
|
|
230
|
+
Notes:
|
|
231
|
+
* This argument is not applicable when "data" argument is of type str or in_schema.
|
|
232
|
+
* Refer to the "types" parameter of copy_to_sql() for more details.
|
|
233
|
+
Types: dict
|
|
234
|
+
|
|
235
|
+
columns:
|
|
236
|
+
Optional Argument.
|
|
237
|
+
Specifies the names of the columns to be used in the DataFrame.
|
|
238
|
+
Notes:
|
|
239
|
+
* This argument is not applicable when "data" argument is of type str or in_schema.
|
|
240
|
+
* If "data" is a dictionary and this argument is specified, only the specified columns will be
|
|
241
|
+
included in the DataFrame if the dictionary contains those keys. If the dictionary does not
|
|
242
|
+
contain the specified keys, those columns will be added with NaN values.
|
|
243
|
+
Types: str OR list of str
|
|
244
|
+
|
|
245
|
+
persist:
|
|
246
|
+
Optional Argument.
|
|
247
|
+
Specifies whether to persist the DataFrame.
|
|
248
|
+
Note:
|
|
249
|
+
* This argument is only applicable when the "data" argument is of type dict, list or
|
|
250
|
+
pandas DataFrame.
|
|
251
|
+
Default Value: False
|
|
252
|
+
Types: bool
|
|
253
|
+
|
|
190
254
|
EXAMPLES:
|
|
191
|
-
from teradataml.dataframe.dataframe import DataFrame
|
|
255
|
+
>>> from teradataml.dataframe.dataframe import DataFrame
|
|
256
|
+
>>> import pandas as pd
|
|
192
257
|
|
|
193
|
-
# Example 1:
|
|
194
|
-
|
|
195
|
-
# Created DataFrame using table name.
|
|
196
|
-
df = DataFrame("mytab")
|
|
258
|
+
# Example 1: Create a teradataml DataFrame from table name.
|
|
259
|
+
>>> df = DataFrame("mytab")
|
|
197
260
|
|
|
198
|
-
#
|
|
199
|
-
df = DataFrame("myview")
|
|
261
|
+
# Example 2: Create a teradataml DataFrame from view name.
|
|
262
|
+
>>> df = DataFrame("myview")
|
|
200
263
|
|
|
201
|
-
#
|
|
202
|
-
df = DataFrame("myview", False)
|
|
264
|
+
# Example 3: Create a teradataml DataFrame using view name without using index column for sorting.
|
|
265
|
+
>>> df = DataFrame("myview", False)
|
|
203
266
|
|
|
204
|
-
#
|
|
205
|
-
|
|
267
|
+
# Example 4: Create a teradataml DataFrame using table name and consider columns Col1 and Col2
|
|
268
|
+
# while running DataFrame.head() or DataFrame.tail() methods.
|
|
269
|
+
>>> df = DataFrame("mytab", True, ["Col1", "Col2"])
|
|
206
270
|
|
|
271
|
+
# Example 5: Create a teradataml DataFrame from the existing Vantage table "dbcinfo"
|
|
272
|
+
# in the non-default database "dbc" using the in_schema() object.
|
|
273
|
+
>>> from teradataml.dataframe.dataframe import in_schema
|
|
274
|
+
>>> df = DataFrame(in_schema("dbc", "dbcinfo"))
|
|
207
275
|
|
|
208
|
-
# Example
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
276
|
+
# Example 6: Create a teradataml DataFrame from a pandas DataFrame.
|
|
277
|
+
>>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
278
|
+
>>> df = DataFrame(pdf)
|
|
279
|
+
>>> df
|
|
280
|
+
col1 col2 index_label
|
|
281
|
+
0 3 6 2
|
|
282
|
+
1 2 5 1
|
|
283
|
+
2 1 4 0
|
|
284
|
+
|
|
285
|
+
# Example 7: Create a teradataml DataFrame from a pandas DataFrame without index column.
|
|
286
|
+
>>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
287
|
+
>>> df = DataFrame(data=pdf, index=False)
|
|
288
|
+
>>> df
|
|
289
|
+
col1 col2
|
|
290
|
+
0 3 6
|
|
291
|
+
1 2 5
|
|
292
|
+
2 1 4
|
|
293
|
+
|
|
294
|
+
# Example 8: Create a teradataml DataFrame from a pandas DataFrame with
|
|
295
|
+
# index label and primary index as 'id'.
|
|
296
|
+
>>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
297
|
+
>>> df = DataFrame(pdf, index=True, index_label='id', primary_index='id')
|
|
298
|
+
>>> df
|
|
299
|
+
col1 col2
|
|
300
|
+
id
|
|
301
|
+
2 3 6
|
|
302
|
+
1 2 5
|
|
303
|
+
0 1 4
|
|
304
|
+
|
|
305
|
+
# Example 9: Create a teradataml DataFrame from list of lists.
|
|
306
|
+
>>> df = DataFrame([[1, 2], [3, 4]])
|
|
307
|
+
>>> df
|
|
308
|
+
col_0 col_1 index_label
|
|
309
|
+
0 3 4 1
|
|
310
|
+
1 1 2 0
|
|
311
|
+
|
|
312
|
+
# Example 10: Create a teradataml DataFrame from numpy array.
|
|
313
|
+
>>> import numpy as np
|
|
314
|
+
>>> df = DataFrame(np.array([[1, 2], [3, 4]]), index=True, index_label="id")
|
|
315
|
+
>>> df
|
|
316
|
+
col_0 col_1
|
|
317
|
+
id
|
|
318
|
+
1 3 4
|
|
319
|
+
0 1 2
|
|
320
|
+
|
|
321
|
+
# Example 11: Create a teradataml DataFrame from a dictionary.
|
|
322
|
+
>>> df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=True, index_label="id")
|
|
323
|
+
>>> df
|
|
324
|
+
col1 col2
|
|
325
|
+
id
|
|
326
|
+
1 2 4
|
|
327
|
+
0 1 3
|
|
328
|
+
|
|
329
|
+
# Example 12: Create a teradataml DataFrame from list of dictionaries.
|
|
330
|
+
>>> df = DataFrame([{"col1": 1, "col2": 2}, {"col1": 3, "col2": 4}], index=False)
|
|
331
|
+
>>> df
|
|
332
|
+
col1 col2
|
|
333
|
+
0 3 4
|
|
334
|
+
1 1 2
|
|
335
|
+
|
|
336
|
+
# Example 13: Create a teradataml DataFrame from list of tuples.
|
|
337
|
+
>>> df = DataFrame([("Alice", 1), ("Bob", 2)])
|
|
338
|
+
>>> df
|
|
339
|
+
col_0 col_1 index_label
|
|
340
|
+
0 Alice 1 1
|
|
341
|
+
1 Bob 2 0
|
|
213
342
|
|
|
214
343
|
RAISES:
|
|
215
344
|
TeradataMlException - TDMLDF_CREATE_FAIL
|
|
@@ -248,17 +377,39 @@ class DataFrame():
|
|
|
248
377
|
# This attribute stores the root DataFrame columns.
|
|
249
378
|
self._root_columns = None
|
|
250
379
|
|
|
380
|
+
# Internal argument, when this attribute is set to True, the teradataml DataFrame locks
|
|
381
|
+
# the corresponding row(s) in the underlying table(s) while accessing the data.
|
|
382
|
+
_lock_rows = kwargs.get("_lock_rows", False)
|
|
383
|
+
|
|
251
384
|
self._datalake = None
|
|
252
385
|
self._database = None
|
|
253
386
|
self._table = None
|
|
254
387
|
self._otf = False
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
388
|
+
self._df_type = None
|
|
389
|
+
self._valid_time_column = None
|
|
390
|
+
self._transaction_time_column = None
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
table_name = kwargs.get("table_name", None)
|
|
394
|
+
primary_index = kwargs.get("primary_index", None)
|
|
395
|
+
columns = kwargs.get("columns", None)
|
|
396
|
+
types = kwargs.get("types", None)
|
|
397
|
+
persist = kwargs.get("persist", False)
|
|
398
|
+
|
|
399
|
+
# Check if the data is an instance of in_schema or if the data is None
|
|
400
|
+
# and table_name is an instance of in_schema, then assign the table_name,
|
|
401
|
+
# datalake_name and schema_name to the DataFrame object.
|
|
402
|
+
schema_obj = data if isinstance(data, in_schema) else (
|
|
403
|
+
table_name if data is None and isinstance(table_name, in_schema) else None)
|
|
404
|
+
|
|
405
|
+
if schema_obj:
|
|
406
|
+
self._table = schema_obj.table_name
|
|
407
|
+
self._datalake = schema_obj.datalake_name
|
|
408
|
+
self._database = schema_obj.schema_name
|
|
260
409
|
self._otf = True if self._datalake else False
|
|
261
410
|
|
|
411
|
+
# Convert schema objects to strings.
|
|
412
|
+
data = str(data) if isinstance(data, in_schema) else data
|
|
262
413
|
table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
|
|
263
414
|
|
|
264
415
|
# Below matrix is list of list, where in each row contains following elements:
|
|
@@ -277,18 +428,51 @@ class DataFrame():
|
|
|
277
428
|
# 6. element6 --> A list of permitted values, an argument can accept.
|
|
278
429
|
# If not specified, it is as good as passing None. If a list is passed, validation will be
|
|
279
430
|
# performed for permitted values.
|
|
431
|
+
|
|
280
432
|
awu_matrix = []
|
|
281
|
-
|
|
433
|
+
dtypes = (list, tuple, dict)
|
|
434
|
+
awu_matrix.append(["data", data, True, (str, pd.DataFrame, np.ndarray, dict, _ListOf(dtypes)), True])
|
|
282
435
|
awu_matrix.append(["index", index, True, (bool)])
|
|
283
436
|
awu_matrix.append(["index_label", index_label, True, (str, list)])
|
|
284
437
|
awu_matrix.append(["query", query, True, (str), True])
|
|
285
438
|
awu_matrix.append(["materialize", materialize, True, (bool)])
|
|
439
|
+
awu_matrix.append(["table_name", table_name, True, (str), True])
|
|
440
|
+
awu_matrix.append(["primary_index", primary_index, True, (str, list)])
|
|
441
|
+
awu_matrix.append(["types", types, True, (dict)])
|
|
442
|
+
awu_matrix.append(["columns", columns, True, (str, list), True])
|
|
286
443
|
|
|
287
444
|
# Validate argument types
|
|
288
445
|
_Validators._validate_function_arguments(awu_matrix)
|
|
289
446
|
|
|
447
|
+
# Convert columns to list if it is a string.
|
|
448
|
+
if isinstance(columns, str):
|
|
449
|
+
columns = [columns]
|
|
450
|
+
|
|
290
451
|
try:
|
|
291
|
-
if table_name is not None:
|
|
452
|
+
if table_name is not None or data is not None:
|
|
453
|
+
|
|
454
|
+
# If data is list or numpy array or dictionary, then convert it to a pandas DataFrame.
|
|
455
|
+
if isinstance(data, (list, np.ndarray, dict)):
|
|
456
|
+
data = pd.DataFrame(data, columns=columns)
|
|
457
|
+
# If the data is a pandas DataFrame, then store the data in a temporary table in Vantage.
|
|
458
|
+
if isinstance(data, pd.DataFrame):
|
|
459
|
+
# Create a copy of the pandas DataFrame to avoid modifying the original,
|
|
460
|
+
# because column names will be changed if they are integers.
|
|
461
|
+
pd_data = data.copy()
|
|
462
|
+
# If the columns are not of type string, then convert them to string.
|
|
463
|
+
pd_data.columns = [f"col_{i}" if isinstance(i, int) else i for i in pd_data.columns]
|
|
464
|
+
|
|
465
|
+
# Set the table_name to the name of the table created in the database.
|
|
466
|
+
table_name = UtilFuncs._generate_temp_table_name(prefix="from_pandas",
|
|
467
|
+
table_type=TeradataConstants.TERADATA_TABLE,
|
|
468
|
+
gc_on_quit=not(persist))
|
|
469
|
+
|
|
470
|
+
copy_to_sql(pd_data, table_name, index=index, index_label=index_label, primary_index=primary_index,
|
|
471
|
+
types=types)
|
|
472
|
+
# If the data is a string, then set the table_name to the data.
|
|
473
|
+
elif isinstance(data, str):
|
|
474
|
+
table_name = data
|
|
475
|
+
|
|
292
476
|
self._table_name = UtilFuncs._quote_table_names(table_name)
|
|
293
477
|
self._source_type = SourceType.TABLE.value
|
|
294
478
|
self._nodeid = self._aed_utils._aed_table(self._table_name)
|
|
@@ -329,6 +513,8 @@ class DataFrame():
|
|
|
329
513
|
|
|
330
514
|
if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
|
|
331
515
|
__execute_params = (self._table_name, self._query, True)
|
|
516
|
+
elif configure.temp_object_type == TeradataConstants.TERADATA_VIEW:
|
|
517
|
+
__execute_params = (self._table_name, self._query, _lock_rows)
|
|
332
518
|
|
|
333
519
|
try:
|
|
334
520
|
__execute(*__execute_params)
|
|
@@ -342,6 +528,12 @@ class DataFrame():
|
|
|
342
528
|
elif "[Error 3706] Syntax error" in str(oe):
|
|
343
529
|
raise ValueError(Messages.get_message(
|
|
344
530
|
MessageCodes.FROM_QUERY_SELECT_SUPPORTED).format("Check the syntax."))
|
|
531
|
+
elif "[Error 7825]" in str(oe):
|
|
532
|
+
# The UDF/XSP/UDM routine has thrown an SQLException
|
|
533
|
+
# with an SQL state in the range of 38001-38999 which
|
|
534
|
+
# is not a syntax error. Hence not a ValueError wrt query string.
|
|
535
|
+
# Expected when OTF snapshot related query is executed.
|
|
536
|
+
raise
|
|
345
537
|
raise ValueError(Messages.get_message(
|
|
346
538
|
MessageCodes.FROM_QUERY_SELECT_SUPPORTED))
|
|
347
539
|
|
|
@@ -351,6 +543,7 @@ class DataFrame():
|
|
|
351
543
|
raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
|
|
352
544
|
MessageCodes.TDMLDF_CREATE_FAIL)
|
|
353
545
|
|
|
546
|
+
# _get_metaexpr() can be only used if self._table_name is set.
|
|
354
547
|
if table_name or query:
|
|
355
548
|
self._metaexpr = self._get_metaexpr()
|
|
356
549
|
self._get_metadata_from_metaexpr(self._metaexpr)
|
|
@@ -503,7 +696,7 @@ class DataFrame():
|
|
|
503
696
|
Types: str
|
|
504
697
|
|
|
505
698
|
EXAMPLES:
|
|
506
|
-
>>> from teradataml
|
|
699
|
+
>>> from teradataml import DataFrame
|
|
507
700
|
|
|
508
701
|
# Example 1: The following example creates a DataFrame from a table or
|
|
509
702
|
a view.
|
|
@@ -543,13 +736,13 @@ class DataFrame():
|
|
|
543
736
|
|
|
544
737
|
"""
|
|
545
738
|
if schema_name:
|
|
546
|
-
return cls(in_schema(schema_name, table_name, datalake_name)
|
|
547
|
-
|
|
548
|
-
return cls(table_name, index, index_label)
|
|
739
|
+
return cls(table_name=in_schema(schema_name, table_name, datalake_name),
|
|
740
|
+
index=index, index_label=index_label)
|
|
741
|
+
return cls(table_name=table_name, index=index, index_label=index_label)
|
|
549
742
|
|
|
550
743
|
@classmethod
|
|
551
744
|
@collect_queryband(queryband="DF_fromQuery")
|
|
552
|
-
def from_query(cls, query, index=True, index_label=None, materialize=False):
|
|
745
|
+
def from_query(cls, query, index=True, index_label=None, materialize=False, **kwargs):
|
|
553
746
|
"""
|
|
554
747
|
Class method for creating a DataFrame from a query.
|
|
555
748
|
|
|
@@ -647,6 +840,7 @@ class DataFrame():
|
|
|
647
840
|
df._nodeid = nodeid
|
|
648
841
|
df._source_type = SourceType.TABLE.value
|
|
649
842
|
|
|
843
|
+
|
|
650
844
|
if not reuse_metaexpr:
|
|
651
845
|
# Create new _MetaExpression object using reference metaExpression
|
|
652
846
|
# for newly created DataFrame.
|
|
@@ -692,6 +886,322 @@ class DataFrame():
|
|
|
692
886
|
df.__setattr__(arg, arg_value)
|
|
693
887
|
return df
|
|
694
888
|
|
|
889
|
+
@classmethod
|
|
890
|
+
@collect_queryband(queryband="DF_fromPandas")
|
|
891
|
+
def from_pandas(cls, pandas_df, index=True, index_label=None, primary_index=None, persist=False):
|
|
892
|
+
"""
|
|
893
|
+
DESCRIPTION:
|
|
894
|
+
Creates a teradataml DataFrame from a pandas DataFrame.
|
|
895
|
+
|
|
896
|
+
PARAMETERS:
|
|
897
|
+
pandas_df:
|
|
898
|
+
Required Argument.
|
|
899
|
+
Specifies the pandas DataFrame to be converted to teradataml DataFrame.
|
|
900
|
+
Types: pandas DataFrame
|
|
901
|
+
|
|
902
|
+
index:
|
|
903
|
+
Optional Argument.
|
|
904
|
+
Specifies whether to save Pandas DataFrame index as a column or not.
|
|
905
|
+
Default Value: True
|
|
906
|
+
Types: bool
|
|
907
|
+
|
|
908
|
+
index_label:
|
|
909
|
+
Optional Argument.
|
|
910
|
+
Specifies the column label(s) for Pandas DataFrame index column(s).
|
|
911
|
+
Note:
|
|
912
|
+
* Refer to the "index_label" parameter of copy_to_sql() for more details.
|
|
913
|
+
Default Value: None
|
|
914
|
+
Types: str OR list of str
|
|
915
|
+
|
|
916
|
+
primary_index:
|
|
917
|
+
Optional Argument.
|
|
918
|
+
Specifies which column(s) to use as primary index for the teradataml DataFrame.
|
|
919
|
+
Types: str OR list of str
|
|
920
|
+
|
|
921
|
+
persist:
|
|
922
|
+
Optional Argument.
|
|
923
|
+
Specifies whether to persist the DataFrame.
|
|
924
|
+
Default Value: False
|
|
925
|
+
Types: bool
|
|
926
|
+
|
|
927
|
+
RETURNS:
|
|
928
|
+
teradataml DataFrame
|
|
929
|
+
|
|
930
|
+
RAISES:
|
|
931
|
+
TeradataMlException
|
|
932
|
+
|
|
933
|
+
EXAMPLES:
|
|
934
|
+
>>> import pandas as pd
|
|
935
|
+
>>> from teradataml import DataFrame
|
|
936
|
+
>>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
937
|
+
>>> pdf1 = pd.DataFrame([[1, 2], [3, 4]])
|
|
938
|
+
|
|
939
|
+
# Example 1: Create a teradataml DataFrame from a pandas DataFrame.
|
|
940
|
+
>>> df = DataFrame.from_pandas(pdf)
|
|
941
|
+
>>> df
|
|
942
|
+
col1 col2 index_label
|
|
943
|
+
0 3 6 2
|
|
944
|
+
1 2 5 1
|
|
945
|
+
2 1 4 0
|
|
946
|
+
|
|
947
|
+
# Example 2: Create a teradataml DataFrame from a pandas DataFrame
|
|
948
|
+
# and do not save the index as a column.
|
|
949
|
+
>>> df = DataFrame.from_pandas(pdf, index=False)
|
|
950
|
+
>>> df
|
|
951
|
+
col1 col2
|
|
952
|
+
0 3 6
|
|
953
|
+
1 2 5
|
|
954
|
+
2 1 4
|
|
955
|
+
|
|
956
|
+
# Example 3: Create a teradataml DataFrame from a pandas DataFrame
|
|
957
|
+
# with index label as 'id' and set it as primary index.
|
|
958
|
+
>>> df = DataFrame.from_pandas(pdf, index=True, index_label='id', primary_index='id')
|
|
959
|
+
>>> df
|
|
960
|
+
col1 col2
|
|
961
|
+
id
|
|
962
|
+
2 3 6
|
|
963
|
+
1 2 5
|
|
964
|
+
0 1 4
|
|
965
|
+
|
|
966
|
+
# Example 4: Create a teradataml DataFrame from a pandas DataFrame where
|
|
967
|
+
# columns are not explicitly defined in the pandas DataFrame.
|
|
968
|
+
>>> df = DataFrame.from_pandas(pdf1)
|
|
969
|
+
>>> df
|
|
970
|
+
col_0 col_1 index_label
|
|
971
|
+
0 3 4 1
|
|
972
|
+
1 1 2 0
|
|
973
|
+
"""
|
|
974
|
+
# Validate 'pandas_df' argument, other arguments, will be validated as part of DataFrame().
|
|
975
|
+
arg_type_matrix = []
|
|
976
|
+
arg_type_matrix.append(["pandas_df", pandas_df, False, (pd.DataFrame,), True])
|
|
977
|
+
arg_type_matrix.append(["persist", persist, True, (bool), True])
|
|
978
|
+
|
|
979
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
980
|
+
|
|
981
|
+
return cls(pandas_df, index, index_label, primary_index=primary_index, persist=persist)
|
|
982
|
+
|
|
983
|
+
@classmethod
|
|
984
|
+
@collect_queryband(queryband="DF_fromDict")
|
|
985
|
+
def from_dict(cls, data, columns=None, persist=False):
|
|
986
|
+
"""
|
|
987
|
+
DESCRIPTION:
|
|
988
|
+
Creates a DataFrame from a dictionary containing values as lists or numpy arrays.
|
|
989
|
+
|
|
990
|
+
PARAMETERS:
|
|
991
|
+
data:
|
|
992
|
+
Required Argument.
|
|
993
|
+
Specifies the Python dictionary to create a teradataml DataFrame.
|
|
994
|
+
Notes:
|
|
995
|
+
* Keys of the dictionary are used as column names.
|
|
996
|
+
* Values of the dictionary should be lists or numpy arrays.
|
|
997
|
+
* Nested dictionaries are not supported.
|
|
998
|
+
Types: dict
|
|
999
|
+
|
|
1000
|
+
columns:
|
|
1001
|
+
Optional Argument.
|
|
1002
|
+
Specifies the column names for the DataFrame.
|
|
1003
|
+
Types: str OR list of str
|
|
1004
|
+
|
|
1005
|
+
persist:
|
|
1006
|
+
Optional Argument.
|
|
1007
|
+
Specifies whether to persist the DataFrame.
|
|
1008
|
+
Default Value: False
|
|
1009
|
+
Types: bool
|
|
1010
|
+
|
|
1011
|
+
RETURNS:
|
|
1012
|
+
teradataml DataFrame
|
|
1013
|
+
|
|
1014
|
+
RAISES:
|
|
1015
|
+
TeradataMlException
|
|
1016
|
+
|
|
1017
|
+
EXAMPLES:
|
|
1018
|
+
>>> from teradataml import DataFrame
|
|
1019
|
+
>>> data_dict = {"name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 28]}
|
|
1020
|
+
|
|
1021
|
+
# Example 1: Create a teradataml DataFrame from a dictionary where
|
|
1022
|
+
# keys are column names and values are lists of column data.
|
|
1023
|
+
>>> df = DataFrame.from_dict(data_dict)
|
|
1024
|
+
>>> df
|
|
1025
|
+
name age
|
|
1026
|
+
0 Charlie 28
|
|
1027
|
+
1 Bob 30
|
|
1028
|
+
2 Alice 25
|
|
1029
|
+
|
|
1030
|
+
# Example 2: Create a teradataml DataFrame from a dictionary where
|
|
1031
|
+
# keys are column names and values are numpy arrays.
|
|
1032
|
+
>>> import numpy as np
|
|
1033
|
+
>>> data_dict = {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}
|
|
1034
|
+
>>> df = DataFrame.from_dict(data_dict)
|
|
1035
|
+
>>> df
|
|
1036
|
+
col1 col2
|
|
1037
|
+
0 3 6
|
|
1038
|
+
1 2 5
|
|
1039
|
+
2 1 4
|
|
1040
|
+
"""
|
|
1041
|
+
arg_type_matrix = []
|
|
1042
|
+
arg_type_matrix.append(["data", data, False, (dict), True])
|
|
1043
|
+
arg_type_matrix.append(["columns", columns, True, (str, list), True])
|
|
1044
|
+
arg_type_matrix.append(["persist", persist, True, (bool), True])
|
|
1045
|
+
|
|
1046
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
1047
|
+
|
|
1048
|
+
return cls(data, columns=columns, index=False, persist=persist)
|
|
1049
|
+
|
|
1050
|
+
@classmethod
|
|
1051
|
+
@collect_queryband(queryband="DF_fromRecords")
|
|
1052
|
+
def from_records(cls, data, columns=None, **kwargs):
|
|
1053
|
+
"""
|
|
1054
|
+
DESCRIPTION:
|
|
1055
|
+
Create a DataFrame from a list of lists/tuples/dictionaries/numpy arrays.
|
|
1056
|
+
|
|
1057
|
+
PARAMETERS:
|
|
1058
|
+
data:
|
|
1059
|
+
Required Argument.
|
|
1060
|
+
Specifies the iterator of data or the list of lists/tuples/dictionaries/numpy arrays to
|
|
1061
|
+
be converted to teradataml DataFrame.
|
|
1062
|
+
Note:
|
|
1063
|
+
* Nested lists or tuples or dictionaries are not supported.
|
|
1064
|
+
Types: Iterator, list
|
|
1065
|
+
|
|
1066
|
+
columns:
|
|
1067
|
+
Optional Argument.
|
|
1068
|
+
Specifies the column names for the DataFrame.
|
|
1069
|
+
Note:
|
|
1070
|
+
* If the data is a list of lists/tuples/numpy arrays and this argument
|
|
1071
|
+
is not specified, column names will be auto-generated as 'col_0', 'col_1', etc.
|
|
1072
|
+
Types: str OR list of str
|
|
1073
|
+
|
|
1074
|
+
kwargs:
|
|
1075
|
+
exclude:
|
|
1076
|
+
Optional Argument.
|
|
1077
|
+
Specifies the columns to be excluded from the DataFrame.
|
|
1078
|
+
Types: list OR tuple
|
|
1079
|
+
|
|
1080
|
+
coerce_float:
|
|
1081
|
+
Optional Argument.
|
|
1082
|
+
Specifies whether to convert values of non-string, non-numeric objects (like decimal.Decimal)
|
|
1083
|
+
to floating point, useful for SQL result sets.
|
|
1084
|
+
Default Value: True
|
|
1085
|
+
Types: bool
|
|
1086
|
+
|
|
1087
|
+
nrows:
|
|
1088
|
+
Optional Argument.
|
|
1089
|
+
Specifies the number of rows to be read from the data if the data is iterator.
|
|
1090
|
+
Types: int
|
|
1091
|
+
|
|
1092
|
+
persist:
|
|
1093
|
+
Optional Argument.
|
|
1094
|
+
Specifies whether to persist the DataFrame.
|
|
1095
|
+
Default Value: False
|
|
1096
|
+
Types: bool
|
|
1097
|
+
|
|
1098
|
+
RETURNS:
|
|
1099
|
+
teradataml DataFrame
|
|
1100
|
+
|
|
1101
|
+
RAISES:
|
|
1102
|
+
TeradataMlException
|
|
1103
|
+
|
|
1104
|
+
EXAMPLES:
|
|
1105
|
+
>>> from teradataml import DataFrame
|
|
1106
|
+
|
|
1107
|
+
# Example 1: Create a teradataml DataFrame from a list of lists.
|
|
1108
|
+
>>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]], columns=['name', 'age'])
|
|
1109
|
+
>>> df
|
|
1110
|
+
name age
|
|
1111
|
+
0 Bob 2
|
|
1112
|
+
1 Alice 1
|
|
1113
|
+
|
|
1114
|
+
# Example 2: Create a teradataml DataFrame from a list of tuples.
|
|
1115
|
+
>>> df = DataFrame.from_records([('Alice', 1), ('Bob', 3)], columns=['name', 'age'])
|
|
1116
|
+
>>> df
|
|
1117
|
+
name age
|
|
1118
|
+
0 Bob 3
|
|
1119
|
+
1 Alice 1
|
|
1120
|
+
|
|
1121
|
+
# Example 3: Create a teradataml DataFrame from a list of dictionaries.
|
|
1122
|
+
>>> df = DataFrame.from_records([{'name': 'Alice', 'age': 4}, {'name': 'Bob', 'age': 2}])
|
|
1123
|
+
>>> df
|
|
1124
|
+
name age
|
|
1125
|
+
0 Bob 2
|
|
1126
|
+
1 Alice 4
|
|
1127
|
+
|
|
1128
|
+
# Example 4: Create a teradataml DataFrame from a list where columns
|
|
1129
|
+
# are not explicitly defined.
|
|
1130
|
+
>>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]])
|
|
1131
|
+
>>> df
|
|
1132
|
+
col_0 col_1
|
|
1133
|
+
0 Bob 2
|
|
1134
|
+
1 Alice 1
|
|
1135
|
+
|
|
1136
|
+
# Example 5: Create a teradataml DataFrame from a list by excluding 'grade' column.
|
|
1137
|
+
>>> df = DataFrame.from_records([['Alice', 1, 'A'], ['Bob', 2, 'B']],
|
|
1138
|
+
... columns=['name', 'age', 'grade'],
|
|
1139
|
+
... exclude=['grade'])
|
|
1140
|
+
>>> df
|
|
1141
|
+
name age
|
|
1142
|
+
0 Bob 2
|
|
1143
|
+
1 Alice 1
|
|
1144
|
+
|
|
1145
|
+
# Example 6: Create a teradataml DataFrame from a list of lists
|
|
1146
|
+
# with "coerce_float" set to False.
|
|
1147
|
+
>>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
|
|
1148
|
+
... columns=['col1', 'col2'], coerce_float=False)
|
|
1149
|
+
>>> df
|
|
1150
|
+
col1 col2
|
|
1151
|
+
0 3 4.0
|
|
1152
|
+
1 1 2.5
|
|
1153
|
+
>>> df.tdtypes
|
|
1154
|
+
col1 BIGINT()
|
|
1155
|
+
col2 VARCHAR(length=1024, charset='UNICODE')
|
|
1156
|
+
|
|
1157
|
+
# Example 7: Create a teradataml DataFrame from a list of lists
|
|
1158
|
+
# with "coerce_float" set to True.
|
|
1159
|
+
>>> from decimal import Decimal
|
|
1160
|
+
>>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
|
|
1161
|
+
... columns=['col1', 'col2'], coerce_float=True)
|
|
1162
|
+
>>> df
|
|
1163
|
+
col1 col2
|
|
1164
|
+
0 3 4.0
|
|
1165
|
+
1 1 2.5
|
|
1166
|
+
>>> df.tdtypes
|
|
1167
|
+
col1 BIGINT()
|
|
1168
|
+
col2 FLOAT()
|
|
1169
|
+
|
|
1170
|
+
# Example 8: Create a teradataml DataFrame from an iterator with "nrows" set to 2.
|
|
1171
|
+
>>> def data_gen():
|
|
1172
|
+
... yield ['Alice', 1]
|
|
1173
|
+
... yield ['Bob', 2]
|
|
1174
|
+
... yield ['Charlie', 3]
|
|
1175
|
+
>>> df = DataFrame.from_records(data_gen(), columns=['name', 'age'], nrows=2)
|
|
1176
|
+
>>> df
|
|
1177
|
+
name age
|
|
1178
|
+
0 Bob 2
|
|
1179
|
+
1 Alice 1
|
|
1180
|
+
"""
|
|
1181
|
+
|
|
1182
|
+
exclude = kwargs.get("exclude", None)
|
|
1183
|
+
coerce_float = kwargs.get("coerce_float", True)
|
|
1184
|
+
nrows = kwargs.get("nrows", None)
|
|
1185
|
+
persist = kwargs.get("persist", False)
|
|
1186
|
+
|
|
1187
|
+
arg_type_matrix = []
|
|
1188
|
+
dtypes = (list, tuple, dict)
|
|
1189
|
+
arg_type_matrix.append(["data", data, False, (Iterator, _ListOf(dtypes)), True])
|
|
1190
|
+
arg_type_matrix.append(["columns", columns, True, (str, _ListOf(str)), True])
|
|
1191
|
+
arg_type_matrix.append(["exclude", exclude, True, (_ListOf(str),), True])
|
|
1192
|
+
arg_type_matrix.append(["coerce_float", coerce_float, True, (bool, ), True])
|
|
1193
|
+
arg_type_matrix.append(["nrows", nrows, True, (int,), True])
|
|
1194
|
+
arg_type_matrix.append(["persist", persist, True, (bool,), True])
|
|
1195
|
+
|
|
1196
|
+
_Validators._validate_function_arguments(arg_type_matrix)
|
|
1197
|
+
|
|
1198
|
+
if isinstance(columns, str):
|
|
1199
|
+
columns = [columns]
|
|
1200
|
+
|
|
1201
|
+
df = pd.DataFrame.from_records(data, columns=columns, exclude=exclude,
|
|
1202
|
+
coerce_float=coerce_float, nrows=nrows)
|
|
1203
|
+
return cls(df, index=False, persist=persist)
|
|
1204
|
+
|
|
695
1205
|
def create_temp_view(self, name):
|
|
696
1206
|
"""
|
|
697
1207
|
DESCRIPTION:
|
|
@@ -1084,6 +1594,57 @@ class DataFrame():
|
|
|
1084
1594
|
self._is_art = res[0][0] == 1
|
|
1085
1595
|
return self._is_art
|
|
1086
1596
|
|
|
1597
|
+
|
|
1598
|
+
def _process_columns_metadata(self):
|
|
1599
|
+
"""
|
|
1600
|
+
DESCRIPTION:
|
|
1601
|
+
Processes the metadata of columns to determine their time dimension properties
|
|
1602
|
+
and to check whether database object is a view, volatile table, or ART table.
|
|
1603
|
+
|
|
1604
|
+
PARAMETERS:
|
|
1605
|
+
None
|
|
1606
|
+
|
|
1607
|
+
RAISES:
|
|
1608
|
+
None
|
|
1609
|
+
|
|
1610
|
+
RETURNS:
|
|
1611
|
+
Tuple containing five boolean values:
|
|
1612
|
+
- is_view: True if the database object is a view, False otherwise.
|
|
1613
|
+
- is_volatile: True if the database object is a volatile table, False otherwise.
|
|
1614
|
+
- is_art_table: True if the database object is an ART table, False otherwise.
|
|
1615
|
+
- has_valid_time: True if any column has a valid time dimension, False otherwise.
|
|
1616
|
+
- has_transaction_time: True if any column has a transaction time dimension, False otherwise.
|
|
1617
|
+
EXAMPLES:
|
|
1618
|
+
>>> load_example_data("teradataml", "Employee")
|
|
1619
|
+
>>> df = DataFrame.from_table("Employee")
|
|
1620
|
+
>>> is_view, is_volatile, is_art_table, valid_time, transaction_time = (
|
|
1621
|
+
df._process_columns_metadata()
|
|
1622
|
+
)
|
|
1623
|
+
>>> is_view, is_volatile, is_art_table, valid_time, transaction_time
|
|
1624
|
+
(False, False, False, True, True)
|
|
1625
|
+
|
|
1626
|
+
"""
|
|
1627
|
+
|
|
1628
|
+
is_view = is_volatile = is_art_table = False
|
|
1629
|
+
|
|
1630
|
+
for col in self._metaexpr.c:
|
|
1631
|
+
metadata = col.expression.info
|
|
1632
|
+
time_dimension = metadata.get('time_dimension')
|
|
1633
|
+
is_view = metadata.get('is_view', is_view)
|
|
1634
|
+
is_volatile = metadata.get('is_volatile', is_volatile)
|
|
1635
|
+
is_art_table = metadata.get('is_art_table', is_art_table)
|
|
1636
|
+
|
|
1637
|
+
if time_dimension == "V":
|
|
1638
|
+
self._valid_time_column = col
|
|
1639
|
+
|
|
1640
|
+
if time_dimension == "T":
|
|
1641
|
+
self._transaction_time_column = col
|
|
1642
|
+
|
|
1643
|
+
has_valid_time = self._valid_time_column is not None
|
|
1644
|
+
has_transaction_time = self._transaction_time_column is not None
|
|
1645
|
+
|
|
1646
|
+
return is_view, is_volatile, is_art_table, has_valid_time, has_transaction_time
|
|
1647
|
+
|
|
1087
1648
|
def _get_metadata_from_metaexpr(self, metaexpr):
|
|
1088
1649
|
"""
|
|
1089
1650
|
Private method for setting _metaexpr and retrieving column names and types.
|
|
@@ -1136,6 +1697,7 @@ class DataFrame():
|
|
|
1136
1697
|
meta = sqlalchemy.MetaData()
|
|
1137
1698
|
db_schema = UtilFuncs._extract_db_name(self._table_name)
|
|
1138
1699
|
db_table_name = UtilFuncs._extract_table_name(self._table_name)
|
|
1700
|
+
|
|
1139
1701
|
if not self._datalake:
|
|
1140
1702
|
t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
|
|
1141
1703
|
return _MetaExpression(t)
|
|
@@ -1149,12 +1711,22 @@ class DataFrame():
|
|
|
1149
1711
|
datalake=self._datalake)
|
|
1150
1712
|
|
|
1151
1713
|
# Extract column names and corresponding teradatasqlalchemy types.
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1714
|
+
try:
|
|
1715
|
+
# For latest OTF help table query results.
|
|
1716
|
+
col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
|
|
1717
|
+
self._table,
|
|
1718
|
+
self._datalake,
|
|
1719
|
+
use_dialect=True)
|
|
1720
|
+
except NoSuchColumnError:
|
|
1721
|
+
# For older OTF help table query result.
|
|
1722
|
+
col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
|
|
1723
|
+
self._table,
|
|
1724
|
+
self._datalake)
|
|
1725
|
+
|
|
1726
|
+
# Create a SQLAlchemy table object representing datalake table.
|
|
1155
1727
|
t = sqlalchemy.Table(self._table, meta, schema=self._database,
|
|
1156
1728
|
*(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
|
|
1157
|
-
return _MetaExpression(t)
|
|
1729
|
+
return _MetaExpression(t, datalake=self._datalake)
|
|
1158
1730
|
|
|
1159
1731
|
def __getattr__(self, name):
|
|
1160
1732
|
"""
|
|
@@ -1693,8 +2265,150 @@ class DataFrame():
|
|
|
1693
2265
|
td_metadata = [(column.name, repr(column.type)) for column in self._metaexpr.c]
|
|
1694
2266
|
return MetaData(td_metadata)
|
|
1695
2267
|
|
|
1696
|
-
@
|
|
1697
|
-
def
|
|
2268
|
+
@property
|
|
2269
|
+
def df_type(self):
|
|
2270
|
+
"""
|
|
2271
|
+
DESCRIPTION:
|
|
2272
|
+
Returns the type of the DataFrame based on the underlying database object.
|
|
2273
|
+
Possible teradataml DataFrame types are:
|
|
2274
|
+
- VALID_TIME_VIEW: DataFrame is created on Valid-Time dimension view.
|
|
2275
|
+
- TRANSACTION_TIME_VIEW: DataFrame is created on Transaction-Time dimension view.
|
|
2276
|
+
- BI_TEMPORAL_VIEW: DataFrame is created on Bi-temporal view.
|
|
2277
|
+
- VALID_TIME: DataFrame is created on Valid-Time dimension table.
|
|
2278
|
+
- TRANSACTION_TIME: DataFrame is created on Transaction-Time dimension table.
|
|
2279
|
+
- BI_TEMPORAL: DataFrame is created on Bi-temporal dimension table.
|
|
2280
|
+
- VIEW: DataFrame is created on a view.
|
|
2281
|
+
- TABLE: DataFrame is created on a table.
|
|
2282
|
+
- OTF: DataFrame is created on an OTF table.
|
|
2283
|
+
- ART: DataFrame is created on an ART table.
|
|
2284
|
+
- VOLATILE_TABLE: DataFrame is created on a volatile table.
|
|
2285
|
+
- BI_TEMPORAL_VOLATILE_TABLE: DataFrame is created on a Bi-temporal dimension volatile table.
|
|
2286
|
+
- VALID_TIME_VOLATILE_TABLE: DataFrame is created on a Valid-Time dimension volatile table.
|
|
2287
|
+
- TRANSACTION_TIME_VOLATILE_TABLE: DataFrame is created on a Transaction-Time dimension volatile table.
|
|
2288
|
+
|
|
2289
|
+
RETURNS:
|
|
2290
|
+
str
|
|
2291
|
+
|
|
2292
|
+
RAISES:
|
|
2293
|
+
None
|
|
2294
|
+
|
|
2295
|
+
EXAMPLES:
|
|
2296
|
+
# Load the data to run the example.
|
|
2297
|
+
>>> load_example_data("teradataml", "Employee_roles") # load valid time data.
|
|
2298
|
+
>>> load_example_data("teradataml", "Employee_Address") # load transaction time data.
|
|
2299
|
+
>>> load_example_data("teradataml", "Employee") # load bitemporal data.
|
|
2300
|
+
>>> load_example_data("uaf", ["ocean_buoys2"]) # load data to create art table.
|
|
2301
|
+
>>> load_example_data('dataframe', ['admissions_train']) # load data to create a regular table.
|
|
2302
|
+
|
|
2303
|
+
# Example 1: DataFrame created on a Valid-Time dimension table.
|
|
2304
|
+
>>> df = DataFrame.from_table('Employee_roles')
|
|
2305
|
+
>>> df.df_type
|
|
2306
|
+
'VALID_TIME'
|
|
2307
|
+
|
|
2308
|
+
# Example 2: DataFrame created on a Transaction-Time dimension table.
|
|
2309
|
+
>>> df = DataFrame.from_table('Employee_Address')
|
|
2310
|
+
>>> df.df_type
|
|
2311
|
+
'TRANSACTION_TIME'
|
|
2312
|
+
|
|
2313
|
+
# Example 3: DataFrame created on a Bi-temporal dimension table.
|
|
2314
|
+
>>> df = DataFrame.from_table('Employee')
|
|
2315
|
+
>>> df.df_type
|
|
2316
|
+
'BI_TEMPORAL'
|
|
2317
|
+
|
|
2318
|
+
# Example 4: DataFrame created on a ART table.
|
|
2319
|
+
>>> data = DataFrame.from_table('ocean_buoys2')
|
|
2320
|
+
>>> from teradataml import TDSeries,SInfo
|
|
2321
|
+
>>> data_series_df = TDSeries(data=data,
|
|
2322
|
+
... id=["ocean_name","buoyid"],
|
|
2323
|
+
... row_index="TD_TIMECODE",
|
|
2324
|
+
... row_index_style="TIMECODE",
|
|
2325
|
+
... payload_field="jsoncol.Measure.salinity",
|
|
2326
|
+
... payload_content="REAL")
|
|
2327
|
+
>>> uaf_out = SInfo(data=data_series_df, output_table_name='TSINFO_RESULTS')
|
|
2328
|
+
>>> df = DataFrame.from_table('TSINFO_RESULTS')
|
|
2329
|
+
>>> df.df_type
|
|
2330
|
+
'ART'
|
|
2331
|
+
|
|
2332
|
+
# Example 5: DataFrame created on a regular table.
|
|
2333
|
+
>>> df = DataFrame.from_table('admissions_train')
|
|
2334
|
+
>>> df.df_type
|
|
2335
|
+
'REGULAR_TABLE'
|
|
2336
|
+
|
|
2337
|
+
# Example 6: DataFrame created on a volatile table.
|
|
2338
|
+
>>> df = DataFrame.from_table('admissions_train')
|
|
2339
|
+
>>> df.to_sql(table_name='admissions_train_volatile', temporary=True)
|
|
2340
|
+
>>> df = DataFrame.from_table('admissions_train_volatile')
|
|
2341
|
+
>>> df.df_type
|
|
2342
|
+
'VOLATILE_TABLE'
|
|
2343
|
+
|
|
2344
|
+
# Example 7: DataFrame created on a Bi-temporal dimension view.
|
|
2345
|
+
>>> execute_sql('create view Employee_view AS SEQUENCED VALIDTIME AND SEQUENCED TRANSACTIONTIME select * from Employee')
|
|
2346
|
+
>>> df = DataFrame.from_table('Employee_view')
|
|
2347
|
+
>>> df.df_type
|
|
2348
|
+
'BI_TEMPORAL_VIEW'
|
|
2349
|
+
|
|
2350
|
+
"""
|
|
2351
|
+
|
|
2352
|
+
if self._df_type is not None:
|
|
2353
|
+
return self._df_type
|
|
2354
|
+
|
|
2355
|
+
is_view, is_volatile, is_art_table, valid_time, transaction_time = (
|
|
2356
|
+
self._process_columns_metadata()
|
|
2357
|
+
)
|
|
2358
|
+
|
|
2359
|
+
# Check if the DataFrame is created from an OTF table
|
|
2360
|
+
if self._otf:
|
|
2361
|
+
self._df_type = DataFrameTypes.OTF_TABLE.value
|
|
2362
|
+
return self._df_type
|
|
2363
|
+
|
|
2364
|
+
# Check if the DataFrame is created from an ART table
|
|
2365
|
+
if is_art_table:
|
|
2366
|
+
self._df_type = DataFrameTypes.ART_TABLE.value
|
|
2367
|
+
return self._df_type
|
|
2368
|
+
|
|
2369
|
+
# Determine the type based on valid-time, transaction-time columns, and volatility
|
|
2370
|
+
if valid_time and transaction_time:
|
|
2371
|
+
if is_volatile:
|
|
2372
|
+
self._df_type = DataFrameTypes.BI_TEMPORAL_VOLATILE_TABLE.value
|
|
2373
|
+
else:
|
|
2374
|
+
self._df_type = (
|
|
2375
|
+
DataFrameTypes.BI_TEMPORAL_VIEW.value
|
|
2376
|
+
if is_view
|
|
2377
|
+
else DataFrameTypes.BI_TEMPORAL.value
|
|
2378
|
+
)
|
|
2379
|
+
elif valid_time:
|
|
2380
|
+
if is_volatile:
|
|
2381
|
+
self._df_type = DataFrameTypes.VALID_TIME_VOLATILE_TABLE.value
|
|
2382
|
+
else:
|
|
2383
|
+
self._df_type = (
|
|
2384
|
+
DataFrameTypes.VALID_TIME_VIEW.value
|
|
2385
|
+
if is_view
|
|
2386
|
+
else DataFrameTypes.VALID_TIME.value
|
|
2387
|
+
)
|
|
2388
|
+
elif transaction_time:
|
|
2389
|
+
if is_volatile:
|
|
2390
|
+
self._df_type = DataFrameTypes.TRANSACTION_TIME_VOLATILE_TABLE.value
|
|
2391
|
+
else:
|
|
2392
|
+
self._df_type = (
|
|
2393
|
+
DataFrameTypes.TRANSACTION_TIME_VIEW.value
|
|
2394
|
+
if is_view
|
|
2395
|
+
else DataFrameTypes.TRANSACTION_TIME.value
|
|
2396
|
+
)
|
|
2397
|
+
else:
|
|
2398
|
+
self._df_type = (
|
|
2399
|
+
DataFrameTypes.VOLATILE_TABLE.value
|
|
2400
|
+
if is_volatile
|
|
2401
|
+
else (
|
|
2402
|
+
DataFrameTypes.VIEW.value
|
|
2403
|
+
if is_view
|
|
2404
|
+
else DataFrameTypes.REGULAR_TABLE.value
|
|
2405
|
+
)
|
|
2406
|
+
)
|
|
2407
|
+
|
|
2408
|
+
return self._df_type
|
|
2409
|
+
|
|
2410
|
+
@collect_queryband(queryband="DF_info")
|
|
2411
|
+
def info(self, verbose=True, buf=None, max_cols=None, null_counts=False):
|
|
1698
2412
|
"""
|
|
1699
2413
|
DESCRIPTION:
|
|
1700
2414
|
Print a summary of the DataFrame.
|
|
@@ -5888,8 +6602,11 @@ class DataFrame():
|
|
|
5888
6602
|
groupby_col_names.append(col)
|
|
5889
6603
|
groupby_col_types.append(self[col].type)
|
|
5890
6604
|
|
|
5891
|
-
if
|
|
5892
|
-
|
|
6605
|
+
include_grouping_columns = True if isinstance(self, DataFrameGroupBy) and \
|
|
6606
|
+
self._include_grouping_columns else False
|
|
6607
|
+
if not include_grouping_columns and col in col_names:
|
|
6608
|
+
# If 'include_grouping_columns' argument is set to True and,
|
|
6609
|
+
# group by column is not specified in the columns argument,
|
|
5893
6610
|
# then, we should ignore this processing, otherwise we
|
|
5894
6611
|
# should process it in the same way to remove the reference
|
|
5895
6612
|
# for grouping column from aggregation list.
|
|
@@ -5933,7 +6650,8 @@ class DataFrame():
|
|
|
5933
6650
|
|
|
5934
6651
|
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(aggregate_node_id,
|
|
5935
6652
|
zip(new_column_names,
|
|
5936
|
-
new_column_types)
|
|
6653
|
+
new_column_types),
|
|
6654
|
+
datalake=self._metaexpr.datalake)
|
|
5937
6655
|
agg_df = self._create_dataframe_from_node \
|
|
5938
6656
|
(aggregate_node_id, new_metaexpr, self._index_label)
|
|
5939
6657
|
|
|
@@ -6352,7 +7070,8 @@ class DataFrame():
|
|
|
6352
7070
|
sel_nodeid = self._aed_utils._aed_select(self._nodeid, column_expression)
|
|
6353
7071
|
|
|
6354
7072
|
# Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
|
|
6355
|
-
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items()
|
|
7073
|
+
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items(),
|
|
7074
|
+
datalake=self._metaexpr.datalake)
|
|
6356
7075
|
return self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
|
|
6357
7076
|
|
|
6358
7077
|
except TeradataMlException:
|
|
@@ -7302,7 +8021,8 @@ class DataFrame():
|
|
|
7302
8021
|
|
|
7303
8022
|
# Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
|
|
7304
8023
|
# and underlying table name.
|
|
7305
|
-
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items()
|
|
8024
|
+
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items(),
|
|
8025
|
+
datalake=self._metaexpr.datalake)
|
|
7306
8026
|
|
|
7307
8027
|
# Return a new joined dataframe.
|
|
7308
8028
|
return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
|
|
@@ -8675,7 +9395,6 @@ class DataFrame():
|
|
|
8675
9395
|
|
|
8676
9396
|
return df
|
|
8677
9397
|
|
|
8678
|
-
|
|
8679
9398
|
@collect_queryband(queryband="DF_get")
|
|
8680
9399
|
def get(self, key):
|
|
8681
9400
|
"""
|
|
@@ -8785,7 +9504,7 @@ class DataFrame():
|
|
|
8785
9504
|
append:
|
|
8786
9505
|
Optional Argument.
|
|
8787
9506
|
Specifies whether or not to append requested columns to the existing index.
|
|
8788
|
-
|
|
9507
|
+
When append is False, replaces existing index.
|
|
8789
9508
|
When append is True, retains both existing & currently appended index.
|
|
8790
9509
|
Default Value: False
|
|
8791
9510
|
Types: bool
|
|
@@ -8998,6 +9717,15 @@ class DataFrame():
|
|
|
8998
9717
|
Permitted Values: "CUBE", "ROLLUP", None
|
|
8999
9718
|
Types: str or NoneType
|
|
9000
9719
|
|
|
9720
|
+
include_grouping_columns:
|
|
9721
|
+
Optional Argument.
|
|
9722
|
+
Specifies whether to include aggregations on the grouping column(s) or not.
|
|
9723
|
+
When set to True, the resultant DataFrame will have the aggregations on the
|
|
9724
|
+
columns mentioned in "columns_expr". Otherwise, resultant DataFrame will not have
|
|
9725
|
+
aggregations on the columns mentioned in "columns_expr".
|
|
9726
|
+
Default Value: False
|
|
9727
|
+
Types: bool
|
|
9728
|
+
|
|
9001
9729
|
NOTES:
|
|
9002
9730
|
1. Users can still apply teradataml DataFrame methods (filters/sort/etc) on top of the result.
|
|
9003
9731
|
2. Consecutive operations of grouping, i.e., groupby_time(), resample() and groupby() are not permitted.
|
|
@@ -9014,14 +9742,54 @@ class DataFrame():
|
|
|
9014
9742
|
TeradataMlException
|
|
9015
9743
|
|
|
9016
9744
|
EXAMPLES:
|
|
9745
|
+
# Load the data to run the example.
|
|
9017
9746
|
>>> load_example_data("dataframe","admissions_train")
|
|
9747
|
+
|
|
9748
|
+
# Create a DataFrame on 'admissions_train' table.
|
|
9018
9749
|
>>> df = DataFrame("admissions_train")
|
|
9750
|
+
>>> df
|
|
9751
|
+
masters gpa stats programming admitted
|
|
9752
|
+
id
|
|
9753
|
+
15 yes 4.00 Advanced Advanced 1
|
|
9754
|
+
34 yes 3.85 Advanced Beginner 0
|
|
9755
|
+
13 no 4.00 Advanced Novice 1
|
|
9756
|
+
38 yes 2.65 Advanced Beginner 1
|
|
9757
|
+
5 no 3.44 Novice Novice 0
|
|
9758
|
+
40 yes 3.95 Novice Beginner 0
|
|
9759
|
+
7 yes 2.33 Novice Novice 1
|
|
9760
|
+
22 yes 3.46 Novice Beginner 0
|
|
9761
|
+
26 yes 3.57 Advanced Advanced 1
|
|
9762
|
+
17 no 3.83 Advanced Advanced 1
|
|
9763
|
+
|
|
9764
|
+
# Example 1: Find the minimum value of all valid columns by
|
|
9765
|
+
# grouping the DataFrame with column 'masters'.
|
|
9019
9766
|
>>> df1 = df.groupby(["masters"])
|
|
9020
9767
|
>>> df1.min()
|
|
9021
9768
|
masters min_id min_gpa min_stats min_programming min_admitted
|
|
9022
9769
|
0 no 3 1.87 Advanced Advanced 0
|
|
9023
9770
|
1 yes 1 1.98 Advanced Advanced 0
|
|
9024
9771
|
|
|
9772
|
+
# Example 2: Find the sum of all valid columns by grouping the DataFrame
|
|
9773
|
+
# with columns 'masters' and 'admitted'. Include grouping columns
|
|
9774
|
+
# in aggregate function 'sum'.
|
|
9775
|
+
>>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=True)
|
|
9776
|
+
>>> df1.sum()
|
|
9777
|
+
masters admitted sum_id sum_gpa sum_admitted
|
|
9778
|
+
0 yes 1 188 34.35 10
|
|
9779
|
+
1 yes 0 289 43.36 0
|
|
9780
|
+
2 no 0 41 6.44 0
|
|
9781
|
+
3 no 1 302 57.52 16
|
|
9782
|
+
|
|
9783
|
+
# Example 3: Find the sum of all valid columns by grouping the DataFrame with
|
|
9784
|
+
# columns 'masters' and 'admitted'. Do not include grouping columns
|
|
9785
|
+
# in aggregate function 'sum'.
|
|
9786
|
+
>>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=False)
|
|
9787
|
+
>>> df1.sum()
|
|
9788
|
+
masters admitted sum_id sum_gpa
|
|
9789
|
+
0 yes 0 289 43.36
|
|
9790
|
+
1 no 0 41 6.44
|
|
9791
|
+
2 no 1 302 57.52
|
|
9792
|
+
3 yes 1 188 34.35
|
|
9025
9793
|
"""
|
|
9026
9794
|
# Argument validations
|
|
9027
9795
|
arg_info_matrix = []
|
|
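As Examples 2 and 3 above show, "include_grouping_columns" only decides whether aggregate columns are produced for the grouping columns themselves; the grouping columns always appear as plain output columns either way. A standalone sketch of that rule in plain Python (not teradataml internals), checked against the sum_* columns in the two examples:

    def aggregate_column_names(numeric_columns, grouping_columns, include_grouping_columns=False):
        # sum() emits a sum_<col> column per numeric column; grouping columns are
        # skipped unless include_grouping_columns is True.
        keep = numeric_columns if include_grouping_columns else \
            [c for c in numeric_columns if c not in grouping_columns]
        return ["sum_" + c for c in keep]

    numeric = ["id", "gpa", "admitted"]              # numeric columns of admissions_train
    print(aggregate_column_names(numeric, ["masters", "admitted"], True))
    # ['sum_id', 'sum_gpa', 'sum_admitted']  -> matches Example 2
    print(aggregate_column_names(numeric, ["masters", "admitted"]))
    # ['sum_id', 'sum_gpa']                  -> matches Example 3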
@@ -9029,6 +9797,8 @@ class DataFrame():
|
|
|
9029
9797
|
option = kwargs.get("option", None)
|
|
9030
9798
|
arg_info_matrix.append(["option", option, True, (str, type(None)), True,
|
|
9031
9799
|
["CUBE", "ROLLUP", None]])
|
|
9800
|
+
include_grouping_columns = kwargs.get("include_grouping_columns", False)
|
|
9801
|
+
arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, True, (bool)])
|
|
9032
9802
|
|
|
9033
9803
|
# Validate argument types
|
|
9034
9804
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
@@ -9073,7 +9843,8 @@ class DataFrame():
|
|
|
9073
9843
|
|
|
9074
9844
|
groupbyexpr = ', '.join(UtilFuncs._teradata_quote_arg(col, "\"", False) for col in column_list)
|
|
9075
9845
|
groupbyObj = DataFrameGroupBy(self._nodeid, self._metaexpr, self._column_names_and_types, self.columns,
|
|
9076
|
-
groupbyexpr, column_list, option)
|
|
9846
|
+
groupbyexpr, column_list, option, include_grouping_columns)
|
|
9847
|
+
|
|
9077
9848
|
return groupbyObj
|
|
9078
9849
|
except TeradataMlException:
|
|
9079
9850
|
raise
|
|
@@ -11437,7 +12208,8 @@ class DataFrame():
|
|
|
11437
12208
|
column_info = ((col_name, col_type) for col_name, col_type in
|
|
11438
12209
|
new_metaexpr_columns_types.items())
|
|
11439
12210
|
# Get new metaexpr for sample_node_id
|
|
11440
|
-
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sample_node_id, column_info, is_persist=True
|
|
12211
|
+
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sample_node_id, column_info, is_persist=True,
|
|
12212
|
+
datalake=self._metaexpr.datalake)
|
|
11441
12213
|
|
|
11442
12214
|
# Make this non-lazy. Added this in order to fix https://teradata-pe.atlassian.net/browse/ELE-6368
|
|
11443
12215
|
# Cannot use __execute_node_and_set_table_name because self points to original df.
|
|
@@ -12872,9 +13644,9 @@ class DataFrame():
|
|
|
12872
13644
|
3. When ColumnExpression(s) is(are) passed to "order_columns", then the
|
|
12873
13645
|
corresponding expression takes precedence over arguments
|
|
12874
13646
|
"sort_ascending" and "nulls_first". Say, ColumnExpression is col1, then
|
|
12875
|
-
|
|
12876
|
-
|
|
12877
|
-
|
|
13647
|
+
1. col1.asc() or col1.desc() is effective irrespective of "sort_ascending".
|
|
13648
|
+
2. col1.nulls_first() or col1.nulls_last() is effective irrespective of "nulls_first".
|
|
13649
|
+
3. Any combination of the above two takes precedence over "sort_ascending" and "nulls_first".
|
|
12878
13650
|
Types: str OR list of Strings (str) OR ColumnExpression OR list of ColumnExpressions
|
|
12879
13651
|
|
|
12880
13652
|
sort_ascending:
|
|
@@ -13150,12 +13922,14 @@ class DataFrame():
|
|
|
13150
13922
|
False)
|
|
13151
13923
|
column_names = list(dict.fromkeys(column_names))
|
|
13152
13924
|
|
|
13153
|
-
if list_td_reserved_keywords(column_names) or UtilFuncs.
|
|
13925
|
+
if list_td_reserved_keywords(column_names) or UtilFuncs._is_non_ascii(column_names):
|
|
13154
13926
|
column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
|
|
13155
13927
|
|
|
13156
13928
|
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
|
|
13157
13929
|
sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
|
|
13158
|
-
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items()
|
|
13930
|
+
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items(),
|
|
13931
|
+
datalake=self._metaexpr.datalake)
|
|
13932
|
+
|
|
13159
13933
|
return self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
|
|
13160
13934
|
|
|
13161
13935
|
@collect_queryband(queryband="DF_toCsv")
|
|
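The change above widens the quoting rule in select(): a column name is double-quoted not only when it is a Teradata reserved keyword but also when it contains non-ASCII characters. A standalone sketch of that rule with an illustrative reserved-word subset (the real checks are list_td_reserved_keywords and UtilFuncs._is_non_ascii):

    RESERVED_WORDS = {"select", "from", "where"}     # illustrative subset only

    def quote_if_needed(name):
        # Double-quote names that collide with reserved words or contain non-ASCII characters.
        if name.lower() in RESERVED_WORDS or not name.isascii():
            return '"{}"'.format(name)
        return name

    print([quote_if_needed(c) for c in ["gpa", "select", "straße"]])
    # ['gpa', '"select"', '"straße"']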
@@ -15336,7 +16110,7 @@ class DataFrame():
|
|
|
15336
16110
|
return self.assign(**new_columns, drop_columns=True).select(self.columns)
|
|
15337
16111
|
|
|
15338
16112
|
@collect_queryband(queryband="DF_cube")
|
|
15339
|
-
def cube(self, columns):
|
|
16113
|
+
def cube(self, columns, include_grouping_columns=False):
|
|
15340
16114
|
"""
|
|
15341
16115
|
DESCRIPTION:
|
|
15342
16116
|
cube() function creates a multi-dimensional cube for the DataFrame
|
|
@@ -15350,6 +16124,15 @@ class DataFrame():
|
|
|
15350
16124
|
Specifies the name(s) of input teradataml DataFrame column(s).
|
|
15351
16125
|
Types: str OR list of str(s)
|
|
15352
16126
|
|
|
16127
|
+
include_grouping_columns:
|
|
16128
|
+
Optional Argument.
|
|
16129
|
+
Specifies whether to include aggregations on the grouping column(s) or not.
|
|
16130
|
+
When set to True, the resultant DataFrame will have the aggregations on the
|
|
16131
|
+
columns mentioned in "columns". Otherwise, resultant DataFrame will not have
|
|
16132
|
+
aggregations on the columns mentioned in "columns".
|
|
16133
|
+
Default Value: False
|
|
16134
|
+
Types: bool
|
|
16135
|
+
|
|
15353
16136
|
RETURNS:
|
|
15354
16137
|
teradataml DataFrameGroupBy
|
|
15355
16138
|
|
|
@@ -15357,9 +16140,27 @@ class DataFrame():
|
|
|
15357
16140
|
TeradataMlException
|
|
15358
16141
|
|
|
15359
16142
|
EXAMPLES :
|
|
15360
|
-
#
|
|
16143
|
+
# Load the data to run the example.
|
|
15361
16144
|
>>> load_example_data("dataframe","admissions_train")
|
|
16145
|
+
|
|
16146
|
+
# Create a DataFrame on 'admissions_train' table.
|
|
15362
16147
|
>>> df = DataFrame("admissions_train")
|
|
16148
|
+
>>> df
|
|
16149
|
+
masters gpa stats programming admitted
|
|
16150
|
+
id
|
|
16151
|
+
15 yes 4.00 Advanced Advanced 1
|
|
16152
|
+
34 yes 3.85 Advanced Beginner 0
|
|
16153
|
+
13 no 4.00 Advanced Novice 1
|
|
16154
|
+
38 yes 2.65 Advanced Beginner 1
|
|
16155
|
+
5 no 3.44 Novice Novice 0
|
|
16156
|
+
40 yes 3.95 Novice Beginner 0
|
|
16157
|
+
7 yes 2.33 Novice Novice 1
|
|
16158
|
+
22 yes 3.46 Novice Beginner 0
|
|
16159
|
+
26 yes 3.57 Advanced Advanced 1
|
|
16160
|
+
17 no 3.83 Advanced Advanced 1
|
|
16161
|
+
|
|
16162
|
+
# Example 1: Find the sum of all valid columns by grouping the
|
|
16163
|
+
# DataFrame columns with 'masters' and 'stats'.
|
|
15363
16164
|
>>> df1 = df.cube(["masters", "stats"]).sum()
|
|
15364
16165
|
>>> df1
|
|
15365
16166
|
masters stats sum_id sum_gpa sum_admitted
|
|
@@ -15374,10 +16175,42 @@ class DataFrame():
|
|
|
15374
16175
|
8 no Advanced 189 34.95 9
|
|
15375
16176
|
9 yes Novice 98 13.74 1
|
|
15376
16177
|
|
|
16178
|
+
# Example 2: Find the avg of all valid columns by grouping the DataFrame
|
|
16179
|
+
# with columns 'masters' and 'admitted'. Include grouping columns
|
|
16180
|
+
# in aggregate function 'avg'.
|
|
16181
|
+
>>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=True).avg()
|
|
16182
|
+
>>> df1
|
|
16183
|
+
masters admitted avg_id avg_gpa avg_admitted
|
|
16184
|
+
0 yes NaN 21.681818 3.532273 0.454545
|
|
16185
|
+
1 None 1.0 18.846154 3.533462 1.000000
|
|
16186
|
+
2 no NaN 19.055556 3.553333 0.888889
|
|
16187
|
+
3 yes 0.0 24.083333 3.613333 0.000000
|
|
16188
|
+
4 None NaN 20.500000 3.541750 0.650000
|
|
16189
|
+
5 None 0.0 23.571429 3.557143 0.000000
|
|
16190
|
+
6 yes 1.0 18.800000 3.435000 1.000000
|
|
16191
|
+
7 no 1.0 18.875000 3.595000 1.000000
|
|
16192
|
+
8 no 0.0 20.500000 3.220000 0.000000
|
|
16193
|
+
|
|
16194
|
+
# Example 3: Find the avg of all valid columns by grouping the DataFrame with
|
|
16195
|
+
# columns 'masters' and 'admitted'. Do not include grouping columns
|
|
16196
|
+
# in aggregate function 'avg'.
|
|
16197
|
+
>>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=False).avg()
|
|
16198
|
+
>>> df1
|
|
16199
|
+
masters admitted avg_id avg_gpa
|
|
16200
|
+
0 no 0.0 20.500000 3.220000
|
|
16201
|
+
1 None 1.0 18.846154 3.533462
|
|
16202
|
+
2 no NaN 19.055556 3.553333
|
|
16203
|
+
3 yes 0.0 24.083333 3.613333
|
|
16204
|
+
4 None NaN 20.500000 3.541750
|
|
16205
|
+
5 None 0.0 23.571429 3.557143
|
|
16206
|
+
6 yes 1.0 18.800000 3.435000
|
|
16207
|
+
7 yes NaN 21.681818 3.532273
|
|
16208
|
+
8 no 1.0 18.875000 3.595000
|
|
15377
16209
|
"""
|
|
15378
16210
|
# Validate columns argument.
|
|
15379
16211
|
arg_info_matrix = []
|
|
15380
16212
|
arg_info_matrix.append(["columns", columns, False, (str, list), True])
|
|
16213
|
+
arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])
|
|
15381
16214
|
|
|
15382
16215
|
# Validate argument types
|
|
15383
16216
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
@@ -15387,10 +16220,10 @@ class DataFrame():
|
|
|
15387
16220
|
|
|
15388
16221
|
# Query generation of cube API is same as the group by.
|
|
15389
16222
|
# Only 'cube' is concatenated with 'group by' clause.
|
|
15390
|
-
return self.groupby(columns, option="cube")
|
|
16223
|
+
return self.groupby(columns, option="cube", include_grouping_columns=include_grouping_columns)
|
|
15391
16224
|
|
|
15392
16225
|
@collect_queryband(queryband="DF_rollup")
|
|
15393
|
-
def rollup(self, columns):
|
|
16226
|
+
def rollup(self, columns, include_grouping_columns=False):
|
|
15394
16227
|
"""
|
|
15395
16228
|
DESCRIPTION:
|
|
15396
16229
|
rollup() function creates a multi-dimensional rollup for the DataFrame
|
|
@@ -15404,6 +16237,15 @@ class DataFrame():
|
|
|
15404
16237
|
Specifies the name(s) of input teradataml DataFrame column(s).
|
|
15405
16238
|
Types: str OR list of str(s)
|
|
15406
16239
|
|
|
16240
|
+
include_grouping_columns:
|
|
16241
|
+
Optional Argument.
|
|
16242
|
+
Specifies whether to include aggregations on the grouping column(s) or not.
|
|
16243
|
+
When set to True, the resultant DataFrame will have the aggregations on the
|
|
16244
|
+
columns mentioned in "columns". Otherwise, resultant DataFrame will not have
|
|
16245
|
+
aggregations on the columns mentioned in "columns".
|
|
16246
|
+
Default Value: False
|
|
16247
|
+
Types: bool
|
|
16248
|
+
|
|
15407
16249
|
RETURNS:
|
|
15408
16250
|
teradataml DataFrameGroupBy
|
|
15409
16251
|
|
|
@@ -15411,9 +16253,27 @@ class DataFrame():
|
|
|
15411
16253
|
TeradataMlException
|
|
15412
16254
|
|
|
15413
16255
|
EXAMPLES :
|
|
15414
|
-
#
|
|
16256
|
+
# Load the data to run the example.
|
|
15415
16257
|
>>> load_example_data("dataframe","admissions_train")
|
|
16258
|
+
|
|
16259
|
+
# Create a DataFrame on 'admissions_train' table.
|
|
15416
16260
|
>>> df = DataFrame("admissions_train")
|
|
16261
|
+
>>> df
|
|
16262
|
+
masters gpa stats programming admitted
|
|
16263
|
+
id
|
|
16264
|
+
15 yes 4.00 Advanced Advanced 1
|
|
16265
|
+
34 yes 3.85 Advanced Beginner 0
|
|
16266
|
+
13 no 4.00 Advanced Novice 1
|
|
16267
|
+
38 yes 2.65 Advanced Beginner 1
|
|
16268
|
+
5 no 3.44 Novice Novice 0
|
|
16269
|
+
40 yes 3.95 Novice Beginner 0
|
|
16270
|
+
7 yes 2.33 Novice Novice 1
|
|
16271
|
+
22 yes 3.46 Novice Beginner 0
|
|
16272
|
+
26 yes 3.57 Advanced Advanced 1
|
|
16273
|
+
17 no 3.83 Advanced Advanced 1
|
|
16274
|
+
|
|
16275
|
+
# Example 1: Find the sum of all valid columns by grouping the
|
|
16276
|
+
# DataFrame columns with 'masters' and 'stats'.
|
|
15417
16277
|
>>> df1 = df.rollup(["masters", "stats"]).sum()
|
|
15418
16278
|
>>> df1
|
|
15419
16279
|
masters stats sum_id sum_gpa sum_admitted
|
|
@@ -15427,10 +16287,38 @@ class DataFrame():
|
|
|
15427
16287
|
7 yes Advanced 366 49.26 7
|
|
15428
16288
|
8 no Advanced 189 34.95 9
|
|
15429
16289
|
|
|
16290
|
+
# Example 2: Find the avg of all valid columns by grouping the DataFrame
|
|
16291
|
+
# with columns 'masters' and 'admitted'. Include grouping columns
|
|
16292
|
+
# in aggregate function 'avg'.
|
|
16293
|
+
>>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=True).avg()
|
|
16294
|
+
>>> df1
|
|
16295
|
+
masters admitted avg_id avg_gpa avg_admitted
|
|
16296
|
+
0 no NaN 19.055556 3.553333 0.888889
|
|
16297
|
+
1 yes NaN 21.681818 3.532273 0.454545
|
|
16298
|
+
2 None NaN 20.500000 3.541750 0.650000
|
|
16299
|
+
3 yes 0.0 24.083333 3.613333 0.000000
|
|
16300
|
+
4 no 1.0 18.875000 3.595000 1.000000
|
|
16301
|
+
5 yes 1.0 18.800000 3.435000 1.000000
|
|
16302
|
+
6 no 0.0 20.500000 3.220000 0.000000
|
|
16303
|
+
|
|
16304
|
+
# Example 3: Find the avg of all valid columns by grouping the DataFrame with
|
|
16305
|
+
# columns 'masters' and 'admitted'. Do not include grouping columns
|
|
16306
|
+
# in aggregate function 'avg'.
|
|
16307
|
+
>>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=False).avg()
|
|
16308
|
+
>>> df1
|
|
16309
|
+
masters admitted avg_id avg_gpa
|
|
16310
|
+
0 no NaN 19.055556 3.553333
|
|
16311
|
+
1 yes NaN 21.681818 3.532273
|
|
16312
|
+
2 no 0.0 20.500000 3.220000
|
|
16313
|
+
3 yes 0.0 24.083333 3.613333
|
|
16314
|
+
4 no 1.0 18.875000 3.595000
|
|
16315
|
+
5 yes 1.0 18.800000 3.435000
|
|
16316
|
+
6 None NaN 20.500000 3.541750
|
|
15430
16317
|
"""
|
|
15431
16318
|
# Validate columns argument.
|
|
15432
16319
|
arg_info_matrix = []
|
|
15433
16320
|
arg_info_matrix.append(["columns", columns, False, (str, list), True])
|
|
16321
|
+
arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])
|
|
15434
16322
|
|
|
15435
16323
|
# Validate argument types
|
|
15436
16324
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
@@ -15440,7 +16328,1126 @@ class DataFrame():
|
|
|
15440
16328
|
|
|
15441
16329
|
# Query generation of cube API is same as the group by.
|
|
15442
16330
|
# Only 'rollup' is concatenated with 'group by' clause.
|
|
15443
|
-
return self.groupby(columns, option="rollup")
|
|
16331
|
+
return self.groupby(columns, option="rollup", include_grouping_columns=include_grouping_columns)
|
|
16332
|
+
|
|
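cube() and rollup() remain thin wrappers over groupby(): the new "include_grouping_columns" flag is simply forwarded, and only the GROUP BY option differs. A short usage sketch, assuming the admissions_train DataFrame from the examples above:

    g = df.groupby(["masters", "stats"])                                  # plain GROUP BY
    c = df.cube(["masters", "stats"], include_grouping_columns=True)      # GROUP BY ... CUBE
    r = df.rollup(["masters", "stats"], include_grouping_columns=True)    # GROUP BY ... ROLLUP
    # Each call returns a DataFrameGroupBy, so the usual aggregates apply:
    print(c.sum())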
16333
|
+
# Metadata functions for DataFrame created on datalake/OTF table.
|
|
16334
|
+
@property
|
|
16335
|
+
@collect_queryband(queryband="DF_snpsht")
|
|
16336
|
+
@df_utils.check_otf_dataframe()
|
|
16337
|
+
def snapshots(self):
|
|
16338
|
+
"""
|
|
16339
|
+
DESCRIPTION:
|
|
16340
|
+
Gets snapshot information for a DataLake table.
|
|
16341
|
+
|
|
16342
|
+
PARAMETERS:
|
|
16343
|
+
None
|
|
16344
|
+
|
|
16345
|
+
RETURNS:
|
|
16346
|
+
teradataml DataFrame.
|
|
16347
|
+
|
|
16348
|
+
RAISES:
|
|
16349
|
+
TeradataMLException.
|
|
16350
|
+
|
|
16351
|
+
EXAMPLES :
|
|
16352
|
+
# Example 1: Get the snapshot information for datalake table.
|
|
16353
|
+
>>> from teradataml.dataframe.dataframe import in_schema
|
|
16354
|
+
>>> in_schema_tbl = in_schema(schema_name="datalake_db",
|
|
16355
|
+
... table_name="datalake_table",
|
|
16356
|
+
... datalake_name="datalake")
|
|
16357
|
+
>>> datalake_df = DataFrame(in_schema_tbl)
|
|
16358
|
+
>>> datalake_df.snapshots
|
|
16359
|
+
snapshotId snapshotTimestamp timestampMSecs manifestList summary
|
|
16360
|
+
0 6373759902296319074 2023-06-15 00:07:47 1686787667420 s3://vim-iceberg-v1/glue/metadata/snap-6373759... {"added-data-files":"1","added-records":"5","a...}
|
|
16361
|
+
1 4768076782814510171 2023-06-15 00:09:01 1686787741964 s3://vim-iceberg-v1/glue/metadata/snap-4768076... {"added-data-files":"1","added-records":"2","a...}
|
|
16362
|
+
2 7771482207931850214 2024-05-29 04:59:09 1716958749946 s3://vim-iceberg-v1/glue/metadata/snap-7771482... {"deleted-data-files":"2","deleted-records":"7...}
|
|
16363
|
+
3 1545363077953282623 2024-05-29 05:13:39 1716959619455 s3://vim-iceberg-v1/glue/metadata/snap-1545363... {"changed-partition-count":"0","total-records"...}
|
|
16364
|
+
4 2166707884289108360 2024-05-29 05:17:49 1716959869075 s3://vim-iceberg-v1/glue/metadata/snap-2166707... {"changed-partition-count":"0","total-records"...}
|
|
16365
|
+
5 8934190131471882700 2024-05-29 05:21:32 1716960092422 s3://vim-iceberg-v1/glue/metadata/snap-8934190... {"changed-partition-count":"0","total-records"...}
|
|
16366
|
+
6 3086605171258231948 2024-05-29 05:34:43 1716960883786 s3://vim-iceberg-v1/glue/metadata/snap-3086605... {"changed-partition-count":"0","total-records"...}
|
|
16367
|
+
7 7592503716012384122 2024-05-29 06:04:48 1716962688047 s3://vim-iceberg-v1/glue/metadata/snap-7592503... {"changed-partition-count":"0","total-records"...}
|
|
16368
|
+
8 2831061717890032890 2024-06-04 17:21:01 1717521661689 s3://vim-iceberg-v1/glue/metadata/snap-2831061... {"added-data-files":"2","added-records":"7","a...}
|
|
16369
|
+
9 8810491341502972715 2024-10-22 23:47:22 1729640842067 s3://vim-iceberg-v1/glue/metadata/snap-8810491... {"added-data-files":"1","added-records":"1","a...}
|
|
16370
|
+
10 3953136136558551163 2024-12-03 04:40:48 1733200848733 s3://vim-iceberg-v1/glue/metadata/snap-3953136... {"added-data-files":"1","added-records":"4","a...}
|
|
16371
|
+
11 6034775168901969481 2024-12-03 04:40:49 1733200849966 s3://vim-iceberg-v1/glue/metadata/snap-6034775... {"deleted-data-files":"1","deleted-records":"5...}
|
|
16372
|
+
"""
|
|
16373
|
+
return self._execute_metadata_query_and_generate_dataframe("TD_SNAPSHOTS")
|
|
16374
|
+
|
|
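A short follow-up sketch tying the listing above to time travel (assumes the OTF DataFrame from Example 1, pandas available locally, and the get_snapshot() method defined further below): pick the most recent snapshot and read the table as of that snapshot.

    latest = datalake_df.snapshots.to_pandas().sort_values("timestampMSecs").iloc[-1]
    historical_df = datalake_df.get_snapshot(int(latest["snapshotId"]))
    print(historical_df)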
16375
|
+
@property
|
|
16376
|
+
@collect_queryband(queryband="DF_prttns")
|
|
16377
|
+
@df_utils.check_otf_dataframe()
|
|
16378
|
+
def partitions(self):
|
|
16379
|
+
"""
|
|
16380
|
+
DESCRIPTION:
|
|
16381
|
+
Gets partition information for a DataLake table.
|
|
16382
|
+
|
|
16383
|
+
PARAMETERS:
|
|
16384
|
+
None
|
|
16385
|
+
|
|
16386
|
+
RETURNS:
|
|
16387
|
+
teradataml DataFrame.
|
|
16388
|
+
|
|
16389
|
+
RAISES:
|
|
16390
|
+
TeradataMLException.
|
|
16391
|
+
|
|
16392
|
+
EXAMPLES :
|
|
16393
|
+
# Example 1: Get the partition information for datalake table.
|
|
16394
|
+
>>> from teradataml.dataframe.dataframe import in_schema
|
|
16395
|
+
>>> in_schema_tbl = in_schema(schema_name="datalake_db",
|
|
16396
|
+
... table_name="datalake_table",
|
|
16397
|
+
... datalake_name="datalake")
|
|
16398
|
+
>>> datalake_df = DataFrame(in_schema_tbl)
|
|
16399
|
+
>>> datalake_df.partitions
|
|
16400
|
+
id name
|
|
16401
|
+
0 1000 c2
|
|
16402
|
+
1 1001 c3
|
|
16403
|
+
|
|
16404
|
+
|
|
16405
|
+
"""
|
|
16406
|
+
return self._execute_metadata_query_and_generate_dataframe("TD_PARTITIONS")
|
|
16407
|
+
|
|
16408
|
+
@property
|
|
16409
|
+
@collect_queryband(queryband="DF_mnfsts")
|
|
16410
|
+
@df_utils.check_otf_dataframe()
|
|
16411
|
+
def manifests(self):
|
|
16412
|
+
"""
|
|
16413
|
+
DESCRIPTION:
|
|
16414
|
+
Gets manifest information for a DataLake table.
|
|
16415
|
+
|
|
16416
|
+
PARAMETERS:
|
|
16417
|
+
None
|
|
16418
|
+
|
|
16419
|
+
RETURNS:
|
|
16420
|
+
teradataml DataFrame.
|
|
16421
|
+
|
|
16422
|
+
RAISES:
|
|
16423
|
+
TeradataMLException.
|
|
16424
|
+
|
|
16425
|
+
EXAMPLES :
|
|
16426
|
+
# Example 1: Get the manifest information for datalake table.
|
|
16427
|
+
>>> from teradataml.dataframe.dataframe import in_schema
|
|
16428
|
+
>>> in_schema_tbl = in_schema(schema_name="datalake_db",
|
|
16429
|
+
... table_name="datalake_table",
|
|
16430
|
+
... datalake_name="datalake")
|
|
16431
|
+
>>> datalake_df = DataFrame(in_schema_tbl)
|
|
16432
|
+
>>> datalake_df.manifests
|
|
16433
|
+
snapshotId snapshotTimestamp manifestList manifestFile manifestFileLength datafilecount totalrowcount
|
|
16434
|
+
0 8068130797628952520 2025-05-02 11:45:26 s3://vim-iceberg-v1/otftestdb/nt_sales/... s3://vim-iceberg-v1/otftestdb/nt_sales/... 7158 6 6
|
|
16435
|
+
"""
|
|
16436
|
+
return self._execute_metadata_query_and_generate_dataframe("TD_MANIFESTS")
|
|
16437
|
+
|
|
16438
|
+
@property
|
|
16439
|
+
@collect_queryband(queryband="DF_hstry")
|
|
16440
|
+
@df_utils.check_otf_dataframe()
|
|
16441
|
+
def history(self):
|
|
16442
|
+
"""
|
|
16443
|
+
DESCRIPTION:
|
|
16444
|
+
Gets the snapshot history related to a DataLake table.
|
|
16445
|
+
|
|
16446
|
+
PARAMETERS:
|
|
16447
|
+
None
|
|
16448
|
+
|
|
16449
|
+
RETURNS:
|
|
16450
|
+
teradataml DataFrame.
|
|
16451
|
+
|
|
16452
|
+
RAISES:
|
|
16453
|
+
TeradataMLException.
|
|
16454
|
+
|
|
16455
|
+
EXAMPLES :
|
|
16456
|
+
# Example 1: Get the snapshot history for datalake table.
|
|
16457
|
+
>>> from teradataml.dataframe.dataframe import in_schema
|
|
16458
|
+
>>> in_schema_tbl = in_schema(schema_name="datalake_db",
|
|
16459
|
+
... table_name="datalake_table",
|
|
16460
|
+
... datalake_name="datalake")
|
|
16461
|
+
>>> datalake_df = DataFrame(in_schema_tbl)
|
|
16462
|
+
>>> datalake_df.history
|
|
16463
|
+
id timestamp
|
|
16464
|
+
0 8068130797628952520 2025-05-02 11:45:26
|
|
16465
|
+
"""
|
|
16466
|
+
return self._execute_metadata_query_and_generate_dataframe("TD_HISTORY")
|
|
16467
|
+
|
|
16468
|
+
def _execute_metadata_query_and_generate_dataframe(self, func_name):
|
|
16469
|
+
"""Function executes OTF metadata query and return result in DataFrame format"""
|
|
16470
|
+
query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_METADATA).format(func_name,
|
|
16471
|
+
self._table_name)
|
|
16472
|
+
return DataFrame.from_query(query)
|
|
16473
|
+
|
|
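All four metadata properties above funnel into this helper; only the metadata function name ("TD_SNAPSHOTS", "TD_PARTITIONS", "TD_MANIFESTS", "TD_HISTORY") changes. A minimal usage sketch, assuming the OTF DataFrame from the earlier examples:

    for name, meta_df in (("snapshots", datalake_df.snapshots),
                          ("partitions", datalake_df.partitions),
                          ("manifests", datalake_df.manifests),
                          ("history", datalake_df.history)):
        print(name)
        print(meta_df)        # each accessor returns a teradataml DataFrame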
16474
|
+
@collect_queryband(queryband="DF_gt_snpsht")
|
|
16475
|
+
@df_utils.check_otf_dataframe()
|
|
16476
|
+
def get_snapshot(self, as_of):
|
|
16477
|
+
"""
|
|
16478
|
+
DESCRIPTION:
|
|
16479
|
+
Gets the data from a DataLake table for the given snapshot id or timestamp string.
|
|
16480
|
+
Notes:
|
|
16481
|
+
* The snapshot id can be obtained from the 'snapshots' property of the DataFrame.
|
|
16482
|
+
* The time travel value represented by 'as_of' should be in the format "YYYY-MM-DD HH:MM:SS.FFFFFFF"
|
|
16483
|
+
for TIMESTAMP string or "YYYY-MM-DD" for DATE string.
|
|
16484
|
+
|
|
16485
|
+
PARAMETERS:
|
|
16486
|
+
as_of:
|
|
16487
|
+
Required Argument.
|
|
16488
|
+
Specifies the snapshot id or timestamp information for which the snapshot is to be fetched.
|
|
16489
|
+
Types: str or int
|
|
16490
|
+
|
|
16491
|
+
RETURNS:
|
|
16492
|
+
teradataml DataFrame.
|
|
16493
|
+
|
|
16494
|
+
RAISES:
|
|
16495
|
+
TeradataMLException.
|
|
16496
|
+
|
|
16497
|
+
EXAMPLES:
|
|
16498
|
+
# DataFrame creation on OTF table.
|
|
16499
|
+
>>> from teradataml.dataframe.dataframe import in_schema
|
|
16500
|
+
>>> in_schema_tbl = in_schema(schema_name="datalake_db",
|
|
16501
|
+
... table_name="datalake_table",
|
|
16502
|
+
... datalake_name="datalake")
|
|
16503
|
+
>>> datalake_df = DataFrame(in_schema_tbl)
|
|
16504
|
+
|
|
16505
|
+
# List snapshots first.
|
|
16506
|
+
>>> datalake_df.snapshots
|
|
16507
|
+
snapshotId snapshotTimestamp timestampMSecs manifestList summary
|
|
16508
|
+
2046682612111137809 2025-06-03 13:26:15 1748957175692 s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-204... {"added-data-files":"Red Inc","added-records"...}
|
|
16509
|
+
282293708812257203 2025-06-03 05:53:19 1748929999245 s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-282... {"added-data-files":"Blue Inc","added-records"...}
|
|
16510
|
+
|
|
16511
|
+
# Example 1: Get the snapshot using snapshot id.
|
|
16512
|
+
>>> datalake_df.get_snapshot(2046682612111137809)
|
|
16513
|
+
Feb Jan Mar Apr datetime
|
|
16514
|
+
accounts
|
|
16515
|
+
Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
|
|
16516
|
+
Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
|
|
16517
|
+
Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
|
|
16518
|
+
Yellow Inc 90.0 NaN NaN NaN 04/01/2017
|
|
16519
|
+
Orange Inc 210.0 NaN NaN 250.0 04/01/2017
|
|
16520
|
+
Red Inc 200.0 150.0 140.0 NaN 04/01/2017
|
|
16521
|
+
|
|
16522
|
+
# Example 2: Get the snapshot using snapshot id in string format.
|
|
16523
|
+
>>> datalake_df.get_snapshot("2046682612111137809")
|
|
16524
|
+
Feb Jan Mar Apr datetime
|
|
16525
|
+
accounts
|
|
16526
|
+
Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
|
|
16527
|
+
Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
|
|
16528
|
+
Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
|
|
16529
|
+
Yellow Inc 90.0 NaN NaN NaN 04/01/2017
|
|
16530
|
+
Orange Inc 210.0 NaN NaN 250.0 04/01/2017
|
|
16531
|
+
Red Inc 200.0 150.0 140.0 NaN 04/01/2017
|
|
16532
|
+
|
|
16533
|
+
# Example 3: Get the snapshot using timestamp string.
|
|
16534
|
+
>>> datalake_df.get_snapshot("2025-06-03 13:26:16")
|
|
16535
|
+
Feb Jan Mar Apr datetime
|
|
16536
|
+
accounts
|
|
16537
|
+
Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
|
|
16538
|
+
Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
|
|
16539
|
+
Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
|
|
16540
|
+
Yellow Inc 90.0 NaN NaN NaN 04/01/2017
|
|
16541
|
+
Orange Inc 210.0 NaN NaN 250.0 04/01/2017
|
|
16542
|
+
Red Inc 200.0 150.0 140.0 NaN 04/01/2017
|
|
16543
|
+
|
|
16544
|
+
# Example 4: Get the snapshot using date string.
|
|
16545
|
+
>>> datalake_df.get_snapshot("2025-06-04")
|
|
16546
|
+
Feb Jan Mar Apr datetime
|
|
16547
|
+
accounts
|
|
16548
|
+
Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
|
|
16549
|
+
Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
|
|
16550
|
+
Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
|
|
16551
|
+
Yellow Inc 90.0 NaN NaN NaN 04/01/2017
|
|
16552
|
+
Orange Inc 210.0 NaN NaN 250.0 04/01/2017
|
|
16553
|
+
Red Inc 200.0 150.0 140.0 NaN 04/01/2017
|
|
16554
|
+
|
|
16555
|
+
"""
|
|
16556
|
+
_Validators._validate_function_arguments([["as_of", as_of, False, (int, str)]])
|
|
16557
|
+
|
|
16558
|
+
# If already int or string representation of int, return by quoting it
|
|
16559
|
+
if isinstance(as_of, int) or (isinstance(as_of, str) and as_of.isdigit()):
|
|
16560
|
+
snapshot_on = "'{}'".format(as_of)
|
|
16561
|
+
else:
|
|
16562
|
+
try:
|
|
16563
|
+
snapshot_on = UtilFuncs._get_time_formatted_string(as_of)
|
|
16564
|
+
except ValueError as e:
|
|
16565
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
16566
|
+
"get_snapshot", "Invalid value for 'as_of' argument: {}. "
|
|
16567
|
+
"Use valid format [\"YYYY-MM-DD HH:MM:SS.FFFFFFF\", \"YYYY-MM-DD HH:MM:SS\","
|
|
16568
|
+
"\"YYYY-MM-DD\"]".format(as_of)),
|
|
16569
|
+
MessageCodes.FUNC_EXECUTION_FAILED)
|
|
16570
|
+
|
|
16571
|
+
query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_SNAPSHOT).format(self._table_name, snapshot_on)
|
|
16572
|
+
|
|
16573
|
+
try:
|
|
16574
|
+
return DataFrame.from_query(query)
|
|
16575
|
+
except TeradataMlException as e:
|
|
16576
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
|
|
16577
|
+
"get_snapshot()", "Invalid value for 'as_of' argument: {}. "
|
|
16578
|
+
"Use valid timestamp or correct snapshot id listed using 'snapshots' property.".format(as_of)),
|
|
16579
|
+
MessageCodes.FUNC_EXECUTION_FAILED)
|
|
16580
|
+
|
|
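A standalone sketch of the normalisation get_snapshot() applies to "as_of" above: snapshot ids (ints or digit-only strings) are quoted as-is, while anything else is treated as a time-travel literal. The timestamp branch below is an illustrative stand-in only; the real code delegates to UtilFuncs._get_time_formatted_string.

    def normalize_as_of(as_of):
        # Snapshot ids are quoted directly; other values become time-travel literals.
        if isinstance(as_of, int) or (isinstance(as_of, str) and as_of.isdigit()):
            return "'{}'".format(as_of)
        return "TIMESTAMP '{}'".format(as_of)               # simplified stand-in

    print(normalize_as_of(2046682612111137809))             # "'2046682612111137809'"
    print(normalize_as_of("2025-06-03 13:26:16"))           # "TIMESTAMP '2025-06-03 13:26:16'"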
16581
|
+
def as_of(self, **kwargs):
|
|
16582
|
+
"""
|
|
16583
|
+
DESCRIPTION:
|
|
16584
|
+
Function to get DataFrame at specific time on temporal table.
|
|
16585
|
+
Note:
|
|
16586
|
+
Function is supported only on temporal tables or temporal views.
|
|
16587
|
+
|
|
16588
|
+
PARAMETERS:
|
|
16589
|
+
kwargs:
|
|
16590
|
+
Specifies keyword arguments.
|
|
16591
|
+
|
|
16592
|
+
valid_time:
|
|
16593
|
+
Optional Argument.
|
|
16594
|
+
Specifies the valid time to retrieve data from DataFrame created on either ValidTime
|
|
16595
|
+
or BiTemporal table/view.
|
|
16596
|
+
Notes:
|
|
16597
|
+
* Either "valid_time" or "transaction_time" must be provided.
|
|
16598
|
+
* Argument accepts below values:
|
|
16599
|
+
* "current" - to get the current valid time data.
|
|
16600
|
+
* any string other than "current" is considered as a date and the data valid at that point of time is retrieved.
|
|
16601
|
+
* date object - to get the data valid on that date.
|
|
16602
|
+
* datetime object - to get the data valid at that point of time.
|
|
16603
|
+
* tuple - to get the data which is valid between the two valid times.
|
|
16604
|
+
* tuple should have only two elements. The first element is considered as the starting time
|
|
16605
|
+
and the second element as the end time of the period.
|
|
16606
|
+
Records will be retrieved which are valid between the two valid times.
|
|
16607
|
+
* Both elements can be of date or datetime or string type. If you are using
|
|
16608
|
+
string, make sure the string represents a valid date.
|
|
16609
|
+
* Any element can be None.
|
|
16610
|
+
* If first element is None and valid time dimension column is PERIOD_DATE type,
|
|
16611
|
+
then it is considered as '0001-01-01'.
|
|
16612
|
+
* If first element is None and valid time dimension column is PERIOD_TIMESTAMP type,
|
|
16613
|
+
then it is considered as '0001-01-01 00:00:00.000000+00:00'.
|
|
16614
|
+
* If second element is None and valid time dimension column is PERIOD_DATE type,
|
|
16615
|
+
then it is considered as '9999-12-31'.
|
|
16616
|
+
* If second element is None and valid time dimension column is PERIOD_TIMESTAMP type,
|
|
16617
|
+
then it is considered as '9999-12-31 23:59:59.999999+00:00'.
|
|
16618
|
+
* None - to consider the DataFrame as regular DataFrame and retrieve all the records from
|
|
16619
|
+
valid time dimension.
|
|
16620
|
+
Types: date or str or tuple or NoneType
|
|
16621
|
+
|
|
16622
|
+
include_valid_time_column:
|
|
16623
|
+
Optional Argument.
|
|
16624
|
+
Specifies whether to include the valid time dimension column in the resultant DataFrame.
|
|
16625
|
+
When set to True, valid time dimension column is included in resultant DataFrame.
|
|
16626
|
+
Otherwise, valid time dimension column is not included in resultant DataFrame.
|
|
16627
|
+
Note:
|
|
16628
|
+
Ignored when "valid_time" is either tuple or None.
|
|
16629
|
+
Default Value: False
|
|
16630
|
+
Types: bool
|
|
16631
|
+
|
|
16632
|
+
transaction_time:
|
|
16633
|
+
Optional Argument.
|
|
16634
|
+
Specifies the transaction time to retrieve data from DataFrame created on either
|
|
16635
|
+
TransactionTime or BiTemporal table/view.
|
|
16636
|
+
Notes:
|
|
16637
|
+
* Either "valid_time" or "transaction_time" must be provided.
|
|
16638
|
+
* Argument accepts below values.
|
|
16639
|
+
* "current" - to get the records which are valid at current time.
|
|
16640
|
+
* any string other than "current" is considered as a timestamp and records which are
|
|
16641
|
+
valid at that point of time are retrieved.
|
|
16642
|
+
* datetime object - to get the records which are valid at that point of time.
|
|
16643
|
+
* None - to consider the DataFrame as regular DataFrame and retrieve all the records
|
|
16644
|
+
from transaction time dimension.
|
|
16645
|
+
Types: datetime or str or NoneType
|
|
16646
|
+
|
|
16647
|
+
include_transaction_time_column:
|
|
16648
|
+
Optional Argument.
|
|
16649
|
+
Specifies whether to include the transaction time dimension column in the resultant DataFrame.
|
|
16650
|
+
When set to True, transaction time dimension column is included in resultant DataFrame.
|
|
16651
|
+
Otherwise, transaction time dimension column is not included in resultant DataFrame.
|
|
16652
|
+
Default Value: False
|
|
16653
|
+
Types: bool
|
|
16654
|
+
|
|
16655
|
+
additional_period:
|
|
16656
|
+
Optional Argument.
|
|
16657
|
+
Specifies the additional period to be kept in resultant DataFrame.
|
|
16658
|
+
Note:
|
|
16659
|
+
This is applicable only when "valid_time" is None.
|
|
16660
|
+
Types: tuple of date or str
|
|
16661
|
+
|
|
16662
|
+
RETURNS:
|
|
16663
|
+
teradataml DataFrame
|
|
16664
|
+
|
|
16665
|
+
RAISES:
|
|
16666
|
+
TeradataMlException.
|
|
16667
|
+
|
|
16668
|
+
EXAMPLES:
|
|
16669
|
+
# Load the data to run the example.
|
|
16670
|
+
>>> load_example_data("teradataml", "Employee_roles") # load valid time data.
|
|
16671
|
+
>>> load_example_data("teradataml", "Employee_Address") # load transaction time data.
|
|
16672
|
+
>>> load_example_data("teradataml", "Employee") # load bitemporal data.
|
|
16673
|
+
|
|
16674
|
+
>>> df1 = DataFrame("Employee_roles")
|
|
16675
|
+
EmployeeName Department Salary role_validity_period
|
|
16676
|
+
EmployeeID
|
|
16677
|
+
1 John Doe IT 100.0 ('20/01/01', '24/12/31')
|
|
16678
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16679
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16680
|
+
3 Bob Sales 300.0 ('24/01/01', '24/12/31')
|
|
16681
|
+
|
|
16682
|
+
# Example 1: Get the employee roles from DataFrame df1 which are valid at current time.
|
|
16683
|
+
>>> df1.as_of(valid_time="current")
|
|
16684
|
+
EmployeeName Department Salary
|
|
16685
|
+
EmployeeID
|
|
16686
|
+
2 Jane Smith DA 200.0
|
|
16687
|
+
3 Bob Marketing 330.0
|
|
16688
|
+
|
|
16689
|
+
# Example 2: Get the employee roles from DataFrame df1 which are valid at current time.
|
|
16690
|
+
# Also include valid time dimension column.
|
|
16691
|
+
>>> df1.as_of(valid_time="current", include_valid_time_column=True)
|
|
16692
|
+
EmployeeName Department Salary role_validity_period
|
|
16693
|
+
EmployeeID
|
|
16694
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16695
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16696
|
+
|
|
16697
|
+
# Example 3: Get the employee roles from DataFrame df1 which are valid at 31st Dec 2026.
|
|
16698
|
+
# Include valid time dimension column.
|
|
16699
|
+
>>> df1.as_of(valid_time="2026-12-31", include_valid_time_column=True)
|
|
16700
|
+
EmployeeName Department Salary role_validity_period
|
|
16701
|
+
EmployeeID
|
|
16702
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16703
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16704
|
+
|
|
16705
|
+
# Example 4: Get the employee roles from DataFrame df1 which are valid at 31st Dec 2026.
|
|
16706
|
+
# Also include valid time dimension column. Use date object instead of string
|
|
16707
|
+
# to specify the date.
|
|
16708
|
+
>>> from datetime import date
|
|
16709
|
+
>>> d = date(2026, 12, 31)
|
|
16710
|
+
>>> df1.as_of(valid_time=d, include_valid_time_column=True)
|
|
16711
|
+
EmployeeName Department Salary role_validity_period
|
|
16712
|
+
EmployeeID
|
|
16713
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
|
|
16714
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16715
|
+
|
|
16716
|
+
# Example 5: Get the employee roles which are valid between 20th Jan 2018 and 5th March 2024.
|
|
16717
|
+
# Include valid time dimension column.
|
|
16718
|
+
>>> df1.as_of(valid_time=("2018-01-20", "2024-03-05"), include_valid_time_column=True)
|
|
16719
|
+
EmployeeName Department Salary VALIDTIME
|
|
16720
|
+
EmployeeID
|
|
16721
|
+
2 Jane Smith DA 200.0 ('20/01/01', '24/03/05')
|
|
16722
|
+
1 John Doe IT 100.0 ('20/01/01', '24/03/05')
|
|
16723
|
+
3 Bob Sales 300.0 ('24/01/01', '24/03/05')
|
|
16724
|
+
|
|
16725
|
+
# Example 6: Get the employee roles which are valid between 20th Jan 2018 and 5th March 2024.
|
|
16726
|
+
# Then again get the records which are valid at 1st Jan 2023. Do not include
|
|
16727
|
+
# valid time dimension column since selecting valid time dimension column is ignored
|
|
16728
|
+
# when "valid_time" is a tuple.
|
|
16729
|
+
>>> df1.as_of(valid_time=(date(2018, 1, 20), "2024-03-05")).as_of(valid_time=date(2023, 1, 1))
|
|
16730
|
+
EmployeeName Department Salary
|
|
16731
|
+
EmployeeID
|
|
16732
|
+
2 Jane Smith DA 200.0
|
|
16733
|
+
1 John Doe IT 100.0
|
|
16734
|
+
|
|
16735
|
+
# Example 7: Get the employee roles which are valid between 1st Jan 0001 and 5th Mar 2024.
|
|
16736
|
+
>>> df1.as_of(valid_time=(None, date(2024, 3, 5)))
|
|
16737
|
+
EmployeeName Department Salary VALIDTIME
|
|
16738
|
+
EmployeeID
|
|
16739
|
+
2 Jane Smith DA 200.0 ('20/01/01', '24/03/05')
|
|
16740
|
+
1 John Doe IT 100.0 ('20/01/01', '24/03/05')
|
|
16741
|
+
3 Bob Sales 300.0 ('24/01/01', '24/03/05')
|
|
16742
|
+
|
|
16743
|
+
# Example 8: Get the employee roles which are valid between 1st Jun 2024 and 31st Dec 9999.
|
|
16744
|
+
>>> df1.as_of(valid_time=("2024-06-01", None))
|
|
16745
|
+
EmployeeName Department Salary VALIDTIME
|
|
16746
|
+
EmployeeID
|
|
16747
|
+
1 John Doe IT 100.0 ('24/06/01', '24/12/31')
|
|
16748
|
+
2 Jane Smith DA 200.0 ('24/06/01', '99/12/31')
|
|
16749
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
|
|
16750
|
+
3 Bob Sales 300.0 ('24/06/01', '24/12/31')
|
|
16751
|
+
|
|
16752
|
+
# Example 9: Consider df1 as regular DataFrame and retrieve all the records irrespective
|
|
16753
|
+
# of whether the records are valid or not.
|
|
16754
|
+
>>> df1.as_of(valid_time=None)
|
|
16755
|
+
EmployeeName Department Salary
|
|
16756
|
+
EmployeeID
|
|
16757
|
+
1 John Doe IT 100.0
|
|
16758
|
+
2 Jane Smith DA 200.0
|
|
16759
|
+
3 Bob Marketing 330.0
|
|
16760
|
+
3 Bob Sales 300.0
|
|
16761
|
+
|
|
16762
|
+
# Example 10. Consider df1 as regular DataFrame and retrieve all the records irrespective
|
|
16763
|
+
# of whether the records are valid or not. Also include additional period and valid time
|
|
16764
|
+
# dimension column.
|
|
16765
|
+
>>> df1.as_of(valid_time=None, additional_period=("2024-01-01", "2024-03-05"), include_valid_time_column=True)
|
|
16766
|
+
EmployeeName Department Salary role_validity_period VALIDTIME
|
|
16767
|
+
EmployeeID
|
|
16768
|
+
1 John Doe IT 100.0 ('20/01/01', '24/12/31') ('24/01/01', '24/03/05')
|
|
16769
|
+
2 Jane Smith DA 200.0 ('20/01/01', '99/12/31') ('24/01/01', '24/03/05')
|
|
16770
|
+
3 Bob Marketing 330.0 ('25/01/01', '99/12/31') ('24/01/01', '24/03/05')
|
|
16771
|
+
3 Bob Sales 300.0 ('24/01/01', '24/12/31') ('24/01/01', '24/03/05')
|
|
16772
|
+
|
|
16773
|
+
>>> df2 = DataFrame("Employee_Address")
|
|
16774
|
+
EmployeeName address validity_period
|
|
16775
|
+
EmployeeID
|
|
16776
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16777
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16778
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16779
|
+
|
|
16780
|
+
# Example 11: Consider df2 as regular DataFrame and retrieve all the records including historic
|
|
16781
|
+
# records. Also include transaction time dimension column.
|
|
16782
|
+
>>> df2.as_of(transaction_time=None, include_transaction_time_column=True)
|
|
16783
|
+
EmployeeName address validity_period
|
|
16784
|
+
EmployeeID
|
|
16785
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16786
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16787
|
+
3 Bob Johnson 789 Oak Street ('2025-03-04 15:41:44.610000+00:00', '2025-03-04 15:41:44.610001+00:00')
|
|
16788
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16789
|
+
|
|
16790
|
+
# Example 12: Get the employee addresses which are valid at current time from DataFrame df2.
|
|
16791
|
+
# Also include transaction time dimension column.
|
|
16792
|
+
>>> df2.as_of(transaction_time="current", include_transaction_time_column=True)
|
|
16793
|
+
EmployeeName address validity_period
|
|
16794
|
+
EmployeeID
|
|
16795
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16796
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16797
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16798
|
+
|
|
16799
|
+
# Example 13: Get the employee addresses which are valid at current time from DataFrame df2.
|
|
16800
|
+
# Do not include transaction time dimension column.
|
|
16801
|
+
>>> df2.as_of(transaction_time="current", include_transaction_time_column=False)
|
|
16802
|
+
EmployeeName address
|
|
16803
|
+
EmployeeID
|
|
16804
|
+
2 Jane Smith 456 Elm St
|
|
16805
|
+
1 John Doe 123 Main St
|
|
16806
|
+
3 Bob Johnson 789 Oak St
|
|
16807
|
+
|
|
16808
|
+
# Example 14: Get the employee addresses which are valid at 2025-03-04 15:41:44.610000+00:00 from DataFrame df2.
|
|
16809
|
+
# Include transaction time dimension column.
|
|
16810
|
+
>>> df2.as_of(transaction_time="2025-03-04 15:41:44.610000+00:00", include_transaction_time_column=True)
|
|
16811
|
+
EmployeeName address validity_period
|
|
16812
|
+
EmployeeID
|
|
16813
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16814
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16815
|
+
3 Bob Johnson 789 Oak Street ('2025-03-04 15:41:44.610000+00:00', '2025-03-04 15:41:44.610001+00:00')
|
|
16816
|
+
|
|
16817
|
+
# Example 15: Get the employee addresses which are valid at 2025-03-04 15:41:44.610001+00:00 from DataFrame df2.
|
|
16818
|
+
# Include transaction time dimension column.
|
|
16819
|
+
>>> from datetime import datetime, timezone, timedelta
|
|
16820
|
+
>>> dt = datetime(2025, 3, 4, 15, 41, 44, 610001)
|
|
16821
|
+
>>> dt_with_tz = dt.replace(tzinfo=timezone(timedelta(hours=0)))
|
|
16822
|
+
>>> df2.as_of(transaction_time=dt_with_tz, include_transaction_time_column=True)
|
|
16823
|
+
EmployeeName address validity_period
|
|
16824
|
+
EmployeeID
|
|
16825
|
+
2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16826
|
+
1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16827
|
+
3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16828
|
+
|
|
16829
|
+
>>> df3 = DataFrame("Employee")
|
|
16830
|
+
EmployeeName address Department Salary role_validity validity_period
|
|
16831
|
+
EmployeeID
|
|
16832
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16833
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16834
|
+
3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16835
|
+
3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16836
|
+
|
|
16837
|
+
# Example 16: Get all the records from DataFrame df3 by considering the DataFrame as
|
|
16838
|
+
# regular DataFrame. Include both valid time and transaction time dimension columns.
|
|
16839
|
+
>>> df3.as_of(valid_time=None,
|
|
16840
|
+
... transaction_time=None,
|
|
16841
|
+
... include_valid_time_column=True,
|
|
16842
|
+
... include_transaction_time_column=True
|
|
16843
|
+
... )
|
|
16844
|
+
EmployeeName address Department Salary role_validity validity_period
|
|
16845
|
+
EmployeeID
|
|
16846
|
+
3 Bob 789 Oak Street Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
|
|
16847
|
+
3 Bob 789 Oak St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-03-04 18:09:08.830000+00:00', '2025-05-06 11:39:25.580000+00:00')
|
|
16848
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16849
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16850
|
+
3 Bob 789 Oak Street Marketing 330.0 ('25/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
|
|
16851
|
+
3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16852
|
+
3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16853
|
+
|
|
16854
|
+
# Example 17: Get the employee addresses from DataFrame df3 which are valid at 1st Jun 2024 from
|
|
16855
|
+
# valid time dimension and valid at '2025-03-04 18:09:08.720001+00:00' from transaction
|
|
16856
|
+
# time dimension. Include both valid time and transaction time dimension columns.
|
|
16857
|
+
>>> df3.as_of(valid_time="2024-06-01",
|
|
16858
|
+
... transaction_time="2025-03-04 18:09:08.720001+00:00",
|
|
16859
|
+
... include_valid_time_column=True,
|
|
16860
|
+
... include_transaction_time_column=True
|
|
16861
|
+
... )
|
|
16862
|
+
EmployeeName address Department Salary role_validity validity_period
|
|
16863
|
+
EmployeeID
|
|
16864
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16865
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16866
|
+
3 Bob 789 Oak Street Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
|
|
16867
|
+
|
|
16868
|
+
# Example 18: Get the employee addresses from DataFrame df3 which are valid at 25th Jan 2024
|
|
16869
|
+
# from valid time dimension and valid at current time from transaction time dimension.
|
|
16870
|
+
# Include only transaction time dimension column.
|
|
16871
|
+
>>> df3.as_of(valid_time=date(2024, 1, 25),
|
|
16872
|
+
... transaction_time="current",
|
|
16873
|
+
... include_transaction_time_column=True)
|
|
16874
|
+
EmployeeName address Department Salary validity_period
|
|
16875
|
+
EmployeeID
|
|
16876
|
+
2 Jane Smith 456 Elm St DA 200.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16877
|
+
1 John Doe 123 Main St IT 100.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16878
|
+
3 Bob 789 Oak St Sales 300.0 ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
|
|
16879
|
+
|
|
16880
|
+
# Example 19: Get the employee addresses from DataFrame df3 which are valid between 1st Jan 2025
|
|
16881
|
+
# and 30th June 2025 from valid time dimension and valid at
|
|
16882
|
+
# '2025-03-04 18:08:59.720000+00:00' from transaction time dimension.
|
|
16883
|
+
# Include both valid time and transaction time dimension columns.
|
|
16884
|
+
>>> from datetime import datetime, timezone
|
|
16885
|
+
>>> df3.as_of(valid_time=("2025-01-01", date(2025, 6, 30)),
|
|
16886
|
+
... transaction_time=datetime(2025, 3, 4, 18, 8, 59, 720000).astimezone(timezone.utc),
|
|
16887
|
+
... include_transaction_time_column=True)
|
|
16888
|
+
EmployeeName address Department Salary validity_period VALIDTIME
|
|
16889
|
+
EmployeeID
|
|
16890
|
+
2 Jane Smith 456 Elm St DA 200.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('25/01/01', '25/06/30')
|
|
16891
|
+
3 Bob 789 Oak St Marketing 330.0 ('2025-03-04 18:09:08.830000+00:00', '2025-05-06 11:39:25.580000+00:00') ('25/01/01', '25/06/30')
|
|
16892
|
+
|
|
16893
|
+
# Example 20: Get the employee address from DataFrame df3 by considering the DataFrame as regular
|
|
16894
|
+
# DataFrame from valid time dimension and valid at current time from transaction time dimension.
|
|
16895
|
+
# Add additional period and include both valid time and transaction time dimension columns.
|
|
16896
|
+
>>> df3.as_of(valid_time=None,
|
|
16897
|
+
... transaction_time="current",
|
|
16898
|
+
... additional_period=("2024-01-01", "2024-03-05"),
|
|
16899
|
+
... include_valid_time_column=True,
|
|
16900
|
+
... include_transaction_time_column=True
|
|
16901
|
+
... )
|
|
16902
|
+
EmployeeName address Department Salary role_validity validity_period VALIDTIME
|
|
16903
|
+
EmployeeID
|
|
16904
|
+
1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16905
|
+
2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16906
|
+
3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16907
|
+
3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
|
|
16908
|
+
"""
|
|
16909
|
+
|
|
16910
|
+
if "valid_time" not in kwargs and "transaction_time" not in kwargs:
|
|
16911
|
+
_Validators._validate_mutually_exclusive_arguments(
|
|
16912
|
+
None, "valid_time", None, "transaction_time")
|
|
16913
|
+
|
|
16914
|
+
# Validate argument types.
|
|
16915
|
+
_validation = []
|
|
16916
|
+
_validation.append(["valid_time", kwargs.get("valid_time"), True, (date, datetime, str, tuple, type(None))])
|
|
16917
|
+
_validation.append(["transaction_time", kwargs.get("transaction_time"), True, (datetime, str, type(None))])
|
|
16918
|
+
_validation.append(["additional_period", kwargs.get("additional_period"), True, (tuple, type(None))])
|
|
16919
|
+
_validation.append(["include_valid_time_column", kwargs.get("include_valid_time_column"), True, bool])
|
|
16920
|
+
_validation.append(["include_transaction_time_column", kwargs.get("include_transaction_time_column"), True, bool])
|
|
16921
|
+
|
|
16922
|
+
# Validate argument types
|
|
16923
|
+
_Validators._validate_function_arguments(_validation)
|
|
16924
|
+
|
|
16925
|
+
# Validate temporal table type.
|
|
16926
|
+
_Validators._validate_temporal_table_type(self.df_type)
|
|
16927
|
+
|
|
16928
|
+
# Extract valid_time and transaction_time from kwargs.
|
|
16929
|
+
valid_time = kwargs.get("valid_time")
|
|
16930
|
+
transaction_time = kwargs.get("transaction_time")
|
|
16931
|
+
additional_period = kwargs.get("additional_period")
|
|
16932
|
+
include_valid_time_column = kwargs.get("include_valid_time_column")
|
|
16933
|
+
include_transaction_time_column = kwargs.get("include_transaction_time_column")
|
|
16934
|
+
|
|
16935
|
+
# Validate if user specifies valid_time for a transaction time table.
|
|
16936
|
+
if "valid_time" in kwargs:
|
|
16937
|
+
_Validators._validate_as_of_arguments(df_type=self.df_type)
|
|
16938
|
+
|
|
16939
|
+
# Validate if user specifies transaction_time for a valid time table.
|
|
16940
|
+
if "transaction_time" in kwargs:
|
|
16941
|
+
_Validators._validate_as_of_arguments(df_type=self.df_type, argument_name='transaction_time')
|
|
16942
|
+
|
|
16943
|
+
add_vt_period = False
|
|
16944
|
+
|
|
16945
|
+
# Generate the time qualifier clause.
|
|
16946
|
+
if "valid_time" in kwargs and "transaction_time" not in kwargs:
|
|
16947
|
+
clause = self.__get_valid_time_clause(valid_time, additional_period)
|
|
16948
|
+
elif "transaction_time" in kwargs and "valid_time" not in kwargs:
|
|
16949
|
+
clause = self.__get_transaction_time_clause(transaction_time)
|
|
16950
|
+
else:
|
|
16951
|
+
# Generate both clauses.
|
|
16952
|
+
clause = "{} AND {}".format(self.__get_valid_time_clause(valid_time, additional_period),
|
|
16953
|
+
self.__get_transaction_time_clause(transaction_time)
|
|
16954
|
+
)
|
|
16955
|
+
|
|
16956
|
+
# Exclude the time dimension columns if user is not willing to see it in output DF.
|
|
16957
|
+
columns_to_exclude = []
|
|
16958
|
+
if not include_valid_time_column and self._valid_time_column:
|
|
16959
|
+
columns_to_exclude.append(self._valid_time_column.name)
|
|
16960
|
+
|
|
16961
|
+
if not include_transaction_time_column and self._transaction_time_column:
|
|
16962
|
+
columns_to_exclude.append(self._transaction_time_column.name)
|
|
16963
|
+
|
|
16964
|
+
columns = [col for col in self.columns if col not in columns_to_exclude]
|
|
16965
|
+
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
|
|
16966
|
+
|
|
16967
|
+
# Notes:
|
|
16968
|
+
# * If valid_time is tuple, i.e., for valid time qualifier SEQUENCED VALIDTIME,
|
|
16969
|
+
# add additional column VALIDTIME. This column should not be present in SELECT statement.
|
|
16970
|
+
# Also, ValidTime dimension column should not be present in SELECT statement. VALIDTIME column
|
|
16971
|
+
# acts as validTime dimension column here.
|
|
16972
|
+
# * Time qualifier NONSEQUENCED VALIDTIME PERIOD clause also produces additional column VALIDTIME.
|
|
16973
|
+
# Hence, the additional column VALIDTIME is also returned in the output DataFrame. However, the valid time
|
|
16974
|
+
# column can exist in SELECT statement.
|
|
16975
|
+
if isinstance(valid_time, tuple):
|
|
16976
|
+
add_vt_period = True
|
|
16977
|
+
columns = [col for col in columns if col != self._valid_time_column.name]
|
|
16978
|
+
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
|
|
16979
|
+
col_names_types["VALIDTIME"] = self._valid_time_column.type
|
|
16980
|
+
elif (isinstance(valid_time, type(None)) and additional_period is not None):
|
|
16981
|
+
col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
|
|
16982
|
+
col_names_types["VALIDTIME"] = self._valid_time_column.type
|
|
16983
|
+
|
|
16984
|
+
# SELECT Node.
|
|
16985
|
+
column_expression = ", ".join(columns)
|
|
16986
|
+
sel_nodeid = self._aed_utils._aed_select(self._nodeid, column_expression, timestamp_expr=clause)
|
|
16987
|
+
|
|
16988
|
+
# Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
|
|
16989
|
+
new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
|
|
16990
|
+
df = self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
|
|
16991
|
+
|
|
16992
|
+
# If time qualifier is SEQUENCED PERIOD, then add VALIDTIME column to DataFrame
|
|
16993
|
+
# since it produces temporal dataset.
|
|
16994
|
+
if add_vt_period:
|
|
16995
|
+
df._valid_time_column = df['VALIDTIME']
|
|
16996
|
+
|
|
16997
|
+
return df
|
|
16998
|
+
|
|
16999
|
+
def __get_valid_time_clause(self, valid_time, additional_period=None):
|
|
17000
|
+
"""
|
|
17001
|
+
DESCRIPTION:
|
|
17002
|
+
Function to get valid time clause for temporal table.
|
|
17003
|
+
|
|
17004
|
+
PARAMETERS:
|
|
17005
|
+
valid_time:
|
|
17006
|
+
Required Argument.
|
|
17007
|
+
Specifies the valid time dimension to represent temporal data when creating the DataFrame.
|
|
17008
|
+
Types: date or str
|
|
17009
|
+
|
|
17010
|
+
additional_period:
|
|
17011
|
+
Optional Argument.
|
|
17012
|
+
Specifies the additional period to be kept in DataFrame.
|
|
17013
|
+
Note:
|
|
17014
|
+
This is applicable only when "valid_time" is None.
|
|
17015
|
+
Types: tuple of date or str
|
|
17016
|
+
|
|
17017
|
+
RETURNS:
|
|
17018
|
+
str
|
|
17019
|
+
|
|
17020
|
+
RAISES:
|
|
17021
|
+
None.
|
|
17022
|
+
"""
|
|
17023
|
+
is_vt_dt_type = isinstance(self._valid_time_column.type, tdtypes.PERIOD_DATE)
|
|
17024
|
+
if valid_time == "current":
|
|
17025
|
+
return "CURRENT VALIDTIME"
|
|
17026
|
+
|
|
17027
|
+
if isinstance(valid_time, (str, date, datetime)):
|
|
17028
|
+
# If valid_time is a string, then check the type of the temporal column.
|
|
17029
|
+
# ValidTime dimension allows both DATE and TIMESTAMP type for ValidTime dimension
|
|
17030
|
+
# columns.
|
|
17031
|
+
if is_vt_dt_type:
|
|
17032
|
+
return "VALIDTIME AS OF DATE '{}'".format(valid_time)
|
|
17033
|
+
return "VALIDTIME AS OF TIMESTAMP '{}'".format(valid_time)
|
|
17034
|
+
|
|
17035
|
+
# If valid_time is a tuple, then it is a period.
|
|
17036
|
+
# User can specify start and/or end time. Derive missing value.
|
|
17037
|
+
if isinstance(valid_time, tuple):
|
|
17038
|
+
start = valid_time[0]
|
|
17039
|
+
end = valid_time[1]
|
|
17040
|
+
start = ("0001-01-01" if is_vt_dt_type else '0001-01-01 00:00:00.000000+00:00') if start is None else str(
|
|
17041
|
+
start)
|
|
17042
|
+
end = ("9999-12-31" if is_vt_dt_type else '9999-12-31 23:59:59.999999+00:00') if end is None else str(end)
|
|
17043
|
+
return "SEQUENCED VALIDTIME PERIOD '({}, {})'".format(start, end)
|
|
17044
|
+
|
|
17045
|
+
if isinstance(valid_time, type(None)) and additional_period is not None:
|
|
17046
|
+
return "NONSEQUENCED VALIDTIME PERIOD '({}, {})'".format(additional_period[0], additional_period[1])
|
|
17047
|
+
|
|
17048
|
+
return "NONSEQUENCED VALIDTIME"
|
|
17049
|
+
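
As a quick cross-check of the branches above, here is a minimal, standalone sketch (not part of the package) that mirrors the same mapping from "valid_time"/"additional_period" to the returned clause string; the dates and the is_vt_dt_type flag are illustrative:

    from datetime import date, datetime

    def sketch_valid_time_clause(valid_time, additional_period=None, is_vt_dt_type=True):
        # Mirrors the branch order of __get_valid_time_clause() for eyeballing the output strings.
        if valid_time == "current":
            return "CURRENT VALIDTIME"
        if isinstance(valid_time, (str, date, datetime)):
            kind = "DATE" if is_vt_dt_type else "TIMESTAMP"
            return "VALIDTIME AS OF {} '{}'".format(kind, valid_time)
        if isinstance(valid_time, tuple):
            start = str(valid_time[0]) if valid_time[0] is not None else \
                ("0001-01-01" if is_vt_dt_type else "0001-01-01 00:00:00.000000+00:00")
            end = str(valid_time[1]) if valid_time[1] is not None else \
                ("9999-12-31" if is_vt_dt_type else "9999-12-31 23:59:59.999999+00:00")
            return "SEQUENCED VALIDTIME PERIOD '({}, {})'".format(start, end)
        if valid_time is None and additional_period is not None:
            return "NONSEQUENCED VALIDTIME PERIOD '({}, {})'".format(additional_period[0], additional_period[1])
        return "NONSEQUENCED VALIDTIME"

    print(sketch_valid_time_clause("current"))                           # CURRENT VALIDTIME
    print(sketch_valid_time_clause("2024-06-01"))                        # VALIDTIME AS OF DATE '2024-06-01'
    print(sketch_valid_time_clause(("2024-01-01", None)))                # SEQUENCED VALIDTIME PERIOD '(2024-01-01, 9999-12-31)'
    print(sketch_valid_time_clause(None, ("2024-01-01", "2024-12-31")))  # NONSEQUENCED VALIDTIME PERIOD '(2024-01-01, 2024-12-31)'
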
+    def __get_transaction_time_clause(self, transaction_time):
+        """
+        DESCRIPTION:
+            Function to get transaction time clause for temporal table.
+
+        PARAMETERS:
+            transaction_time:
+                Required Argument.
+                Specifies the transaction time dimension to represent temporal data when creating the DataFrame.
+                Types: date or str
+
+        RETURNS:
+            str
+
+        RAISES:
+            None.
+        """
+        if transaction_time == "current":
+            return "CURRENT TRANSACTIONTIME"
+
+        if isinstance(transaction_time, type(None)):
+            return "NONSEQUENCED TRANSACTIONTIME"
+
+        return "TRANSACTIONTIME as of timestamp '{}'".format(transaction_time)
+
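
For reference, the transaction-time mapping above is simpler; the returned strings are (the timestamp value below is illustrative):

    # transaction_time="current"                     -> "CURRENT TRANSACTIONTIME"
    # transaction_time=None                          -> "NONSEQUENCED TRANSACTIONTIME"
    # transaction_time="2025-03-04 15:41:44.610000"  -> "TRANSACTIONTIME as of timestamp '2025-03-04 15:41:44.610000'"
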
+    def _generate_temporal_dataframe(self, timestamp_expr, time_column):
+        """
+        DESCRIPTION:
+            Helper method to generate a temporal DataFrame based on the given timestamp expression.
+
+        PARAMETERS:
+            timestamp_expr:
+                Required Argument.
+                Specifies the timestamp expression to filter the temporal data.
+                Types: str
+
+            time_column:
+                Required Argument.
+                Specifies the temporal column (valid-time or transaction-time) to process.
+                Types: ColumnExpression
+
+        RAISES:
+            None.
+
+        RETURNS:
+            teradataml DataFrame
+        """
+        col_expr = "{} as {}".format(time_column.cast(time_column.type).compile(), time_column.name)
+        cols = [col.name if col.name != time_column.name else col_expr for col in self._metaexpr.c]
+        column_expression = ", ".join(cols)
+        sel_node_id = self._aed_utils._aed_select(self._nodeid, column_expression, timestamp_expr=timestamp_expr)
+        return self._create_dataframe_from_node(sel_node_id, self._metaexpr, self._index_label)
+
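
The cast-and-alias expression keeps the PERIOD type of the temporal column in the derived projection. A sketch of what col_expr could look like for a hypothetical transaction-time column named validity_period of type PERIOD(TIMESTAMP(6) WITH TIME ZONE) (the name and type are assumptions, and the exact text produced by .compile() may differ):

    # Illustrative only.
    col_expr = 'CAST("validity_period" AS PERIOD(TIMESTAMP(6) WITH TIME ZONE)) AS validity_period'
    # Every other column is re-selected unchanged, and the whole SELECT is prefixed
    # with timestamp_expr, e.g. "NONSEQUENCED TRANSACTIONTIME".
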
+    def historic_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves historical rows from a DataFrame created on a valid-time
+            or bi-temporal table/view. Historical rows are defined as those where the
+            end of the valid-time period precedes the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_roles")
+
+            # Create a DataFrame on 'Employee_roles' table.
+            >>> df = DataFrame("Employee_roles")
+
+            # Retrieve historic rows from the DataFrame.
+            >>> df.historic_rows()
+            EmployeeID EmployeeName Department Salary role_validity_period
+            1 John Doe IT 100.0 ('20/01/01', '24/12/31')
+            3 Bob Sales 300.0 ('24/01/01', '24/12/31')
+        """
+
+        from teradataml.dataframe.functions import current_date, current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        valid_time_col = self._valid_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED VALIDTIME", valid_time_col)
+        # Check the type of the ValidTime dimension column
+        if isinstance(valid_time_col.type, tdtypes.PERIOD_DATE):
+            # Filter records where the end of the ValidTime period is less than the current date
+            return df[valid_time_col.end() < current_date()]
+        return df[valid_time_col.end() < current_timestamp()]
+
+    def future_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves future rows from a DataFrame created on a valid-
+            time or bi-temporal table/view. Future rows are defined as those where the
+            start of the valid-time period is greater than the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_roles")
+
+            # Create a DataFrame on 'Employee_roles' table.
+            >>> df = DataFrame("Employee_roles")
+
+            # Retrieve future rows from the DataFrame.
+            >>> df.future_rows()
+            EmployeeID EmployeeName Department Salary role_validity_period
+            3 Bob Marketing 330.0 ('29/01/01', '99/12/31')
+        """
+        from teradataml.dataframe.functions import current_date, current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        valid_time_col = self._valid_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED VALIDTIME", valid_time_col)
+        # Check the type of the ValidTime dimension column
+        if isinstance(valid_time_col.type, tdtypes.PERIOD_DATE):
+            # Filter records where the start of the ValidTime period is greater than the current date
+            return df[valid_time_col.begin() > current_date()]
+        return df[valid_time_col.begin() > current_timestamp()]
+
+    def open_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves open rows from a DataFrame created on a transaction-time
+            or bi-temporal table/view. Open rows are defined as those where the
+            end of the transaction-time period is greater than or equal to the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_address")
+
+            # Create a DataFrame on 'Employee_address' table.
+            >>> df = DataFrame("Employee_address")
+
+            # Retrieve open rows from the DataFrame.
+            >>> df.open_rows()
+            EmployeeID EmployeeName address validity_period
+            1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
+            2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
+        """
+        from teradataml.dataframe.functions import current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        transaction_time_col = self._transaction_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED TRANSACTIONTIME", transaction_time_col)
+        return df[transaction_time_col.end() >= current_timestamp()]
+
+    def closed_rows(self):
+        """
+        DESCRIPTION:
+            Retrieves closed rows from a DataFrame created on a transaction-time
+            or bi-temporal table/view. Closed rows are defined as those where the
+            end of the transaction-time period is less than the current time.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("teradataml", "Employee_address")
+
+            # Create a DataFrame on 'Employee_address' table.
+            >>> df = DataFrame("Employee_address")
+
+            # Retrieve closed rows from the DataFrame.
+            >>> df.closed_rows()
+            EmployeeID EmployeeName address validity_period
+            1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '2025-04-01 23:59:59.999999+00:00')
+        """
+        from teradataml.dataframe.functions import current_timestamp
+        # Validate temporal table type.
+        _Validators._validate_temporal_table_type(self.df_type)
+        transaction_time_col = self._transaction_time_column
+        df = self._generate_temporal_dataframe("NONSEQUENCED TRANSACTIONTIME", transaction_time_col)
+        return df[transaction_time_col.end() < current_timestamp()]
+
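
Taken together, historic_rows(), future_rows(), open_rows() and closed_rows() reduce to simple predicates on the period bounds, applied after wrapping the DataFrame with the matching NONSEQUENCED qualifier via _generate_temporal_dataframe(). A compact sketch, assuming "df" was created on a bi-temporal table so both dimension columns exist:

    from teradataml.dataframe.functions import current_date, current_timestamp

    vt = df._valid_time_column          # e.g. PERIOD(DATE)
    tt = df._transaction_time_column    # PERIOD(TIMESTAMP) dimension

    historic = df[vt.end() < current_date()]          # historic_rows(), PERIOD(DATE) case
    future = df[vt.begin() > current_date()]          # future_rows(),   PERIOD(DATE) case
    open_ = df[tt.end() >= current_timestamp()]       # open_rows()
    closed = df[tt.end() < current_timestamp()]       # closed_rows()
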
+    @collect_queryband(queryband="DF_create_view")
+    def create_view(self, view_name, schema_name=None):
+        """
+        Creates a view from the DataFrame object in the specified schema.
+        As teradataml creates views, internally for operations, which will be garbage
+        collected during remove_context(), this function helps the user to persist the
+        DataFrame as a view.
+        Note:
+            The persisted view can be used across sessions and can be accessed
+            using the view_name and schema_name.
+
+        PARAMETERS:
+            view_name:
+                Required Argument.
+                Specifies the name of the view to be persisted.
+                Types: str
+
+            schema_name:
+                Optional Argument.
+                Specifies the schema name where the view is to be persisted.
+                Note:
+                    If the schema_name is not provided, the current database will be used.
+                Types: str
+
+        RETURNS:
+            Persisted teradataml DataFrame.
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            # Load the data to run the example.
+            >>> load_example_data("antiselect", ["antiselect_input"])
+            >>> antiselect_input = DataFrame.from_table("antiselect_input")
+            >>> antiselect_input
+            orderid orderdate priority quantity sales discount shipmode custname province region custsegment prodcat
+            rowids
+            49 293 12/10/01 high 49 10123.0200 0.07 delivery truck barry french nunavut nunavut consumer office supplies
+            97 613 11/06/17 high 12 93.5400 0.03 regular air carl jackson nunavut nunavut corporate office supplies
+            85 515 10/08/28 not specified 19 394.2700 0.08 regular air carlos soltero nunavut nunavut consumer office supplies
+            86 515 10/08/28 not specified 21 146.6900 0.05 regular air carlos soltero nunavut nunavut consumer furniture
+            1 3 10/10/13 low 6 261.5400 0.04 regular air muhammed macintyre nunavut nunavut small business office supplies
+            50 293 12/10/01 high 27 244.5700 0.01 regular air barry french nunavut nunavut consumer office supplies
+            80 483 11/07/10 high 30 4965.7595 0.08 regular air clay rozendal nunavut nunavut corporate technology
+
+            # Filter the data based on quantity.
+            >>> anti_df = antiselect_input[antiselect_input.quantity < 30]
+            >>> anti_df
+            orderid orderdate priority quantity sales discount shipmode custname province region custsegment prodcat
+            rowids
+            97 613 11/06/17 high 12 93.54 0.03 regular air carl jackson nunavut nunavut corporate office supplies
+            86 515 10/08/28 not specified 21 146.69 0.05 regular air carlos soltero nunavut nunavut consumer furniture
+            85 515 10/08/28 not specified 19 394.27 0.08 regular air carlos soltero nunavut nunavut consumer office supplies
+            1 3 10/10/13 low 6 261.54 0.04 regular air muhammed macintyre nunavut nunavut small business office supplies
+            50 293 12/10/01 high 27 244.57 0.01 regular air barry french nunavut nunavut consumer office supplies
+
+            # Run Antiselect on filtered data. This will create temporary view which will be garbage collected.
+            >>> obj = Antiselect(data=anti_df, exclude=['rowids', 'orderdate', 'discount', 'province', 'custsegment'])
+
+            # Get the view name that is internally created by teradataml to store the result of Antiselect.
+            >>> obj.result.db_object_name
+            '"<schema_name>"."ml__td_sqlmr_out__1752582812690000"'
+
+            # Check the output of Antiselect.
+            >>> obj.result
+            orderid priority quantity sales shipmode custname region prodcat
+            0 613 high 12 93.54 regular air carl jackson nunavut office supplies
+            1 515 not specified 21 146.69 regular air carlos soltero nunavut furniture
+            2 515 not specified 19 394.27 regular air carlos soltero nunavut office supplies
+            3 293 high 27 244.57 regular air barry french nunavut office supplies
+            4 3 low 6 261.54 regular air muhammed macintyre nunavut office supplies
+
+            # Describe the resultant DataFrame.
+            >>> df = obj.result.describe() # This will create a temporary view.
+
+            # Get the view name.
+            >>> df.db_object_name
+            '"<schema_name>"."ml__td_sqlmr_out__1752585435339977"'
+
+            # Check the output of describe.
+            >>> df
+            ATTRIBUTE StatName StatValue
+            0 orderid MAXIMUM 613.000000
+            1 orderid STANDARD DEVIATION 245.016734
+            2 orderid PERCENTILES(25) 293.000000
+            3 orderid PERCENTILES(50) 515.000000
+            4 quantity COUNT 5.000000
+            5 quantity MINIMUM 6.000000
+            6 quantity MAXIMUM 27.000000
+            7 quantity MEAN 17.000000
+            8 quantity STANDARD DEVIATION 8.154753
+            9 quantity PERCENTILES(25) 12.000000
+
+            # Example 1: Persist the view which can be accessed across sessions.
+            >>> df_new = df.create_view(view_name="antiselect_describe_view")
+            >>> df_new
+            ATTRIBUTE StatName StatValue
+            0 quantity MAXIMUM 27.000000
+            1 quantity STANDARD DEVIATION 8.154753
+            2 quantity PERCENTILES(25) 12.000000
+            3 quantity PERCENTILES(50) 19.000000
+            4 sales COUNT 5.000000
+            5 sales MINIMUM 93.540000
+            6 orderid COUNT 5.000000
+            7 orderid MINIMUM 3.000000
+            8 orderid MAXIMUM 613.000000
+            9 orderid MEAN 387.800000
+
+            # Get the view name.
+            >>> df_new.db_object_name # "<schema_name>" is user connected database.
+            '"<schema_name>"."antiselect_describe_view"'
+
+        """
+        # Argument validation
+        arg_info_matrix = []
+        arg_info_matrix.append(["view_name", view_name, False, (str,), True])
+        arg_info_matrix.append(["schema_name", schema_name, True, (str,), True])
+        _Validators._validate_missing_required_arguments(arg_info_matrix)
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # TODO: Investigate and identify issue when volatile tables replaces views in future.
+
+        visited = set()
+        to_persist = []
+        is_teradataml_temp_table = lambda x: x.startswith("ml__") or x.startswith("tdml_")
+        sql_bundle = SQLBundle()
+
+        def trace_views(table_name):
+            if table_name in visited:
+                return
+            visited.add(table_name)
+            base_name = UtilFuncs._extract_table_name(full_qualified_name=table_name)
+            if is_teradataml_temp_table(base_name):
+                to_persist.append(table_name)
+            # Try to get the SQL for the view
+            show_view_sql = sql_bundle._get_sql_query(SQLConstants.SQL_SHOW_VIEW).\
+                format(table_name)
+            try:
+                result = execute_sql(show_view_sql).fetchall()
+                if result:
+                    view_sql = result[0][0].replace("\r", "").replace("\n", " ")\
+                        .replace("\t", " ").strip()
+
+                    # Extract all table names from the view SQL
+                    for tname in UtilFuncs.extract_table_names_from_query(view_sql):
+                        trace_views(tname)
+            except Exception as e:
+                # Check if error is like 'not a view', then try SHOW TABLE
+                err_msg = str(e).lower()
+                if 'not a view' in err_msg:
+                    show_table_sql = sql_bundle._get_sql_query(SQLConstants.SQL_SHOW_TABLE).\
+                        format(table_name)
+                    try:
+                        result = execute_sql(show_table_sql).fetchall()
+                        if result:
+                            # Table found, nothing to trace further.
+                            # This table is persisted.
+                            return
+                    except Exception as e2:
+                        # If SHOW TABLE also fails, raise the exception
+                        raise e2
+                else:
+                    # If error is not about 'not a view', re-raise
+                    raise e
+
+        # 1. Get the query for this DataFrame
+        query = self.show_query()
+        # 2. Extract all table names from the query
+        for tname in UtilFuncs.extract_table_names_from_query(query):
+            trace_views(tname)
+
+        # 3. Persist the current DataFrame as a permanent object
+        # This CREATE VIEW AS SELECT ...
+        # Use object_name, schema_name as needed.
+        from teradataml.dbutils.dbutils import _get_quoted_object_name
+        target_name = _get_quoted_object_name(schema_name=schema_name, object_name=view_name)
+
+        create_sql = sql_bundle._build_create_view(view_name=target_name,
+                                                   select_expression=query)
+
+        # No try-except here, as we want to raise any error that occurs during execution.
+        execute_sql(create_sql)
+
+        # TODO: Add logger message that these views/tables persisted.
+        # if to_persist:
+        #     logger.info("to_persist: ", to_persist)
+
+        # Remove the tables/view from GC file as we need to persist them. Removing only after
+        # required object is created.
+        GarbageCollector._delete_object_entry(objects_to_delete=to_persist,
+                                              object_type=None,
+                                              remove_entry_from_gc_list=True)
+
+        # Return the teradataml DataFrame for the persisted object.
+        if schema_name is None:
+            schema_name = tdmlctx._get_current_databasename()
+        return DataFrame(in_schema(schema_name=schema_name, table_name=view_name))
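
A hypothetical follow-on to the docstring example above, showing the optional "schema_name" argument; the database name "analytics_db" is a placeholder, not taken from this diff:

    # Example 2 (sketch): persist the view under an explicitly specified database.
    >>> df_other = df.create_view(view_name="antiselect_describe_view", schema_name="analytics_db")
    >>> df_other.db_object_name
    '"analytics_db"."antiselect_describe_view"'
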
 
 
 class DataFrameGroupBy(DataFrame):
@@ -15450,7 +17457,7 @@ class DataFrameGroupBy(DataFrame):
 
     """
 
-    def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None):
+    def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None, include_grouping_columns=False):
         """
         init() method for DataFrameGroupBy.
 
@@ -15491,6 +17498,15 @@ class DataFrameGroupBy(DataFrame):
                 Permitted Values: "CUBE", "ROLLUP", None
                 Types: str or NoneType
 
+            include_grouping_columns:
+                Optional Argument.
+                Specifies whether to include aggregations on the grouping column(s) or not.
+                When set to True, the resultant DataFrame will have the aggregations on the
+                columns mentioned in "columns". Otherwise, resultant DataFrame will not have
+                aggregations on the columns mentioned in "columns".
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrameGroupBy instance
         """
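
The new flag is stored on the grouped DataFrame and consumed later by the aggregation path; this hunk does not show how it is exposed publicly, so the call below is only an assumption for illustration (the keyword may be surfaced differently, if at all, through DataFrame.groupby()):

    # Hypothetical sketch -- assumes groupby() forwards include_grouping_columns.
    >>> g = df.groupby("Department", include_grouping_columns=True)
    >>> g.count()   # with the flag set, the grouping column itself would also be aggregated
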
@@ -15500,6 +17516,7 @@ class DataFrameGroupBy(DataFrame):
         self._column_names_and_types = column_names_and_types
         self._columns = columns
         self.groupby_column_list = column_list
+        self._include_grouping_columns = include_grouping_columns
 
     def _get_assign_allowed_types(self):
         """
@@ -15585,7 +17602,8 @@ class DataFrameGroupBy(DataFrame):
 
         new_meta = UtilFuncs._get_metaexpr_using_columns(new_nodeid,
                                                          zip(new_column_names,
-                                                             new_column_types)
+                                                             new_column_types),
+                                                         datalake=self._metaexpr.datalake)
 
         return (new_meta, new_nodeid)
 